{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b1f0c3a2",
   "metadata": {},
   "source": [
    "# Build retrieval training records from MLDR\n",
    "\n",
    "Loads the `en` configuration of `Shitao/MLDR` and turns its `train` split into a list of\n",
    "`{question, passgae_positive, passgae_negative}` records stored in `all_dataset`.\n",
    "\n",
    "**NOTE(review):** the record keys `passgae_positive` / `passgae_negative` are misspelled\n",
    "(\"passage\"). They are kept unchanged because code outside this notebook may read them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a78759c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Everything a reader might want to change lives here.\n",
    "DATASET_NAME = \"Shitao/MLDR\"\n",
    "DATASET_CONFIG = \"en\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5d2e8f41",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c9e5d17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The MLDR repository ships a loading script. `datasets` warns that passing\n",
    "# `trust_remote_code=True` becomes mandatory from its next major release, so opt in explicitly.\n",
    "# This executes code downloaded from the Hub - review it at https://hf.co/datasets/Shitao/MLDR.\n",
    "dataset = load_dataset(DATASET_NAME, DATASET_CONFIG, trust_remote_code=True)\n",
    "\n",
    "# Displaying the DatasetDict shows the splits and their features (column names).\n",
    "dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7a4b9c06",
   "metadata": {},
   "source": [
    "## Convert to training records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e1d4a73",
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_training_record(example):\n",
    "    \"\"\"Map one raw example to a question / positive / negative record.\n",
    "\n",
    "    Two row layouts are handled:\n",
    "\n",
    "    * retrieval rows with `query`, `positive_passages` and `negative_passages`\n",
    "      (lists of dicts holding a `text` field). This is the layout the MLDR dataset\n",
    "      card describes - TODO confirm against the features shown when `dataset` is displayed.\n",
    "    * QA rows with `question`, `context` and `answers[\"text\"]`, the layout this\n",
    "      notebook originally indexed.\n",
    "\n",
    "    Args:\n",
    "        example: one row of the dataset, as a dict.\n",
    "\n",
    "    Returns:\n",
    "        A dict with keys `question`, `passgae_positive` and `passgae_negative`,\n",
    "        or `None` when the row has no positive passage (retrieval layout) or no\n",
    "        answer text (QA layout).\n",
    "\n",
    "    Raises:\n",
    "        KeyError: if the row matches neither layout.\n",
    "    \"\"\"\n",
    "    if \"positive_passages\" in example:\n",
    "        question = example[\"query\"]\n",
    "        positives = [passage[\"text\"] for passage in example[\"positive_passages\"]]\n",
    "        # `or []` tolerates a missing or null negatives column.\n",
    "        negatives = [passage[\"text\"] for passage in example.get(\"negative_passages\") or []]\n",
    "    else:\n",
    "        # QA layout: rows without any answer text are skipped, as before.\n",
    "        if len(example[\"answers\"][\"text\"]) == 0:\n",
    "            return None\n",
    "        question = example[\"question\"]\n",
    "        positives = [example[\"context\"]]\n",
    "        negatives = []\n",
    "\n",
    "    if not positives:\n",
    "        return None\n",
    "\n",
    "    # NOTE(review): key spelling (`passgae_*`) kept byte-identical on purpose.\n",
    "    return {\n",
    "        \"question\": question,\n",
    "        \"passgae_positive\": positives,\n",
    "        \"passgae_negative\": negatives,\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c91f659a",
   "metadata": {},
   "outputs": [],
   "source": [
    "all_dataset = [\n",
    "    record\n",
    "    for record in map(to_training_record, dataset[\"train\"])\n",
    "    if record is not None\n",
    "]\n",
    "\n",
    "# Kept records vs. total rows in the train split.\n",
    "print(len(all_dataset))\n",
    "print(len(dataset[\"train\"]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f6c8b90",
   "metadata": {},
   "source": [
    "## Inspect"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d66809ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(all_dataset[10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2f94154",
   "metadata": {},
   "outputs": [],
   "source": [
    "len(dataset[\"train\"])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}