{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a78759c8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/datasets/load.py:1461: FutureWarning: The repository for Shitao/MLDR contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/Shitao/MLDR\n",
      "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
      "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
      "  warnings.warn(\n",
      "Downloading builder script: 5.70kB [00:00, 5.21MB/s]\n",
      "Downloading readme: 4.78kB [00:00, 7.81MB/s]\n",
      "Downloading data: 100%|██████████| 1.26G/1.26G [02:22<00:00, 8.80MB/s]   \n",
      "Downloading data: 100%|██████████| 20.9M/20.9M [00:02<00:00, 7.81MB/s]\n",
      "Downloading data: 100%|██████████| 82.1M/82.1M [00:07<00:00, 11.2MB/s]   \n",
      "Generating train split: 10000 examples [00:34, 286.48 examples/s]\n",
      "Generating dev split: 200 examples [00:00, 381.83 examples/s]\n",
      "Generating test split: 800 examples [00:02, 337.61 examples/s]\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# Load the 'en' configuration of Shitao/MLDR from the Hugging Face Hub.\n",
    "# The repository ships custom loading code that `datasets` has to execute, so\n",
    "# trust_remote_code=True is passed explicitly: it silences the FutureWarning and\n",
    "# keeps this cell working once the flag becomes mandatory.\n",
    "# NOTE(review): this runs code from the Hub - inspect\n",
    "# https://hf.co/datasets/Shitao/MLDR before trusting it.\n",
    "dataset = load_dataset('Shitao/MLDR', \"en\", trust_remote_code=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c91f659a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6306\n",
      "9008\n"
     ]
    }
   ],
   "source": [
    "# Convert the train split into retrieval-style records: every row that has at\n",
    "# least one answer text becomes a dict holding the question, its context as the\n",
    "# single positive passage, and an empty list of negative passages.\n",
    "# NOTE(review): the keys are spelled 'passgae_*' (sic); they are kept verbatim\n",
    "# because code outside this notebook may read them - confirm before renaming.\n",
    "# NOTE(review): the saved output of this cell reports 9008 train rows while the\n",
    "# load log and the len() cell report 10000 - confirm `dataset` is the intended\n",
    "# one and really exposes 'question' / 'context' / 'answers' fields.\n",
    "train_rows = dataset[\"train\"]\n",
    "all_dataset = [\n",
    "    {'question': row['question'], 'passgae_positive': [row['context']], 'passgae_negative': []}\n",
    "    for row in train_rows\n",
    "    if len(row[\"answers\"][\"text\"]) > 0\n",
    "]\n",
    "\n",
    "# Kept records first, then the size of the unfiltered train split.\n",
    "print(len(all_dataset))\n",
    "print(len(train_rows))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d66809ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spot-check one converted record (index 10) to eyeball its structure.\n",
    "sample_record = all_dataset[10]\n",
    "print(sample_record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e2f94154",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10000"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows in the train split; left as the last expression so the\n",
    "# notebook displays it as the cell result.\n",
    "train_split = dataset[\"train\"]\n",
    "len(train_split)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdded156",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}