2025-11-09 13:44:28 +00:00

123 lines
3.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a78759c8",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/datasets/load.py:1461: FutureWarning: The repository for Shitao/MLDR contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/Shitao/MLDR\n",
"You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
"Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
" warnings.warn(\n",
"Downloading builder script: 5.70kB [00:00, 5.21MB/s]\n",
"Downloading readme: 4.78kB [00:00, 7.81MB/s]\n",
"Downloading data: 100%|██████████| 1.26G/1.26G [02:22<00:00, 8.80MB/s] \n",
"Downloading data: 100%|██████████| 20.9M/20.9M [00:02<00:00, 7.81MB/s]\n",
"Downloading data: 100%|██████████| 82.1M/82.1M [00:07<00:00, 11.2MB/s] \n",
"Generating train split: 10000 examples [00:34, 286.48 examples/s]\n",
"Generating dev split: 200 examples [00:00, 381.83 examples/s]\n",
"Generating test split: 800 examples [00:02, 337.61 examples/s]\n"
]
}
],
"source": [
"from datasets import load_dataset\n",
"\n",
"# Shitao/MLDR ships a custom loading script. `datasets` emits a FutureWarning\n",
"# unless the caller opts in, and states the flag will become mandatory.\n",
"# NOTE(review): this runs code from the Hub repository - inspect it at\n",
"# https://hf.co/datasets/Shitao/MLDR before trusting it.\n",
"dataset = load_dataset(\"Shitao/MLDR\", \"en\", trust_remote_code=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c91f659a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"6306\n",
"9008\n"
]
}
],
"source": [
"def to_retrieval_record(row):\n",
"    \"\"\"Map one dataset row to a question / positive / negative record.\n",
"\n",
"    Two row layouts are handled:\n",
"    * QA-style rows (`question`, `context`, `answers`): the context becomes the\n",
"      single positive passage; rows whose `answers['text']` is empty yield None.\n",
"    * MLDR-style rows (`query`, `positive_passages`, `negative_passages`),\n",
"      where each passage is assumed to be a dict with a `text` field\n",
"      (per the Shitao/MLDR dataset card - TODO confirm on the loaded split).\n",
"\n",
"    Returns the record dict, or None when the row has no positive passage.\n",
"    \"\"\"\n",
"    # NOTE(review): 'passgae_*' looks like a typo of 'passage_*'. The spelling is\n",
"    # kept because code outside this notebook may already read these keys.\n",
"    if 'answers' in row:\n",
"        if len(row['answers']['text']) == 0:\n",
"            return None\n",
"        return {\n",
"            'question': row['question'],\n",
"            'passgae_positive': [row['context']],\n",
"            'passgae_negative': [],\n",
"        }\n",
"\n",
"    positives = [passage['text'] for passage in row['positive_passages']]\n",
"    if not positives:\n",
"        return None\n",
"    negatives = [passage['text'] for passage in row.get('negative_passages') or []]\n",
"    return {\n",
"        'question': row['query'],\n",
"        'passgae_positive': positives,\n",
"        'passgae_negative': negatives,\n",
"    }\n",
"\n",
"\n",
"# Rebuild the list from scratch so re-running the cell never appends duplicates.\n",
"all_dataset = []\n",
"for data in dataset[\"train\"]:\n",
"    record = to_retrieval_record(data)\n",
"    if record is not None:\n",
"        all_dataset.append(record)\n",
"\n",
"# Kept records vs. total rows in the train split.\n",
"print(len(all_dataset))\n",
"print(len(dataset[\"train\"]))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d66809ce",
"metadata": {},
"outputs": [],
"source": [
"# Spot-check one converted record (index 10) as plain text.\n",
"sample_record = all_dataset[10]\n",
"print(sample_record)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e2f94154",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10000"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows in the train split; the bare expression is the cell result.\n",
"train_split = dataset[\"train\"]\n",
"len(train_split)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cdded156",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}