123 lines
3.4 KiB
Plaintext
123 lines
3.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "a78759c8",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
|
" from .autonotebook import tqdm as notebook_tqdm\n",
|
|
"/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/datasets/load.py:1461: FutureWarning: The repository for Shitao/MLDR contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/Shitao/MLDR\n",
|
|
"You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
|
|
"Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
|
|
" warnings.warn(\n",
|
|
"Downloading builder script: 5.70kB [00:00, 5.21MB/s]\n",
|
|
"Downloading readme: 4.78kB [00:00, 7.81MB/s]\n",
|
|
"Downloading data: 100%|██████████| 1.26G/1.26G [02:22<00:00, 8.80MB/s] \n",
|
|
"Downloading data: 100%|██████████| 20.9M/20.9M [00:02<00:00, 7.81MB/s]\n",
|
|
"Downloading data: 100%|██████████| 82.1M/82.1M [00:07<00:00, 11.2MB/s] \n",
|
|
"Generating train split: 10000 examples [00:34, 286.48 examples/s]\n",
|
|
"Generating dev split: 200 examples [00:00, 381.83 examples/s]\n",
|
|
"Generating test split: 800 examples [00:02, 337.61 examples/s]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Load the English (\"en\") configuration of the Shitao/MLDR dataset.\n",
"from datasets import load_dataset\n",
"\n",
"# The Shitao/MLDR repository contains custom loading code that `datasets` must\n",
"# execute. Opting in explicitly with trust_remote_code=True avoids the\n",
"# FutureWarning and keeps this call working once the flag becomes mandatory.\n",
"# Inspect the repository at https://hf.co/datasets/Shitao/MLDR before trusting it.\n",
"dataset = load_dataset('Shitao/MLDR', \"en\", trust_remote_code=True)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "c91f659a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"6306\n",
|
|
"9008\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"# Turn each training row that has at least one answer into a retrieval-style\n",
"# record: the question, its context as the sole positive passage, and an empty\n",
"# negative list. The misspelled 'passgae_*' keys are reproduced verbatim because\n",
"# code outside this notebook may read them.\n",
"# NOTE(review): confirm that rows of the loaded dataset really expose the\n",
"# answers/question/context fields used below.\n",
"train_split = dataset[\"train\"]\n",
"all_dataset = [\n",
"    {\n",
"        'question': row['question'],\n",
"        'passgae_positive': [row['context']],\n",
"        'passgae_negative': [],\n",
"    }\n",
"    for row in train_split\n",
"    if len(row[\"answers\"][\"text\"]) > 0\n",
"]\n",
"\n",
"# Kept records, then total rows in the split.\n",
"print(len(all_dataset))\n",
"print(len(train_split))\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d66809ce",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Spot-check a single converted record (index 10).\n",
"sample_record = all_dataset[10]\n",
"print(sample_record)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "e2f94154",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"10000"
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"# Number of rows in the training split (shown as the cell result).\n",
"dataset[\"train\"].num_rows"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cdded156",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|