153 lines
5.6 KiB
Plaintext
153 lines
5.6 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "a78759c8",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
||
" from .autonotebook import tqdm as notebook_tqdm\n",
|
||
"/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/datasets/load.py:1461: FutureWarning: The repository for Gholamreza/pquad contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/Gholamreza/pquad\n",
|
||
"You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
|
||
"Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
|
||
" warnings.warn(\n",
|
||
"Downloading builder script: 4.41kB [00:00, 4.07MB/s]\n",
|
||
"Downloading readme: 5.15kB [00:00, 7.92MB/s]\n",
|
||
"Downloading data: 100%|██████████| 26.4M/26.4M [01:05<00:00, 406kB/s] \n",
|
||
"Downloading data: 100%|██████████| 3.49M/3.49M [00:00<00:00, 5.18MB/s]\n",
|
||
"Downloading data: 100%|██████████| 3.45M/3.45M [00:00<00:00, 5.38MB/s]\n",
|
||
"Generating train split: 0%| | 0/63994 [00:00<?, ? examples/s]"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/root/.cache/huggingface/datasets/downloads/e49d5f650d69a5999fe6ceb4438a023cccdcf3e6519abc4dabce736f91595591\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Generating train split: 100%|██████████| 63994/63994 [00:02<00:00, 21411.84 examples/s]\n",
|
||
"Generating validation split: 21%|██▏ | 1703/7976 [00:00<00:00, 16945.09 examples/s]"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/root/.cache/huggingface/datasets/downloads/ea42ddfa9db6f39bc3249a878c853a6f6b466f6217a360bbb8afbac9410d84cc\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Generating validation split: 100%|██████████| 7976/7976 [00:00<00:00, 23678.57 examples/s]\n",
|
||
"Generating test split: 18%|█▊ | 1434/8002 [00:00<00:00, 10262.32 examples/s]"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"/root/.cache/huggingface/datasets/downloads/d6ba3b80ff2a6d0333454fac286694b5e777518ea141e0dcd7c0558b71624882\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Generating test split: 100%|██████████| 8002/8002 [00:00<00:00, 20511.40 examples/s]\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from datasets import load_dataset\n",
"\n",
"# Gholamreza/pquad is built by a loading script stored in the dataset repository.\n",
"# Without an explicit opt-in, `datasets` emits a FutureWarning and states that\n",
"# `trust_remote_code=True` becomes mandatory in its next major release.\n",
"# NOTE(review): this flag executes code downloaded from the Hub; inspect\n",
"# https://hf.co/datasets/Gholamreza/pquad before running in a sensitive environment.\n",
"dataset = load_dataset(\"Gholamreza/pquad\", trust_remote_code=True)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "c91f659a",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"48273\n",
|
||
"63994\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Convert the train split into retrieval-style records, one per question.\n",
"# Rows whose `answers.text` list is empty are skipped, so only questions that\n",
"# come with at least one answer keep their context as the positive passage.\n",
"# NOTE(review): the keys are spelled 'passgae_*' (sic); the spelling is kept\n",
"# unchanged here because code outside this notebook may read these exact keys.\n",
"# 'passgae_negative' starts empty; nothing in this notebook fills it.\n",
"all_dataset = [\n",
"    {\n",
"        'question': data['question'],\n",
"        'passgae_positive': [data['context']],\n",
"        'passgae_negative': [],\n",
"    }\n",
"    for data in dataset[\"train\"]\n",
"    if len(data[\"answers\"][\"text\"]) > 0\n",
"]\n",
"\n",
"# Kept-record count first, then the full train-split size, for comparison.\n",
"print(len(all_dataset))\n",
"print(len(dataset[\"train\"]))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "d66809ce",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"{'question': 'در 816 مرعشی از حکمرانی چه کسی در تنکابن یاد کرده\\u200cاست؟', 'passgae_positive': ['در ۸۰۶ خواندمیر به ولایت تنکابن اشاره کرده و در ۸۱۶ مرعشی از حکمرانیِ سیدداوود کارکیای تنکابنی، فرزند سیدهادی کیا، در تنکابن یاد کرده\\u200cاست. مَلک کیومرث ــ که در ۸۳۰ به مخالفت با سادات گیلان برخاسته بود ــ در ۸۳۱ عمارت خاصة سید داوود کارکیای تنکابنی را که در اواخر تابستان هنوز در ییلاق به سر می\\u200cبرد، آتش زد و برخی اهالی را به قتل رساند. در ۸۶۵ مازندرانی از «موضع تنکابن» در «مملکت گیلان» نام برده\\u200cاست. مرعشی در ۸۸۹ به حرکت خود از کِلیشُم (از قرای ییلاقی تنکابن) به تنکابن برای تصرف «دشت تنکابن» اشاره کرده\\u200cاست.'], 'passgae_negative': []}\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Spot-check one converted record; no special meaning of this index is visible in the notebook.\n",
"sample_index = 10000\n",
"print(all_dataset[sample_index])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "9a566e69",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.12"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|