embedding_model/data_preprocess/data_loader_longrag-fa.ipynb
2025-11-06 07:18:15 +00:00

123 lines
5.3 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "a78759c8",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"Downloading readme: 2.68kB [00:00, 2.49MB/s]\n",
"Downloading data: 100%|██████████| 68.3k/68.3k [00:00<00:00, 160kB/s]\n",
"Generating test split: 1400 examples [00:00, 159163.68 examples/s]\n"
]
}
],
"source": [
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset(\"MCINext/LongRag-Fa\")\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c91f659a",
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "'train'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m all_dataset \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m data \u001b[38;5;129;01min\u001b[39;00m \u001b[43mdataset\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtrain\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m:\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(data)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n",
"File \u001b[0;32m/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/datasets/dataset_dict.py:74\u001b[0m, in \u001b[0;36mDatasetDict.__getitem__\u001b[0;34m(self, k)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, k) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Dataset:\n\u001b[1;32m 73\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(k, (\u001b[38;5;28mstr\u001b[39m, NamedSplit)) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m---> 74\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__getitem__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 76\u001b[0m available_suggested_splits \u001b[38;5;241m=\u001b[39m [\n\u001b[1;32m 77\u001b[0m split \u001b[38;5;28;01mfor\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m (Split\u001b[38;5;241m.\u001b[39mTRAIN, Split\u001b[38;5;241m.\u001b[39mTEST, Split\u001b[38;5;241m.\u001b[39mVALIDATION) \u001b[38;5;28;01mif\u001b[39;00m split \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n\u001b[1;32m 78\u001b[0m ]\n",
"\u001b[0;31mKeyError\u001b[0m: 'train'"
]
}
],
"source": [
"all_dataset = []\n",
"for data in dataset[\"train\"]:\n",
" print(data)\n",
" break\n",
" # if len(data[\"answers\"][\"text\"]) > 0:\n",
" # all_dataset.append({'question': data['question'], 'passgae_positive': [data['context']], 'passgae_negative': []})\n",
"\n",
"print(len(all_dataset))\n",
"print(len(dataset[\"train\"]))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d66809ce",
"metadata": {},
"outputs": [],
"source": [
"print(all_dataset[10])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e2f94154",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_id': 'q0', 'text': 'یک سرمایه\\u200cگذار باید برای بررسی دقیق گزارش\\u200cهای مالی سالانه یا فصلی یک شرکت، با تمرکز بر شاخص\\u200cهای اقتصادی عملکرد، چه رویکرد سیستماتیکی را دنبال کند؟\\n'}\n"
]
}
],
"source": [
"import json\n",
"\n",
"with open(\"./data/longrag/queries.jsonl\", \"r\") as f:\n",
" for data in f:\n",
" data = json.loads(data)\n",
" print(data)\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "efae8a38",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}