{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "intro-00",
   "metadata": {},
   "source": [
    "# LongRag-Fa dataset exploration\n",
    "\n",
    "Quick look at the `MCINext/LongRag-Fa` dataset from the Hugging Face Hub and at the local `queries.jsonl` export.\n",
    "\n",
    "Findings from the last run (outputs are cleared; re-run to regenerate):\n",
    "\n",
    "- Only a `test` split (1400 examples) was generated when the dataset was downloaded; indexing `dataset[\"train\"]` raised `KeyError: 'train'`.\n",
    "- The first line of `./data/longrag/queries.jsonl` is a JSON object with `_id` and `text` fields (the text is a Persian question)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "imports-01",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "from datasets import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "config-02",
   "metadata": {},
   "outputs": [],
   "source": [
    "DATASET_NAME = \"MCINext/LongRag-Fa\"\n",
    "# Only a test split was generated at download time; indexing \"train\" raised KeyError.\n",
    "SPLIT = \"test\"\n",
    "QUERIES_PATH = Path(\"./data/longrag/queries.jsonl\")\n",
    "# Index of the sample record to print from all_dataset.\n",
    "SAMPLE_INDEX = 10"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "load-md-03",
   "metadata": {},
   "source": [
    "## Load the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a78759c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_dataset(DATASET_NAME)\n",
    "# Displaying the DatasetDict shows which splits actually exist.\n",
    "dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "inspect-md-04",
   "metadata": {},
   "source": [
    "## Inspect the first record\n",
    "\n",
    "Print one record to learn the schema before building training pairs.\n",
    "\n",
    "**TODO:** populate `all_dataset`. An earlier draft kept records where `data[\"answers\"][\"text\"]` was non-empty and appended dicts with the keys `question`, `passgae_positive` (a list holding `data[\"context\"]`) and `passgae_negative` (an empty list). Whether this dataset has `answers` / `question` / `context` fields has not been verified, and the `passgae_*` spelling is copied from the draft - confirm what downstream code expects before renaming it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c91f659a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fail with a message that lists the available splits instead of a bare KeyError.\n",
    "if SPLIT not in dataset:\n",
    "    raise KeyError(f\"Split {SPLIT!r} not found; available splits: {list(dataset)}\")\n",
    "records = dataset[SPLIT]\n",
    "\n",
    "# next(..., None) keeps this cell from failing on an empty split.\n",
    "first_record = next(iter(records), None)\n",
    "print(first_record)\n",
    "\n",
    "# Placeholder for the training pairs; see the TODO above.\n",
    "all_dataset = []\n",
    "\n",
    "print(len(all_dataset))\n",
    "print(len(records))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d66809ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Guarded lookup: all_dataset stays empty until the TODO above is implemented,\n",
    "# so an unconditional all_dataset[SAMPLE_INDEX] would raise IndexError.\n",
    "if len(all_dataset) > SAMPLE_INDEX:\n",
    "    print(all_dataset[SAMPLE_INDEX])\n",
    "else:\n",
    "    print(f\"all_dataset has {len(all_dataset)} items; no item at index {SAMPLE_INDEX}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "queries-md-05",
   "metadata": {},
   "source": [
    "## Inspect the local queries file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2f94154",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The file holds Persian text, so read it as UTF-8 explicitly rather than\n",
    "# relying on the platform's default encoding.\n",
    "with QUERIES_PATH.open(\"r\", encoding=\"utf-8\") as f:\n",
    "    first_line = next(f, None)\n",
    "\n",
    "# Each line is parsed as one JSON document (JSON Lines); only the first is shown.\n",
    "first_query = json.loads(first_line) if first_line else None\n",
    "print(first_query)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}