317 lines
13 KiB
Plaintext
317 lines
13 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 43,
|
||
"id": "a78759c8",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"1000212\n",
|
||
"250666\n",
|
||
"270642\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import json\n",
"from pathlib import Path\n",
"\n",
"# Folder that holds train.jsonl, queries.jsonl and corpus.jsonl; change it here only.\n",
"DATA_DIR = Path('/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval')\n",
"\n",
"\n",
"def iter_jsonl(path):\n",
"    \"\"\"Yield the parsed JSON object from each non-blank line of `path`.\n",
"\n",
"    Blank lines (for example a trailing newline at the end of the file) are\n",
"    skipped instead of making json.loads raise.\n",
"    \"\"\"\n",
"    with open(path, 'r', encoding='utf-8') as f:\n",
"        for line in f:\n",
"            if line.strip():\n",
"                yield json.loads(line)\n",
"\n",
"\n",
"# Every row of train.jsonl is kept, in file order.\n",
"dataset_synthetic_scores = list(iter_jsonl(DATA_DIR / 'train.jsonl'))\n",
"\n",
"# Queries and corpus rows are indexed by their '_id' field for direct lookup.\n",
"dataset_synthetic_queries = {row['_id']: row for row in iter_jsonl(DATA_DIR / 'queries.jsonl')}\n",
"dataset_synthetic_corpus = {row['_id']: row for row in iter_jsonl(DATA_DIR / 'corpus.jsonl')}\n",
"\n",
"print(len(dataset_synthetic_scores))\n",
"print(len(dataset_synthetic_queries))\n",
"print(len(dataset_synthetic_corpus))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 44,
|
||
"id": "bbb2657f",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"223423\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# Group every judged passage under its query: one record per query id.\n",
"# NOTE: the 'passgae_*' key spelling is kept exactly, other cells index by these keys.\n",
"all_dataset = {}\n",
"for judgement in dataset_synthetic_scores:\n",
"    query_id = judgement['query-id']\n",
"    corpus_id = int(judgement['corpus-id'])\n",
"    score = judgement['score']\n",
"\n",
"    # Skip rows that point at a query or a passage that was not loaded.\n",
"    if query_id not in dataset_synthetic_queries or corpus_id not in dataset_synthetic_corpus:\n",
"        continue\n",
"\n",
"    # First row seen for this query: start its record with empty passage lists.\n",
"    if query_id not in all_dataset:\n",
"        all_dataset[query_id] = {'question': dataset_synthetic_queries[query_id]['text'],\n",
"                                 'passgae_positive': [],\n",
"                                 'passgae_negative': [],\n",
"                                 'passage_negative_random': []}\n",
"\n",
"    # A score of \"1\" files the passage as positive, anything else as negative.\n",
"    bucket = 'passgae_positive' if score == \"1\" else 'passgae_negative'\n",
"    all_dataset[query_id][bucket].append(dataset_synthetic_corpus[corpus_id]['text'])\n",
"\n",
"# Downstream cells address records by position, so drop the query-id keys.\n",
"all_dataset = list(all_dataset.values())\n",
"print(len(all_dataset))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"id": "74ef02a1",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"{'question': 'چه کسانی باید کتاب لوازم نویسندگی را بخوانند؟',\n",
|
||
" 'passgae_positive': ['این کتاب به ویژه برای علاقه مندان تازه کار به هنر داستان نویسی مفید است. افرادی که مشتاق یادگیری نمایشنامه، فیلمنامه، حکایت یا قصه هستند می توانند از این راهنمای کاربردی بهره ببرند.'],\n",
|
||
" 'passgae_negative': ['این کتاب در دسته بندی پژوهش ادبی، مجموعه آموزش نویسندگی قرار می گیرد. همچنین این کتاب به عنوان یک نقشه راه برای افرادی که ایده ای را پرورش داده اند و قصد دارند آن را با قلمی رسا بیان کنند، پیشنهاد شده است.',\n",
|
||
" 'این کتاب به ویژه برای افرادی که در حوزه ادبیات کودک فعالیت می\\u200cکنند یا قصد ورود به این حوزه را دارند، مفید و سازنده است. خواندن این کتاب می\\u200cتواند به نویسندگان کمک کند تا با بازار نویسندگی برای کودکان آشنا شوند و مهارت\\u200cهای لازم برای نوشتن آثار مناسب برای این گروه سنی را کسب کنند.',\n",
|
||
" \"کتاب 'همه چیز درباره نویسندگی خلاق' نکات کلیدی و چالش\\u200cهای مختلفی را برای نویسندگان تازه\\u200cکار ارائه می\\u200cدهد. این نکات شامل تکنیک\\u200cهای فرّار برای غلبه بر خشک\\u200cطبعی در نویسندگی، منابع الهام\\u200cبخش، مثال\\u200cها و گزیده\\u200cهای مختلف است. همچنین، مصاحبه\\u200cهایی با نویسندگان موفق در این کتاب وجود دارد که می\\u200cتواند به خوانندگان انگیزه و الهام بیشتری برای نوشتن بدهد.\"],\n",
|
||
" 'passage_negative_random': []}"
|
||
]
|
||
},
|
||
"execution_count": 45,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Spot-check one assembled record; 71 looks like an arbitrary index (TODO confirm).\n",
"all_dataset[71]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 48,
|
||
"id": "8e167b4b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"question:\n",
|
||
"چگونه خنده میتواند به کاهش استرس کمک کند؟\n",
|
||
"--------------------------------\n",
|
||
"passgae_positive:\n",
|
||
"خنده به عنوان یک واکنش طبیعی بدن، میتواند به کاهش سطح هورمونهای استرس مانند کورتیزول کمک کند. تحقیقی از دانشگاه کانزاس نشان داده است که خندیدن در شرایط استرسزا، ضربان قلب افراد را کاهش میدهد و به آنها احساس آرامش بیشتری میدهد. این اثرات مثبت به ویژه در خندههای اجتماعی مشهود است، که نشان میدهد حتی لبخند زدن نیز میتواند به کاهش استرس کمک کند.\n",
|
||
"--------------------------------\n",
|
||
"خندیدن به اشتباهات میتواند به عنوان یک مکانیزم مقابلهای عمل کند که به افراد کمک میکند تا با فشارهای روانی و استرسهای روزمره کنار بیایند. این عمل نه تنها به کاهش تنشهای عاطفی کمک میکند، بلکه میتواند به بهبود روابط اجتماعی نیز منجر شود. در واقع، افرادی که قادر به خندیدن به اشتباهات خود هستند، معمولاً احساس راحتی بیشتری در تعاملات اجتماعی دارند و میتوانند به راحتی با دیگران ارتباط برقرار کنند.\n",
|
||
"{{\"result\": \"1\"}}\n"
|
||
]
|
||
},
|
||
{
|
||
"ename": "NameError",
|
||
"evalue": "name 'result' is not defined",
|
||
"output_type": "error",
|
||
"traceback": [
|
||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||
"Cell \u001b[0;32mIn[48], line 57\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m match:\n\u001b[1;32m 56\u001b[0m result \u001b[38;5;241m=\u001b[39m match\u001b[38;5;241m.\u001b[39mgroup(\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m---> 57\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mresult\u001b[49m)\n\u001b[1;32m 58\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m--------------------------------\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||
"\u001b[0;31mNameError\u001b[0m: name 'result' is not defined"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import json\n",
"import os\n",
"import re\n",
"\n",
"import requests\n",
"from dotenv import load_dotenv\n",
"\n",
"load_dotenv()\n",
"\n",
"# Choose the chat-completions backend: the Qwen endpoint sends a bearer token read\n",
"# from the environment, the Gemma endpoint is called without an auth header.\n",
"qwen = False\n",
"if qwen:\n",
"    url = \"https://qwen3.chatllm.aiengines.ir/v1/chat/completions\"\n",
"    model = \"Qwen/Qwen3-4B-Instruct-2507\"\n",
"    headers = {\"Content-Type\": \"application/json\", \"Authorization\": f\"Bearer {os.getenv('LLM_AS_RERANKER_PASS')}\"}\n",
"else:\n",
"    url = \"http://192.168.130.206:4001/v1/chat/completions\"\n",
"    model = \"google/gemma-3-27b-it\"\n",
"    headers = {\"Content-Type\": \"application/json\"}\n",
"\n",
"# Plain (non-f) string: braces must not be doubled here. With \"{{ }}\" the model\n",
"# copied the doubled braces into its reply.\n",
"instruction = \"\"\"\n",
"You are a helpful assistant that help me to find that the text is relevant to the question or not.\n",
"You are given a question and a text.\n",
"You must evaluate the text based on the question and return \"1\" if the text is relevant to the question and \"0\" if the text is not relevant to the question.\n",
" \n",
"be carefull, I have chosen the text randomly from my dataset so the text must answer the question independently.\n",
"You must return the result in the following format:\n",
"{\"result\": \"1\" or \"0\"}\n",
"\"\"\"\n",
"\n",
"# Matches the value whether or not it is quoted: {\"result\": \"1\"} and {\"result\": 1}.\n",
"RESULT_PATTERN = re.compile(r'\"result\":\\s*\"?([\\d\\.]+)\"?')\n",
"\n",
"# Seconds to wait for the server before giving up instead of hanging the kernel.\n",
"REQUEST_TIMEOUT_S = 60\n",
"\n",
"# Position of the record to re-judge (not named `id`, which would shadow the builtin).\n",
"sample_id = 7850\n",
"sample = all_dataset[sample_id]\n",
"\n",
"print(\"question:\")\n",
"print(sample[\"question\"])\n",
"print(\"--------------------------------\")\n",
"print(\"passgae_positive:\")\n",
"print(sample[\"passgae_positive\"][0])\n",
"print(\"--------------------------------\")\n",
"for passgae_negative in sample[\"passgae_negative\"]:\n",
"    # json.dumps escapes quotes and newlines inside the texts, so the message is\n",
"    # always valid JSON; ensure_ascii=False keeps the Persian text readable.\n",
"    input_message = json.dumps({\"question\": sample[\"question\"], \"text\": passgae_negative}, ensure_ascii=False)\n",
"    messages = [{\"role\": \"system\", \"content\": instruction}, {\"role\": \"user\", \"content\": input_message}]\n",
"\n",
"    payload = {\n",
"        \"model\": model,\n",
"        \"messages\": messages,\n",
"        \"max_tokens\": 100\n",
"    }\n",
"\n",
"    req = requests.post(url, headers=headers, json=payload, timeout=REQUEST_TIMEOUT_S)\n",
"    # Fail loudly on an HTTP error instead of a confusing KeyError on 'choices'.\n",
"    req.raise_for_status()\n",
"    out = req.json()['choices'][0]['message']['content']\n",
"\n",
"    print(passgae_negative)\n",
"    print(out)\n",
"\n",
"    # Read the capture group only when the reply matched; otherwise `result`\n",
"    # would be undefined (first pass) or stale (later passes).\n",
"    match = RESULT_PATTERN.search(out)\n",
"    result = match.group(1) if match else None\n",
"    print(result if match else \"could not parse a result from the reply\")\n",
"    print(\"--------------------------------\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 50,
|
||
"id": "24b13c5b",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'{{\"result\": \"1\"}}'"
|
||
]
|
||
},
|
||
"execution_count": 50,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Show the raw model reply that the preceding cell last stored in `out`.\n",
"out"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 53,
|
||
"id": "30586c26",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"1\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"import re\n",
"\n",
"# Fixed sample string to try the pattern on.\n",
"out = '{\"result\": \"1\"}'\n",
"\n",
"# The optional quotes (\"?) let the digits be captured whether or not the value is quoted.\n",
"match = re.compile(r'\"result\":\\s*\"?([\\d\\.]+)\"?').search(out)\n",
"result = match[1]\n",
"\n",
"print(result)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 57,
|
||
"id": "9fb33634",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# A list holding one empty string still has length 1, so nothing is printed.\n",
"r = [\"\"]\n",
"if len(r) == 0:\n",
"    print(\"empty\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "4917b3a0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import faiss\n",
"import numpy as np\n",
"\n",
"# faiss.normalize_L2 rescales the rows of a 2-D float32 matrix in place, so the\n",
"# vector is stored as a single float32 row rather than a 1-D integer array.\n",
"x = np.array([[1, 2, 3, 4, 5]], dtype=np.float32)\n",
"\n",
"faiss.normalize_L2(x)\n",
"\n",
"print(x)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"id": "ea9dcd98",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": ".venv",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.12"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|