add modify question

saeedfirouzi 2025-11-16 15:30:36 +00:00
parent c06572dedb
commit 0ec2c4d8ce
13 changed files with 5414 additions and 1277 deletions

View File

@@ -8,12 +8,12 @@ from data_preprocess.text_embedder import TextEmbedder
THRESHOLD_MULTIPLY = 0.95
RANDOM_NEGATIVE_COUNT = 6
batch_size = 100
batch_size = 1000
text_embedder = TextEmbedder()
def generate_random_negative_sample(all_dataset):
def generate_random_negative_sample(all_dataset, corpus_list=[]):
"""
generate random negative sample from dataset
Args:
@@ -35,7 +35,7 @@ def generate_random_negative_sample(all_dataset):
for id in range(i, min(i + batch_size, len_dataset)):
question_list.append(all_dataset[id]['question'])
question_embeddings = text_embedder.embed_texts(question_list)
question_embeddings = text_embedder.embed_texts(question_list, do_preprocess=False, convert_to_numpy=False)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
@@ -52,7 +52,7 @@ def generate_random_negative_sample(all_dataset):
for passage in all_dataset[id]['passage_positive']:
passage_positive_list.append(passage)
passage_positive_embeddings = text_embedder.embed_texts(passage_positive_list)
passage_positive_embeddings = text_embedder.embed_texts(passage_positive_list, do_preprocess=False, convert_to_numpy=False)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
@@ -71,7 +71,7 @@ def generate_random_negative_sample(all_dataset):
for passage in all_dataset[id]['passage_negative']:
passage_negative_list.append(passage)
passage_negative_embeddings = text_embedder.embed_texts(passage_negative_list)
passage_negative_embeddings = text_embedder.embed_texts(passage_negative_list, do_preprocess=False, convert_to_numpy=False)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
@@ -80,10 +80,18 @@ def generate_random_negative_sample(all_dataset):
all_texts.append(all_dataset[id]['passage_negative'][passage_id])
count += 1
print("calculate corpus embeddings")
# calculate corpus embeddings
for i in tqdm(range(0, len(corpus_list), batch_size)):
corpus_embeddings = text_embedder.embed_texts(corpus_list[i:i+batch_size], do_preprocess=False, convert_to_numpy=False)
all_embeddings.extend(corpus_embeddings)
all_texts.extend(corpus_list[i:i+batch_size])
############ Create FAISS index ############
all_embeddings = np.array(all_embeddings, dtype=np.float32)
dim = all_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
# index = faiss.IndexFlatIP(dim)
index = faiss.IndexHNSWFlat(dim, 32, faiss.METRIC_INNER_PRODUCT)
faiss.normalize_L2(all_embeddings)
index.add(all_embeddings)
@@ -94,15 +102,16 @@ def generate_random_negative_sample(all_dataset):
question_embeddings = all_dataset_embeddings[id]['question_embedding']
question_embeddings_normalized = np.array([question_embeddings], dtype=np.float32)
faiss.normalize_L2(question_embeddings_normalized)
passage_positive_embeddings = all_dataset_embeddings[id]['passage_positive_embedding'][0]
# passage_positive_embeddings = all_dataset_embeddings[id]['passage_positive_embedding'][0]
score_question_passage_positive = np.dot(question_embeddings, passage_positive_embeddings)
# score_question_passage_positive = np.dot(question_embeddings, passage_positive_embeddings)
num_retrieved = 30
num_retrieved = 15
vector_scores, vector_ids = index.search(question_embeddings_normalized, num_retrieved)
for vector_score, vector_id in zip(vector_scores[0], vector_ids[0]):
if (all_texts[vector_id] not in not_valid_passages) and (vector_score < THRESHOLD_MULTIPLY * score_question_passage_positive):
if (all_texts[vector_id] not in not_valid_passages):# and (vector_score < THRESHOLD_MULTIPLY * score_question_passage_positive):
all_dataset[id]['passage_negative_random'].append(all_texts[vector_id])
not_valid_passages.append(all_texts[vector_id])
if len(all_dataset[id]['passage_negative_random']) >= RANDOM_NEGATIVE_COUNT:
break
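For reference, a minimal self-contained sketch of the HNSW inner-product indexing pattern the hunks above switch to (IndexFlatIP → IndexHNSWFlat); the embedding dimension and the random vectors are placeholders, not project data.

import numpy as np
import faiss

dim = 768  # placeholder embedding dimension
corpus_vectors = np.random.rand(10000, dim).astype(np.float32)
query_vectors = np.random.rand(4, dim).astype(np.float32)

# IndexHNSWFlat(dim, M) is an approximate graph index; with METRIC_INNER_PRODUCT
# and L2-normalized vectors its scores approximate the cosine similarities that
# the exact IndexFlatIP it replaces would return.
index = faiss.IndexHNSWFlat(dim, 32, faiss.METRIC_INNER_PRODUCT)
faiss.normalize_L2(corpus_vectors)
index.add(corpus_vectors)

faiss.normalize_L2(query_vectors)
scores, ids = index.search(query_vectors, 15)  # num_retrieved = 15, as in the hunk
print(scores.shape, ids.shape)  # (4, 15) (4, 15)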

View File

@@ -0,0 +1,103 @@
from typing import List, Dict, Any
import json
import asyncio
import aiohttp
import time
import re
model_url = "http://192.168.130.206:4001/v1"
model = "google/gemma-3-27b-it"
class LLMModel:
def __init__(self):
self.instruction = """
You are a helpful assistant that helps me modify the input question.
I will give you a question and its text, and you must replace the words of the question with synonyms or similar words.
## Important:
- Replace the words of the question with synonyms or similar words.
- The question must be in Persian.
Return only the question, nothing else.
"""
async def run_llm(self, session, question, text):
"""
Run the llm model.
Args:
session: The session to use for the request.
question: The question to rewrite.
text: The passage the question is about.
Returns:
The modified question.
"""
headers = {"Content-Type": "application/json"}
input_message = f"""{{"question": "{question}", "text": "{text}"}}"""
messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]
payload = {
"model": model,
"messages": messages,
"max_tokens": 100
}
try:
async with session.post(model_url + "/chat/completions", headers=headers, json=payload) as resp:
resp.raise_for_status()
response = await resp.json()
result = response['choices'][0]['message']['content']
print(f"question: {question}")
print(f"result: {result}")
print("--------------------------------")
return result
except Exception as e:
try:
print(f"Error in llm model {response}: {e}")
except:
print(f"Error in llm model: {e}")
return ""
async def run_llm_async(self, question_list, text_list):
"""
Send all requests concurrently.
Args:
question_list: The list of questions.
text_list: The list of texts.
Returns:
The list of results.
"""
async with aiohttp.ClientSession() as session:
tasks = [self.run_llm(session, question, text) for question, text in zip(question_list, text_list)]
results = await asyncio.gather(*tasks)
return results
def modify_question_llm(self, query_list: List[str], text_list: List[str]) -> List[Dict[str, Any]]:
"""
Modify the questions based on their corresponding texts using the LLM model.
Args:
query_list: The list of queries.
text_list: The list of texts.
Returns:
The list of modified questions.
"""
if not text_list:
return []
start_time = time.time()
results = asyncio.run(self.run_llm_async(query_list, text_list))
end_time = time.time()
# print(f"Time taken for llm model: {end_time - start_time} seconds")
return results
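A hypothetical usage sketch for the class above; the import path follows how the other scripts in this commit import it, the strings are invented placeholders, and the chat endpoint at model_url must be reachable for the call to return anything.

from data_preprocess.modify_question_model import LLMModel  # assumed module name

llm = LLMModel()
questions = ["جنگ جهانی اول در چه تاریخی پایان یافت؟"]  # sample question
passages = ["متن مرتبط با پرسش"]  # placeholder positive passage
rewritten = llm.modify_question_llm(questions, passages)
print(rewritten)  # one paraphrased Persian question per input, "" on errors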

View File

@@ -153,9 +153,9 @@ def main(output_path):
#load synthetic dataset
print("--------------------------------")
print("loading synthetic dataset")
synthetic_train_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/train.jsonl"
synthetic_corpus_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/corpus.jsonl"
synthetic_queries_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/queries.jsonl"
synthetic_train_path = "/home/firouzi/embedding_model/research_notebook/data/synthetic-persian-qa-retrieval/train.jsonl"
synthetic_corpus_path = "/home/firouzi/embedding_model/research_notebook/data/synthetic-persian-qa-retrieval/corpus.jsonl"
synthetic_queries_path = "/home/firouzi/embedding_model/research_notebook/data/synthetic-persian-qa-retrieval/queries.jsonl"
synthetic_dataset = load_synthetic_dataset(synthetic_train_path, synthetic_queries_path, synthetic_corpus_path)
print(f"synthetic dataset loaded : {len(synthetic_dataset)} samples")
@@ -173,11 +173,11 @@ def main(output_path):
print(f"successfully merged synthetic and pquad dataset")
print("--------------------------------")
# removing false negative samples from all dataset
print("start to remove false negative samples from all dataset")
all_dataset = remove_false_negative(all_dataset, random_negative_sample=False)
print(f"successfully removed false negative samples from all dataset")
print("--------------------------------")
# # removing false negative samples from all dataset
# print("start to remove false negative samples from all dataset")
# all_dataset = remove_false_negative(all_dataset, random_negative_sample=False)
# print(f"successfully removed false negative samples from all dataset")
# print("--------------------------------")
# with open("/home/firouzi/embedding_model/data/train.json", "r", encoding="utf-8") as f:
# all_dataset = json.load(f)

View File

@@ -0,0 +1,160 @@
import argparse
from datasets import load_dataset
import json
from tqdm import tqdm
from data_preprocess.remove_false_negative_model import LLMModel
from data_preprocess.generate_random_negative_sample import generate_random_negative_sample
llm_model = LLMModel()
def load_msmarco_dataset():
"""
load msmarco dataset from huggingface
output:
[{
"question": "",
"passage_positive": [],
"passage_negative": [],
"passage_negative_random": []
}]
"""
print("start loading msmarco dataset")
name = "MCINext/msmarco-fa"
dataset_qrel = load_dataset(name)["train"]
print("start loading corpus")
dataset_corpus_list = load_dataset(name,data_files="corpus.jsonl")["train"]
dataset_corpus = {}
for data in dataset_corpus_list:
dataset_corpus[str(data["_id"])] = data["text"]
print("start loading queries")
dataset_queries_list = load_dataset(name,data_files="queries.jsonl")["train"]
dataset_queries = {}
for data in dataset_queries_list:
dataset_queries[str(data["_id"])] = data["text"]
dataset = []
print("start creating dataset")
for data in tqdm(dataset_qrel):
if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
dataset.append({
"question": dataset_queries[data["query-id"]],
"passage_positive": [dataset_corpus[data["corpus-id"]]],
"passage_negative": [],
"passage_negative_random": [],
})
print(f"length of dataset: {len(dataset)}")
print("--------------------------------")
return dataset, list(dataset_corpus.values())
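Roughly, the three splits joined above look like the rows below; only _id, text, query-id and corpus-id are actually read by the code, and the remaining field is assumed from the usual BEIR-style layout.

qrel_row = {"query-id": "q1", "corpus-id": "d7", "score": 1}  # one row of dataset_qrel (score assumed)
corpus_row = {"_id": "d7", "text": "متن سند"}  # one row of corpus.jsonl
query_row = {"_id": "q1", "text": "متن پرسش"}  # one row of queries.jsonl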
def remove_false_negative(dataset, random_negative_sample=False):
"""
remove false negative samples from the dataset
Args:
dataset: list of dicts
Returns:
dataset: list of dicts
"""
if random_negative_sample:
negative_name = "passage_negative_random"
else:
negative_name = "passage_negative"
# calculate passage negative embeddings
negative_count_all = 0
negative_count_removed = 0
len_dataset = len(dataset)
batch_size = 50
for i in tqdm(range(0, len_dataset, batch_size)):
question_list = []
passage_negative_list = []
for id in range(i, min(i + batch_size, len_dataset)):
for passage in dataset[id][negative_name]:
question_list.append(dataset[id]['question'])
passage_negative_list.append(passage)
results = llm_model.remove_false_negative_llm(question_list, passage_negative_list)
negative_count_removed += len([_ for _ in results if _ == "1"])
negative_count_all += len(results)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
new_negative_list = []
for passage_id in range(len(dataset[id][negative_name])):
if results[count] == "0":
new_negative_list.append(dataset[id][negative_name][passage_id])
count += 1
dataset[id][negative_name] = new_negative_list
print(f"removed {negative_count_removed} false negative samples from {negative_count_all} samples")
print("--------------------------------")
return dataset
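A toy illustration (not project code) of the flatten/label/unflatten pattern used above: negatives from a batch are flattened into one list, judged by the LLM, and the "0" (keep) verdicts are mapped back to each sample with a running counter.

batch = [
    {"passage_negative": ["n1", "n2"]},
    {"passage_negative": ["n3"]},
]
labels = ["0", "1", "0"]  # pretend LLM verdicts; "1" flags a false negative
count = 0
for sample in batch:
    kept = []
    for passage in sample["passage_negative"]:
        if labels[count] == "0":
            kept.append(passage)
        count += 1
    sample["passage_negative"] = kept
print(batch)  # "n2" is dropped, "n1" and "n3" survive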
def save_dataset(dataset, output_path):
"""
save dataset to json file
Args:
dataset: list of dicts
output_path: path to save dataset
"""
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(dataset, f, ensure_ascii=False, indent=4)
def main(output_path):
#load msmarco dataset
print("--------------------------------")
all_dataset, corpus_list = load_msmarco_dataset()
print(f"msmarco dataset loaded : {len(all_dataset)} samples")
print("--------------------------------")
#generate random negative samples
print("start to generate random negative samples")
all_dataset = generate_random_negative_sample(all_dataset, corpus_list)
print(f"successfully generated random negative samples")
print("--------------------------------")
# removing random false negative samples from all dataset
print("start to remove random false negative samples from all dataset")
all_dataset = remove_false_negative(all_dataset, random_negative_sample=True)
print(f"successfully removed random false negative samples from all dataset")
print("--------------------------------")
# save dataset
print("start to save dataset")
save_dataset(all_dataset, output_path)
print(f"successfully saved dataset")
print("--------------------------------")
if __name__ == "__main__":
"""
preprocess dataset for training
pipelines:
load msmarco dataset from huggingface
generate random negative samples
remove random false negative samples with the llm model
save dataset to json file
python preprocess_v2.py --output_path /home/firouzi/embedding_model/data/v2/msmarco_train.json
"""
parser = argparse.ArgumentParser()
parser.add_argument("--output_path", type=str, required=True)
args = parser.parse_args()
output_path = args.output_path
main(output_path)

View File

@@ -0,0 +1,142 @@
import argparse
from datasets import load_dataset
import json
from tqdm import tqdm
import time
from data_preprocess.modify_question_model import LLMModel
llm_model = LLMModel()
def load_msmarco_dataset():
"""
load msmarco dataset from huggingface
output:
[{
"question": "",
"passage_positive": [],
"passage_negative": [],
"passage_negative_random": []
}]
"""
print("start loading msmarco dataset")
name = "MCINext/msmarco-fa"
dataset_qrel = load_dataset(name)["train"]
print("start loading corpus")
dataset_corpus_list = load_dataset(name,data_files="corpus.jsonl")["train"]
dataset_corpus = {}
for data in dataset_corpus_list:
dataset_corpus[str(data["_id"])] = data["text"]
print("start loading queries")
dataset_queries_list = load_dataset(name,data_files="queries.jsonl")["train"]
dataset_queries = {}
for data in dataset_queries_list:
dataset_queries[str(data["_id"])] = data["text"]
dataset = []
print("start creating dataset")
for data in tqdm(dataset_qrel):
if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
dataset.append({
"question": dataset_queries[data["query-id"]],
"passage_positive": [dataset_corpus[data["corpus-id"]]],
"new_question": "",
"passage_negative": [],
"passage_negative_random": [],
})
print(f"length of dataset: {len(dataset)}")
print("--------------------------------")
return dataset
def modify_question(dataset):
"""
modify the questions of the dataset
Args:
dataset: list of dicts
Returns:
dataset: list of dicts
"""
len_dataset = len(dataset)
batch_size = 50
for i in tqdm(range(0, len_dataset, batch_size)):
question_list = []
passage_positive_list = []
for id in range(i, min(i + batch_size, len_dataset)):
question_list.append(dataset[id]['question'])
passage_positive_list.append(dataset[id]['passage_positive'][0])
results = llm_model.modify_question_llm(question_list, passage_positive_list)
time.sleep(2)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
dataset[id]["new_question"] = results[count]
count += 1
print(f"successfully modified question")
print("--------------------------------")
return dataset
def save_dataset(dataset, output_path):
"""
save dataset to json file
Args:
dataset: list of dicts
output_path: path to save dataset
"""
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(dataset, f, ensure_ascii=False, indent=4)
def main(output_path):
#load msmarco dataset
print("--------------------------------")
all_dataset = load_msmarco_dataset()
print(f"msmarco dataset loaded : {len(all_dataset)} samples")
print("--------------------------------")
# removing random false negative samples from all dataset
print("start to modify question")
all_dataset = modify_question(all_dataset[:270000])
print(f"successfully modified question")
print("--------------------------------")
# save dataset
print("start to save dataset")
save_dataset(all_dataset, output_path)
print(f"successfully saved dataset")
print("--------------------------------")
if __name__ == "__main__":
"""
preprocess dataset for training
pipelines:
load msmarco dataset from huggingface
modify questions with the llm model
save dataset to json file
python preprocess_v2.py --output_path /home/firouzi/embedding_model/data/v2/msmarco_train.json
"""
parser = argparse.ArgumentParser()
parser.add_argument("--output_path", type=str, required=True)
args = parser.parse_args()
output_path = args.output_path
main(output_path)
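After a run, one record of the saved JSON should roughly look like the sketch below; the Persian strings are invented placeholders.

record = {
    "question": "سوال اصلی از queries.jsonl",
    "new_question": "بازنویسی همان سوال با واژه‌های مترادف",  # filled by modify_question
    "passage_positive": ["متن مثبت از corpus.jsonl"],
    "passage_negative": [],
    "passage_negative_random": [],
}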

View File

@@ -102,7 +102,7 @@ class LLMModel:
start_time = time.time()
results = asyncio.run(self.run_llm_async(query_list, text_list))
end_time = time.time()
print(f"Time taken for llm model: {end_time - start_time} seconds")
# print(f"Time taken for llm model: {end_time - start_time} seconds")
return results

File diff suppressed because it is too large

View File

@@ -3,6 +3,7 @@ import requests
import numpy as np
from dotenv import load_dotenv
import os
import time
load_dotenv()
@@ -20,21 +21,26 @@ class TextEmbedder:
return text
def embed_texts(self, texts:list[str])->list[list[float]]:
def embed_texts(self, texts:list[str], do_preprocess=True, convert_to_numpy=True)->list[list[float]]:
"""
Embed texts using the model.
"""
if texts == []:
return []
texts = [self.preprocess_embedder(text) for text in texts]
if do_preprocess:
texts = [self.preprocess_embedder(text) for text in texts]
payload = {
"model": self.model_name,
"input": texts
}
responses = requests.post("http://78.38.161.78:3094/v1/embeddings", headers=self.headers, json=payload)
embeddings = [np.array(response["embedding"], dtype=np.float32) for response in responses.json()["data"]]
if convert_to_numpy:
embeddings = [np.array(response["embedding"], dtype=np.float32) for response in responses.json()["data"]]
else:
embeddings = [response["embedding"] for response in responses.json()["data"]]
return embeddings
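A hypothetical call with the new flags; the embedding endpoint hard-coded in embed_texts must be reachable, and the text is a placeholder.

from data_preprocess.text_embedder import TextEmbedder

embedder = TextEmbedder()
# Skipping preprocessing and the numpy conversion returns the raw float lists,
# which is cheaper when the vectors are only buffered and converted to float32 once later.
vectors = embedder.embed_texts(["متن نمونه"], do_preprocess=False, convert_to_numpy=False)
print(type(vectors[0]))  # list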

View File

@@ -6,15 +6,17 @@ from tqdm import tqdm
names = ["MCINext/FEVER_FA_test_top_250_only_w_correct-v2", "MCINext/fiqa-fa-v2", "MCINext/HotpotQA_FA_test_top_250_only_w_correct-v2",
"MCINext/MSMARCO_FA_test_top_250_only_w_correct-v2", "MCINext/NQ_FA_test_top_250_only_w_correct-v2", "MCINext/quora-fa-v2", "MCINext/scifact-fa-v2",
"MCINext/synthetic-persian-chatbot-rag-faq-retrieval", "MCINext/synthetic-persian-qa-retrieval", "MCINext/trec-covid-fa-v2"]
names = names[3:4]
for name in tqdm(names):
print(f"loading {name}")
dataset_qrel = load_dataset(name)["test"]
dataset_corpus_list = load_dataset(name,data_files="corpus.jsonl")["train"]
dataset_corpus_list = load_dataset(name,data_files="corpus/corpus.jsonl")["train"]
dataset_corpus = {}
for data in dataset_corpus_list:
dataset_corpus[data["_id"]] = data["text"]
dataset_queries_list = load_dataset(name,data_files="queries.jsonl")["train"]
dataset_queries_list = load_dataset(name,data_files="queries/queries.jsonl")["train"]
dataset_queries = {}
for data in dataset_queries_list:
dataset_queries[data["_id"]] = data["text"]

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "a78759c8",
"metadata": {},
"outputs": [
@@ -11,11 +11,7 @@
"output_type": "stream",
"text": [
"/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/datasets/load.py:1461: FutureWarning: The repository for Gholamreza/pquad contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/Gholamreza/pquad\n",
"You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
"Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
" warnings.warn(\n"
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
@@ -27,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "c91f659a",
"metadata": {},
"outputs": [
@@ -54,7 +50,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"id": "d66809ce",
"metadata": {},
"outputs": [
@@ -62,12 +58,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'question': 'جنگ جهانی اول در چه تاریخی پایان یافت؟', 'passgae_positive': [], 'passgae_negative': ['در سال ۱۸۷۱ امپراتوری آلمان با اتحاد پروس و کنفدراسیون جرمن شمالی توسط اتو ون بیسمارک به وجود آمد. این کشور قدرتمند تا سال ۱۹۱۸ ادامه یافت و با عنوان رایش دوم مشهور شد. بیسمارک توانست استان\\u200cهای جدید زیادی را طی جنگ\\u200cهای مبتکرانهٔ کوتاه و دیپلماتیک به دست آورد. او با اتریش هم پیمان شد تا دانمارک را شکست دهد و ناحیهٔ شلزویگ-هولشتاین را تصرف کند. او جنگ اتریش و پروس (آسترو-پروسیان) را آغاز کرد و پیروز شد اما اینکار فقط برای این بود که ایتالیا طرف آلمان را بگیرد. سپس پروس وارد جنگ فرانسه و پروس (فرانکو-پروسین) (۷۱-۱۸۷۰) شد و توانست شکست کاملی به فرانسه وارد سازد. ویلهلم اول به عنوان آخرین توهین به فرانسوی\\u200cها در کاخ ورسای در قلب فرانسه به عنوان امپراتور آلمان سوگند خورد. امپراتوری آلمان تا پایان جنگ جهانی اول یعنی زمانی که فرانسه توانست در پیمان ورسای تلافی بکند در اوج خود بود.']}\n"
"{'question': 'سام میرزا در چه تاریخی توسط نادرشاه دستگیر شد؟', 'passage_positive': ['این ضربت سخت خیال نادر را پریشان کرد و رضا قلی میرزا را که در رکاب بود در طهران گذاشت و خود به داغستان رفت در این سفر اگرچه بعضی از رؤسای طوایف لزکی از در اطاعت درآمدند لیکن غالب سکنه داغستان به قلل جبال پرارتفاع پناه گرفتند و از هر طرف به تعرّض اردوی نادر دست زدند و لطمات بسیار به ایشان وارد آوردند حتّی موقعی به خیمه خود نادر نیز تعرّض رساندند. در رمضان ۱۱۵۴ موقعیکه نادر هنوز در داغستان بود غلامی را که مرتکب انداختن تیر در جنگل سوادکوه شده بود بخدمت او آوردند. نادر او را کور کرد. شخصی بنام سام میرزا که به ادّعای فرزندی شاه سلطان حسین در آذربایجان به سلطنت طلبی برخاسته و محمّد خان پسر سرخای خان لزگی و خوانین دربند و داغستان را با خود همدست نموده بود. نادر توسّط نصر اللّه میرزا و چند تن از سرداران خود انقلاب این حدود را بالاخره خواباند و سام میرزا در ذی\\u200cالقعده ۱۱۵۶ دستگیر گردید.'], 'passage_negative': [], 'passage_negative_random': []}\n"
]
}
],
"source": [
"print(all_dataset[1240])"
"print(all_dataset[1241])"
]
},
{

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@@ -58,7 +58,7 @@ def main(add_prompt, lora):
learning_rate=2e-5,
warmup_ratio=0.05,
logging_steps=10,
report_to="tensorboard",
report_to="mlflow",
save_steps=10000,
save_total_limit=2,
)
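Switching report_to to "mlflow" relies on the transformers MLflow callback, which takes its configuration from environment variables; a minimal sketch, assuming a local tracking server (the URI and experiment name are placeholders).

import os

os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"  # placeholder tracking server
os.environ["MLFLOW_EXPERIMENT_NAME"] = "embedding_model_sft"  # placeholder experiment name
# os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "1"  # optionally log checkpoints as artifacts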