add evaluation

saeedfirouzi 2025-11-12 15:02:02 +00:00
parent bc2cc07411
commit c06572dedb
12 changed files with 3098 additions and 34 deletions

View File

@@ -2,8 +2,11 @@ import argparse
import json
import math
import importlib
-import tqdm
+from tqdm import tqdm
from hazm import Normalizer
import random
+import numpy as np
+import faiss

normalizer = Normalizer()
@@ -11,7 +14,7 @@ normalizer = Normalizer()

def load_dataset(input_file):
    with open(input_file, "r", encoding="utf-8") as f:
-        dataset = json.load(f)
+        dataset = json.load(f)[:1000]
    return dataset
@@ -32,7 +35,7 @@ def calculate_ndcg(scores, n):
        return idcg
    dcg = calculate_dcg(scores, n)
-    idcg = calculate_idcg(scores, n)
+    idcg = 1 #calculate_idcg(scores, n)
    ndcg = dcg/idcg
    return ndcg
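Worth flagging: with idcg pinned to 1, the value reported as "NDCG" is really unnormalized DCG, so it is only comparable across runs over the same candidate lists. A minimal standalone sketch of what the function now computes for binary labels, assuming the textbook log2 discount (the helper name is mine, not the repo's):

```python
import math

def dcg_binary(scores, n):
    # scores: 0/1 relevance labels in the order the retriever ranked them
    return sum(rel / math.log2(rank + 2) for rank, rel in enumerate(scores[:n]))

# With idcg = 1, "ndcg" equals this value directly.
print(dcg_binary([1, 0, 1, 0], n=4))  # 1/log2(2) + 1/log2(4) = 1.5
```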
@@ -40,14 +43,12 @@ def calculate_ndcg(scores, n):

def calculate_recall(scores):
    try:
-        num_ground_truth = scores.count(4)
-        if num_ground_truth == 0:
-            num_ground_truth = scores.count(3)
+        num_ground_truth = scores.count(1)
-        recall_7 = scores[:7].count(4) / num_ground_truth
-        recall_12 = scores[:12].count(4) / num_ground_truth
-        recall_20 = scores[:20].count(4) / num_ground_truth
-        recall_variant = scores[:scores.count(4)].count(4) / scores.count(4)
+        recall_7 = scores[:7].count(1) / num_ground_truth
+        recall_12 = scores[:12].count(1) / num_ground_truth
+        recall_20 = scores[:20].count(1) / num_ground_truth
+        recall_variant = scores[:scores.count(1)].count(1) / scores.count(1)
        return recall_7, recall_12, recall_20, recall_variant
    except:
@@ -55,9 +56,9 @@ def calculate_recall(scores):

def calculate_precision(scores):
-    precision_7 = scores[:7].count(4) / 7
-    precision_12 = scores[:12].count(4) / 12
-    precision_20 = scores[:20].count(4) / 20
+    precision_7 = scores[:7].count(1) / 7
+    precision_12 = scores[:12].count(1) / 12
+    precision_20 = scores[:20].count(1) / 20
    return precision_7, precision_12, precision_20
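Since the labels are now binary, recall@k and precision@k reduce to counting hits in the top k. A compact restatement of what the two functions above compute (a sketch; scores is assumed to be the 0/1 labels of retrieved passages in rank order):

```python
def recall_precision_at_k(scores, k):
    num_relevant = scores.count(1)   # positives anywhere in the candidate list
    hits = scores[:k].count(1)       # positives inside the top k
    recall = hits / num_relevant if num_relevant else 0.0
    precision = hits / k
    return recall, precision

print(recall_precision_at_k([1, 0, 1, 0, 0], k=3))  # (1.0, 0.6666666666666666)
```

Note the denominator: recall is measured against the positives that made it into the retrieved list, not against all known positives for the question.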
@@ -85,14 +86,90 @@ def run(input_file, model):
    precision_7_scores = []
    precision_12_scores = []
    precision_20_scores = []
-    dataset = load_dataset(input_file)
-    for count, data in enumerate(tqdm.tqdm(dataset)):
-        question = data["question"]
-        chunks = [data["chunks"][str(id)] for id in range(len(data["chunks"].keys()))]
-        scores_llm = [data["scores"][str(id)] for id in range(len(data["chunks"].keys()))]
-        scores_embed = []
+    all_dataset = load_dataset(input_file)[:1000]
+    batch_size = 100
+    len_dataset = len(all_dataset)
+    all_dataset_embeddings = [{'question_embedding': "", 'passage_positive_embedding': []} for _ in range(len_dataset)]
+    all_embeddings = []
+    all_texts = []
+    print("calculate question embeddings")
+    # calculate question embeddings
+    for i in tqdm(range(0, len_dataset, batch_size)):
+        question_list = []
+        for id in range(i, min(i + batch_size, len_dataset)):
+            question_list.append(all_dataset[id]['question'])
+        question_embeddings = model.embed_texts(question_list, query_is=True)
+        count = 0
+        for id in range(i, min(i + batch_size, len_dataset)):
+            all_dataset_embeddings[id]['question_embedding'] = question_embeddings[count]
+            count += 1
+    print("calculate passage positive embeddings")
+    # calculate passage positive embeddings
+    for i in tqdm(range(0, len_dataset, batch_size)):
+        passage_positive_list = []
+        for id in range(i, min(i + batch_size, len_dataset)):
+            for passage in all_dataset[id]['passage_positive']:
+                passage_positive_list.append(passage)
+        passage_positive_embeddings = model.embed_texts(passage_positive_list)
+        count = 0
+        for id in range(i, min(i + batch_size, len_dataset)):
+            for passage_id in range(len(all_dataset[id]['passage_positive'])):
+                all_dataset_embeddings[id]['passage_positive_embedding'].append(passage_positive_embeddings[count])
+                all_embeddings.append(passage_positive_embeddings[count])
+                all_texts.append(all_dataset[id]['passage_positive'][passage_id])
+                count += 1
+    print("calculate passage negative embeddings")
+    # calculate passage negative embeddings
+    for i in tqdm(range(0, len_dataset, batch_size)):
+        passage_negative_list = []
+        for id in range(i, min(i + batch_size, len_dataset)):
+            for passage in all_dataset[id]['passage_negative']:
+                passage_negative_list.append(passage)
+        passage_negative_embeddings = model.embed_texts(passage_negative_list)
+        count = 0
+        for id in range(i, min(i + batch_size, len_dataset)):
+            for passage_id in range(len(all_dataset[id]['passage_negative'])):
+                all_embeddings.append(passage_negative_embeddings[count])
+                all_texts.append(all_dataset[id]['passage_negative'][passage_id])
+                count += 1
+    # create faiss index
+    all_embeddings = np.array(all_embeddings, dtype=np.float32)
+    print(f"all_embeddings shape: {all_embeddings.shape}")
+    dim = all_embeddings.shape[1]
+    index = faiss.IndexFlatIP(dim)
+    faiss.normalize_L2(all_embeddings)
+    index.add(all_embeddings)
+    for count, data in enumerate(tqdm(all_dataset)):
+        # get top 10 chunks
+        question_embeddings = all_dataset_embeddings[count]['question_embedding']
+        question_embeddings_normalized = np.array([question_embeddings], dtype=np.float32)
+        faiss.normalize_L2(question_embeddings_normalized)
+        scores_embed, ids_embed = index.search(question_embeddings_normalized, 10)
+        chunks = [all_texts[id] for id in ids_embed[0]]
+        scores_llm = []
        for chunk in chunks:
-            scores_embed.append(model.run(preprocess_reranker(question, preprocess=True), preprocess_reranker(chunk, preprocess=True, add_extra_word=False)))
+            if chunk in data["passage_positive"]:
+                scores_llm.append(1)
+            else:
+                scores_llm.append(0)
        # print(f"question {count}: {question}")
        # for i in range(len(scores_embed)):
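The new path embeds every question and passage once in batches, pools all passages into one flat inner-product index over L2-normalized vectors (inner product on unit vectors equals cosine similarity), then labels each of the top-10 retrieved chunks by membership in the question's passage_positive list. The core FAISS pattern, as a standalone sketch with toy data:

```python
import numpy as np
import faiss

rng = np.random.default_rng(0)
passages = rng.random((4, 3)).astype(np.float32)  # stand-ins for passage embeddings
faiss.normalize_L2(passages)                      # in place; IP on unit vectors == cosine
index = faiss.IndexFlatIP(passages.shape[1])
index.add(passages)

query = rng.random((1, 3)).astype(np.float32)
faiss.normalize_L2(query)                         # both sides must be normalized
scores, ids = index.search(query, 2)              # top-2 cosine scores and row indices
print(scores[0], ids[0])
```

IndexFlatIP is exact brute-force search, which is more than fast enough for the 1,000-question subset used here.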

View File

@@ -0,0 +1,15 @@
from sentence_transformers import SentenceTransformer


class model():
    def __init__(self):
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer("google/embeddinggemma-300m")

    def run(self, question: str, chunk: str) -> int:
        query_embeddings = self.model.encode_query(question)
        document_embeddings = self.model.encode_document(chunk)
        similarities = self.model.similarity(query_embeddings, document_embeddings)
        return similarities
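A hypothetical usage sketch for this wrapper: encode_query and encode_document apply EmbeddingGemma's query/document prompts, and similarity returns a [1, 1] tensor, so callers will want to unwrap it despite the -> int annotation:

```python
m = model()
score = m.run("what is retrieval augmented generation?",
              "RAG pipelines fetch supporting passages before generation ...")
print(float(score))  # e.g. 0.63 -- a cosine-style similarity, not an int
```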

View File

@@ -0,0 +1,17 @@
from sentence_transformers import SentenceTransformer


class model():
    def __init__(self):
        from sentence_transformers import SentenceTransformer
        # self.model = SentenceTransformer("./models/gemma/checkpoint-33246")
        self.model = SentenceTransformer("google/embeddinggemma-300m")
        self.model.load_adapter("./models/gemma/checkpoint-33246")

    def run(self, question: str, chunk: str) -> int:
        query_embeddings = self.model.encode_query(question)
        document_embeddings = self.model.encode_document(chunk)
        similarities = self.model.similarity(query_embeddings, document_embeddings)
        return similarities
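load_adapter here comes from the PEFT integration in sentence-transformers: the base EmbeddingGemma weights stay as published and the checkpoint directory only needs to contain the adapter files. A quick sanity check that the adapter actually changes the encoder, sketched under the assumption that the repo's checkpoint path is valid (everything else is illustrative):

```python
import numpy as np
from sentence_transformers import SentenceTransformer

base = SentenceTransformer("google/embeddinggemma-300m")
tuned = SentenceTransformer("google/embeddinggemma-300m")
tuned.load_adapter("./models/gemma/checkpoint-33246")  # PEFT adapter checkpoint

q = "sample query"
drift = np.abs(base.encode_query(q) - tuned.encode_query(q)).mean()
print(f"mean |delta| per dimension: {drift:.6f}")  # > 0 means the adapter is active
```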

View File

@@ -20,6 +20,7 @@
10-Synthetic-persian-qa-retrieval dataset : question = 223423, passage = 250000 : negative passages are not exactly different : needs preprocessing
evaluation : 50 questions of rahbar
no train
NDCG: 0.8452119768348717
Recall 7: 0.3373666606161222
@@ -30,16 +31,6 @@ Precision 7: 0.4714285714285715
Precision 12: 0.41999999999999993
Precision 20: 0.358
-train with 100
-NDCG: 0.8007791818263832
-Recall 7: 0.2617863643550479
-Recall 12: 0.3759745806720163
-Recall 20: 0.5564983103150418
-Recall Variant: 0.36642345327979325
-Precision 7: 0.3828571428571429
-Precision 12: 0.3449999999999999
-Precision 20: 0.311
train with 100 with lora
NDCG: 0.8432282495018343
Recall 7: 0.33695911259587386
@@ -50,4 +41,21 @@ Precision 7: 0.4685714285714285
Precision 12: 0.4099999999999999
Precision 20: 0.35200000000000004
-train with 100 with promt
+train with 33000 steps on all dataset
+NDCG: 0.8414338101165514
+Recall 7: 0.3118752420460591
+Recall 12: 0.4692991653842038
+Recall 20: 0.6261433602218365
+Recall Variant: 0.43146001721540145
+Precision 7: 0.4514285714285714
+Precision 12: 0.4049999999999999
+Precision 20: 0.348
+
+evaluation dataset_test : 1000 sample
+no train :
+NDCG: 0.991
+train with 33000 steps on all dataset :
+NDCG: 0.9975

View File

@@ -0,0 +1,37 @@
import json
from datasets import load_dataset
from tqdm import tqdm

names = ["MCINext/FEVER_FA_test_top_250_only_w_correct-v2", "MCINext/fiqa-fa-v2", "MCINext/HotpotQA_FA_test_top_250_only_w_correct-v2",
         "MCINext/MSMARCO_FA_test_top_250_only_w_correct-v2", "MCINext/NQ_FA_test_top_250_only_w_correct-v2", "MCINext/quora-fa-v2", "MCINext/scifact-fa-v2",
         "MCINext/synthetic-persian-chatbot-rag-faq-retrieval", "MCINext/synthetic-persian-qa-retrieval", "MCINext/trec-covid-fa-v2"]

for name in tqdm(names):
    print(f"loading {name}")
    dataset_qrel = load_dataset(name)["test"]
    dataset_corpus_list = load_dataset(name, data_files="corpus.jsonl")["train"]
    dataset_corpus = {}
    for data in dataset_corpus_list:
        dataset_corpus[data["_id"]] = data["text"]
    dataset_queries_list = load_dataset(name, data_files="queries.jsonl")["train"]
    dataset_queries = {}
    for data in dataset_queries_list:
        dataset_queries[data["_id"]] = data["text"]
    dataset = []
    print("start creating dataset")
    for data in dataset_qrel:
        if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
            dataset.append({
                "question": dataset_queries[data["query-id"]],
                "passage_positive": [dataset_corpus[data["corpus-id"]]],
                "passage_negative": [],
                "passage_negative_random": [],
            })
    print(f"length of dataset: {len(dataset)}")
    with open(f"./research_notebook/data/mci/{name.split('/')[-1]}_v2.json", "w") as f:
        json.dump(dataset, f, indent=4, ensure_ascii=False)
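Each qrel row is joined against the two id-to-text maps; rows whose query or corpus id is missing are dropped, and only positive pairs are kept, with the negative fields left empty (presumably for later negative mining). A toy illustration of the record shape this produces (values invented):

```python
dataset_queries = {"q1": "چرا آسمان آبی است؟"}   # toy Persian query (invented)
dataset_corpus = {"d9": "پراکندگی ریلی نور خورشید باعث آبی دیده شدن آسمان می‌شود."}
qrel = {"query-id": "q1", "corpus-id": "d9"}

record = {
    "question": dataset_queries[qrel["query-id"]],
    "passage_positive": [dataset_corpus[qrel["corpus-id"]]],
    "passage_negative": [],
    "passage_negative_random": [],
}
```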

View File

@@ -0,0 +1,30 @@
import json
from datasets import load_dataset

dataset_qrel = load_dataset("MCINext/scidocs-fa-v2")["test"]
dataset_corpus_list = load_dataset("MCINext/scidocs-fa-v2", data_files="corpus.jsonl")["train"]
dataset_corpus = {}
for data in dataset_corpus_list:
    dataset_corpus[data["_id"]] = data["text"]
dataset_queries_list = load_dataset("MCINext/scidocs-fa-v2", data_files="queries.jsonl")["train"]
dataset_queries = {}
for data in dataset_queries_list:
    dataset_queries[data["_id"]] = data["text"]
dataset = []
print("start creating dataset")
for data in dataset_qrel:
    if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
        dataset.append({
            "question": dataset_queries[data["query-id"]],
            "passage_positive": [dataset_corpus[data["corpus-id"]]],
            "passage_negative": [],
            "passage_negative_random": [],
        })
print(f"length of dataset: {len(dataset)}")
with open("./research_notebook/data/scidocs/scidocs_v2.json", "w") as f:
    json.dump(dataset, f, indent=4, ensure_ascii=False)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -52,12 +52,13 @@ def main(add_prompt, lora):
    args = SentenceTransformerTrainingArguments(
        output_dir="./models/gemma",
-        num_train_epochs=1,
+        # num_train_epochs=1,
+        max_steps=50,
        per_device_train_batch_size=32,
        learning_rate=2e-5,
        warmup_ratio=0.05,
-        logging_steps=train_dataset.num_rows,
-        report_to="none",
+        logging_steps=10,
+        report_to="tensorboard",
        save_steps=10000,
        save_total_limit=2,
    )
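A note on the changed arguments: in Hugging Face TrainingArguments, which SentenceTransformerTrainingArguments extends, max_steps overrides num_train_epochs whenever both are set, so commenting the epoch count out is redundant but makes the intent explicit. With per_device_train_batch_size=32, max_steps=50 touches at most 50 × 32 = 1,600 examples per device, and logging_steps=10 with report_to="tensorboard" yields five logged loss points per such run.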

train/gemma/test.py Normal file
View File

@@ -0,0 +1,88 @@
import torch
from sentence_transformers import SentenceTransformer

model_id = "google/embeddinggemma-300M"
model = SentenceTransformer(model_id)
print("Original model")
k = 0
for name, param in model.named_parameters():
    print(name)
    print(param)
    k += 1
    if k > 1:
        break

model_id = "./models/gemma/checkpoint-33246"
model_lora = SentenceTransformer(model_id)
print("LoRA model")
k = 0
for name, param in model_lora.named_parameters():
    print(name)
    print(param)
    k += 1
    if k == 3:
        a = param
    if k == 4:
        b = param
    if k > 3:
        delta = (b @ a) * 2.0
        print(delta)
        break
print(k)


def compare_lora_to_base(model_lora, model_base, lora_scale=1.0):
    """
    Compare how much each weight matrix has changed between
    the base model and the LoRA-adapted model.
    """
    report = []
    total_change = 0.0
    total_params = 0
    has_lora = []
    no_lora = []
    for name, module in model_lora.named_modules():
        # LoRA modules typically have lora_A and lora_B
        if hasattr(module, "lora_A") and hasattr(module, "lora_B"):
            A = module.lora_A["default"].weight.data
            B = module.lora_B["default"].weight.data
            delta = (B @ A) * lora_scale
            # Find matching base layer
            try:
                base_weight = model_base.get_submodule(name).weight.data
                has_lora.append(name)
            except Exception:
                no_lora.append(name)
                continue  # no matching base weight, nothing to compare
            new_weight = base_weight + delta
            diff = (new_weight - base_weight).abs()
            relative_change = diff / (base_weight.abs() + 1e-8)
            mean_change = relative_change.mean().item() * 100
            report.append((name, mean_change))
            total_change += relative_change.sum().item()
            total_params += relative_change.numel()
        else:
            no_lora.append(name)
    print("has_lora", has_lora)
    print("no_lora", no_lora)
    print("lora num", len(has_lora))
    print("no lora num", len(no_lora))
    overall_change = (total_change / total_params) * 100 if total_params > 0 else 0.0
    return report, overall_change


report, overall_change = compare_lora_to_base(model_lora, model, lora_scale=2.0)
print(f"overall_change: {overall_change}")