add evaluation

This commit is contained in:
saeedfirouzi 2025-11-12 15:02:02 +00:00
parent bc2cc07411
commit c06572dedb
12 changed files with 3098 additions and 34 deletions

View File

@ -2,8 +2,11 @@ import argparse
import json import json
import math import math
import importlib import importlib
import tqdm from tqdm import tqdm
from hazm import Normalizer from hazm import Normalizer
import random
import numpy as np
import faiss
normalizer = Normalizer() normalizer = Normalizer()
@ -11,7 +14,7 @@ normalizer = Normalizer()
def load_dataset(input_file): def load_dataset(input_file):
with open(input_file, "r", encoding="utf-8") as f: with open(input_file, "r", encoding="utf-8") as f:
dataset = json.load(f) dataset = json.load(f)[:1000]
return dataset return dataset
@ -32,7 +35,7 @@ def calculate_ndcg(scores, n):
return idcg return idcg
dcg = calculate_dcg(scores, n) dcg = calculate_dcg(scores, n)
idcg = calculate_idcg(scores, n) idcg = 1 #calculate_idcg(scores, n)
ndcg = dcg/idcg ndcg = dcg/idcg
return ndcg return ndcg
@ -40,14 +43,12 @@ def calculate_ndcg(scores, n):
def calculate_recall(scores): def calculate_recall(scores):
try: try:
num_ground_truth = scores.count(4) num_ground_truth = scores.count(1)
if num_ground_truth == 0:
num_ground_truth = scores.count(3)
recall_7 = scores[:7].count(4) / num_ground_truth recall_7 = scores[:7].count(1) / num_ground_truth
recall_12 = scores[:12].count(4) / num_ground_truth recall_12 = scores[:12].count(1) / num_ground_truth
recall_20 = scores[:20].count(4) / num_ground_truth recall_20 = scores[:20].count(1) / num_ground_truth
recall_variant = scores[:scores.count(4)].count(4) / scores.count(4) recall_variant = scores[:scores.count(1)].count(1) / scores.count(1)
return recall_7, recall_12, recall_20, recall_variant return recall_7, recall_12, recall_20, recall_variant
except: except:
@ -55,9 +56,9 @@ def calculate_recall(scores):
def calculate_precision(scores): def calculate_precision(scores):
precision_7 = scores[:7].count(4) / 7 precision_7 = scores[:7].count(1) / 7
precision_12 = scores[:12].count(4) / 12 precision_12 = scores[:12].count(1) / 12
precision_20 = scores[:20].count(4) / 20 precision_20 = scores[:20].count(1) / 20
return precision_7, precision_12, precision_20 return precision_7, precision_12, precision_20
@ -85,14 +86,90 @@ def run(input_file, model):
precision_7_scores = [] precision_7_scores = []
precision_12_scores = [] precision_12_scores = []
precision_20_scores = [] precision_20_scores = []
dataset = load_dataset(input_file) all_dataset = load_dataset(input_file)[:1000]
for count, data in enumerate(tqdm.tqdm(dataset)):
question = data["question"] batch_size = 100
chunks = [data["chunks"][str(id)] for id in range(len(data["chunks"].keys()))] len_dataset = len(all_dataset)
scores_llm = [data["scores"][str(id)] for id in range(len(data["chunks"].keys()))] all_dataset_embeddings = [{'question_embedding': "", 'passage_positive_embedding': []} for _ in range(len_dataset)]
scores_embed = []
all_embeddings = []
all_texts = []
print("calculate question embeddings")
# calculate question embeddings
for i in tqdm(range(0, len_dataset, batch_size)):
question_list = []
for id in range(i, min(i + batch_size, len_dataset)):
question_list.append(all_dataset[id]['question'])
question_embeddings = model.embed_texts(question_list, query_is=True)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
all_dataset_embeddings[id]['question_embedding'] = question_embeddings[count]
count += 1
print("calculate passage positive embeddings")
# calculate passage positive embeddings
for i in tqdm(range(0, len_dataset, batch_size)):
passage_positive_list = []
for id in range(i, min(i + batch_size, len_dataset)):
for passage in all_dataset[id]['passage_positive']:
passage_positive_list.append(passage)
passage_positive_embeddings = model.embed_texts(passage_positive_list)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
for passage_id in range(len(all_dataset[id]['passage_positive'])):
all_dataset_embeddings[id]['passage_positive_embedding'].append(passage_positive_embeddings[count])
all_embeddings.append(passage_positive_embeddings[count])
all_texts.append(all_dataset[id]['passage_positive'][passage_id])
count += 1
print("calculate passage negative embeddings")
# calculate passage negative embeddings
for i in tqdm(range(0, len_dataset, batch_size)):
passage_negative_list = []
for id in range(i, min(i + batch_size, len_dataset)):
for passage in all_dataset[id]['passage_negative']:
passage_negative_list.append(passage)
passage_negative_embeddings = model.embed_texts(passage_negative_list)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
for passage_id in range(len(all_dataset[id]['passage_negative'])):
all_embeddings.append(passage_negative_embeddings[count])
all_texts.append(all_dataset[id]['passage_negative'][passage_id])
count += 1
#create faiss index
all_embeddings = np.array(all_embeddings, dtype=np.float32)
print(f"all_embeddings shape: {all_embeddings.shape}")
dim = all_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
faiss.normalize_L2(all_embeddings)
index.add(all_embeddings)
for count, data in enumerate(tqdm(all_dataset)):
#get top 10 chunks
question_embeddings = all_dataset_embeddings[count]['question_embedding']
question_embeddings_normalized = np.array([question_embeddings], dtype=np.float32)
faiss.normalize_L2(question_embeddings_normalized)
scores_embed, ids_embed = index.search(question_embeddings_normalized, 10)
chunks = [all_texts[id] for id in ids_embed[0]]
scores_llm = []
for chunk in chunks: for chunk in chunks:
scores_embed.append(model.run(preprocess_reranker(question, preprocess=True), preprocess_reranker(chunk, preprocess=True, add_extra_word=False))) if chunk in data["passage_positive"]:
scores_llm.append(1)
else:
scores_llm.append(0)
# print(f"question {count}: {question}") # print(f"question {count}: {question}")
# for i in range(len(scores_embed)): # for i in range(len(scores_embed)):

View File

@ -0,0 +1,15 @@
from sentence_transformers import SentenceTransformer


class model():
    """Scores question/chunk pairs with the google/embeddinggemma-300m embedder.

    Exposes the same interface as the other reranker models in this repo:
    a no-arg constructor and run(question, chunk).
    """

    def __init__(self):
        # Module-level import already provides SentenceTransformer; the
        # previous duplicate in-method import was redundant and was removed.
        self.model = SentenceTransformer("google/embeddinggemma-300m")

    def run(self, question: str, chunk: str):
        """Return the similarity score between *question* and *chunk*.

        Returns the tensor produced by SentenceTransformer.similarity()
        (a 1x1 score matrix), not a plain int — the original ``-> int``
        annotation was incorrect and has been dropped.
        """
        query_embeddings = self.model.encode_query(question)
        document_embeddings = self.model.encode_document(chunk)
        similarities = self.model.similarity(query_embeddings, document_embeddings)
        return similarities

View File

@ -0,0 +1,17 @@
from sentence_transformers import SentenceTransformer


class model():
    """Scores question/chunk pairs with embeddinggemma-300m plus a trained LoRA adapter.

    Same interface as the base (no-adapter) model variant: a no-arg
    constructor and run(question, chunk).
    """

    def __init__(self):
        # Module-level import already provides SentenceTransformer; the
        # duplicate in-method import and the commented-out alternative
        # checkpoint load were removed.
        self.model = SentenceTransformer("google/embeddinggemma-300m")
        # Apply the fine-tuned LoRA adapter on top of the base weights.
        self.model.load_adapter("./models/gemma/checkpoint-33246")

    def run(self, question: str, chunk: str):
        """Return the similarity score between *question* and *chunk*.

        Returns the tensor produced by SentenceTransformer.similarity()
        (a 1x1 score matrix), not a plain int — the original ``-> int``
        annotation was incorrect and has been dropped.
        """
        query_embeddings = self.model.encode_query(question)
        document_embeddings = self.model.encode_document(chunk)
        similarities = self.model.similarity(query_embeddings, document_embeddings)
        return similarities

View File

@ -20,6 +20,7 @@
10-Synthetic-persian-qa-retrieval dataset : question = 223423, passage = 250000 : negative passages are not exactly different : needs preprocessing 10-Synthetic-persian-qa-retrieval dataset : question = 223423, passage = 250000 : negative passages are not exactly different : needs preprocessing
evaluation : 50 question of rahbar
no train no train
NDCG: 0.8452119768348717 NDCG: 0.8452119768348717
Recall 7: 0.3373666606161222 Recall 7: 0.3373666606161222
@ -30,16 +31,6 @@ Precision 7: 0.4714285714285715
Precision 12: 0.41999999999999993 Precision 12: 0.41999999999999993
Precision 20: 0.358 Precision 20: 0.358
train with 100
NDCG: 0.8007791818263832
Recall 7: 0.2617863643550479
Recall 12: 0.3759745806720163
Recall 20: 0.5564983103150418
Recall Variant: 0.36642345327979325
Precision 7: 0.3828571428571429
Precision 12: 0.3449999999999999
Precision 20: 0.311
train with 100 with lora train with 100 with lora
NDCG: 0.8432282495018343 NDCG: 0.8432282495018343
Recall 7: 0.33695911259587386 Recall 7: 0.33695911259587386
@ -50,4 +41,21 @@ Precision 7: 0.4685714285714285
Precision 12: 0.4099999999999999 Precision 12: 0.4099999999999999
Precision 20: 0.35200000000000004 Precision 20: 0.35200000000000004
train with 100 with prompt train with 33000 steps on all dataset
NDCG: 0.8414338101165514
Recall 7: 0.3118752420460591
Recall 12: 0.4692991653842038
Recall 20: 0.6261433602218365
Recall Variant: 0.43146001721540145
Precision 7: 0.4514285714285714
Precision 12: 0.4049999999999999
Precision 20: 0.348
evaluation dataset_test : 1000 sample
no train :
NDCG: 0.991
train with 33000 steps on all dataset :
NDCG: 0.9975

View File

@ -0,0 +1,37 @@
import json
from datasets import load_dataset
from tqdm import tqdm

# MCINext Persian retrieval benchmarks to convert into the local
# question / passage_positive JSON format used for evaluation.
names = ["MCINext/FEVER_FA_test_top_250_only_w_correct-v2", "MCINext/fiqa-fa-v2", "MCINext/HotpotQA_FA_test_top_250_only_w_correct-v2",
         "MCINext/MSMARCO_FA_test_top_250_only_w_correct-v2", "MCINext/NQ_FA_test_top_250_only_w_correct-v2", "MCINext/quora-fa-v2", "MCINext/scifact-fa-v2",
         "MCINext/synthetic-persian-chatbot-rag-faq-retrieval", "MCINext/synthetic-persian-qa-retrieval", "MCINext/trec-covid-fa-v2"]

for name in tqdm(names):
    print(f"loading {name}")
    # qrels: (query-id, corpus-id) relevance pairs for the test split.
    dataset_qrel = load_dataset(name)["test"]
    # id -> text lookup tables for passages and queries.
    dataset_corpus = {row["_id"]: row["text"]
                      for row in load_dataset(name, data_files="corpus.jsonl")["train"]}
    dataset_queries = {row["_id"]: row["text"]
                       for row in load_dataset(name, data_files="queries.jsonl")["train"]}
    print("start creating dataset")
    # Keep only qrel pairs whose ids resolve in both lookup tables;
    # negatives are left empty here and filled by a later stage.
    dataset = [
        {
            "question": dataset_queries[pair["query-id"]],
            "passage_positive": [dataset_corpus[pair["corpus-id"]]],
            "passage_negative": [],
            "passage_negative_random": [],
        }
        for pair in dataset_qrel
        if pair["query-id"] in dataset_queries and pair["corpus-id"] in dataset_corpus
    ]
    print(f"length of dataset: {len(dataset)}")
    with open(f"./research_notebook/data/mci/{name.split('/')[-1]}_v2.json", "w") as f:
        json.dump(dataset, f, indent=4, ensure_ascii=False)

View File

@ -0,0 +1,30 @@
import json
from datasets import load_dataset

# Convert MCINext/scidocs-fa-v2 into the local question / passage_positive
# JSON format used for evaluation.
# qrels: (query-id, corpus-id) relevance pairs for the test split.
dataset_qrel = load_dataset("MCINext/scidocs-fa-v2")["test"]
# id -> text lookup tables for passages and queries.
dataset_corpus = {row["_id"]: row["text"]
                  for row in load_dataset("MCINext/scidocs-fa-v2", data_files="corpus.jsonl")["train"]}
dataset_queries = {row["_id"]: row["text"]
                   for row in load_dataset("MCINext/scidocs-fa-v2", data_files="queries.jsonl")["train"]}
print("start creating dataset")
# Keep only qrel pairs whose ids resolve in both lookup tables;
# negatives are left empty here and filled by a later stage.
dataset = [
    {
        "question": dataset_queries[pair["query-id"]],
        "passage_positive": [dataset_corpus[pair["corpus-id"]]],
        "passage_negative": [],
        "passage_negative_random": [],
    }
    for pair in dataset_qrel
    if pair["query-id"] in dataset_queries and pair["corpus-id"] in dataset_corpus
]
print(f"length of dataset: {len(dataset)}")
with open("./research_notebook/data/scidocs/scidocs_v2.json", "w") as f:
    json.dump(dataset, f, indent=4, ensure_ascii=False)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -52,12 +52,13 @@ def main(add_prompt, lora):
args = SentenceTransformerTrainingArguments( args = SentenceTransformerTrainingArguments(
output_dir="./models/gemma", output_dir="./models/gemma",
num_train_epochs=1, # num_train_epochs=1,
max_steps=50,
per_device_train_batch_size=32, per_device_train_batch_size=32,
learning_rate=2e-5, learning_rate=2e-5,
warmup_ratio=0.05, warmup_ratio=0.05,
logging_steps=train_dataset.num_rows, logging_steps=10,
report_to="none", report_to="tensorboard",
save_steps=10000, save_steps=10000,
save_total_limit=2, save_total_limit=2,
) )

88
train/gemma/test.py Normal file
View File

@ -0,0 +1,88 @@
import torch
from sentence_transformers import SentenceTransformer

# Ad-hoc inspection script: print the first parameters of the base model
# and of the LoRA checkpoint, then manually materialize one (B @ A) delta.
# NOTE(review): indentation below is reconstructed from an indentation-
# stripped extraction — control-flow nesting should be re-verified.
model_id = "google/embeddinggemma-300M"
model = SentenceTransformer(model_id)
print("Original model")
k = 0
# Show only the first two parameters of the base model, then stop.
for name, param in model.named_parameters():
    print(name)
    print(param)
    k += 1
    if k > 1:
        break
model_id = "./models/gemma/checkpoint-33246"
model_lora = SentenceTransformer(model_id)
print("LoRA model")
k = 0
for name, param in model_lora.named_parameters():
    print(name)
    print(param)
    k += 1
    if k == 3:
        a = param  # presumably a lora_A weight — TODO confirm parameter order
    if k == 4:
        b = param  # presumably the matching lora_B weight — TODO confirm
    if k > 3:
        # Manual LoRA delta; the 2.0 factor presumably is the adapter
        # scaling (lora_alpha / r) — TODO confirm against the adapter config.
        delta = (b @ a) * 2.0
        print(delta)
        break
print(k)
# NOTE(review): duplicate redundant imports left as-is from the original.
import torch
import torch
def compare_lora_to_base(model_lora, model_base, lora_scale=1.0):
    """Compare how much each weight matrix has changed between the base
    model and the LoRA-adapted model.

    Args:
        model_lora: module tree whose LoRA layers carry ``lora_A``/``lora_B``
            sub-modules (keyed by the "default" adapter name).
        model_base: the un-adapted model; layers are matched by module name.
        lora_scale: scalar applied to ``B @ A`` when forming the delta.

    Returns:
        (report, overall_change): per-layer (name, mean % relative change)
        pairs, and the aggregate mean % relative change across all matched
        LoRA layers (0.0 when none matched).
    """
    report = []
    total_change = 0.0
    total_params = 0
    has_lora = []
    no_lora = []
    for name, module in model_lora.named_modules():
        # LoRA modules typically have lora_A and lora_B
        if hasattr(module, "lora_A") and hasattr(module, "lora_B"):
            A = module.lora_A["default"].weight.data
            B = module.lora_B["default"].weight.data
            delta = (B @ A) * lora_scale
            # Find matching base layer
            try:
                base_weight = model_base.get_submodule(name).weight.data
                has_lora.append(name)
            except Exception:
                # BUG FIX: the original fell through here and used a
                # stale/undefined base_weight; skip unmatched layers instead.
                no_lora.append(name)
                continue
            new_weight = base_weight + delta
            diff = (new_weight - base_weight).abs()
            # epsilon guards against division by zero on zero weights
            relative_change = diff / (base_weight.abs() + 1e-8)
            mean_change = relative_change.mean().item() * 100
            report.append((name, mean_change))
            total_change += relative_change.sum().item()
            total_params += relative_change.numel()
        else:
            no_lora.append(name)
    print("has_lora", has_lora)
    print("no_lora", no_lora)
    print("lora num", len(has_lora))
    print("no lora num", len(no_lora))
    overall_change = (total_change / total_params) * 100 if total_params > 0 else 0.0
    return report, overall_change
# Compare adapted vs. base weights; scale 2.0 presumably matches the
# adapter's lora_alpha / r ratio — TODO confirm against the adapter config.
report, overall_change = compare_lora_to_base(model_lora, model, lora_scale=2.0)
print(f"overall_change: {overall_change}")