add evaluation

saeedfirouzi 2025-11-12 15:02:02 +00:00
parent bc2cc07411
commit c06572dedb
12 changed files with 3098 additions and 34 deletions

View File

@@ -2,8 +2,11 @@ import argparse
import json
import math
import importlib
-import tqdm
+from tqdm import tqdm
from hazm import Normalizer
import random
+import numpy as np
+import faiss

normalizer = Normalizer()
@@ -11,7 +14,7 @@ normalizer = Normalizer()

def load_dataset(input_file):
    with open(input_file, "r", encoding="utf-8") as f:
-        dataset = json.load(f)
+        dataset = json.load(f)[:1000]
    return dataset
@@ -32,7 +35,7 @@ def calculate_ndcg(scores, n):
        return idcg
    dcg = calculate_dcg(scores, n)
-    idcg = calculate_idcg(scores, n)
+    idcg = 1 #calculate_idcg(scores, n)
    ndcg = dcg/idcg
    return ndcg
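Worth flagging: with idcg pinned to 1, the value reported as "NDCG" is really unnormalized DCG, so it is only comparable across runs over the same candidate lists. A minimal standalone sketch of what the function now computes for binary labels, assuming the textbook log2 discount (the helper name is mine, not the repo's):

```python
import math

def dcg_binary(scores, n):
    # scores: 0/1 relevance labels in the order the retriever ranked them
    return sum(rel / math.log2(rank + 2) for rank, rel in enumerate(scores[:n]))

# With idcg = 1, "ndcg" equals this value directly.
print(dcg_binary([1, 0, 1, 0], n=4))  # 1/log2(2) + 1/log2(4) = 1.5
```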
@@ -40,14 +43,12 @@ def calculate_ndcg(scores, n):

def calculate_recall(scores):
    try:
-        num_ground_truth = scores.count(4)
-        if num_ground_truth == 0:
-            num_ground_truth = scores.count(3)
+        num_ground_truth = scores.count(1)
-        recall_7 = scores[:7].count(4) / num_ground_truth
-        recall_12 = scores[:12].count(4) / num_ground_truth
-        recall_20 = scores[:20].count(4) / num_ground_truth
-        recall_variant = scores[:scores.count(4)].count(4) / scores.count(4)
+        recall_7 = scores[:7].count(1) / num_ground_truth
+        recall_12 = scores[:12].count(1) / num_ground_truth
+        recall_20 = scores[:20].count(1) / num_ground_truth
+        recall_variant = scores[:scores.count(1)].count(1) / scores.count(1)
        return recall_7, recall_12, recall_20, recall_variant
    except:
@@ -55,9 +56,9 @@ def calculate_recall(scores):

def calculate_precision(scores):
-    precision_7 = scores[:7].count(4) / 7
-    precision_12 = scores[:12].count(4) / 12
-    precision_20 = scores[:20].count(4) / 20
+    precision_7 = scores[:7].count(1) / 7
+    precision_12 = scores[:12].count(1) / 12
+    precision_20 = scores[:20].count(1) / 20
    return precision_7, precision_12, precision_20
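Since the labels are now binary, recall@k and precision@k reduce to counting hits in the top k. A compact restatement of what the two functions above compute (a sketch; scores is assumed to be the 0/1 labels of retrieved passages in rank order):

```python
def recall_precision_at_k(scores, k):
    num_relevant = scores.count(1)   # positives anywhere in the candidate list
    hits = scores[:k].count(1)       # positives inside the top k
    recall = hits / num_relevant if num_relevant else 0.0
    precision = hits / k
    return recall, precision

print(recall_precision_at_k([1, 0, 1, 0, 0], k=3))  # (1.0, 0.6666666666666666)
```

Note the denominator: recall is measured against the positives that made it into the retrieved list, not against all known positives for the question.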
@@ -85,14 +86,90 @@ def run(input_file, model):
    precision_7_scores = []
    precision_12_scores = []
    precision_20_scores = []
-    dataset = load_dataset(input_file)
-    for count, data in enumerate(tqdm.tqdm(dataset)):
-        question = data["question"]
-        chunks = [data["chunks"][str(id)] for id in range(len(data["chunks"].keys()))]
-        scores_llm = [data["scores"][str(id)] for id in range(len(data["chunks"].keys()))]
-        scores_embed = []
+    all_dataset = load_dataset(input_file)[:1000]
+    batch_size = 100
+    len_dataset = len(all_dataset)
+    all_dataset_embeddings = [{'question_embedding': "", 'passage_positive_embedding': []} for _ in range(len_dataset)]
+    all_embeddings = []
+    all_texts = []
+    print("calculate question embeddings")
+    # calculate question embeddings
+    for i in tqdm(range(0, len_dataset, batch_size)):
+        question_list = []
+        for id in range(i, min(i + batch_size, len_dataset)):
+            question_list.append(all_dataset[id]['question'])
+        question_embeddings = model.embed_texts(question_list, query_is=True)
+        count = 0
+        for id in range(i, min(i + batch_size, len_dataset)):
+            all_dataset_embeddings[id]['question_embedding'] = question_embeddings[count]
+            count += 1
+    print("calculate passage positive embeddings")
+    # calculate passage positive embeddings
+    for i in tqdm(range(0, len_dataset, batch_size)):
+        passage_positive_list = []
+        for id in range(i, min(i + batch_size, len_dataset)):
+            for passage in all_dataset[id]['passage_positive']:
+                passage_positive_list.append(passage)
+        passage_positive_embeddings = model.embed_texts(passage_positive_list)
+        count = 0
+        for id in range(i, min(i + batch_size, len_dataset)):
+            for passage_id in range(len(all_dataset[id]['passage_positive'])):
+                all_dataset_embeddings[id]['passage_positive_embedding'].append(passage_positive_embeddings[count])
+                all_embeddings.append(passage_positive_embeddings[count])
+                all_texts.append(all_dataset[id]['passage_positive'][passage_id])
+                count += 1
+    print("calculate passage negative embeddings")
+    # calculate passage negative embeddings
+    for i in tqdm(range(0, len_dataset, batch_size)):
+        passage_negative_list = []
+        for id in range(i, min(i + batch_size, len_dataset)):
+            for passage in all_dataset[id]['passage_negative']:
+                passage_negative_list.append(passage)
+        passage_negative_embeddings = model.embed_texts(passage_negative_list)
+        count = 0
+        for id in range(i, min(i + batch_size, len_dataset)):
+            for passage_id in range(len(all_dataset[id]['passage_negative'])):
+                all_embeddings.append(passage_negative_embeddings[count])
+                all_texts.append(all_dataset[id]['passage_negative'][passage_id])
+                count += 1
+    # create faiss index
+    all_embeddings = np.array(all_embeddings, dtype=np.float32)
+    print(f"all_embeddings shape: {all_embeddings.shape}")
+    dim = all_embeddings.shape[1]
+    index = faiss.IndexFlatIP(dim)
+    faiss.normalize_L2(all_embeddings)
+    index.add(all_embeddings)
+    for count, data in enumerate(tqdm(all_dataset)):
+        # get top 10 chunks
+        question_embeddings = all_dataset_embeddings[count]['question_embedding']
+        question_embeddings_normalized = np.array([question_embeddings], dtype=np.float32)
+        faiss.normalize_L2(question_embeddings_normalized)
+        scores_embed, ids_embed = index.search(question_embeddings_normalized, 10)
+        chunks = [all_texts[id] for id in ids_embed[0]]
+        scores_llm = []
        for chunk in chunks:
-            scores_embed.append(model.run(preprocess_reranker(question, preprocess=True), preprocess_reranker(chunk, preprocess=True, add_extra_word=False)))
+            if chunk in data["passage_positive"]:
+                scores_llm.append(1)
+            else:
+                scores_llm.append(0)
        # print(f"question {count}: {question}")
        # for i in range(len(scores_embed)):
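The new path embeds every question and passage once in batches, pools all passages into one flat inner-product index over L2-normalized vectors (inner product on unit vectors equals cosine similarity), then labels each of the top-10 retrieved chunks by membership in the question's passage_positive list. The core FAISS pattern, as a standalone sketch with toy data:

```python
import numpy as np
import faiss

rng = np.random.default_rng(0)
passages = rng.random((4, 3)).astype(np.float32)  # stand-ins for passage embeddings
faiss.normalize_L2(passages)                      # in place; IP on unit vectors == cosine
index = faiss.IndexFlatIP(passages.shape[1])
index.add(passages)

query = rng.random((1, 3)).astype(np.float32)
faiss.normalize_L2(query)                         # both sides must be normalized
scores, ids = index.search(query, 2)              # top-2 cosine scores and row indices
print(scores[0], ids[0])
```

IndexFlatIP is exact brute-force search, which is more than fast enough for the 1,000-question subset used here.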

View File

@@ -0,0 +1,15 @@
from sentence_transformers import SentenceTransformer


class model():
    def __init__(self):
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer("google/embeddinggemma-300m")

    def run(self, question: str, chunk: str) -> int:
        query_embeddings = self.model.encode_query(question)
        document_embeddings = self.model.encode_document(chunk)
        similarities = self.model.similarity(query_embeddings, document_embeddings)
        return similarities
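A hypothetical usage sketch for this wrapper: encode_query and encode_document apply EmbeddingGemma's query/document prompts, and similarity returns a [1, 1] tensor, so callers will want to unwrap it despite the -> int annotation:

```python
m = model()
score = m.run("what is retrieval augmented generation?",
              "RAG pipelines fetch supporting passages before generation ...")
print(float(score))  # e.g. 0.63 -- a cosine-style similarity, not an int
```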

View File

@@ -0,0 +1,17 @@
from sentence_transformers import SentenceTransformer


class model():
    def __init__(self):
        from sentence_transformers import SentenceTransformer
        # self.model = SentenceTransformer("./models/gemma/checkpoint-33246")
        self.model = SentenceTransformer("google/embeddinggemma-300m")
        self.model.load_adapter("./models/gemma/checkpoint-33246")

    def run(self, question: str, chunk: str) -> int:
        query_embeddings = self.model.encode_query(question)
        document_embeddings = self.model.encode_document(chunk)
        similarities = self.model.similarity(query_embeddings, document_embeddings)
        return similarities
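load_adapter here comes from the PEFT integration in sentence-transformers: the base EmbeddingGemma weights stay as published and the checkpoint directory only needs to contain the adapter files. A quick sanity check that the adapter actually changes the encoder, sketched under the assumption that the repo's checkpoint path is valid (everything else is illustrative):

```python
import numpy as np
from sentence_transformers import SentenceTransformer

base = SentenceTransformer("google/embeddinggemma-300m")
tuned = SentenceTransformer("google/embeddinggemma-300m")
tuned.load_adapter("./models/gemma/checkpoint-33246")  # PEFT adapter checkpoint

q = "sample query"
drift = np.abs(base.encode_query(q) - tuned.encode_query(q)).mean()
print(f"mean |delta| per dimension: {drift:.6f}")  # > 0 means the adapter is active
```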

View File

@@ -20,6 +20,7 @@
10-Synthetic-persian-qa-retrieval dataset : question = 223423, passage = 250000 : negative passages are not exactly different : needs preprocessing
evaluation : 50 questions of rahbar
no train
NDCG: 0.8452119768348717
Recall 7: 0.3373666606161222
@@ -30,16 +31,6 @@ Precision 7: 0.4714285714285715
Precision 12: 0.41999999999999993
Precision 20: 0.358
-train with 100
-NDCG: 0.8007791818263832
-Recall 7: 0.2617863643550479
-Recall 12: 0.3759745806720163
-Recall 20: 0.5564983103150418
-Recall Variant: 0.36642345327979325
-Precision 7: 0.3828571428571429
-Precision 12: 0.3449999999999999
-Precision 20: 0.311
train with 100 with lora
NDCG: 0.8432282495018343
Recall 7: 0.33695911259587386
@@ -50,4 +41,21 @@ Precision 7: 0.4685714285714285
Precision 12: 0.4099999999999999
Precision 20: 0.35200000000000004
-train with 100 with promt
+train with 33000 steps on all dataset
+NDCG: 0.8414338101165514
+Recall 7: 0.3118752420460591
+Recall 12: 0.4692991653842038
+Recall 20: 0.6261433602218365
+Recall Variant: 0.43146001721540145
+Precision 7: 0.4514285714285714
+Precision 12: 0.4049999999999999
+Precision 20: 0.348
+
+evaluation dataset_test : 1000 sample
+no train :
+NDCG: 0.991
+train with 33000 steps on all dataset :
+NDCG: 0.9975

View File

@@ -0,0 +1,37 @@
import json
from datasets import load_dataset
from tqdm import tqdm

names = ["MCINext/FEVER_FA_test_top_250_only_w_correct-v2", "MCINext/fiqa-fa-v2", "MCINext/HotpotQA_FA_test_top_250_only_w_correct-v2",
         "MCINext/MSMARCO_FA_test_top_250_only_w_correct-v2", "MCINext/NQ_FA_test_top_250_only_w_correct-v2", "MCINext/quora-fa-v2", "MCINext/scifact-fa-v2",
         "MCINext/synthetic-persian-chatbot-rag-faq-retrieval", "MCINext/synthetic-persian-qa-retrieval", "MCINext/trec-covid-fa-v2"]

for name in tqdm(names):
    print(f"loading {name}")
    dataset_qrel = load_dataset(name)["test"]
    dataset_corpus_list = load_dataset(name, data_files="corpus.jsonl")["train"]
    dataset_corpus = {}
    for data in dataset_corpus_list:
        dataset_corpus[data["_id"]] = data["text"]
    dataset_queries_list = load_dataset(name, data_files="queries.jsonl")["train"]
    dataset_queries = {}
    for data in dataset_queries_list:
        dataset_queries[data["_id"]] = data["text"]
    dataset = []
    print("start creating dataset")
    for data in dataset_qrel:
        if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
            dataset.append({
                "question": dataset_queries[data["query-id"]],
                "passage_positive": [dataset_corpus[data["corpus-id"]]],
                "passage_negative": [],
                "passage_negative_random": [],
            })
    print(f"length of dataset: {len(dataset)}")
    with open(f"./research_notebook/data/mci/{name.split('/')[-1]}_v2.json", "w") as f:
        json.dump(dataset, f, indent=4, ensure_ascii=False)
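Each qrel row is joined against the two id-to-text maps; rows whose query or corpus id is missing are dropped, and only positive pairs are kept, with the negative fields left empty (presumably for later negative mining). A toy illustration of the record shape this produces (values invented):

```python
dataset_queries = {"q1": "چرا آسمان آبی است؟"}   # toy Persian query (invented)
dataset_corpus = {"d9": "پراکندگی ریلی نور خورشید باعث آبی دیده شدن آسمان می‌شود."}
qrel = {"query-id": "q1", "corpus-id": "d9"}

record = {
    "question": dataset_queries[qrel["query-id"]],
    "passage_positive": [dataset_corpus[qrel["corpus-id"]]],
    "passage_negative": [],
    "passage_negative_random": [],
}
```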

View File

@@ -0,0 +1,30 @@
import json
from datasets import load_dataset

dataset_qrel = load_dataset("MCINext/scidocs-fa-v2")["test"]
dataset_corpus_list = load_dataset("MCINext/scidocs-fa-v2", data_files="corpus.jsonl")["train"]
dataset_corpus = {}
for data in dataset_corpus_list:
    dataset_corpus[data["_id"]] = data["text"]
dataset_queries_list = load_dataset("MCINext/scidocs-fa-v2", data_files="queries.jsonl")["train"]
dataset_queries = {}
for data in dataset_queries_list:
    dataset_queries[data["_id"]] = data["text"]
dataset = []
print("start creating dataset")
for data in dataset_qrel:
    if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
        dataset.append({
            "question": dataset_queries[data["query-id"]],
            "passage_positive": [dataset_corpus[data["corpus-id"]]],
            "passage_negative": [],
            "passage_negative_random": [],
        })
print(f"length of dataset: {len(dataset)}")
with open("./research_notebook/data/scidocs/scidocs_v2.json", "w") as f:
    json.dump(dataset, f, indent=4, ensure_ascii=False)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large

View File

@@ -52,12 +52,13 @@ def main(add_prompt, lora):
    args = SentenceTransformerTrainingArguments(
        output_dir="./models/gemma",
-        num_train_epochs=1,
+        # num_train_epochs=1,
+        max_steps=50,
        per_device_train_batch_size=32,
        learning_rate=2e-5,
        warmup_ratio=0.05,
-        logging_steps=train_dataset.num_rows,
-        report_to="none",
+        logging_steps=10,
+        report_to="tensorboard",
        save_steps=10000,
        save_total_limit=2,
    )
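A note on the changed arguments: in Hugging Face TrainingArguments, which SentenceTransformerTrainingArguments extends, max_steps overrides num_train_epochs whenever both are set, so commenting the epoch count out is redundant but makes the intent explicit. With per_device_train_batch_size=32, max_steps=50 touches at most 50 × 32 = 1,600 examples per device, and logging_steps=10 with report_to="tensorboard" yields five logged loss points per such run.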

train/gemma/test.py Normal file
View File

@@ -0,0 +1,88 @@
import torch
from sentence_transformers import SentenceTransformer

model_id = "google/embeddinggemma-300M"
model = SentenceTransformer(model_id)
print("Original model")
k = 0
for name, param in model.named_parameters():
    print(name)
    print(param)
    k += 1
    if k > 1:
        break

model_id = "./models/gemma/checkpoint-33246"
model_lora = SentenceTransformer(model_id)
print("LoRA model")
k = 0
for name, param in model_lora.named_parameters():
    print(name)
    print(param)
    k += 1
    if k == 3:
        a = param
    if k == 4:
        b = param
    if k > 3:
        delta = (b @ a) * 2.0
        print(delta)
        break
print(k)


def compare_lora_to_base(model_lora, model_base, lora_scale=1.0):
    """
    Compare how much each weight matrix has changed between
    the base model and the LoRA-adapted model.
    """
    report = []
    total_change = 0.0
    total_params = 0
    has_lora = []
    no_lora = []
    for name, module in model_lora.named_modules():
        # LoRA modules typically have lora_A and lora_B
        if hasattr(module, "lora_A") and hasattr(module, "lora_B"):
            A = module.lora_A["default"].weight.data
            B = module.lora_B["default"].weight.data
            delta = (B @ A) * lora_scale
            # Find matching base layer
            try:
                base_weight = model_base.get_submodule(name).weight.data
                has_lora.append(name)
            except Exception:
                no_lora.append(name)
                continue  # no matching base weight, nothing to compare
            new_weight = base_weight + delta
            diff = (new_weight - base_weight).abs()
            relative_change = diff / (base_weight.abs() + 1e-8)
            mean_change = relative_change.mean().item() * 100
            report.append((name, mean_change))
            total_change += relative_change.sum().item()
            total_params += relative_change.numel()
        else:
            no_lora.append(name)
    print("has_lora", has_lora)
    print("no_lora", no_lora)
    print("lora num", len(has_lora))
    print("no lora num", len(no_lora))
    overall_change = (total_change / total_params) * 100 if total_params > 0 else 0.0
    return report, overall_change


report, overall_change = compare_lora_to_base(model_lora, model, lora_scale=2.0)
print(f"overall_change: {overall_change}")