add evaluation
This commit is contained in:
parent
bc2cc07411
commit
c06572dedb
@ -2,8 +2,11 @@ import argparse
|
|||||||
import json
|
import json
|
||||||
import math
|
import math
|
||||||
import importlib
|
import importlib
|
||||||
import tqdm
|
from tqdm import tqdm
|
||||||
from hazm import Normalizer
|
from hazm import Normalizer
|
||||||
|
import random
|
||||||
|
import numpy as np
|
||||||
|
import faiss
|
||||||
|
|
||||||
normalizer = Normalizer()
|
normalizer = Normalizer()
|
||||||
|
|
||||||
@ -11,7 +14,7 @@ normalizer = Normalizer()
|
|||||||
|
|
||||||
def load_dataset(input_file):
|
def load_dataset(input_file):
|
||||||
with open(input_file, "r", encoding="utf-8") as f:
|
with open(input_file, "r", encoding="utf-8") as f:
|
||||||
dataset = json.load(f)
|
dataset = json.load(f)[:1000]
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
@ -32,7 +35,7 @@ def calculate_ndcg(scores, n):
|
|||||||
return idcg
|
return idcg
|
||||||
|
|
||||||
dcg = calculate_dcg(scores, n)
|
dcg = calculate_dcg(scores, n)
|
||||||
idcg = calculate_idcg(scores, n)
|
idcg = 1 #calculate_idcg(scores, n)
|
||||||
ndcg = dcg/idcg
|
ndcg = dcg/idcg
|
||||||
return ndcg
|
return ndcg
|
||||||
|
|
||||||
@ -40,14 +43,12 @@ def calculate_ndcg(scores, n):
|
|||||||
def calculate_recall(scores):
|
def calculate_recall(scores):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
num_ground_truth = scores.count(4)
|
num_ground_truth = scores.count(1)
|
||||||
if num_ground_truth == 0:
|
|
||||||
num_ground_truth = scores.count(3)
|
|
||||||
|
|
||||||
recall_7 = scores[:7].count(4) / num_ground_truth
|
recall_7 = scores[:7].count(1) / num_ground_truth
|
||||||
recall_12 = scores[:12].count(4) / num_ground_truth
|
recall_12 = scores[:12].count(1) / num_ground_truth
|
||||||
recall_20 = scores[:20].count(4) / num_ground_truth
|
recall_20 = scores[:20].count(1) / num_ground_truth
|
||||||
recall_variant = scores[:scores.count(4)].count(4) / scores.count(4)
|
recall_variant = scores[:scores.count(1)].count(1) / scores.count(1)
|
||||||
|
|
||||||
return recall_7, recall_12, recall_20, recall_variant
|
return recall_7, recall_12, recall_20, recall_variant
|
||||||
except:
|
except:
|
||||||
@ -55,9 +56,9 @@ def calculate_recall(scores):
|
|||||||
|
|
||||||
|
|
||||||
def calculate_precision(scores):
|
def calculate_precision(scores):
|
||||||
precision_7 = scores[:7].count(4) / 7
|
precision_7 = scores[:7].count(1) / 7
|
||||||
precision_12 = scores[:12].count(4) / 12
|
precision_12 = scores[:12].count(1) / 12
|
||||||
precision_20 = scores[:20].count(4) / 20
|
precision_20 = scores[:20].count(1) / 20
|
||||||
|
|
||||||
return precision_7, precision_12, precision_20
|
return precision_7, precision_12, precision_20
|
||||||
|
|
||||||
@ -85,14 +86,90 @@ def run(input_file, model):
|
|||||||
precision_7_scores = []
|
precision_7_scores = []
|
||||||
precision_12_scores = []
|
precision_12_scores = []
|
||||||
precision_20_scores = []
|
precision_20_scores = []
|
||||||
dataset = load_dataset(input_file)
|
all_dataset = load_dataset(input_file)[:1000]
|
||||||
for count, data in enumerate(tqdm.tqdm(dataset)):
|
|
||||||
question = data["question"]
|
batch_size = 100
|
||||||
chunks = [data["chunks"][str(id)] for id in range(len(data["chunks"].keys()))]
|
len_dataset = len(all_dataset)
|
||||||
scores_llm = [data["scores"][str(id)] for id in range(len(data["chunks"].keys()))]
|
all_dataset_embeddings = [{'question_embedding': "", 'passage_positive_embedding': []} for _ in range(len_dataset)]
|
||||||
scores_embed = []
|
|
||||||
|
all_embeddings = []
|
||||||
|
all_texts = []
|
||||||
|
|
||||||
|
print("calculate question embeddings")
|
||||||
|
# calculate question embeddings
|
||||||
|
for i in tqdm(range(0, len_dataset, batch_size)):
|
||||||
|
|
||||||
|
question_list = []
|
||||||
|
for id in range(i, min(i + batch_size, len_dataset)):
|
||||||
|
question_list.append(all_dataset[id]['question'])
|
||||||
|
|
||||||
|
question_embeddings = model.embed_texts(question_list, query_is=True)
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
for id in range(i, min(i + batch_size, len_dataset)):
|
||||||
|
all_dataset_embeddings[id]['question_embedding'] = question_embeddings[count]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
|
||||||
|
print("calculate passage positive embeddings")
|
||||||
|
# calculate passage positive embeddings
|
||||||
|
for i in tqdm(range(0, len_dataset, batch_size)):
|
||||||
|
|
||||||
|
passage_positive_list = []
|
||||||
|
for id in range(i, min(i + batch_size, len_dataset)):
|
||||||
|
for passage in all_dataset[id]['passage_positive']:
|
||||||
|
passage_positive_list.append(passage)
|
||||||
|
|
||||||
|
passage_positive_embeddings = model.embed_texts(passage_positive_list)
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
for id in range(i, min(i + batch_size, len_dataset)):
|
||||||
|
for passage_id in range(len(all_dataset[id]['passage_positive'])):
|
||||||
|
all_dataset_embeddings[id]['passage_positive_embedding'].append(passage_positive_embeddings[count])
|
||||||
|
all_embeddings.append(passage_positive_embeddings[count])
|
||||||
|
all_texts.append(all_dataset[id]['passage_positive'][passage_id])
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
print("calculate passage negative embeddings")
|
||||||
|
# calculate passage negative embeddings
|
||||||
|
for i in tqdm(range(0, len_dataset, batch_size)):
|
||||||
|
|
||||||
|
passage_negative_list = []
|
||||||
|
for id in range(i, min(i + batch_size, len_dataset)):
|
||||||
|
for passage in all_dataset[id]['passage_negative']:
|
||||||
|
passage_negative_list.append(passage)
|
||||||
|
|
||||||
|
passage_negative_embeddings = model.embed_texts(passage_negative_list)
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
for id in range(i, min(i + batch_size, len_dataset)):
|
||||||
|
for passage_id in range(len(all_dataset[id]['passage_negative'])):
|
||||||
|
all_embeddings.append(passage_negative_embeddings[count])
|
||||||
|
all_texts.append(all_dataset[id]['passage_negative'][passage_id])
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
#create faiss index
|
||||||
|
all_embeddings = np.array(all_embeddings, dtype=np.float32)
|
||||||
|
print(f"all_embeddings shape: {all_embeddings.shape}")
|
||||||
|
dim = all_embeddings.shape[1]
|
||||||
|
index = faiss.IndexFlatIP(dim)
|
||||||
|
faiss.normalize_L2(all_embeddings)
|
||||||
|
index.add(all_embeddings)
|
||||||
|
|
||||||
|
for count, data in enumerate(tqdm(all_dataset)):
|
||||||
|
#get top 10 chunks
|
||||||
|
question_embeddings = all_dataset_embeddings[count]['question_embedding']
|
||||||
|
question_embeddings_normalized = np.array([question_embeddings], dtype=np.float32)
|
||||||
|
faiss.normalize_L2(question_embeddings_normalized)
|
||||||
|
scores_embed, ids_embed = index.search(question_embeddings_normalized, 10)
|
||||||
|
chunks = [all_texts[id] for id in ids_embed[0]]
|
||||||
|
|
||||||
|
scores_llm = []
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
scores_embed.append(model.run(preprocess_reranker(question, preprocess=True), preprocess_reranker(chunk, preprocess=True, add_extra_word=False)))
|
if chunk in data["passage_positive"]:
|
||||||
|
scores_llm.append(1)
|
||||||
|
else:
|
||||||
|
scores_llm.append(0)
|
||||||
|
|
||||||
# print(f"question {count}: {question}")
|
# print(f"question {count}: {question}")
|
||||||
# for i in range(len(scores_embed)):
|
# for i in range(len(scores_embed)):
|
||||||
|
|||||||
BIN
evaluation/models_50/__pycache__/gemma_embed.cpython-310.pyc
Normal file
BIN
evaluation/models_50/__pycache__/gemma_embed.cpython-310.pyc
Normal file
Binary file not shown.
Binary file not shown.
15
evaluation/models_50/gemma_embed.py
Normal file
15
evaluation/models_50/gemma_embed.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
|
||||||
|
class model():
    """Scores a question/chunk pair with the EmbeddingGemma embedding model.

    The lowercase class name is kept unchanged so existing callers that look
    the class up by this name keep working.
    """

    def __init__(self):
        """Load the pretrained ``google/embeddinggemma-300m`` model."""
        # Imported here so the heavy dependency is only needed once the
        # model is actually instantiated.
        from sentence_transformers import SentenceTransformer

        self.model = SentenceTransformer("google/embeddinggemma-300m")

    def run(self, question: str, chunk: str):
        """Return the similarity between ``question`` and ``chunk``.

        The question is embedded with ``encode_query`` and the chunk with
        ``encode_document``; both embeddings are then passed to the model's
        ``similarity`` method and its result is returned unchanged.

        Args:
            question: Query text.
            chunk: Candidate passage text.

        Returns:
            Whatever ``self.model.similarity`` returns. This is not an
            ``int`` (the previous annotation was wrong); per the
            sentence-transformers documentation it is a tensor of
            similarity scores -- TODO confirm the exact shape for
            single-string inputs.
        """
        query_embeddings = self.model.encode_query(question)
        document_embeddings = self.model.encode_document(chunk)
        return self.model.similarity(query_embeddings, document_embeddings)
|
||||||
|
|
||||||
17
evaluation/models_50/gemma_embed_train.py
Normal file
17
evaluation/models_50/gemma_embed_train.py
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
|
||||||
|
class model():
    """Scores a question/chunk pair with a fine-tuned EmbeddingGemma model.

    The lowercase class name is kept unchanged so existing callers that look
    the class up by this name keep working.
    """

    def __init__(self):
        """Load the base model and attach the trained adapter checkpoint."""
        # Imported here so the heavy dependency is only needed once the
        # model is actually instantiated.
        from sentence_transformers import SentenceTransformer

        # The checkpoint is attached as an adapter on top of the base model
        # instead of being loaded directly as a full model
        # (presumably it only holds adapter weights -- TODO confirm).
        self.model = SentenceTransformer("google/embeddinggemma-300m")
        self.model.load_adapter("./models/gemma/checkpoint-33246")

    def run(self, question: str, chunk: str):
        """Return the similarity between ``question`` and ``chunk``.

        The question is embedded with ``encode_query`` and the chunk with
        ``encode_document``; both embeddings are then passed to the model's
        ``similarity`` method and its result is returned unchanged.

        Args:
            question: Query text.
            chunk: Candidate passage text.

        Returns:
            Whatever ``self.model.similarity`` returns. This is not an
            ``int`` (the previous annotation was wrong); per the
            sentence-transformers documentation it is a tensor of
            similarity scores -- TODO confirm the exact shape for
            single-string inputs.
        """
        query_embeddings = self.model.encode_query(question)
        document_embeddings = self.model.encode_document(chunk)
        return self.model.similarity(query_embeddings, document_embeddings)
|
||||||
|
|
||||||
30
notes.txt
30
notes.txt
@ -20,6 +20,7 @@
|
|||||||
|
|
||||||
10-Synthetic-persian-qa-retrieval dataset : question = 223423, passage = 250000 : negative passages are not exactly different : needs preprocessing
|
10-Synthetic-persian-qa-retrieval dataset : question = 223423, passage = 250000 : negative passages are not exactly different : needs preprocessing
|
||||||
|
|
||||||
|
evaluation : 50 question of rahbar
|
||||||
no train
|
no train
|
||||||
NDCG: 0.8452119768348717
|
NDCG: 0.8452119768348717
|
||||||
Recall 7: 0.3373666606161222
|
Recall 7: 0.3373666606161222
|
||||||
@ -30,16 +31,6 @@ Precision 7: 0.4714285714285715
|
|||||||
Precision 12: 0.41999999999999993
|
Precision 12: 0.41999999999999993
|
||||||
Precision 20: 0.358
|
Precision 20: 0.358
|
||||||
|
|
||||||
train with 100
|
|
||||||
NDCG: 0.8007791818263832
|
|
||||||
Recall 7: 0.2617863643550479
|
|
||||||
Recall 12: 0.3759745806720163
|
|
||||||
Recall 20: 0.5564983103150418
|
|
||||||
Recall Variant: 0.36642345327979325
|
|
||||||
Precision 7: 0.3828571428571429
|
|
||||||
Precision 12: 0.3449999999999999
|
|
||||||
Precision 20: 0.311
|
|
||||||
|
|
||||||
train with 100 with lora
|
train with 100 with lora
|
||||||
NDCG: 0.8432282495018343
|
NDCG: 0.8432282495018343
|
||||||
Recall 7: 0.33695911259587386
|
Recall 7: 0.33695911259587386
|
||||||
@ -50,4 +41,21 @@ Precision 7: 0.4685714285714285
|
|||||||
Precision 12: 0.4099999999999999
|
Precision 12: 0.4099999999999999
|
||||||
Precision 20: 0.35200000000000004
|
Precision 20: 0.35200000000000004
|
||||||
|
|
||||||
train with 100 with prompt
|
train with 33000 steps on all dataset
|
||||||
|
NDCG: 0.8414338101165514
|
||||||
|
Recall 7: 0.3118752420460591
|
||||||
|
Recall 12: 0.4692991653842038
|
||||||
|
Recall 20: 0.6261433602218365
|
||||||
|
Recall Variant: 0.43146001721540145
|
||||||
|
Precision 7: 0.4514285714285714
|
||||||
|
Precision 12: 0.4049999999999999
|
||||||
|
Precision 20: 0.348
|
||||||
|
|
||||||
|
|
||||||
|
evaluation dataset_test : 1000 sample
|
||||||
|
|
||||||
|
no train :
|
||||||
|
NDCG: 0.991
|
||||||
|
|
||||||
|
train with 33000 steps on all dataset :
|
||||||
|
NDCG: 0.9975
|
||||||
37
research_notebook/data_preprocess/arguana_v2.py
Normal file
37
research_notebook/data_preprocess/arguana_v2.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
import json
|
||||||
|
from datasets import load_dataset
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
|
# Convert several MCINext retrieval datasets into a flat JSON list of
# {question, passage_positive, passage_negative, passage_negative_random}
# records, one output file per dataset.
names = [
    "MCINext/FEVER_FA_test_top_250_only_w_correct-v2",
    "MCINext/fiqa-fa-v2",
    "MCINext/HotpotQA_FA_test_top_250_only_w_correct-v2",
    "MCINext/MSMARCO_FA_test_top_250_only_w_correct-v2",
    "MCINext/NQ_FA_test_top_250_only_w_correct-v2",
    "MCINext/quora-fa-v2",
    "MCINext/scifact-fa-v2",
    "MCINext/synthetic-persian-chatbot-rag-faq-retrieval",
    "MCINext/synthetic-persian-qa-retrieval",
    "MCINext/trec-covid-fa-v2",
]
for name in tqdm(names):
    print(f"loading {name}")
    dataset_qrel = load_dataset(name)["test"]

    # Map corpus id -> passage text for direct lookup while walking the qrels.
    dataset_corpus_list = load_dataset(name, data_files="corpus.jsonl")["train"]
    dataset_corpus = {row["_id"]: row["text"] for row in dataset_corpus_list}

    # Map query id -> question text.
    dataset_queries_list = load_dataset(name, data_files="queries.jsonl")["train"]
    dataset_queries = {row["_id"]: row["text"] for row in dataset_queries_list}

    dataset = []
    print("start creating dataset")
    for data in dataset_qrel:
        # Skip relevance rows whose query or passage id cannot be resolved.
        # NOTE(review): every resolvable row is treated as a positive pair;
        # if the qrels carry a relevance score column it is not consulted
        # here -- confirm that is intended.
        if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
            dataset.append({
                "question": dataset_queries[data["query-id"]],
                "passage_positive": [dataset_corpus[data["corpus-id"]]],
                "passage_negative": [],
                "passage_negative_random": [],
            })

    print(f"length of dataset: {len(dataset)}")
    # ensure_ascii=False writes the non-ASCII text as-is, so the file must be
    # opened as UTF-8 explicitly instead of relying on the platform default.
    with open(f"./research_notebook/data/mci/{name.split('/')[-1]}_v2.json", "w", encoding="utf-8") as f:
        json.dump(dataset, f, indent=4, ensure_ascii=False)
|
||||||
30
research_notebook/data_preprocess/scidocs_v2.py
Normal file
30
research_notebook/data_preprocess/scidocs_v2.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
import json
|
||||||
|
from datasets import load_dataset
|
||||||
|
|
||||||
|
# Convert the MCINext/scidocs-fa-v2 retrieval dataset into a flat JSON list of
# {question, passage_positive, passage_negative, passage_negative_random}
# records.
dataset_qrel = load_dataset("MCINext/scidocs-fa-v2")["test"]

# Map corpus id -> passage text for direct lookup while walking the qrels.
dataset_corpus_list = load_dataset("MCINext/scidocs-fa-v2", data_files="corpus.jsonl")["train"]
dataset_corpus = {row["_id"]: row["text"] for row in dataset_corpus_list}

# Map query id -> question text.
dataset_queries_list = load_dataset("MCINext/scidocs-fa-v2", data_files="queries.jsonl")["train"]
dataset_queries = {row["_id"]: row["text"] for row in dataset_queries_list}

dataset = []
print("start creating dataset")
for data in dataset_qrel:
    # Skip relevance rows whose query or passage id cannot be resolved.
    # NOTE(review): every resolvable row is treated as a positive pair; if
    # the qrels carry a relevance score column it is not consulted here --
    # confirm that is intended.
    if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
        dataset.append({
            "question": dataset_queries[data["query-id"]],
            "passage_positive": [dataset_corpus[data["corpus-id"]]],
            "passage_negative": [],
            "passage_negative_random": [],
        })

print(f"length of dataset: {len(dataset)}")
# ensure_ascii=False writes the non-ASCII text as-is, so the file must be
# opened as UTF-8 explicitly instead of relying on the platform default.
with open("./research_notebook/data/scidocs/scidocs_v2.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=4, ensure_ascii=False)
|
||||||
File diff suppressed because one or more lines are too long
1475
research_notebook/train/train_gemma.ipynb
Normal file
1475
research_notebook/train/train_gemma.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
@ -52,12 +52,13 @@ def main(add_prompt, lora):
|
|||||||
|
|
||||||
args = SentenceTransformerTrainingArguments(
|
args = SentenceTransformerTrainingArguments(
|
||||||
output_dir="./models/gemma",
|
output_dir="./models/gemma",
|
||||||
num_train_epochs=1,
|
# num_train_epochs=1,
|
||||||
|
max_steps=50,
|
||||||
per_device_train_batch_size=32,
|
per_device_train_batch_size=32,
|
||||||
learning_rate=2e-5,
|
learning_rate=2e-5,
|
||||||
warmup_ratio=0.05,
|
warmup_ratio=0.05,
|
||||||
logging_steps=train_dataset.num_rows,
|
logging_steps=10,
|
||||||
report_to="none",
|
report_to="tensorboard",
|
||||||
save_steps=10000,
|
save_steps=10000,
|
||||||
save_total_limit=2,
|
save_total_limit=2,
|
||||||
)
|
)
|
||||||
|
|||||||
88
train/gemma/test.py
Normal file
88
train/gemma/test.py
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
import torch
|
||||||
|
from sentence_transformers import SentenceTransformer
|
||||||
|
|
||||||
|
|
||||||
|
model_id = "google/embeddinggemma-300M"
|
||||||
|
model = SentenceTransformer(model_id)
|
||||||
|
|
||||||
|
print("Original model")
|
||||||
|
k = 0
|
||||||
|
for name, param in model.named_parameters():
|
||||||
|
print(name)
|
||||||
|
print(param)
|
||||||
|
k += 1
|
||||||
|
if k > 1:
|
||||||
|
break
|
||||||
|
|
||||||
|
model_id = "./models/gemma/checkpoint-33246"
|
||||||
|
model_lora = SentenceTransformer(model_id)
|
||||||
|
|
||||||
|
print("LoRA model")
|
||||||
|
k = 0
|
||||||
|
for name, param in model_lora.named_parameters():
|
||||||
|
print(name)
|
||||||
|
print(param)
|
||||||
|
k += 1
|
||||||
|
if k == 3:
|
||||||
|
a = param
|
||||||
|
if k == 4:
|
||||||
|
b = param
|
||||||
|
|
||||||
|
if k > 3:
|
||||||
|
delta = (b @ a) * 2.0
|
||||||
|
print(delta)
|
||||||
|
break
|
||||||
|
print(k)
|
||||||
|
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
def compare_lora_to_base(model_lora, model_base, lora_scale=1.0):
    """
    Compare how much each weight matrix has changed between
    the base model and the LoRA-adapted model.

    For every submodule of ``model_lora`` that exposes ``lora_A`` and
    ``lora_B``, the update ``(B @ A) * lora_scale`` is computed from the
    ``"default"`` adapter weights and compared element-wise against the
    weight of the submodule with the same name in ``model_base``.

    Args:
        model_lora: Model whose ``named_modules()`` are scanned for LoRA
            layers.
        model_base: Model providing the reference weights via
            ``get_submodule(name).weight``.
        lora_scale: Factor applied to ``B @ A`` before comparison.

    Returns:
        A ``(report, overall_change)`` tuple. ``report`` is a list of
        ``(module_name, mean_relative_change_percent)`` pairs, one per LoRA
        layer that has a matching base layer. ``overall_change`` is the mean
        relative change in percent over all compared elements, or ``0.0``
        when nothing was compared.

    Side effects:
        Prints the names and counts of modules with and without a compared
        LoRA update.
    """
    report = []
    total_change = 0.0
    total_params = 0
    has_lora = []
    no_lora = []
    for name, module in model_lora.named_modules():
        # LoRA modules typically have lora_A and lora_B
        if not (hasattr(module, "lora_A") and hasattr(module, "lora_B")):
            no_lora.append(name)
            continue

        A = module.lora_A["default"].weight.data
        B = module.lora_B["default"].weight.data
        delta = (B @ A) * lora_scale

        # Find matching base layer. Without a match there is nothing to
        # compare against, so the module is skipped; continuing would reuse
        # the previous layer's weight (or fail on an unbound name).
        try:
            base_weight = model_base.get_submodule(name).weight.data
        except AttributeError:
            no_lora.append(name)
            continue
        has_lora.append(name)

        new_weight = base_weight + delta

        diff = (new_weight - base_weight).abs()
        # The small epsilon avoids division by zero for zero-valued weights.
        relative_change = diff / (base_weight.abs() + 1e-8)
        mean_change = relative_change.mean().item() * 100

        report.append((name, mean_change))
        total_change += relative_change.sum().item()
        total_params += relative_change.numel()

    print("has_lora", has_lora)
    print("no_lora", no_lora)
    print("lora num", len(has_lora))
    print("no lora num", len(no_lora))
    overall_change = (total_change / total_params) * 100 if total_params > 0 else 0.0
    return report, overall_change
|
||||||
|
|
||||||
|
|
||||||
|
report, overall_change = compare_lora_to_base(model_lora, model, lora_scale=2.0)
|
||||||
|
|
||||||
|
print(f"overall_change: {overall_change}")
|
||||||
Loading…
x
Reference in New Issue
Block a user