diff --git a/evaluation/evaluate.py b/evaluation/evaluate.py
new file mode 100644
index 0000000..9ab4a4d
--- /dev/null
+++ b/evaluation/evaluate.py
@@ -0,0 +1,154 @@
+import argparse
+import json
+import math
+import importlib
+import tqdm
+from hazm import Normalizer
+
+normalizer = Normalizer()
+
+
+def load_dataset(input_file):
+    with open(input_file, "r", encoding="utf-8") as f:
+        dataset = json.load(f)
+    return dataset
+
+
+def calculate_ndcg(scores, n):
+    def calculate_dcg(scores, n):
+        dcg = 0
+        for i in range(n):
+            gain = (2 ** scores[i]) - 1
+            discount = math.log2(i + 2)
+            dcg += gain / discount
+        return dcg
+
+    def calculate_idcg(scores, n):
+        # the ideal DCG uses the labels sorted in descending order
+        new_scores = sorted(scores, reverse=True)
+        return calculate_dcg(new_scores, n)
+
+    dcg = calculate_dcg(scores, n)
+    idcg = calculate_idcg(scores, n)
+    if idcg == 0:
+        return 0.0
+    return dcg / idcg
+
+
+def calculate_recall(scores):
+    # a label of 4 marks a ground-truth chunk; if none is labeled 4,
+    # fall back to counting chunks labeled 3
+    try:
+        num_ground_truth = scores.count(4)
+        if num_ground_truth == 0:
+            num_ground_truth = scores.count(3)
+
+        recall_7 = scores[:7].count(4) / num_ground_truth
+        recall_12 = scores[:12].count(4) / num_ground_truth
+        recall_20 = scores[:20].count(4) / num_ground_truth
+        recall_variant = scores[:scores.count(4)].count(4) / scores.count(4)
+
+        return recall_7, recall_12, recall_20, recall_variant
+    except ZeroDivisionError:
+        return 0, 0, 0, 0
+
+
+def calculate_precision(scores):
+    precision_7 = scores[:7].count(4) / 7
+    precision_12 = scores[:12].count(4) / 12
+    precision_20 = scores[:20].count(4) / 20
+
+    return precision_7, precision_12, precision_20
+
+
+def preprocess_reranker(text: str, preprocess: bool = True, add_extra_word: bool = False):
+    if preprocess:
+        text = text.replace("\n", ".")
+        text = normalizer.normalize(text)
+
+    if add_extra_word:
+        # appends Persian words: "Leader of the Islamic Revolution, Imam Khamenei"
+        text += " رهبر انقلاب اسلامی حضرت امام خامنه ای "
+
+    return text
+
+
+def run(input_file, model_name):
+    module = importlib.import_module("evaluation.models." + model_name)
+    model = module.model()
+
+    ndcg_scores = []
+    recall_7_scores = []
+    recall_12_scores = []
+    recall_20_scores = []
+    recall_variant_scores = []
+    precision_7_scores = []
+    precision_12_scores = []
+    precision_20_scores = []
+    dataset = load_dataset(input_file)
+    for count, data in enumerate(tqdm.tqdm(dataset)):
+        question = data["question"]
+        chunks = [data["chunks"][str(i)] for i in range(len(data["chunks"]))]
+        scores_llm = [data["scores"][str(i)] for i in range(len(data["chunks"]))]
+        scores_embed = []
+        for chunk in chunks:
+            scores_embed.append(model.run(
+                preprocess_reranker(question, preprocess=True),
+                preprocess_reranker(chunk, preprocess=True, add_extra_word=False),
+            ))
+
+        # print(f"question {count}: {question}")
+        # for i in range(len(scores_embed)):
+        #     print(f"chunk {i}: scores_embed {scores_embed[i]}, scores_llm {scores_llm[i]}")
+        # print("--------------------------------\n")
+
+        # rank chunks by embedding score (descending), then read the LLM labels in that order
+        sorted_pairs = sorted(zip(scores_embed, scores_llm), reverse=True)
+        scores = [rel for _, rel in sorted_pairs]
+
+        # calculate ndcg
+        ndcg = calculate_ndcg(scores, len(scores))
+        ndcg_scores.append(ndcg)
+
+        # calculate recall
+        recall_7, recall_12, recall_20, recall_variant = calculate_recall(scores)
+        recall_7_scores.append(recall_7)
+        recall_12_scores.append(recall_12)
+        recall_20_scores.append(recall_20)
+        recall_variant_scores.append(recall_variant)
+
+        # calculate precision
+        precision_7, precision_12, precision_20 = calculate_precision(scores)
+        precision_7_scores.append(precision_7)
+        precision_12_scores.append(precision_12)
+        precision_20_scores.append(precision_20)
+
+    print(f"NDCG: {sum(ndcg_scores)/len(ndcg_scores)}")
+    print(f"Recall 7: {sum(recall_7_scores)/len(recall_7_scores)}")
+    print(f"Recall 12: {sum(recall_12_scores)/len(recall_12_scores)}")
+    print(f"Recall 20: {sum(recall_20_scores)/len(recall_20_scores)}")
+    print(f"Recall Variant: {sum(recall_variant_scores)/len(recall_variant_scores)}")
+    print(f"Precision 7: {sum(precision_7_scores)/len(precision_7_scores)}")
+    print(f"Precision 12: {sum(precision_12_scores)/len(precision_12_scores)}")
+    print(f"Precision 20: {sum(precision_20_scores)/len(precision_20_scores)}")
+
+
+def main():
+    """
+    - First, give your questions to generate_dataset.py to generate a JSON file, and pass its path as --input_file.
+    - Second, create your model class in the ./models folder, following sample_model.py.
+    - Third, run the script:
+      python evaluate.py --input_file <path/to/dataset.json> --model <model_module_name>
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_file', help='json input file path')
+    parser.add_argument('--model', help='name of the model module in evaluation/models')
+
+    args = parser.parse_args()
+
+    print(f"Starting evaluation of model {args.model} with normalizer and extra words; input file: {args.input_file}")
+    run(args.input_file, args.model)
+
+
+if __name__ == "__main__":
+    exit(main())
\ No newline at end of file
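A note on the model interface: run() above only assumes that each module under evaluation/models/ exposes a class named model with a run(query, passage) method that returns a similarity score. sample_model.py itself is not part of this diff, so the following is a hypothetical minimal sketch of that interface built on a sentence-transformers bi-encoder; the checkpoint name is illustrative only.

```python
# Hypothetical evaluation/models/sample_model.py -- the interface evaluate.py
# expects: a class named `model` whose run(query, passage) returns a score.
from sentence_transformers import SentenceTransformer


class model:
    def __init__(self):
        # Any bi-encoder checkpoint works here; BAAI/bge-m3 is only an example.
        self.encoder = SentenceTransformer("BAAI/bge-m3")

    def run(self, query: str, passage: str) -> float:
        # Cosine similarity of the two normalized embeddings; run() sorts
        # chunks by this value.
        query_emb, passage_emb = self.encoder.encode(
            [query, passage], normalize_embeddings=True
        )
        return float(query_emb @ passage_emb)
```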
diff --git a/evaluation/evaluate_50.py b/evaluation/evaluate_50.py
new file mode 100644
index 0000000..9ab4a4d
--- /dev/null
+++ b/evaluation/evaluate_50.py
@@ -0,0 +1,154 @@
+import argparse
+import json
+import math
+import importlib
+import tqdm
+from hazm import Normalizer
+
+normalizer = Normalizer()
+
+
+def load_dataset(input_file):
+    with open(input_file, "r", encoding="utf-8") as f:
+        dataset = json.load(f)
+    return dataset
+
+
+def calculate_ndcg(scores, n):
+    def calculate_dcg(scores, n):
+        dcg = 0
+        for i in range(n):
+            gain = (2 ** scores[i]) - 1
+            discount = math.log2(i + 2)
+            dcg += gain / discount
+        return dcg
+
+    def calculate_idcg(scores, n):
+        # the ideal DCG uses the labels sorted in descending order
+        new_scores = sorted(scores, reverse=True)
+        return calculate_dcg(new_scores, n)
+
+    dcg = calculate_dcg(scores, n)
+    idcg = calculate_idcg(scores, n)
+    if idcg == 0:
+        return 0.0
+    return dcg / idcg
+
+
+def calculate_recall(scores):
+    # a label of 4 marks a ground-truth chunk; if none is labeled 4,
+    # fall back to counting chunks labeled 3
+    try:
+        num_ground_truth = scores.count(4)
+        if num_ground_truth == 0:
+            num_ground_truth = scores.count(3)
+
+        recall_7 = scores[:7].count(4) / num_ground_truth
+        recall_12 = scores[:12].count(4) / num_ground_truth
+        recall_20 = scores[:20].count(4) / num_ground_truth
+        recall_variant = scores[:scores.count(4)].count(4) / scores.count(4)
+
+        return recall_7, recall_12, recall_20, recall_variant
+    except ZeroDivisionError:
+        return 0, 0, 0, 0
+
+
+def calculate_precision(scores):
+    precision_7 = scores[:7].count(4) / 7
+    precision_12 = scores[:12].count(4) / 12
+    precision_20 = scores[:20].count(4) / 20
+
+    return precision_7, precision_12, precision_20
+
+
+def preprocess_reranker(text: str, preprocess: bool = True, add_extra_word: bool = False):
+    if preprocess:
+        text = text.replace("\n", ".")
+        text = normalizer.normalize(text)
+
+    if add_extra_word:
+        # appends Persian words: "Leader of the Islamic Revolution, Imam Khamenei"
+        text += " رهبر انقلاب اسلامی حضرت امام خامنه ای "
+
+    return text
+
+
+def run(input_file, model_name):
+    module = importlib.import_module("evaluation.models." + model_name)
+    model = module.model()
+
+    ndcg_scores = []
+    recall_7_scores = []
+    recall_12_scores = []
+    recall_20_scores = []
+    recall_variant_scores = []
+    precision_7_scores = []
+    precision_12_scores = []
+    precision_20_scores = []
+    dataset = load_dataset(input_file)
+    for count, data in enumerate(tqdm.tqdm(dataset)):
+        question = data["question"]
+        chunks = [data["chunks"][str(i)] for i in range(len(data["chunks"]))]
+        scores_llm = [data["scores"][str(i)] for i in range(len(data["chunks"]))]
+        scores_embed = []
+        for chunk in chunks:
+            scores_embed.append(model.run(
+                preprocess_reranker(question, preprocess=True),
+                preprocess_reranker(chunk, preprocess=True, add_extra_word=False),
+            ))
+
+        # print(f"question {count}: {question}")
+        # for i in range(len(scores_embed)):
+        #     print(f"chunk {i}: scores_embed {scores_embed[i]}, scores_llm {scores_llm[i]}")
+        # print("--------------------------------\n")
+
+        # rank chunks by embedding score (descending), then read the LLM labels in that order
+        sorted_pairs = sorted(zip(scores_embed, scores_llm), reverse=True)
+        scores = [rel for _, rel in sorted_pairs]
+
+        # calculate ndcg
+        ndcg = calculate_ndcg(scores, len(scores))
+        ndcg_scores.append(ndcg)
+
+        # calculate recall
+        recall_7, recall_12, recall_20, recall_variant = calculate_recall(scores)
+        recall_7_scores.append(recall_7)
+        recall_12_scores.append(recall_12)
+        recall_20_scores.append(recall_20)
+        recall_variant_scores.append(recall_variant)
+
+        # calculate precision
+        precision_7, precision_12, precision_20 = calculate_precision(scores)
+        precision_7_scores.append(precision_7)
+        precision_12_scores.append(precision_12)
+        precision_20_scores.append(precision_20)
+
+    print(f"NDCG: {sum(ndcg_scores)/len(ndcg_scores)}")
+    print(f"Recall 7: {sum(recall_7_scores)/len(recall_7_scores)}")
+    print(f"Recall 12: {sum(recall_12_scores)/len(recall_12_scores)}")
+    print(f"Recall 20: {sum(recall_20_scores)/len(recall_20_scores)}")
+    print(f"Recall Variant: {sum(recall_variant_scores)/len(recall_variant_scores)}")
+    print(f"Precision 7: {sum(precision_7_scores)/len(precision_7_scores)}")
+    print(f"Precision 12: {sum(precision_12_scores)/len(precision_12_scores)}")
+    print(f"Precision 20: {sum(precision_20_scores)/len(precision_20_scores)}")
+
+
+def main():
+    """
+    - First, give your questions to generate_dataset.py to generate a JSON file, and pass its path as --input_file.
+    - Second, create your model class in the ./models folder, following sample_model.py.
+    - Third, run the script:
+      python evaluate.py --input_file <path/to/dataset.json> --model <model_module_name>
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_file', help='json input file path')
+    parser.add_argument('--model', help='name of the model module in evaluation/models')
+
+    args = parser.parse_args()
+
+    print(f"Starting evaluation of model {args.model} with normalizer and extra words; input file: {args.input_file}")
+    run(args.input_file, args.model)
+
+
+if __name__ == "__main__":
+    exit(main())
\ No newline at end of file
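Both evaluate.py and evaluate_50.py read the same dataset layout, which can be inferred from run(): a JSON list of records, each holding a question plus chunks and scores dicts keyed by stringified indices, with labels on what appears to be a 0-4 relevance scale (4 = ground truth, per calculate_recall). A toy record and a by-hand NDCG check under that assumed schema, with all values made up for illustration:

```python
# Toy record in the assumed dataset schema, plus a manual NDCG sanity check
# that mirrors calculate_ndcg above.
import math

record = {
    "question": "...",
    "chunks": {"0": "chunk a", "1": "chunk b", "2": "chunk c"},
    "scores": {"0": 4, "1": 0, "2": 3},  # LLM relevance labels
}

# Suppose the embedding model ranks the chunks c, a, b; the label list the
# metrics see is then:
scores = [3, 4, 0]

dcg = sum(((2 ** s) - 1) / math.log2(i + 2) for i, s in enumerate(scores))
idcg = sum(((2 ** s) - 1) / math.log2(i + 2)
           for i, s in enumerate(sorted(scores, reverse=True)))
print(dcg / idcg)  # ~0.85 -- ranking the score-4 chunk second costs some NDCG
```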
diff --git a/notes.txt b/notes.txt
index f16d2d9..4eaf6ef 100644
--- a/notes.txt
+++ b/notes.txt
@@ -18,4 +18,36 @@
 9-longragfa dataset: it is long doc and query and for evaluation : question = 250, passage = 1500 : not using
-10-Synthetic-persian-qa-retrieval dataset : question = 223423, passage = 250000 : negetaive passage are not exactly different : needs preprocessing
\ No newline at end of file
+10-Synthetic-persian-qa-retrieval dataset : question = 223423, passage = 250000 : negative passages are not exactly different : needs preprocessing
+
+no train
+NDCG: 0.8452119768348717
+Recall 7: 0.3373666606161222
+Recall 12: 0.48390155482482855
+Recall 20: 0.6340810809380268
+Recall Variant: 0.44313617731261423
+Precision 7: 0.4714285714285715
+Precision 12: 0.41999999999999993
+Precision 20: 0.358
+
+train with 100
+NDCG: 0.8007791818263832
+Recall 7: 0.2617863643550479
+Recall 12: 0.3759745806720163
+Recall 20: 0.5564983103150418
+Recall Variant: 0.36642345327979325
+Precision 7: 0.3828571428571429
+Precision 12: 0.3449999999999999
+Precision 20: 0.311
+
+train with 100 with lora
+NDCG: 0.8432282495018343
+Recall 7: 0.33695911259587386
+Recall 12: 0.4729916144600827
+Recall 20: 0.6212526155736547
+Recall Variant: 0.43208929205133273
+Precision 7: 0.4685714285714285
+Precision 12: 0.4099999999999999
+Precision 20: 0.35200000000000004
+
+train with 100 with prompt
diff --git a/research_notebook/data_preprocess/bge.ipynb b/research_notebook/data_preprocess/bge.ipynb
index 5c98d62..eb2c9b0 100644
--- a/research_notebook/data_preprocess/bge.ipynb
+++ b/research_notebook/data_preprocess/bge.ipynb
@@ -11,10 +11,7 @@
 "output_type": "stream", "text": [ "/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets.
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "Downloading readme: 100%|██████████| 419/419 [00:00<00:00, 1.18MB/s]\n", - "Downloading data: 100%|██████████| 1.59M/1.59M [00:01<00:00, 1.03MB/s]\n", - "Generating train split: 100%|██████████| 7000/7000 [00:00<00:00, 175360.77 examples/s]\n" + " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], @@ -56,15 +53,7 @@ "execution_count": 3, "id": "5ba361dd", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Map: 100%|██████████| 7000/7000 [00:00<00:00, 19176.72 examples/s]\n" - ] - } - ], + "outputs": [], "source": [ "import numpy as np\n", "\n", @@ -102,48 +91,78 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "id": "a35c1466", "metadata": {}, "outputs": [], "source": [ - "split = ds.train_test_split(test_size=0.1, shuffle=True, seed=520)\n", + "split = ds.train_test_split(test_size=0.02, shuffle=True, seed=520)\n", "train = split[\"train\"]\n", "test = split[\"test\"]" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "24f3f7fb", + "execution_count": 13, + "id": "aec6787d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "140" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(test)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c5cc42ed", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Creating json from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 26.22ba/s]\n" + "Creating json from Arrow format: 0%| | 0/7 [00:00 🤖 Score: ", similarities.numpy()[0][idx]) + +query = "I want to start a tax-free installment investment, what should I do?" +documents = ["Opening a NISA Account", "Opening a Regular Savings Account", "Home Loan Application Guide"] + +get_scores(query, documents) + +from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments +from sentence_transformers.losses import MultipleNegativesRankingLoss +from transformers import TrainerCallback + +loss = MultipleNegativesRankingLoss(model) + +args = SentenceTransformerTrainingArguments( + # Required parameter: + output_dir="my-embedding-gemma", + # Optional training parameters: + prompts=model.prompts[task_name], # use model's prompt to train + num_train_epochs=5, + per_device_train_batch_size=1, + learning_rate=2e-5, + warmup_ratio=0.1, + # Optional tracking/debugging parameters: + logging_steps=train_dataset.num_rows, + report_to="none", +) + +class MyCallback(TrainerCallback): + "A callback that evaluates the model at the end of eopch" + def __init__(self, evaluate): + self.evaluate = evaluate # evaluate function + + def on_log(self, args, state, control, **kwargs): + # Evaluate the model using text generation + print(f"Step {state.global_step} finished. Running evaluation:") + self.evaluate() + +def evaluate(): + get_scores(query, documents) + +trainer = SentenceTransformerTrainer( + model=model, + args=args, + train_dataset=train_dataset, + loss=loss, + callbacks=[MyCallback(evaluate)] +) +trainer.train() + +get_scores(query, documents) \ No newline at end of file