import argparse
import importlib
import json
import math

import tqdm
from hazm import Normalizer

normalizer = Normalizer()

def load_dataset(input_file):
    """Load the evaluation dataset produced by generate_dataset.py."""
    with open(input_file, "r", encoding="utf-8") as f:
        dataset = json.load(f)
    return dataset

def calculate_ndcg(scores, n):
    """NDCG over the first n positions of a ranked list of relevance scores."""

    def calculate_dcg(scores, n):
        dcg = 0
        for i in range(n):
            gain = (2 ** scores[i]) - 1
            discount = math.log2(i + 2)
            dcg += gain / discount
        return dcg

    def calculate_idcg(scores, n):
        # The ideal DCG is the DCG of the scores sorted in descending order.
        new_scores = sorted(scores, reverse=True)
        return calculate_dcg(new_scores, n)

    dcg = calculate_dcg(scores, n)
    idcg = calculate_idcg(scores, n)
    if idcg == 0:
        # All scores are zero: no relevant chunks, so report NDCG as 0.
        return 0
    return dcg / idcg

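# Worked example (illustrative numbers, not from the dataset): for graded
# relevance labels [3, 1, 2] ranked in that order,
#   DCG  = (2^3 - 1)/log2(2) + (2^1 - 1)/log2(3) + (2^2 - 1)/log2(4)
#        = 7.000 + 0.631 + 1.500 ≈ 9.131
#   IDCG uses the ideal order [3, 2, 1]:
#        = 7.000 + 1.893 + 0.500 ≈ 9.393
#   NDCG ≈ 9.131 / 9.393 ≈ 0.972
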
def calculate_recall(scores):
    """Recall@k, where label 4 marks a ground-truth chunk (fallback: label 3)."""
    try:
        num_ground_truth = scores.count(4)
        if num_ground_truth == 0:
            num_ground_truth = scores.count(3)

        recall_7 = scores[:7].count(4) / num_ground_truth
        recall_12 = scores[:12].count(4) / num_ground_truth
        recall_20 = scores[:20].count(4) / num_ground_truth
        # R-precision-style variant: recall within the top k positions,
        # where k is the number of ground-truth chunks.
        recall_variant = scores[:scores.count(4)].count(4) / scores.count(4)

        return recall_7, recall_12, recall_20, recall_variant
    except ZeroDivisionError:
        # No chunk is labelled 4 or 3, so recall is undefined; report zeros.
        return 0, 0, 0, 0

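# Example (illustrative): scores = [4, 3, 4, 0, 4] has three ground-truth 4s;
# recall_7 = 3/3 = 1.0 and recall_variant = [4, 3, 4].count(4) / 3 = 2/3.
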
def calculate_precision(scores):
    precision_7 = scores[:7].count(4) / 7
    precision_12 = scores[:12].count(4) / 12
    precision_20 = scores[:20].count(4) / 20

    return precision_7, precision_12, precision_20

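# Example (illustrative): with scores = [4, 0, 4, 4, 0, 0, 0],
# precision_7 = 3/7 ≈ 0.429.
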
def preprocess_reranker(text: str, preprocess: bool = True, add_extra_word: bool = False):
    if preprocess:
        # Flatten newlines to sentence breaks, then normalize with hazm.
        text = text.replace("\n", ".")
        text = normalizer.normalize(text)

    if add_extra_word:
        # Append a domain hint ("the Leader of the Islamic Revolution,
        # His Eminence Imam Khamenei") to steer the reranker toward the
        # corpus domain.
        text += " رهبر انقلاب اسلامی حضرت امام خامنه ای "

    return text

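# Usage (as called in run() below): questions and chunks are both normalized,
# and the extra word is left off for chunks:
#   preprocess_reranker(question, preprocess=True)
#   preprocess_reranker(chunk, preprocess=True, add_extra_word=False)
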
def run(input_file, model):
    # Dynamically import the model module from evaluation/models/ and
    # instantiate its `model` class.
    module = importlib.import_module("evaluation.models." + model)
    model = module.model()

    ndcg_scores = []
    recall_7_scores = []
    recall_12_scores = []
    recall_20_scores = []
    recall_variant_scores = []
    precision_7_scores = []
    precision_12_scores = []
    precision_20_scores = []

    dataset = load_dataset(input_file)
    for count, data in enumerate(tqdm.tqdm(dataset)):
        question = data["question"]
        chunks = [data["chunks"][str(i)] for i in range(len(data["chunks"]))]
        scores_llm = [data["scores"][str(i)] for i in range(len(data["chunks"]))]

        # Score every (question, chunk) pair with the model under evaluation.
        scores_embed = []
        for chunk in chunks:
            scores_embed.append(model.run(
                preprocess_reranker(question, preprocess=True),
                preprocess_reranker(chunk, preprocess=True, add_extra_word=False),
            ))

        # Debug output:
        # print(f"question {count}: {question}")
        # for i in range(len(scores_embed)):
        #     print(f"chunk {i}: scores_embed {scores_embed[i]}, scores_llm {scores_llm[i]}")
        # print("--------------------------------\n")

        # Rank chunks by the model's scores, then read off the LLM relevance
        # labels in that order.
        sorted_pairs = sorted(zip(scores_embed, scores_llm), reverse=True)
        scores = [rel for _, rel in sorted_pairs]

        # Calculate NDCG over the full ranking.
        ndcg = calculate_ndcg(scores, len(scores))
        ndcg_scores.append(ndcg)

        # Calculate recall@k.
        recall_7, recall_12, recall_20, recall_variant = calculate_recall(scores)
        recall_7_scores.append(recall_7)
        recall_12_scores.append(recall_12)
        recall_20_scores.append(recall_20)
        recall_variant_scores.append(recall_variant)

        # Calculate precision@k.
        precision_7, precision_12, precision_20 = calculate_precision(scores)
        precision_7_scores.append(precision_7)
        precision_12_scores.append(precision_12)
        precision_20_scores.append(precision_20)

    # Report metrics averaged over all questions.
    print(f"NDCG: {sum(ndcg_scores) / len(ndcg_scores)}")
    print(f"Recall 7: {sum(recall_7_scores) / len(recall_7_scores)}")
    print(f"Recall 12: {sum(recall_12_scores) / len(recall_12_scores)}")
    print(f"Recall 20: {sum(recall_20_scores) / len(recall_20_scores)}")
    print(f"Recall Variant: {sum(recall_variant_scores) / len(recall_variant_scores)}")
    print(f"Precision 7: {sum(precision_7_scores) / len(precision_7_scores)}")
    print(f"Precision 12: {sum(precision_12_scores) / len(precision_12_scores)}")
    print(f"Precision 20: {sum(precision_20_scores) / len(precision_20_scores)}")

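# A minimal sketch of the model class that run() expects to find in
# evaluation/models/<name>.py (hypothetical; mirror sample_model.py):
#
#     class model:
#         def run(self, query: str, passage: str) -> float:
#             # Return a relevance score; higher means more relevant.
#             ...
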
def main():
    """
    1. Feed your questions to generate_dataset.py to produce a JSON file,
       and pass its path as --input_file.
    2. Create your model class in the ./models folder, following sample_model.py.
    3. Run the script:

       python evaluate.py --input_file <path_to_your_json_file> --model <your_model_module>
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', help='JSON input file path')
    parser.add_argument('--model', help='name of the model module in evaluation/models')

    args = parser.parse_args()

    print(f"Starting evaluation of model {args.model} (with normalizer and extra words) on input file {args.input_file}")
    run(args.input_file, args.model)


if __name__ == "__main__":
    exit(main())
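# Example invocation (hypothetical file and model names):
#   python evaluate.py --input_file data/eval_dataset.json --model sample_model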