import argparse
import importlib
import json
import math

import tqdm
from hazm import Normalizer

normalizer = Normalizer()

def load_dataset(input_file):
    """Load the evaluation dataset produced by generate_dataset.py."""
    with open(input_file, "r", encoding="utf-8") as f:
        dataset = json.load(f)
    return dataset

def calculate_ndcg(scores, n):
    """NDCG over the first n positions of a ranked list of relevance scores."""

    def calculate_dcg(scores, n):
        dcg = 0
        for i in range(n):
            gain = (2 ** scores[i]) - 1
            discount = math.log2(i + 2)
            dcg += gain / discount
        return dcg

    def calculate_idcg(scores, n):
        # The ideal DCG is the DCG of the scores sorted in descending order.
        new_scores = sorted(scores, reverse=True)
        return calculate_dcg(new_scores, n)

    dcg = calculate_dcg(scores, n)
    idcg = calculate_idcg(scores, n)
    if idcg == 0:
        # All scores are zero: no relevant chunks, so report NDCG as 0.
        return 0
    return dcg / idcg

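# Worked example (illustrative numbers, not from the dataset): for graded
# relevance labels [3, 1, 2] ranked in that order,
#   DCG  = (2^3 - 1)/log2(2) + (2^1 - 1)/log2(3) + (2^2 - 1)/log2(4)
#        = 7.000 + 0.631 + 1.500 ≈ 9.131
#   IDCG uses the ideal order [3, 2, 1]:
#        = 7.000 + 1.893 + 0.500 ≈ 9.393
#   NDCG ≈ 9.131 / 9.393 ≈ 0.972
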
def calculate_recall(scores):
    """Recall@k, where label 4 marks a ground-truth chunk (fallback: label 3)."""
    try:
        num_ground_truth = scores.count(4)
        if num_ground_truth == 0:
            num_ground_truth = scores.count(3)

        recall_7 = scores[:7].count(4) / num_ground_truth
        recall_12 = scores[:12].count(4) / num_ground_truth
        recall_20 = scores[:20].count(4) / num_ground_truth
        # R-precision-style variant: recall within the top k positions,
        # where k is the number of ground-truth chunks.
        recall_variant = scores[:scores.count(4)].count(4) / scores.count(4)

        return recall_7, recall_12, recall_20, recall_variant
    except ZeroDivisionError:
        # No chunk is labelled 4 or 3, so recall is undefined; report zeros.
        return 0, 0, 0, 0

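# Example (illustrative): scores = [4, 3, 4, 0, 4] has three ground-truth 4s;
# recall_7 = 3/3 = 1.0 and recall_variant = [4, 3, 4].count(4) / 3 = 2/3.
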
def calculate_precision(scores):
    precision_7 = scores[:7].count(4) / 7
    precision_12 = scores[:12].count(4) / 12
    precision_20 = scores[:20].count(4) / 20

    return precision_7, precision_12, precision_20

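# Example (illustrative): with scores = [4, 0, 4, 4, 0, 0, 0],
# precision_7 = 3/7 ≈ 0.429.
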
def preprocess_reranker(text: str, preprocess: bool = True, add_extra_word: bool = False):
    if preprocess:
        # Flatten newlines to sentence breaks, then normalize with hazm.
        text = text.replace("\n", ".")
        text = normalizer.normalize(text)

    if add_extra_word:
        # Append a domain hint ("the Leader of the Islamic Revolution,
        # His Eminence Imam Khamenei") to steer the reranker toward the
        # corpus domain.
        text += " رهبر انقلاب اسلامی حضرت امام خامنه ای "

    return text

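# Usage (as called in run() below): questions and chunks are both normalized,
# and the extra word is left off for chunks:
#   preprocess_reranker(question, preprocess=True)
#   preprocess_reranker(chunk, preprocess=True, add_extra_word=False)
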
def run(input_file, model):
    # Dynamically import the model module from evaluation/models/ and
    # instantiate its `model` class.
    module = importlib.import_module("evaluation.models." + model)
    model = module.model()

    ndcg_scores = []
    recall_7_scores = []
    recall_12_scores = []
    recall_20_scores = []
    recall_variant_scores = []
    precision_7_scores = []
    precision_12_scores = []
    precision_20_scores = []

    dataset = load_dataset(input_file)
    for count, data in enumerate(tqdm.tqdm(dataset)):
        question = data["question"]
        chunks = [data["chunks"][str(i)] for i in range(len(data["chunks"]))]
        scores_llm = [data["scores"][str(i)] for i in range(len(data["chunks"]))]

        # Score every (question, chunk) pair with the model under evaluation.
        scores_embed = []
        for chunk in chunks:
            scores_embed.append(model.run(
                preprocess_reranker(question, preprocess=True),
                preprocess_reranker(chunk, preprocess=True, add_extra_word=False),
            ))

        # Debug output:
        # print(f"question {count}: {question}")
        # for i in range(len(scores_embed)):
        #     print(f"chunk {i}: scores_embed {scores_embed[i]}, scores_llm {scores_llm[i]}")
        # print("--------------------------------\n")

        # Rank chunks by the model's scores, then read off the LLM relevance
        # labels in that order.
        sorted_pairs = sorted(zip(scores_embed, scores_llm), reverse=True)
        scores = [rel for _, rel in sorted_pairs]

        # Calculate NDCG over the full ranking.
        ndcg = calculate_ndcg(scores, len(scores))
        ndcg_scores.append(ndcg)

        # Calculate recall@k.
        recall_7, recall_12, recall_20, recall_variant = calculate_recall(scores)
        recall_7_scores.append(recall_7)
        recall_12_scores.append(recall_12)
        recall_20_scores.append(recall_20)
        recall_variant_scores.append(recall_variant)

        # Calculate precision@k.
        precision_7, precision_12, precision_20 = calculate_precision(scores)
        precision_7_scores.append(precision_7)
        precision_12_scores.append(precision_12)
        precision_20_scores.append(precision_20)

    # Report metrics averaged over all questions.
    print(f"NDCG: {sum(ndcg_scores) / len(ndcg_scores)}")
    print(f"Recall 7: {sum(recall_7_scores) / len(recall_7_scores)}")
    print(f"Recall 12: {sum(recall_12_scores) / len(recall_12_scores)}")
    print(f"Recall 20: {sum(recall_20_scores) / len(recall_20_scores)}")
    print(f"Recall Variant: {sum(recall_variant_scores) / len(recall_variant_scores)}")
    print(f"Precision 7: {sum(precision_7_scores) / len(precision_7_scores)}")
    print(f"Precision 12: {sum(precision_12_scores) / len(precision_12_scores)}")
    print(f"Precision 20: {sum(precision_20_scores) / len(precision_20_scores)}")

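# A minimal sketch of the model class that run() expects to find in
# evaluation/models/<name>.py (hypothetical; mirror sample_model.py):
#
#     class model:
#         def run(self, query: str, passage: str) -> float:
#             # Return a relevance score; higher means more relevant.
#             ...
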
def main():
    """
    1. Feed your questions to generate_dataset.py to produce a JSON file,
       and pass its path as --input_file.
    2. Create your model class in the ./models folder, following sample_model.py.
    3. Run the script:

       python evaluate.py --input_file <path_to_your_json_file> --model <your_model_module>
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', help='JSON input file path')
    parser.add_argument('--model', help='name of the model module in evaluation/models')

    args = parser.parse_args()

    print(f"Starting evaluation of model {args.model} (with normalizer and extra words) on input file {args.input_file}")
    run(args.input_file, args.model)


if __name__ == "__main__":
    exit(main())
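# Example invocation (hypothetical file and model names):
#   python evaluate.py --input_file data/eval_dataset.json --model sample_model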