import argparse
import json
import time

from datasets import load_dataset
from tqdm import tqdm

from data_preprocess.modify_question_model import LLMModel

llm_model = LLMModel()


def load_msmarco_dataset():
    """
    Load the Persian MS MARCO dataset from Hugging Face.

    Returns a list of dicts of the form:
        {
            "question": "",
            "passage_positive": [],
            "new_question": "",
            "passage_negative": [],
            "passage_negative_random": [],
        }
    """
    print("start loading msmarco dataset")
    name = "MCINext/msmarco-fa"

    # The default split of this repo holds the qrels (query-id / corpus-id pairs).
    dataset_qrel = load_dataset(name)["train"]

    print("start loading corpus")
    dataset_corpus_list = load_dataset(name, data_files="corpus.jsonl")["train"]
    dataset_corpus = {}
    for data in dataset_corpus_list:
        dataset_corpus[str(data["_id"])] = data["text"]

    print("start loading queries")
    dataset_queries_list = load_dataset(name, data_files="queries.jsonl")["train"]
    dataset_queries = {}
    for data in dataset_queries_list:
        dataset_queries[str(data["_id"])] = data["text"]

    dataset = []
    print("start creating dataset")
    for data in tqdm(dataset_qrel):
        # Cast ids to str so the lookups match the str-keyed dicts built above.
        query_id = str(data["query-id"])
        corpus_id = str(data["corpus-id"])
        if query_id in dataset_queries and corpus_id in dataset_corpus:
            dataset.append({
                "question": dataset_queries[query_id],
                "passage_positive": [dataset_corpus[corpus_id]],
                "new_question": "",
                "passage_negative": [],
                "passage_negative_random": [],
            })
    print(f"length of dataset: {len(dataset)}")
    print("--------------------------------")
    return dataset


def modify_question(dataset):
    """
    Rewrite each question with the LLM, in batches.

    Args:
        dataset: list of dicts

    Returns:
        dataset: the same list, with "new_question" filled in
    """
    len_dataset = len(dataset)
    batch_size = 50
    for i in tqdm(range(0, len_dataset, batch_size)):
        question_list = []
        passage_positive_list = []
        for idx in range(i, min(i + batch_size, len_dataset)):
            question_list.append(dataset[idx]["question"])
            passage_positive_list.append(dataset[idx]["passage_positive"][0])

        # One LLM call per batch; expected to return one rewritten question per input.
        results = llm_model.modify_question_llm(question_list, passage_positive_list)
        time.sleep(2)  # throttle requests to the LLM backend

        for count, idx in enumerate(range(i, min(i + batch_size, len_dataset))):
            dataset[idx]["new_question"] = results[count]
    print("successfully modified question")
    print("--------------------------------")
    return dataset


def save_dataset(dataset, output_path):
    """
    Save the dataset to a JSON file.

    Args:
        dataset: list of dicts
        output_path: path to save the dataset to
    """
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)


def main(output_path):
    # Load the msmarco dataset.
    print("--------------------------------")
    all_dataset = load_msmarco_dataset()
    print(f"msmarco dataset loaded : {len(all_dataset)} samples")
    print("--------------------------------")

    # Rewrite the questions with the LLM (capped at the first 270,000 samples).
    print("start to modify question")
    all_dataset = modify_question(all_dataset[:270000])
    print("successfully modified question")
    print("--------------------------------")

    # Save the dataset.
    print("start to save dataset")
    save_dataset(all_dataset, output_path)
    print("successfully saved dataset")
    print("--------------------------------")


if __name__ == "__main__":
    """
    Preprocess the dataset for the training pipelines:
      1. load the msmarco dataset from Hugging Face
      2. rewrite each question with the LLM
      3. save the dataset to a JSON file

    Usage:
        python preprocess_v2.py --output_path /home/firouzi/embedding_model/data/v2/msmarco_train.json
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_path", type=str, required=True)
    args = parser.parse_args()
    main(args.output_path)
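
# ----------------------------------------------------------------------
# Note on the LLM dependency: `data_preprocess.modify_question_model` is
# a project-internal module that is not shown in this file. The script
# relies on a single method, so the contract it assumes is small:
# `modify_question_llm(questions, passages)` takes two parallel lists
# (questions and their positive passages) and returns one rewritten
# question per input, in the same order. The class below is a
# hypothetical stand-in illustrating that contract, useful for dry runs
# of the batching logic without LLM access; it is NOT the real
# implementation.
#
#     class LLMModel:
#         def modify_question_llm(self, questions, passages):
#             # Identity rewrite: echo each question back unchanged.
#             return list(questions)
# ----------------------------------------------------------------------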