"""Export the MCINext/scidocs-fa-v2 retrieval data as one flat JSON file.

The script joins the relevance judgements (the dataset's ``test`` split) with
``queries.jsonl`` and ``corpus.jsonl`` and writes a list of records of the
form::

    {
        "question": <query text>,
        "passage_positive": [<corpus text>],
        "passage_negative": [],
        "passage_negative_random": [],
    }

The two negative lists are always written empty by this script.
"""
import json
from pathlib import Path

DATASET_NAME = "MCINext/scidocs-fa-v2"
OUTPUT_PATH = Path("./research_notebook/data/scidocs/scidocs_v2.json")


def build_text_lookup(rows) -> dict:
    """Return a dict mapping each row's ``_id`` to its ``text``.

    Args:
        rows: Iterable of mappings that each have ``"_id"`` and ``"text"``.

    Returns:
        ``{_id: text}``. If an ``_id`` occurs more than once, the last row
        wins (same as assigning into the dict row by row).
    """
    return {row["_id"]: row["text"] for row in rows}


def build_dataset(qrels, queries, corpus) -> list:
    """Join relevance rows with query and passage texts.

    Args:
        qrels: Iterable of mappings with ``"query-id"`` and ``"corpus-id"``.
        queries: Mapping from query id to query text.
        corpus: Mapping from corpus id to passage text.

    Returns:
        One record per qrel row whose query id *and* corpus id are both
        known, in the order the rows were given. Rows referencing an unknown
        id are skipped silently.
    """
    # NOTE(review): no relevance score is consulted here -- every qrel row is
    # treated as a positive pair. Confirm that is intended for this dataset.
    records = []
    for rel in qrels:
        query_id = rel["query-id"]
        corpus_id = rel["corpus-id"]
        if query_id not in queries or corpus_id not in corpus:
            continue
        records.append({
            "question": queries[query_id],
            "passage_positive": [corpus[corpus_id]],
            "passage_negative": [],
            "passage_negative_random": [],
        })
    return records


def write_dataset(records, path) -> None:
    """Write ``records`` to ``path`` as indented JSON.

    Args:
        records: JSON-serialisable list of records.
        path: Destination file; missing parent directories are created.
    """
    path = Path(path)
    # Create the target directory up front so a missing folder does not
    # discard the result of the (slow) download-and-join step.
    path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding
    # must be pinned to UTF-8 rather than left to the platform default.
    with path.open("w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)


def main() -> None:
    """Download the dataset parts, join them and write the JSON export."""
    # Imported here so the pure helpers above stay importable without the
    # third-party ``datasets`` package installed.
    from datasets import load_dataset

    dataset_qrel = load_dataset(DATASET_NAME)["test"]
    dataset_corpus = build_text_lookup(
        load_dataset(DATASET_NAME, data_files="corpus.jsonl")["train"]
    )
    dataset_queries = build_text_lookup(
        load_dataset(DATASET_NAME, data_files="queries.jsonl")["train"]
    )

    print("start creating dataset")
    dataset = build_dataset(dataset_qrel, dataset_queries, dataset_corpus)
    print(f"length of dataset: {len(dataset)}")

    write_dataset(dataset, OUTPUT_PATH)


if __name__ == "__main__":
    main()