import json

from datasets import load_dataset

# Relevance judgements (query-id / corpus-id pairs) from the test split of
# the Persian SciDocs retrieval dataset.
dataset_qrel = load_dataset("MCINext/scidocs-fa-v2")["test"]

# Corpus documents, keyed by their "_id".
dataset_corpus_list = load_dataset("MCINext/scidocs-fa-v2", data_files="corpus.jsonl")["train"]
dataset_corpus = {}
for data in dataset_corpus_list:
    dataset_corpus[data["_id"]] = data["text"]

# Queries, keyed by their "_id".
dataset_queries_list = load_dataset("MCINext/scidocs-fa-v2", data_files="queries.jsonl")["train"]
dataset_queries = {}
for data in dataset_queries_list:
    dataset_queries[data["_id"]] = data["text"]

# Join the qrels with the query and corpus texts to build
# (question, positive passage) pairs; negative passages are left empty here.
dataset = []
print("start creating dataset")
for data in dataset_qrel:
    if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
        dataset.append({
            "question": dataset_queries[data["query-id"]],
            "passage_positive": [dataset_corpus[data["corpus-id"]]],
            "passage_negative": [],
            "passage_negative_random": [],
        })

print(f"length of dataset: {len(dataset)}")

# Write the pairs as pretty-printed JSON; ensure_ascii=False keeps the
# Persian text readable, and utf-8 makes that safe regardless of locale.
with open("./research_notebook/data/scidocs/scidocs_v2.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=4, ensure_ascii=False)
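
# --- Optional sanity check (a minimal sketch, not part of the original script) ---
# Reload the dump and print one record to confirm the structure; this assumes
# the same output path used above and that the script has already been run.
with open("./research_notebook/data/scidocs/scidocs_v2.json", encoding="utf-8") as f:
    reloaded = json.load(f)
print(f"reloaded {len(reloaded)} records")
if reloaded:
    print(json.dumps(reloaded[0], ensure_ascii=False, indent=2))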