# embedding_model/data_preprocess/preprocess_v3.py
import argparse
import json
import time

from datasets import load_dataset
from tqdm import tqdm

from data_preprocess.modify_question_model import LLMModel

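# Module-level LLM client shared by all batches. Based on the call site in
# modify_question below, modify_question_llm is assumed to take parallel lists
# of questions and positive passages and return the rewritten questions in the
# same order.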
llm_model = LLMModel()


def load_msmarco_dataset():
    """
    Load the MCINext/msmarco-fa dataset from Hugging Face.

    Output:
        [{
            "question": "",
            "passage_positive": [],
            "new_question": "",
            "passage_negative": [],
            "passage_negative_random": []
        }]
    """
    print("start loading msmarco dataset")
    name = "MCINext/msmarco-fa"
    dataset_qrel = load_dataset(name)["train"]
    print("start loading corpus")
    dataset_corpus_list = load_dataset(name, data_files="corpus.jsonl")["train"]
    # Map corpus ids to passage texts for fast lookup while joining qrels.
    dataset_corpus = {}
    for data in dataset_corpus_list:
        dataset_corpus[str(data["_id"])] = data["text"]
    print("start loading queries")
    dataset_queries_list = load_dataset(name, data_files="queries.jsonl")["train"]
    # Map query ids to question texts.
    dataset_queries = {}
    for data in dataset_queries_list:
        dataset_queries[str(data["_id"])] = data["text"]
    dataset = []
    print("start creating dataset")
    # Join each qrel row with its query and positive passage; ids are
    # normalized to str to match the lookup tables, and rows missing from
    # either table are skipped.
    for data in tqdm(dataset_qrel):
        query_id = str(data["query-id"])
        corpus_id = str(data["corpus-id"])
        if query_id in dataset_queries and corpus_id in dataset_corpus:
            dataset.append({
                "question": dataset_queries[query_id],
                "passage_positive": [dataset_corpus[corpus_id]],
                "new_question": "",
                "passage_negative": [],
                "passage_negative_random": [],
            })
    print(f"length of dataset: {len(dataset)}")
    print("--------------------------------")
    return dataset


def modify_question(dataset):
    """
    Rewrite the question of every sample with the LLM, in batches.

    Args:
        dataset: list of dicts
    Returns:
        dataset: list of dicts with "new_question" filled in
    """
    len_dataset = len(dataset)
    batch_size = 50
    for i in tqdm(range(0, len_dataset, batch_size)):
        # Collect one batch of questions and their first positive passages.
        question_list = []
        passage_positive_list = []
        for idx in range(i, min(i + batch_size, len_dataset)):
            question_list.append(dataset[idx]["question"])
            passage_positive_list.append(dataset[idx]["passage_positive"][0])
        results = llm_model.modify_question_llm(question_list, passage_positive_list)
        # Brief pause between batches to avoid hammering the LLM endpoint.
        time.sleep(2)
        # Write the rewritten questions back in batch order.
        count = 0
        for idx in range(i, min(i + batch_size, len_dataset)):
            dataset[idx]["new_question"] = results[count]
            count += 1
    print("successfully modified question")
    print("--------------------------------")
    return dataset


def save_dataset(dataset, output_path):
    """
    Save the dataset to a JSON file.

    Args:
        dataset: list of dicts
        output_path: path to save the dataset to
    """
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)


def main(output_path):
    # Load the msmarco dataset.
    print("--------------------------------")
    all_dataset = load_msmarco_dataset()
    print(f"msmarco dataset loaded : {len(all_dataset)} samples")
    print("--------------------------------")
    # Rewrite questions with the LLM; only the first 270,000 samples are kept
    # and sent through the model.
    print("start to modify question")
    all_dataset = modify_question(all_dataset[:270000])
    print("successfully modified question")
    print("--------------------------------")
    # Save the dataset.
    print("start to save dataset")
    save_dataset(all_dataset, output_path)
    print("successfully saved dataset")
    print("--------------------------------")


if __name__ == "__main__":
    """
    Preprocess the dataset for training.

    Pipeline:
        load the msmarco dataset from Hugging Face
        rewrite each question with the LLM
        save the dataset to a JSON file

    Usage:
        python preprocess_v3.py --output_path /home/firouzi/embedding_model/data/v2/msmarco_train.json
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_path", type=str, required=True)
    args = parser.parse_args()
    output_path = args.output_path
    main(output_path)