import argparse
import json

from datasets import load_dataset
from tqdm import tqdm

from data_preprocess.remove_false_negative_model import LLMModel
from data_preprocess.generate_random_negative_sample import generate_random_negative_sample

llm_model = LLMModel()


def load_synthetic_dataset(synthetic_train_path, synthetic_queries_path, synthetic_corpus_path):
    """
    Load the synthetic dataset from local jsonl files.

    Returns:
        [{
            "question": "",
            "passage_positive": [],
            "passage_negative": [],
            "passage_negative_random": []
        }]
    """
    dataset_synthetic_scores = []
    with open(synthetic_train_path, 'r', encoding='utf-8') as f:
        for line in f:
            dataset_synthetic_scores.append(json.loads(line))

    dataset_synthetic_queries = {}
    with open(synthetic_queries_path, 'r', encoding='utf-8') as f:
        for line in f:
            json_data = json.loads(line)
            dataset_synthetic_queries[json_data['_id']] = json_data

    dataset_synthetic_corpus = {}
    with open(synthetic_corpus_path, 'r', encoding='utf-8') as f:
        for line in f:
            json_data = json.loads(line)
            dataset_synthetic_corpus[json_data['_id']] = json_data

    # Group the scored (query, passage) pairs into records with question,
    # passage_positive, passage_negative, and passage_negative_random fields.
    all_dataset = {}
    for data_topic in dataset_synthetic_scores:
        query_id = data_topic['query-id']
        # 'corpus-id' is stored as a string in train.jsonl, while the corpus
        # '_id' keys are integers, so convert before the lookup.
        corpus_id = int(data_topic['corpus-id'])
        score = data_topic['score']
        if (query_id in dataset_synthetic_queries) and (corpus_id in dataset_synthetic_corpus):
            passage_text = dataset_synthetic_corpus[corpus_id]['text']
            if query_id not in all_dataset:
                all_dataset[query_id] = {
                    'question': dataset_synthetic_queries[query_id]['text'],
                    'passage_positive': [],
                    'passage_negative': [],
                    'passage_negative_random': [],
                }
            # The score is stored as a string: "1" marks a positive pair,
            # anything else is treated as a negative.
            if score == "1":
                all_dataset[query_id]['passage_positive'].append(passage_text)
            else:
                all_dataset[query_id]['passage_negative'].append(passage_text)
    return list(all_dataset.values())
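
# Illustrative sketch of the expected input lines, inferred from the field
# names used in load_synthetic_dataset above. The values are placeholders,
# not taken from the real files:
#   queries.jsonl: {"_id": "q_0", "text": "..."}        # query ids: strings
#   corpus.jsonl:  {"_id": 17, "text": "..."}           # corpus ids: integers
#   train.jsonl:   {"query-id": "q_0", "corpus-id": "17", "score": "1"}
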
def load_pquad_dataset():
    """
    Load the PQuAD dataset from Hugging Face.

    Returns:
        [{
            "question": "",
            "passage_positive": [],
            "passage_negative": [],
            "passage_negative_random": []
        }]
    """
    dataset = load_dataset("Gholamreza/pquad", trust_remote_code=True)
    all_dataset = []
    for data in dataset["train"]:
        # Keep only answerable questions (unanswerable ones have no answer text).
        if len(data["answers"]["text"]) > 0:
            all_dataset.append({
                'question': data['question'],
                'passage_positive': [data['context']],
                'passage_negative': [],
                'passage_negative_random': [],
            })
    return all_dataset


def remove_false_negative(dataset):
    """
    Remove false negative samples from a dataset by asking the LLM whether each
    (question, negative passage) pair is actually relevant.

    Args:
        dataset: list of dicts

    Returns:
        dataset: list of dicts
    """
    negative_count_all = 0
    negative_count_removed = 0
    len_dataset = len(dataset)
    batch_size = 100
    for i in tqdm(range(0, len_dataset, batch_size)):
        # Flatten the (question, negative passage) pairs of this batch.
        question_list = []
        passage_negative_list = []
        for idx in range(i, min(i + batch_size, len_dataset)):
            for passage in dataset[idx]['passage_negative']:
                question_list.append(dataset[idx]['question'])
                passage_negative_list.append(passage)
        # The model returns "1" for a false negative (the passage actually
        # answers the question) and "0" for a true negative.
        results = llm_model.remove_false_negative_llm(question_list, passage_negative_list)
        negative_count_removed += sum(1 for r in results if r == "1")
        negative_count_all += len(results)
        # Keep only the passages the model judged to be true negatives.
        count = 0
        for idx in range(i, min(i + batch_size, len_dataset)):
            new_negative_list = []
            for passage_id in range(len(dataset[idx]['passage_negative'])):
                if results[count] == "0":
                    new_negative_list.append(dataset[idx]['passage_negative'][passage_id])
                count += 1
            dataset[idx]['passage_negative'] = new_negative_list
    print(f"removed {negative_count_removed} false negative samples from {negative_count_all} samples")
    print("--------------------------------")
    return dataset


def save_dataset(dataset, output_path):
    """
    Save the dataset to a json file.

    Args:
        dataset: list of dicts
        output_path: path to save the dataset to
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)


def main(output_path):
    # load the synthetic dataset
    print("--------------------------------")
    print("loading synthetic dataset")
    synthetic_train_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/train.jsonl"
    synthetic_corpus_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/corpus.jsonl"
    synthetic_queries_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/queries.jsonl"
    synthetic_dataset = load_synthetic_dataset(synthetic_train_path, synthetic_queries_path, synthetic_corpus_path)
    print(f"synthetic dataset loaded: {len(synthetic_dataset)} samples")
    print("--------------------------------")

    # load the pquad dataset
    print("loading pquad dataset")
    pquad_dataset = load_pquad_dataset()
    print(f"pquad dataset loaded: {len(pquad_dataset)} samples")
    print("--------------------------------")

    # remove false negative samples from the synthetic dataset
    print("starting to remove false negative samples from the synthetic dataset")
    synthetic_dataset = remove_false_negative(synthetic_dataset)
    print("successfully removed false negative samples from the synthetic dataset")
    print("--------------------------------")

    # remove false negative samples from the pquad dataset
    print("starting to remove false negative samples from the pquad dataset")
    pquad_dataset = remove_false_negative(pquad_dataset)
    print("successfully removed false negative samples from the pquad dataset")
    print("--------------------------------")

    # merge the synthetic and pquad datasets
    print("starting to merge the synthetic and pquad datasets")
    all_dataset = synthetic_dataset + pquad_dataset
    print("successfully merged the synthetic and pquad datasets")
    print("--------------------------------")

    # generate random negative samples
    print("starting to generate random negative samples")
    all_dataset = generate_random_negative_sample(all_dataset)
    print("successfully generated random negative samples")
    print("--------------------------------")

    # save the dataset
    print("starting to save the dataset")
    save_dataset(all_dataset, output_path)
    print("successfully saved the dataset")
    print("--------------------------------")


if __name__ == "__main__":
    """
    Preprocess the dataset for the training pipelines:
        1. load the synthetic dataset from local jsonl files
        2. load the pquad dataset from huggingface
        3. remove false negative samples from the synthetic dataset
        4. remove false negative samples from the pquad dataset
        5. merge the synthetic and pquad datasets
        6. generate random negative samples
        7. save the dataset to a json file

    Usage:
        python preprocess_v1.py --output_path /home/firouzi/embedding_model/data/train.json
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_path", type=str, required=True)
    args = parser.parse_args()
    main(args.output_path)
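
# Illustrative shape of one record in the saved output JSON, assembled from
# the docstrings above. The values are placeholders, not real data:
# {
#     "question": "...",
#     "passage_positive": ["..."],
#     "passage_negative": ["..."],
#     "passage_negative_random": ["..."]
# }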