add modify question

saeedfirouzi 2025-11-16 15:30:36 +00:00
parent c06572dedb
commit 0ec2c4d8ce
13 changed files with 5414 additions and 1277 deletions

View File

@@ -8,12 +8,12 @@ from data_preprocess.text_embedder import TextEmbedder
THRESHOLD_MULTIPLY = 0.95
RANDOM_NEGATIVE_COUNT = 6
batch_size = 100
batch_size = 1000
text_embedder = TextEmbedder()
def generate_random_negative_sample(all_dataset):
def generate_random_negative_sample(all_dataset, corpus_list=[]):
"""
generate random negative sample from dataset
Args:
@@ -35,7 +35,7 @@ def generate_random_negative_sample(all_dataset):
for id in range(i, min(i + batch_size, len_dataset)):
question_list.append(all_dataset[id]['question'])
question_embeddings = text_embedder.embed_texts(question_list)
question_embeddings = text_embedder.embed_texts(question_list, do_preprocess=False, convert_to_numpy=False)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
@@ -52,7 +52,7 @@ def generate_random_negative_sample(all_dataset):
for passage in all_dataset[id]['passage_positive']:
passage_positive_list.append(passage)
passage_positive_embeddings = text_embedder.embed_texts(passage_positive_list)
passage_positive_embeddings = text_embedder.embed_texts(passage_positive_list, do_preprocess=False, convert_to_numpy=False)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
@@ -71,7 +71,7 @@ def generate_random_negative_sample(all_dataset):
for passage in all_dataset[id]['passage_negative']:
passage_negative_list.append(passage)
passage_negative_embeddings = text_embedder.embed_texts(passage_negative_list)
passage_negative_embeddings = text_embedder.embed_texts(passage_negative_list, do_preprocess=False, convert_to_numpy=False)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
@@ -80,10 +80,18 @@ def generate_random_negative_sample(all_dataset):
all_texts.append(all_dataset[id]['passage_negative'][passage_id])
count += 1
print("calculate corpus embeddings")
# calculate corpus embeddings
for i in tqdm(range(0, len(corpus_list), batch_size)):
corpus_embeddings = text_embedder.embed_texts(corpus_list[i:i+batch_size], do_preprocess=False, convert_to_numpy=False)
all_embeddings.extend(corpus_embeddings)
all_texts.extend(corpus_list[i:i+batch_size])
############ Create FAISS index ############
all_embeddings = np.array(all_embeddings, dtype=np.float32)
dim = all_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
# index = faiss.IndexFlatIP(dim)
index = faiss.IndexHNSWFlat(dim, 32, faiss.METRIC_INNER_PRODUCT)
faiss.normalize_L2(all_embeddings)
index.add(all_embeddings)
@@ -94,15 +102,16 @@ def generate_random_negative_sample(all_dataset):
question_embeddings = all_dataset_embeddings[id]['question_embedding']
question_embeddings_normalized = np.array([question_embeddings], dtype=np.float32)
faiss.normalize_L2(question_embeddings_normalized)
passage_positive_embeddings = all_dataset_embeddings[id]['passage_positive_embedding'][0]
# passage_positive_embeddings = all_dataset_embeddings[id]['passage_positive_embedding'][0]
score_question_passage_positive = np.dot(question_embeddings, passage_positive_embeddings)
# score_question_passage_positive = np.dot(question_embeddings, passage_positive_embeddings)
num_retrieved = 30
num_retrieved = 15
vector_scores, vector_ids = index.search(question_embeddings_normalized, num_retrieved)
for vector_score, vector_id in zip(vector_scores[0], vector_ids[0]):
if (all_texts[vector_id] not in not_valid_passages) and (vector_score < THRESHOLD_MULTIPLY * score_question_passage_positive):
if (all_texts[vector_id] not in not_valid_passages):# and (vector_score < THRESHOLD_MULTIPLY * score_question_passage_positive):
all_dataset[id]['passage_negative_random'].append(all_texts[vector_id])
not_valid_passages.append(all_texts[vector_id])
if len(all_dataset[id]['passage_negative_random']) >= RANDOM_NEGATIVE_COUNT:
break
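For reference, a minimal self-contained sketch of the HNSW inner-product indexing pattern the hunks above switch to (IndexFlatIP → IndexHNSWFlat); the embedding dimension and the random vectors are placeholders, not project data.

import numpy as np
import faiss

dim = 768  # placeholder embedding dimension
corpus_vectors = np.random.rand(10000, dim).astype(np.float32)
query_vectors = np.random.rand(4, dim).astype(np.float32)

# IndexHNSWFlat(dim, M) is an approximate graph index; with METRIC_INNER_PRODUCT
# and L2-normalized vectors its scores approximate the cosine similarities that
# the exact IndexFlatIP it replaces would return.
index = faiss.IndexHNSWFlat(dim, 32, faiss.METRIC_INNER_PRODUCT)
faiss.normalize_L2(corpus_vectors)
index.add(corpus_vectors)

faiss.normalize_L2(query_vectors)
scores, ids = index.search(query_vectors, 15)  # num_retrieved = 15, as in the hunk
print(scores.shape, ids.shape)  # (4, 15) (4, 15)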

View File

@@ -0,0 +1,103 @@
from typing import List, Dict, Any
import json
import asyncio
import aiohttp
import time
import re
model_url = "http://192.168.130.206:4001/v1"
model = "google/gemma-3-27b-it"
class LLMModel:
def __init__(self):
self.instruction = """
You are a helpful assistant that helps me modify the input question.
I will give you a question and its text, and you must replace the words of the question with synonyms or similar words.
## Important:
- Replace the words of the question with synonyms or similar words.
- The question must be in Persian.
Return only the question, nothing else.
"""
async def run_llm(self, session, question, text):
"""
Run the llm model.
Args:
session: The session to use for the request.
question: The question to rewrite.
text: The passage the question is about.
Returns:
The modified question.
"""
headers = {"Content-Type": "application/json"}
input_message = f"""{{"question": "{question}", "text": "{text}"}}"""
messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]
payload = {
"model": model,
"messages": messages,
"max_tokens": 100
}
try:
async with session.post(model_url + "/chat/completions", headers=headers, json=payload) as resp:
resp.raise_for_status()
response = await resp.json()
result = response['choices'][0]['message']['content']
print(f"question: {question}")
print(f"result: {result}")
print("--------------------------------")
return result
except Exception as e:
try:
print(f"Error in llm model {response}: {e}")
except:
print(f"Error in llm model: {e}")
return ""
async def run_llm_async(self, question_list, text_list):
"""
Send all requests concurrently.
Args:
question_list: The list of questions.
text_list: The list of texts.
Returns:
The list of results.
"""
async with aiohttp.ClientSession() as session:
tasks = [self.run_llm(session, question, text) for question, text in zip(question_list, text_list)]
results = await asyncio.gather(*tasks)
return results
def modify_question_llm(self, query_list: List[str], text_list: List[str]) -> List[Dict[str, Any]]:
"""
Modify the questions based on their corresponding texts using the LLM model.
Args:
query_list: The list of queries.
text_list: The list of texts.
Returns:
The list of modified questions.
"""
if not text_list:
return []
start_time = time.time()
results = asyncio.run(self.run_llm_async(query_list, text_list))
end_time = time.time()
# print(f"Time taken for llm model: {end_time - start_time} seconds")
return results
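A hypothetical usage sketch for the class above; the import path follows how the other scripts in this commit import it, the strings are invented placeholders, and the chat endpoint at model_url must be reachable for the call to return anything.

from data_preprocess.modify_question_model import LLMModel  # assumed module name

llm = LLMModel()
questions = ["جنگ جهانی اول در چه تاریخی پایان یافت؟"]  # sample question
passages = ["متن مرتبط با پرسش"]  # placeholder positive passage
rewritten = llm.modify_question_llm(questions, passages)
print(rewritten)  # one paraphrased Persian question per input, "" on errors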

View File

@@ -153,9 +153,9 @@ def main(output_path):
#load synthetic dataset
print("--------------------------------")
print("loading synthetic dataset")
synthetic_train_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/train.jsonl"
synthetic_corpus_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/corpus.jsonl"
synthetic_queries_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/queries.jsonl"
synthetic_train_path = "/home/firouzi/embedding_model/research_notebook/data/synthetic-persian-qa-retrieval/train.jsonl"
synthetic_corpus_path = "/home/firouzi/embedding_model/research_notebook/data/synthetic-persian-qa-retrieval/corpus.jsonl"
synthetic_queries_path = "/home/firouzi/embedding_model/research_notebook/data/synthetic-persian-qa-retrieval/queries.jsonl"
synthetic_dataset = load_synthetic_dataset(synthetic_train_path, synthetic_queries_path, synthetic_corpus_path)
print(f"synthetic dataset loaded : {len(synthetic_dataset)} samples")
@@ -173,11 +173,11 @@ def main(output_path):
print(f"successfully merged synthetic and pquad dataset")
print("--------------------------------")
# removing false negative samples from all dataset
print("start to remove false negative samples from all dataset")
all_dataset = remove_false_negative(all_dataset, random_negative_sample=False)
print(f"successfully removed false negative samples from all dataset")
print("--------------------------------")
# # removing false negative samples from all dataset
# print("start to remove false negative samples from all dataset")
# all_dataset = remove_false_negative(all_dataset, random_negative_sample=False)
# print(f"successfully removed false negative samples from all dataset")
# print("--------------------------------")
# with open("/home/firouzi/embedding_model/data/train.json", "r", encoding="utf-8") as f:
# all_dataset = json.load(f)

View File

@@ -0,0 +1,160 @@
import argparse
from datasets import load_dataset
import json
from tqdm import tqdm
from data_preprocess.remove_false_negative_model import LLMModel
from data_preprocess.generate_random_negative_sample import generate_random_negative_sample
llm_model = LLMModel()
def load_msmarco_dataset():
"""
load msmarco dataset from huggingface
output:
[{
"question": "",
"passage_positive": [],
"passage_negative": [],
"passage_negative_random": []
}]
"""
print("start loading msmarco dataset")
name = "MCINext/msmarco-fa"
dataset_qrel = load_dataset(name)["train"]
print("start loading corpus")
dataset_corpus_list = load_dataset(name,data_files="corpus.jsonl")["train"]
dataset_corpus = {}
for data in dataset_corpus_list:
dataset_corpus[str(data["_id"])] = data["text"]
print("start loading queries")
dataset_queries_list = load_dataset(name,data_files="queries.jsonl")["train"]
dataset_queries = {}
for data in dataset_queries_list:
dataset_queries[str(data["_id"])] = data["text"]
dataset = []
print("start creating dataset")
for data in tqdm(dataset_qrel):
if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
dataset.append({
"question": dataset_queries[data["query-id"]],
"passage_positive": [dataset_corpus[data["corpus-id"]]],
"passage_negative": [],
"passage_negative_random": [],
})
print(f"length of dataset: {len(dataset)}")
print("--------------------------------")
return dataset, list(dataset_corpus.values())
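Roughly, the three splits joined above look like the rows below; only _id, text, query-id and corpus-id are actually read by the code, and the remaining field is assumed from the usual BEIR-style layout.

qrel_row = {"query-id": "q1", "corpus-id": "d7", "score": 1}  # one row of dataset_qrel (score assumed)
corpus_row = {"_id": "d7", "text": "متن سند"}  # one row of corpus.jsonl
query_row = {"_id": "q1", "text": "متن پرسش"}  # one row of queries.jsonl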
def remove_false_negative(dataset, random_negative_sample=False):
"""
remove false negative samples from the dataset
Args:
dataset: list of dicts
Returns:
dataset: list of dicts
"""
if random_negative_sample:
negative_name = "passage_negative_random"
else:
negative_name = "passage_negative"
# calculate passage negative embeddings
negative_count_all = 0
negative_count_removed = 0
len_dataset = len(dataset)
batch_size = 50
for i in tqdm(range(0, len_dataset, batch_size)):
question_list = []
passage_negative_list = []
for id in range(i, min(i + batch_size, len_dataset)):
for passage in dataset[id][negative_name]:
question_list.append(dataset[id]['question'])
passage_negative_list.append(passage)
results = llm_model.remove_false_negative_llm(question_list, passage_negative_list)
negative_count_removed += len([_ for _ in results if _ == "1"])
negative_count_all += len(results)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
new_negative_list = []
for passage_id in range(len(dataset[id][negative_name])):
if results[count] == "0":
new_negative_list.append(dataset[id][negative_name][passage_id])
count += 1
dataset[id][negative_name] = new_negative_list
print(f"removed {negative_count_removed} false negative samples from {negative_count_all} samples")
print("--------------------------------")
return dataset
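A toy illustration (not project code) of the flatten/label/unflatten pattern used above: negatives from a batch are flattened into one list, judged by the LLM, and the "0" (keep) verdicts are mapped back to each sample with a running counter.

batch = [
    {"passage_negative": ["n1", "n2"]},
    {"passage_negative": ["n3"]},
]
labels = ["0", "1", "0"]  # pretend LLM verdicts; "1" flags a false negative
count = 0
for sample in batch:
    kept = []
    for passage in sample["passage_negative"]:
        if labels[count] == "0":
            kept.append(passage)
        count += 1
    sample["passage_negative"] = kept
print(batch)  # "n2" is dropped, "n1" and "n3" survive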
def save_dataset(dataset, output_path):
"""
save dataset to json file
Args:
dataset: list of dicts
output_path: path to save dataset
"""
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(dataset, f, ensure_ascii=False, indent=4)
def main(output_path):
#load msmarco dataset
print("--------------------------------")
all_dataset, corpus_list = load_msmarco_dataset()
print(f"msmarco dataset loaded : {len(all_dataset)} samples")
print("--------------------------------")
#generate random negative samples
print("start to generate random negative samples")
all_dataset = generate_random_negative_sample(all_dataset, corpus_list)
print(f"successfully generated random negative samples")
print("--------------------------------")
# removing random false negative samples from all dataset
print("start to remove random false negative samples from all dataset")
all_dataset = remove_false_negative(all_dataset, random_negative_sample=True)
print(f"successfully removed random false negative samples from all dataset")
print("--------------------------------")
# save dataset
print("start to save dataset")
save_dataset(all_dataset, output_path)
print(f"successfully saved dataset")
print("--------------------------------")
if __name__ == "__main__":
"""
preprocess dataset for training
pipelines:
load msmarco dataset from huggingface
generate random negative samples
remove random false negative samples with the llm model
save dataset to json file
python preprocess_v2.py --output_path /home/firouzi/embedding_model/data/v2/msmarco_train.json
"""
parser = argparse.ArgumentParser()
parser.add_argument("--output_path", type=str, required=True)
args = parser.parse_args()
output_path = args.output_path
main(output_path)

View File

@@ -0,0 +1,142 @@
import argparse
from datasets import load_dataset
import json
from tqdm import tqdm
import time
from data_preprocess.modify_question_model import LLMModel
llm_model = LLMModel()
def load_msmarco_dataset():
"""
load msmarco dataset from huggingface
output:
[{
"question": "",
"passage_positive": [],
"passage_negative": [],
"passage_negative_random": []
}]
"""
print("start loading msmarco dataset")
name = "MCINext/msmarco-fa"
dataset_qrel = load_dataset(name)["train"]
print("start loading corpus")
dataset_corpus_list = load_dataset(name,data_files="corpus.jsonl")["train"]
dataset_corpus = {}
for data in dataset_corpus_list:
dataset_corpus[str(data["_id"])] = data["text"]
print("start loading queries")
dataset_queries_list = load_dataset(name,data_files="queries.jsonl")["train"]
dataset_queries = {}
for data in dataset_queries_list:
dataset_queries[str(data["_id"])] = data["text"]
dataset = []
print("start creating dataset")
for data in tqdm(dataset_qrel):
if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
dataset.append({
"question": dataset_queries[data["query-id"]],
"passage_positive": [dataset_corpus[data["corpus-id"]]],
"new_question": "",
"passage_negative": [],
"passage_negative_random": [],
})
print(f"length of dataset: {len(dataset)}")
print("--------------------------------")
return dataset
def modify_question(dataset):
"""
modify the questions of the dataset
Args:
dataset: list of dicts
Returns:
dataset: list of dicts
"""
len_dataset = len(dataset)
batch_size = 50
for i in tqdm(range(0, len_dataset, batch_size)):
question_list = []
passage_positive_list = []
for id in range(i, min(i + batch_size, len_dataset)):
question_list.append(dataset[id]['question'])
passage_positive_list.append(dataset[id]['passage_positive'][0])
results = llm_model.modify_question_llm(question_list, passage_positive_list)
time.sleep(2)
count = 0
for id in range(i, min(i + batch_size, len_dataset)):
dataset[id]["new_question"] = results[count]
count += 1
print(f"successfully modified question")
print("--------------------------------")
return dataset
def save_dataset(dataset, output_path):
"""
save dataset to json file
Args:
dataset: list of dicts
output_path: path to save dataset
"""
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(dataset, f, ensure_ascii=False, indent=4)
def main(output_path):
#load msmarco dataset
print("--------------------------------")
all_dataset = load_msmarco_dataset()
print(f"msmarco dataset loaded : {len(all_dataset)} samples")
print("--------------------------------")
# removing random false negative samples from all dataset
print("start to modify question")
all_dataset = modify_question(all_dataset[:270000])
print(f"successfully modified question")
print("--------------------------------")
# save dataset
print("start to save dataset")
save_dataset(all_dataset, output_path)
print(f"successfully saved dataset")
print("--------------------------------")
if __name__ == "__main__":
"""
preprocess dataset for training
pipelines:
load msmarco dataset from huggingface
modify questions with the llm model
save dataset to json file
python preprocess_v2.py --output_path /home/firouzi/embedding_model/data/v2/msmarco_train.json
"""
parser = argparse.ArgumentParser()
parser.add_argument("--output_path", type=str, required=True)
args = parser.parse_args()
output_path = args.output_path
main(output_path)
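After a run, one record of the saved JSON should roughly look like the sketch below; the Persian strings are invented placeholders.

record = {
    "question": "سوال اصلی از queries.jsonl",
    "new_question": "بازنویسی همان سوال با واژه‌های مترادف",  # filled by modify_question
    "passage_positive": ["متن مثبت از corpus.jsonl"],
    "passage_negative": [],
    "passage_negative_random": [],
}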

View File

@@ -102,7 +102,7 @@ class LLMModel:
start_time = time.time()
results = asyncio.run(self.run_llm_async(query_list, text_list))
end_time = time.time()
print(f"Time taken for llm model: {end_time - start_time} seconds")
# print(f"Time taken for llm model: {end_time - start_time} seconds")
return results

File diff suppressed because it is too large

View File

@@ -3,6 +3,7 @@ import requests
import numpy as np
from dotenv import load_dotenv
import os
import time
load_dotenv()
@@ -20,21 +21,26 @@ class TextEmbedder:
return text
def embed_texts(self, texts:list[str])->list[list[float]]:
def embed_texts(self, texts:list[str], do_preprocess=True, convert_to_numpy=True)->list[list[float]]:
"""
Embed texts using the model.
"""
if texts == []:
return []
texts = [self.preprocess_embedder(text) for text in texts]
if do_preprocess:
texts = [self.preprocess_embedder(text) for text in texts]
payload = {
"model": self.model_name,
"input": texts
}
responses = requests.post("http://78.38.161.78:3094/v1/embeddings", headers=self.headers, json=payload)
embeddings = [np.array(response["embedding"], dtype=np.float32) for response in responses.json()["data"]]
if convert_to_numpy:
embeddings = [np.array(response["embedding"], dtype=np.float32) for response in responses.json()["data"]]
else:
embeddings = [response["embedding"] for response in responses.json()["data"]]
return embeddings
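A hypothetical call with the new flags; the embedding endpoint hard-coded in embed_texts must be reachable, and the text is a placeholder.

from data_preprocess.text_embedder import TextEmbedder

embedder = TextEmbedder()
# Skipping preprocessing and the numpy conversion returns the raw float lists,
# which is cheaper when the vectors are only buffered and converted to float32 once later.
vectors = embedder.embed_texts(["متن نمونه"], do_preprocess=False, convert_to_numpy=False)
print(type(vectors[0]))  # list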

View File

@@ -6,15 +6,17 @@ from tqdm import tqdm
names = ["MCINext/FEVER_FA_test_top_250_only_w_correct-v2", "MCINext/fiqa-fa-v2", "MCINext/HotpotQA_FA_test_top_250_only_w_correct-v2",
"MCINext/MSMARCO_FA_test_top_250_only_w_correct-v2", "MCINext/NQ_FA_test_top_250_only_w_correct-v2", "MCINext/quora-fa-v2", "MCINext/scifact-fa-v2",
"MCINext/synthetic-persian-chatbot-rag-faq-retrieval", "MCINext/synthetic-persian-qa-retrieval", "MCINext/trec-covid-fa-v2"]
names = names[3:4]
for name in tqdm(names):
print(f"loading {name}")
dataset_qrel = load_dataset(name)["test"]
dataset_corpus_list = load_dataset(name,data_files="corpus.jsonl")["train"]
dataset_corpus_list = load_dataset(name,data_files="corpus/corpus.jsonl")["train"]
dataset_corpus = {}
for data in dataset_corpus_list:
dataset_corpus[data["_id"]] = data["text"]
dataset_queries_list = load_dataset(name,data_files="queries.jsonl")["train"]
dataset_queries_list = load_dataset(name,data_files="queries/queries.jsonl")["train"]
dataset_queries = {}
for data in dataset_queries_list:
dataset_queries[data["_id"]] = data["text"]

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "a78759c8",
"metadata": {},
"outputs": [
@@ -11,11 +11,7 @@
"output_type": "stream",
"text": [
"/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/datasets/load.py:1461: FutureWarning: The repository for Gholamreza/pquad contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/Gholamreza/pquad\n",
"You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
"Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
" warnings.warn(\n"
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
@@ -27,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "c91f659a",
"metadata": {},
"outputs": [
@@ -54,7 +50,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"id": "d66809ce",
"metadata": {},
"outputs": [
@@ -62,12 +58,12 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'question': 'جنگ جهانی اول در چه تاریخی پایان یافت؟', 'passgae_positive': [], 'passgae_negative': ['در سال ۱۸۷۱ امپراتوری آلمان با اتحاد پروس و کنفدراسیون جرمن شمالی توسط اتو ون بیسمارک به وجود آمد. این کشور قدرتمند تا سال ۱۹۱۸ ادامه یافت و با عنوان رایش دوم مشهور شد. بیسمارک توانست استان\\u200cهای جدید زیادی را طی جنگ\\u200cهای مبتکرانهٔ کوتاه و دیپلماتیک به دست آورد. او با اتریش هم پیمان شد تا دانمارک را شکست دهد و ناحیهٔ شلزویگ-هولشتاین را تصرف کند. او جنگ اتریش و پروس (آسترو-پروسیان) را آغاز کرد و پیروز شد اما اینکار فقط برای این بود که ایتالیا طرف آلمان را بگیرد. سپس پروس وارد جنگ فرانسه و پروس (فرانکو-پروسین) (۷۱-۱۸۷۰) شد و توانست شکست کاملی به فرانسه وارد سازد. ویلهلم اول به عنوان آخرین توهین به فرانسوی\\u200cها در کاخ ورسای در قلب فرانسه به عنوان امپراتور آلمان سوگند خورد. امپراتوری آلمان تا پایان جنگ جهانی اول یعنی زمانی که فرانسه توانست در پیمان ورسای تلافی بکند در اوج خود بود.']}\n"
"{'question': 'سام میرزا در چه تاریخی توسط نادرشاه دستگیر شد؟', 'passage_positive': ['این ضربت سخت خیال نادر را پریشان کرد و رضا قلی میرزا را که در رکاب بود در طهران گذاشت و خود به داغستان رفت در این سفر اگرچه بعضی از رؤسای طوایف لزکی از در اطاعت درآمدند لیکن غالب سکنه داغستان به قلل جبال پرارتفاع پناه گرفتند و از هر طرف به تعرّض اردوی نادر دست زدند و لطمات بسیار به ایشان وارد آوردند حتّی موقعی به خیمه خود نادر نیز تعرّض رساندند. در رمضان ۱۱۵۴ موقعیکه نادر هنوز در داغستان بود غلامی را که مرتکب انداختن تیر در جنگل سوادکوه شده بود بخدمت او آوردند. نادر او را کور کرد. شخصی بنام سام میرزا که به ادّعای فرزندی شاه سلطان حسین در آذربایجان به سلطنت طلبی برخاسته و محمّد خان پسر سرخای خان لزگی و خوانین دربند و داغستان را با خود همدست نموده بود. نادر توسّط نصر اللّه میرزا و چند تن از سرداران خود انقلاب این حدود را بالاخره خواباند و سام میرزا در ذی\\u200cالقعده ۱۱۵۶ دستگیر گردید.'], 'passage_negative': [], 'passage_negative_random': []}\n"
]
}
],
"source": [
"print(all_dataset[1240])"
"print(all_dataset[1241])"
]
},
{

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

View File

@@ -58,7 +58,7 @@ def main(add_prompt, lora):
learning_rate=2e-5,
warmup_ratio=0.05,
logging_steps=10,
report_to="tensorboard",
report_to="mlflow",
save_steps=10000,
save_total_limit=2,
)
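Switching report_to to "mlflow" relies on the transformers MLflow callback, which takes its configuration from environment variables; a minimal sketch, assuming a local tracking server (the URI and experiment name are placeholders).

import os

os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"  # placeholder tracking server
os.environ["MLFLOW_EXPERIMENT_NAME"] = "embedding_model_sft"  # placeholder experiment name
# os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "1"  # optionally log checkpoints as artifacts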