add modify question
Commit: 0ec2c4d8ce
Parent: c06572dedb
@@ -8,12 +8,12 @@ from data_preprocess.text_embedder import TextEmbedder
 
 THRESHOLD_MULTIPLY = 0.95
 RANDOM_NEGATIVE_COUNT = 6
-batch_size = 100
+batch_size = 1000
 
 text_embedder = TextEmbedder()
 
 
-def generate_random_negative_sample(all_dataset):
+def generate_random_negative_sample(all_dataset, corpus_list=[]):
     """
     generate random negative sample from dataset
     Args:
@@ -35,7 +35,7 @@ def generate_random_negative_sample(all_dataset):
         for id in range(i, min(i + batch_size, len_dataset)):
             question_list.append(all_dataset[id]['question'])
 
-        question_embeddings = text_embedder.embed_texts(question_list)
+        question_embeddings = text_embedder.embed_texts(question_list, do_preprocess=False, convert_to_numpy=False)
 
         count = 0
         for id in range(i, min(i + batch_size, len_dataset)):
@@ -52,7 +52,7 @@ def generate_random_negative_sample(all_dataset):
             for passage in all_dataset[id]['passage_positive']:
                 passage_positive_list.append(passage)
 
-        passage_positive_embeddings = text_embedder.embed_texts(passage_positive_list)
+        passage_positive_embeddings = text_embedder.embed_texts(passage_positive_list, do_preprocess=False, convert_to_numpy=False)
 
         count = 0
         for id in range(i, min(i + batch_size, len_dataset)):
@@ -71,7 +71,7 @@ def generate_random_negative_sample(all_dataset):
             for passage in all_dataset[id]['passage_negative']:
                 passage_negative_list.append(passage)
 
-        passage_negative_embeddings = text_embedder.embed_texts(passage_negative_list)
+        passage_negative_embeddings = text_embedder.embed_texts(passage_negative_list, do_preprocess=False, convert_to_numpy=False)
 
         count = 0
         for id in range(i, min(i + batch_size, len_dataset)):
@@ -80,10 +80,18 @@ def generate_random_negative_sample(all_dataset):
                 all_texts.append(all_dataset[id]['passage_negative'][passage_id])
                 count += 1
 
+    print("calculate corpus embeddings")
+    # calculate corpus embeddings
+    for i in tqdm(range(0, len(corpus_list), batch_size)):
+        corpus_embeddings = text_embedder.embed_texts(corpus_list[i:i+batch_size], do_preprocess=False, convert_to_numpy=False)
+        all_embeddings.extend(corpus_embeddings)
+        all_texts.extend(corpus_list[i:i+batch_size])
+
     ############ Create FAISS index ############
     all_embeddings = np.array(all_embeddings, dtype=np.float32)
     dim = all_embeddings.shape[1]
-    index = faiss.IndexFlatIP(dim)
+    # index = faiss.IndexFlatIP(dim)
+    index = faiss.IndexHNSWFlat(dim, 32, faiss.METRIC_INNER_PRODUCT)
     faiss.normalize_L2(all_embeddings)
     index.add(all_embeddings)
@@ -94,15 +102,16 @@ def generate_random_negative_sample(all_dataset):
         question_embeddings = all_dataset_embeddings[id]['question_embedding']
         question_embeddings_normalized = np.array([question_embeddings], dtype=np.float32)
         faiss.normalize_L2(question_embeddings_normalized)
-        passage_positive_embeddings = all_dataset_embeddings[id]['passage_positive_embedding'][0]
+        # passage_positive_embeddings = all_dataset_embeddings[id]['passage_positive_embedding'][0]
 
-        score_question_passage_positive = np.dot(question_embeddings, passage_positive_embeddings)
+        # score_question_passage_positive = np.dot(question_embeddings, passage_positive_embeddings)
 
-        num_retrieved = 30
+        num_retrieved = 15
         vector_scores, vector_ids = index.search(question_embeddings_normalized, num_retrieved)
         for vector_score, vector_id in zip(vector_scores[0], vector_ids[0]):
-            if (all_texts[vector_id] not in not_valid_passages) and (vector_score < THRESHOLD_MULTIPLY * score_question_passage_positive):
+            if (all_texts[vector_id] not in not_valid_passages):  # and (vector_score < THRESHOLD_MULTIPLY * score_question_passage_positive):
                 all_dataset[id]['passage_negative_random'].append(all_texts[vector_id])
                 not_valid_passages.append(all_texts[vector_id])
 
                 if len(all_dataset[id]['passage_negative_random']) >= RANDOM_NEGATIVE_COUNT:
                     break
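A minimal, self-contained sketch of the retrieval flow these hunks set up: on L2-normalized vectors, inner product equals cosine similarity, so `IndexHNSWFlat` with `METRIC_INNER_PRODUCT` gives approximate cosine-similarity search. Dimensions, data, and variable names below are invented for illustration; only the FAISS calls and the constants (num_retrieved = 15, RANDOM_NEGATIVE_COUNT = 6) mirror the diff.

```python
import numpy as np
import faiss

rng = np.random.default_rng(0)
corpus_vectors = rng.standard_normal((1000, 128)).astype(np.float32)

faiss.normalize_L2(corpus_vectors)                         # in-place row normalization
index = faiss.IndexHNSWFlat(128, 32, faiss.METRIC_INNER_PRODUCT)
index.add(corpus_vectors)

query = rng.standard_normal((1, 128)).astype(np.float32)   # one question embedding
faiss.normalize_L2(query)
scores, ids = index.search(query, 15)                      # num_retrieved = 15 as above

# Keep unseen hits until RANDOM_NEGATIVE_COUNT candidates are collected,
# mirroring the not_valid_passages filtering loop in the hunk above.
negatives, seen = [], set()
for score, idx in zip(scores[0], ids[0]):
    if int(idx) not in seen:
        negatives.append(int(idx))
        seen.add(int(idx))
    if len(negatives) >= 6:
        break
```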
data_preprocess/modify_question_model.py (new file, 103 lines)
@@ -0,0 +1,103 @@
from typing import List, Dict, Any
import json
import asyncio
import aiohttp
import time
import re


model_url = "http://192.168.130.206:4001/v1"
model = "google/gemma-3-27b-it"


class LLMModel:
    def __init__(self):

        self.instruction = """
        You are a helpful assistant that helps me modify and change the input question.
        I will give you a question and its text, and you must replace the words of the question with synonyms or similar words.

        ## Important:
        - Replace the words of the question with synonyms or similar words.

        - The question must be in Persian.

        Return only the question, nothing else.
        """

    async def run_llm(self, session, question, text):
        """
        Run the llm model.
        Args:
            session: The session to use for the request.
            question: The question to rewrite.
            text: The passage that gives context for the question.
        Returns:
            The modified question.
        """
        headers = {"Content-Type": "application/json"}

        input_message = f"""{{"question": "{question}", "text": "{text}"}}"""
        messages = [{"role": "system", "content": self.instruction}, {"role": "user", "content": input_message}]

        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": 100
        }
        try:
            async with session.post(model_url + "/chat/completions", headers=headers, json=payload) as resp:
                resp.raise_for_status()
                response = await resp.json()

                result = response['choices'][0]['message']['content']
                print(f"question: {question}")
                print(f"result: {result}")
                print("--------------------------------")

                return result

        except Exception as e:
            try:
                print(f"Error in llm model {response}: {e}")
            except:
                print(f"Error in llm model: {e}")
            return ""


    async def run_llm_async(self, question_list, text_list):
        """
        Send all chunk requests concurrently.
        Args:
            question_list: The list of questions.
            text_list: The list of texts.
        Returns:
            The list of results.
        """
        async with aiohttp.ClientSession() as session:
            tasks = [self.run_llm(session, question, text) for question, text in zip(question_list, text_list)]
            results = await asyncio.gather(*tasks)
            return results


    def modify_question_llm(self, query_list: List[str], text_list: List[str]) -> List[str]:
        """
        Modify the questions based on their texts using the LLM model.
        Args:
            query_list: The list of questions.
            text_list: The list of texts.
        Returns:
            The list of modified questions.
        """
        if not text_list:
            return []

        start_time = time.time()
        results = asyncio.run(self.run_llm_async(query_list, text_list))
        end_time = time.time()
        # print(f"Time taken for llm model: {end_time - start_time} seconds")

        return results
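A short usage sketch for the new module (hypothetical inputs; the call posts to the model_url configured above, so the LLM server is assumed reachable). `modify_question_llm` returns one rewritten question per (question, passage) pair, in order:

```python
from data_preprocess.modify_question_model import LLMModel

llm = LLMModel()

# Hypothetical question/passage pair for illustration.
questions = ["جنگ جهانی اول در چه تاریخی پایان یافت؟"]
passages = ["متن پاراگراف مثبت مربوط به این سوال"]

# Sends one chat-completion request per pair and returns the paraphrased questions.
new_questions = llm.modify_question_llm(questions, passages)
print(new_questions)
```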
@@ -153,9 +153,9 @@ def main(output_path):
     #load synthetic dataset
     print("--------------------------------")
     print("loading synthetic dataset")
-    synthetic_train_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/train.jsonl"
-    synthetic_corpus_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/corpus.jsonl"
-    synthetic_queries_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/queries.jsonl"
+    synthetic_train_path = "/home/firouzi/embedding_model/research_notebook/data/synthetic-persian-qa-retrieval/train.jsonl"
+    synthetic_corpus_path = "/home/firouzi/embedding_model/research_notebook/data/synthetic-persian-qa-retrieval/corpus.jsonl"
+    synthetic_queries_path = "/home/firouzi/embedding_model/research_notebook/data/synthetic-persian-qa-retrieval/queries.jsonl"
 
     synthetic_dataset = load_synthetic_dataset(synthetic_train_path, synthetic_queries_path, synthetic_corpus_path)
     print(f"synthetic dataset loaded : {len(synthetic_dataset)} samples")

@@ -173,11 +173,11 @@ def main(output_path):
     print(f"successfully merged synthetic and pquad dataset")
     print("--------------------------------")
 
-    # removing false negative samples from all dataset
-    print("start to remove false negative samples from all dataset")
-    all_dataset = remove_false_negative(all_dataset, random_negative_sample=False)
-    print(f"successfully removed false negative samples from all dataset")
-    print("--------------------------------")
+    # # removing false negative samples from all dataset
+    # print("start to remove false negative samples from all dataset")
+    # all_dataset = remove_false_negative(all_dataset, random_negative_sample=False)
+    # print(f"successfully removed false negative samples from all dataset")
+    # print("--------------------------------")
 
     # with open("/home/firouzi/embedding_model/data/train.json", "r", encoding="utf-8") as f:
     #     all_dataset = json.load(f)
data_preprocess/preprocess_v2.py (new file, 160 lines)
@@ -0,0 +1,160 @@
import argparse
from datasets import load_dataset
import json
from tqdm import tqdm

from data_preprocess.remove_false_negative_model import LLMModel
from data_preprocess.generate_random_negative_sample import generate_random_negative_sample


llm_model = LLMModel()


def load_msmarco_dataset():
    """
    load msmarco dataset from huggingface
    output:
        [{
            "question": "",
            "passage_positive": [],
            "passage_negative": [],
            "passage_negative_random": []
        }]
    """
    print("start loading msmarco dataset")
    name = "MCINext/msmarco-fa"
    dataset_qrel = load_dataset(name)["train"]

    print("start loading corpus")
    dataset_corpus_list = load_dataset(name, data_files="corpus.jsonl")["train"]
    dataset_corpus = {}
    for data in dataset_corpus_list:
        dataset_corpus[str(data["_id"])] = data["text"]

    print("start loading queries")
    dataset_queries_list = load_dataset(name, data_files="queries.jsonl")["train"]
    dataset_queries = {}
    for data in dataset_queries_list:
        dataset_queries[str(data["_id"])] = data["text"]

    dataset = []
    print("start creating dataset")
    for data in tqdm(dataset_qrel):

        if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
            dataset.append({
                "question": dataset_queries[data["query-id"]],
                "passage_positive": [dataset_corpus[data["corpus-id"]]],
                "passage_negative": [],
                "passage_negative_random": [],
            })

    print(f"length of dataset: {len(dataset)}")
    print("--------------------------------")
    return dataset, list(dataset_corpus.values())


def remove_false_negative(dataset, random_negative_sample=False):
    """
    remove false negative samples from the dataset
    Args:
        dataset: list of dicts
    Returns:
        dataset: list of dicts
    """
    if random_negative_sample:
        negative_name = "passage_negative_random"
    else:
        negative_name = "passage_negative"

    # calculate passage negative embeddings
    negative_count_all = 0
    negative_count_removed = 0
    len_dataset = len(dataset)
    batch_size = 50
    for i in tqdm(range(0, len_dataset, batch_size)):

        question_list = []
        passage_negative_list = []
        for id in range(i, min(i + batch_size, len_dataset)):
            for passage in dataset[id][negative_name]:
                question_list.append(dataset[id]['question'])
                passage_negative_list.append(passage)

        results = llm_model.remove_false_negative_llm(question_list, passage_negative_list)

        negative_count_removed += len([_ for _ in results if _ == "1"])
        negative_count_all += len(results)

        count = 0
        for id in range(i, min(i + batch_size, len_dataset)):
            new_negative_list = []
            for passage_id in range(len(dataset[id][negative_name])):
                if results[count] == "0":
                    new_negative_list.append(dataset[id][negative_name][passage_id])
                count += 1
            dataset[id][negative_name] = new_negative_list

    print(f"removed {negative_count_removed} false negative samples from {negative_count_all} samples")
    print("--------------------------------")

    return dataset


def save_dataset(dataset, output_path):
    """
    save dataset to json file
    Args:
        dataset: list of dicts
        output_path: path to save dataset
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)


def main(output_path):

    #load msmarco dataset
    print("--------------------------------")
    all_dataset, corpus_list = load_msmarco_dataset()
    print(f"msmarco dataset loaded : {len(all_dataset)} samples")
    print("--------------------------------")

    #generate random negative samples
    print("start to generate random negative samples")
    all_dataset = generate_random_negative_sample(all_dataset, corpus_list)
    print(f"successfully generated random negative samples")
    print("--------------------------------")

    # removing random false negative samples from all dataset
    print("start to remove random false negative samples from all dataset")
    all_dataset = remove_false_negative(all_dataset, random_negative_sample=True)
    print(f"successfully removed random false negative samples from all dataset")
    print("--------------------------------")

    # save dataset
    print("start to save dataset")
    save_dataset(all_dataset, output_path)
    print(f"successfully saved dataset")
    print("--------------------------------")


if __name__ == "__main__":
    """
    preprocess dataset for training

    pipelines:
        load msmarco dataset from huggingface
        generate random negative samples
        remove random false negative samples
        save dataset to json file


    python preprocess_v2.py --output_path /home/firouzi/embedding_model/data/v2/msmarco_train.json
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_path", type=str, required=True)
    args = parser.parse_args()

    output_path = args.output_path

    main(output_path)
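The batched bookkeeping in remove_false_negative is the easiest part to get wrong, so here is a minimal self-contained sketch (invented data) of the same flatten-then-unflatten pattern: (question, passage) pairs are flattened per passage, the LLM labels each pair "1" (false negative) or "0", and a running count index walks the flat labels back onto each sample's negative list.

```python
samples = [{"passage_negative": ["a", "b"]}, {"passage_negative": ["c"]}]
labels = ["0", "1", "0"]  # one label per flattened (question, passage) pair

count = 0
for sample in samples:
    kept = []
    for passage in sample["passage_negative"]:
        if labels[count] == "0":   # "0" means the passage is a genuine negative
            kept.append(passage)
        count += 1
    sample["passage_negative"] = kept

# samples is now [{"passage_negative": ["a"]}, {"passage_negative": ["c"]}]
```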
data_preprocess/preprocess_v3.py (new file, 142 lines)
@@ -0,0 +1,142 @@
import argparse
from datasets import load_dataset
import json
from tqdm import tqdm
import time

from data_preprocess.modify_question_model import LLMModel


llm_model = LLMModel()


def load_msmarco_dataset():
    """
    load msmarco dataset from huggingface
    output:
        [{
            "question": "",
            "passage_positive": [],
            "passage_negative": [],
            "passage_negative_random": []
        }]
    """
    print("start loading msmarco dataset")
    name = "MCINext/msmarco-fa"
    dataset_qrel = load_dataset(name)["train"]

    print("start loading corpus")
    dataset_corpus_list = load_dataset(name, data_files="corpus.jsonl")["train"]
    dataset_corpus = {}
    for data in dataset_corpus_list:
        dataset_corpus[str(data["_id"])] = data["text"]

    print("start loading queries")
    dataset_queries_list = load_dataset(name, data_files="queries.jsonl")["train"]
    dataset_queries = {}
    for data in dataset_queries_list:
        dataset_queries[str(data["_id"])] = data["text"]

    dataset = []
    print("start creating dataset")
    for data in tqdm(dataset_qrel):

        if data["query-id"] in dataset_queries and data["corpus-id"] in dataset_corpus:
            dataset.append({
                "question": dataset_queries[data["query-id"]],
                "passage_positive": [dataset_corpus[data["corpus-id"]]],
                "new_question": "",
                "passage_negative": [],
                "passage_negative_random": [],
            })

    print(f"length of dataset: {len(dataset)}")
    print("--------------------------------")
    return dataset


def modify_question(dataset):
    """
    modify the questions of the dataset
    Args:
        dataset: list of dicts
    Returns:
        dataset: list of dicts
    """

    len_dataset = len(dataset)
    batch_size = 50
    for i in tqdm(range(0, len_dataset, batch_size)):

        question_list = []
        passage_positive_list = []
        for id in range(i, min(i + batch_size, len_dataset)):
            question_list.append(dataset[id]['question'])
            passage_positive_list.append(dataset[id]['passage_positive'][0])

        results = llm_model.modify_question_llm(question_list, passage_positive_list)

        time.sleep(2)

        count = 0
        for id in range(i, min(i + batch_size, len_dataset)):
            dataset[id]["new_question"] = results[count]
            count += 1

    print(f"successfully modified question")
    print("--------------------------------")

    return dataset


def save_dataset(dataset, output_path):
    """
    save dataset to json file
    Args:
        dataset: list of dicts
        output_path: path to save dataset
    """
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=4)


def main(output_path):

    #load msmarco dataset
    print("--------------------------------")
    all_dataset = load_msmarco_dataset()
    print(f"msmarco dataset loaded : {len(all_dataset)} samples")
    print("--------------------------------")

    # modify questions with the LLM
    print("start to modify question")
    all_dataset = modify_question(all_dataset[:270000])
    print(f"successfully modified question")
    print("--------------------------------")

    # save dataset
    print("start to save dataset")
    save_dataset(all_dataset, output_path)
    print(f"successfully saved dataset")
    print("--------------------------------")


if __name__ == "__main__":
    """
    preprocess dataset for training

    pipelines:
        load msmarco dataset from huggingface
        modify questions with the LLM
        save dataset to json file


    python preprocess_v3.py --output_path /home/firouzi/embedding_model/data/v2/msmarco_train.json
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_path", type=str, required=True)
    args = parser.parse_args()

    output_path = args.output_path

    main(output_path)
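For reference, the shape of one record produced by preprocess_v3 before the LLM fills in new_question (field values below are invented; the keys come from load_msmarco_dataset above):

```python
record = {
    "question": "سوال نمونه",            # original msmarco-fa query text
    "passage_positive": ["..."],          # one positive passage from the corpus
    "new_question": "",                   # filled later by LLMModel.modify_question_llm
    "passage_negative": [],
    "passage_negative_random": [],
}
```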
@@ -102,7 +102,7 @@ class LLMModel:
         start_time = time.time()
         results = asyncio.run(self.run_llm_async(query_list, text_list))
         end_time = time.time()
-        print(f"Time taken for llm model: {end_time - start_time} seconds")
+        # print(f"Time taken for llm model: {end_time - start_time} seconds")
 
         return results
data_preprocess/stopwords.txt (new file, 3102 lines)
File diff suppressed because it is too large.
@@ -3,6 +3,7 @@ import requests
 import numpy as np
 from dotenv import load_dotenv
 import os
+import time
 
 load_dotenv()
 

@@ -20,21 +21,26 @@ class TextEmbedder:
         return text
 
 
-    def embed_texts(self, texts:list[str])->list[list[float]]:
+    def embed_texts(self, texts:list[str], do_preprocess=True, convert_to_numpy=True)->list[list[float]]:
         """
         Embed texts using the model.
         """
         if texts == []:
             return []
 
-        texts = [self.preprocess_embedder(text) for text in texts]
+        if do_preprocess:
+            texts = [self.preprocess_embedder(text) for text in texts]
 
         payload = {
             "model": self.model_name,
             "input": texts
         }
         responses = requests.post("http://78.38.161.78:3094/v1/embeddings", headers=self.headers, json=payload)
-        embeddings = [np.array(response["embedding"], dtype=np.float32) for response in responses.json()["data"]]
 
+        if convert_to_numpy:
+            embeddings = [np.array(response["embedding"], dtype=np.float32) for response in responses.json()["data"]]
+        else:
+            embeddings = [response["embedding"] for response in responses.json()["data"]]
 
         return embeddings
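A quick usage sketch of the new embed_texts flags (the sample strings are placeholders, and the call hits the embedding endpoint configured in the class): the defaults keep the old behavior, while the preprocessing code above passes do_preprocess=False, convert_to_numpy=False to skip query preprocessing and get plain Python lists back.

```python
from data_preprocess.text_embedder import TextEmbedder

embedder = TextEmbedder()

# Default behavior: preprocess each text and return numpy float32 vectors.
vectors = embedder.embed_texts(["متن نمونه اول", "متن نمونه دوم"])

# As used in generate_random_negative_sample: raw texts in, plain lists out.
raw_vectors = embedder.embed_texts(
    ["متن نمونه اول", "متن نمونه دوم"],
    do_preprocess=False,
    convert_to_numpy=False,
)
```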
@@ -6,15 +6,17 @@ from tqdm import tqdm
 names = ["MCINext/FEVER_FA_test_top_250_only_w_correct-v2", "MCINext/fiqa-fa-v2", "MCINext/HotpotQA_FA_test_top_250_only_w_correct-v2",
          "MCINext/MSMARCO_FA_test_top_250_only_w_correct-v2", "MCINext/NQ_FA_test_top_250_only_w_correct-v2", "MCINext/quora-fa-v2", "MCINext/scifact-fa-v2",
          "MCINext/synthetic-persian-chatbot-rag-faq-retrieval", "MCINext/synthetic-persian-qa-retrieval", "MCINext/trec-covid-fa-v2"]
 
+names = names[3:4]
 for name in tqdm(names):
     print(f"loading {name}")
     dataset_qrel = load_dataset(name)["test"]
-    dataset_corpus_list = load_dataset(name, data_files="corpus.jsonl")["train"]
+    dataset_corpus_list = load_dataset(name, data_files="corpus/corpus.jsonl")["train"]
     dataset_corpus = {}
     for data in dataset_corpus_list:
         dataset_corpus[data["_id"]] = data["text"]
 
-    dataset_queries_list = load_dataset(name, data_files="queries.jsonl")["train"]
+    dataset_queries_list = load_dataset(name, data_files="queries/queries.jsonl")["train"]
     dataset_queries = {}
     for data in dataset_queries_list:
         dataset_queries[data["_id"]] = data["text"]
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "id": "a78759c8",
    "metadata": {},
    "outputs": [

@@ -11,11 +11,7 @@
     "output_type": "stream",
     "text": [
      "/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-     " from .autonotebook import tqdm as notebook_tqdm\n",
-     "/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/datasets/load.py:1461: FutureWarning: The repository for Gholamreza/pquad contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/Gholamreza/pquad\n",
-     "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
-     "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
-     " warnings.warn(\n"
+     " from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],

@@ -27,7 +23,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "id": "c91f659a",
    "metadata": {},
    "outputs": [

@@ -54,7 +50,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "id": "d66809ce",
    "metadata": {},
    "outputs": [

@@ -62,12 +58,12 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "{'question': 'جنگ جهانی اول در چه تاریخی پایان یافت؟', 'passgae_positive': [], 'passgae_negative': ['در سال ۱۸۷۱ امپراتوری آلمان با اتحاد پروس و کنفدراسیون جرمن شمالی توسط اتو ون بیسمارک به وجود آمد. این کشور قدرتمند تا سال ۱۹۱۸ ادامه یافت و با عنوان رایش دوم مشهور شد. بیسمارک توانست استان\\u200cهای جدید زیادی را طی جنگ\\u200cهای مبتکرانهٔ کوتاه و دیپلماتیک به دست آورد. او با اتریش هم پیمان شد تا دانمارک را شکست دهد و ناحیهٔ شلزویگ-هولشتاین را تصرف کند. او جنگ اتریش و پروس (آسترو-پروسیان) را آغاز کرد و پیروز شد اما اینکار فقط برای این بود که ایتالیا طرف آلمان را بگیرد. سپس پروس وارد جنگ فرانسه و پروس (فرانکو-پروسین) (۷۱-۱۸۷۰) شد و توانست شکست کاملی به فرانسه وارد سازد. ویلهلم اول به عنوان آخرین توهین به فرانسوی\\u200cها در کاخ ورسای در قلب فرانسه به عنوان امپراتور آلمان سوگند خورد. امپراتوری آلمان تا پایان جنگ جهانی اول یعنی زمانی که فرانسه توانست در پیمان ورسای تلافی بکند در اوج خود بود.']}\n"
+     "{'question': 'سام میرزا در چه تاریخی توسط نادرشاه دستگیر شد؟', 'passage_positive': ['این ضربت سخت خیال نادر را پریشان کرد و رضا قلی میرزا را که در رکاب بود در طهران گذاشت و خود به داغستان رفت در این سفر اگرچه بعضی از رؤسای طوایف لزکی از در اطاعت درآمدند لیکن غالب سکنه داغستان به قلل جبال پرارتفاع پناه گرفتند و از هر طرف به تعرّض اردوی نادر دست زدند و لطمات بسیار به ایشان وارد آوردند حتّی موقعی به خیمه خود نادر نیز تعرّض رساندند. در رمضان ۱۱۵۴ موقعیکه نادر هنوز در داغستان بود غلامی را که مرتکب انداختن تیر در جنگل سوادکوه شده بود بخدمت او آوردند. نادر او را کور کرد. شخصی بنام سام میرزا که به ادّعای فرزندی شاه سلطان حسین در آذربایجان به سلطنت طلبی برخاسته و محمّد خان پسر سرخای خان لزگی و خوانین دربند و داغستان را با خود همدست نموده بود. نادر توسّط نصر اللّه میرزا و چند تن از سرداران خود انقلاب این حدود را بالاخره خواباند و سام میرزا در ذی\\u200cالقعده ۱۱۵۶ دستگیر گردید.'], 'passage_negative': [], 'passage_negative_random': []}\n"
     ]
    }
   ],
   "source": [
-    "print(all_dataset[1240])"
+    "print(all_dataset[1241])"
   ]
  },
  {
research_notebook/data_preprocess/data_loader_msmarco.ipynb (new file, 1531 lines)
File diff suppressed because it is too large.
File diff suppressed because one or more lines are too long
@@ -58,7 +58,7 @@ def main(add_prompt, lora):
         learning_rate=2e-5,
         warmup_ratio=0.05,
         logging_steps=10,
-        report_to="tensorboard",
+        report_to="mlflow",
         save_steps=10000,
         save_total_limit=2,
     )
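A hedged note on the report_to="mlflow" switch: with the Hugging Face Trainer, the MLflow callback is typically configured through environment variables rather than TrainingArguments. The values below are assumptions for illustration (not part of this commit) and would need to be set before the Trainer is created.

```python
import os

# Assumed MLflow settings; point these at your own tracking server and experiment.
os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"
os.environ["MLFLOW_EXPERIMENT_NAME"] = "embedding_model"
```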