add convert dataset and train qwen

parent 0de8db232b
commit 793508dbd0

.gitignore (vendored, 12 changes)

@@ -1,5 +1,5 @@
 data_preprocess/data/*
-data
+data/models
 */__pycache__/*
 .env
 .venv
@@ -9,3 +9,13 @@ models
 research_notebook/data
 train/qwen/output
 train/qwen/mlruns
+output
+data/dataset/__pycache__
+data/dataset/generated_250000_general/__pycache__
+data/dataset/generated_250000_general/generated_250000_general.jsonl
+data/dataset/generated_250000_religous/250_religous_ready.jsonl
+!data/dataset/generated_250000_religous/convert_to_jsonl.py
+data/dataset/generated_250000_religous/__pycache__
+data/dataset/my_local_dataset/__pycache__
+data/dataset/v11_dataset_hn/__pycache__
+data/dataset/v11_generated/__pycache__

data/dataset/generated_250000_general/convert_to_jsonl.py (new file, 14 lines)

@@ -0,0 +1,14 @@
import json
import os

file_path = os.path.dirname(__file__)
input_file = file_path + "/generated_250000_general.json"
output_file = file_path + "/generated_250000_general.jsonl"

with open(input_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
    data = json.load(f_in)  # list of records
    for record in data:
        json_line = json.dumps(record, ensure_ascii=False)
        f_out.write(json_line + "\n")

print(f"Converted {input_file} to {output_file}")
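
The converter reads the whole JSON array into memory and writes one JSON object per line, keeping non-ASCII text intact via ensure_ascii=False. Below is a small, hypothetical sanity check that could be run after the script above; it is not part of the commit, and the expected keys are simply the ones the preprocessor in generated.py reads.

import json
import os

# Hypothetical post-conversion check; assumes convert_to_jsonl.py above has already been run.
file_path = os.path.dirname(__file__)
input_file = file_path + "/generated_250000_general.json"
output_file = file_path + "/generated_250000_general.jsonl"

with open(input_file, "r", encoding="utf-8") as f:
    records = json.load(f)  # the original JSON array

with open(output_file, "r", encoding="utf-8") as f:
    lines = [json.loads(line) for line in f]  # one record per JSONL line

assert len(records) == len(lines), "every array element should map to exactly one line"

# Keys consumed later by CustomPreprocessor in generated.py
expected_keys = {"query", "passage_positive", "passage_negative", "passage_negative_random"}
print("missing keys in first record:", expected_keys - set(lines[0]) or "none")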

data/dataset/generated_250000_general/generated.py (new file, 57 lines)

@@ -0,0 +1,57 @@
from swift.llm import ResponsePreprocessor, DatasetMeta, register_dataset, SubsetDataset, load_dataset
from typing import Dict, Any
import os


class CustomPreprocessor(ResponsePreprocessor):
    # def __init__(self, *, columns = None, **kwargs):
    #     super().__init__(columns=columns, **kwargs)
    #     self.num_all_negative = 0
    def get_detailed_instruct(self, task_description: str, query: str) -> str:
        return f'Instruct: {task_description}\nQuery:{query}'

    def add_template(self, text):
        task = 'Given a web search query, retrieve relevant passages that answer the query'
        return self.get_detailed_instruct(task, text)

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        query = self.add_template(row["query"])
        passage_positive = row["passage_positive"]
        passage_negative = row["passage_negative"]
        passage_negative_random = row["passage_negative_random"]

        all_neg = passage_negative + passage_negative_random
        all_neg = list(set(all_neg))
        # self.num_all_negative += len(all_neg)

        row = {
            # 'query': [{'role': 'user', 'content': query, 'loss': None}],
            'query': query,
            'positive_messages': [
                [{'role': 'user', 'content': passage_positive[i]}] for i in range(len(passage_positive))
            ],
            'negative_messages': [
                [{'role': 'user', 'content': all_neg[i]}] for i in range(len(all_neg))
            ],
            # 'label': 1.0
        }
        if len(row["negative_messages"]) == 0:
            del row["negative_messages"]
        return super().preprocess(row)


register_dataset(
    DatasetMeta(
        dataset_path=os.path.dirname(__file__) + '/generated_250000_general.jsonl',
        dataset_name="generated_250000_general",
        # subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])],
        preprocess_func=CustomPreprocessor(),
    ))


if __name__ == '__main__':
    # load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio`
    # Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first one (index 0)
    dataset = load_dataset('generated_250000_general')[0]
    test_dataset = load_dataset('swift/financial_classification:test')[0]
    print(f'dataset[0]: {dataset[0]}')
    print(f'test_dataset[0]: {test_dataset[0]}')
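
For clarity, here is the same row mapping that preprocess() performs above, rewritten as a standalone snippet with no swift dependency: the query gets the instruct template, each positive passage becomes a one-message list, and the hard and random negatives are merged and de-duplicated. The sample record is hypothetical; real rows come from generated_250000_general.jsonl.

# Standalone illustration of the row mapping in CustomPreprocessor.preprocess
# (sample record is hypothetical; real rows come from the registered .jsonl file).
sample = {
    "query": "who discovered the law of gravity",
    "passage_positive": ["Isaac Newton formulated the law of universal gravitation."],
    "passage_negative": ["Einstein developed general relativity."],
    "passage_negative_random": ["Einstein developed general relativity.",
                                "Paris is the capital of France."],
}

task = "Given a web search query, retrieve relevant passages that answer the query"
query = f"Instruct: {task}\nQuery:{sample['query']}"

# hard negatives and random negatives are merged; duplicates collapse via set()
all_neg = list(set(sample["passage_negative"] + sample["passage_negative_random"]))

row = {
    "query": query,
    "positive_messages": [[{"role": "user", "content": p}] for p in sample["passage_positive"]],
    "negative_messages": [[{"role": "user", "content": n}] for n in all_neg],
}
if not row["negative_messages"]:
    # mirrors generated.py: the key is dropped entirely when there are no negatives
    del row["negative_messages"]

print(row["query"])
print(len(row["positive_messages"]), "positive,", len(row["negative_messages"]), "negative passages")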

data/dataset/generated_250000_religous/convert_to_jsonl.py (new file, 14 lines)

@@ -0,0 +1,14 @@
import json
import os

file_path = os.path.dirname(__file__)
input_file = file_path + "/250_religous_ready.json"
output_file = file_path + "/250_religous_ready.jsonl"

with open(input_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
    data = json.load(f_in)  # list of records
    for record in data:
        json_line = json.dumps(record, ensure_ascii=False)
        f_out.write(json_line + "\n")

print(f"Converted {input_file} to {output_file}")

data/dataset/generated_250000_religous/generated.py (new file, 57 lines)

@@ -0,0 +1,57 @@
from swift.llm import ResponsePreprocessor, DatasetMeta, register_dataset, SubsetDataset, load_dataset
from typing import Dict, Any
import os


class CustomPreprocessor(ResponsePreprocessor):
    # def __init__(self, *, columns = None, **kwargs):
    #     super().__init__(columns=columns, **kwargs)
    #     self.num_all_negative = 0
    def get_detailed_instruct(self, task_description: str, query: str) -> str:
        return f'Instruct: {task_description}\nQuery:{query}'

    def add_template(self, text):
        task = 'Given a web search query, retrieve relevant passages that answer the query'
        return self.get_detailed_instruct(task, text)

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        query = self.add_template(row["query"])
        passage_positive = row["passage_positive"]
        passage_negative = row["passage_negative"]
        passage_negative_random = row["passage_negative_random"]

        all_neg = passage_negative + passage_negative_random
        all_neg = list(set(all_neg))
        # self.num_all_negative += len(all_neg)

        row = {
            # 'query': [{'role': 'user', 'content': query, 'loss': None}],
            'query': query,
            'positive_messages': [
                [{'role': 'user', 'content': passage_positive[i]}] for i in range(len(passage_positive))
            ],
            'negative_messages': [
                [{'role': 'user', 'content': all_neg[i]}] for i in range(len(all_neg))
            ],
            # 'label': 1.0
        }
        if len(row["negative_messages"]) == 0:
            del row["negative_messages"]
        return super().preprocess(row)


register_dataset(
    DatasetMeta(
        dataset_path=os.path.dirname(__file__) + '/250_religous_ready.jsonl',
        dataset_name="generated_250000_religous",
        # subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])],
        preprocess_func=CustomPreprocessor(),
    ))


if __name__ == '__main__':
    # load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio`
    # Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first one (index 0)
    dataset = load_dataset('generated_250000_religous')[0]
    test_dataset = load_dataset('swift/financial_classification:test')[0]
    print(f'dataset[0]: {dataset[0]}')
    print(f'test_dataset[0]: {test_dataset[0]}')

data/dataset/my_local_dataset/my_dataset_register.py (new file, 58 lines)

@@ -0,0 +1,58 @@
from swift.llm import ResponsePreprocessor, DatasetMeta, register_dataset, SubsetDataset, load_dataset
from typing import Dict, Any
import os


class CustomPreprocessor(ResponsePreprocessor):
    def __init__(self, *, columns = None, **kwargs):
        super().__init__(columns=columns, **kwargs)
        self.num_all_negative = 0

    def get_detailed_instruct(self, task_description: str, query: str) -> str:
        return f'Instruct: {task_description}\nQuery:{query}'

    def add_template(self, text):
        task = 'Given a web search query, retrieve relevant passages that answer the query'
        return self.get_detailed_instruct(task, text)

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        query = self.add_template(row["query"])
        passage_positive = row["passage_positive"]
        passage_negative = row["passage_negative"]
        passage_negative_random = row["passage_negative_random"]

        all_neg = passage_negative + passage_negative_random
        all_neg = list(set(all_neg))

        self.num_all_negative += len(all_neg)

        row = {
            # 'query': [{'role': 'user', 'content': query, 'loss': None}],
            'query': query,
            'positive_messages': [
                [{'role': 'user', 'content': passage_positive[i]}] for i in range(len(passage_positive))
            ],
            'negative_messages': [
                [{'role': 'user', 'content': all_neg[i]}] for i in range(len(all_neg))
            ],
            # 'label': 1.0
        }
        if len(row["negative_messages"]) == 0:
            del row["negative_messages"]
        return super().preprocess(row)


register_dataset(
    DatasetMeta(
        dataset_path=os.path.dirname(__file__) + '/dataset_train.json',
        dataset_name="my_local_dataset",
        # subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])],
        preprocess_func=CustomPreprocessor(),
    ))


if __name__ == '__main__':
    # load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio`
    # Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first one (index 0)
    dataset = load_dataset('my_local_dataset')[0]
    test_dataset = load_dataset('swift/financial_classification:test')[0]
    print(f'dataset[0]: {dataset[0]}')
    print(f'test_dataset[0]: {test_dataset[0]}')

data/dataset/v11_dataset_hn/generated.py (new file, 57 lines)

@@ -0,0 +1,57 @@
from swift.llm import ResponsePreprocessor, DatasetMeta, register_dataset, SubsetDataset, load_dataset
from typing import Dict, Any
import os


class CustomPreprocessor(ResponsePreprocessor):
    # def __init__(self, *, columns = None, **kwargs):
    #     super().__init__(columns=columns, **kwargs)
    #     self.num_all_negative = 0
    def get_detailed_instruct(self, task_description: str, query: str) -> str:
        return f'Instruct: {task_description}\nQuery:{query}'

    def add_template(self, text):
        task = 'Given a web search query, retrieve relevant passages that answer the query'
        return self.get_detailed_instruct(task, text)

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        query = self.add_template(row["query"])
        passage_positive = row["passage_positive"]
        passage_negative = row["passage_negative"]
        passage_negative_random = row["passage_negative_random"]

        all_neg = passage_negative + passage_negative_random
        all_neg = list(set(all_neg))
        # self.num_all_negative += len(all_neg)

        row = {
            # 'query': [{'role': 'user', 'content': query, 'loss': None}],
            'query': query,
            'positive_messages': [
                [{'role': 'user', 'content': passage_positive[i]}] for i in range(len(passage_positive))
            ],
            'negative_messages': [
                [{'role': 'user', 'content': all_neg[i]}] for i in range(len(all_neg))
            ],
            # 'label': 1.0
        }
        if len(row["negative_messages"]) == 0:
            del row["negative_messages"]
        return super().preprocess(row)


register_dataset(
    DatasetMeta(
        dataset_path=os.path.dirname(__file__) + '/v11_dataset_hn.json',
        dataset_name="v11_dataset_hn",
        # subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])],
        preprocess_func=CustomPreprocessor(),
    ))


if __name__ == '__main__':
    # load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio`
    # Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first one (index 0)
    dataset = load_dataset('v11_dataset_hn')[0]
    test_dataset = load_dataset('swift/financial_classification:test')[0]
    print(f'dataset[0]: {dataset[0]}')
    print(f'test_dataset[0]: {test_dataset[0]}')

data/dataset/v11_generated/generated.py (new file, 53 lines)

@@ -0,0 +1,53 @@
from swift.llm import ResponsePreprocessor, DatasetMeta, register_dataset, SubsetDataset, load_dataset
from typing import Dict, Any
import os


class CustomPreprocessor(ResponsePreprocessor):
    def get_detailed_instruct(self, task_description: str, query: str) -> str:
        return f'Instruct: {task_description}\nQuery:{query}'

    def add_template(self, text):
        task = 'Given a web search query, retrieve relevant passages that answer the query'
        return self.get_detailed_instruct(task, text)

    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
        query = self.add_template(row["query"])
        passage_positive = [row["document"]]
        passage_negative = []
        passage_negative_random = []

        all_neg = passage_negative + passage_negative_random
        all_neg = list(set(all_neg))

        row = {
            # 'query': [{'role': 'user', 'content': query, 'loss': None}],
            'query': query,
            'positive_messages': [
                [{'role': 'user', 'content': passage_positive[i]}] for i in range(len(passage_positive))
            ],
            'negative_messages': [
                [{'role': 'user', 'content': all_neg[i]}] for i in range(len(all_neg))
            ],
            # 'label': 1.0
        }
        if len(row["negative_messages"]) == 0:
            del row["negative_messages"]
        return super().preprocess(row)


register_dataset(
    DatasetMeta(
        dataset_path=os.path.dirname(__file__) + '/v11_dataset.json',
        dataset_name="v11_generated_dataset",
        # subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])],
        preprocess_func=CustomPreprocessor(),
    ))


if __name__ == '__main__':
    # load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio`
    # Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first one (index 0)
    dataset = load_dataset('v11_generated_dataset')[0]
    test_dataset = load_dataset('swift/financial_classification:test')[0]
    print(f'dataset[0]: {dataset[0]}')
    print(f'test_dataset[0]: {test_dataset[0]}')

@@ -56,7 +56,7 @@ class CustomModel:
         **kwargs,
     ) -> np.ndarray:

-        embedding_url = "http://127.0.0.1:5000/embedding"
+        embedding_url = "http://127.0.0.1:5015/embedding"

         if prompt_type == None:
             template = "document"
@@ -89,6 +89,8 @@ def is_dataset_cached(dataset_name):

 def evaluate():
     model_name = "Qwen3-Embedding-0.6B"
+    # model_name = "KaLM-embedding-multilingual-mini-instruct-v2.5"
+    # model_name = "KaLM-Embedding-Gemma3-12B-2511"
     # model_name = "llama-embed-nemotron-8b"
     # model_name = "embeddinggemma-300m"
     model = CustomModel(model_name)

@@ -3,6 +3,9 @@

 nproc_per_node=1

+# INFONCE_HARD_NEGATIVES=1 \
+# INFONCE_MASK_FAKE_NEGATIVE=True \
+
 MLFLOW_TRACKING_URI=http://0.0.0.0:5004 \
 INFONCE_USE_BATCH=True \
 CUDA_VISIBLE_DEVICES=0 \
@@ -16,8 +19,8 @@ swift sft \
     --lora_alpha 32 \
     --target_modules all-linear \
     --max_length 2048 \
-    --dataset v11_dataset_hn \
-    --custom_register_path $(pwd)/../../data/dataset/v11_dataset_hn/generated.py \
+    --dataset generated_250000_religous \
+    --custom_register_path $(pwd)/../../data/dataset/generated_250000_religous/generated.py \
     --split_dataset_ratio 0.005 \
     --eval_strategy steps \
     --output_dir output \

@@ -33,8 +33,8 @@ def main():
     file_path = os.path.dirname(__file__)

     base_model_path = file_path + "/../../data/models/Qwen3-Embedding-0.6B/model"
-    peft_model_path = file_path + "/output/v17-20251202-223944/checkpoint-387"
-    save_path = file_path + "/output/v17-20251202-223944/merged_checkpoint-387"
+    peft_model_path = file_path + "/output/v23-20251214-111804/checkpoint-3632"
+    save_path = file_path + "/output/v23-20251214-111804/merged_checkpoint-3632"
     merge(base_model_path, peft_model_path, save_path)

     items = ["1_Pooling", "config_sentence_transformers.json", "merges.txt", "modules.json", "README.md", "tokenizer_config.json", "tokenizer.json",
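
The merge() helper called in main() is not part of this diff. Purely as a reference point, a LoRA checkpoint merge of this kind is commonly done with peft's merge_and_unload; the sketch below is an assumption about what such a helper could look like, not the repository's implementation, and the paths are the ones passed from main() above.

# Hypothetical sketch of a LoRA merge helper; the repo's actual merge() is not shown in this diff.
from peft import PeftModel
from transformers import AutoModel, AutoTokenizer


def merge(base_model_path: str, peft_model_path: str, save_path: str) -> None:
    base = AutoModel.from_pretrained(base_model_path)
    peft_model = PeftModel.from_pretrained(base, peft_model_path)
    merged = peft_model.merge_and_unload()  # folds the LoRA deltas into the base weights
    merged.save_pretrained(save_path)
    AutoTokenizer.from_pretrained(base_model_path).save_pretrained(save_path)

The items list in the surrounding code presumably copies the sentence-transformers sidecar files (1_Pooling, modules.json, tokenizer files, and so on) next to the merged weights so the checkpoint can be loaded as a sentence-transformers model.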