From 793508dbd0b50bb6aeee637f0b02d724de01af6a Mon Sep 17 00:00:00 2001
From: "a.hediehloo"
Date: Sun, 21 Dec 2025 12:09:32 +0000
Subject: [PATCH] add dataset conversion scripts and Qwen training updates

---
 .gitignore                                  | 12 +++-
 .../convert_to_jsonl.py                     | 14 +++++
 .../generated_250000_general/generated.py   | 57 ++++++++++++++++++
 .../convert_to_jsonl.py                     | 14 +++++
 .../generated_250000_religous/generated.py  | 57 ++++++++++++++++++
 .../my_local_dataset/my_dataset_register.py | 58 +++++++++++++++++++
 data/dataset/v11_dataset_hn/generated.py    | 57 ++++++++++++++++++
 data/dataset/v11_generated/generated.py     | 53 +++++++++++++++++
 evaluation/evaluation.py                    |  4 +-
 train/qwen/a.sh                             |  7 ++-
 train/qwen/merge_model.py                   |  4 +-
 11 files changed, 331 insertions(+), 6 deletions(-)
 create mode 100644 data/dataset/generated_250000_general/convert_to_jsonl.py
 create mode 100644 data/dataset/generated_250000_general/generated.py
 create mode 100644 data/dataset/generated_250000_religous/convert_to_jsonl.py
 create mode 100644 data/dataset/generated_250000_religous/generated.py
 create mode 100644 data/dataset/my_local_dataset/my_dataset_register.py
 create mode 100644 data/dataset/v11_dataset_hn/generated.py
 create mode 100644 data/dataset/v11_generated/generated.py

diff --git a/.gitignore b/.gitignore
index b84bb2a..9fe8795 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 data_preprocess/data/*
-data
+data/models
 */__pycache__/*
 .env
 .venv
@@ -9,3 +9,13 @@ models
 research_notebook/data
 train/qwen/output
 train/qwen/mlruns
+output
+data/dataset/__pycache__
+data/dataset/generated_250000_general/__pycache__
+data/dataset/generated_250000_general/generated_250000_general.jsonl
+data/dataset/generated_250000_religous/250_religous_ready.jsonl
+!data/dataset/generated_250000_religous/convert_to_jsonl.py
+data/dataset/generated_250000_religous/__pycache__
+data/dataset/my_local_dataset/__pycache__
+data/dataset/v11_dataset_hn/__pycache__
+data/dataset/v11_generated/__pycache__
diff --git a/data/dataset/generated_250000_general/convert_to_jsonl.py b/data/dataset/generated_250000_general/convert_to_jsonl.py
new file mode 100644
index 0000000..6fbd279
--- /dev/null
+++ b/data/dataset/generated_250000_general/convert_to_jsonl.py
@@ -0,0 +1,14 @@
+import json
+import os
+
+file_path = os.path.dirname(__file__)
+input_file = file_path + "/generated_250000_general.json"
+output_file = file_path + "/generated_250000_general.jsonl"
+
+with open(input_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
+    data = json.load(f_in)  # list of records
+    for record in data:
+        json_line = json.dumps(record, ensure_ascii=False)
+        f_out.write(json_line + "\n")
+
+print(f"Converted {input_file} to {output_file}")
\ No newline at end of file
diff --git a/data/dataset/generated_250000_general/generated.py b/data/dataset/generated_250000_general/generated.py
new file mode 100644
index 0000000..d31dacb
--- /dev/null
+++ b/data/dataset/generated_250000_general/generated.py
@@ -0,0 +1,57 @@
+from swift.llm import ResponsePreprocessor, DatasetMeta, register_dataset, SubsetDataset, load_dataset
+from typing import Dict, Any
+import os
+
+
+class CustomPreprocessor(ResponsePreprocessor):
+    # def __init__(self, *, columns = None, **kwargs):
+    #     super().__init__(columns=columns, **kwargs)
+    #     self.num_all_negative = 0
+    def get_detailed_instruct(self, task_description: str, query: str) -> str:
+        return f'Instruct: {task_description}\nQuery:{query}'
+
+    def add_template(self, text):
+        task = 'Given a web search query, retrieve relevant passages that answer the query'
+        return self.get_detailed_instruct(task, text)
+
+    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
+        query = self.add_template(row["query"])
+        passage_positive = row["passage_positive"]
+        passage_negative = row["passage_negative"]
+        passage_negative_random = row["passage_negative_random"]
+
+        all_neg = passage_negative + passage_negative_random
+        all_neg = list(set(all_neg))
+        # self.num_all_negative += len(all_neg)
+
+        row = {
+            # 'query': [{'role': 'user', 'content': query, 'loss': None}],
+            'query': query,
+            'positive_messages': [
+                [{'role': 'user', 'content': passage_positive[i]}] for i in range(len(passage_positive))
+            ],
+            'negative_messages': [
+                [{'role': 'user', 'content': all_neg[i]}] for i in range(len(all_neg))
+            ],
+            # 'label': 1.0
+        }
+        if len(row["negative_messages"]) == 0:
+            del row["negative_messages"]
+        return super().preprocess(row)
+
+
+register_dataset(
+    DatasetMeta(
+        dataset_path=os.path.dirname(__file__) + '/generated_250000_general.jsonl',
+        dataset_name="generated_250000_general",
+        # subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])],
+        preprocess_func=CustomPreprocessor(),
+    ))
+
+if __name__ == '__main__':
+    # load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio`
+    # Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first one (index 0)
+    dataset = load_dataset('generated_250000_general')[0]
+    test_dataset = load_dataset('swift/financial_classification:test')[0]
+    print(f'dataset[0]: {dataset[0]}')
+    print(f'test_dataset[0]: {test_dataset[0]}')
\ No newline at end of file
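To make the data contract above concrete, here is a standalone sketch (plain Python, no swift imports) of the transformation that CustomPreprocessor.preprocess applies to one record. The input record is invented for illustration; the field names and message shapes come from the code above:

    # Hypothetical input record following the generated_250000_general.jsonl schema.
    record = {
        "query": "what is contrastive learning",
        "passage_positive": ["Contrastive learning trains encoders by ..."],
        "passage_negative": ["A guide to sourdough baking ..."],
        "passage_negative_random": ["A guide to sourdough baking ...", "Local bus schedules ..."],
    }

    task = 'Given a web search query, retrieve relevant passages that answer the query'
    query = f'Instruct: {task}\nQuery:{record["query"]}'

    # Merge and deduplicate negatives exactly as preprocess() does (set order is arbitrary).
    all_neg = list(set(record["passage_negative"] + record["passage_negative_random"]))

    out = {
        'query': query,
        'positive_messages': [[{'role': 'user', 'content': p}] for p in record["passage_positive"]],
        'negative_messages': [[{'role': 'user', 'content': n}] for n in all_neg],
    }
    if not out['negative_messages']:   # dropped entirely when no negatives exist
        del out['negative_messages']
    # Here the duplicate negative collapses, leaving two unique negative messages.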
diff --git a/data/dataset/generated_250000_religous/convert_to_jsonl.py b/data/dataset/generated_250000_religous/convert_to_jsonl.py
new file mode 100644
index 0000000..45c6e7e
--- /dev/null
+++ b/data/dataset/generated_250000_religous/convert_to_jsonl.py
@@ -0,0 +1,14 @@
+import json
+import os
+
+file_path = os.path.dirname(__file__)
+input_file = file_path + "/250_religous_ready.json"
+output_file = file_path + "/250_religous_ready.jsonl"
+
+with open(input_file, "r", encoding="utf-8") as f_in, open(output_file, "w", encoding="utf-8") as f_out:
+    data = json.load(f_in)  # list of records
+    for record in data:
+        json_line = json.dumps(record, ensure_ascii=False)
+        f_out.write(json_line + "\n")
+
+print(f"Converted {input_file} to {output_file}")
\ No newline at end of file
diff --git a/data/dataset/generated_250000_religous/generated.py b/data/dataset/generated_250000_religous/generated.py
new file mode 100644
index 0000000..4d588ca
--- /dev/null
+++ b/data/dataset/generated_250000_religous/generated.py
@@ -0,0 +1,57 @@
+from swift.llm import ResponsePreprocessor, DatasetMeta, register_dataset, SubsetDataset, load_dataset
+from typing import Dict, Any
+import os
+
+
+class CustomPreprocessor(ResponsePreprocessor):
+    # def __init__(self, *, columns = None, **kwargs):
+    #     super().__init__(columns=columns, **kwargs)
+    #     self.num_all_negative = 0
+    def get_detailed_instruct(self, task_description: str, query: str) -> str:
+        return f'Instruct: {task_description}\nQuery:{query}'
+
+    def add_template(self, text):
+        task = 'Given a web search query, retrieve relevant passages that answer the query'
+        return self.get_detailed_instruct(task, text)
+
+    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
+        query = self.add_template(row["query"])
+        passage_positive = row["passage_positive"]
+        passage_negative = row["passage_negative"]
+        passage_negative_random = row["passage_negative_random"]
+
+        all_neg = passage_negative + passage_negative_random
+        all_neg = list(set(all_neg))
+        # self.num_all_negative += len(all_neg)
+
+        row = {
+            # 'query': [{'role': 'user', 'content': query, 'loss': None}],
+            'query': query,
+            'positive_messages': [
+                [{'role': 'user', 'content': passage_positive[i]}] for i in range(len(passage_positive))
+            ],
+            'negative_messages': [
+                [{'role': 'user', 'content': all_neg[i]}] for i in range(len(all_neg))
+            ],
+            # 'label': 1.0
+        }
+        if len(row["negative_messages"]) == 0:
+            del row["negative_messages"]
+        return super().preprocess(row)
+
+
+register_dataset(
+    DatasetMeta(
+        dataset_path=os.path.dirname(__file__) + '/250_religous_ready.jsonl',
+        dataset_name="generated_250000_religous",
+        # subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])],
+        preprocess_func=CustomPreprocessor(),
+    ))
+
+if __name__ == '__main__':
+    # load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio`
+    # Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first one (index 0)
+    dataset = load_dataset('generated_250000_religous')[0]
+    test_dataset = load_dataset('swift/financial_classification:test')[0]
+    print(f'dataset[0]: {dataset[0]}')
+    print(f'test_dataset[0]: {test_dataset[0]}')
\ No newline at end of file
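A caveat shared by these preprocessors: list(set(all_neg)) deduplicates the merged negatives but does not preserve order, so hard and random negatives are interleaved nondeterministically across runs. If stable ordering ever matters, a drop-in, order-preserving alternative (dicts preserve insertion order in Python 3.7+) would be:

    # Order-preserving deduplication: hard negatives stay ahead of random ones.
    all_neg = list(dict.fromkeys(passage_negative + passage_negative_random))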
= row["passage_negative_random"] + + all_neg = passage_negative + passage_negative_random + all_neg = list(set(all_neg)) + # self.num_all_negative += len(all_neg) + + row = { + # 'query': [{'role': 'user', 'content': query, 'loss': None}], + 'query': query, + 'positive_messages': [ + [{'role': 'user', 'content': passage_positive[i]}] for i in range(len(passage_positive)) + ], + 'negative_messages': [ + [{'role': 'user', 'content': all_neg[i]}] for i in range(len(all_neg)) + ], + # 'label': 1.0 + } + if len(row["negative_messages"]) == 0: + del row["negative_messages"] + return super().preprocess(row) + + +register_dataset( + DatasetMeta( + dataset_path=os.path.dirname(__file__) + '/250_religous_ready.jsonl', + dataset_name="generated_250000_religous", + # subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])], + preprocess_func=CustomPreprocessor(), + )) + +if __name__ == '__main__': + # load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio` + # Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first one (index 0) + dataset = load_dataset('generated_250000_religous')[0] + test_dataset = load_dataset('swift/financial_classification:test')[0] + print(f'dataset[0]: {dataset[0]}') + print(f'test_dataset[0]: {test_dataset[0]}') \ No newline at end of file diff --git a/data/dataset/my_local_dataset/my_dataset_register.py b/data/dataset/my_local_dataset/my_dataset_register.py new file mode 100644 index 0000000..8d0d103 --- /dev/null +++ b/data/dataset/my_local_dataset/my_dataset_register.py @@ -0,0 +1,58 @@ +from swift.llm import ResponsePreprocessor, DatasetMeta, register_dataset, SubsetDataset, load_dataset +from typing import Dict, Any +import os + +class CustomPreprocessor(ResponsePreprocessor): + def __init__(self, *, columns = None, **kwargs): + super().__init__(columns=columns, **kwargs) + self.num_all_negative = 0 + + def get_detailed_instruct(self, task_description: str, query: str) -> str: + return f'Instruct: {task_description}\nQuery:{query}' + + def add_template(self, text): + task = 'Given a web search query, retrieve relevant passages that answer the query' + return self.get_detailed_instruct(task, text) + + def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]: + query = self.add_template(row["query"]) + passage_positive = row["passage_positive"] + passage_negative = row["passage_negative"] + passage_negative_random = row["passage_negative_random"] + + all_neg = passage_negative + passage_negative_random + all_neg = list(set(all_neg)) + + self.num_all_negative += len(all_neg) + + row = { + # 'query': [{'role': 'user', 'content': query, 'loss': None}], + 'query': query, + 'positive_messages': [ + [{'role': 'user', 'content': passage_positive[i]}] for i in range(len(passage_positive)) + ], + 'negative_messages': [ + [{'role': 'user', 'content': all_neg[i]}] for i in range(len(all_neg)) + ], + # 'label': 1.0 + } + if len(row["negative_messages"]) == 0: + del row["negative_messages"] + return super().preprocess(row) + + +register_dataset( + DatasetMeta( + dataset_path=os.path.dirname(__file__) + '/dataset_train.json', + dataset_name="my_local_dataset", + # subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])], + preprocess_func=CustomPreprocessor(), + )) + +if __name__ == '__main__': + # load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio` + # Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first 
diff --git a/data/dataset/v11_dataset_hn/generated.py b/data/dataset/v11_dataset_hn/generated.py
new file mode 100644
index 0000000..df88d4b
--- /dev/null
+++ b/data/dataset/v11_dataset_hn/generated.py
@@ -0,0 +1,57 @@
+from swift.llm import ResponsePreprocessor, DatasetMeta, register_dataset, SubsetDataset, load_dataset
+from typing import Dict, Any
+import os
+
+
+class CustomPreprocessor(ResponsePreprocessor):
+    # def __init__(self, *, columns = None, **kwargs):
+    #     super().__init__(columns=columns, **kwargs)
+    #     self.num_all_negative = 0
+    def get_detailed_instruct(self, task_description: str, query: str) -> str:
+        return f'Instruct: {task_description}\nQuery:{query}'
+
+    def add_template(self, text):
+        task = 'Given a web search query, retrieve relevant passages that answer the query'
+        return self.get_detailed_instruct(task, text)
+
+    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
+        query = self.add_template(row["query"])
+        passage_positive = row["passage_positive"]
+        passage_negative = row["passage_negative"]
+        passage_negative_random = row["passage_negative_random"]
+
+        all_neg = passage_negative + passage_negative_random
+        all_neg = list(set(all_neg))
+        # self.num_all_negative += len(all_neg)
+
+        row = {
+            # 'query': [{'role': 'user', 'content': query, 'loss': None}],
+            'query': query,
+            'positive_messages': [
+                [{'role': 'user', 'content': passage_positive[i]}] for i in range(len(passage_positive))
+            ],
+            'negative_messages': [
+                [{'role': 'user', 'content': all_neg[i]}] for i in range(len(all_neg))
+            ],
+            # 'label': 1.0
+        }
+        if len(row["negative_messages"]) == 0:
+            del row["negative_messages"]
+        return super().preprocess(row)
+
+
+register_dataset(
+    DatasetMeta(
+        dataset_path=os.path.dirname(__file__) + '/v11_dataset_hn.json',
+        dataset_name="v11_dataset_hn",
+        # subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])],
+        preprocess_func=CustomPreprocessor(),
+    ))
+
+if __name__ == '__main__':
+    # load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio`
+    # Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first one (index 0)
+    dataset = load_dataset('v11_dataset_hn')[0]
+    test_dataset = load_dataset('swift/financial_classification:test')[0]
+    print(f'dataset[0]: {dataset[0]}')
+    print(f'test_dataset[0]: {test_dataset[0]}')
\ No newline at end of file
diff --git a/data/dataset/v11_generated/generated.py b/data/dataset/v11_generated/generated.py
new file mode 100644
index 0000000..66e58df
--- /dev/null
+++ b/data/dataset/v11_generated/generated.py
@@ -0,0 +1,53 @@
+from swift.llm import ResponsePreprocessor, DatasetMeta, register_dataset, SubsetDataset, load_dataset
+from typing import Dict, Any
+import os
+
+
+class CustomPreprocessor(ResponsePreprocessor):
+    def get_detailed_instruct(self, task_description: str, query: str) -> str:
+        return f'Instruct: {task_description}\nQuery:{query}'
+
+    def add_template(self, text):
+        task = 'Given a web search query, retrieve relevant passages that answer the query'
+        return self.get_detailed_instruct(task, text)
+
+    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
+        query = self.add_template(row["query"])
+        passage_positive = [row["document"]]
+        passage_negative = []
+        passage_negative_random = []
+
+        all_neg = passage_negative + passage_negative_random
+        all_neg = list(set(all_neg))
+
+        row = {
+            # 'query': [{'role': 'user', 'content': query, 'loss': None}],
+            'query': query,
+            'positive_messages': [
+                [{'role': 'user', 'content': passage_positive[i]}] for i in range(len(passage_positive))
+            ],
+            'negative_messages': [
+                [{'role': 'user', 'content': all_neg[i]}] for i in range(len(all_neg))
+            ],
+            # 'label': 1.0
+        }
+        if len(row["negative_messages"]) == 0:
+            del row["negative_messages"]
+        return super().preprocess(row)
+
+
+register_dataset(
+    DatasetMeta(
+        dataset_path=os.path.dirname(__file__) + '/v11_dataset.json',
+        dataset_name="v11_generated_dataset",
+        # subsets=[SubsetDataset('train', split=['train']), SubsetDataset('test', split=['test'])],
+        preprocess_func=CustomPreprocessor(),
+    ))
+
+if __name__ == '__main__':
+    # load_dataset returns train_dataset and val_dataset based on `split_dataset_ratio`
+    # Here, since we didn't pass `split_dataset_ratio` (defaults to 0), we take the first one (index 0)
+    dataset = load_dataset('v11_generated_dataset')[0]
+    test_dataset = load_dataset('swift/financial_classification:test')[0]
+    print(f'dataset[0]: {dataset[0]}')
+    print(f'test_dataset[0]: {test_dataset[0]}')
\ No newline at end of file
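v11_generated carries only a (query, document) pair per row, so negative_messages is always empty and deleted, and training on it relies entirely on in-batch negatives (INFONCE_USE_BATCH=True in the training script further below). For reference, a minimal numpy sketch of InfoNCE with in-batch negatives, assuming L2-normalized embeddings and an illustrative temperature; swift's internal implementation may differ in details such as fake-negative masking:

    import numpy as np

    def infonce_in_batch(q, p, temperature=0.05):
        # q, p: (B, D) L2-normalized query/positive embeddings; p[i] is the positive for q[i].
        logits = q @ p.T / temperature               # (B, B): diagonal = positives, off-diagonal = in-batch negatives
        logits -= logits.max(axis=1, keepdims=True)  # numerical stability
        log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
        return float(-np.diag(log_probs).mean())     # cross-entropy toward the diagonal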
diff --git a/evaluation/evaluation.py b/evaluation/evaluation.py
index c7952d8..8d53429 100644
--- a/evaluation/evaluation.py
+++ b/evaluation/evaluation.py
@@ -56,7 +56,7 @@ class CustomModel:
         **kwargs,
     ) -> np.ndarray:
 
-        embedding_url = "http://127.0.0.1:5000/embedding"
+        embedding_url = "http://127.0.0.1:5015/embedding"
 
         if prompt_type == None:
             template = "document"
@@ -89,6 +89,8 @@ def is_dataset_cached(dataset_name):
 
 def evaluate():
     model_name = "Qwen3-Embedding-0.6B"
+    # model_name = "KaLM-embedding-multilingual-mini-instruct-v2.5"
+    # model_name = "KaLM-Embedding-Gemma3-12B-2511"
     # model_name = "llama-embed-nemotron-8b"
     # model_name = "embeddinggemma-300m"
     model = CustomModel(model_name)
diff --git a/train/qwen/a.sh b/train/qwen/a.sh
index 2fc6570..2b3c156 100644
--- a/train/qwen/a.sh
+++ b/train/qwen/a.sh
@@ -3,6 +3,9 @@
 
 nproc_per_node=1
 
+# INFONCE_HARD_NEGATIVES=1 \
+# INFONCE_MASK_FAKE_NEGATIVE=True \
+
 MLFLOW_TRACKING_URI=http://0.0.0.0:5004 \
 INFONCE_USE_BATCH=True \
 CUDA_VISIBLE_DEVICES=0 \
@@ -16,8 +19,8 @@ swift sft \
     --lora_alpha 32 \
     --target_modules all-linear \
     --max_length 2048 \
-    --dataset v11_dataset_hn \
-    --custom_register_path $(pwd)/../../data/dataset/v11_dataset_hn/generated.py \
+    --dataset generated_250000_religous \
+    --custom_register_path $(pwd)/../../data/dataset/generated_250000_religous/generated.py \
     --split_dataset_ratio 0.005 \
     --eval_strategy steps \
     --output_dir output \
diff --git a/train/qwen/merge_model.py b/train/qwen/merge_model.py
index 71ccdb1..82e7e35 100644
--- a/train/qwen/merge_model.py
+++ b/train/qwen/merge_model.py
@@ -33,8 +33,8 @@ def main():
     file_path = os.path.dirname(__file__)
 
     base_model_path = file_path + "/../../data/models/Qwen3-Embedding-0.6B/model"
-    peft_model_path = file_path + "/output/v17-20251202-223944/checkpoint-387"
-    save_path = file_path + "/output/v17-20251202-223944/merged_checkpoint-387"
+    peft_model_path = file_path + "/output/v23-20251214-111804/checkpoint-3632"
+    save_path = file_path + "/output/v23-20251214-111804/merged_checkpoint-3632"
     merge(base_model_path, peft_model_path, save_path)
 
     items = ["1_Pooling", "config_sentence_transformers.json", "merges.txt", "modules.json", "README.md", "tokenizer_config.json", "tokenizer.json",
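The merge() helper called in this hunk is defined earlier in merge_model.py and is not part of the diff. For context, a typical LoRA-adapter merge with peft looks roughly like the sketch below; this assumes the checkpoint is a standard PEFT adapter directory and is not necessarily the repository's actual implementation:

    from peft import PeftModel
    from transformers import AutoModel, AutoTokenizer

    def merge(base_model_path, peft_model_path, save_path):
        base = AutoModel.from_pretrained(base_model_path)
        model = PeftModel.from_pretrained(base, peft_model_path)
        merged = model.merge_and_unload()   # fold the LoRA deltas into the base weights
        merged.save_pretrained(save_path)
        AutoTokenizer.from_pretrained(base_model_path).save_pretrained(save_path)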