train jina

parent 318c183d48
commit 467c21ce7e
@@ -150,40 +150,40 @@ def save_dataset(dataset, output_path):
 def main(output_path):

-    # #load synthetic dataset
-    # print("--------------------------------")
-    # print("loading synthetic dataset")
-    # synthetic_train_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/train.jsonl"
-    # synthetic_corpus_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/corpus.jsonl"
-    # synthetic_queries_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/queries.jsonl"
+    #load synthetic dataset
+    print("--------------------------------")
+    print("loading synthetic dataset")
+    synthetic_train_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/train.jsonl"
+    synthetic_corpus_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/corpus.jsonl"
+    synthetic_queries_path = "/home/firouzi/embedding_model/data_preprocess_notebook/data/synthetic-persian-qa-retrieval/queries.jsonl"

-    # synthetic_dataset = load_synthetic_dataset(synthetic_train_path, synthetic_queries_path, synthetic_corpus_path)
-    # print(f"synthetic dataset loaded : {len(synthetic_dataset)} samples")
-    # print("--------------------------------")
+    synthetic_dataset = load_synthetic_dataset(synthetic_train_path, synthetic_queries_path, synthetic_corpus_path)
+    print(f"synthetic dataset loaded : {len(synthetic_dataset)} samples")
+    print("--------------------------------")

-    # #load pquad dataset
-    # print("loading pquad dataset")
-    # pquad_dataset = load_pquad_dataset()
-    # print(f"pquad dataset loaded : {len(pquad_dataset)} samples")
-    # print("--------------------------------")
+    #load pquad dataset
+    print("loading pquad dataset")
+    pquad_dataset = load_pquad_dataset()
+    print(f"pquad dataset loaded : {len(pquad_dataset)} samples")
+    print("--------------------------------")

-    # # merge synthetic and pquad dataset
-    # print("start to merge synthetic and pquad dataset")
-    # all_dataset = synthetic_dataset + pquad_dataset
-    # print(f"successfully merged synthetic and pquad dataset")
-    # print("--------------------------------")
+    # merge synthetic and pquad dataset
+    print("start to merge synthetic and pquad dataset")
+    all_dataset = synthetic_dataset + pquad_dataset
+    print(f"successfully merged synthetic and pquad dataset")
+    print("--------------------------------")

-    # # removing false negative samples from all dataset
-    # print("start to remove false negative samples from all dataset")
-    # all_dataset = remove_false_negative(all_dataset, random_negative_sample=False)
-    # print(f"successfully removed false negative samples from all dataset")
-    # print("--------------------------------")
+    # removing false negative samples from all dataset
+    print("start to remove false negative samples from all dataset")
+    all_dataset = remove_false_negative(all_dataset, random_negative_sample=False)
+    print(f"successfully removed false negative samples from all dataset")
+    print("--------------------------------")

-    with open("/home/firouzi/embedding_model/data/train.json", "r", encoding="utf-8") as f:
-        all_dataset = json.load(f)
+    # with open("/home/firouzi/embedding_model/data/train.json", "r", encoding="utf-8") as f:
+    #     all_dataset = json.load(f)

-    for i in range(len(all_dataset)):
-        all_dataset[i]['passage_negative_random'] = []
+    # for i in range(len(all_dataset)):
+    #     all_dataset[i]['passage_negative_random'] = []

     #generate random negative samples
     print("start to generate random negative samples")

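Note on the hunk above: main() now rebuilds the training set from the synthetic and PQuAD sources instead of reading the cached train.json, then proceeds to random-negative generation. The repository's load_synthetic_dataset, load_pquad_dataset, and remove_false_negative helpers are not shown in this diff; the sketch below is only a plausible shape for the random-negative step that follows the hunk, and every name in it is an assumption:

import random

def generate_random_negatives(all_dataset, corpus, k=5, seed=42):
    # Hypothetical sketch: draw k passages uniformly from the corpus while
    # skipping the sample's own positive passages, storing them under the
    # 'passage_negative_random' key used elsewhere in this commit.
    rng = random.Random(seed)
    for sample in all_dataset:
        negatives = []
        while len(negatives) < k:
            candidate = rng.choice(corpus)
            if candidate not in sample["passage_positive"]:
                negatives.append(candidate)
        sample["passage_negative_random"] = negatives
    return all_dataset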
@@ -1,3 +1,5 @@
 python-dotenv==1.1.1
-hazm
-faiss-cpu
+hazm==0.10.0
+faiss-cpu==1.12.0
+sentence-transformers==5.1.2
+einops==0.8.1
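Pinning exact versions here is deliberate: hazm and faiss-cpu previously floated, so a fresh environment could resolve to incompatible releases. A quick standard-library check that the active environment matches the pins (a sketch; the distribution names are exactly those listed above):

from importlib.metadata import version

for dist in ("python-dotenv", "hazm", "faiss-cpu", "sentence-transformers", "einops"):
    print(dist, version(dist))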
173 research_notebook/data_preprocess/bge.ipynb Normal file
@@ -0,0 +1,173 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9dbad513",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/firouzi/embedding_model/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "Downloading readme: 100%|██████████| 419/419 [00:00<00:00, 1.18MB/s]\n",
+      "Downloading data: 100%|██████████| 1.59M/1.59M [00:01<00:00, 1.03MB/s]\n",
+      "Generating train split: 100%|██████████| 7000/7000 [00:00<00:00, 175360.77 examples/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "ds = load_dataset(\"virattt/financial-qa-10K\", split=\"train\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "7330f385",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'query': 'What area did NVIDIA initially focus on before expanding to other computationally intensive fields?',\n",
+       " 'pos': 'Since our original focus on PC graphics, we have expanded to several other large and important computationally intensive fields.',\n",
+       " 'id': '0'}"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ds = ds.select_columns(column_names=[\"question\", \"context\"])\n",
+    "ds = ds.rename_column(\"question\", \"query\")\n",
+    "ds = ds.rename_column(\"context\", \"pos\")\n",
+    "ds = ds.add_column(\"id\", [str(i) for i in range(len(ds))])\n",
+    "ds[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "5ba361dd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Map: 100%|██████████| 7000/7000 [00:00<00:00, 19176.72 examples/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "np.random.seed(520)\n",
+    "neg_num = 10\n",
+    "\n",
+    "def str_to_lst(data):\n",
+    "    data[\"pos\"] = [data[\"pos\"]]\n",
+    "    return data\n",
+    "\n",
+    "# sample negative texts\n",
+    "new_col = []\n",
+    "for i in range(len(ds)):\n",
+    "    ids = np.random.randint(0, len(ds), size=neg_num)\n",
+    "    while i in ids:\n",
+    "        ids = np.random.randint(0, len(ds), size=neg_num)\n",
+    "    neg = [ds[i.item()][\"pos\"] for i in ids]\n",
+    "    new_col.append(neg)\n",
+    "ds = ds.add_column(\"neg\", new_col)\n",
+    "\n",
+    "# change the key of 'pos' to a list\n",
+    "ds = ds.map(str_to_lst)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "bf3241ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "instruction = \"Represent this sentence for searching relevant passages: \"\n",
+    "ds = ds.add_column(\"prompt\", [instruction]*len(ds))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "a35c1466",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "split = ds.train_test_split(test_size=0.1, shuffle=True, seed=520)\n",
+    "train = split[\"train\"]\n",
+    "test = split[\"test\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "24f3f7fb",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Creating json from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 26.22ba/s]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "16583481"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "train.to_json(\"training.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5cc42ed",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
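A note on the negative-sampling cell in bge.ipynb above: it rejection-samples index vectors until the row's own index is absent, and because it draws with replacement, a query can still receive duplicate negatives. An equivalent sketch that excludes the row up front and samples without replacement (the helper name and the without-replacement choice are my assumptions, not the notebook's exact behavior):

import numpy as np

rng = np.random.default_rng(520)

def sample_negative_ids(own_idx, n_rows, k=10):
    # Build the candidate pool without the row's own index, so a query can
    # never be paired with its own positive, then draw k distinct negatives.
    pool = np.concatenate([np.arange(own_idx), np.arange(own_idx + 1, n_rows)])
    return rng.choice(pool, size=k, replace=False)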
@@ -141,6 +141,18 @@
    "id": "53e5e322",
    "metadata": {},
    "outputs": [],
-   "source": []
-  }
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "ds = load_dataset(\"virattt/financial-qa-10K\", split=\"train\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1fabd9d8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
  ],
@@ -1,4 +1,4 @@
-from datasets import Dataset
+from datasets import Dataset, load_dataset, VerificationMode
 import json
 from sentence_transformers import (
     SentenceTransformer,
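The widened datasets import backs the (still commented-out) parsinlu experiment later in this file: VerificationMode.NO_CHECKS tells datasets to skip split-size and checksum verification, which that dataset needs when its hub metadata is stale. A minimal sketch of the intended call, mirroring the commented line further down:

from datasets import load_dataset, VerificationMode

# Skip split-size/checksum verification for a dataset with outdated metadata.
dataset = load_dataset("persiannlp/parsinlu_reading_comprehension",
                       verification_mode=VerificationMode.NO_CHECKS)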
@@ -10,15 +10,18 @@ from sentence_transformers.training_args import BatchSamplers
 from sentence_transformers.evaluation import RerankingEvaluator


 print("start")
 ########### Load model ###########
 print("loading model")
 # 1. Load a model to finetune with 2. (Optional) model card data
 model = SentenceTransformer("jinaai/jina-embeddings-v3",
                             trust_remote_code=True,
                             local_files_only=False,
-                            model_kwargs={'default_task': 'retrieval'})
+                            model_kwargs={'default_task': 'retrieval.passage'})


 ########### Load dataset ###########
 print("loading dataset")
 # 3. Load a dataset to finetune on
 with open("/home/firouzi/embedding_model/data/train_100.json", "r", encoding="utf-8") as f:
     all_dataset = json.load(f)
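Context for the retrieval → retrieval.passage change: jina-embeddings-v3 selects among named LoRA task adapters (retrieval.query, retrieval.passage, separation, classification, text-matching), and plain 'retrieval' is not one of them, so the old default_task was invalid. For asymmetric retrieval the usual pairing encodes the two sides with different adapters; a hedged sketch of inference-time usage following the model card's task/prompt_name convention, which this diff itself does not show:

# Queries and passages use different adapters at encode time.
query_embs = model.encode(queries, task="retrieval.query", prompt_name="retrieval.query")
passage_embs = model.encode(passages, task="retrieval.passage", prompt_name="retrieval.passage")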
@@ -32,7 +35,7 @@ negatives_4 = []
 negatives_5 = []
 for data in all_dataset:
     anchors.append(data["question"])
-    positives.append(data["passage_positive"])
+    positives.append(data["passage_positive"][0])
     all_negatives = data["passage_negative"] + data["passage_negative_random"]
     if len(all_negatives) < 5:
         for i in range(5 - len(all_negatives)):
@@ -57,35 +60,44 @@ dataset_split = dataset.train_test_split(test_size=0.05, seed=42)

 train_dataset = dataset_split["train"]
 eval_dataset = dataset_split["test"]
+# print(train_dataset[1])
+
+# dataset = load_dataset("persiannlp/parsinlu_reading_comprehension", verification_mode=VerificationMode.NO_CHECKS)
+# train_dataset = dataset["train"]
+# print(train_dataset[1])
 ########### Load loss function ###########
 print("loading loss function")
 # 4. Define a loss function
 loss = MultipleNegativesRankingLoss(model)
+# loss =

 ########### Load training arguments ###########
 print("loading training arguments")
 # 5. (Optional) Specify training arguments
 args = SentenceTransformerTrainingArguments(
     # Required parameter:
     output_dir="models/jina_v3",
     # Optional training parameters:
     num_train_epochs=1,
-    per_device_train_batch_size=16,
-    per_device_eval_batch_size=16,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=4,
     learning_rate=2e-5,
     warmup_ratio=0.1,
     fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
     bf16=False,  # Set to True if you have a GPU that supports BF16
-    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
+    batch_sampler=BatchSamplers.BATCH_SAMPLER,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
     # Optional tracking/debugging parameters:
-    eval_strategy="steps",
-    eval_steps=100,
+    # eval_strategy="steps",
+    # eval_steps=5,
     save_strategy="steps",
-    save_steps=100,
+    save_steps=10,
     save_total_limit=2,
-    logging_steps=100,
+    logging_steps=5,
     run_name="jina_v3",  # Will be used in W&B if `wandb` is installed
 )

 ########### Load evaluator ###########
 print("loading evaluator")
 # 6. (Optional) Create an evaluator & evaluate the base model
 eval_dataset_evaluator = [
     {
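Two of the changes above pull in opposite directions: the comment kept on the batch_sampler line still argues for no duplicates, yet the sampler was switched to the default BATCH_SAMPLER. With MultipleNegativesRankingLoss, every other positive in the batch serves as an in-batch negative, so duplicate pairs turn a true passage into a negative label. A minimal sketch of the loss's in-batch structure (simplified from, not copied from, sentence-transformers):

import torch
import torch.nn.functional as F

def mnrl_sketch(query_embs, passage_embs, scale=20.0):
    # scores[i][j] = similarity(query_i, passage_j); the target is the diagonal,
    # so passage_j with j != i acts as an in-batch negative for query_i. If two
    # rows share a passage, the off-diagonal "negative" is actually correct,
    # which is why BatchSamplers.NO_DUPLICATES exists.
    scores = F.cosine_similarity(query_embs.unsqueeze(1),
                                 passage_embs.unsqueeze(0), dim=-1) * scale
    labels = torch.arange(query_embs.size(0))
    return F.cross_entropy(scores, labels)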
@@ -99,17 +111,18 @@ dev_evaluator = RerankingEvaluator(
     name="jina_v3",
     samples=eval_dataset_evaluator,
 )
-dev_evaluator(model)
+# dev_evaluator(model)

 ########### Load trainer ###########
 print("loading trainer")
 # 7. Create a trainer & train
 trainer = SentenceTransformerTrainer(
     model=model,
     args=args,
     train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
+    # eval_dataset=eval_dataset,
     loss=loss,
-    evaluator=dev_evaluator,
+    # evaluator=dev_evaluator,
 )
 trainer.train()

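With the evaluator, eval_dataset, and baseline evaluation all commented out above, training now runs blind: checkpoints at save_steps=10 are kept by recency, not quality. If the change is temporary (for example, to isolate a crash), the evaluator can still be run standalone before and after training; a sketch, assuming the sample format already built in this file:

from sentence_transformers.evaluation import RerankingEvaluator

dev_evaluator = RerankingEvaluator(samples=eval_dataset_evaluator, name="jina_v3")
baseline_metrics = dev_evaluator(model)  # MAP/MRR-style reranking metrics
print(baseline_metrics)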
@@ -124,5 +137,8 @@ trainer.train()
 # test_evaluator(model)

 ########### Save the trained model ###########
 print("saving model")
 # 8. Save the trained model
 model.save_pretrained("models/jina_v3")
 print("model saved")
 print("end")
101 train/jina/test.py Normal file
@@ -0,0 +1,101 @@
+from sentence_transformers import (
+    SentenceTransformer,
+    InputExample,
+    SentenceTransformerTrainingArguments,
+    SentenceTransformerTrainer,
+)
+from sentence_transformers.losses import TripletLoss, MatryoshkaLoss, TripletDistanceMetric
+from sentence_transformers.evaluation import TripletEvaluator, SimilarityFunction, SequentialEvaluator
+from transformers import EarlyStoppingCallback
+import torch
+import os
+import json
+from datasets import Dataset
+
+model = SentenceTransformer("jinaai/jina-embeddings-v3",
+                            trust_remote_code=True,
+                            local_files_only=False,
+                            model_kwargs={'default_task': 'text-matching'})
+
+print("loading dataset")
+# 3. Load a dataset to finetune on
+with open("/home/firouzi/embedding_model/data/train_100.json", "r", encoding="utf-8") as f:
+    all_dataset = json.load(f)
+
+anchors = []
+positives = []
+negatives_1 = []
+negatives_2 = []
+negatives_3 = []
+negatives_4 = []
+negatives_5 = []
+for data in all_dataset:
+    anchors.append(data["question"])
+    positives.append(data["passage_positive"])
+    all_negatives = data["passage_negative"] + data["passage_negative_random"]
+    if len(all_negatives) < 5:
+        for i in range(5 - len(all_negatives)):
+            all_negatives.append(all_negatives[0])
+    negatives_1.append(all_negatives[0])
+    negatives_2.append(all_negatives[1])
+    negatives_3.append(all_negatives[2])
+    negatives_4.append(all_negatives[3])
+    negatives_5.append(all_negatives[4])
+
+dataset = Dataset.from_dict({
+    "anchor": anchors,
+    "positive": positives,
+    "negative": negatives_1,
+})
+
+dataset_split = dataset.train_test_split(test_size=0.05, seed=42)
+
+train_dataset = dataset_split["train"]
+eval_dataset = dataset_split["test"]
+
+
+loss = TripletLoss(model,
+                   distance_metric=TripletDistanceMetric.COSINE,
+                   triplet_margin=0.75)
+
+dev_evaluator = TripletEvaluator(
+    anchors=eval_dataset["anchor"],
+    positives=eval_dataset["positive"],
+    negatives=eval_dataset["negative"],
+    main_similarity_function=SimilarityFunction.COSINE
+)
+
+training_args = SentenceTransformerTrainingArguments(
+    output_dir="save_dir",
+    num_train_epochs=1,
+    per_device_train_batch_size=2,
+    per_device_eval_batch_size=2,
+    learning_rate=2.5e-5,
+    warmup_ratio=0.1,
+    greater_is_better=True,
+    load_best_model_at_end=True,
+    metric_for_best_model="eval_cosine_accuracy",
+    fp16=False,
+    bf16=True,
+    eval_strategy="steps",
+    eval_steps=50,
+    save_strategy="steps",
+    save_steps=50,
+    save_total_limit=10,
+    logging_steps=50,
+    logging_first_step=True,
+)
+
+trainer = SentenceTransformerTrainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    loss=loss,
+    evaluator=dev_evaluator,
+)
+
+# pretraining_encoding = model.encode(["The human torch was denied a bank loan."])
+# print("Pre-training encoding:", pretraining_encoding)
+
+# Begin fine-tuning
+trainer.train()
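test.py trains with TripletLoss over cosine distance; note it still appends data["passage_positive"] without the [0] indexing applied in the training script above. Since TripletDistanceMetric.COSINE is 1 minus cosine similarity (range [0, 2]), triplet_margin=0.75 requires the negative to sit 0.75 farther from the anchor than the positive before an example stops contributing loss. A worked sketch of the same objective (simplified from, not copied from, sentence-transformers):

import torch.nn.functional as F

def triplet_cosine_loss(anchor, positive, negative, margin=0.75):
    # distance = 1 - cosine similarity, as in TripletDistanceMetric.COSINE
    d_ap = 1 - F.cosine_similarity(anchor, positive)
    d_an = 1 - F.cosine_similarity(anchor, negative)
    # hinge: zero loss once d_an >= d_ap + margin
    return F.relu(d_ap - d_an + margin).mean()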
109 train/jina/test_2.py Normal file
@@ -0,0 +1,109 @@
+from datasets import Dataset
+import json
+from sentence_transformers import (
+    SentenceTransformer,
+    SentenceTransformerTrainer,
+    SentenceTransformerTrainingArguments,
+)
+from sentence_transformers.losses import MultipleNegativesRankingLoss
+from sentence_transformers.training_args import BatchSamplers
+from sentence_transformers.evaluation import RerankingEvaluator
+
+
+print("start")
+########### Load model ###########
+print("loading model")
+# 1. Load a model to finetune with 2. (Optional) model card data
+model = SentenceTransformer("jinaai/jina-embeddings-v3",
+                            trust_remote_code=True,
+                            local_files_only=False,
+                            model_kwargs={'default_task': 'retrieval.passage'})
+
+
+
+########### Load dataset ###########
+print("loading dataset")
+# 3. Load a dataset to finetune on
+with open("/home/firouzi/embedding_model/data/train_100.json", "r", encoding="utf-8") as f:
+    all_dataset = json.load(f)
+
+# MultipleNegativesRankingLoss expects InputExample(texts=[anchor, positive])
+# Your explicit negatives will be ignored, and in-batch negatives will be used.
+from sentence_transformers import InputExample
+from sklearn.model_selection import train_test_split
+
+all_examples = []
+for data in all_dataset:
+    all_examples.append(InputExample(texts=[data["question"], data["passage_positive"]]))
+
+# Split the dataset into train and evaluation
+train_examples, eval_examples = train_test_split(all_examples, test_size=0.05, random_state=42)
+
+print(f"Training with {len(train_examples)} examples")
+print(f"Evaluating with {len(eval_examples)} examples")
+
+########### Load loss function ###########
+print("loading loss function")
+# 4. Define a loss function
+loss = MultipleNegativesRankingLoss(model)
+
+########### Load evaluator ###########
+print("loading evaluator")
+# 6. (Optional) Create an evaluator
+# The evaluator format you had was correct, but we need to build it from the 'eval_examples'
+eval_dataset_evaluator = []
+for data in all_dataset:  # We can still use all_dataset to find the matching negatives
+    example_query = data["question"]
+    example_positive = data["passage_positive"]
+
+    # Find if this example is in our eval set
+    is_in_eval = False
+    for eval_ex in eval_examples:
+        if eval_ex.texts[0] == example_query and eval_ex.texts[1] == example_positive:
+            is_in_eval = True
+            break
+
+    if is_in_eval:
+        all_negatives = data["passage_negative"] + data["passage_negative_random"]
+        if len(all_negatives) < 5:
+            for i in range(5 - len(all_negatives)):
+                all_negatives.append(all_negatives[0])  # Pad negatives
+
+        eval_dataset_evaluator.append({
+            "query": example_query,
+            "positive": [example_positive],
+            "negative": all_negatives[:5],  # Use your original negatives for evaluation
+        })
+
+dev_evaluator = RerankingEvaluator(
+    name="jina_v3",
+    samples=eval_dataset_evaluator,
+)
+# dev_evaluator(model)  # You can still run this to check base performance
+
+########### Train the model ###########
+print("starting training with model.fit()")
+from torch.utils.data import DataLoader
+
+# Create a DataLoader for the training examples
+train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)
+
+# 7. Train the model using model.fit()
+model.fit(
+    train_objectives=[(train_dataloader, loss)],
+    evaluator=dev_evaluator,
+    epochs=1,
+    evaluation_steps=5,
+    warmup_steps=int(len(train_dataloader) * 0.1),  # 10% warmup
+    output_path="models/jina_v3",
+    save_best_model=True,
+    show_progress_bar=True,
+    use_amp=True,  # Replaces fp16=True
+)
+
+########### Save the trained model ###########
+# model.fit() already saves the best model to output_path, but you can save again
+print("saving final model")
+model.save_pretrained("models/jina_v3_final")
+print("model saved")
+print("end")
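test_2.py switches to the legacy model.fit() pipeline (InputExample plus DataLoader); since sentence-transformers v3 this is a compatibility wrapper over the Trainer used elsewhere in this commit, so the choice is largely stylistic. One real cost in the script is the eval-membership check, which rescans eval_examples for every record, O(N*M) overall; a set of (query, positive) pairs makes each lookup constant-time (a sketch, assuming both fields are plain strings):

# Build once, before the loop over all_dataset:
eval_pairs = {(ex.texts[0], ex.texts[1]) for ex in eval_examples}

# Then inside the loop:
is_in_eval = (example_query, example_positive) in eval_pairs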
141 train/jina/test_3.py Normal file
@@ -0,0 +1,141 @@
+from datasets import Dataset
+import json
+from sentence_transformers import (
+    SentenceTransformer,
+    SentenceTransformerTrainer,
+    SentenceTransformerTrainingArguments,
+)
+from sentence_transformers.losses import MultipleNegativesRankingLoss
+from sentence_transformers.training_args import BatchSamplers
+from sentence_transformers.evaluation import RerankingEvaluator
+
+
+print("start")
+########### Load model ###########
+print("loading model")
+# 1. Load a model to finetune with 2. (Optional) model card data
+model = SentenceTransformer("jinaai/jina-embeddings-v3",
+                            trust_remote_code=True,
+                            local_files_only=False,
+                            model_kwargs={'default_task': 'retrieval.passage'})
+
+
+########### Load dataset ###########
+print("loading dataset")
+# 3. Load a dataset to finetune on
+with open("/home/firouzi/embedding_model/data/train_100.json", "r", encoding="utf-8") as f:
+    all_dataset = json.load(f)
+
+anchors = []
+positives = []
+negatives_1 = []
+negatives_2 = []
+negatives_3 = []
+negatives_4 = []
+negatives_5 = []
+for data in all_dataset:
+    anchors.append(data["question"])
+    positives.append(data["passage_positive"])
+    all_negatives = data["passage_negative"] + data["passage_negative_random"]
+    if len(all_negatives) < 5:
+        for i in range(5 - len(all_negatives)):
+            all_negatives.append(all_negatives[0])
+    negatives_1.append(all_negatives[0])
+    negatives_2.append(all_negatives[1])
+    negatives_3.append(all_negatives[2])
+    negatives_4.append(all_negatives[3])
+    negatives_5.append(all_negatives[4])
+
+dataset = Dataset.from_dict({
+    "anchor": anchors,
+    "positive": positives,
+    "negative_1": negatives_1,
+    "negative_2": negatives_2,
+    "negative_3": negatives_3,
+    "negative_4": negatives_4,
+    "negative_5": negatives_5,
+})
+
+dataset_split = dataset.train_test_split(test_size=0.05, seed=42)
+
+train_dataset = dataset_split["train"]
+eval_dataset = dataset_split["test"]
+########### Load loss function ###########
+print("loading loss function")
+# 4. Define a loss function
+loss = MultipleNegativesRankingLoss(model)
+
+########### Load training arguments ###########
+print("loading training arguments")
+# 5. (Optional) Specify training arguments
+args = SentenceTransformerTrainingArguments(
+    # Required parameter:
+    output_dir="models/jina_v3",
+    # Optional training parameters:
+    num_train_epochs=1,
+    per_device_train_batch_size=4,
+    per_device_eval_batch_size=4,
+    learning_rate=2e-5,
+    warmup_ratio=0.1,
+    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
+    bf16=False,  # Set to True if you have a GPU that supports BF16
+    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
+    # Optional tracking/debugging parameters:
+    eval_strategy="steps",
+    eval_steps=5,
+    save_strategy="steps",
+    save_steps=10,
+    save_total_limit=2,
+    logging_steps=5,
+    run_name="jina_v3",  # Will be used in W&B if `wandb` is installed
+)
+
+########### Load evaluator ###########
+print("loading evaluator")
+# 6. (Optional) Create an evaluator & evaluate the base model
+eval_dataset_evaluator = [
+    {
+        "query": sample["anchor"],
+        "positive": [sample["positive"]],
+        "negative": [sample["negative_1"], sample["negative_2"], sample["negative_3"], sample["negative_4"], sample["negative_5"]],
+    }
+    for sample in eval_dataset
+]
+dev_evaluator = RerankingEvaluator(
+    name="jina_v3",
+    samples=eval_dataset_evaluator,
+)
+# dev_evaluator(model)
+
+########### Load trainer ###########
+print("loading trainer")
+# 7. Create a trainer & train
+# 7. Train the model using model.fit()
+model.fit(
+    train_objectives=[(train_dataset, loss)],
+    evaluator=dev_evaluator,
+    epochs=1,
+    evaluation_steps=5,
+    warmup_steps=int(len(train_dataset) * 0.1),  # 10% warmup
+    output_path="models/jina_v3",
+    save_best_model=True,
+    show_progress_bar=True,
+    use_amp=True,  # Replaces fp16=True
+)
+
+########### Load test evaluator ###########
+# (Optional) Evaluate the trained model on the test set
+# test_evaluator = TripletEvaluator(
+#     anchors=test_dataset["anchor"],
+#     positives=test_dataset["positive"],
+#     negatives=test_dataset["negative"],
+#     name="all-nli-test",
+# )
+# test_evaluator(model)
+
+########### Save the trained model ###########
+print("saving model")
+# 8. Save the trained model
+model.save_pretrained("models/jina_v3")
+print("model saved")
+print("end")
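A caution on test_3.py: model.fit() is handed train_dataset, a datasets.Dataset, inside train_objectives, but fit expects a PyTorch DataLoader yielding InputExample batches (which is exactly what test_2.py builds). As written, this call should fail or misbehave. A sketch of the minimal adaptation, reusing the columns built above as anchor/positive/first-negative triplets:

from torch.utils.data import DataLoader
from sentence_transformers import InputExample

# Convert the HF Dataset rows into InputExamples that model.fit can collate.
train_examples = [
    InputExample(texts=[row["anchor"], row["positive"], row["negative_1"]])
    for row in train_dataset
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)

model.fit(train_objectives=[(train_dataloader, loss)], epochs=1)

Alternatively, keep the Dataset and use SentenceTransformerTrainer, which consumes it directly, as the main training script in this commit does.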