# embedding_model/train/jina/test_2.py

from datasets import Dataset
import json
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import RerankingEvaluator


print("start")
########### Load model ###########
print("loading model")
# Step 1: instantiate the base checkpoint that will be fine-tuned.
# The constructor options are collected first so the call itself stays short:
#   - trust_remote_code=True allows code shipped with the checkpoint to run
#     (presumably required by this checkpoint -- verify before changing).
#   - local_files_only=False does not restrict loading to local files.
#   - model_kwargs forwards `default_task` to the underlying model.
#     NOTE(review): confirm "retrieval.passage" is the intended task for
#     encoding both the questions and the passages used below.
base_model_options = dict(
    trust_remote_code=True,
    local_files_only=False,
    model_kwargs={"default_task": "retrieval.passage"},
)
model = SentenceTransformer("jinaai/jina-embeddings-v3", **base_model_options)


########### Load dataset ###########
print("loading dataset")
# 3. Load a dataset to finetune on.
# The file is parsed as JSON and iterated record by record; this section reads
# the "question" and "passage_positive" keys of every record.
with open("/home/firouzi/embedding_model/data/train_100.json", "r", encoding="utf-8") as f:
    all_dataset = json.load(f)

# InputExample is exported from the package root (its implementation lives in
# sentence_transformers.readers). The previous import path,
# `sentence_transformers.data`, is not a module of the library and failed at
# import time.
from sentence_transformers import InputExample
from sklearn.model_selection import train_test_split

# Each training example is an (anchor, positive) pair. Only those two texts are
# handed to the loss, so the explicit negatives stored in the records are not
# used for training; the loss relies on in-batch negatives instead.
all_examples = [
    InputExample(texts=[data["question"], data["passage_positive"]])
    for data in all_dataset
]

# Hold out 5% of the pairs for evaluation; the fixed seed keeps the split
# reproducible between runs.
train_examples, eval_examples = train_test_split(all_examples, test_size=0.05, random_state=42)

print(f"Training with {len(train_examples)} examples")
print(f"Evaluating with {len(eval_examples)} examples")

########### Load loss function ###########
print("loading loss function")
# Step 4: the training objective, bound to the model that is being fine-tuned.
loss = MultipleNegativesRankingLoss(model=model)

########### Load evaluator ###########
print("loading evaluator")
# 6. (Optional) Create an evaluator
# Each evaluator sample is a dict with a "query" string, a list of "positive"
# passages and a list of "negative" passages. Training ignores the explicit
# negatives, but they are used here so the evaluation is a real reranking task.

# Number of negatives kept per evaluation sample.
NUM_EVAL_NEGATIVES = 5

# Hash the held-out (question, positive) pairs once so that each record needs a
# single O(1) lookup instead of a scan over every evaluation example.
# Assumes both texts are strings (hashable).
eval_pairs = {(eval_ex.texts[0], eval_ex.texts[1]) for eval_ex in eval_examples}

eval_dataset_evaluator = []
for data in all_dataset:  # walk the raw records to recover each pair's negatives
    example_query = data["question"]
    example_positive = data["passage_positive"]

    if (example_query, example_positive) not in eval_pairs:
        continue

    # Concatenation builds a new list, so the padding below never mutates the
    # lists stored in all_dataset.
    all_negatives = data["passage_negative"] + data["passage_negative_random"]
    if not all_negatives:
        # Nothing to rank the positive against (and nothing to pad with);
        # previously this case crashed with IndexError on all_negatives[0].
        print(f"skipping eval sample without negatives: {example_query!r}")
        continue

    # Pad short lists by repeating the first negative.
    missing = NUM_EVAL_NEGATIVES - len(all_negatives)
    if missing > 0:
        all_negatives.extend([all_negatives[0]] * missing)

    eval_dataset_evaluator.append({
        "query": example_query,
        "positive": [example_positive],
        "negative": all_negatives[:NUM_EVAL_NEGATIVES],
    })

dev_evaluator = RerankingEvaluator(
    name="jina_v3",
    samples=eval_dataset_evaluator,
)
# Optional: call dev_evaluator(model) here to measure the base model before training.

########### Train the model ###########
print("starting training with model.fit()")
from torch.utils.data import DataLoader

# Shuffled mini-batches of 4 training pairs.
train_dataloader = DataLoader(train_examples, batch_size=4, shuffle=True)

# Step 7: gather the fit() options in one place, then launch training.
fit_options = {
    "train_objectives": [(train_dataloader, loss)],
    "evaluator": dev_evaluator,
    "epochs": 1,
    "evaluation_steps": 5,
    # Warm up over the first 10% of the batches in one epoch.
    "warmup_steps": int(len(train_dataloader) * 0.1),
    "output_path": "models/jina_v3",
    "save_best_model": True,
    "show_progress_bar": True,
    # Mixed-precision training; stands in for fp16=True.
    "use_amp": True,
}
model.fit(**fit_options)

########### Save the trained model ###########
# fit() has already written the best checkpoint to its output_path; this also
# stores the model as it is at the end of training, in a separate directory.
print("saving final model")
model.save_pretrained("models/jina_v3_final")
print("model saved")
print("end")