#!/usr/bin/env bash
# Fine-tune Qwen3-Embedding-0.6B with LoRA via ms-swift (embedding task, InfoNCE loss).
# Prerequisite (optional, for faster attention):
#   pip install flash-attn --no-build-isolation
set -euo pipefail

# Single-process launch on GPU 0 (CUDA_VISIBLE_DEVICES pins the device).
nproc_per_node=1

# INFONCE_USE_BATCH=False: per-sample (not in-batch) negatives for the InfoNCE loss.
# Paths are quoted so the script survives a working directory containing spaces.
INFONCE_USE_BATCH=False \
CUDA_VISIBLE_DEVICES=0 \
NPROC_PER_NODE="$nproc_per_node" \
swift sft \
  --model "$(pwd)/../../data/models/Qwen3-Embedding-0.6B/model" \
  --task_type embedding \
  --model_type qwen3_emb \
  --train_type lora \
  --lora_rank 8 \
  --lora_alpha 16 \
  --target_modules all-linear \
  --dataset my_local_dataset \
  --custom_register_path "$(pwd)/../../data/dataset/my_dataset_register.py" \
  --split_dataset_ratio 0.005 \
  --eval_strategy steps \
  --output_dir output \
  --eval_steps 1000 \
  --num_train_epochs 1 \
  --save_steps 1000 \
  --save_total_limit 10 \
  --per_device_train_batch_size 16 \
  --per_device_eval_batch_size 16 \
  --gradient_accumulation_steps 4 \
  --learning_rate 2.4e-5 \
  --loss_type infonce \
  --label_names labels \
  --dataloader_drop_last true \
  --deepspeed zero3