#!/usr/bin/env bash
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
export PYTHONPATH=$PYTHONPATH:/workspace/CosyVoice
# export HF_HOME="/lustre/fsw/general_sa/yuekaiz/.cache/huggingface"
set -euo pipefail
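# Minimal argument guard (sketch; this check is an addition, not part of the
# original recipe). With `set -u` in effect, calling the script without both
# positional arguments would abort with an "unbound variable" error; fail with
# a usage hint instead.
if [ $# -lt 2 ]; then
echo "Usage: $0 <stage> <stop_stage>  (e.g. $0 17 17)"
exit 1
fi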
stage=$1
stop_stage=$2
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
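# Reusable sketch (assumption: not wired into the stages below, which keep their
# own inline scan loops): print the largest numeric step among
# <exp_dir>/checkpoint-*/ directories, or -1 if none exists.
latest_checkpoint_step() {
local dir=$1 step=-1 name d
for d in "$dir"/checkpoint-*/; do
[ -d "$d" ] || continue
name=$(basename "$d")
name=${name#checkpoint-}
if [[ "$name" =~ ^[0-9]+$ ]] && [ "$name" -gt "$step" ]; then
step=$name
fi
done
echo "$step"
}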
if [ $stage -le 17 ] && [ $stop_stage -ge 17 ]; then
echo "cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -"
if [ ! -L "/workspace/slam" ]; then
cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -
fi
log "stage 17: Training Speech2Speech Model, full parameters"
exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_three_s2s
pretrained_dir=./qwen_omni/exp_speech2text
ngpu=4
latest_checkpoint_step=-1
# Check if exp_dir exists and is a directory
if [ -d "$exp_dir" ]; then
# List directories matching checkpoint-* and find the one with the largest step number
for checkpoint_dir in $(ls -d $exp_dir/checkpoint-*/ 2>/dev/null | sort -V); do
checkpoint_name=$(basename "$checkpoint_dir") # e.g., checkpoint-1000
# Extract step number using parameter expansion
current_step=${checkpoint_name#checkpoint-}
# Ensure current_step is a number
if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
latest_checkpoint_step=$current_step
fi
done
fi
train_cmd_args="--max-duration 200 \
--enable-musan False \
--exp-dir $exp_dir \
--last-stage-model-path $pretrained_dir/checkpoint-58548/pytorch_model.bin \
--speech-encoder-path-or-name models/large-v2.pt \
--llm-path-or-name models/Qwen2.5-0.5B-Instruct \
--on-the-fly-feats True --on-the-fly-speed-perturb False \
--deepspeed \
--huggingface-dataset-path-or-name /lustre/fsw/general_sa/yuekaiz/s2s \
--deepspeed_config ./qwen_omni/ds_config_zero1.json \
--use-flash-attn True \
--dataset vocalnet_ultrachat_voiceassistant_instruct_s2s --num-epochs 10 \
--use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output False"
if [ "$latest_checkpoint_step" -ge 0 ]; then
log "Continuing training from checkpoint-$latest_checkpoint_step"
step=$latest_checkpoint_step
train_cmd_args="$train_cmd_args --pretrained-model-path $exp_dir/checkpoint-${step}/pytorch_model.bin --sampler-state-dict-path $exp_dir/checkpoint-${step}/sampler.pt"
else
log "Starting training from scratch as no checkpoint was found in $exp_dir"
# No pretrained model or sampler state dict needed for the first run
fi
torchrun --nproc_per_node $ngpu --nnodes $SLURM_JOB_NUM_NODES --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d --rdzv_id $SLURM_JOBID ./qwen_omni/train.py \
$train_cmd_args
fi
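# Example launch of this stage (sketch; the exported values are illustrative
# assumptions, not part of the original recipe). torchrun above reads
# SLURM_JOB_NUM_NODES, SLURM_JOBID, MASTER_ADDR and MASTER_PORT, so a
# single-node run outside a SLURM allocation could look like:
#   export SLURM_JOB_NUM_NODES=1 SLURM_JOBID=0
#   export MASTER_ADDR=localhost MASTER_PORT=29500
#   bash <this-script>.sh 17 17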
if [ $stage -le 18 ] && [ $stop_stage -ge 18 ]; then
echo "cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -"
# check if the link exists, if not exist, create it
if [ ! -L "/workspace/slam" ]; then
cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -
fi
log "stage 17: Training Speech2Speech Model, full parameters"
exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_three_s2s_librispeech
pretrained_dir=./qwen_omni/exp_speech2text
ngpu=4
latest_checkpoint_step=-1
# Check if exp_dir exists and is a directory
if [ -d "$exp_dir" ]; then
# List directories matching checkpoint-* and find the one with the largest step number
for checkpoint_dir in $(ls -d $exp_dir/checkpoint-*/ 2>/dev/null | sort -V); do
checkpoint_name=$(basename "$checkpoint_dir") # e.g., checkpoint-1000
# Extract step number using parameter expansion
current_step=${checkpoint_name#checkpoint-}
# Ensure current_step is a number
if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
latest_checkpoint_step=$current_step
fi
done
fi
train_cmd_args="--max-duration 200 \
--enable-musan False \
--exp-dir $exp_dir \
--last-stage-model-path $pretrained_dir/checkpoint-58548/pytorch_model.bin \
--speech-encoder-path-or-name models/large-v2.pt \
--llm-path-or-name models/Qwen2.5-0.5B-Instruct \
--on-the-fly-feats True --on-the-fly-speed-perturb False \
--deepspeed \
--huggingface-dataset-path-or-name /lustre/fsw/general_sa/yuekaiz/s2s \
--deepspeed_config ./qwen_omni/ds_config_zero1.json \
--use-flash-attn True \
--dataset vocalnet_ultrachat_voiceassistant_instruct_s2s_librispeech --num-epochs 10 \
--use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output False"
if [ "$latest_checkpoint_step" -ge 0 ]; then
log "Continuing training from checkpoint-$latest_checkpoint_step"
step=$latest_checkpoint_step
train_cmd_args="$train_cmd_args --pretrained-model-path $exp_dir/checkpoint-${step}/pytorch_model.bin --sampler-state-dict-path $exp_dir/checkpoint-${step}/sampler.pt"
else
log "Starting training from scratch as no checkpoint was found in $exp_dir"
# No pretrained model or sampler state dict needed for the first run
fi
torchrun --nproc_per_node $ngpu --nnodes $SLURM_JOB_NUM_NODES --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d --rdzv_id $SLURM_JOBID ./qwen_omni/train.py \
$train_cmd_args
fi
if [ $stage -le 19 ] && [ $stop_stage -ge 19 ]; then
log "stage 19: Training TTS Model"
# Earlier experiment directories, kept for reference; the last assignment is the one used.
# exp_dir=./qwen_omni/exp_tts_ultra_chat_voice_assistant
# exp_dir=./qwen_omni/exp_tts_emilia_en_tts_only_template
exp_dir=./qwen_omni/exp_tts_emilia_en_tts_three_concat
pretrained_dir=./qwen_omni/exp_speech2text
ngpu=4
latest_checkpoint_step=-1
# Check if exp_dir exists and is a directory
if [ -d "$exp_dir" ]; then
# List directories matching checkpoint-* and find the one with the largest step number
for checkpoint_dir in $(ls -d $exp_dir/checkpoint-*/ 2>/dev/null | sort -V); do
checkpoint_name=$(basename "$checkpoint_dir") # e.g., checkpoint-1000
# Extract step number using parameter expansion
current_step=${checkpoint_name#checkpoint-}
# Ensure current_step is a number
if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
latest_checkpoint_step=$current_step
fi
done
fi
# --dataset ultra_chat_voice_assistant
train_cmd_args="--batch-size 30 \
--exp-dir $exp_dir \
--llm-path-or-name models/Qwen2.5-0.5B-Instruct \
--enable-speech-input False \
--deepspeed \
--dataset /lustre/fsw/general_sa/yuekaiz/s2s/VoxBox/manifests_emilia_en \
--deepspeed_config ./qwen_omni/ds_config_zero1.json \
--use-flash-attn True \
--num-epochs 3 \
--use-lora False --unfreeze-llm False --enable-speech-output True"
if [ "$latest_checkpoint_step" -ge 0 ]; then
log "Continuing training from checkpoint-$latest_checkpoint_step"
step=$latest_checkpoint_step
train_cmd_args="$train_cmd_args --pretrained-model-path $exp_dir/checkpoint-${step}/pytorch_model.bin --sampler-state-dict-path $exp_dir/checkpoint-${step}/sampler.pt"
else
log "Starting training from scratch as no checkpoint was found in $exp_dir"
# No pretrained model or sampler state dict needed for the first run
fi
torchrun --nproc_per_node $ngpu --nnodes $SLURM_JOB_NUM_NODES --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d --rdzv_id $SLURM_JOBID ./qwen_omni/train_tts.py \
$train_cmd_args
fi
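# Optional post-training sanity check (sketch; the paths are illustrative
# placeholders): confirm that a saved checkpoint loads as a plain state dict
# before pointing --pretrained-model-path at it.
#   python3 -c "import torch; sd = torch.load('<exp_dir>/checkpoint-<step>/pytorch_model.bin', map_location='cpu'); print(len(sd), 'tensors')"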
# if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then
# log "stage 20: Training TTS Model"
# echo "cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -"
# if [ ! -L "/workspace/slam" ]; then
# cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -
# fi
# exp_dir=./qwen_omni/exp_test
# ngpu=4
# latest_checkpoint_step=-1
# # Check if exp_dir exists and is a directory
# if [ -d "$exp_dir" ]; then
# # List directories matching checkpoint-* and find the one with the largest step number
# for checkpoint_dir in $(ls -d $exp_dir/checkpoint-*/ 2>/dev/null | sort -V); do
# checkpoint_name=$(basename "$checkpoint_dir") # e.g., checkpoint-1000
# # Extract step number using parameter expansion
# current_step=${checkpoint_name#checkpoint-}
# # Ensure current_step is a number
# if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
# latest_checkpoint_step=$current_step
# fi
# done
# fi
# train_cmd_args="--max-duration 150 \
# --enable-musan False \
# --exp-dir $exp_dir \
# --speech-encoder-path-or-name models/large-v2.pt \
# --llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct \
# --dataset vocalnet_ultrachat_voiceassistant \
# --manifest-dir data/fbank \
# --deepspeed \
# --deepspeed_config ./qwen_omni/ds_config_zero1.json \
# --use-flash-attn True --on-the-fly-feats True \
# --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output True"
# if [ "$latest_checkpoint_step" -ge 0 ]; then
# log "Continuing training from checkpoint-$latest_checkpoint_step"
# step=$latest_checkpoint_step
# train_cmd_args="$train_cmd_args --pretrained-model-path $exp_dir/checkpoint-${step}/pytorch_model.bin --sampler-state-dict-path $exp_dir/checkpoint-${step}/sampler.pt"
# else
# log "Starting training from scratch as no checkpoint was found in $exp_dir"
# # No pretrained model or sampler state dict needed for the first run
# fi
# torchrun --nproc_per_node $ngpu --nnodes $SLURM_JOB_NUM_NODES --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d --rdzv_id $SLURM_JOBID ./qwen_omni/train.py \
# $train_cmd_args
# fi
# if [ $stage -le 21 ] && [ $stop_stage -ge 21 ]; then
# log "stage 21: TTS Decoding Test Set"
# exp_dir=./qwen_omni/exp_tts
# torchrun --nproc_per_node=2 ./qwen_omni/decode_tts.py \
# --exp-dir $exp_dir \
# --speech-encoder-path-or-name models/large-v2.pt \
# --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
# --pretrained-model-path $exp_dir/checkpoint-32001/pytorch_model.bin \
# --use-flash-attn True \
# --enable-speech-output True \
# --token2wav-path /workspace/CosyVoice2-0.5B \
# --use-lora True
# fi