# icefall/egs/speech_llm/SPEECH2SPEECH/exp.sh

#!/usr/bin/env bash

# Force the pure-Python protobuf implementation; this works around the
# segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python


# Abort on command failure (-e), on use of an unset variable (-u) and when any
# command of a pipeline fails (-o pipefail).
set -euo pipefail

# Usage: <this script> <stage> <stop_stage>
# Every stage N with stage <= N <= stop_stage is executed. Both arguments are
# required; the ${N:?...} form stops the script with a readable usage message
# instead of the bare "unbound variable" error that `set -u` alone would give.
stage=${1:?"usage: $0 <stage> <stop_stage>"}
stop_stage=${2:?"usage: $0 <stage> <stop_stage>"}


log() {
  # Timestamped logger (adapted from espnet). Writes one line to stdout:
  #   "<YYYY-mm-dd HH:MM:SS> (<caller file>:<caller line>:<caller func>) <msg>"
  # where <msg> is all arguments expanded via $*. `echo -e` is used, so
  # backslash escapes contained in the message are interpreted.
  local caller_path=${BASH_SOURCE[1]}
  # Basename of the calling file, line of the call, and the calling function.
  local call_site="${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}"
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${call_site}) $*"
}

# Stage 17: train the speech-to-speech model with ./qwen_omni/train.py.
# Output goes to $exp_dir; if that directory already holds checkpoint-<step>
# sub-directories, training resumes from the one with the largest step.
# Reads: stage, stop_stage, SLURM_JOB_NUM_NODES, MASTER_ADDR, MASTER_PORT,
#        SLURM_JOBID (the script runs under `set -u`, so all must be set).
if [ "$stage" -le 17 ] && [ "$stop_stage" -ge 17 ]; then
  echo "cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -"
  if [ ! -L "/workspace/slam" ]; then
    # Create the link by absolute path so the working directory never changes:
    # a failing `ln` inside a `cd ... && ln ... && cd -` chain would strand the
    # shell in /workspace and break every relative path used below.
    # Kept best-effort: a failure only produces a warning.
    ln -s /lustre/fsw/general_sa/yuekaiz/s2s /workspace/slam ||
      echo "warning: could not create symlink /workspace/slam" >&2
  fi
  log "stage 17: Training Speech2Speech Model, full parameters"
  exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_three_s2s
  pretrained_dir=./qwen_omni/exp_speech2text
  ngpu=4

  # Find the largest <step> among $exp_dir/checkpoint-<step>/ directories.
  # -1 means "no checkpoint found".
  latest_checkpoint_step=-1
  for checkpoint_dir in "$exp_dir"/checkpoint-*/; do
    # An unmatched glob stays literal (also when $exp_dir is missing); skip it.
    [ -d "$checkpoint_dir" ] || continue
    checkpoint_name=${checkpoint_dir%/}
    checkpoint_name=${checkpoint_name##*/} # e.g., checkpoint-1000
    current_step=${checkpoint_name#checkpoint-}
    # Only purely numeric suffixes count as step numbers.
    if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
      latest_checkpoint_step=$current_step
    fi
  done

  # Arguments are kept in an array so each one reaches train.py as exactly one
  # word, without relying on unquoted word splitting.
  # NOTE(review): --on-the-fly-feats is passed twice with the same value;
  # presumably redundant - confirm before removing one.
  train_cmd_args=(
    --max-duration 200
    --enable-musan False
    --exp-dir "$exp_dir"
    --last-stage-model-path "$pretrained_dir/checkpoint-58548/pytorch_model.bin"
    --speech-encoder-path-or-name models/large-v2.pt
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct
    --on-the-fly-feats True --on-the-fly-speed-perturb False
    --deepspeed
    --huggingface-dataset-path-or-name /lustre/fsw/general_sa/yuekaiz/s2s
    --deepspeed_config ./qwen_omni/ds_config_zero1.json
    --use-flash-attn True --on-the-fly-feats True
    --dataset vocalnet_ultrachat_voiceassistant_instruct_s2s --num-epochs 10
    --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output False
  )

  if [ "$latest_checkpoint_step" -ge 0 ]; then
    log "Continuing training from checkpoint-$latest_checkpoint_step"
    step=$latest_checkpoint_step
    # Resume both the model weights and the data sampler position.
    train_cmd_args+=(
      --pretrained-model-path "$exp_dir/checkpoint-${step}/pytorch_model.bin"
      --sampler-state-dict-path "$exp_dir/checkpoint-${step}/sampler.pt"
    )
  else
    log "Starting training from scratch as no checkpoint was found in $exp_dir"
    # No pretrained model or sampler state dict needed for the first run
  fi

  torchrun --nproc_per_node "$ngpu" --nnodes "$SLURM_JOB_NUM_NODES" \
    --rdzv_endpoint "$MASTER_ADDR:$MASTER_PORT" --rdzv_backend c10d \
    --rdzv_id "$SLURM_JOBID" ./qwen_omni/train.py \
    "${train_cmd_args[@]}"
fi

# Stage 18: train the speech-to-speech model with ./qwen_omni/train.py on the
# "..._s2s_librispeech" dataset variant. Output goes to $exp_dir; if that
# directory already holds checkpoint-<step> sub-directories, training resumes
# from the one with the largest step.
# Reads: stage, stop_stage, SLURM_JOB_NUM_NODES, MASTER_ADDR, MASTER_PORT,
#        SLURM_JOBID (the script runs under `set -u`, so all must be set).
if [ "$stage" -le 18 ] && [ "$stop_stage" -ge 18 ]; then
  echo "cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -"
  # check if the link exists, if not exist, create it
  if [ ! -L "/workspace/slam" ]; then
    # Create the link by absolute path so the working directory never changes:
    # a failing `ln` inside a `cd ... && ln ... && cd -` chain would strand the
    # shell in /workspace and break every relative path used below.
    # Kept best-effort: a failure only produces a warning.
    ln -s /lustre/fsw/general_sa/yuekaiz/s2s /workspace/slam ||
      echo "warning: could not create symlink /workspace/slam" >&2
  fi
  # The message names this stage's own number so logs can be told apart from
  # the preceding stage's.
  log "stage 18: Training Speech2Speech Model, full parameters"
  exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_three_s2s_librispeech
  pretrained_dir=./qwen_omni/exp_speech2text
  ngpu=4

  # Find the largest <step> among $exp_dir/checkpoint-<step>/ directories.
  # -1 means "no checkpoint found".
  latest_checkpoint_step=-1
  for checkpoint_dir in "$exp_dir"/checkpoint-*/; do
    # An unmatched glob stays literal (also when $exp_dir is missing); skip it.
    [ -d "$checkpoint_dir" ] || continue
    checkpoint_name=${checkpoint_dir%/}
    checkpoint_name=${checkpoint_name##*/} # e.g., checkpoint-1000
    current_step=${checkpoint_name#checkpoint-}
    # Only purely numeric suffixes count as step numbers.
    if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
      latest_checkpoint_step=$current_step
    fi
  done

  # Arguments are kept in an array so each one reaches train.py as exactly one
  # word, without relying on unquoted word splitting.
  # NOTE(review): --on-the-fly-feats is passed twice with the same value;
  # presumably redundant - confirm before removing one.
  train_cmd_args=(
    --max-duration 200
    --enable-musan False
    --exp-dir "$exp_dir"
    --last-stage-model-path "$pretrained_dir/checkpoint-58548/pytorch_model.bin"
    --speech-encoder-path-or-name models/large-v2.pt
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct
    --on-the-fly-feats True --on-the-fly-speed-perturb False
    --deepspeed
    --huggingface-dataset-path-or-name /lustre/fsw/general_sa/yuekaiz/s2s
    --deepspeed_config ./qwen_omni/ds_config_zero1.json
    --use-flash-attn True --on-the-fly-feats True
    --dataset vocalnet_ultrachat_voiceassistant_instruct_s2s_librispeech --num-epochs 10
    --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output False
  )

  if [ "$latest_checkpoint_step" -ge 0 ]; then
    log "Continuing training from checkpoint-$latest_checkpoint_step"
    step=$latest_checkpoint_step
    # Resume both the model weights and the data sampler position.
    train_cmd_args+=(
      --pretrained-model-path "$exp_dir/checkpoint-${step}/pytorch_model.bin"
      --sampler-state-dict-path "$exp_dir/checkpoint-${step}/sampler.pt"
    )
  else
    log "Starting training from scratch as no checkpoint was found in $exp_dir"
    # No pretrained model or sampler state dict needed for the first run
  fi

  torchrun --nproc_per_node "$ngpu" --nnodes "$SLURM_JOB_NUM_NODES" \
    --rdzv_endpoint "$MASTER_ADDR:$MASTER_PORT" --rdzv_backend c10d \
    --rdzv_id "$SLURM_JOBID" ./qwen_omni/train.py \
    "${train_cmd_args[@]}"
fi

# Exported unconditionally, so it is in effect for every command run from this
# point of the script onwards, regardless of which stages are selected.
export HF_HOME="/lustre/fsw/general_sa/yuekaiz/.cache/huggingface"

# Stage 19: train the TTS model with ./qwen_omni/train_tts.py. Output goes to
# $exp_dir; if that directory already holds checkpoint-<step> sub-directories,
# training resumes from the one with the largest step.
# Reads: stage, stop_stage, SLURM_JOB_NUM_NODES, MASTER_ADDR, MASTER_PORT,
#        SLURM_JOBID (the script runs under `set -u`, so all must be set).
if [ "$stage" -le 19 ] && [ "$stop_stage" -ge 19 ]; then
  log "stage 19: Training TTS Model"
  exp_dir=./qwen_omni/exp_tts
  pretrained_dir=./qwen_omni/exp_speech2text
  ngpu=4

  # Find the largest <step> among $exp_dir/checkpoint-<step>/ directories.
  # -1 means "no checkpoint found".
  latest_checkpoint_step=-1
  for checkpoint_dir in "$exp_dir"/checkpoint-*/; do
    # An unmatched glob stays literal (also when $exp_dir is missing); skip it.
    [ -d "$checkpoint_dir" ] || continue
    checkpoint_name=${checkpoint_dir%/}
    checkpoint_name=${checkpoint_name##*/} # e.g., checkpoint-1000
    current_step=${checkpoint_name#checkpoint-}
    # Only purely numeric suffixes count as step numbers.
    if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
      latest_checkpoint_step=$current_step
    fi
  done

  # Arguments are kept in an array so each one reaches train_tts.py as exactly
  # one word, without relying on unquoted word splitting.
  train_cmd_args=(
    --batch-size 64
    --exp-dir "$exp_dir"
    --last-stage-model-path "$pretrained_dir/checkpoint-58548/pytorch_model.bin"
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct
    --enable-speech-input False
    --deepspeed
    --dataset /lustre/fsw/general_sa/yuekaiz/s2s/emilia_en
    --deepspeed_config ./qwen_omni/ds_config_zero1.json
    --use-flash-attn True
    --num-epochs 2
    --use-lora False --unfreeze-llm False --enable-speech-output True
  )

  if [ "$latest_checkpoint_step" -ge 0 ]; then
    log "Continuing training from checkpoint-$latest_checkpoint_step"
    step=$latest_checkpoint_step
    # Resume both the model weights and the data sampler position.
    train_cmd_args+=(
      --pretrained-model-path "$exp_dir/checkpoint-${step}/pytorch_model.bin"
      --sampler-state-dict-path "$exp_dir/checkpoint-${step}/sampler.pt"
    )
  else
    log "Starting training from scratch as no checkpoint was found in $exp_dir"
    # No pretrained model or sampler state dict needed for the first run
  fi

  torchrun --nproc_per_node "$ngpu" --nnodes "$SLURM_JOB_NUM_NODES" \
    --rdzv_endpoint "$MASTER_ADDR:$MASTER_PORT" --rdzv_backend c10d \
    --rdzv_id "$SLURM_JOBID" ./qwen_omni/train_tts.py \
    "${train_cmd_args[@]}"
fi


# Stage 20: run ./qwen_omni/train.py into ./qwen_omni/exp_test with speech
# output enabled. If $exp_dir already holds checkpoint-<step> sub-directories,
# training resumes from the one with the largest step.
# Reads: stage, stop_stage, SLURM_JOB_NUM_NODES, MASTER_ADDR, MASTER_PORT,
#        SLURM_JOBID (the script runs under `set -u`, so all must be set).
if [ "$stage" -le 20 ] && [ "$stop_stage" -ge 20 ]; then
  # NOTE(review): the label says "TTS Model" but this stage launches train.py,
  # not train_tts.py - looks copied from stage 19; confirm the intended text.
  log "stage 20: Training TTS Model"
  echo "cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -"
  if [ ! -L "/workspace/slam" ]; then
    # Create the link by absolute path so the working directory never changes:
    # a failing `ln` inside a `cd ... && ln ... && cd -` chain would strand the
    # shell in /workspace and break every relative path used below.
    # Kept best-effort: a failure only produces a warning.
    ln -s /lustre/fsw/general_sa/yuekaiz/s2s /workspace/slam ||
      echo "warning: could not create symlink /workspace/slam" >&2
  fi
  exp_dir=./qwen_omni/exp_test
  ngpu=4

  # Find the largest <step> among $exp_dir/checkpoint-<step>/ directories.
  # -1 means "no checkpoint found".
  latest_checkpoint_step=-1
  for checkpoint_dir in "$exp_dir"/checkpoint-*/; do
    # An unmatched glob stays literal (also when $exp_dir is missing); skip it.
    [ -d "$checkpoint_dir" ] || continue
    checkpoint_name=${checkpoint_dir%/}
    checkpoint_name=${checkpoint_name##*/} # e.g., checkpoint-1000
    current_step=${checkpoint_name#checkpoint-}
    # Only purely numeric suffixes count as step numbers.
    if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
      latest_checkpoint_step=$current_step
    fi
  done

  # Arguments are kept in an array so each one reaches train.py as exactly one
  # word, without relying on unquoted word splitting.
  train_cmd_args=(
    --max-duration 150
    --enable-musan False
    --exp-dir "$exp_dir"
    --speech-encoder-path-or-name models/large-v2.pt
    --llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct
    --dataset vocalnet_ultrachat_voiceassistant
    --manifest-dir data/fbank
    --deepspeed
    --deepspeed_config ./qwen_omni/ds_config_zero1.json
    --use-flash-attn True --on-the-fly-feats True
    --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output True
  )

  if [ "$latest_checkpoint_step" -ge 0 ]; then
    log "Continuing training from checkpoint-$latest_checkpoint_step"
    step=$latest_checkpoint_step
    # Resume both the model weights and the data sampler position.
    train_cmd_args+=(
      --pretrained-model-path "$exp_dir/checkpoint-${step}/pytorch_model.bin"
      --sampler-state-dict-path "$exp_dir/checkpoint-${step}/sampler.pt"
    )
  else
    log "Starting training from scratch as no checkpoint was found in $exp_dir"
    # No pretrained model or sampler state dict needed for the first run
  fi

  torchrun --nproc_per_node "$ngpu" --nnodes "$SLURM_JOB_NUM_NODES" \
    --rdzv_endpoint "$MASTER_ADDR:$MASTER_PORT" --rdzv_backend c10d \
    --rdzv_id "$SLURM_JOBID" ./qwen_omni/train.py \
    "${train_cmd_args[@]}"
fi

# Stage 21: run ./qwen_omni/decode_tts.py on 4 processes, loading
# checkpoint-32001 from ./qwen_omni/exp_tts.
# Reads: stage, stop_stage.
if [ "$stage" -le 21 ] && [ "$stop_stage" -ge 21 ]; then
  log "stage 21: TTS Decoding Test Set"
  exp_dir=./qwen_omni/exp_tts
  # torchrun expects the script path as its first positional argument and
  # launches it with the Python interpreter itself; a `python3` word in front
  # of the script would be taken as the script to run.
  # NOTE(review): stage 19 trains exp_tts with --use-lora False while this
  # stage passes --use-lora True - confirm which one is intended.
  torchrun --nproc_per_node=4 ./qwen_omni/decode_tts.py \
    --exp-dir "$exp_dir" \
    --speech-encoder-path-or-name models/large-v2.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --pretrained-model-path "$exp_dir/checkpoint-32001/pytorch_model.bin" \
    --use-flash-attn True \
    --enable-speech-output True \
    --token2wav-path /lustre/fsw/general_sa/yuekaiz/s2s/CosyVoice2-0.5B \
    --use-lora True
fi