#!/usr/bin/env bash

# Fix the segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
export PYTHONPATH=$PYTHONPATH:/workspace/icefall

set -eou pipefail

stage=$1
stop_stage=$2

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Clone the CosyVoice repo and install requirements inside the container"
  # docker: ghcr.io/swivid/f5-tts:main
  pip install k2==1.24.4.dev20241030+cuda12.4.torch2.4.0 -f https://k2-fsa.github.io/k2/cuda.html
  git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git /workspace/CosyVoice
  cd /workspace/CosyVoice
  # If the submodule clone fails due to network errors, rerun the following command until it succeeds.
  git submodule update --init --recursive
  pip install -r qwen_omni/requirements.txt
  pip install -r qwen_omni/requirements-cosyvoice.txt

  # For a Chinese-only dataset, download the Chinese fine-tuned Whisper model.
  huggingface-cli download --local-dir models/whisper yuekai/icefall_asr_multi-hans-zh_whisper
  # CosyVoice pretrained model for the speech token2wav module
  huggingface-cli download --local-dir models/CosyVoice-300M-SFT FunAudioLLM/CosyVoice-300M-SFT
  # Qwen pretrained model
  huggingface-cli download --local-dir models/Qwen2.5-0.5B-Instruct Qwen/Qwen2.5-0.5B-Instruct
  # Qwen-Omni-style speech2speech model trained on worstchan/Belle_1.4M-SLAM-Omni
  huggingface-cli download --local-dir models/qwen-omni-like-speech2speech-belle-1.4M yuekai/qwen-omni-like-speech2speech-belle-1.4M

  # For the Gradio demo, we follow https://arxiv.org/abs/2412.15649 and use an ASR model
  # to decode the history speech as context.
  pip install sherpa-onnx
  model_path=local/sherpa-onnx-paraformer-zh-2023-09-14
  if [ ! -d $model_path ]; then
    wget -nc https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
    tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 -C local
  fi
fi

export PYTHONPATH=$PYTHONPATH:/workspace/CosyVoice

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Compute fbank features from the Hugging Face dataset"
  python3 local/compute_whisper_fbank.py \
    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
    --out-dir data/fbank_test \
    --huggingface-dataset-path-or-name /workspace/Belle_1.4M-SLAM-Omni \
    --audio-key question_audio --text-key answer \
    --prefix belle
fi
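# Optional sanity check on the cut manifests produced above (a sketch; adjust the
# directory to match the --out-dir you used, and note stage 2 expects the
# cuts_belle.*.jsonl.gz shards under data/fbank):
#   ls data/fbank_test/cuts_belle.*.jsonl.gz | wc -l        # number of manifest shards
#   zcat data/fbank_test/cuts_belle.*.jsonl.gz | head -n 1  # peek at one cut record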
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Combine features"
  manifest_dir=data/fbank
  if [ ! -f $manifest_dir/cuts_belle_00001-01600.jsonl.gz ]; then
    # Exclude cuts_belle.00000.jsonl.gz, which is reserved for the valid and test sets.
    mv $manifest_dir/cuts_belle.00000.jsonl.gz ./
    pieces=$(find $manifest_dir -name "cuts_belle.*.jsonl.gz" | sort)
    echo $pieces | wc
    lhotse combine $pieces data/fbank/cuts_belle_00001-01600.jsonl.gz
    mv ./cuts_belle.00000.jsonl.gz $manifest_dir # put it back
    cd $manifest_dir && ln -s cuts_belle_00001-01600.jsonl.gz cuts_belle_train.jsonl.gz
    ln -s cuts_belle.00000.jsonl.gz cuts_belle_test.jsonl.gz && cd -
  fi
fi
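# The mv/mv-back above is only there to keep shard 00000 out of the combined training
# manifest. An equivalent sketch using a find exclusion (same file naming, no files moved):
#   pieces=$(find data/fbank -name "cuts_belle.*.jsonl.gz" ! -name "cuts_belle.00000.jsonl.gz" | sort)
#   lhotse combine $pieces data/fbank/cuts_belle_00001-01600.jsonl.gz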
ngpu=8
exp_dir=./qwen_omni/exp_speech2speech

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Training the Speech2Speech model"
  torchrun --nproc_per_node $ngpu ./qwen_omni/train.py \
    --max-duration 50 \
    --enable-musan False \
    --exp-dir $exp_dir \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct \
    --manifest-dir data/fbank \
    --deepspeed \
    --deepspeed_config ./qwen_omni/ds_config_zero1.json \
    --use-flash-attn True \
    --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output True
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Decoding; only batch_size=1 is supported for now."
  cd $exp_dir && ln -s ../../models/qwen-omni-like-speech2speech-belle-1.4M/pytorch_model.bin epoch-999.pt && cd -
  python3 ./qwen_omni/decode.py \
    --max-duration 1 \
    --exp-dir $exp_dir \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --epoch 999 --avg 1 \
    --manifest-dir data/fbank \
    --use-flash-attn True \
    --method e2e-epoch10_speech2speech \
    --enable-speech-output True \
    --token2wav-path models/CosyVoice-300M-SFT \
    --use-lora True
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Gradio demo"
  python3 ./qwen_omni/web_demo.py \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --checkpoint-path $exp_dir/epoch-999.pt \
    --use-flash-attn True \
    --enable-speech-output True \
    --asr-model-dir local/sherpa-onnx-paraformer-zh-2023-09-14 \
    --use-lora True --token2wav-path /workspace/CosyVoice-300M-SFT --share
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Compute fbank features for VoiceAssistant"
  # CUDA_VISIBLE_DEVICES=0 python3 local/compute_whisper_fbank.py \
  #   --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
  #   --out-dir data/fbank_voice_assistant \
  #   --huggingface-dataset-path-or-name worstchan/VoiceAssistant-400K-SLAM-Omni \
  #   --audio-key question_audio --text-key answer \
  #   --prefix voice_assistant
  CUDA_VISIBLE_DEVICES=0 python3 local/compute_whisper_fbank.py \
    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
    --out-dir data/fbank_voice_assistant_cosy2 \
    --json-file-path /workspace/slam/VoiceAssistant-430K-vocalnet/VoiceAssistant-430K.json \
    --prefix voice_assistant
fi

if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "Stage 7: Compute fbank features for UltraChat"
  # CUDA_VISIBLE_DEVICES=1 python3 local/compute_whisper_fbank.py \
  #   --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
  #   --out-dir data/fbank_ultrachat \
  #   --huggingface-dataset-path-or-name worstchan/UltraChat-300K-SLAM-Omni \
  #   --audio-key question_audio --text-key answer \
  #   --prefix ultrachat
  CUDA_VISIBLE_DEVICES=1 python3 local/compute_whisper_fbank.py \
    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
    --out-dir data/fbank_ultrachat_cosy2 \
    --json-file-path /workspace/slam/UltraChat-vocalnet/UltraChat.json \
    --prefix ultrachat
fi
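# Note: speechcolab/gigaspeech (used in stage 8 below) is a gated dataset on the
# Hugging Face Hub; you may need to accept its terms and authenticate before the
# download works, e.g.:
#   huggingface-cli login   # or export HF_TOKEN=<your token>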
if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
  log "Stage 8: Compute fbank features for GigaSpeech"
  CUDA_VISIBLE_DEVICES=1 python3 local/compute_whisper_fbank.py \
    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
    --out-dir data/fbank_gigaspeech \
    --huggingface-dataset-path-or-name speechcolab/gigaspeech \
    --subset test --split test \
    --audio-key audio --text-key text \
    --prefix gigaspeech
  CUDA_VISIBLE_DEVICES=0 python3 local/compute_whisper_fbank.py \
    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb True \
    --out-dir data/fbank_gigaspeech \
    --huggingface-dataset-path-or-name speechcolab/gigaspeech \
    --subset xl --split train \
    --audio-key audio --text-key text \
    --prefix gigaspeech
fi

# cd /workspace && ln -s /lustre/fsw/general_sa/yuekaiz/s2s slam && cd -
ngpu=4
exp_dir=./qwen_omni/exp_speech2speech_en

if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
  log "Stage 10: Training the EN Speech2Speech model"
  torchrun --nproc_per_node $ngpu ./qwen_omni/train.py \
    --max-duration 150 \
    --enable-musan False \
    --exp-dir $exp_dir \
    --speech-encoder-path-or-name models/large-v2.pt \
    --llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct \
    --dataset-format vocalnet \
    --manifest-dir data/fbank \
    --deepspeed \
    --deepspeed_config ./qwen_omni/ds_config_zero1.json \
    --use-flash-attn True --on-the-fly-feats True \
    --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output True
fi

if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
  log "Stage 11: Decoding the EN val set; only batch_size=1 is supported for now."
  exp_dir=./qwen_omni/exp_speech2speech_en_continue
  # cd $exp_dir && ln -s ../../models/qwen-omni-like-speech2speech-belle-1.4M/pytorch_model.bin epoch-999.pt && cd -
  python3 ./qwen_omni/decode.py \
    --max-duration 1 \
    --exp-dir $exp_dir \
    --speech-encoder-path-or-name models/large-v2.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --epoch 997 --avg 1 \
    --manifest-dir data/fbank \
    --use-flash-attn True \
    --method e2e-epoch4_speech2speech \
    --enable-speech-output True \
    --token2wav-path /workspace/CosyVoice2-0.5B \
    --use-lora True
fi

if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
  log "Stage 12: Decoding EN VoiceBench"
  exp_dir=./qwen_omni/exp_speech2speech_en_continue
  torchrun --nproc_per_node=2 \
    ./qwen_omni/decode_dist.py \
    --output-dir $exp_dir/log_voicebench \
    --speech-encoder-path-or-name models/large-v2.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --use-flash-attn True \
    --enable-speech-output True \
    --checkpoint-path $exp_dir/epoch-10-checkpoint-40000.pt/pytorch_model.bin \
    --use-lora True --subset-name openbookqa --split-name test
fi

if [ $stage -le 13 ] && [ $stop_stage -ge 13 ]; then
  log "Stage 13: Server"
  exp_dir=./qwen_omni/exp_speech2speech_en_continue
  python3 ./qwen_omni/server.py \
    --speech-encoder-path-or-name models/large-v2.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --checkpoint-path $exp_dir/epoch-10-checkpoint-40000.pt/pytorch_model.bin \
    --use-flash-attn True \
    --enable-speech-output True \
    --use-lora True
fi
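# A single-dataset client call against the stage 13 server (a sketch; it reuses the
# client.py flags from stage 14 below, including the assumed --port flag, and assumes
# the server is reachable on port 8000, matching BASE_PORT in stage 14):
#   python3 ./qwen_omni/client.py \
#     --subset-name openbookqa --split-name test \
#     --output-dir ./qwen_omni/exp_speech2speech_en_continue/results \
#     --port 8000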
if [ $stage -le 14 ] && [ $stop_stage -ge 14 ]; then
  log "Stage 14: Client"
  exp_dir=./qwen_omni/exp_speech2text_first_libri_continuation_second_ce
  exp_dir=./qwen_omni/exp_speech2text_first_asr_second_ce
  exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_qa
  exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_three_s2s_librispeech
  # exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_three_s2s

  # Candidate dataset lists; only the final target_datasets assignment takes effect.
  # The full set is: alpacaeval_full wildvoice mmsu advbench bbh ifeval commoneval openbookqa sd-qa
  declare -a target_datasets=("alpacaeval_full" "wildvoice" "ifeval" "commoneval" "openbookqa" "sd-qa" "advbench" "bbh" "mmsu")
  declare -a target_datasets=("alpacaeval_full" "wildvoice" "ifeval" "commoneval" "openbookqa" "sd-qa" "advbench" "bbh")
  declare -a target_datasets=("mmsu")

  NUM_CLIENT_JOBS=4 # Number of parallel client jobs
  BASE_PORT=8000    # Base port for servers

  log "Starting $NUM_CLIENT_JOBS parallel client jobs to process ${#target_datasets[@]} datasets."

  for job_id in $(seq 0 $(($NUM_CLIENT_JOBS - 1)))
  do
    ( # Start a subshell for backgrounding this client job's tasks
      current_port=$(expr $BASE_PORT + $job_id)
      log "Client Job $job_id: Initializing. Will connect to port $current_port."
      processed_count_for_this_job=0

      # Iterate over all datasets using their indices
      for i in "${!target_datasets[@]}"; do
        # Assign datasets to job_ids in a round-robin fashion
        if [ $(($i % $NUM_CLIENT_JOBS)) -eq $job_id ]; then
          dataset="${target_datasets[$i]}"
          # Determine split_name based on the dataset
          if [ "$dataset" == "sd-qa" ]; then
            split_name="usa"
          else
            split_name="test"
          fi

          log "Client Job $job_id (Port $current_port): Processing dataset '$dataset' (split '$split_name')"
          # Use "if ! command" rather than checking $? afterwards so the error
          # branch still runs under "set -e".
          if ! python3 ./qwen_omni/client.py \
            --subset-name "$dataset" \
            --split-name "$split_name" \
            --output-dir "$exp_dir/results" \
            --port "$current_port"; then # Assuming client.py accepts --port
            log "Client Job $job_id (Port $current_port): ERROR processing dataset '$dataset'."
          fi
          processed_count_for_this_job=$(($processed_count_for_this_job + 1))
        fi
      done
      log "Client Job $job_id (Port $current_port): Finished. Processed $processed_count_for_this_job datasets."
    ) & # Run this client job's subshell in the background
  done

  log "All client jobs launched. Waiting for completion..."
  wait # Wait for all backgrounded client jobs to complete
  log "All client jobs have completed."
fi

if [ $stage -le 15 ] && [ $stop_stage -ge 15 ]; then
  log "Stage 15: Training the Speech2Speech model, adapter only"
  exp_dir=./qwen_omni/exp_speech2text
  ngpu=2
  torchrun --nproc_per_node $ngpu ./qwen_omni/train.py \
    --max-duration 700 \
    --enable-musan False \
    --audio-key audio --text-key continuation \
    --exp-dir $exp_dir \
    --speech-encoder-path-or-name models/large-v2.pt \
    --llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct \
    --on-the-fly-feats True \
    --deepspeed \
    --deepspeed_config ./qwen_omni/ds_config_zero1.json \
    --use-flash-attn True \
    --dataset-format speech_continuation \
    --start-epoch 4 --pretrained-model-path $exp_dir/epoch-3/pytorch_model.bin \
    --use-lora False --unfreeze-llm False --unfreeze-speech-projector True --enable-speech-output False
fi

if [ $stage -le 16 ] && [ $stop_stage -ge 16 ]; then
  log "Stage 16: Training the Speech2Speech model, adapter only"
  exp_dir=./qwen_omni/exp_speech2text
  ngpu=4

  latest_checkpoint_step=-1
  # Check whether exp_dir exists and is a directory
  if [ -d "$exp_dir" ]; then
    # List directories matching checkpoint-* and find the one with the largest step number
    for checkpoint_dir in $(ls -d $exp_dir/checkpoint-*/ 2>/dev/null | sort -V); do
      checkpoint_name=$(basename "$checkpoint_dir") # e.g., checkpoint-1000
      # Extract the step number using parameter expansion
      current_step=${checkpoint_name#checkpoint-}
      # Ensure current_step is a number
      if [[ "$current_step" =~ ^[0-9]+$ ]] && [ "$current_step" -gt "$latest_checkpoint_step" ]; then
        latest_checkpoint_step=$current_step
      fi
    done
  fi

  train_cmd_args="--max-duration 800 \
    --enable-musan False \
    --audio-key audio --text-key continuation \
    --exp-dir $exp_dir \
    --speech-encoder-path-or-name models/large-v2.pt \
    --llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct \
    --on-the-fly-feats True \
    --deepspeed \
    --huggingface-dataset-path-or-name /lustre/fsw/general_sa/yuekaiz/s2s \
    --deepspeed_config ./qwen_omni/ds_config_zero1.json \
    --use-flash-attn True \
    --dataset-format speech_continuation \
    --use-lora False --unfreeze-llm False --unfreeze-speech-projector True --enable-speech-output False"

  if [ "$latest_checkpoint_step" -ge 0 ]; then
    log "Continuing training from checkpoint-$latest_checkpoint_step"
    step=$latest_checkpoint_step
    train_cmd_args="$train_cmd_args --pretrained-model-path $exp_dir/checkpoint-${step}/pytorch_model.bin --sampler-state-dict-path $exp_dir/checkpoint-${step}/sampler.pt"
  else
    log "Starting training from scratch as no checkpoint was found in $exp_dir"
    # No pretrained model or sampler state dict is needed for the first run
  fi

  torchrun --nproc_per_node $ngpu --nnodes $SLURM_JOB_NUM_NODES --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT --rdzv_backend c10d --rdzv_id $SLURM_JOBID ./qwen_omni/train.py \
    $train_cmd_args
fi
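# Stage 16 above assumes a SLURM launch and reads SLURM_JOB_NUM_NODES, SLURM_JOBID,
# MASTER_ADDR, and MASTER_PORT. For a single-node run outside SLURM you could export
# them yourself before invoking that stage (a sketch, not part of the recipe):
#   export SLURM_JOB_NUM_NODES=1 SLURM_JOBID=0
#   export MASTER_ADDR=localhost MASTER_PORT=12355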
if [ $stage -le 17 ] && [ $stop_stage -ge 17 ]; then
  # pip install gradio sherpa-onnx
  log "Stage 17: Server for adapter-only speech continuation"
  exp_dir=./qwen_omni/exp_speech2text_first_libri_continuation_second_ce
  exp_dir=./qwen_omni/exp_speech2text_first_asr_second_ce
  exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_qa
  exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_three_s2s_librispeech
  exp_dir=./qwen_omni/exp_speech2text_first_multi_en_continuation_second_three_s2s

  N_GPUS=4 # Number of GPUs/processes to launch
  for id in $(seq 0 $(($N_GPUS - 1)))
  do
    log "Launching server on GPU $id with port $(expr 18000 + $id)"
    CUDA_VISIBLE_DEVICES=$id python3 ./qwen_omni/server.py \
      --speech-encoder-path-or-name models/large-v2.pt \
      --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
      --checkpoint-path $exp_dir/checkpoint-55276/pytorch_model.bin \
      --use-flash-attn True \
      --enable-speech-output False \
      --port $(expr 18000 + $id) \
      --use-lora True &
  done
  wait # Wait for all background server processes to complete
fi

if [ $stage -le 18 ] && [ $stop_stage -ge 18 ]; then
  log "Stage 18: Training the KL-divergence Speech2Speech model, adapter only"
  exp_dir=./qwen_omni/exp_speech2text_kl
  ngpu=2
  torchrun --nproc_per_node $ngpu ./qwen_omni/train.py \
    --max-duration 700 \
    --enable-musan False \
    --audio-key audio --text-key continuation \
    --exp-dir $exp_dir \
    --speech-encoder-path-or-name models/large-v2.pt \
    --llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct \
    --on-the-fly-feats True \
    --deepspeed \
    --deepspeed_config ./qwen_omni/ds_config_zero1.json \
    --use-flash-attn True \
    --dataset-format speech_continuation \
    --loss-type kl_div --dataset librispeech \
    --pretrained-model-path $exp_dir/checkpoint-1001/pytorch_model.bin --sampler-state-dict-path $exp_dir/checkpoint-1001/sampler.pt \
    --use-lora False --unfreeze-llm False --unfreeze-speech-projector True --enable-speech-output False
fi

if [ $stage -le 19 ] && [ $stop_stage -ge 19 ]; then
  log "Stage 19: Server for the KL-loss model"
  exp_dir=./qwen_omni/exp_speech2text_kl
  python3 ./qwen_omni/server.py \
    --speech-encoder-path-or-name models/large-v2.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --checkpoint-path $exp_dir/epoch-10/pytorch_model.bin \
    --use-flash-attn True \
    --enable-speech-output False \
    --use-lora False --prompt-template qa
fi

if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then
  log "Stage 20: Training the Speech2Speech model, adapter + LoRA, second stage"
  exp_dir=./qwen_omni/exp_speech2text_kl_llm
  pretrained_dir=./qwen_omni/exp_speech2text_kl
  ngpu=2
  torchrun --nproc_per_node $ngpu ./qwen_omni/train.py \
    --max-duration 200 \
    --enable-musan False \
    --exp-dir $exp_dir \
    --speech-encoder-path-or-name models/large-v2.pt \
    --llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct \
    --deepspeed \
    --deepspeed_config ./qwen_omni/ds_config_zero1.json \
    --use-flash-attn True \
    --pretrained-model-path $pretrained_dir/epoch-10/pytorch_model.bin \
    --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output False --dataset-format vocalnet
fi
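# To try the stage 20 model interactively, the stage 19 server command can be reused
# with the new experiment directory (a sketch; the epoch-10 checkpoint name is an
# assumption, and --use-lora True mirrors how stage 20 trains):
#   python3 ./qwen_omni/server.py \
#     --speech-encoder-path-or-name models/large-v2.pt \
#     --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
#     --checkpoint-path ./qwen_omni/exp_speech2text_kl_llm/epoch-10/pytorch_model.bin \
#     --use-flash-attn True \
#     --enable-speech-output False \
#     --use-lora True --prompt-template qa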