#!/usr/bin/env bash
#
# Staged recipe for a Qwen-Omni-like speech-to-speech model: environment
# setup, fbank feature extraction, training, decoding and a Gradio demo.
#
# Usage: <script> <stage> <stop_stage>
#   Every stage N with stage <= N <= stop_stage is executed, in order.
#
# Stages:
#   0   install requirements, clone CosyVoice, download pretrained models
#   1   compute fbank features for the Belle dataset
#   2   combine feature manifests and create train/test symlinks
#   3   train the speech2speech model
#   4   decode with the downloaded checkpoint
#   5   Gradio demo
#   6-9 compute fbank features (VoiceAssistant, UltraChat, GigaSpeech)
#   10  train the English speech2speech model
#
# All relative paths (qwen_omni/, local/, models/, data/) are resolved against
# the directory the script is launched from, so the script must never leave
# that directory.

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# ${PYTHONPATH:-} keeps the resulting value identical to the plain expansion
# while staying safe if the variable is unset.
export PYTHONPATH=${PYTHONPATH:-}:/workspace/icefall

set -euo pipefail

if [ $# -lt 2 ]; then
  echo "Usage: $0 <stage> <stop_stage>" >&2
  exit 1
fi

stage=$1
stop_stage=$2

# The stage gates below use integer comparisons; reject anything else early
# with a readable message instead of a "[: integer expression expected" error.
for arg in "$stage" "$stop_stage"; do
  if ! [[ "$arg" =~ ^-?[0-9]+$ ]]; then
    echo "Error: stage and stop_stage must be integers, got '$arg'" >&2
    exit 1
  fi
done

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

# Print a timestamped message tagged with the caller's file, line and function.
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "stage 0: Clone CosyVoice repo and install requirements inside the container"
  # docker: ghcr.io/swivid/f5-tts:main
  pip install k2==1.24.4.dev20241030+cuda12.4.torch2.4.0 -f https://k2-fsa.github.io/k2/cuda.html

  cosyvoice_dir=/workspace/CosyVoice
  # Skip the clone when it already exists so the stage can be rerun.
  if [ ! -d "$cosyvoice_dir/.git" ]; then
    git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git "$cosyvoice_dir"
  fi
  # If you failed to clone submodule due to network failures, please run following command until success
  # `git -C` is used instead of `cd` so the working directory stays in the
  # recipe directory: the requirement files and model directories below are
  # relative to it, as are all later stages.
  git -C "$cosyvoice_dir" submodule update --init --recursive

  pip install -r qwen_omni/requirements.txt
  pip install -r qwen_omni/requirements-cosyvoice.txt

  # For Chinese only dataset, you can use the following command to download the Chinese fine-tuned whisper model.
  huggingface-cli download --local-dir models/whisper yuekai/icefall_asr_multi-hans-zh_whisper
  # Cosyvoice pretrained model for speech token2wav module
  huggingface-cli download --local-dir models/CosyVoice-300M-SFT FunAudioLLM/CosyVoice-300M-SFT
  # Qwen Pretrained model
  huggingface-cli download --local-dir models/Qwen2.5-0.5B-Instruct Qwen/Qwen2.5-0.5B-Instruct
  # Qwen-Omni like speech2speech model trained on worstchan/Belle_1.4M-SLAM-Omni
  huggingface-cli download --local-dir models/qwen-omni-like-speech2speech-belle-1.4M yuekai/qwen-omni-like-speech2speech-belle-1.4M

  # For Gradio demo, we follow https://arxiv.org/abs/2412.15649 to use ASR model to decode the history speech as context.
  pip install sherpa-onnx
  model_path=local/sherpa-onnx-paraformer-zh-2023-09-14
  if [ ! -d "$model_path" ]; then
    wget -nc https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
    tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 -C local
  fi
fi

export PYTHONPATH=$PYTHONPATH:/workspace/CosyVoice

if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "stage 1: Compute fbank feature from huggingface"
  # NOTE(review): this writes to data/fbank_test while stage 2 reads
  # data/fbank -- confirm which directory is intended.
  python3 local/compute_whisper_fbank.py \
    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
    --out-dir data/fbank_test \
    --huggingface-dataset-path-or-name /workspace/Belle_1.4M-SLAM-Omni \
    --audio-key question_audio --text-key answer \
    --prefix belle
fi

if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Combine features"
  manifest_dir=data/fbank
  combined=cuts_belle_00001-01600.jsonl.gz
  held_out=cuts_belle.00000.jsonl.gz
  if [ ! -f "$manifest_dir/$combined" ]; then
    # exclude cuts_belle.00000.jsonl.gz for valid and test set
    # The shard is filtered out of the listing rather than moved away and
    # back, so a failing combine cannot leave it outside the manifest dir.
    mapfile -t pieces < <(
      find "$manifest_dir" -name "cuts_belle.*.jsonl.gz" \
        ! -path "$manifest_dir/$held_out" | sort
    )
    if [ "${#pieces[@]}" -eq 0 ]; then
      echo "Error: no cuts_belle.*.jsonl.gz pieces found in $manifest_dir" >&2
      exit 1
    fi
    echo "${pieces[@]}" | wc
    lhotse combine "${pieces[@]}" "$manifest_dir/$combined"
    # Symlink targets are stored verbatim and resolved relative to the
    # link's own directory, so no `cd` is needed.
    ln -sf "$combined" "$manifest_dir/cuts_belle_train.jsonl.gz"
    ln -sf "$held_out" "$manifest_dir/cuts_belle_test.jsonl.gz"
  fi
fi

ngpu=8
exp_dir=./qwen_omni/exp_speech2speech
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "stage 3: Training Speech2Speech Model"
  torchrun --nproc_per_node "$ngpu" ./qwen_omni/train.py \
    --max-duration 50 \
    --enable-musan False \
    --exp-dir "$exp_dir" \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct \
    --manifest-dir data/fbank \
    --deepspeed \
    --deepspeed_config ./qwen_omni/ds_config_zero1.json \
    --use-flash-attn True \
    --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output True
fi

if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "stage 4: Decoding, only support batch_size=1 for now."
  # Expose the downloaded checkpoint as epoch-999.pt inside the experiment
  # directory. The link is created without `cd`: in a `cd && ln && cd -`
  # chain a failing `ln` (link already present on a rerun) skips `cd -`
  # without aborting, leaving the script in the wrong directory.
  mkdir -p "$exp_dir"
  ckpt_link=$exp_dir/epoch-999.pt
  if [ ! -e "$ckpt_link" ] && [ ! -L "$ckpt_link" ]; then
    ln -s ../../models/qwen-omni-like-speech2speech-belle-1.4M/pytorch_model.bin "$ckpt_link"
  fi
  python3 ./qwen_omni/decode.py \
    --max-duration 1 \
    --exp-dir "$exp_dir" \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --epoch 999 --avg 1 \
    --manifest-dir data/fbank \
    --use-flash-attn True \
    --method e2e-epoch10_speech2speech \
    --enable-speech-output True \
    --token2wav-path models/CosyVoice-300M-SFT \
    --use-lora True
fi

if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
  log "stage 5: Gradio Demo"
  # --token2wav-path points at the directory stage 0 downloads the CosyVoice
  # model into, matching stage 4.
  python3 ./qwen_omni/web_demo.py \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --checkpoint-path "$exp_dir/epoch-999.pt" \
    --use-flash-attn True \
    --enable-speech-output True \
    --asr-model-dir local/sherpa-onnx-paraformer-zh-2023-09-14 \
    --use-lora True --token2wav-path models/CosyVoice-300M-SFT --share
fi

if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
  log "stage 6: Compute fbank feature from huggingface"
  # CUDA_VISIBLE_DEVICES=0 python3 local/compute_whisper_fbank.py \
  #   --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
  #   --out-dir data/fbank_voice_assistant \
  #   --huggingface-dataset-path-or-name worstchan/VoiceAssistant-400K-SLAM-Omni \
  #   --audio-key question_audio --text-key answer \
  #   --prefix voice_assistant
  CUDA_VISIBLE_DEVICES=0 python3 local/compute_whisper_fbank.py \
    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
    --out-dir data/fbank_voice_assistant_cosy2 \
    --json-file-path /workspace/slam/VoiceAssistant-430K-vocalnet/VoiceAssistant-430K.json \
    --prefix voice_assistant
fi

if [ "$stage" -le 7 ] && [ "$stop_stage" -ge 7 ]; then
  log "stage 7: Compute fbank feature from huggingface"
  # CUDA_VISIBLE_DEVICES=1 python3 local/compute_whisper_fbank.py \
  #   --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
  #   --out-dir data/fbank_ultrachat \
  #   --huggingface-dataset-path-or-name worstchan/UltraChat-300K-SLAM-Omni \
  #   --audio-key question_audio --text-key answer \
  #   --prefix ultrachat
  CUDA_VISIBLE_DEVICES=1 python3 local/compute_whisper_fbank.py \
    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
    --out-dir data/fbank_ultrachat_cosy2 \
    --json-file-path /workspace/slam/UltraChat-vocalnet/UltraChat.json \
    --prefix ultrachat
fi

if [ "$stage" -le 8 ] && [ "$stop_stage" -ge 8 ]; then
  log "stage 8: Compute fbank feature from huggingface"
  CUDA_VISIBLE_DEVICES=1 python3 local/compute_whisper_fbank.py \
    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
    --out-dir data/fbank_gigaspeech \
    --huggingface-dataset-path-or-name speechcolab/gigaspeech \
    --subset test --split test \
    --audio-key audio --text-key text \
    --prefix gigaspeech
fi

if [ "$stage" -le 9 ] && [ "$stop_stage" -ge 9 ]; then
  log "stage 9: Compute fbank feature from huggingface"
  CUDA_VISIBLE_DEVICES=0 python3 local/compute_whisper_fbank.py \
    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb True \
    --out-dir data/fbank_gigaspeech \
    --huggingface-dataset-path-or-name speechcolab/gigaspeech \
    --subset xl --split train \
    --audio-key audio --text-key text \
    --prefix gigaspeech
fi

# The English experiment below uses its own GPU count and output directory.
ngpu=2
exp_dir=./qwen_omni/exp_speech2speech_en
if [ "$stage" -le 10 ] && [ "$stop_stage" -ge 10 ]; then
  log "stage 10: Training Speech2Speech Model"
  torchrun --nproc_per_node "$ngpu" ./qwen_omni/train.py \
    --max-duration 50 \
    --enable-musan False \
    --exp-dir "$exp_dir" \
    --speech-encoder-path-or-name models/large-v2.pt \
    --llm-path-or-name Qwen/Qwen2.5-0.5B-Instruct \
    --dataset-format vocalnet \
    --manifest-dir data/fbank \
    --deepspeed \
    --deepspeed_config ./qwen_omni/ds_config_zero1.json \
    --use-flash-attn True \
    --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output True
fi