#!/usr/bin/env bash
#
# Staged driver script: environment setup, fbank computation, manifest
# combination, decoding and training for the slam_omni recipe.
#
# Usage: <this-script> <stage> <stop_stage>
#   Every stage N with stage <= N <= stop_stage is executed, in order.
#
# Relative paths (local/, slam_omni/, data/, models/) are resolved against
# the current working directory, so the working directory must stay fixed
# for the whole run.

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# Deliberately evaluated before `set -u` so an unset PYTHONPATH is tolerated.
export PYTHONPATH=$PYTHONPATH:/workspace/slam/icefall_omni

set -euo pipefail

if [ "$#" -lt 2 ]; then
  echo "Usage: ${0##*/} <stage> <stop_stage>" >&2
  exit 1
fi

stage=$1
stop_stage=$2

# Reject non-integer arguments up front: `[ abc -le 0 ]` merely evaluates to
# false inside an `if`, which would silently skip every stage.
for stage_arg in "$stage" "$stop_stage"; do
  if ! [[ "$stage_arg" =~ ^[+-]?[0-9]+$ ]]; then
    echo "${0##*/}: stage and stop_stage must be integers, got '$stage_arg'" >&2
    exit 1
  fi
done

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

# Print a timestamped message tagged with the caller's file, line and function.
# Arguments: the message words (joined with spaces).
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# Succeed when stage number $1 lies within the requested [stage, stop_stage].
stage_enabled() {
  [ "$stage" -le "$1" ] && [ "$stop_stage" -ge "$1" ]
}

# Stage 0: replace the installed lhotse with an editable install of the local
# checkout and install the recipe's Python requirements.
if stage_enabled 0; then
  log "stage 0: "
  # -y: never block on the interactive confirmation prompt.
  pip uninstall -y lhotse
  git config --global --add safe.directory /workspace/slam/lhotse
  # Subshell keeps the caller's working directory intact even if pip fails.
  (cd /workspace/slam/lhotse && pip install -e '.[dev]')
  pip install -r slam_omni/requirements.txt
fi

# Stage 1: run local/compute_whisper_fbank.py.
# NOTE(review): the log text says "Download ... from huggingface" while the
# command invoked is a compute script - confirm which description is accurate.
if stage_enabled 1; then
  log "stage 1: Download whisper-large-v2 multi-hans-zh fbank feature from huggingface"
  python3 local/compute_whisper_fbank.py
fi

# Stage 2: merge the per-piece cut manifests into a single manifest and expose
# it as cuts_belle_train.jsonl.gz via a symlink in the same directory.
if stage_enabled 2; then
  log "Stage 2: Combine features"
  manifest_dir=data/fbank
  combined=cuts_belle_00001-01600.jsonl.gz
  train_link=$manifest_dir/cuts_belle_train.jsonl.gz
  if [ ! -f "$manifest_dir/$combined" ]; then
    # Captured through a plain assignment so that a failing find/sort still
    # aborts the script under `set -e -o pipefail`.
    pieces=$(find "$manifest_dir" -name "cuts_belle.*.jsonl.gz" | sort)
    # # remove cuts_belle.00000.jsonl.gz from pieces
    # pieces=$(echo $pieces | sed 's/cuts_belle.00000.jsonl.gz//g')
    if [ -z "$pieces" ]; then
      echo "${0##*/}: no cuts_belle.*.jsonl.gz pieces found in $manifest_dir" >&2
      exit 1
    fi
    mapfile -t piece_list <<<"$pieces"
    log "Combining ${#piece_list[@]} manifest pieces"
    lhotse combine "${piece_list[@]}" "$manifest_dir/$combined"
    # Drop a stale symlink left by an earlier run, but never clobber a
    # regular file that happens to live at the link path.
    if [ -L "$train_link" ]; then
      rm -- "$train_link"
    fi
    # The link target is relative to the link's own directory, so no `cd` is
    # needed (and a failure here now aborts instead of being masked by `&&`).
    ln -s "$combined" "$train_link"
  fi
fi

# Stage 3: decode with slam_omni/decode.py using the speech-to-speech
# experiment directory.
if stage_enabled 3; then
  log "stage 3: "
  exp_dir=./slam_omni/exp_speech2speech_rerun
  python3 ./slam_omni/decode.py \
    --max-duration 1 \
    --exp-dir "$exp_dir" \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --epoch 997 --avg 1 \
    --manifest-dir data/fbank \
    --use-flash-attn True \
    --method small_test_speech2speech_rerun \
    --enable-speech-output True \
    --use-lora True
  # --on-the-fly-feats True
fi

# Stage 4: launch slam_omni/train.py on $ngpu processes with DeepSpeed,
# writing to the exp_speech2text experiment directory and passing the
# epoch-1-checkpoint-5000 model and sampler state.
if stage_enabled 4; then
  log "stage 4: "
  ngpu=8
  torchrun --nproc_per_node "$ngpu" ./slam_omni/train.py \
    --max-duration 80 \
    --enable-musan False \
    --exp-dir ./slam_omni/exp_speech2text \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --manifest-dir data/fbank \
    --deepspeed \
    --deepspeed_config ./slam_omni/ds_config_zero1.json \
    --use-flash-attn True \
    --pretrained-model-path slam_omni/exp_speech2text/epoch-1-checkpoint-5000.pt/pytorch_model.bin \
    --sampler-state-dict-path slam_omni/exp_speech2text/epoch-1-checkpoint-5000-sampler.pt \
    --use-lora True --unfreeze-llm True
fi

# Stage 5: launch slam_omni/train.py on $ngpu processes with DeepSpeed and
# speech output enabled, writing to the exp_speech2speech_rerun experiment
# directory and passing its epoch-1-checkpoint-15000 model and sampler state.
if stage_enabled 5; then
  log "stage 5: "
  ngpu=8
  exp_dir=./slam_omni/exp_speech2speech_rerun
  # exp_dir_new=./slam_omni/exp_s2s
  torchrun --nproc_per_node "$ngpu" ./slam_omni/train.py \
    --max-duration 50 \
    --enable-musan False \
    --exp-dir "$exp_dir" \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --manifest-dir data/fbank \
    --deepspeed \
    --deepspeed_config ./slam_omni/ds_config_zero1.json \
    --use-flash-attn True \
    --pretrained-model-path "$exp_dir/epoch-1-checkpoint-15000.pt/pytorch_model.bin" \
    --sampler-state-dict-path "$exp_dir/epoch-1-checkpoint-15000-sampler.pt" \
    --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output True
  # --pretrained-model-path slam_omni/exp_speech2text/epoch-1-checkpoint-5000.pt/pytorch_model.bin \
  # --sampler-state-dict-path $exp_dir/epoch-1-checkpoint-35000-sampler.pt \
fi