#!/usr/bin/env bash

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

export PYTHONPATH=$PYTHONPATH:/workspace/slam/icefall_omni

set -eou pipefail
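
# Usage: <this script> <stage> <stop_stage>
# Runs every stage s with stage <= s <= stop_stage, e.g. "0 2" runs stages 0-2.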
stage=$1
stop_stage=$2

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
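
# Stage 0 installs the recipe's Python dependencies. The commented-out block
# shows how to install lhotse from an editable local checkout instead of the
# released package, should you need unreleased features.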
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "stage 0: Install dependencies"
  # pip uninstall lhotse
  # cd /workspace/slam/lhotse
  # git config --global --add safe.directory /workspace/slam/lhotse
  # pip install -e '.[dev]'
  # cd -
  pip install -r slam_omni/requirements.txt
fi
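
# Stage 1 extracts 80-bin whisper-style fbank features (inputs resampled to
# 16 kHz, no speed perturbation) from the Belle_1.4M-SLAM-Omni huggingface
# dataset; the audio sits in its "question_audio" field and the target text
# in its "answer" field.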
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "stage 1: Compute whisper fbank features for Belle_1.4M-SLAM-Omni"

  python3 local/compute_whisper_fbank.py \
    --num-mel-bins 80 --whisper-fbank True --resample-to-16kHz True --speed-perturb False \
    --out-dir data/fbank_test \
    --huggingface-dataset-path-or-name /workspace/Belle_1.4M-SLAM-Omni \
    --audio-key question_audio --text-key answer \
    --prefix belle
fi
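
# Stage 2 merges the per-shard cut manifests into one training manifest and
# symlinks it to the name the training stages expect. Note that stage 1 above
# writes to data/fbank_test while this stage reads data/fbank, so adjust
# --out-dir or move the manifests first. You can sanity-check the result with
# e.g. `zcat data/fbank/cuts_belle_train.jsonl.gz | head`.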
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "stage 2: Combine features"
  manifest_dir=data/fbank
  if [ ! -f $manifest_dir/cuts_belle_00001-01600.jsonl.gz ]; then
    pieces=$(find $manifest_dir -name "cuts_belle.*.jsonl.gz" | sort)
    # # remove cuts_belle.00000.jsonl.gz from pieces
    # pieces=$(echo $pieces | sed 's/cuts_belle.00000.jsonl.gz//g')
    echo $pieces | wc -w  # print the number of manifest pieces to combine
    lhotse combine $pieces data/fbank/cuts_belle_00001-01600.jsonl.gz
    cd $manifest_dir && ln -s cuts_belle_00001-01600.jsonl.gz cuts_belle_train.jsonl.gz && cd -
  fi
fi
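
# Stage 3 runs speech-to-speech decoding: the whisper-large-v2 multi-hans-zh
# encoder feeds a LoRA-adapted Qwen2.5-0.5B-Instruct LLM, and generated speech
# tokens are vocoded with CosyVoice-300M-SFT. --epoch 999 --avg 1 presumably
# selects a manually renamed final checkpoint ($exp_dir/epoch-999.pt).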
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "stage 3: Decode with the speech-to-speech model"
  exp_dir=./slam_omni/exp_speech2speech_rerun
  export PYTHONPATH=$PYTHONPATH:/workspace/CosyVoice
  python3 ./slam_omni/decode.py \
    --max-duration 1 \
    --exp-dir $exp_dir \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --epoch 999 --avg 1 \
    --manifest-dir data/fbank \
    --use-flash-attn True \
    --method e2e-epoch10_speech2speech_rerun \
    --enable-speech-output True \
    --token2wav-path /workspace/CosyVoice-300M-SFT \
    --use-lora True # --on-the-fly-feats True
fi
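
# Stage 4 trains the speech-to-text model on 8 GPUs with DeepSpeed ZeRO-1 and
# flash attention, with LoRA enabled and the LLM unfrozen; it resumes from an
# earlier checkpoint together with the matching sampler state so the
# dataloader picks up where it stopped.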
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "stage 4: Train the speech-to-text model"
  ngpu=8
  torchrun --nproc_per_node $ngpu ./slam_omni/train.py \
    --max-duration 80 \
    --enable-musan False \
    --exp-dir ./slam_omni/exp_speech2text \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --manifest-dir data/fbank \
    --deepspeed \
    --deepspeed_config ./slam_omni/ds_config_zero1.json \
    --use-flash-attn True \
    --pretrained-model-path slam_omni/exp_speech2text/epoch-1-checkpoint-5000.pt/pytorch_model.bin \
    --sampler-state-dict-path slam_omni/exp_speech2text/epoch-1-checkpoint-5000-sampler.pt \
    --use-lora True --unfreeze-llm True
fi
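
# Stage 5 continues training with speech output enabled (speech-to-speech),
# additionally unfreezing the speech projector, and warm-starts from a
# checkpoint already in $exp_dir along with its sampler state.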
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "stage 5: Train the speech-to-speech model"
  ngpu=8
  exp_dir=./slam_omni/exp_speech2speech_rerun
  # exp_dir_new=./slam_omni/exp_s2s
  torchrun --nproc_per_node $ngpu ./slam_omni/train.py \
    --max-duration 50 \
    --enable-musan False \
    --exp-dir $exp_dir \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --manifest-dir data/fbank \
    --deepspeed \
    --deepspeed_config ./slam_omni/ds_config_zero1.json \
    --use-flash-attn True \
    --pretrained-model-path $exp_dir/epoch-1-checkpoint-15000.pt/pytorch_model.bin \
    --sampler-state-dict-path $exp_dir/epoch-1-checkpoint-15000-sampler.pt \
    --use-lora True --unfreeze-llm True --unfreeze-speech-projector True --enable-speech-output True
  # Alternative warm-start checkpoints:
  # --pretrained-model-path slam_omni/exp_speech2text/epoch-1-checkpoint-5000.pt/pytorch_model.bin \
  # --sampler-state-dict-path $exp_dir/epoch-1-checkpoint-35000-sampler.pt \
fi
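
# Stage 6 launches an interactive web demo from the stage-5 checkpoint;
# --share presumably exposes a public (gradio-style) link. The ASR model dir
# points at the sherpa-onnx paraformer model that stage 7 downloads, so run
# stage 7 first.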
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "stage 6: Launch the web demo"
  export PYTHONPATH=$PYTHONPATH:/workspace/CosyVoice
  exp_dir=./slam_omni/exp_speech2speech_rerun
  python3 ./slam_omni/web_demo.py \
    --speech-encoder-path-or-name models/whisper/v1.1/whisper-large-v2-multi-hans-zh-epoch-3-avg-10.pt \
    --llm-path-or-name models/Qwen2.5-0.5B-Instruct \
    --checkpoint-path $exp_dir/epoch-998.pt \
    --use-flash-attn True \
    --enable-speech-output True \
    --asr-model-dir local/sherpa-onnx-paraformer-zh-2023-09-14 \
    --use-lora True --token2wav-path /workspace/CosyVoice-300M-SFT --share
fi
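
# Stage 7 fetches the pretrained sherpa-onnx paraformer ASR model used by the
# stage-6 web demo, skipping the download if it is already extracted.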
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "stage 7: Download the sherpa-onnx paraformer ASR model"
  model_path=local/sherpa-onnx-paraformer-zh-2023-09-14

  if [ ! -d $model_path ]; then
    pip install sherpa-onnx
    wget -nc https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2
    tar xvf sherpa-onnx-paraformer-zh-2023-09-14.tar.bz2 -C local
  fi
fi