From 91f13826d7781e9be27501787edd368d6f036334 Mon Sep 17 00:00:00 2001
From: pkufool
Date: Mon, 5 Feb 2024 17:50:28 +0800
Subject: [PATCH] Add wenetspeech run.sh

---
 egs/wenetspeech/KWS/run.sh                | 197 ++++++++++++++++++++++
 egs/wenetspeech/KWS/zipformer/finetune.py |   4 +-
 2 files changed, 199 insertions(+), 2 deletions(-)
 create mode 100644 egs/wenetspeech/KWS/run.sh

diff --git a/egs/wenetspeech/KWS/run.sh b/egs/wenetspeech/KWS/run.sh
new file mode 100644
index 000000000..914756cda
--- /dev/null
+++ b/egs/wenetspeech/KWS/run.sh
@@ -0,0 +1,197 @@
+#!/usr/bin/env bash
+
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+set -eou pipefail
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+export PYTHONPATH=../../../:$PYTHONPATH
+
+stage=0
+stop_stage=100
+
+pre_trained_model_host=github
+
+. shared/parse_options.sh || exit 1
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+  log "Stage -1: Download a pre-trained model."
+
+fi
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Stage 0: Train a model."
+  if [ ! -e data/fbank/.gigaspeech.done ]; then
+    log "You need to run prepare.sh first."
+    exit 1
+  fi
+
+  python ./zipformer/train.py \
+    --world-size 4 \
+    --exp-dir zipformer/exp \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128 \
+    --num-epochs 15 \
+    --lr-epochs 1.5 \
+    --use-fp16 1 \
+    --start-epoch 1 \
+    --training-subset L \
+    --pinyin-type partial_with_tone \
+    --causal 1 \
+    --lang-dir data/lang_partial_tone \
+    --max-duration 1000
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Decode the model."
+  for t in small large; do
+    python ./zipformer/decode.py \
+      --epoch 15 \
+      --avg 2 \
+      --exp-dir ./zipformer/exp \
+      --lang-dir ./data/lang_partial_tone \
+      --pinyin-type partial_with_tone \
+      --causal 1 \
+      --chunk-size 16 \
+      --left-context-frames 64 \
+      --decoder-dim 320 \
+      --joiner-dim 320 \
+      --num-encoder-layers 1,1,1,1,1,1 \
+      --feedforward-dim 192,192,192,192,192,192 \
+      --encoder-dim 128,128,128,128,128,128 \
+      --encoder-unmasked-dim 128,128,128,128,128,128 \
+      --test-set $t \
+      --keywords-score 1.0 \
+      --keywords-threshold 0.35 \
+      --keywords-file ./data/commands_${t}.txt \
+      --max-duration 3000
+  done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Export the model."
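+
+  # Assumption (as in other icefall zipformer recipes): export.py averages the
+  # checkpoints selected by --epoch/--avg and writes zipformer/exp/pretrained.pt,
+  # which stage 3 below uses as the starting point for fine-tuning
+  # (--finetune-ckpt).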
+
+  python ./zipformer/export.py \
+    --epoch 15 \
+    --avg 2 \
+    --exp-dir ./zipformer/exp \
+    --tokens data/lang_partial_tone/tokens.txt \
+    --causal 1 \
+    --chunk-size 16 \
+    --left-context-frames 64 \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128
+
+  python ./zipformer/export_onnx_streaming.py \
+    --exp-dir zipformer/exp \
+    --tokens data/lang_partial_tone/tokens.txt \
+    --epoch 15 \
+    --avg 2 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128 \
+    --causal 1
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Finetune the model."
+
+  # The following learning rate schedule should work well; you may also tune
+  # these parameters to adjust it. With lr_epochs and lr_batches this large,
+  # the Eden scheduler keeps the learning rate close to base_lr throughout,
+  # i.e. almost constant, which is usually what you want for fine-tuning.
+  base_lr=0.0005
+  lr_epochs=100
+  lr_batches=100000
+
+  # We recommend starting the fine-tuning from an averaged model.
+  finetune_ckpt=zipformer/exp/pretrained.pt
+
+  python ./zipformer/finetune.py \
+    --world-size 4 \
+    --num-epochs 10 \
+    --start-epoch 1 \
+    --exp-dir zipformer/exp_finetune \
+    --lang-dir ./data/lang_partial_tone \
+    --pinyin-type partial_with_tone \
+    --use-fp16 1 \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128 \
+    --causal 1 \
+    --base-lr $base_lr \
+    --lr-epochs $lr_epochs \
+    --lr-batches $lr_batches \
+    --finetune-ckpt $finetune_ckpt \
+    --max-duration 1500
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Decode the finetuned model."
+  # Fine-tuning above runs for 10 epochs, so decode the epoch-10 checkpoint.
+  for t in small large; do
+    python ./zipformer/decode.py \
+      --epoch 10 \
+      --avg 2 \
+      --exp-dir ./zipformer/exp_finetune \
+      --lang-dir ./data/lang_partial_tone \
+      --pinyin-type partial_with_tone \
+      --causal 1 \
+      --chunk-size 16 \
+      --left-context-frames 64 \
+      --decoder-dim 320 \
+      --joiner-dim 320 \
+      --num-encoder-layers 1,1,1,1,1,1 \
+      --feedforward-dim 192,192,192,192,192,192 \
+      --encoder-dim 128,128,128,128,128,128 \
+      --encoder-unmasked-dim 128,128,128,128,128,128 \
+      --test-set $t \
+      --keywords-score 1.0 \
+      --keywords-threshold 0.35 \
+      --keywords-file ./data/commands_${t}.txt \
+      --max-duration 3000
+  done
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Export the finetuned model."
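+
+  # Assumption (as in other icefall streaming zipformer recipes): the export
+  # below writes streaming encoder/decoder/joiner *.onnx files into
+  # zipformer/exp_finetune. These can be deployed with, e.g., the sherpa-onnx
+  # keyword spotter; a hypothetical invocation (exact file names depend on what
+  # export_onnx_streaming.py actually writes):
+  #
+  #   sherpa-onnx-keyword-spotter \
+  #     --encoder=zipformer/exp_finetune/encoder-epoch-10-avg-2.onnx \
+  #     --decoder=zipformer/exp_finetune/decoder-epoch-10-avg-2.onnx \
+  #     --joiner=zipformer/exp_finetune/joiner-epoch-10-avg-2.onnx \
+  #     --tokens=data/lang_partial_tone/tokens.txt \
+  #     --keywords-file=data/commands_small.txt \
+  #     test.wav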
+
+  python ./zipformer/export_onnx_streaming.py \
+    --exp-dir zipformer/exp_finetune \
+    --tokens data/lang_partial_tone/tokens.txt \
+    --epoch 10 \
+    --avg 2 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128 \
+    --causal 1
+fi

diff --git a/egs/wenetspeech/KWS/zipformer/finetune.py b/egs/wenetspeech/KWS/zipformer/finetune.py
index 7456c60dc..6f34989e2 100755
--- a/egs/wenetspeech/KWS/zipformer/finetune.py
+++ b/egs/wenetspeech/KWS/zipformer/finetune.py
@@ -185,9 +185,9 @@ def get_parser():
         default="partial_with_tone",
         help="""
         The style of the output pinyin, should be:
-          full_with_tone : zhong1 guo2
+          full_with_tone : zhōng guó
           full_no_tone : zhong guo
-          partial_with_tone : zh ong1 g uo2
+          partial_with_tone : zh ōng g uó
           partial_no_tone : zh ong g uo
         """,
     )
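
Note on the help-text fix above: the four pinyin styles can be reproduced with
pypinyin, which is assumed here to back the recipe's pinyin conversion. A
minimal sketch for 中国 (the INITIALS/FINALS_TONE pair corresponds to the
"partial" styles):

    # Sketch only; assumes the pypinyin package is installed.
    from pypinyin import lazy_pinyin, Style

    print(lazy_pinyin("中国", style=Style.TONE))    # ['zhōng', 'guó']
    print(lazy_pinyin("中国", style=Style.NORMAL))  # ['zhong', 'guo']
    print(lazy_pinyin("中国", style=Style.INITIALS, strict=False))     # ['zh', 'g']
    print(lazy_pinyin("中国", style=Style.FINALS_TONE, strict=False))  # ['ōng', 'uó']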