Add wenetspeech run.sh

2025-12-11 06:55:27 +00:00 · 2024-02-05 17:50:28 +08:00 · 2024-02-05 17:50:28 +08:00 · 91f13826d7
commit 91f13826d7
parent f2f4087778
2 changed files with 199 additions and 2 deletions
--- a/egs/wenetspeech/KWS/run.sh
+++ b/egs/wenetspeech/KWS/run.sh
@ -0,0 +1,197 @@
 #!/usr/bin/env bash
 # Driver script for the zipformer recipe in this directory: training, decoding,
 # export and fine-tuning are run as numbered stages selected by the `stage` and
 # `stop_stage` variables.
 #
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 # Abort on command failure, on use of an unset variable, and on a failure in
 # any stage of a pipeline.
 set -euo pipefail
 # GPUs made visible to the Python jobs; the training stages pass --world-size 4.
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 # Prepend the directory three levels up so the Python scripts can import from it.
 # `${PYTHONPATH:-}` is required because `set -u` is active: a plain `$PYTHONPATH`
 # aborts the script whenever the caller's environment does not define it.
 export PYTHONPATH=../../../:${PYTHONPATH:-}
 # First and last stage to run (both inclusive). With these defaults every
 # stage numbered 0 or higher in this script runs; stage -1 is skipped.
 stage=0
 stop_stage=100
 # NOTE(review): not referenced by any stage visible in this script; presumably
 # meant for the download stage, which currently only logs. Confirm before removing.
 pre_trained_model_host=github
 # Must stay after the defaults above. Presumably the Kaldi-style option parser,
 # which lets flags such as `--stage 2 --stop-stage 2` override those variables;
 # confirm against shared/parse_options.sh.
 . shared/parse_options.sh || exit 1
 # Print a timestamped message tagged with the caller's file name, line number
 # and function name. Taken from espnet.
 # Arguments: the message words; they are joined with `$*` and passed through
 #            `echo -e`, so backslash escapes in the message are interpreted.
 # Outputs:   one line on stdout: "<date> <time> (<file>:<line>:<func>) <message>"
 log() {
   local caller_file=${BASH_SOURCE[1]##*/}
   local stamp
   stamp=$(date '+%Y-%m-%d %H:%M:%S')
   local origin="${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}"
   echo -e "${stamp} (${origin}) $*"
 }
 # Stage -1: announces the download of a pre-trained model. Only the log line
 # is emitted here; no download command is run by this block.
 if [[ $stage -le -1 && $stop_stage -ge -1 ]]; then
   log "Stage -1: Download a pre-trained model."
 fi
 # Stage 0: train the causal zipformer model with zipformer/train.py on 4 GPUs
 # (--world-size 4), writing checkpoints to zipformer/exp.
 if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
   log "Stage 0: Train a model."
   # Refuse to train until data preparation has left its completion marker.
   # NOTE(review): the marker is named `.gigaspeech.done` although this is the
   # wenetspeech recipe; looks like a copy-paste leftover. Confirm which marker
   # prepare.sh actually creates before renaming it here.
   if [ ! -e data/fbank/.gigaspeech.done ]; then
     log "You need to run the prepare.sh first."
     # Exit statuses must lie in 0-255; `exit -1` is outside that range.
     exit 1
   fi
   python ./zipformer/train.py \
     --world-size 4 \
     --exp-dir zipformer/exp \
     --decoder-dim 320 \
     --joiner-dim 320 \
     --num-encoder-layers 1,1,1,1,1,1 \
     --feedforward-dim 192,192,192,192,192,192 \
     --encoder-dim 128,128,128,128,128,128 \
     --encoder-unmasked-dim 128,128,128,128,128,128 \
     --num-epochs 15 \
     --lr-epochs 1.5 \
     --use-fp16 1 \
     --start-epoch 1 \
     --training-subset L \
     --pinyin-type partial_with_tone \
     --causal 1 \
     --lang-dir data/lang_partial_tone \
     --max-duration 1000
 fi
 # Stage 1: decode with the model trained in zipformer/exp (epoch 15, averaged
 # over 2 checkpoints), once per test set, using the matching keywords file
 # ./data/commands_<test set>.txt.
 if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
   log "Stage 1: Decode the model."
   # The word list is whitespace-separated. A comma after `small` would become
   # part of the value and yield `--test-set small,` / `commands_small,.txt`.
   for t in small large; do
     python ./zipformer/decode.py \
       --epoch 15 \
       --avg 2 \
       --exp-dir ./zipformer/exp \
       --lang-dir ./data/lang_partial_tone \
       --pinyin-type partial_with_tone \
       --causal 1 \
       --chunk-size 16 \
       --left-context-frames 64 \
       --decoder-dim 320 \
       --joiner-dim 320 \
       --num-encoder-layers 1,1,1,1,1,1 \
       --feedforward-dim 192,192,192,192,192,192 \
       --encoder-dim 128,128,128,128,128,128 \
       --encoder-unmasked-dim 128,128,128,128,128,128 \
       --test-set "$t" \
       --keywords-score 1.0 \
       --keywords-threshold 0.35 \
       --keywords-file "./data/commands_${t}.txt" \
       --max-duration 3000
   done
 fi
 # Stage 2: export the trained model from zipformer/exp (epoch 15, avg 2),
 # first with export.py and then with export_onnx_streaming.py.
 # The two runs differ in --left-context-frames: 64 for export.py, 128 for the
 # ONNX streaming export.
 if [[ $stage -le 2 && $stop_stage -ge 2 ]]; then
   log "Stage 2: Export the model."
   # Arguments for export.py, in the order they are passed.
   export_args=(
     --epoch 15
     --avg 2
     --exp-dir ./zipformer/exp
     --tokens data/lang_partial_tone/tokens.txt
     --causal 1
     --chunk-size 16
     --left-context-frames 64
     --decoder-dim 320
     --joiner-dim 320
     --num-encoder-layers 1,1,1,1,1,1
     --feedforward-dim 192,192,192,192,192,192
     --encoder-dim 128,128,128,128,128,128
     --encoder-unmasked-dim 128,128,128,128,128,128
   )
   python ./zipformer/export.py "${export_args[@]}"
   # Arguments for export_onnx_streaming.py, in the order they are passed.
   onnx_args=(
     --exp-dir zipformer/exp
     --tokens data/lang_partial_tone/tokens.txt
     --epoch 15
     --avg 2
     --chunk-size 16
     --left-context-frames 128
     --decoder-dim 320
     --joiner-dim 320
     --num-encoder-layers 1,1,1,1,1,1
     --feedforward-dim 192,192,192,192,192,192
     --encoder-dim 128,128,128,128,128,128
     --encoder-unmasked-dim 128,128,128,128,128,128
     --causal 1
   )
   python ./zipformer/export_onnx_streaming.py "${onnx_args[@]}"
 fi
 # Stage 3: fine-tune with zipformer/finetune.py on 4 GPUs for 10 epochs,
 # starting from $finetune_ckpt and writing to zipformer/exp_finetune.
 if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
   log "Stage 3: Finetune the model"
   # The following configuration of lr schedule should work well
   # You may also tune the following parameters to adjust learning rate schedule
   base_lr=0.0005
   lr_epochs=100
   lr_batches=100000
   # We recommend to start from an averaged model
   # (presumably the pretrained.pt written by export.py in stage 2; confirm).
   finetune_ckpt=zipformer/exp/pretrained.pt
   # Every argument line except the last must end in a backslash. Without one
   # after --exp-dir the command ends there and the next line is executed as a
   # separate command named `--lang-dir`.
   python ./zipformer/finetune.py \
     --world-size 4 \
     --num-epochs 10 \
     --start-epoch 1 \
     --exp-dir zipformer/exp_finetune \
     --lang-dir ./data/lang_partial_tone \
     --pinyin-type partial_with_tone \
     --use-fp16 1 \
     --decoder-dim 320 \
     --joiner-dim 320 \
     --num-encoder-layers 1,1,1,1,1,1 \
     --feedforward-dim 192,192,192,192,192,192 \
     --encoder-dim 128,128,128,128,128,128 \
     --encoder-unmasked-dim 128,128,128,128,128,128 \
     --causal 1 \
     --base-lr "$base_lr" \
     --lr-epochs "$lr_epochs" \
     --lr-batches "$lr_batches" \
     --finetune-ckpt "$finetune_ckpt" \
     --max-duration 1500
 fi
 # Stage 4: decode with the fine-tuned model in zipformer/exp_finetune, once
 # per test set, using the matching keywords file ./data/commands_<test set>.txt.
 if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
   log "Stage 4: Decode the finetuned model."
   # The word list is whitespace-separated. A comma after `small` would become
   # part of the value and yield `--test-set small,` / `commands_small,.txt`.
   for t in small large; do
     # Fine-tuning in this script runs --num-epochs 10 from --start-epoch 1, so
     # 10 is the last epoch available in exp_finetune; epoch 15 is never written.
     python ./zipformer/decode.py \
       --epoch 10 \
       --avg 2 \
       --exp-dir ./zipformer/exp_finetune \
       --lang-dir ./data/lang_partial_tone \
       --pinyin-type partial_with_tone \
       --causal 1 \
       --chunk-size 16 \
       --left-context-frames 64 \
       --decoder-dim 320 \
       --joiner-dim 320 \
       --num-encoder-layers 1,1,1,1,1,1 \
       --feedforward-dim 192,192,192,192,192,192 \
       --encoder-dim 128,128,128,128,128,128 \
       --encoder-unmasked-dim 128,128,128,128,128,128 \
       --test-set "$t" \
       --keywords-score 1.0 \
       --keywords-threshold 0.35 \
       --keywords-file "./data/commands_${t}.txt" \
       --max-duration 3000
   done
 fi
 # Stage 5: export the fine-tuned model from zipformer/exp_finetune with
 # export_onnx_streaming.py.
 if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
   log "Stage 5: Export the finetuned model."
   # Fine-tuning in this script runs --num-epochs 10 from --start-epoch 1, so
   # 10 is the last epoch available in exp_finetune; epoch 15 is never written.
   python ./zipformer/export_onnx_streaming.py \
     --exp-dir zipformer/exp_finetune \
     --tokens data/lang_partial_tone/tokens.txt \
     --epoch 10 \
     --avg 2 \
     --chunk-size 16 \
     --left-context-frames 128 \
     --decoder-dim 320 \
     --joiner-dim 320 \
     --num-encoder-layers 1,1,1,1,1,1 \
     --feedforward-dim 192,192,192,192,192,192 \
     --encoder-dim 128,128,128,128,128,128 \
     --encoder-unmasked-dim 128,128,128,128,128,128 \
     --causal 1
 fi
--- a/egs/wenetspeech/KWS/zipformer/finetune.py
+++ b/egs/wenetspeech/KWS/zipformer/finetune.py
@ -185,9 +185,9 @@ def get_parser():
        default="partial_with_tone",
        help="""
            The style of the output pinyin, should be:
-              full_with_tone : zhong1 guo2
+              full_with_tone : zhōng guó
              full_no_tone : zhong guo
-              partial_with_tone : zh ong1 g uo2
+              partial_with_tone : zh ōng g uó
              partial_no_tone : zh ong g uo
        """,
    )