From 91f13826d7781e9be27501787edd368d6f036334 Mon Sep 17 00:00:00 2001
From: pkufool
Date: Mon, 5 Feb 2024 17:50:28 +0800
Subject: [PATCH] Add wenetspeech run.sh

---
 egs/wenetspeech/KWS/run.sh                | 197 ++++++++++++++++++++++
 egs/wenetspeech/KWS/zipformer/finetune.py |   4 +-
 2 files changed, 199 insertions(+), 2 deletions(-)
 create mode 100644 egs/wenetspeech/KWS/run.sh

diff --git a/egs/wenetspeech/KWS/run.sh b/egs/wenetspeech/KWS/run.sh
new file mode 100644
index 000000000..914756cda
--- /dev/null
+++ b/egs/wenetspeech/KWS/run.sh
@@ -0,0 +1,197 @@
+#!/usr/bin/env bash
+
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+set -eou pipefail
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+export PYTHONPATH=../../../:$PYTHONPATH
+
+stage=0
+stop_stage=100
+
+pre_trained_model_host=github
+
+. shared/parse_options.sh || exit 1
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+  log "Stage -1: Download a pre-trained model."
+
+fi
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Stage 0: Train a model."
+  if [ ! -e data/fbank/.gigaspeech.done ]; then
+    log "You need to run prepare.sh first."
+    exit 1
+  fi
+
+  python ./zipformer/train.py \
+    --world-size 4 \
+    --exp-dir zipformer/exp \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128 \
+    --num-epochs 15 \
+    --lr-epochs 1.5 \
+    --use-fp16 1 \
+    --start-epoch 1 \
+    --training-subset L \
+    --pinyin-type partial_with_tone \
+    --causal 1 \
+    --lang-dir data/lang_partial_tone \
+    --max-duration 1000
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Decode the model."
+  for t in small large; do
+    python ./zipformer/decode.py \
+      --epoch 15 \
+      --avg 2 \
+      --exp-dir ./zipformer/exp \
+      --lang-dir ./data/lang_partial_tone \
+      --pinyin-type partial_with_tone \
+      --causal 1 \
+      --chunk-size 16 \
+      --left-context-frames 64 \
+      --decoder-dim 320 \
+      --joiner-dim 320 \
+      --num-encoder-layers 1,1,1,1,1,1 \
+      --feedforward-dim 192,192,192,192,192,192 \
+      --encoder-dim 128,128,128,128,128,128 \
+      --encoder-unmasked-dim 128,128,128,128,128,128 \
+      --test-set $t \
+      --keywords-score 1.0 \
+      --keywords-threshold 0.35 \
+      --keywords-file ./data/commands_${t}.txt \
+      --max-duration 3000
+  done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Export the model."
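+
+  # Assumption (as in other icefall zipformer recipes): export.py averages the
+  # checkpoints selected by --epoch/--avg and writes zipformer/exp/pretrained.pt,
+  # which stage 3 below uses as the starting point for fine-tuning
+  # (--finetune-ckpt).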
+
+  python ./zipformer/export.py \
+    --epoch 15 \
+    --avg 2 \
+    --exp-dir ./zipformer/exp \
+    --tokens data/lang_partial_tone/tokens.txt \
+    --causal 1 \
+    --chunk-size 16 \
+    --left-context-frames 64 \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128
+
+  python ./zipformer/export_onnx_streaming.py \
+    --exp-dir zipformer/exp \
+    --tokens data/lang_partial_tone/tokens.txt \
+    --epoch 15 \
+    --avg 2 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128 \
+    --causal 1
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Finetune the model."
+
+  # The following learning rate schedule should work well; you may also tune
+  # these parameters to adjust it. With lr_epochs and lr_batches this large,
+  # the Eden scheduler keeps the learning rate close to base_lr throughout,
+  # i.e. almost constant, which is usually what you want for fine-tuning.
+  base_lr=0.0005
+  lr_epochs=100
+  lr_batches=100000
+
+  # We recommend starting the fine-tuning from an averaged model.
+  finetune_ckpt=zipformer/exp/pretrained.pt
+
+  python ./zipformer/finetune.py \
+    --world-size 4 \
+    --num-epochs 10 \
+    --start-epoch 1 \
+    --exp-dir zipformer/exp_finetune \
+    --lang-dir ./data/lang_partial_tone \
+    --pinyin-type partial_with_tone \
+    --use-fp16 1 \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128 \
+    --causal 1 \
+    --base-lr $base_lr \
+    --lr-epochs $lr_epochs \
+    --lr-batches $lr_batches \
+    --finetune-ckpt $finetune_ckpt \
+    --max-duration 1500
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Decode the finetuned model."
+  # Fine-tuning above runs for 10 epochs, so decode the epoch-10 checkpoint.
+  for t in small large; do
+    python ./zipformer/decode.py \
+      --epoch 10 \
+      --avg 2 \
+      --exp-dir ./zipformer/exp_finetune \
+      --lang-dir ./data/lang_partial_tone \
+      --pinyin-type partial_with_tone \
+      --causal 1 \
+      --chunk-size 16 \
+      --left-context-frames 64 \
+      --decoder-dim 320 \
+      --joiner-dim 320 \
+      --num-encoder-layers 1,1,1,1,1,1 \
+      --feedforward-dim 192,192,192,192,192,192 \
+      --encoder-dim 128,128,128,128,128,128 \
+      --encoder-unmasked-dim 128,128,128,128,128,128 \
+      --test-set $t \
+      --keywords-score 1.0 \
+      --keywords-threshold 0.35 \
+      --keywords-file ./data/commands_${t}.txt \
+      --max-duration 3000
+  done
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Export the finetuned model."
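+
+  # Assumption (as in other icefall streaming zipformer recipes): the export
+  # below writes streaming encoder/decoder/joiner *.onnx files into
+  # zipformer/exp_finetune. These can be deployed with, e.g., the sherpa-onnx
+  # keyword spotter; a hypothetical invocation (exact file names depend on what
+  # export_onnx_streaming.py actually writes):
+  #
+  #   sherpa-onnx-keyword-spotter \
+  #     --encoder=zipformer/exp_finetune/encoder-epoch-10-avg-2.onnx \
+  #     --decoder=zipformer/exp_finetune/decoder-epoch-10-avg-2.onnx \
+  #     --joiner=zipformer/exp_finetune/joiner-epoch-10-avg-2.onnx \
+  #     --tokens=data/lang_partial_tone/tokens.txt \
+  #     --keywords-file=data/commands_small.txt \
+  #     test.wav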
+
+  python ./zipformer/export_onnx_streaming.py \
+    --exp-dir zipformer/exp_finetune \
+    --tokens data/lang_partial_tone/tokens.txt \
+    --epoch 10 \
+    --avg 2 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --decoder-dim 320 \
+    --joiner-dim 320 \
+    --num-encoder-layers 1,1,1,1,1,1 \
+    --feedforward-dim 192,192,192,192,192,192 \
+    --encoder-dim 128,128,128,128,128,128 \
+    --encoder-unmasked-dim 128,128,128,128,128,128 \
+    --causal 1
+fi

diff --git a/egs/wenetspeech/KWS/zipformer/finetune.py b/egs/wenetspeech/KWS/zipformer/finetune.py
index 7456c60dc..6f34989e2 100755
--- a/egs/wenetspeech/KWS/zipformer/finetune.py
+++ b/egs/wenetspeech/KWS/zipformer/finetune.py
@@ -185,9 +185,9 @@ def get_parser():
         default="partial_with_tone",
         help="""
         The style of the output pinyin, should be:
-          full_with_tone : zhong1 guo2
+          full_with_tone : zhōng guó
           full_no_tone : zhong guo
-          partial_with_tone : zh ong1 g uo2
+          partial_with_tone : zh ōng g uó
           partial_no_tone : zh ong g uo
         """,
     )
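
Note on the help-text fix above: the four pinyin styles can be reproduced with
pypinyin, which is assumed here to back the recipe's pinyin conversion. A
minimal sketch for 中国 (the INITIALS/FINALS_TONE pair corresponds to the
"partial" styles):

    # Sketch only; assumes the pypinyin package is installed.
    from pypinyin import lazy_pinyin, Style

    print(lazy_pinyin("中国", style=Style.TONE))    # ['zhōng', 'guó']
    print(lazy_pinyin("中国", style=Style.NORMAL))  # ['zhong', 'guo']
    print(lazy_pinyin("中国", style=Style.INITIALS, strict=False))     # ['zh', 'g']
    print(lazy_pinyin("中国", style=Style.FINALS_TONE, strict=False))  # ['ōng', 'uó']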