Merge remote-tracking branch 'k2-fsa/master'

2025-12-11 06:55:27 +00:00 · 2023-02-03 15:06:54 +08:00 · 2023-02-03 15:06:54 +08:00 · dd0047e605
commit dd0047e605
parent 3b142dbef3 bffce413f0
876 changed files with 89314 additions and 6770 deletions
--- a/.flake8
+++ b/.flake8
@ -1,7 +1,7 @@
 [flake8]
 show-source=true
 statistics=true
-max-line-length = 80
+max-line-length = 88
 per-file-ignores =
    # line too long
    icefall/diagnostics.py: E501,
@ -11,7 +11,8 @@ per-file-ignores =
    egs/*/ASR/*/scaling.py: E501,
    egs/librispeech/ASR/lstm_transducer_stateless*/*.py: E501, E203
    egs/librispeech/ASR/conv_emformer_transducer_stateless*/*.py: E501, E203
-    egs/librispeech/ASR/conformer_ctc2/*py: E501,
+    egs/librispeech/ASR/conformer_ctc*/*py: E501,
    egs/librispeech/ASR/zipformer_mmi/*.py: E501, E203
    egs/librispeech/ASR/RESULTS.md: E999,
    # invalid escape sequence (caused by TeX formulas), W605
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@ -0,0 +1,3 @@
 # Migrate to 88 characters per line (see: https://github.com/lhotse-speech/lhotse/issues/890)
 107df3b115a58f1b68a6458c3f94a130004be34c
 d31db010371a4128856480382876acdc0d1739ed
--- a/.github/scripts/run-librispeech-conformer-ctc3-2022-11-28.sh
+++ b/.github/scripts/run-librispeech-conformer-ctc3-2022-11-28.sh
@ -0,0 +1,123 @@
 #!/usr/bin/env bash
 set -e
 log() {
   # Print "<date time> (<caller file>:<caller line>:<caller function>) <message>".
   # Borrowed from espnet. The text is emitted with `echo -e`, so backslash
   # escapes in the message are interpreted.
   local -r caller_file="${BASH_SOURCE[1]##*/}"
   local -r caller_line="${BASH_LINENO[0]}"
   local -r caller_func="${FUNCNAME[1]}"
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # All recipe paths below are relative to the LibriSpeech ASR directory.
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conformer-ctc3-2022-11-27
 log "Downloading pre-trained model from $repo_url"
 # GIT_LFS_SKIP_SMUDGE=1 clones without downloading LFS payloads; the files
 # actually needed are pulled one by one below.
 GIT_LFS_SKIP_SMUDGE=1 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo"/
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 pushd "$repo"/exp
 for lfs_file in \
   data/lang_bpe_500/HLG.pt \
   data/lang_bpe_500/L.pt \
   data/lang_bpe_500/LG.pt \
   data/lang_bpe_500/Linv.pt \
   data/lang_bpe_500/bpe.model \
   data/lm/G_4_gram.pt \
   exp/jit_trace.pt \
   exp/pretrained.pt
 do
   git lfs pull --include "$lfs_file"
 done
 # export.py is later run on this directory with "--epoch 99".
 ln -s pretrained.pt epoch-99.pt
 ls -lh *.pt
 popd
 # Test utterances decoded by every run in this section.
 test_wavs=(
   "$repo/test_wavs/1089-134686-0001.wav"
   "$repo/test_wavs/1221-135766-0001.wav"
   "$repo/test_wavs/1221-135766-0002.wav"
 )
 # Lexicon/graph/LM files passed to every decoding script in this section.
 lang_args=(
   --words-file "$repo/data/lang_bpe_500/words.txt"
   --HLG "$repo/data/lang_bpe_500/HLG.pt"
   --bpe-model "$repo/data/lang_bpe_500/bpe.model"
   --G "$repo/data/lm/G_4_gram.pt"
 )
 # Runs jit_pretrained.py on exp/jit_trace.pt once per decoding method.
 # It is called before and after export.py is run.
 decode_with_jit_trace() {
   local decoding
   for decoding in ctc-decoding 1best; do
     ./conformer_ctc3/jit_pretrained.py \
       --model-filename "$repo/exp/jit_trace.pt" \
       "${lang_args[@]}" \
       --method "$decoding" \
       --sample-rate 16000 \
       "${test_wavs[@]}"
   done
 }
 log "Decode with models exported by torch.jit.trace()"
 decode_with_jit_trace
 log "Export to torchscript model"
 ./conformer_ctc3/export.py \
   --exp-dir "$repo/exp" \
   --lang-dir "$repo/data/lang_bpe_500" \
   --jit-trace 1 \
   --epoch 99 \
   --avg 1 \
   --use-averaged-model 0
 ls -lh "$repo"/exp/*.pt
 log "Decode with models exported by torch.jit.trace()"
 decode_with_jit_trace
 # Same decoding, but loading the plain checkpoint instead of the traced model.
 for decoding in ctc-decoding 1best; do
   ./conformer_ctc3/pretrained.py \
     --checkpoint "$repo/exp/pretrained.pt" \
     "${lang_args[@]}" \
     --method "$decoding" \
     --sample-rate 16000 \
     "${test_wavs[@]}"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 # Decode test-clean and test-other only for scheduled runs or when the
 # "run-decode" label is set.
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
   mkdir -p conformer_ctc3/exp
   # Quoted so that a workspace path containing whitespace stays one argument.
   ln -s "$PWD/$repo/exp/pretrained.pt" conformer_ctc3/exp/epoch-999.pt
   ln -s "$PWD/$repo/data/lang_bpe_500" data/
   ls -lh data
   ls -lh conformer_ctc3/exp
   log "Decoding test-clean and test-other"
   # use a small value for decoding with CPU
   max_duration=100
   for method in ctc-decoding 1best; do
     log "Decoding with $method"
     ./conformer_ctc3/decode.py \
       --epoch 999 \
       --avg 1 \
       --use-averaged-model 0 \
       --exp-dir conformer_ctc3/exp/ \
       --max-duration "$max_duration" \
       --decoding-method "$method" \
       --lm-dir data/lm
   done
   # Remove the checkpoint symlink created above.
   rm conformer_ctc3/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-conv-emformer-transducer-stateless2-2022-12-05.sh
+++ b/.github/scripts/run-librispeech-conv-emformer-transducer-stateless2-2022-12-05.sh
@ -0,0 +1,79 @@
 #!/usr/bin/env bash
 #
 set -e
 log() {
   # Print "<date time> (<caller file>:<caller line>:<caller function>) <message>".
   # Borrowed from espnet. The text is emitted with `echo -e`, so backslash
   # escapes in the message are interpreted.
   local -r caller_file="${BASH_SOURCE[1]##*/}"
   local -r caller_line="${BASH_LINENO[0]}"
   local -r caller_func="${FUNCNAME[1]}"
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # All recipe paths below are relative to the LibriSpeech ASR directory.
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
 log "Downloading pre-trained model from $repo_url"
 # Clone without LFS payloads, then pull just the two files used below.
 GIT_LFS_SKIP_SMUDGE=1 git clone "$repo_url"
 repo=$(basename "$repo_url")
 pushd "$repo"
 for lfs_file in \
   exp/pretrained-epoch-30-avg-10-averaged.pt \
   data/lang_bpe_500/bpe.model
 do
   git lfs pull --include "$lfs_file"
 done
 # Relative link inside exp/; export-for-ncnn.py is later run with "--epoch 99".
 ln -s pretrained-epoch-30-avg-10-averaged.pt exp/epoch-99.pt
 popd
 log "Display test files"
 tree "$repo"/
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 log  "Install ncnn and pnnx"
 # We are using a modified ncnn here. Will try to merge it to the official repo
 # of ncnn
 git clone https://github.com/csukuangfj/ncnn
 pushd ncnn
 git submodule init
 git submodule update python/pybind11
 # Build and install the ncnn Python wheel.
 python3 setup.py bdist_wheel
 ls -lh dist/
 pip install dist/*.whl
 # Interpreter handed to CMake for the pnnx build. Keep the path this script
 # has always used when it exists; otherwise fall back to the python3 found on
 # PATH (the one that just built the wheel) rather than passing CMake a path
 # that does not exist.
 pnnx_python=/opt/hostedtoolcache/Python/3.8.14/x64/bin/python3
 if [[ ! -x "$pnnx_python" ]]; then
   pnnx_python=$(command -v python3)
 fi
 cd tools/pnnx
 mkdir build
 cd build
 cmake -D Python3_EXECUTABLE="$pnnx_python" ..
 make -j4 pnnx
 # Smoke-run the binary; a non-zero exit here must not abort the script
 # under `set -e`.
 ./src/pnnx || echo "pass"
 popd
 log "Test exporting to pnnx format"
 # Checkpoint selection first, then the model hyper-parameters.
 ./conv_emformer_transducer_stateless2/export-for-ncnn.py \
   --exp-dir "$repo/exp" \
   --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
   --epoch 99 \
   --avg 1 \
   --use-averaged-model 0 \
   --num-encoder-layers 12 \
   --chunk-length 32 \
   --cnn-module-kernel 31 \
   --left-context-length 32 \
   --right-context-length 8 \
   --memory-size 32
 # Run pnnx on each traced sub-model; the decode step below reads the
 # resulting *.ncnn.param / *.ncnn.bin files.
 for part in encoder decoder joiner; do
   ./ncnn/tools/pnnx/build/src/pnnx "$repo/exp/${part}_jit_trace-pnnx.pt"
 done
 ./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
   --tokens "$repo/data/lang_bpe_500/tokens.txt" \
   --encoder-param-filename "$repo/exp/encoder_jit_trace-pnnx.ncnn.param" \
   --encoder-bin-filename "$repo/exp/encoder_jit_trace-pnnx.ncnn.bin" \
   --decoder-param-filename "$repo/exp/decoder_jit_trace-pnnx.ncnn.param" \
   --decoder-bin-filename "$repo/exp/decoder_jit_trace-pnnx.ncnn.bin" \
   --joiner-param-filename "$repo/exp/joiner_jit_trace-pnnx.ncnn.param" \
   --joiner-bin-filename "$repo/exp/joiner_jit_trace-pnnx.ncnn.bin" \
   "$repo/test_wavs/1089-134686-0001.wav"
--- a/.github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
+++ b/.github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
@ -16,6 +16,7 @@ log "Downloading pre-trained model from $repo_url"
 # Set up Git LFS for this user, then clone the pre-trained model repository.
 git lfs install
 git clone $repo_url
 # Directory name of the clone (last path component of the URL).
 repo=$(basename $repo_url)
 # Absolute path to the clone, so it can still be referenced after changing
 # into another directory (see the bi-gram LM copy further down).
 abs_repo=$(realpath $repo)
 log "Display test files"
 tree $repo/
@ -174,6 +175,92 @@ done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 # RNN-LM shallow-fusion decoding; runs only for the "shallow-fusion" label.
 if [[ "${GITHUB_EVENT_LABEL_NAME}" == "shallow-fusion" ]]; then
   lm_repo_url=https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm
   log "Download pre-trained RNN-LM model from ${lm_repo_url}"
   GIT_LFS_SKIP_SMUDGE=1 git clone "$lm_repo_url"
   lm_repo=$(basename "$lm_repo_url")
   pushd "$lm_repo"
   git lfs pull --include "exp/pretrained.pt"
   # Renamed to match "--lm-epoch 88" below (presumably looked up as
   # epoch-<N>.pt inside --lm-exp-dir; confirm against decode.py).
   mv exp/pretrained.pt exp/epoch-88.pt
   popd
   mkdir -p lstm_transducer_stateless2/exp
   # Quoted so that a workspace path containing whitespace stays one argument.
   ln -sf "$PWD/$repo/exp/pretrained.pt" lstm_transducer_stateless2/exp/epoch-999.pt
   ln -s "$PWD/$repo/data/lang_bpe_500" data/
   ls -lh data
   ls -lh lstm_transducer_stateless2/exp
   log "Decoding test-clean and test-other with RNN LM"
   ./lstm_transducer_stateless2/decode.py \
     --use-averaged-model 0 \
     --epoch 999 \
     --avg 1 \
     --exp-dir lstm_transducer_stateless2/exp \
     --max-duration 600 \
     --decoding-method modified_beam_search_lm_shallow_fusion \
     --beam 4 \
     --use-shallow-fusion 1 \
     --lm-type rnn \
     --lm-exp-dir "$lm_repo/exp" \
     --lm-epoch 88 \
     --lm-avg 1 \
     --lm-scale 0.3 \
     --rnn-lm-num-layers 3 \
     --rnn-lm-tie-weights 1
 fi
 # LODR decoding (RNN LM plus bi-gram LM); runs only for the "LODR" label.
 if [[ "${GITHUB_EVENT_LABEL_NAME}" == "LODR" ]]; then
   bigram_repo_url=https://huggingface.co/marcoyang/librispeech_bigram
   log "Download bi-gram LM from ${bigram_repo_url}"
   GIT_LFS_SKIP_SMUDGE=1 git clone "$bigram_repo_url"
   bigramlm_repo=$(basename "$bigram_repo_url")
   pushd "$bigramlm_repo"
   git lfs pull --include "2gram.fst.txt"
   # abs_repo is absolute, so the copy works from inside the bi-gram clone.
   cp 2gram.fst.txt "$abs_repo/data/lang_bpe_500/."
   popd
   lm_repo_url=https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm
   log "Download pre-trained RNN-LM model from ${lm_repo_url}"
   GIT_LFS_SKIP_SMUDGE=1 git clone "$lm_repo_url"
   lm_repo=$(basename "$lm_repo_url")
   pushd "$lm_repo"
   git lfs pull --include "exp/pretrained.pt"
   # Renamed to match "--lm-epoch 88" below (presumably looked up as
   # epoch-<N>.pt inside --lm-exp-dir; confirm against decode.py).
   mv exp/pretrained.pt exp/epoch-88.pt
   popd
   mkdir -p lstm_transducer_stateless2/exp
   # Quoted so that a workspace path containing whitespace stays one argument.
   ln -sf "$PWD/$repo/exp/pretrained.pt" lstm_transducer_stateless2/exp/epoch-999.pt
   ln -s "$PWD/$repo/data/lang_bpe_500" data/
   ls -lh data
   ls -lh lstm_transducer_stateless2/exp
   log "Decoding test-clean and test-other"
   # NOTE(review): the checkpoint-averaging flag is spelled --lm-avg here to
   # match --lm-epoch/--lm-exp-dir and the shallow-fusion run of this same
   # decode.py; the previous --rnn-lm-avg spelling did not follow that naming.
   ./lstm_transducer_stateless2/decode.py \
     --use-averaged-model 0 \
     --epoch 999 \
     --avg 1 \
     --exp-dir lstm_transducer_stateless2/exp \
     --max-duration 600 \
     --decoding-method modified_beam_search_LODR \
     --beam 4 \
     --use-shallow-fusion 1 \
     --lm-type rnn \
     --lm-exp-dir "$lm_repo/exp" \
     --lm-scale 0.4 \
     --lm-epoch 88 \
     --lm-avg 1 \
     --rnn-lm-num-layers 3 \
     --rnn-lm-tie-weights 1 \
     --tokens-ngram 2 \
     --ngram-lm-scale -0.16
 fi
 if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then
  mkdir -p lstm_transducer_stateless2/exp
  ln -s $PWD/$repo/exp/pretrained.pt lstm_transducer_stateless2/exp/epoch-999.pt
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
@ -83,4 +83,5 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" ==
  done
  rm pruned_transducer_stateless2/exp/*.pt
  rm -r data/lang_bpe_500
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
@ -82,4 +82,5 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" ==
  done
  rm pruned_transducer_stateless3/exp/*.pt
  rm -r data/lang_bpe_500
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-2022-11-11.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-2022-11-11.sh
@ -0,0 +1,137 @@
 #!/usr/bin/env bash
 set -e
 log() {
   # Print "<date time> (<caller file>:<caller line>:<caller function>) <message>".
   # Borrowed from espnet. The text is emitted with `echo -e`, so backslash
   # escapes in the message are interpreted.
   local -r caller_file="${BASH_SOURCE[1]##*/}"
   local -r caller_line="${BASH_LINENO[0]}"
   local -r caller_func="${FUNCNAME[1]}"
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # All recipe paths below are relative to the LibriSpeech ASR directory.
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 # Clone without LFS payloads; the needed files are pulled individually below.
 GIT_LFS_SKIP_SMUDGE=1 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo"/
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 pushd "$repo"/exp
 lfs_files=(
   data/lang_bpe_500/bpe.model
   exp/cpu_jit.pt
   exp/pretrained.pt
 )
 for lfs_file in "${lfs_files[@]}"; do
   git lfs pull --include "$lfs_file"
 done
 # export.py is later run on this directory with "--epoch 99".
 ln -s pretrained.pt epoch-99.pt
 ls -lh *.pt
 popd
 # Paths and argument groups reused throughout this section.
 bpe_model="$repo/data/lang_bpe_500/bpe.model"
 test_wavs=(
   "$repo/test_wavs/1089-134686-0001.wav"
   "$repo/test_wavs/1221-135766-0001.wav"
   "$repo/test_wavs/1221-135766-0002.wav"
 )
 # Arguments shared by both export.py invocations; only the format flag differs.
 export_args=(
   --exp-dir "$repo/exp"
   --use-averaged-model false
   --bpe-model "$bpe_model"
   --epoch 99
   --avg 1
 )
 log "Test exporting to ONNX format"
 ./pruned_transducer_stateless7/export.py "${export_args[@]}" --onnx 1
 log "Export to torchscript model"
 ./pruned_transducer_stateless7/export.py "${export_args[@]}" --jit 1
 ls -lh "$repo"/exp/*.pt
 log "Decode with ONNX models"
 # onnx_check.py receives both the TorchScript model and the ONNX files.
 ./pruned_transducer_stateless7/onnx_check.py \
   --jit-filename "$repo/exp/cpu_jit.pt" \
   --onnx-encoder-filename "$repo/exp/encoder.onnx" \
   --onnx-decoder-filename "$repo/exp/decoder.onnx" \
   --onnx-joiner-filename "$repo/exp/joiner.onnx" \
   --onnx-joiner-encoder-proj-filename "$repo/exp/joiner_encoder_proj.onnx" \
   --onnx-joiner-decoder-proj-filename "$repo/exp/joiner_decoder_proj.onnx"
 ./pruned_transducer_stateless7/onnx_pretrained.py \
   --bpe-model "$bpe_model" \
   --encoder-model-filename "$repo/exp/encoder.onnx" \
   --decoder-model-filename "$repo/exp/decoder.onnx" \
   --joiner-model-filename "$repo/exp/joiner.onnx" \
   --joiner-encoder-proj-model-filename "$repo/exp/joiner_encoder_proj.onnx" \
   --joiner-decoder-proj-model-filename "$repo/exp/joiner_decoder_proj.onnx" \
   "${test_wavs[@]}"
 log "Decode with models exported by torch.jit.script()"
 ./pruned_transducer_stateless7/jit_pretrained.py \
   --bpe-model "$bpe_model" \
   --nn-model-filename "$repo/exp/cpu_jit.pt" \
   "${test_wavs[@]}"
 # Greedy search with 1, 2 and 3 symbols per frame.
 for sym in {1..3}; do
   log "Greedy search with --max-sym-per-frame $sym"
   ./pruned_transducer_stateless7/pretrained.py \
     --method greedy_search \
     --max-sym-per-frame "$sym" \
     --checkpoint "$repo/exp/pretrained.pt" \
     --bpe-model "$bpe_model" \
     "${test_wavs[@]}"
 done
 # Beam-search variants, all with beam size 4.
 for method in modified_beam_search beam_search fast_beam_search; do
   log "$method"
   ./pruned_transducer_stateless7/pretrained.py \
     --method "$method" \
     --beam-size 4 \
     --checkpoint "$repo/exp/pretrained.pt" \
     --bpe-model "$bpe_model" \
     "${test_wavs[@]}"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 # Decode test-clean and test-other only for scheduled runs or when the
 # "run-decode" label is set.
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
   mkdir -p pruned_transducer_stateless7/exp
   # Quoted so that a workspace path containing whitespace stays one argument.
   ln -s "$PWD/$repo/exp/pretrained.pt" pruned_transducer_stateless7/exp/epoch-999.pt
   ln -s "$PWD/$repo/data/lang_bpe_500" data/
   ls -lh data
   ls -lh pruned_transducer_stateless7/exp
   log "Decoding test-clean and test-other"
   # use a small value for decoding with CPU
   max_duration=100
   for method in greedy_search fast_beam_search modified_beam_search; do
     log "Decoding with $method"
     ./pruned_transducer_stateless7/decode.py \
       --decoding-method "$method" \
       --epoch 999 \
       --avg 1 \
       --use-averaged-model 0 \
       --max-duration "$max_duration" \
       --exp-dir pruned_transducer_stateless7/exp
   done
   # Remove the checkpoint symlink created above.
   rm pruned_transducer_stateless7/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-2022-12-01.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-2022-12-01.sh
@ -0,0 +1,151 @@
 #!/usr/bin/env bash
 set -e
 log() {
   # Print "<date time> (<caller file>:<caller line>:<caller function>) <message>".
   # Borrowed from espnet. The text is emitted with `echo -e`, so backslash
   # escapes in the message are interpreted.
   local -r caller_file="${BASH_SOURCE[1]##*/}"
   local -r caller_line="${BASH_LINENO[0]}"
   local -r caller_func="${FUNCNAME[1]}"
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # All recipe paths below are relative to the LibriSpeech ASR directory.
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01
 log "Downloading pre-trained model from $repo_url"
 # Clone without LFS payloads; the needed files are pulled individually below.
 GIT_LFS_SKIP_SMUDGE=1 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo"/
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 pushd "$repo"/exp
 lfs_files=(
   data/lang_bpe_500/HLG.pt
   data/lang_bpe_500/L.pt
   data/lang_bpe_500/LG.pt
   data/lang_bpe_500/Linv.pt
   data/lang_bpe_500/bpe.model
   data/lm/G_4_gram.pt
   exp/cpu_jit.pt
   exp/pretrained.pt
 )
 for lfs_file in "${lfs_files[@]}"; do
   git lfs pull --include "$lfs_file"
 done
 # export.py is later run on this directory with "--epoch 99".
 ln -s pretrained.pt epoch-99.pt
 ls -lh *.pt
 popd
 # Paths and argument groups reused throughout this section.
 bpe_model="$repo/data/lang_bpe_500/bpe.model"
 test_wavs=(
   "$repo/test_wavs/1089-134686-0001.wav"
   "$repo/test_wavs/1221-135766-0001.wav"
   "$repo/test_wavs/1221-135766-0002.wav"
 )
 # Lexicon/graph/LM files passed to both CTC decoding scripts.
 ctc_lang_args=(
   --words-file "$repo/data/lang_bpe_500/words.txt"
   --HLG "$repo/data/lang_bpe_500/HLG.pt"
   --bpe-model "$bpe_model"
   --G "$repo/data/lm/G_4_gram.pt"
 )
 log "Export to torchscript model"
 ./pruned_transducer_stateless7_ctc/export.py \
   --exp-dir "$repo/exp" \
   --use-averaged-model false \
   --bpe-model "$bpe_model" \
   --epoch 99 \
   --avg 1 \
   --jit 1
 ls -lh "$repo"/exp/*.pt
 log "Decode with models exported by torch.jit.script()"
 ./pruned_transducer_stateless7_ctc/jit_pretrained.py \
   --bpe-model "$bpe_model" \
   --nn-model-filename "$repo/exp/cpu_jit.pt" \
   "${test_wavs[@]}"
 # CTC decoding with the TorchScript model.
 for ctc_method in ctc-decoding 1best; do
   ./pruned_transducer_stateless7_ctc/jit_pretrained_ctc.py \
     --model-filename "$repo/exp/cpu_jit.pt" \
     "${ctc_lang_args[@]}" \
     --method "$ctc_method" \
     --sample-rate 16000 \
     "${test_wavs[@]}"
 done
 # Greedy search with 1, 2 and 3 symbols per frame.
 for sym in {1..3}; do
   log "Greedy search with --max-sym-per-frame $sym"
   ./pruned_transducer_stateless7_ctc/pretrained.py \
     --method greedy_search \
     --max-sym-per-frame "$sym" \
     --checkpoint "$repo/exp/pretrained.pt" \
     --bpe-model "$bpe_model" \
     "${test_wavs[@]}"
 done
 # Beam-search variants, all with beam size 4.
 for method in modified_beam_search beam_search fast_beam_search; do
   log "$method"
   ./pruned_transducer_stateless7_ctc/pretrained.py \
     --method "$method" \
     --beam-size 4 \
     --checkpoint "$repo/exp/pretrained.pt" \
     --bpe-model "$bpe_model" \
     "${test_wavs[@]}"
 done
 # CTC decoding with the plain checkpoint.
 for ctc_method in ctc-decoding 1best; do
   ./pruned_transducer_stateless7_ctc/pretrained_ctc.py \
     --checkpoint "$repo/exp/pretrained.pt" \
     "${ctc_lang_args[@]}" \
     --method "$ctc_method" \
     --sample-rate 16000 \
     "${test_wavs[@]}"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 # Decode test-clean and test-other only for scheduled runs or when the
 # "run-decode" label is set.
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
   mkdir -p pruned_transducer_stateless7_ctc/exp
   # Quoted so that a workspace path containing whitespace stays one argument.
   ln -s "$PWD/$repo/exp/pretrained.pt" pruned_transducer_stateless7_ctc/exp/epoch-999.pt
   ln -s "$PWD/$repo/data/lang_bpe_500" data/
   ls -lh data
   ls -lh pruned_transducer_stateless7_ctc/exp
   log "Decoding test-clean and test-other"
   # use a small value for decoding with CPU
   max_duration=100
   # Transducer decoding.
   for method in greedy_search fast_beam_search modified_beam_search; do
     log "Decoding with $method"
     ./pruned_transducer_stateless7_ctc/decode.py \
       --decoding-method "$method" \
       --epoch 999 \
       --avg 1 \
       --use-averaged-model 0 \
       --max-duration "$max_duration" \
       --exp-dir pruned_transducer_stateless7_ctc/exp
   done
   # CTC decoding.
   for m in ctc-decoding 1best; do
     ./pruned_transducer_stateless7_ctc/ctc_decode.py \
       --epoch 999 \
       --avg 1 \
       --exp-dir ./pruned_transducer_stateless7_ctc/exp \
       --max-duration "$max_duration" \
       --use-averaged-model 0 \
       --decoding-method "$m" \
       --hlg-scale 0.6 \
       --lm-dir data/lm
   done
   # Remove the checkpoint symlink created above.
   rm pruned_transducer_stateless7_ctc/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh
@ -0,0 +1,148 @@
 #!/usr/bin/env bash
 set -e
 log() {
   # Print "<date time> (<caller file>:<caller line>:<caller function>) <message>".
   # Borrowed from espnet. The text is emitted with `echo -e`, so backslash
   # escapes in the message are interpreted.
   local -r caller_file="${BASH_SOURCE[1]##*/}"
   local -r caller_line="${BASH_LINENO[0]}"
   local -r caller_func="${FUNCNAME[1]}"
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # All recipe paths below are relative to the LibriSpeech ASR directory.
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14
 log "Downloading pre-trained model from $repo_url"
 # Clone without LFS payloads; the needed files are pulled individually below.
 GIT_LFS_SKIP_SMUDGE=1 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo"/
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 pushd "$repo"/exp
 lfs_files=(
   data/lang_bpe_500/HLG.pt
   data/lang_bpe_500/L.pt
   data/lang_bpe_500/LG.pt
   data/lang_bpe_500/Linv.pt
   data/lang_bpe_500/bpe.model
   exp/cpu_jit.pt
   exp/pretrained.pt
 )
 for lfs_file in "${lfs_files[@]}"; do
   git lfs pull --include "$lfs_file"
 done
 # export.py is later run on this directory with "--epoch 99".
 ln -s pretrained.pt epoch-99.pt
 ls -lh *.pt
 popd
 # Paths and argument groups reused throughout this section.
 bpe_model="$repo/data/lang_bpe_500/bpe.model"
 test_wavs=(
   "$repo/test_wavs/1089-134686-0001.wav"
   "$repo/test_wavs/1221-135766-0001.wav"
   "$repo/test_wavs/1221-135766-0002.wav"
 )
 # Lexicon/graph files passed to both CTC decoding scripts.
 ctc_lang_args=(
   --words-file "$repo/data/lang_bpe_500/words.txt"
   --HLG "$repo/data/lang_bpe_500/HLG.pt"
   --bpe-model "$bpe_model"
 )
 log "Export to torchscript model"
 ./pruned_transducer_stateless7_ctc_bs/export.py \
   --exp-dir "$repo/exp" \
   --use-averaged-model false \
   --bpe-model "$bpe_model" \
   --epoch 99 \
   --avg 1 \
   --jit 1
 ls -lh "$repo"/exp/*.pt
 log "Decode with models exported by torch.jit.script()"
 ./pruned_transducer_stateless7_ctc_bs/jit_pretrained.py \
   --bpe-model "$bpe_model" \
   --nn-model-filename "$repo/exp/cpu_jit.pt" \
   "${test_wavs[@]}"
 # CTC decoding with the TorchScript model.
 for ctc_method in ctc-decoding 1best; do
   ./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py \
     --model-filename "$repo/exp/cpu_jit.pt" \
     "${ctc_lang_args[@]}" \
     --method "$ctc_method" \
     --sample-rate 16000 \
     "${test_wavs[@]}"
 done
 # Greedy search with 1, 2 and 3 symbols per frame.
 for sym in {1..3}; do
   log "Greedy search with --max-sym-per-frame $sym"
   ./pruned_transducer_stateless7_ctc_bs/pretrained.py \
     --method greedy_search \
     --max-sym-per-frame "$sym" \
     --checkpoint "$repo/exp/pretrained.pt" \
     --bpe-model "$bpe_model" \
     "${test_wavs[@]}"
 done
 # Beam-search variants, all with beam size 4.
 for method in modified_beam_search beam_search fast_beam_search; do
   log "$method"
   ./pruned_transducer_stateless7_ctc_bs/pretrained.py \
     --method "$method" \
     --beam-size 4 \
     --checkpoint "$repo/exp/pretrained.pt" \
     --bpe-model "$bpe_model" \
     "${test_wavs[@]}"
 done
 # CTC decoding with the plain checkpoint.
 for ctc_method in ctc-decoding 1best; do
   ./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py \
     --checkpoint "$repo/exp/pretrained.pt" \
     "${ctc_lang_args[@]}" \
     --method "$ctc_method" \
     --sample-rate 16000 \
     "${test_wavs[@]}"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 # Decode test-clean and test-other only for scheduled runs or when the
 # "run-decode" label is set.
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
   mkdir -p pruned_transducer_stateless7_ctc_bs/exp
   # Quoted so that a workspace path containing whitespace stays one argument.
   ln -s "$PWD/$repo/exp/pretrained.pt" pruned_transducer_stateless7_ctc_bs/exp/epoch-999.pt
   ln -s "$PWD/$repo/data/lang_bpe_500" data/
   ls -lh data
   ls -lh pruned_transducer_stateless7_ctc_bs/exp
   log "Decoding test-clean and test-other"
   # use a small value for decoding with CPU
   max_duration=100
   # Transducer decoding.
   for method in greedy_search fast_beam_search modified_beam_search; do
     log "Decoding with $method"
     ./pruned_transducer_stateless7_ctc_bs/decode.py \
       --decoding-method "$method" \
       --epoch 999 \
       --avg 1 \
       --use-averaged-model 0 \
       --max-duration "$max_duration" \
       --exp-dir pruned_transducer_stateless7_ctc_bs/exp
   done
   # CTC decoding.
   for m in ctc-decoding 1best; do
     ./pruned_transducer_stateless7_ctc_bs/ctc_decode.py \
       --epoch 999 \
       --avg 1 \
       --exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
       --max-duration "$max_duration" \
       --use-averaged-model 0 \
       --decoding-method "$m" \
       --hlg-scale 0.6
   done
   # Remove the checkpoint symlink created above.
   rm pruned_transducer_stateless7_ctc_bs/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
@ -0,0 +1,148 @@
 #!/usr/bin/env bash
 set -e
 log() {
   # Print "<date time> (<caller file>:<caller line>:<caller function>) <message>".
   # Borrowed from espnet. The text is emitted with `echo -e`, so backslash
   # escapes in the message are interpreted.
   local -r caller_file="${BASH_SOURCE[1]##*/}"
   local -r caller_line="${BASH_LINENO[0]}"
   local -r caller_func="${FUNCNAME[1]}"
   echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # All recipe paths below are relative to the LibriSpeech ASR directory.
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 # Clone without LFS payloads; the needed files are pulled individually below.
 GIT_LFS_SKIP_SMUDGE=1 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo"/
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 pushd "$repo"/exp
 lfs_files=(
   data/lang_bpe_500/bpe.model
   exp/cpu_jit.pt
   exp/pretrained.pt
   exp/encoder_jit_trace.pt
   exp/decoder_jit_trace.pt
   exp/joiner_jit_trace.pt
 )
 for lfs_file in "${lfs_files[@]}"; do
   git lfs pull --include "$lfs_file"
 done
 # export.py is later run on this directory with "--epoch 99".
 ln -s pretrained.pt epoch-99.pt
 ls -lh *.pt
 popd
 # Paths and argument groups reused throughout this section.
 bpe_model="$repo/data/lang_bpe_500/bpe.model"
 test_wavs=(
   "$repo/test_wavs/1089-134686-0001.wav"
   "$repo/test_wavs/1221-135766-0001.wav"
   "$repo/test_wavs/1221-135766-0002.wav"
 )
 # Arguments shared by export.py and jit_trace_export.py.
 export_args=(
   --exp-dir "$repo/exp"
   --use-averaged-model false
   --bpe-model "$bpe_model"
   --decode-chunk-len 32
   --epoch 99
   --avg 1
 )
 log "Export to torchscript model"
 ./pruned_transducer_stateless7_streaming/export.py "${export_args[@]}" --jit 1
 ls -lh "$repo"/exp/*.pt
 log "Decode with models exported by torch.jit.script()"
 ./pruned_transducer_stateless7_streaming/jit_pretrained.py \
   --bpe-model "$bpe_model" \
   --nn-model-filename "$repo/exp/cpu_jit.pt" \
   --decode-chunk-len 32 \
   "${test_wavs[@]}"
 log "Export to torchscript model by torch.jit.trace()"
 ./pruned_transducer_stateless7_streaming/jit_trace_export.py "${export_args[@]}"
 log "Decode with models exported by torch.jit.trace()"
 # Only the first test utterance is decoded with the traced models.
 ./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py \
   --bpe-model "$bpe_model" \
   --encoder-model-filename "$repo/exp/encoder_jit_trace.pt" \
   --decoder-model-filename "$repo/exp/decoder_jit_trace.pt" \
   --joiner-model-filename "$repo/exp/joiner_jit_trace.pt" \
   --decode-chunk-len 32 \
   "${test_wavs[0]}"
 # Greedy search with 1, 2 and 3 symbols per frame.
 for sym in {1..3}; do
   log "Greedy search with --max-sym-per-frame $sym"
   ./pruned_transducer_stateless7_streaming/pretrained.py \
     --method greedy_search \
     --max-sym-per-frame "$sym" \
     --checkpoint "$repo/exp/pretrained.pt" \
     --bpe-model "$bpe_model" \
     --decode-chunk-len 32 \
     "${test_wavs[@]}"
 done
 # Beam-search variants, all with beam size 4.
 for method in modified_beam_search beam_search fast_beam_search; do
   log "$method"
   ./pruned_transducer_stateless7_streaming/pretrained.py \
     --method "$method" \
     --beam-size 4 \
     --checkpoint "$repo/exp/pretrained.pt" \
     --bpe-model "$bpe_model" \
     --decode-chunk-len 32 \
     "${test_wavs[@]}"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 # Decode test-clean and test-other only for scheduled runs or when the
 # "run-decode" label is set.
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
   mkdir -p pruned_transducer_stateless7_streaming/exp
   # Quoted so that a workspace path containing whitespace stays one argument.
   ln -s "$PWD/$repo/exp/pretrained.pt" pruned_transducer_stateless7_streaming/exp/epoch-999.pt
   ln -s "$PWD/$repo/data/lang_bpe_500" data/
   ls -lh data
   ls -lh pruned_transducer_stateless7_streaming/exp
   log "Decoding test-clean and test-other"
   # use a small value for decoding with CPU
   max_duration=100
   num_decode_stream=200
   # Decoding with decode.py.
   for method in greedy_search fast_beam_search modified_beam_search; do
     log "decoding with $method"
     ./pruned_transducer_stateless7_streaming/decode.py \
       --decoding-method "$method" \
       --epoch 999 \
       --avg 1 \
       --use-averaged-model 0 \
       --max-duration "$max_duration" \
       --decode-chunk-len 32 \
       --exp-dir pruned_transducer_stateless7_streaming/exp
   done
   # Decoding with streaming_decode.py. Every line of the command except the
   # last must end in a backslash: without it, --exp-dir is run as a separate
   # command and aborts the script under `set -e`.
   for method in greedy_search fast_beam_search modified_beam_search; do
     log "Decoding with $method"
     ./pruned_transducer_stateless7_streaming/streaming_decode.py \
       --decoding-method "$method" \
       --epoch 999 \
       --avg 1 \
       --use-averaged-model 0 \
       --decode-chunk-len 32 \
       --num-decode-streams "$num_decode_stream" \
       --exp-dir pruned_transducer_stateless7_streaming/exp
   done
   # Remove the checkpoint symlink created above.
   rm pruned_transducer_stateless7_streaming/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless8-2022-11-14.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless8-2022-11-14.sh
@ -0,0 +1,116 @@
 #!/usr/bin/env bash
 set -e
 log() {
  # Timestamped logger (taken from espnet). Writes one line to stdout:
  #   <date> <time> (<caller file>:<caller line>:<caller function>) <message>
  # The message is all arguments joined by spaces; echo -e expands backslash
  # escapes in it.
  local caller_file timestamp
  caller_file=${BASH_SOURCE[1]##*/}
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Everything below uses paths relative to the LibriSpeech ASR recipe directory.
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 # Clone with LFS smudging disabled; the required LFS files are pulled one by one below.
 GIT_LFS_SKIP_SMUDGE=1 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 pushd "$repo/exp"
 for lfs_file in \
  "data/lang_bpe_500/bpe.model" \
  "exp/cpu_jit.pt" \
  "exp/pretrained.pt"; do
  git lfs pull --include "$lfs_file"
 done
 # Epoch-style alias for the checkpoint (export.py is later called with --epoch 99).
 ln -s pretrained.pt epoch-99.pt
 ls -lh *.pt
 popd
 # Command line shared by both TorchScript decoding runs in this section.
 jit_decode_cmd=(
  ./pruned_transducer_stateless8/jit_pretrained.py
  --bpe-model "$repo/data/lang_bpe_500/bpe.model"
  --nn-model-filename "$repo/exp/cpu_jit.pt"
  "$repo/test_wavs/1089-134686-0001.wav"
  "$repo/test_wavs/1221-135766-0001.wav"
  "$repo/test_wavs/1221-135766-0002.wav"
 )
 # First run: before export.py is invoked.
 log "Decode with models exported by torch.jit.script()"
 "${jit_decode_cmd[@]}"
 log "Export to torchscript model"
 ./pruned_transducer_stateless8/export.py \
  --exp-dir "$repo/exp" \
  --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
  --use-averaged-model false \
  --epoch 99 \
  --avg 1 \
  --jit 1
 ls -lh "$repo"/exp/*.pt
 # Second run: identical command, after export.py has been run.
 log "Decode with models exported by torch.jit.script()"
 "${jit_decode_cmd[@]}"
 # Arguments common to every pretrained.py invocation in this section:
 # checkpoint, BPE model and the three test utterances.
 pretrained_common_args=(
  --checkpoint "$repo/exp/pretrained.pt"
  --bpe-model "$repo/data/lang_bpe_500/bpe.model"
  "$repo/test_wavs/1089-134686-0001.wav"
  "$repo/test_wavs/1221-135766-0001.wav"
  "$repo/test_wavs/1221-135766-0002.wav"
 )
 # Greedy search with 1, 2 and 3 symbols per frame.
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./pruned_transducer_stateless8/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    "${pretrained_common_args[@]}"
 done
 # Beam-search variants, all with beam size 4.
 for method in modified_beam_search beam_search fast_beam_search; do
  log "$method"
  ./pruned_transducer_stateless8/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    "${pretrained_common_args[@]}"
 done
 # Print the triggering event; the test-set decoding below only runs for
 # scheduled builds or when the "run-decode" label is present.
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  decode_exp_dir=pruned_transducer_stateless8/exp
  mkdir -p "$decode_exp_dir"
  # The downloaded checkpoint is linked as epoch-999.pt to match "--epoch 999 --avg 1".
  ln -s "$PWD/$repo/exp/pretrained.pt" "$decode_exp_dir/epoch-999.pt"
  ln -s "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh "$decode_exp_dir"
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./pruned_transducer_stateless8/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --use-averaged-model 0 \
      --max-duration "$max_duration" \
      --exp-dir "$decode_exp_dir"
  done
  # Drop the checkpoint symlink(s) created above.
  rm "$decode_exp_dir"/*.pt
 fi
--- a/.github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
+++ b/.github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
@ -0,0 +1,103 @@
 #!/usr/bin/env bash
 set -e
 log() {
  # Logger adapted from espnet: prefixes the message with the current date and
  # time plus "(file:line:function)" of the caller, then prints it with echo -e
  # (so backslash escapes in the message are interpreted).
  local origin=${BASH_SOURCE[1]##*/}
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${now} (${origin}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Everything below uses paths relative to the LibriSpeech ASR recipe directory.
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-mmi-2022-12-08
 log "Downloading pre-trained model from $repo_url"
 # Clone with LFS smudging disabled; the required LFS files are pulled one by one below.
 GIT_LFS_SKIP_SMUDGE=1 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 pushd "$repo/exp"
 for lfs_file in \
  "data/lang_bpe_500/3gram.pt" \
  "data/lang_bpe_500/4gram.pt" \
  "data/lang_bpe_500/L.pt" \
  "data/lang_bpe_500/LG.pt" \
  "data/lang_bpe_500/Linv.pt" \
  "data/lang_bpe_500/bpe.model" \
  "exp/cpu_jit.pt" \
  "exp/pretrained.pt"; do
  git lfs pull --include "$lfs_file"
 done
 # Epoch-style alias for the checkpoint (export.py is later called with --epoch 99).
 ln -s pretrained.pt epoch-99.pt
 ls -lh *.pt
 popd
 # Paths shared by the commands in this section.
 mmi_bpe_model="$repo/data/lang_bpe_500/bpe.model"
 mmi_lang_dir="$repo/data/lang_bpe_500"
 mmi_test_wavs=(
  "$repo/test_wavs/1089-134686-0001.wav"
  "$repo/test_wavs/1221-135766-0001.wav"
  "$repo/test_wavs/1221-135766-0002.wav"
 )
 log "Export to torchscript model"
 ./zipformer_mmi/export.py \
  --exp-dir "$repo/exp" \
  --use-averaged-model false \
  --bpe-model "$mmi_bpe_model" \
  --epoch 99 \
  --avg 1 \
  --jit 1
 ls -lh "$repo"/exp/*.pt
 log "Decode with models exported by torch.jit.script()"
 ./zipformer_mmi/jit_pretrained.py \
  --bpe-model "$mmi_bpe_model" \
  --nn-model-filename "$repo/exp/cpu_jit.pt" \
  --lang-dir "$mmi_lang_dir" \
  "${mmi_test_wavs[@]}"
 # Decode the same utterances from the plain checkpoint with each method.
 for method in 1best nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
  log "$method"
  ./zipformer_mmi/pretrained.py \
    --method "$method" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --lang-dir "$mmi_lang_dir" \
    --bpe-model "$mmi_bpe_model" \
    "${mmi_test_wavs[@]}"
 done
 # Print the triggering event; the test-set decoding below only runs for
 # scheduled builds or when the "run-decode" label is present.
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mmi_exp_dir=zipformer_mmi/exp
  mkdir -p "$mmi_exp_dir"
  # The downloaded checkpoint is linked as epoch-999.pt to match "--epoch 999 --avg 1".
  ln -s "$PWD/$repo/exp/pretrained.pt" "$mmi_exp_dir/epoch-999.pt"
  ln -s "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh "$mmi_exp_dir"
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in 1best nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
    log "Decoding with $method"
    ./zipformer_mmi/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --use-averaged-model 0 \
      --nbest-scale 1.2 \
      --hp-scale 1.0 \
      --max-duration "$max_duration" \
      --lang-dir "$repo/data/lang_bpe_500" \
      --exp-dir "$mmi_exp_dir"
  done
  # Drop the checkpoint symlink(s) created above.
  rm "$mmi_exp_dir"/*.pt
 fi
--- a/.github/workflows/build-doc.yml
+++ b/.github/workflows/build-doc.yml
@ -26,6 +26,10 @@ on:
  pull_request:
    types: [labeled]
 concurrency:
  group: build_doc-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  build-doc:
    if: github.event.label.name == 'doc' || github.event_name == 'push'
--- a/.github/workflows/run-aishell-2022-06-20.yml
+++ b/.github/workflows/run-aishell-2022-06-20.yml
@ -34,6 +34,10 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_aishell_2022_06_20-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_aishell_2022_06_20:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
--- a/.github/workflows/run-gigaspeech-2022-05-13.yml
+++ b/.github/workflows/run-gigaspeech-2022-05-13.yml
@ -33,6 +33,10 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_gigaspeech_2022_05_13-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_gigaspeech_2022_05_13:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
--- a/.github/workflows/run-librispeech-2022-03-12.yml
+++ b/.github/workflows/run-librispeech-2022-03-12.yml
@ -33,6 +33,10 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_2022_03_12-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_03_12:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
--- a/.github/workflows/run-librispeech-2022-04-29.yml
+++ b/.github/workflows/run-librispeech-2022-04-29.yml
@ -33,6 +33,10 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_2022_04_29-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_04_29:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
--- a/.github/workflows/run-librispeech-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-2022-05-13.yml
@ -33,6 +33,10 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_2022_05_13-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_05_13:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
--- a/.github/workflows/run-librispeech-2022-11-11-stateless7.yml
+++ b/.github/workflows/run-librispeech-2022-11-11-stateless7.yml
@ -0,0 +1,159 @@
 # Copyright      2022  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-11-11-stateless7
 # zipformer
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_2022_11_11_zipformer-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_11_11_zipformer:
    if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        python-version: [3.8]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-pruned-transducer-stateless7-2022-11-11.sh
      - name: Display decoding results for librispeech pruned_transducer_stateless7
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./pruned_transducer_stateless7/exp
          cd pruned_transducer_stateless7
          echo "results for pruned_transducer_stateless7"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      # Uploads the decoding output directory as a workflow artifact (only for
      # scheduled runs or when the "run-decode" label is set).
      # NOTE(review): this job's matrix declares only `os` and `python-version`,
      # so `matrix.torch` in the artifact name is undefined (presumably rendered
      # as an empty string), and the hard-coded "ubuntu-18.04" does not follow
      # `matrix.os` (ubuntu-latest). Confirm whether the name should be updated.
      - name: Upload decoding results for librispeech pruned_transducer_stateless7
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-2022-11-11
          path: egs/librispeech/ASR/pruned_transducer_stateless7/exp/
--- a/.github/workflows/run-librispeech-2022-11-14-stateless8.yml
+++ b/.github/workflows/run-librispeech-2022-11-14-stateless8.yml
@ -0,0 +1,159 @@
 # Copyright      2022  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-11-14-stateless8
 # zipformer
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_2022_11_14_zipformer_stateless8-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_11_14_zipformer_stateless8:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        python-version: [3.8]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-pruned-transducer-stateless8-2022-11-14.sh
      - name: Display decoding results for librispeech pruned_transducer_stateless8
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./pruned_transducer_stateless8/exp
          cd pruned_transducer_stateless8
          echo "results for pruned_transducer_stateless8"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      # Uploads the decoding output directory as a workflow artifact (only for
      # scheduled runs or when the "run-decode" label is set).
      # NOTE(review): this job's matrix declares only `os` and `python-version`,
      # so `matrix.torch` in the artifact name is undefined (presumably rendered
      # as an empty string), and the hard-coded "ubuntu-18.04" does not follow
      # `matrix.os` (ubuntu-latest). Confirm whether the name should be updated.
      - name: Upload decoding results for librispeech pruned_transducer_stateless8
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless8-2022-11-14
          path: egs/librispeech/ASR/pruned_transducer_stateless8/exp/
--- a/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml
+++ b/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml
@ -0,0 +1,163 @@
 # Copyright      2022  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-12-01-stateless7-ctc
 # zipformer
 # Triggers: pushes to master, label events on pull requests, and a nightly schedule.
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 # Cancel superseded runs on the same ref, like the other workflows in this
 # repository. The group name is specific to this workflow on purpose: the job
 # id below is shared with the stateless7 workflow, and reusing that workflow's
 # group name would let the two cancel each other.
 concurrency:
  group: run_librispeech_2022_12_01_zipformer_ctc-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_11_11_zipformer:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        python-version: [3.8]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-2022-12-01.sh
      - name: Display decoding results for librispeech pruned_transducer_stateless7_ctc
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./pruned_transducer_stateless7_ctc/exp
          cd pruned_transducer_stateless7_ctc
          echo "results for pruned_transducer_stateless7_ctc"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===ctc decoding==="
          find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===1best==="
          find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      # Uploads the decoding output directory as a workflow artifact (only for
      # scheduled runs or when the "run-decode" label is set).
      # NOTE(review): this job's matrix declares only `os` and `python-version`,
      # so `matrix.torch` in the artifact name is undefined (presumably rendered
      # as an empty string), and the hard-coded "ubuntu-18.04" does not follow
      # `matrix.os` (ubuntu-latest). Confirm whether the name should be updated.
      - name: Upload decoding results for librispeech pruned_transducer_stateless7_ctc
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-ctc-2022-12-01
          path: egs/librispeech/ASR/pruned_transducer_stateless7_ctc/exp/
--- a/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml
+++ b/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml
@ -0,0 +1,167 @@
 # Copyright      2022  Zengwei Yao
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-12-08-zipformer-mmi
 # zipformer
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_2022_12_08_zipformer-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_12_08_zipformer:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        python-version: [3.8]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
      # Prints the best WER lines found in the decoding logs of each method.
      - name: Display decoding results for librispeech zipformer-mmi
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          # The recipe directory is zipformer_mmi (underscore), the same path the
          # upload step of this workflow uses; "zipformer-mmi" does not exist.
          tree ./zipformer_mmi/exp
          cd zipformer_mmi
          echo "results for zipformer-mmi"
          echo "===1best==="
          find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===nbest==="
          find exp/nbest -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/nbest -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===nbest-rescoring-LG==="
          find exp/nbest-rescoring-LG -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/nbest-rescoring-LG -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===nbest-rescoring-3-gram==="
          find exp/nbest-rescoring-3-gram -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/nbest-rescoring-3-gram -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===nbest-rescoring-4-gram==="
          find exp/nbest-rescoring-4-gram -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/nbest-rescoring-4-gram -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      # Uploads the decoding output directory as a workflow artifact (only for
      # scheduled runs or when the "run-decode" label is set).
      # NOTE(review): this job's matrix declares only `os` and `python-version`,
      # so `matrix.torch` in the artifact name is undefined (presumably rendered
      # as an empty string), and the hard-coded "ubuntu-18.04" does not follow
      # `matrix.os` (ubuntu-latest). Confirm whether the name should be updated.
      - name: Upload decoding results for librispeech zipformer-mmi
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer_mmi-2022-12-08
          path: egs/librispeech/ASR/zipformer_mmi/exp/
--- a/.github/workflows/run-librispeech-2022-12-15-stateless7-ctc-bs.yml
+++ b/.github/workflows/run-librispeech-2022-12-15-stateless7-ctc-bs.yml
@ -0,0 +1,163 @@
 # Copyright      2022  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-12-15-stateless7-ctc-bs
 # zipformer
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 # Cancel a still-running run of this workflow for the same ref when a newer
 # one starts.
 concurrency:
  group: run_librispeech_2022_12_15_zipformer_ctc_bs-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_12_15_zipformer_ctc_bs:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event.label.name == 'blank-skip' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        # Quoted so YAML keeps the version as a string; an unquoted value
        # such as 3.10 would be parsed as the number 3.1.
        python-version: ["3.8"]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          # Install each non-comment line of requirements-ci.txt with its own
          # `pip install` call. Only -L 1 is passed: xargs treats -n and -L as
          # mutually exclusive and honours just one of them.
          grep -v '^#' ./requirements-ci.txt  | xargs -L 1 pip install
          # Replace the installed protobuf with one built from source
          # (no binary wheel).
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh
      - name: Display decoding results for librispeech pruned_transducer_stateless7_ctc_bs
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          # Print "===<title>===" followed by the "best for test-clean" and
          # "best for test-other" lines found in the log-* files of exp/<dir>.
          #   $1: section title, $2: directory below exp/
          show_best() {
            local subset
            echo "===$1==="
            for subset in test-clean test-other; do
              find "exp/$2" -name "log-*" -exec grep -n --color "best for ${subset}" {} + | sort -n -k2
            done
          }
          cd egs/librispeech/ASR/
          tree ./pruned_transducer_stateless7_ctc_bs/exp
          cd pruned_transducer_stateless7_ctc_bs
          echo "results for pruned_transducer_stateless7_ctc_bs"
          show_best "greedy search" greedy_search
          show_best "fast_beam_search" fast_beam_search
          show_best "modified beam search" modified_beam_search
          show_best "ctc decoding" ctc-decoding
          show_best "1best" 1best
      - name: Upload decoding results for librispeech pruned_transducer_stateless7_ctc_bs
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          # The job matrix only defines `os` and `python-version`, so the name
          # is built from those keys (there is no `matrix.torch` to expand).
          name: python-${{ matrix.python-version }}-${{ matrix.os }}-cpu-pruned_transducer_stateless7-ctc-bs-2022-12-15
          path: egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/exp/
--- a/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml
+++ b/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml
@ -0,0 +1,172 @@
 # Copyright      2022  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-12-29-stateless7-streaming
 # zipformer
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_2022_12_29_zipformer_streaming-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_12_29_zipformer_streaming:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event.label.name == 'streaming-zipformer' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        # Quoted so YAML keeps the version as a string; an unquoted value
        # such as 3.10 would be parsed as the number 3.1.
        python-version: ["3.8"]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          # Install each non-comment line of requirements-ci.txt with its own
          # `pip install` call. Only -L 1 is passed: xargs treats -n and -L as
          # mutually exclusive and honours just one of them.
          grep -v '^#' ./requirements-ci.txt  | xargs -L 1 pip install
          # Replace the installed protobuf with one built from source
          # (no binary wheel).
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
      - name: Display decoding results for librispeech pruned_transducer_stateless7_streaming
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          # Print "===<title>===" followed by the "best for test-clean" and
          # "best for test-other" lines found in the log-* files of exp/<dir>.
          #   $1: section title, $2: directory below exp/
          show_best() {
            local subset
            echo "===$1==="
            for subset in test-clean test-other; do
              find "exp/$2" -name "log-*" -exec grep -n --color "best for ${subset}" {} + | sort -n -k2
            done
          }
          cd egs/librispeech/ASR/
          tree ./pruned_transducer_stateless7_streaming/exp
          cd pruned_transducer_stateless7_streaming
          echo "results for pruned_transducer_stateless7_streaming"
          show_best "greedy search" greedy_search
          show_best "fast_beam_search" fast_beam_search
          show_best "modified beam search" modified_beam_search
          show_best "streaming greedy search" streaming/greedy_search
          show_best "streaming fast_beam_search" streaming/fast_beam_search
          show_best "streaming modified beam search" streaming/modified_beam_search
      - name: Upload decoding results for librispeech pruned_transducer_stateless7_streaming
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          # The job matrix only defines `os` and `python-version`, so the name
          # is built from those keys (there is no `matrix.torch` to expand).
          name: python-${{ matrix.python-version }}-${{ matrix.os }}-cpu-pruned_transducer_stateless7-streaming-2022-12-29
          path: egs/librispeech/ASR/pruned_transducer_stateless7_streaming/exp/
--- a/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml
+++ b/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml
@ -0,0 +1,155 @@
 # Copyright      2022  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-conformer-ctc3-2022-11-28
 # conformer_ctc3
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_2022_11_28_conformer_ctc3-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_11_28_conformer_ctc3:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        # Quoted so YAML keeps the version as a string; an unquoted value
        # such as 3.10 would be parsed as the number 3.1.
        python-version: ["3.8"]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          # Install each non-comment line of requirements-ci.txt with its own
          # `pip install` call. Only -L 1 is passed: xargs treats -n and -L as
          # mutually exclusive and honours just one of them.
          grep -v '^#' ./requirements-ci.txt  | xargs -L 1 pip install
          # Replace the installed protobuf with one built from source
          # (no binary wheel).
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-conformer-ctc3-2022-11-28.sh
      - name: Display decoding results for librispeech conformer_ctc3
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          # Print "===<method>===" followed by the "best for test-clean" and
          # "best for test-other" lines found in the log-* files of exp/<method>.
          #   $1: decoding method, used both as title and as directory name
          show_best() {
            local subset
            echo "===$1==="
            for subset in test-clean test-other; do
              find "exp/$1" -name "log-*" -exec grep -n --color "best for ${subset}" {} + | sort -n -k2
            done
          }
          cd egs/librispeech/ASR/
          tree ./conformer_ctc3/exp
          cd conformer_ctc3
          echo "results for conformer_ctc3"
          show_best ctc-decoding
          show_best 1best
      - name: Upload decoding results for librispeech conformer_ctc3
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          # The job matrix only defines `os` and `python-version`, so the name
          # is built from those keys (there is no `matrix.torch` to expand).
          name: python-${{ matrix.python-version }}-${{ matrix.os }}-cpu-conformer_ctc3-2022-11-28
          path: egs/librispeech/ASR/conformer_ctc3/exp/
--- a/.github/workflows/run-librispeech-conv-emformer-transducer-stateless2-2022-12-05.yml
+++ b/.github/workflows/run-librispeech-conv-emformer-transducer-stateless2-2022-12-05.yml
@ -0,0 +1,77 @@
 name: run-librispeech-conv-emformer-transducer-stateless2-2022-12-05
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 # Cancel a still-running run of this workflow for the same ref when a newer
 # one starts.
 concurrency:
  group: run_librispeech_conv_emformer_transducer_stateless2_2022_12_05-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_conv_emformer_transducer_stateless2_2022_12_05:
    if: github.event.label.name == 'ready' || github.event.label.name == 'ncnn' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        # Quoted so YAML keeps the version as a string; an unquoted value
        # such as 3.10 would be parsed as the number 3.1.
        python-version: ["3.8"]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          # Install each requirements-ci.txt line that is neither a comment nor
          # mentions kaldifst, one `pip install` call per line. Only -L 1 is
          # passed: xargs treats -n and -L as mutually exclusive and honours
          # just one of them.
          grep -v '^#' ./requirements-ci.txt  | grep -v kaldifst | xargs -L 1 pip install
          # Replace the installed protobuf with one built from source
          # (no binary wheel).
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-conv-emformer-transducer-stateless2-2022-12-05.sh
--- a/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
+++ b/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
@ -16,9 +16,13 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_lstm_transducer_stateless2_2022_09_03-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_lstm_transducer_stateless2_2022_09_03:
-    if: github.event.label.name == 'ready' || github.event.label.name == 'ncnn' || github.event.label.name == 'onnx' || github.event_name == 'push' || github.event_name == 'schedule'
+    if: github.event.label.name == 'ready' || github.event.label.name == 'LODR' || github.event.label.name == 'shallow-fusion' || github.event.label.name == 'ncnn' || github.event.label.name == 'onnx' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
@ -107,7 +111,7 @@ jobs:
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
-          .github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
+          .github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.sh
      - name: Display decoding results for lstm_transducer_stateless2
        if: github.event_name == 'schedule'
@ -128,9 +132,32 @@ jobs:
          find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Display decoding results for lstm_transducer_stateless2
        if: github.event.label.name == 'shallow-fusion'
        shell: bash
        run: |
          cd egs/librispeech/ASR
          tree lstm_transducer_stateless2/exp
          cd lstm_transducer_stateless2/exp
          echo "===modified_beam_search_lm_shallow_fusion==="
          echo "===Using RNNLM==="
          find modified_beam_search_lm_shallow_fusion  -name "log-*rnn*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find modified_beam_search_lm_shallow_fusion  -name "log-*rnn*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Display decoding results for lstm_transducer_stateless2
        if: github.event.label.name == 'LODR'
        shell: bash
        run: |
          cd egs/librispeech/ASR
          tree lstm_transducer_stateless2/exp
          cd lstm_transducer_stateless2/exp
          echo "===modified_beam_search_rnnlm_LODR==="
          find modified_beam_search_LODR  -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find modified_beam_search_LODR  -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Upload decoding results for lstm_transducer_stateless2
        uses: actions/upload-artifact@v2
-        if: github.event_name == 'schedule'
+        if: github.event_name == 'schedule' || github.event.label.name == 'shallow-fusion' || github.event.label.name == 'LODR'
        with:
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-lstm_transducer_stateless2-2022-09-03
          path: egs/librispeech/ASR/lstm_transducer_stateless2/exp/
--- a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
@ -33,6 +33,10 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_pruned_transducer_stateless3_2022_05_13-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_pruned_transducer_stateless3_2022_05_13:
    if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
--- a/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml
+++ b/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml
@ -33,6 +33,10 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_streaming_2022_06_26-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_streaming_2022_06_26:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
--- a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
+++ b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
@ -33,6 +33,10 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_2022_04_19-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_04_19:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
--- a/.github/workflows/run-pretrained-conformer-ctc.yml
+++ b/.github/workflows/run-pretrained-conformer-ctc.yml
@ -23,6 +23,10 @@ on:
  pull_request:
    types: [labeled]
 concurrency:
  group: run_pre_trained_conformer_ctc-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_pre_trained_conformer_ctc:
    if: github.event.label.name == 'ready' || github.event_name == 'push'
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
@ -32,6 +32,10 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_pre_trained_transducer_stateless_multi_datasets_librispeech_100h-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_pre_trained_transducer_stateless_multi_datasets_librispeech_100h:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
@ -32,6 +32,10 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_pre_trained_transducer_stateless_multi_datasets_librispeech_960h-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_pre_trained_transducer_stateless_multi_datasets_librispeech_960h:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
--- a/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
@ -23,6 +23,10 @@ on:
  pull_request:
    types: [labeled]
 concurrency:
  group: run_pre_trained_transducer_stateless_modified_2_aishell-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_pre_trained_transducer_stateless_modified_2_aishell:
    if: github.event.label.name == 'ready' || github.event_name == 'push'
--- a/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
@ -23,6 +23,10 @@ on:
  pull_request:
    types: [labeled]
 concurrency:
  group: run_pre_trained_transducer_stateless_modified_aishell-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_pre_trained_transducer_stateless_modified_aishell:
    if: github.event.label.name == 'ready' || github.event_name == 'push'
--- a/.github/workflows/run-pretrained-transducer-stateless.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless.yml
@ -32,6 +32,10 @@ on:
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_pre_trained_transducer_stateless-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_pre_trained_transducer_stateless:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
--- a/.github/workflows/run-pretrained-transducer.yml
+++ b/.github/workflows/run-pretrained-transducer.yml
@ -23,6 +23,10 @@ on:
  pull_request:
    types: [labeled]
 concurrency:
  group: run_pre_trained_transducer-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_pre_trained_transducer:
    if: github.event.label.name == 'ready' || github.event_name == 'push'
--- a/.github/workflows/run-ptb-rnn-lm.yml
+++ b/.github/workflows/run-ptb-rnn-lm.yml
@ -0,0 +1,71 @@
 name: run-ptb-rnn-lm-training
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_ptb_rnn_lm_training-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_ptb_rnn_lm_training:
    if: github.event.label.name == 'ready' || github.event.label.name == 'rnnlm' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.8"]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          # Install each requirements-ci.txt line that is neither a comment nor
          # mentions kaldifst, one `pip install` call per line. Only -L 1 is
          # passed: xargs treats -n and -L as mutually exclusive and honours
          # just one of them.
          grep -v '^#' ./requirements-ci.txt  | grep -v kaldifst | xargs -L 1 pip install
          # Replace the installed protobuf with one built from source
          # (no binary wheel).
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Prepare data
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/ptb/LM
          ./prepare.sh
      - name: Run training
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
          cd egs/ptb/LM
          ./train-rnn-lm.sh --world-size 1 --num-epochs 5 --use-epoch 4 --use-avg 2
      - name: Upload pretrained models
        uses: actions/upload-artifact@v2
        if: github.event.label.name == 'ready' || github.event.label.name == 'rnnlm' || github.event_name == 'push' || github.event_name == 'schedule'
        with:
          name: python-${{ matrix.python-version }}-ubuntu-rnn-lm-ptb
          path: egs/ptb/LM/my-rnnlm-exp/
--- a/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml
+++ b/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml
@ -23,8 +23,12 @@ on:
  pull_request:
    types: [labeled]
 concurrency:
  group: run_wenetspeech_pruned_transducer_stateless2-${{ github.ref }}
  cancel-in-progress: true
 jobs:
-  run_librispeech_pruned_transducer_stateless3_2022_05_13:
+  run_wenetspeech_pruned_transducer_stateless2:
    if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'wenetspeech'
    runs-on: ${{ matrix.os }}
    strategy:
--- a/.github/workflows/run-yesno-recipe.yml
+++ b/.github/workflows/run-yesno-recipe.yml
@ -21,11 +21,15 @@ on:
    branches:
      - master
  pull_request:
-    types: [labeled]
+    branches:
      - master
 concurrency:
  group: run-yesno-recipe-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run-yesno-recipe:
    if: github.event.label.name == 'ready' || github.event_name == 'push'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
@ -61,7 +65,7 @@ jobs:
      - name: Install Python dependencies
        run: |
-          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
+          grep -v '^#' ./requirements-ci.txt  | grep -v kaldifst | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
--- a/.github/workflows/style_check.yml
+++ b/.github/workflows/style_check.yml
@ -24,6 +24,10 @@ on:
    branches:
      - master
 concurrency:
  group: style_check-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  style_check:
    runs-on: ${{ matrix.os }}
@ -45,17 +49,18 @@ jobs:
      - name: Install Python dependencies
        run: |
-          python3 -m pip install --upgrade pip black==21.6b0 flake8==3.9.2 click==8.0.4
+          python3 -m pip install --upgrade pip black==22.3.0 flake8==5.0.4 click==8.1.0
-          # See https://github.com/psf/black/issues/2964
+          # Click issue fixed in https://github.com/psf/black/pull/2966
          # The version of click should be selected from 8.0.0, 8.0.1, 8.0.2, 8.0.3, and 8.0.4
      - name: Run flake8
        shell: bash
        working-directory: ${{github.workspace}}
        run: |
          # stop the build if there are Python syntax errors or undefined names
-          flake8 . --count --show-source --statistics
+          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-          flake8 .
+          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 \
            --statistics --extend-ignore=E203,E266,E501,F401,E402,F403,F841,W503
      - name: Run black
        shell: bash
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -21,26 +21,23 @@ on:
    branches:
      - master
  pull_request:
-    types: [labeled]
+    branches:
      - master
 concurrency:
  group: test-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  test:
    if: github.event.label.name == 'ready' || github.event_name == 'push'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        # os: [ubuntu-18.04, macos-10.15]
+        os: [ubuntu-latest]
-        # disable macOS test for now.
+        python-version: ["3.8"]
-        os: [ubuntu-18.04]
+        torch: ["1.10.0"]
-        python-version: [3.7, 3.8]
+        torchaudio: ["0.10.0"]
-        torch: ["1.8.0", "1.11.0"]
+        k2-version: ["1.23.2.dev20221201"]
        torchaudio: ["0.8.0", "0.11.0"]
        k2-version: ["1.15.1.dev20220427"]
        exclude:
          - torch: "1.8.0"
            torchaudio: "0.11.0"
          - torch: "1.11.0"
            torchaudio: "0.8.0"
      fail-fast: false
@ -67,11 +64,7 @@ jobs:
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          if [[ ${{ matrix.torchaudio }} == "0.11.0" ]]; then
          pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          else
            pip install torchaudio==${{ matrix.torchaudio }}
          fi
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
          pip install git+https://github.com/lhotse-speech/lhotse
@ -79,6 +72,8 @@ jobs:
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
          pip install kaldifst
          pip install onnxruntime
          pip install -r requirements.txt
      - name: Install graphviz
@ -118,10 +113,12 @@ jobs:
          cd ../pruned_transducer_stateless4
          pytest -v -s
          cd ../pruned_transducer_stateless7
          pytest -v -s
          cd ../transducer_stateless
          pytest -v -s
          if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then
          cd ../transducer
          pytest -v -s
@ -130,7 +127,6 @@ jobs:
          cd ../transducer_lstm
          pytest -v -s
          fi
      - name: Run tests
        if: startsWith(matrix.os, 'macos')
@ -161,7 +157,6 @@ jobs:
          cd ../transducer_stateless
          pytest -v -s
          if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then
          cd ../transducer
          pytest -v -s
@ -170,4 +165,3 @@ jobs:
          cd ../transducer_lstm
          pytest -v -s
          fi
--- a/.gitignore
+++ b/.gitignore
@ -11,5 +11,26 @@ log
 *.bak
 *-bak
 *bak.py
 # Ignore Mac system files
 .DS_store
 # Ignore node_modules folder
 node_modules
 # ignore .nfs
 .nfs*
 # Ignore all text files
 *.txt
 # Ignore files related to API keys
 .env
 # Ignore SASS config files
 .sass-cache
 *.param
 *.bin
 .DS_Store
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,26 +1,38 @@
 repos:
  - repo: https://github.com/psf/black
-    rev: 21.6b0
+    rev: 22.3.0
    hooks:
      - id: black
-        args: [--line-length=80]
+        args: ["--line-length=88"]
-        additional_dependencies: ['click==8.0.1']
+        additional_dependencies: ['click==8.1.0']
        exclude: icefall\/__init__\.py
  - repo: https://github.com/PyCQA/flake8
-    rev: 3.9.2
+    rev: 5.0.4
    hooks:
      - id: flake8
-        args: [--max-line-length=80]
+        args: ["--max-line-length=88", "--extend-ignore=E203,E266,E501,F401,E402,F403,F841,W503"]
      # What are we ignoring here?
      # E203: whitespace before ':'
      # E266: too many leading '#' for block comment
      # E501: line too long
      # F401: module imported but unused
      # E402: module level import not at top of file
      # F403: 'from module import *' used; unable to detect undefined names
      # F841: local variable is assigned to but never used
      # W503: line break before binary operator
      # In addition, the default ignore list is:
      # E121,E123,E126,E226,E24,E704,W503,W504
  - repo: https://github.com/pycqa/isort
-    rev: 5.9.2
+    rev: 5.10.1
    hooks:
      - id: isort
-        args: [--profile=black, --line-length=80]
+        args: ["--profile=black"]
  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.0.1
+    rev: v4.2.0
    hooks:
      - id: check-executables-have-shebangs
      - id: end-of-file-fixer
--- a/README.md
+++ b/README.md
@ -82,7 +82,7 @@ The WER for this model is:
 |-----|------------|------------|
 | WER | 6.59       | 17.69      |
-We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model:  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1kNmDXNMwREi0rZGAOIAOJo93REBuOTcd?usp=sharing)
+We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model:  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-iSfQMp2So-We_Uu49N4AAcMInB72u9z?usp=sharing)
 #### Transducer: Conformer encoder + LSTM decoder
@ -162,7 +162,7 @@ The CER for this model is:
 |-----|-------|
 | CER | 10.16 |
-We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model:  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qULaGvXq7PCu_P61oubfz9b53JzY4H3z?usp=sharing)
+We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model:  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1jbyzYq3ytm6j2nlEt-diQm-6QVWyDDEa?usp=sharing)
 ### TIMIT
--- a/docker/README.md
+++ b/docker/README.md
@ -72,14 +72,14 @@ docker run -it --runtime=nvidia --shm-size=2gb --name=icefall --gpus all icefall
 ```
 ### Tips:
-1. Since your data and models most probably won't be in the docker, you must use the -v flag to access the host machine. Do this by specifying `-v {/path/in/docker}:{/path/in/host/machine}`. 
+1. Since your data and models most probably won't be in the docker, you must use the -v flag to access the host machine. Do this by specifying `-v {/path/in/host/machine}:{/path/in/docker}`.
 2. Also, if your environment requires a proxy, this would be a good time to add it in too: `-e http_proxy=http://aaa.bb.cc.net:8080 -e https_proxy=http://aaa.bb.cc.net:8080`.
 Overall, your docker run command should look like this.
 ```bash
-docker run -it --runtime=nvidia --shm-size=2gb --name=icefall --gpus all -v {/path/in/docker}:{/path/in/host/machine} -e http_proxy=http://aaa.bb.cc.net:8080 -e https_proxy=http://aaa.bb.cc.net:8080 icefall/pytorch1.12.1
+docker run -it --runtime=nvidia --shm-size=2gb --name=icefall --gpus all -v {/path/in/host/machine}:{/path/in/docker} -e http_proxy=http://aaa.bb.cc.net:8080 -e https_proxy=http://aaa.bb.cc.net:8080 icefall/pytorch1.12.1
 ```
 You can explore more docker run options [here](https://docs.docker.com/engine/reference/commandline/run/) to suit your environment.
--- a/docker/Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8/Dockerfile
+++ b/docker/Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8/Dockerfile
@ -51,8 +51,9 @@ RUN wget -P /opt https://downloads.xiph.org/releases/flac/flac-1.3.2.tar.xz  &&
    find /opt/flac-1.3.2  -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
    cd -
-RUN pip install kaldiio graphviz && \
+RUN conda install -y -c pytorch torchaudio=0.12 && \
-	conda install -y -c pytorch torchaudio
+    pip install graphviz
 #install k2 from source
 RUN git clone https://github.com/k2-fsa/k2.git /opt/k2 && \
@ -67,6 +68,7 @@ RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
 	cd /workspace/icefall && \
 	pip install -r requirements.txt
 RUN pip install kaldifeat
 ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
 WORKDIR /workspace/icefall
--- a/docker/Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8/Dockerfile
+++ b/docker/Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8/Dockerfile
@ -69,8 +69,8 @@ RUN wget -P /opt https://downloads.xiph.org/releases/flac/flac-1.3.2.tar.xz  &&
    find /opt/flac-1.3.2  -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
    cd -
-RUN pip install kaldiio graphviz && \
+RUN conda install -y -c pytorch torchaudio=0.7.1 && \
-	conda install -y -c pytorch torchaudio=0.7.1
+    pip install graphviz
 #install k2 from source
 RUN git clone https://github.com/k2-fsa/k2.git /opt/k2 && \
@ -88,4 +88,3 @@ RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
 ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
 WORKDIR /workspace/icefall
--- a/docs/README.md
+++ b/docs/README.md
@ -0,0 +1,24 @@
 ## Usage
 ```bash
 cd /path/to/icefall/docs
 pip install -r requirements.txt
 make clean
 make html
 cd build/html
 python3 -m http.server 8000
 ```
 It prints:
 ```
 Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ...
 ```
 Open your browser and go to <http://0.0.0.0:8000/> to view the generated
 documentation.
 Done!
 **Hint**: You can change the port number when starting the server.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -78,3 +78,12 @@ html_context = {
 }
 todo_include_todos = True
 rst_epilog = """
 .. _sherpa-ncnn: https://github.com/k2-fsa/sherpa-ncnn
 .. _icefall: https://github.com/k2-fsa/icefall
 .. _git-lfs: https://git-lfs.com/
 .. _ncnn: https://github.com/tencent/ncnn
 .. _LibriSpeech: https://www.openslr.org/12
 .. _musan: http://www.openslr.org/17/
 """
--- a/docs/source/contributing/code-style.rst
+++ b/docs/source/contributing/code-style.rst
@ -11,9 +11,9 @@ We use the following tools to make the code style to be as consistent as possibl
 The following versions of the above tools are used:
-  - ``black == 12.6b0``
+  - ``black == 22.3.0``
-  - ``flake8 == 3.9.2``
+  - ``flake8 == 5.0.4``
-  - ``isort == 5.9.2``
+  - ``isort == 5.10.1``
 After running the following commands:
@ -54,10 +54,17 @@ it should succeed this time:
 If you want to check the style of your code before ``git commit``, you
 can do the following:
  .. code-block:: bash
    $ pre-commit install
    $ pre-commit run
 Or without installing the pre-commit hooks:
  .. code-block:: bash
    $ cd icefall
-    $ pip install black==21.6b0 flake8==3.9.2 isort==5.9.2
+    $ pip install black==22.3.0 flake8==5.0.4 isort==5.10.1
    $ black --check your_changed_file.py
    $ black your_changed_file.py  # modify it in-place
    $
--- a/docs/source/faqs.rst
+++ b/docs/source/faqs.rst
@ -0,0 +1,107 @@
 Frequently Asked Questions (FAQs)
 =================================
 In this section, we collect issues reported by users and post the corresponding
 solutions.
 OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
 -----------------------------------------------------------------------------------
 One user is using the following code to install ``torch`` and ``torchaudio``:
 .. code-block:: bash
  pip install \
    torch==1.10.0+cu111 \
    torchvision==0.11.0+cu111 \
    torchaudio==0.10.0 \
    -f https://download.pytorch.org/whl/torch_stable.html
 and it throws the following error when running ``tdnn/train.py``:
 .. code-block::
  OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
 The fix is to specify the CUDA version while installing ``torchaudio``. That
 is, change ``torchaudio==0.10.0`` to ``torchaudio==0.10.0+cu111``. Therefore,
 the correct command is:
 .. code-block:: bash
  pip install \
    torch==1.10.0+cu111 \
    torchvision==0.11.0+cu111 \
    torchaudio==0.10.0+cu111 \
    -f https://download.pytorch.org/whl/torch_stable.html
 AttributeError: module 'distutils' has no attribute 'version'
 -------------------------------------------------------------
 The error log is:
 .. code-block::
  Traceback (most recent call last):
    File "./tdnn/train.py", line 14, in <module>
      from asr_datamodule import YesNoAsrDataModule
    File "/home/xxx/code/next-gen-kaldi/icefall/egs/yesno/ASR/tdnn/asr_datamodule.py", line 34, in <module>
      from icefall.dataset.datamodule import DataModule
    File "/home/xxx/code/next-gen-kaldi/icefall/icefall/__init__.py", line 3, in <module>
      from . import (
    File "/home/xxx/code/next-gen-kaldi/icefall/icefall/decode.py", line 23, in <module>
      from icefall.utils import add_eos, add_sos, get_texts
    File "/home/xxx/code/next-gen-kaldi/icefall/icefall/utils.py", line 39, in <module>
      from torch.utils.tensorboard import SummaryWriter
    File "/home/xxx/tool/miniconda3/envs/yyy/lib/python3.8/site-packages/torch/utils/tensorboard/__init__.py", line 4, in <module>
      LooseVersion = distutils.version.LooseVersion
  AttributeError: module 'distutils' has no attribute 'version'
 The fix is:
 .. code-block:: bash
  pip uninstall setuptools
  pip install setuptools==58.0.4
 ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
 --------------------------------------------------------------------------------------------
 If you are using ``conda`` and encounter the following issue:
 .. code-block::
  Traceback (most recent call last):
    File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 24, in <module>
      from _k2 import DeterminizeWeightPushingType
  ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
  During handling of the above exception, another exception occurred:
  Traceback (most recent call last):
    File "/k2-dev/yangyifan/icefall/egs/librispeech/ASR/./pruned_transducer_stateless7_ctc_bs/decode.py", line 104, in <module>
      import k2
    File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 30, in <module>
      raise ImportError(
  ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
  Note: If you're using anaconda and importing k2 on MacOS,
        you can probably fix this by setting the environment variable:
    export DYLD_LIBRARY_PATH=$CONDA_PREFIX/lib/python3.10/site-packages:$DYLD_LIBRARY_PATH
 Please first try to find where ``libpython3.10.so.1.0`` is located.
 For instance,
 .. code-block:: bash
  cd $CONDA_PREFIX/lib
  find . -name "libpython*"
 If you are able to find it inside ``$CONDA_PREFIX/lib``, please set the
 following environment variable:
 .. code-block:: bash
  export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -21,7 +21,16 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
   :caption: Contents:
   installation/index
   faqs
   model-export/index
 .. toctree::
   :maxdepth: 3
   recipes/index
 .. toctree::
   :maxdepth: 2
   contributing/index
   huggingface/index
--- a/docs/source/installation/index.rst
+++ b/docs/source/installation/index.rst
@ -393,6 +393,17 @@ Now let us run the training part:
  We use ``export CUDA_VISIBLE_DEVICES=""`` so that ``icefall`` uses CPU
  even if there are GPUs available.
 .. hint::
   In case you get a ``Segmentation fault (core dump)`` error, please use:
      .. code-block:: bash
        export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
   See more at `<https://github.com/k2-fsa/icefall/issues/674>`_ if you are
   interested.
 The training log is given below:
 .. code-block::
--- a/docs/source/model-export/code/export-conv-emformer-transducer-for-ncnn-output.txt
+++ b/docs/source/model-export/code/export-conv-emformer-transducer-for-ncnn-output.txt
@ -0,0 +1,21 @@
 2023-01-11 12:15:38,677 INFO [export-for-ncnn.py:220] device: cpu
 2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:229] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_v
 alid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampl
 ing_factor': 4, 'decoder_dim': 512, 'joiner_dim': 512, 'model_warm_step': 3000, 'env_info': {'k2-version': '1.23.2', 'k2-build-type':
 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'a34171ed85605b0926eebbd0463d059431f4f74a', 'k2-git-date': 'Wed Dec 14 00:06:38 2022',
 'lhotse-version': '1.12.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': False, 'torch-cuda-vers
 ion': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'fix-stateless3-train-2022-12-27', 'icefall-git-sha1': '530e8a1-dirty', '
 icefall-git-date': 'Tue Dec 27 13:59:18 2022', 'icefall-path': '/star-fj/fangjun/open-source/icefall', 'k2-path': '/star-fj/fangjun/op
 en-source/k2/k2/python/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/open-source/lhotse/lhotse/__init__.py', 'hostname': 'de-74279
 -k2-train-3-1220120619-7695ff496b-s9n4w', 'IP address': '127.0.0.1'}, 'epoch': 30, 'iter': 0, 'avg': 1, 'exp_dir': PosixPath('icefa
 ll-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp'), 'bpe_model': './icefall-asr-librispeech-conv-emformer-transdu
 cer-stateless2-2022-07-05//data/lang_bpe_500/bpe.model', 'jit': False, 'context_size': 2, 'use_averaged_model': False, 'encoder_dim':
 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'cnn_module_kernel': 31, 'left_context_length': 32, 'chunk_length'
 : 32, 'right_context_length': 8, 'memory_size': 32, 'blank_id': 0, 'vocab_size': 500}
 2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:231] About to create model
 2023-01-11 12:15:40,053 INFO [checkpoint.py:112] Loading checkpoint from icefall-asr-librispeech-conv-emformer-transducer-stateless2-2
 022-07-05/exp/epoch-30.pt
 2023-01-11 12:15:40,708 INFO [export-for-ncnn.py:315] Number of model parameters: 75490012
 2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:318] Using torch.jit.trace()
 2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:320] Exporting encoder
 2023-01-11 12:15:41,682 INFO [export-for-ncnn.py:149] chunk_length: 32, right_context_length: 8
--- a/docs/source/model-export/code/generate-int-8-scale-table-for-conv-emformer.txt
+++ b/docs/source/model-export/code/generate-int-8-scale-table-for-conv-emformer.txt
@ -0,0 +1,104 @@
 Don't Use GPU. has_gpu: 0, config.use_vulkan_compute: 1
 num encoder conv layers: 88
 num joiner conv layers: 3
 num files: 3
 Processing ../test_wavs/1089-134686-0001.wav
 Processing ../test_wavs/1221-135766-0001.wav
 Processing ../test_wavs/1221-135766-0002.wav
 Processing ../test_wavs/1089-134686-0001.wav
 Processing ../test_wavs/1221-135766-0001.wav
 Processing ../test_wavs/1221-135766-0002.wav
 ----------encoder----------
 conv_87                                  : max = 15.942385        threshold = 15.938493        scale = 7.968131
 conv_88                                  : max = 35.442448        threshold = 15.549335        scale = 8.167552
 conv_89                                  : max = 23.228289        threshold = 8.001738         scale = 15.871552
 linear_90                                : max = 3.976146         threshold = 1.101789         scale = 115.267128
 linear_91                                : max = 6.962030         threshold = 5.162033         scale = 24.602713
 linear_92                                : max = 12.323041        threshold = 3.853959         scale = 32.953129
 linear_94                                : max = 6.905416         threshold = 4.648006         scale = 27.323545
 linear_93                                : max = 6.905416         threshold = 5.474093         scale = 23.200188
 linear_95                                : max = 1.888012         threshold = 1.403563         scale = 90.483986
 linear_96                                : max = 6.856741         threshold = 5.398679         scale = 23.524273
 linear_97                                : max = 9.635942         threshold = 2.613655         scale = 48.590950
 linear_98                                : max = 6.460340         threshold = 5.670146         scale = 22.398010
 linear_99                                : max = 9.532276         threshold = 2.585537         scale = 49.119396
 linear_101                               : max = 6.585871         threshold = 5.719224         scale = 22.205809
 linear_100                               : max = 6.585871         threshold = 5.751382         scale = 22.081648
 linear_102                               : max = 1.593344         threshold = 1.450581         scale = 87.551147
 linear_103                               : max = 6.592681         threshold = 5.705824         scale = 22.257959
 linear_104                               : max = 8.752957         threshold = 1.980955         scale = 64.110489
 linear_105                               : max = 6.696240         threshold = 5.877193         scale = 21.608953
 linear_106                               : max = 9.059659         threshold = 2.643138         scale = 48.048950
 linear_108                               : max = 6.975461         threshold = 4.589567         scale = 27.671457
 linear_107                               : max = 6.975461         threshold = 6.190381         scale = 20.515701
 linear_109                               : max = 3.710759         threshold = 2.305635         scale = 55.082436
 linear_110                               : max = 7.531228         threshold = 5.731162         scale = 22.159557
 linear_111                               : max = 10.528083        threshold = 2.259322         scale = 56.211544
 linear_112                               : max = 8.148807         threshold = 5.500842         scale = 23.087374
 linear_113                               : max = 8.592566         threshold = 1.948851         scale = 65.166611
 linear_115                               : max = 8.437109         threshold = 5.608947         scale = 22.642395
 linear_114                               : max = 8.437109         threshold = 6.193942         scale = 20.503904
 linear_116                               : max = 3.966980         threshold = 3.200896         scale = 39.676392
 linear_117                               : max = 9.451303         threshold = 6.061664         scale = 20.951344
 linear_118                               : max = 12.077262        threshold = 3.965800         scale = 32.023804
 linear_119                               : max = 9.671615         threshold = 4.847613         scale = 26.198460
 linear_120                               : max = 8.625638         threshold = 3.131427         scale = 40.556595
 linear_122                               : max = 10.274080        threshold = 4.888716         scale = 25.978189
 linear_121                               : max = 10.274080        threshold = 5.420480         scale = 23.429659
 linear_123                               : max = 4.826197         threshold = 3.599617         scale = 35.281532
 linear_124                               : max = 11.396383        threshold = 7.325849         scale = 17.335875
 linear_125                               : max = 9.337198         threshold = 3.941410         scale = 32.221970
 linear_126                               : max = 9.699965         threshold = 4.842878         scale = 26.224073
 linear_127                               : max = 8.775370         threshold = 3.884215         scale = 32.696438
 linear_129                               : max = 9.872276         threshold = 4.837319         scale = 26.254213
 linear_128                               : max = 9.872276         threshold = 7.180057         scale = 17.687883
 linear_130                               : max = 4.150427         threshold = 3.454298         scale = 36.765789
 linear_131                               : max = 11.112692        threshold = 7.924847         scale = 16.025545
 linear_132                               : max = 11.852893        threshold = 3.116593         scale = 40.749626
 linear_133                               : max = 11.517084        threshold = 5.024665         scale = 25.275314
 linear_134                               : max = 10.683807        threshold = 3.878618         scale = 32.743618
 linear_136                               : max = 12.421055        threshold = 6.322729         scale = 20.086264
 linear_135                               : max = 12.421055        threshold = 5.309880         scale = 23.917679
 linear_137                               : max = 4.827781         threshold = 3.744595         scale = 33.915554
 linear_138                               : max = 14.422395        threshold = 7.742882         scale = 16.402161
 linear_139                               : max = 8.527538         threshold = 3.866123         scale = 32.849449
 linear_140                               : max = 12.128619        threshold = 4.657793         scale = 27.266134
 linear_141                               : max = 9.839593         threshold = 3.845993         scale = 33.021378
 linear_143                               : max = 12.442304        threshold = 7.099039         scale = 17.889746
 linear_142                               : max = 12.442304        threshold = 5.325038         scale = 23.849592
 linear_144                               : max = 5.929444         threshold = 5.618206         scale = 22.605080
 linear_145                               : max = 13.382126        threshold = 9.321095         scale = 13.625010
 linear_146                               : max = 9.894987         threshold = 3.867645         scale = 32.836517
 linear_147                               : max = 10.915313        threshold = 4.906028         scale = 25.886522
 linear_148                               : max = 9.614287         threshold = 3.908151         scale = 32.496181
 linear_150                               : max = 11.724932        threshold = 4.485588         scale = 28.312899
 linear_149                               : max = 11.724932        threshold = 5.161146         scale = 24.606939
 linear_151                               : max = 7.164453         threshold = 5.847355         scale = 21.719223
 linear_152                               : max = 13.086471        threshold = 5.984121         scale = 21.222834
 linear_153                               : max = 11.099524        threshold = 3.991601         scale = 31.816805
 linear_154                               : max = 10.054585        threshold = 4.489706         scale = 28.286930
 linear_155                               : max = 12.389185        threshold = 3.100321         scale = 40.963501
 linear_157                               : max = 9.982999         threshold = 5.154796         scale = 24.637253
 linear_156                               : max = 9.982999         threshold = 8.537706         scale = 14.875190
 linear_158                               : max = 8.420287         threshold = 6.502287         scale = 19.531588
 linear_159                               : max = 25.014746        threshold = 9.423280         scale = 13.477261
 linear_160                               : max = 45.633553        threshold = 5.715335         scale = 22.220921
 linear_161                               : max = 20.371849        threshold = 5.117830         scale = 24.815203
 linear_162                               : max = 12.492933        threshold = 3.126283         scale = 40.623318
 linear_164                               : max = 20.697504        threshold = 4.825712         scale = 26.317358
 linear_163                               : max = 20.697504        threshold = 5.078367         scale = 25.008038
 linear_165                               : max = 9.023975         threshold = 6.836278         scale = 18.577358
 linear_166                               : max = 34.860619        threshold = 7.259792         scale = 17.493614
 linear_167                               : max = 30.380934        threshold = 5.496160         scale = 23.107042
 linear_168                               : max = 20.691216        threshold = 4.733317         scale = 26.831076
 linear_169                               : max = 9.723948         threshold = 3.952728         scale = 32.129707
 linear_171                               : max = 21.034811        threshold = 5.366547         scale = 23.665123
 linear_170                               : max = 21.034811        threshold = 5.356277         scale = 23.710501
 linear_172                               : max = 10.556884        threshold = 5.729481         scale = 22.166058
 linear_173                               : max = 20.033039        threshold = 10.207264        scale = 12.442120
 linear_174                               : max = 11.597379        threshold = 2.658676         scale = 47.768131
 ----------joiner----------
 linear_2                                 : max = 19.293503        threshold = 14.305265        scale = 8.877850
 linear_1                                 : max = 10.812222        threshold = 8.766452         scale = 14.487047
 linear_3                                 : max = 0.999999         threshold = 0.999755         scale = 127.031174
 ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\(^0^)/...233...
--- a/docs/source/model-export/code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt
+++ b/docs/source/model-export/code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt
@ -0,0 +1,7 @@
 2023-01-11 14:02:12,216 INFO [streaming-ncnn-decode.py:320] {'tokens': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt', 'encoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param', 'encoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin', 'decoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param', 'decoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin', 'joiner_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param', 'joiner_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin', 'sound_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav'}
 T 51 32
 2023-01-11 14:02:13,141 INFO [streaming-ncnn-decode.py:328] Constructing Fbank computer
 2023-01-11 14:02:13,151 INFO [streaming-ncnn-decode.py:331] Reading sound files: ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
 2023-01-11 14:02:13,176 INFO [streaming-ncnn-decode.py:336] torch.Size([106000])
 2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:380] ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
 2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:381] AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
--- a/docs/source/model-export/export-ncnn.rst
+++ b/docs/source/model-export/export-ncnn.rst
@ -1,12 +1,771 @@
 Export to ncnn
 ==============
-We support exporting LSTM transducer models to `ncnn <https://github.com/tencent/ncnn>`_.
+We support exporting both
-
+`LSTM transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless2>`_
-Please refer to :ref:`export-model-for-ncnn` for details.
+and
 `ConvEmformer transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_
 to `ncnn <https://github.com/tencent/ncnn>`_.
 We also provide `<https://github.com/k2-fsa/sherpa-ncnn>`_
 performing speech recognition using ``ncnn`` with exported models.
-It has been tested on Linux, macOS, Windows, and Raspberry Pi. The project is
+It has been tested on Linux, macOS, Windows, ``Android``, and ``Raspberry Pi``.
-self-contained and can be statically linked to produce a binary containing
+
-everything needed.
+`sherpa-ncnn`_ is self-contained and can be statically linked to produce
 a binary containing everything needed. Please refer
 to its documentation for details:
 - `<https://k2-fsa.github.io/sherpa/ncnn/index.html>`_
 Export LSTM transducer models
 -----------------------------
 Please refer to :ref:`export-lstm-transducer-model-for-ncnn` for details.
 Export ConvEmformer transducer models
 -------------------------------------
 We use the pre-trained model from the following repository as an example:
  - `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_
 We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
 .. hint::
  We use ``Ubuntu 18.04``, ``torch 1.10``, and ``Python 3.8`` for testing.
 .. caution::
  Please use a more recent version of PyTorch. For instance, ``torch 1.8``
  may ``not`` work.
 1. Download the pre-trained model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. hint::
  You can also refer to `<https://k2-fsa.github.io/sherpa/cpp/pretrained_models/online_transducer.html#icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_ to download the pre-trained model.
  You have to install `git-lfs`_ before you continue.
 .. code-block:: bash
  cd egs/librispeech/ASR
  GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
  git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
  git lfs pull --include "data/lang_bpe_500/bpe.model"
  cd ..
 .. note::
  We download ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
 In the above code, we download the pre-trained model into the directory
 ``egs/librispeech/ASR/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05``.
 2. Install ncnn and pnnx
 ^^^^^^^^^^^^^^^^^^^^^^^^
 .. code-block:: bash
  # We put ncnn into $HOME/open-source/ncnn
  # You can change it to anywhere you like
  cd $HOME
  mkdir -p open-source
  cd open-source
  git clone https://github.com/csukuangfj/ncnn
  cd ncnn
  git submodule update --recursive --init
  # Note: We don't use "python setup.py install" or "pip install ." here
  mkdir -p build-wheel
  cd build-wheel
  cmake \
    -DCMAKE_BUILD_TYPE=Release \
    -DNCNN_PYTHON=ON \
    -DNCNN_BUILD_BENCHMARK=OFF \
    -DNCNN_BUILD_EXAMPLES=OFF \
    -DNCNN_BUILD_TOOLS=ON \
  ..
  make -j4
  cd ..
  # Note: $PWD here is $HOME/open-source/ncnn
  export PYTHONPATH=$PWD/python:$PYTHONPATH
  export PATH=$PWD/tools/pnnx/build/src:$PATH
  export PATH=$PWD/build-wheel/tools/quantize:$PATH
  # Now build pnnx
  cd tools/pnnx
  mkdir build
  cd build
  cmake ..
  make -j4
  ./src/pnnx
 Congratulations! You have successfully installed the following components:
  - ``pnnx``, which is an executable located in
    ``$HOME/open-source/ncnn/tools/pnnx/build/src``. We will use
    it to convert models exported by ``torch.jit.trace()``.
  - ``ncnn2int8``, which is an executable located in
    ``$HOME/open-source/ncnn/build-wheel/tools/quantize``. We will use
    it to quantize our models to ``int8``.
  - ``ncnn.cpython-38-x86_64-linux-gnu.so``, which is a Python module located
    in ``$HOME/open-source/ncnn/python/ncnn``.
    .. note::
      I am using ``Python 3.8``, so it
      is ``ncnn.cpython-38-x86_64-linux-gnu.so``. If you use a different
      version, say, ``Python 3.9``, the name would be
      ``ncnn.cpython-39-x86_64-linux-gnu.so``.
      Also, if you are not using Linux, the file name would also be different.
      But that does not matter. As long as you can compile it, it should work.
 We have set up ``PYTHONPATH`` so that you can use ``import ncnn`` in your
 Python code. We have also set up ``PATH`` so that you can use
 ``pnnx`` and ``ncnn2int8`` later in your terminal.
 .. caution::
  Please don't use `<https://github.com/tencent/ncnn>`_.
  We have made some modifications to the official `ncnn`_.
  We will synchronize `<https://github.com/csukuangfj/ncnn>`_ periodically
  with the official one.
 3. Export the model via torch.jit.trace()
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 First, let us rename our pre-trained model:
 .. code-block::
  cd egs/librispeech/ASR
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp
  ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-30.pt
  cd ../..
 Next, we use the following code to export our model:
 .. code-block:: bash
  dir=./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/
  ./conv_emformer_transducer_stateless2/export-for-ncnn.py \
    --exp-dir $dir/exp \
    --bpe-model $dir/data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 1 \
    --use-averaged-model 0 \
    \
    --num-encoder-layers 12 \
    --chunk-length 32 \
    --cnn-module-kernel 31 \
    --left-context-length 32 \
    --right-context-length 8 \
    --memory-size 32 \
    --encoder-dim 512
 .. hint::
  We have renamed our model to ``epoch-30.pt`` so that we can use ``--epoch 30``.
  There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
  If you have trained a model by yourself and if you have all checkpoints
  available, please first use ``decode.py`` to tune ``--epoch --avg``
  and select the best combination with ``--use-averaged-model 1``.
 .. note::
  You will see the following log output:
  .. literalinclude:: ./code/export-conv-emformer-transducer-for-ncnn-output.txt
  The log shows the model has ``75490012`` parameters, i.e., ``~75 M``.
  .. code-block::
    ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
    -rw-r--r-- 1 kuangfangjun root 289M Jan 11 12:05 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
  You can see that the file size of the pre-trained model is ``289 MB``, which
  is roughly ``75490012*4/1024/1024 = 287.97 MB``.
 After running ``conv_emformer_transducer_stateless2/export-for-ncnn.py``,
 we will get the following files:
 .. code-block:: bash
  ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*pnnx*
  -rw-r--r-- 1 kuangfangjun root 1010K Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.pt
  -rw-r--r-- 1 kuangfangjun root  283M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.pt
  -rw-r--r-- 1 kuangfangjun root  3.0M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.pt
 .. _conv-emformer-step-3-export-torchscript-model-via-pnnx:
 3. Export torchscript model via pnnx
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. hint::
  Make sure you have set up the ``PATH`` environment variable. Otherwise,
  it will throw an error saying that ``pnnx`` could not be found.
 Now, it's time to export our models to `ncnn`_ via ``pnnx``.
 .. code-block::
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
  pnnx ./encoder_jit_trace-pnnx.pt
  pnnx ./decoder_jit_trace-pnnx.pt
  pnnx ./joiner_jit_trace-pnnx.pt
 It will generate the following files:
 .. code-block:: bash
  ls -lh  icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*ncnn*{bin,param}
  -rw-r--r-- 1 kuangfangjun root 503K Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  437 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
  -rw-r--r-- 1 kuangfangjun root 142M Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  79K Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
  -rw-r--r-- 1 kuangfangjun root 1.5M Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  488 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
 There are two types of files:
 - ``param``: It is a text file containing the model architectures. You can
  use a text editor to view its content.
 - ``bin``: It is a binary file containing the model parameters.
 We compare the file sizes of the models below before and after converting via ``pnnx``:
 .. see https://tableconvert.com/restructuredtext-generator
 +----------------------------------+------------+
 | File name                        | File size  |
 +==================================+============+
 | encoder_jit_trace-pnnx.pt        | 283 MB     |
 +----------------------------------+------------+
 | decoder_jit_trace-pnnx.pt        | 1010 KB    |
 +----------------------------------+------------+
 | joiner_jit_trace-pnnx.pt         | 3.0 MB     |
 +----------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.bin  | 142 MB     |
 +----------------------------------+------------+
 | decoder_jit_trace-pnnx.ncnn.bin  | 503 KB     |
 +----------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.bin   | 1.5 MB     |
 +----------------------------------+------------+
 You can see that the file sizes of the models after conversion are about one half
 of the models before conversion:
  - encoder: 283 MB vs 142 MB
  - decoder: 1010 KB vs 503 KB
  - joiner: 3.0 MB vs 1.5 MB
 The reason is that by default ``pnnx`` converts ``float32`` parameters
 to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
 for ``float16``. Thus, each file is ``half the size`` after conversion.
 .. hint::
  If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
  won't convert ``float32`` to ``float16``.
 4. Test the exported models in icefall
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. note::
  We assume you have set up the environment variable ``PYTHONPATH`` when
  building `ncnn`_.
 Now we have successfully converted our pre-trained model to `ncnn`_ format.
 The generated 6 files are what we need. You can use the following code to
 test the converted models:
 .. code-block:: bash
  ./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
    --tokens ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt \
    --encoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param \
    --encoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin \
    --decoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param \
    --decoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin \
    --joiner-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param \
    --joiner-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin \
    ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
 .. hint::
  `ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
  only 1 wave file as input.
 The output is given below:
 .. literalinclude:: ./code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt
 Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
 .. _conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn:
 5. Modify the exported encoder for sherpa-ncnn
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 In order to use the exported models in `sherpa-ncnn`_, we have to modify
 ``encoder_jit_trace-pnnx.ncnn.param``.
 Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
 .. code-block::
  7767517
  1060 1342
  Input                    in0                      0 1 in0
 **Explanation** of the above three lines:
  1. ``7767517``, it is a magic number and should not be changed.
  2. ``1060 1342``, the first number ``1060`` specifies the number of layers
     in this file, while ``1342`` specifies the number of intermediate outputs
     of this file.
  3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
     is the layer name of this layer; ``0`` means this layer has no input;
     ``1`` means this layer has one output; ``in0`` is the output name of
     this layer.
 We need to add 1 extra line and also increment the number of layers.
 The result looks like below:
 .. code-block:: bash
  7767517
  1061 1342
  SherpaMetaData           sherpa_meta_data1        0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
  Input                    in0                      0 1 in0
 **Explanation**
  1. ``7767517``, it is still the same
  2. ``1061 1342``, we have added an extra layer, so we need to update ``1060`` to ``1061``.
     We don't need to change ``1342`` since the newly added layer has no inputs or outputs.
  3. ``SherpaMetaData  sherpa_meta_data1  0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512``
     This line is newly added. Its explanation is given below:
      - ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
      - ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
      - ``0 0`` means this layer has no inputs or outputs. Must be ``0 0``.
      - ``0=1``, 0 is the key and 1 is the value. MUST be ``0=1``
      - ``1=12``, 1 is the key and 12 is the value of the
        parameter ``--num-encoder-layers`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``2=32``, 2 is the key and 32 is the value of the
        parameter ``--memory-size`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``3=31``, 3 is the key and 31 is the value of the
        parameter ``--cnn-module-kernel`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``4=8``, 4 is the key and 8 is the value of the
        parameter ``--left-context-length`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``5=32``, 5 is the key and 32 is the value of the
        parameter ``--chunk-length`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``6=8``, 6 is the key and 8 is the value of the
        parameter ``--right-context-length`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``7=512``, 7 is the key and 512 is the value of the
        parameter ``--encoder-dim`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      For ease of reference, we list the key-value pairs that you need to add
      in the following table. If your model has a different setting, please
      change the values for ``SherpaMetaData`` accordingly. Otherwise, you
      will be ``SAD``.
          +------+-----------------------------+
          | key  | value                       |
          +======+=============================+
          | 0    | 1 (fixed)                   |
          +------+-----------------------------+
          | 1    | ``--num-encoder-layers``    |
          +------+-----------------------------+
          | 2    | ``--memory-size``           |
          +------+-----------------------------+
          | 3    | ``--cnn-module-kernel``     |
          +------+-----------------------------+
          | 4    | ``--left-context-length``   |
          +------+-----------------------------+
          | 5    | ``--chunk-length``          |
          +------+-----------------------------+
          | 6    | ``--right-context-length``  |
          +------+-----------------------------+
          | 7    | ``--encoder-dim``           |
          +------+-----------------------------+
  4. ``Input in0 0 1 in0``. No need to change it.
 .. caution::
  When you add a new layer ``SherpaMetaData``, please remember to update the
  number of layers. In our case, update  ``1060`` to ``1061``. Otherwise,
  you will be SAD later.
 .. hint::
  After adding the new layer ``SherpaMetaData``, you cannot use this model
  with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
  supported only in `sherpa-ncnn`_.
 .. hint::
  `ncnn`_ is very flexible. You can add new layers to it just by text-editing
  the ``param`` file! You don't need to change the ``bin`` file.
 Now you can use this model in `sherpa-ncnn`_.
 Please refer to the following documentation:
  - Linux/macOS/Windows/arm/aarch64: `<https://k2-fsa.github.io/sherpa/ncnn/install/index.html>`_
  - Android: `<https://k2-fsa.github.io/sherpa/ncnn/android/index.html>`_
  - Python: `<https://k2-fsa.github.io/sherpa/ncnn/python/index.html>`_
 We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
  - `<https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html>`_
    You can find more usages there.
 6. (Optional) int8 quantization with sherpa-ncnn
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 This step is optional.
 In this step, we describe how to quantize our model with ``int8``.
 Change :ref:`conv-emformer-step-3-export-torchscript-model-via-pnnx` to
 disable ``fp16`` when using ``pnnx``:
 .. code-block::
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
  pnnx ./encoder_jit_trace-pnnx.pt fp16=0
  pnnx ./decoder_jit_trace-pnnx.pt
  pnnx ./joiner_jit_trace-pnnx.pt fp16=0
 .. note::
  We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not
  support quantizing the decoder model yet. We will update this documentation
  once `ncnn`_ supports it. (Maybe this year, 2023).
 It will generate the following files:
 .. code-block:: bash
  ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*_jit_trace-pnnx.ncnn.{param,bin}
  -rw-r--r-- 1 kuangfangjun root 503K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  437 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
  -rw-r--r-- 1 kuangfangjun root 283M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  79K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
  -rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  488 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
 Let us compare the file sizes again:
 +----------------------------------------+------------+
 | File name                              | File size  |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.pt              | 283 MB     |
 +----------------------------------------+------------+
 | decoder_jit_trace-pnnx.pt              | 1010 KB    |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.pt               | 3.0 MB     |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB     |
 +----------------------------------------+------------+
 | decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB     |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.bin  (fp16) | 1.5 MB     |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB     |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.bin  (fp32) | 3.0 MB     |
 +----------------------------------------+------------+
 You can see that the file sizes are doubled when we disable ``fp16``.
 .. note::
  You can again use ``streaming-ncnn-decode.py`` to test the exported models.
 Next, follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
 to modify ``encoder_jit_trace-pnnx.ncnn.param``.
 Change
 .. code-block:: bash
  7767517
  1060 1342
  Input                    in0                      0 1 in0
 to
 .. code-block:: bash
  7767517
  1061 1342
  SherpaMetaData           sherpa_meta_data1        0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
  Input                    in0                      0 1 in0
 .. caution::
  Please follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
  to change the values for ``SherpaMetaData`` if your model uses a different setting.
 Next, let us compile `sherpa-ncnn`_ since we will quantize our models within
 `sherpa-ncnn`_.
 .. code-block:: bash
  # We will download sherpa-ncnn to $HOME/open-source/
  # You can change it to anywhere you like.
  cd $HOME
  mkdir -p open-source
  cd open-source
  git clone https://github.com/k2-fsa/sherpa-ncnn
  cd sherpa-ncnn
  mkdir build
  cd build
  cmake ..
  make -j 4
  ./bin/generate-int8-scale-table
  export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH
 The output of the above commands is:
 .. code-block:: bash
  (py38) kuangfangjun:build$ generate-int8-scale-table
  Please provide 10 arg. Currently given: 1
  Usage:
  generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt
  Each line in wave_filenames.txt is a path to some 16k Hz mono wave file.
 We need to create a file ``wave_filenames.txt``, in which we need to put
 some calibration wave files. For testing purposes, we use the ``test_wavs``
 from the pre-trained model repository `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_.
 .. code-block:: bash
  cd egs/librispeech/ASR
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
  cat <<EOF > wave_filenames.txt
  ../test_wavs/1089-134686-0001.wav
  ../test_wavs/1221-135766-0001.wav
  ../test_wavs/1221-135766-0002.wav
  EOF
 Now we can calculate the scales needed for quantization with the calibration data:
 .. code-block:: bash
  cd egs/librispeech/ASR
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
  generate-int8-scale-table \
    ./encoder_jit_trace-pnnx.ncnn.param \
    ./encoder_jit_trace-pnnx.ncnn.bin \
    ./decoder_jit_trace-pnnx.ncnn.param \
    ./decoder_jit_trace-pnnx.ncnn.bin \
    ./joiner_jit_trace-pnnx.ncnn.param \
    ./joiner_jit_trace-pnnx.ncnn.bin \
    ./encoder-scale-table.txt \
    ./joiner-scale-table.txt \
    ./wave_filenames.txt
 The output logs are given below:
 .. literalinclude:: ./code/generate-int-8-scale-table-for-conv-emformer.txt
 It generates the following two files:
 .. code-block:: bash
  $ ls -lh encoder-scale-table.txt joiner-scale-table.txt
  -rw-r--r-- 1 kuangfangjun root 955K Jan 11 17:28 encoder-scale-table.txt
  -rw-r--r-- 1 kuangfangjun root  18K Jan 11 17:28 joiner-scale-table.txt
 .. caution::
  The three wave files above are only for demonstration. In practice, you
  need much more calibration data to compute a reliable scale table.
 Finally, let us use the scale table to quantize our models into ``int8``.
 .. code-block:: bash
  ncnn2int8
  usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table]
 First, we quantize the encoder model:
 .. code-block:: bash
  cd egs/librispeech/ASR
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
  ncnn2int8 \
    ./encoder_jit_trace-pnnx.ncnn.param \
    ./encoder_jit_trace-pnnx.ncnn.bin \
    ./encoder_jit_trace-pnnx.ncnn.int8.param \
    ./encoder_jit_trace-pnnx.ncnn.int8.bin \
    ./encoder-scale-table.txt
 Next, we quantize the joiner model:
 .. code-block:: bash
  ncnn2int8 \
    ./joiner_jit_trace-pnnx.ncnn.param \
    ./joiner_jit_trace-pnnx.ncnn.bin \
    ./joiner_jit_trace-pnnx.ncnn.int8.param \
    ./joiner_jit_trace-pnnx.ncnn.int8.bin \
    ./joiner-scale-table.txt
 The above two commands generate the following 4 files:
 .. code-block:: bash
  -rw-r--r-- 1 kuangfangjun root  99M Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.bin
  -rw-r--r-- 1 kuangfangjun root  78K Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.param
  -rw-r--r-- 1 kuangfangjun root 774K Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.bin
  -rw-r--r-- 1 kuangfangjun root  496 Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.param
 Congratulations! You have successfully quantized your model from ``float32`` to ``int8``.
 .. caution::
  ``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs.
  You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param``
  and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like.
  For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can
  replace the following invocation:
    .. code-block::
      cd egs/librispeech/ASR
      cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
      sherpa-ncnn \
        ../data/lang_bpe_500/tokens.txt \
        ./encoder_jit_trace-pnnx.ncnn.param \
        ./encoder_jit_trace-pnnx.ncnn.bin \
        ./decoder_jit_trace-pnnx.ncnn.param \
        ./decoder_jit_trace-pnnx.ncnn.bin \
        ./joiner_jit_trace-pnnx.ncnn.param \
        ./joiner_jit_trace-pnnx.ncnn.bin \
        ../test_wavs/1089-134686-0001.wav
  with
    .. code-block::
      cd egs/librispeech/ASR
      cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
      sherpa-ncnn \
        ../data/lang_bpe_500/tokens.txt \
        ./encoder_jit_trace-pnnx.ncnn.int8.param \
        ./encoder_jit_trace-pnnx.ncnn.int8.bin \
        ./decoder_jit_trace-pnnx.ncnn.param \
        ./decoder_jit_trace-pnnx.ncnn.bin \
        ./joiner_jit_trace-pnnx.ncnn.param \
        ./joiner_jit_trace-pnnx.ncnn.bin \
        ../test_wavs/1089-134686-0001.wav
 The following table compares the file sizes again:
 +----------------------------------------+------------+
 | File name                              | File size  |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.pt              | 283 MB     |
 +----------------------------------------+------------+
 | decoder_jit_trace-pnnx.pt              | 1010 KB    |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.pt               | 3.0 MB     |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB     |
 +----------------------------------------+------------+
 | decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB     |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.bin  (fp16) | 1.5 MB     |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB     |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.bin  (fp32) | 3.0 MB     |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.int8.bin   | 99 MB      |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.int8.bin    | 774 KB     |
 +----------------------------------------+------------+
 You can see that the file sizes of the model after ``int8`` quantization
 are much smaller.
 .. hint::
    Currently, only linear layers and convolutional layers are quantized
    with ``int8``, so you don't see an exact ``4x`` reduction in file sizes.
 .. note::
  You need to test the recognition accuracy after ``int8`` quantization.
 You can find the speed comparison at `<https://github.com/k2-fsa/sherpa-ncnn/issues/44>`_.
 That's it! Have fun with `sherpa-ncnn`_!
--- a/docs/source/model-export/export-with-torch-jit-script.rst
+++ b/docs/source/model-export/export-with-torch-jit-script.rst
@ -1,7 +1,7 @@
 .. _export-model-with-torch-jit-script:
 Export model with torch.jit.script()
-===================================
+====================================
 In this section, we describe how to export a model via
 ``torch.jit.script()``.
--- a/docs/source/recipes/Non-streaming-ASR/aishell/conformer_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/conformer_ctc.rst
@ -703,7 +703,7 @@ It will show you the following message:
 HLG decoding
-^^^^^^^^^^^^
+~~~~~~~~~~~~
 .. code-block:: bash
--- a/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-conformer-ctc-tensorboard-log.jpg
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-conformer-ctc-tensorboard-log.jpg
--- a/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-tdnn-lstm-ctc-tensorboard-log.jpg
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-tdnn-lstm-ctc-tensorboard-log.jpg
--- a/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png
--- a/docs/source/recipes/Non-streaming-ASR/aishell/index.rst
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/index.rst
@ -19,4 +19,3 @@ It can be downloaded from `<https://www.openslr.org/33/>`_
   tdnn_lstm_ctc
   conformer_ctc
   stateless_transducer
--- a/docs/source/recipes/Non-streaming-ASR/aishell/stateless_transducer.rst
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/stateless_transducer.rst
--- a/docs/source/recipes/Non-streaming-ASR/aishell/tdnn_lstm_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/tdnn_lstm_ctc.rst
@ -498,7 +498,7 @@ We do provide a colab notebook for this recipe showing how to use a pre-trained
 |aishell asr conformer ctc colab notebook|
 .. |aishell asr conformer ctc colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
-   :target: https://colab.research.google.com/drive/1qULaGvXq7PCu_P61oubfz9b53JzY4H3z
+   :target: https://colab.research.google.com/drive/1jbyzYq3ytm6j2nlEt-diQm-6QVWyDDEa?usp=sharing
 **Congratulations!** You have finished the aishell ASR recipe with
 TDNN-LSTM CTC models in ``icefall``.
--- a/docs/source/recipes/Non-streaming-ASR/index.rst
+++ b/docs/source/recipes/Non-streaming-ASR/index.rst
@ -0,0 +1,10 @@
 Non Streaming ASR
 =================
 .. toctree::
   :maxdepth: 2
   aishell/index
   librispeech/index
   timit/index
   yesno/index
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/conformer_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/conformer_ctc.rst
@ -888,7 +888,7 @@ It will show you the following message:
 CTC decoding
-^^^^^^^^^^^^
+~~~~~~~~~~~~
 .. code-block:: bash
@ -926,7 +926,7 @@ Its output is:
  YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
 HLG decoding
-^^^^^^^^^^^^
+~~~~~~~~~~~~
 .. code-block:: bash
@ -966,7 +966,7 @@ The output is:
 HLG decoding + n-gram LM rescoring
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
@ -1012,7 +1012,7 @@ The output is:
 HLG decoding + n-gram LM rescoring + attention decoder rescoring
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
@ -0,0 +1,223 @@
 Distillation with HuBERT
 ========================
 This tutorial shows you how to perform knowledge distillation in `icefall`_
 with the `LibriSpeech`_ dataset. The distillation method
 used here is called "Multi Vector Quantization Knowledge Distillation" (MVQ-KD).
 Please have a look at our paper `Predicting Multi-Codebook Vector Quantization Indexes for Knowledge Distillation <https://arxiv.org/abs/2211.00508>`_
 for more details about MVQ-KD.
 .. note::
    This tutorial is based on recipe
    `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_.
    Currently, we only implement MVQ-KD in this recipe. However, MVQ-KD is theoretically applicable to all recipes
    with only minor changes needed. Feel free to try out MVQ-KD in different recipes. If you
    encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`_.
 .. note::
  We assume you have read the page :ref:`install icefall` and have set up
  the environment for `icefall`_.
 .. HINT::
  We recommend you to use a GPU or several GPUs to run this recipe.
 Data preparation
 ----------------
 We first prepare necessary training data for `LibriSpeech`_.
 This is the same as in :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
 .. hint::
   The data preparation is the same as other recipes on LibriSpeech dataset,
   if you have finished this step, you can skip to :ref:`codebook_index_preparation` directly.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 The data preparation contains several stages, you can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0 # run only stage 0
  $ ./prepare.sh --stage 2 --stop-stage 5 # run from stage 2 to stage 5
 .. HINT::
  If you have pre-downloaded the `LibriSpeech`_
  dataset and the `musan`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. NOTE::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 .. _codebook_index_preparation:
 Codebook index preparation
 --------------------------
 Here, we prepare necessary data for MVQ-KD. This requires the generation
 of codebook indexes (please read our `paper <https://arxiv.org/abs/2211.00508>`_
 if you are interested in details). In this tutorial, we use the pre-computed
 codebook indexes for convenience. The only thing you need to do is to
 run `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_.
 .. note::
  There are 5 stages in total. The first and second stages are automatically skipped
  when you choose to download the codebook indexes prepared by `icefall`_.
  Of course, you can extract and compute the codebook indexes by yourself. This
  will require you downloading a HuBERT-XL model and it can take a while for
  the extraction of codebook indexes.
 As usual, you can control the stages you want to run by specifying the following
 two options:
  - ``--stage``
  - ``--stop-stage``
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./distillation_with_hubert.sh --stage 0 --stop-stage 0 # run only stage 0
  $ ./distillation_with_hubert.sh --stage 2 --stop-stage 4 # run from stage 2 to stage 4
 Here are a few options in `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_
 you need to know before you proceed.
 - ``--full_libri`` If True, use full 960h data. Otherwise only ``train-clean-100`` will be used
 - ``--use_extracted_codebook`` If True, the first two stages will be skipped and the codebook
  indexes uploaded by us will be downloaded.
 Since we are using the pre-computed codebook indexes, we set
 ``use_extracted_codebook=True``. If you want to do full `LibriSpeech`_
 experiments, please set ``full_libri=True``.
 The following command downloads the pre-computed codebook indexes
 and prepares MVQ-augmented training manifests.
 .. code-block:: bash
  $ ./distillation_with_hubert.sh --stage 2 --stop-stage 2 # run only stage 2
 Please see the
 following screenshot for the output of an example execution.
 .. figure:: ./images/distillation_codebook.png
  :width: 800
  :alt: Downloading codebook indexes and preparing training manifest.
  :align: center
  Downloading codebook indexes and preparing training manifest.
 .. hint::
  The codebook indexes we prepared for you in this tutorial
  are extracted from the 36-th layer of a fine-tuned HuBERT-XL model
  with 8 codebooks. If you want to try other configurations, please
  set ``use_extracted_codebook=False`` and set ``embedding_layer`` and
  ``num_codebooks`` by yourself.
 Now, you should see the following files under the directory ``./data/vq_fbank_layer36_cb8``.
 .. figure:: ./images/distillation_directory.png
  :width: 800
  :alt: MVQ-augmented training manifests
  :align: center
  MVQ-augmented training manifests.
 Voilà! You are ready to perform knowledge distillation training now!
 Training
 --------
 To perform training, please run stage 3 by executing the following command.
 .. code-block:: bash
  $ ./distillation_with_hubert.sh --stage 3 --stop-stage 3 # run MVQ training
 Here is the code snippet for training:
 .. code-block:: bash
  WORLD_SIZE=$(echo ${CUDA_VISIBLE_DEVICES} | awk '{n=split($1, _, ","); print n}')
  ./pruned_transducer_stateless6/train.py \
    --manifest-dir ./data/vq_fbank_layer36_cb8 \
    --master-port 12359 \
    --full-libri $full_libri \
    --spec-aug-time-warp-factor -1 \
    --max-duration 300 \
    --world-size ${WORLD_SIZE} \
    --num-epochs 30 \
    --exp-dir $exp_dir \
    --enable-distillation True \
    --codebook-loss-scale 0.01
 There are a few arguments in the above training
 command that deserve attention.
  - ``--enable-distillation`` If True, knowledge distillation training is enabled.
  - ``--codebook-loss-scale`` The scale of the knowledge distillation loss.
  - ``--manifest-dir`` The path to the MVQ-augmented manifest.
 Decoding
 --------
 After training has finished, you can test the performance using
 the following command.
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES=0
  ./pruned_transducer_stateless6/decode.py \
    --decoding-method "modified_beam_search" \
    --epoch 30 \
    --avg 10 \
    --max-duration 200 \
    --exp-dir $exp_dir \
    --enable-distillation True
 You should get similar results as `here <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS-100hours.md#distillation-with-hubert>`_.
 That's all! Feel free to experiment with your own setups and report your results.
 If you encounter any problems during training, please open up an issue `here <https://github.com/k2-fsa/icefall/issues>`_.
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-conformer-ctc-tensorboard-log.png
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-conformer-ctc-tensorboard-log.png
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-pruned-transducer-tensorboard-log.jpg
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-pruned-transducer-tensorboard-log.jpg
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst
@ -0,0 +1,12 @@
 LibriSpeech
 ===========
 .. toctree::
   :maxdepth: 1
   tdnn_lstm_ctc
   conformer_ctc
   pruned_transducer_stateless
   zipformer_mmi
   zipformer_ctc_blankskip
   distillation
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
@ -0,0 +1,548 @@
 .. _non_streaming_librispeech_pruned_transducer_stateless:
 Pruned transducer statelessX
 ============================
 This tutorial shows you how to run a conformer transducer model
 with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 .. Note::
   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_.
   We will take pruned_transducer_stateless4 as an example in this tutorial.
 .. HINT::
  We assume you have read the page :ref:`install icefall` and have set up
  the environment for ``icefall``.
 .. HINT::
  We recommend you to use a GPU or several GPUs to run this recipe.
 .. hint::
   Please scroll down to the bottom of this page to find download links
   for pretrained models if you don't want to train a model from scratch.
 We use pruned RNN-T to compute the loss.
 .. note::
   You can find the paper about pruned RNN-T at the following address:
   `<https://arxiv.org/abs/2206.13236>`_
 The transducer model consists of 3 parts:
  - Encoder, a.k.a, the transcription network. We use a Conformer model (the reworked version by Daniel Povey)
  - Decoder, a.k.a, the prediction network. We use a stateless model consisting of
    ``nn.Embedding`` and ``nn.Conv1d``
  - Joiner, a.k.a, the joint network.
 .. caution::
   Contrary to the conventional RNN-T models, we use a stateless decoder.
   That is, it has no recurrent connections.
 Data preparation
 ----------------
 .. hint::
   The data preparation is the same as other recipes on LibriSpeech dataset,
   if you have finished this step, you can skip to ``Training`` directly.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 The data preparation contains several stages, you can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0
 means to run only stage 0.
 To run stage 2 to stage 5, use:
 .. code-block:: bash
  $ ./prepare.sh --stage 2 --stop-stage 5
 .. HINT::
  If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
  dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. NOTE::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
 Configurable options
 ~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless4/train.py --help
 shows you the training options that can be passed from the commandline.
 The following options are used quite often:
  - ``--exp-dir``
    The directory to save checkpoints, training logs and tensorboard.
  - ``--full-libri``
    If it's True, the training part uses all the training data, i.e.,
    960 hours. Otherwise, the training part uses only the subset
    ``train-clean-100``, which has 100 hours of training data.
    .. CAUTION::
      The training set is perturbed by speed with two factors: 0.9 and 1.1.
      If ``--full-libri`` is True, each epoch actually processes
      ``3x960 == 2880`` hours of data.
  - ``--num-epochs``
    It is the number of epochs to train. For instance,
    ``./pruned_transducer_stateless4/train.py --num-epochs 30`` trains for 30 epochs
    and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
    in the folder ``./pruned_transducer_stateless4/exp``.
  - ``--start-epoch``
    It's used to resume training.
    ``./pruned_transducer_stateless4/train.py --start-epoch 10`` loads the
    checkpoint ``./pruned_transducer_stateless4/exp/epoch-9.pt`` and starts
    training from epoch 10, based on the state from epoch 9.
  - ``--world-size``
    It is used for multi-GPU single-machine DDP training.
      - (a) If it is 1, then no DDP training is used.
      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
    The following shows some use cases with it.
      **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
      GPU 2 for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="0,2"
          $ ./pruned_transducer_stateless4/train.py --world-size 2
      **Use case 2**: You have 4 GPUs and you want to use all of them
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ ./pruned_transducer_stateless4/train.py --world-size 4
      **Use case 3**: You have 4 GPUs but you only want to use GPU 3
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="3"
          $ ./pruned_transducer_stateless4/train.py --world-size 1
    .. caution::
      Only multi-GPU single-machine DDP training is implemented at present.
      Multi-GPU multi-machine DDP training will be added later.
  - ``--max-duration``
    It specifies the number of seconds over all utterances in a
    batch, before **padding**.
    If you encounter CUDA OOM, please reduce it.
    .. HINT::
      Due to padding, the number of seconds of all utterances in a
      batch will usually be larger than ``--max-duration``.
      A larger value for ``--max-duration`` may cause OOM during training,
      while a smaller value may increase the training time. You have to
      tune it.
  - ``--use-fp16``
    If it is True, the model will be trained with half precision. In our experiments,
    half precision lets you train with a ``--max-duration`` that is two times larger,
    giving an almost 2X speedup.
 Pre-configured options
 ~~~~~~~~~~~~~~~~~~~~~~
 There are some training options, e.g., number of encoder layers,
 encoder dimension, decoder dimension, number of warmup steps etc,
 that are not passed from the commandline.
 They are pre-configured by the function ``get_params()`` in
 `pruned_transducer_stateless4/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/train.py>`_
 You don't need to change these pre-configured parameters. If you really need to change
 them, please modify ``./pruned_transducer_stateless4/train.py`` directly.
 .. NOTE::
  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`_ are a little different from
  other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from commandline, so that you can train models with different size with pruned_transducer_stateless5.
 Training logs
 ~~~~~~~~~~~~~
 Training logs and checkpoints are saved in ``--exp-dir`` (e.g. ``pruned_transducer_stateless4/exp``).
 You will find the following files in that directory:
  - ``epoch-1.pt``, ``epoch-2.pt``, ...
    These are checkpoint files saved at the end of each epoch, containing model
    ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless4/train.py --start-epoch 11
  - ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
    These are checkpoint files saved every ``--save-every-n`` batches,
    containing model ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless4/train.py --start-batch 436000
  - ``tensorboard/``
    This folder contains tensorBoard logs. Training loss, validation loss, learning
    rate, etc, are recorded in these logs. You can visualize them by:
      .. code-block:: bash
        $ cd pruned_transducer_stateless4/exp/tensorboard
        $ tensorboard dev upload --logdir . --description "pruned transducer training for LibriSpeech with icefall"
    It will print something like below:
      .. code-block::
        TensorFlow installation not found - running with reduced feature set.
        Upload started and will continue reading any new data as it's added to the logdir.
        To stop uploading, press Ctrl-C.
        New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/QOGSPBgsR8KzcRMmie9JGw/
        [2022-11-20T15:50:50] Started scanning logdir.
        Uploading 4468 scalars...
        [2022-11-20T15:53:02] Total uploaded: 210171 scalars, 0 tensors, 0 binary objects
        Listening for new data in logdir...
    Note there is a URL in the above output. Click it and you will see
    the following screenshot:
      .. figure:: images/librispeech-pruned-transducer-tensorboard-log.jpg
         :width: 600
         :alt: TensorBoard screenshot
         :align: center
         :target: https://tensorboard.dev/experiment/QOGSPBgsR8KzcRMmie9JGw/
         TensorBoard screenshot.
  .. hint::
    If you don't have access to google, you can use the following command
    to view the tensorboard log locally:
      .. code-block:: bash
        cd pruned_transducer_stateless4/exp/tensorboard
        tensorboard --logdir . --port 6008
    It will print the following message:
      .. code-block::
        Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
        TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
    Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
    logs.
  - ``log/log-train-xxxx``
    It is the detailed training log in text format, same as the one
    you saw printed to the console during training.
 Usage example
 ~~~~~~~~~~~~~
 You can use the following command to start the training using 6 GPUs:
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5"
  ./pruned_transducer_stateless4/train.py \
     --world-size 6 \
     --num-epochs 30 \
     --start-epoch 1 \
     --exp-dir pruned_transducer_stateless4/exp \
     --full-libri 1 \
     --max-duration 300
 Decoding
 --------
 The decoding part uses checkpoints saved by the training part, so you have
 to run the training part first.
 .. hint::
   There are two kinds of checkpoints:
    - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
      of each epoch. You can pass ``--epoch`` to
      ``pruned_transducer_stateless4/decode.py`` to use them.
    - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
      every ``--save-every-n`` batches. You can pass ``--iter`` to
      ``pruned_transducer_stateless4/decode.py`` to use them.
    We suggest that you try both types of checkpoints and choose the one
    that produces the lowest WERs.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless4/decode.py --help
 shows the options for decoding.
 The following shows two examples (for two types of checkpoints):
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for epoch in 25 20; do
      for avg in 7 5 3 1; do
        ./pruned_transducer_stateless4/decode.py \
          --epoch $epoch \
          --avg $avg \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for iter in 474000; do
      for avg in 8 10 12 14 16 18; do
        ./pruned_transducer_stateless4/decode.py \
          --iter $iter \
          --avg $avg \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. Note::
  Supported decoding methods are as follows:
    - ``greedy_search`` : It takes the symbol with largest posterior probability
      of each frame as the decoding result.
    - ``beam_search`` :  It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
      `espnet/nets/beam_search_transducer.py <https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py#L247>`_
      is used as a reference. Basically, it keeps the top-k states for each frame, and expands the kept states with their own contexts to
      next frame.
    - ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
      runs in batch mode with ``--max-sym-per-frame=1`` being hardcoded.
    - ``fast_beam_search`` : It implements graph composition between the output ``log_probs`` and
      given ``FSAs``. It is hard to describe the details in a few lines of text; you can read
      our paper in https://arxiv.org/pdf/2211.00484.pdf or our `rnnt decode code in k2 <https://github.com/k2-fsa/k2/blob/master/k2/csrc/rnnt_decode.h>`_. ``fast_beam_search`` can decode with ``FSAs`` on GPU efficiently.
    - ``fast_beam_search_LG`` : The same as ``fast_beam_search`` above, ``fast_beam_search`` uses
      a trivial graph that has only one state, while ``fast_beam_search_LG`` uses an LG graph
      (with N-gram LM).
    - ``fast_beam_search_nbest`` : It produces the decoding results as follows:
      - (1) Use ``fast_beam_search`` to get a lattice
      - (2) Select ``num_paths`` paths from the lattice using ``k2.random_paths()``
      - (3) Unique the selected paths
      - (4) Intersect the selected paths with the lattice and compute the
            shortest path from the intersection result
      - (5) The path with the largest score is used as the decoding output.
    - ``fast_beam_search_nbest_LG`` : It implements the same logic as ``fast_beam_search_nbest``, the
      only difference is that it uses ``fast_beam_search_LG`` to generate the lattice.
 Export Model
 ------------
 `pruned_transducer_stateless4/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/export.py>`_ supports exporting checkpoints from ``pruned_transducer_stateless4/exp`` in the following ways.
 Export ``model.state_dict()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Checkpoints saved by ``pruned_transducer_stateless4/train.py`` also include
 ``optimizer.state_dict()``. It is useful for resuming training. But after training,
 we are interested only in ``model.state_dict()``. You can use the following
 command to extract ``model.state_dict()``.
 .. code-block:: bash
  # Assume that --epoch 25 --avg 3 produces the smallest WER
  # (You can get such information after running ./pruned_transducer_stateless4/decode.py)
  epoch=25
  avg=3
  ./pruned_transducer_stateless4/export.py \
    --exp-dir ./pruned_transducer_stateless4/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch $epoch \
    --avg  $avg
 It will generate a file ``./pruned_transducer_stateless4/exp/pretrained.pt``.
 .. hint::
   To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless4/decode.py``,
   you can run:
   .. code-block:: bash
      cd pruned_transducer_stateless4/exp
      ln -s pretrained.pt epoch-999.pt
   And then pass ``--epoch 999 --avg 1 --use-averaged-model 0`` to
   ``./pruned_transducer_stateless4/decode.py``.
 To use the exported model with ``./pruned_transducer_stateless4/pretrained.py``, you
 can run:
 .. code-block:: bash
  ./pruned_transducer_stateless4/pretrained.py \
    --checkpoint ./pruned_transducer_stateless4/exp/pretrained.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method greedy_search \
    /path/to/foo.wav \
    /path/to/bar.wav
 Export model using ``torch.jit.script()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  ./pruned_transducer_stateless4/export.py \
    --exp-dir ./pruned_transducer_stateless4/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 25 \
    --avg 3 \
    --jit 1
 It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
 load it by ``torch.jit.load("cpu_jit.pt")``.
 Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
 are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
 .. NOTE::
   You will need this ``cpu_jit.pt`` when deploying with Sherpa framework.
 Download pretrained models
 --------------------------
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
  - `pruned_transducer_stateless <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12>`_
  - `pruned_transducer_stateless2 <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29>`_
  - `pruned_transducer_stateless4 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless4-2022-06-03>`_
  - `pruned_transducer_stateless5 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless5-2022-07-07>`_
  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models
 Deploy with Sherpa
 ------------------
 Please see `<https://k2-fsa.github.io/sherpa/python/offline_asr/conformer/librispeech.html#>`_
 for how to deploy the models in ``sherpa``.
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/tdnn_lstm_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/tdnn_lstm_ctc.rst
@ -398,7 +398,7 @@ We provide a colab notebook for decoding with pre-trained model.
 |librispeech tdnn_lstm_ctc colab notebook|
 .. |librispeech tdnn_lstm_ctc colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
-   :target: https://colab.research.google.com/drive/1kNmDXNMwREi0rZGAOIAOJo93REBuOTcd
+   :target: https://colab.research.google.com/drive/1-iSfQMp2So-We_Uu49N4AAcMInB72u9z?usp=sharing
 **Congratulations!** You have finished the TDNN-LSTM-CTC recipe on librispeech in ``icefall``.
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/zipformer_ctc_blankskip.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/zipformer_ctc_blankskip.rst
@ -0,0 +1,454 @@
 Zipformer CTC Blank Skip
 ========================
 .. hint::
   Please scroll down to the bottom of this page to find download links
   for pretrained models if you don't want to train a model from scratch.
 This tutorial shows you how to train a Zipformer model based on the guidance from 
 a co-trained CTC model using `blank skip method <https://arxiv.org/pdf/2210.16481.pdf>`_
 with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 .. note::
    We use both CTC and RNN-T loss to train. During the forward pass, the encoder output
    is first used to calculate the CTC posterior probability; then for each output frame,
    if its blank posterior is bigger than some threshold, it will be simply discarded
    from the encoder output. To prevent information loss, we also put a convolution module
    similar to the one used in conformer (referred to as “LConv”) before the frame reduction.
 Data preparation
 ----------------
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 .. note::
   We encourage you to read ``./prepare.sh``.
 The data preparation contains several stages. You can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0
 means to run only stage 0.
 To run stage 2 to stage 5, use:
 .. code-block:: bash
  $ ./prepare.sh --stage 2 --stop-stage 5
 .. hint::
  If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
  dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. note::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
 For stability, it doesn't use the blank skip method until model warm-up is finished.
 Configurable options
 ~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless7_ctc_bs/train.py --help
 shows you the training options that can be passed from the commandline.
 The following options are used quite often:
  - ``--full-libri``
    If it's True, the training part uses all the training data, i.e.,
    960 hours. Otherwise, the training part uses only the subset
    ``train-clean-100``, which has 100 hours of training data.
    .. CAUTION::
      The training set is perturbed by speed with two factors: 0.9 and 1.1.
      If ``--full-libri`` is True, each epoch actually processes
      ``3x960 == 2880`` hours of data.
  - ``--num-epochs``
    It is the number of epochs to train. For instance,
    ``./pruned_transducer_stateless7_ctc_bs/train.py --num-epochs 30`` trains for 30 epochs
    and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
    in the folder ``./pruned_transducer_stateless7_ctc_bs/exp``.
  - ``--start-epoch``
    It's used to resume training.
    ``./pruned_transducer_stateless7_ctc_bs/train.py --start-epoch 10`` loads the
    checkpoint ``./pruned_transducer_stateless7_ctc_bs/exp/epoch-9.pt`` and starts
    training from epoch 10, based on the state from epoch 9.
  - ``--world-size``
    It is used for multi-GPU single-machine DDP training.
      - (a) If it is 1, then no DDP training is used.
      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
    The following shows some use cases with it.
      **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
      GPU 2 for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="0,2"
          $ ./pruned_transducer_stateless7_ctc_bs/train.py --world-size 2
      **Use case 2**: You have 4 GPUs and you want to use all of them
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ ./pruned_transducer_stateless7_ctc_bs/train.py --world-size 4
      **Use case 3**: You have 4 GPUs but you only want to use GPU 3
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="3"
          $ ./pruned_transducer_stateless7_ctc_bs/train.py --world-size 1
    .. caution::
      Only multi-GPU single-machine DDP training is implemented at present.
      Multi-GPU multi-machine DDP training will be added later.
  - ``--max-duration``
    It specifies the number of seconds over all utterances in a
    batch, before **padding**.
    If you encounter CUDA OOM, please reduce it.
    .. HINT::
      Due to padding, the number of seconds of all utterances in a
      batch will usually be larger than ``--max-duration``.
      A larger value for ``--max-duration`` may cause OOM during training,
      while a smaller value may increase the training time. You have to
      tune it.
 Pre-configured options
 ~~~~~~~~~~~~~~~~~~~~~~
 There are some training options, e.g., weight decay,
 number of warmup steps, results dir, etc,
 that are not passed from the commandline.
 They are pre-configured by the function ``get_params()`` in
 `pruned_transducer_stateless7_ctc_bs/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/train.py>`_
 You don't need to change these pre-configured parameters. If you really need to change
 them, please modify ``./pruned_transducer_stateless7_ctc_bs/train.py`` directly.
 Training logs
 ~~~~~~~~~~~~~
 Training logs and checkpoints are saved in ``pruned_transducer_stateless7_ctc_bs/exp``.
 You will find the following files in that directory:
  - ``epoch-1.pt``, ``epoch-2.pt``, ...
    These are checkpoint files saved at the end of each epoch, containing model
    ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless7_ctc_bs/train.py --start-epoch 11
  - ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
    These are checkpoint files saved every ``--save-every-n`` batches,
    containing model ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless7_ctc_bs/train.py --start-batch 436000
  - ``tensorboard/``
    This folder contains tensorBoard logs. Training loss, validation loss, learning
    rate, etc, are recorded in these logs. You can visualize them by:
      .. code-block:: bash
        $ cd pruned_transducer_stateless7_ctc_bs/exp/tensorboard
        $ tensorboard dev upload --logdir . --description "Zipformer-CTC co-training using blank skip for LibriSpeech with icefall"
    It will print something like below:
      .. code-block::
        TensorFlow installation not found - running with reduced feature set.
        Upload started and will continue reading any new data as it's added to the logdir.
        To stop uploading, press Ctrl-C.
        New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/xyOZUKpEQm62HBIlUD4uPA/
    Note there is a URL in the above output. Click it and you will see
    tensorboard.
  .. hint::
    If you don't have access to google, you can use the following command
    to view the tensorboard log locally:
      .. code-block:: bash
        cd pruned_transducer_stateless7_ctc_bs/exp/tensorboard
        tensorboard --logdir . --port 6008
    It will print the following message:
      .. code-block::
        Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
        TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
    Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
    logs.
  - ``log/log-train-xxxx``
    It is the detailed training log in text format, same as the one
    you saw printed to the console during training.
 Usage example
 ~~~~~~~~~~~~~
 You can use the following command to start the training using 4 GPUs:
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES="0,1,2,3"
  ./pruned_transducer_stateless7_ctc_bs/train.py \
    --world-size 4 \
    --num-epochs 30 \
    --start-epoch 1 \
    --full-libri 1 \
    --exp-dir pruned_transducer_stateless7_ctc_bs/exp \
    --max-duration 600 \
    --use-fp16 1
 Decoding
 --------
 The decoding part uses checkpoints saved by the training part, so you have
 to run the training part first.
 .. hint::
   There are two kinds of checkpoints:
    - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
      of each epoch. You can pass ``--epoch`` to
      ``pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py`` to use them.
    - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
      every ``--save-every-n`` batches. You can pass ``--iter`` to
      ``pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py`` to use them.
    We suggest that you try both types of checkpoints and choose the one
    that produces the lowest WERs.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py --help
 shows the options for decoding.
 The following shows the example using ``epoch-*.pt``:
 .. code-block:: bash
    for m in greedy_search fast_beam_search modified_beam_search; do
        ./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py \
            --epoch 30 \
            --avg 13 \
            --exp-dir pruned_transducer_stateless7_ctc_bs/exp \
            --max-duration 600 \
            --decoding-method $m
    done
 To test CTC branch, you can use the following command:
 .. code-block:: bash
    for m in ctc-decoding 1best; do
        ./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py \
            --epoch 30 \
            --avg 13 \
            --exp-dir pruned_transducer_stateless7_ctc_bs/exp \
            --max-duration 600 \
            --decoding-method $m
    done
 Export models
 -------------
 `pruned_transducer_stateless7_ctc_bs/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/export.py>`_ supports exporting checkpoints from ``pruned_transducer_stateless7_ctc_bs/exp`` in the following ways.
 Export ``model.state_dict()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Checkpoints saved by ``pruned_transducer_stateless7_ctc_bs/train.py`` also include
 ``optimizer.state_dict()``. It is useful for resuming training. But after training,
 we are interested only in ``model.state_dict()``. You can use the following
 command to extract ``model.state_dict()``.
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/export.py \
    --exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 13 \
    --jit 0
 It will generate a file ``./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt``.
 .. hint::
   To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py``,
   you can run:
   .. code-block:: bash
      cd pruned_transducer_stateless7_ctc_bs/exp
      ln -s pretrained.pt epoch-9999.pt
   And then pass ``--epoch 9999 --avg 1 --use-averaged-model 0`` to
   ``./pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py``.
 To use the exported model with ``./pruned_transducer_stateless7_ctc_bs/pretrained.py``, you
 can run:
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/pretrained.py \
    --checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method greedy_search \
    /path/to/foo.wav \
    /path/to/bar.wav
 To test CTC branch using the exported model with ``./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py``:
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py \
    --checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
    --bpe-model data/lang_bpe_500/bpe.model \
    --method ctc-decoding \
    --sample-rate 16000 \
    /path/to/foo.wav \
    /path/to/bar.wav
 Export model using ``torch.jit.script()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/export.py \
    --exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 13 \
    --jit 1
 It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
 load it by ``torch.jit.load("cpu_jit.pt")``.
 Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
 are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
 To use the generated files with ``./pruned_transducer_stateless7_ctc_bs/jit_pretrained.py``:
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/jit_pretrained.py \
    --nn-model-filename ./pruned_transducer_stateless7_ctc_bs/exp/cpu_jit.pt \
    /path/to/foo.wav \
    /path/to/bar.wav
 To test CTC branch using the generated files with ``./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py``:
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py \
    --model-filename ./pruned_transducer_stateless7_ctc_bs/exp/cpu_jit.pt \
    --bpe-model data/lang_bpe_500/bpe.model \
    --method ctc-decoding \
    --sample-rate 16000 \
    /path/to/foo.wav \
    /path/to/bar.wav
 Download pretrained models
 --------------------------
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
  - trained on LibriSpeech 100h: `<https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14>`_
  - trained on LibriSpeech 960h: `<https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2023-01-29>`_
  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/zipformer_mmi.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/zipformer_mmi.rst
@ -0,0 +1,422 @@
 Zipformer MMI
 ===============
 .. hint::
   Please scroll down to the bottom of this page to find download links
   for pretrained models if you don't want to train a model from scratch.
 This tutorial shows you how to train a Zipformer MMI model
 with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 We use LF-MMI to compute the loss.
 .. note::
   You can find the document about LF-MMI training at the following address:
   `<https://github.com/k2-fsa/next-gen-kaldi-wechat/blob/master/pdf/LF-MMI-training-and-decoding-in-k2-Part-I.pdf>`_
 Data preparation
 ----------------
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 .. note::
   We encourage you to read ``./prepare.sh``.
 The data preparation contains several stages. You can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0
 means to run only stage 0.
 To run stage 2 to stage 5, use:
 .. code-block:: bash
  $ ./prepare.sh --stage 2 --stop-stage 5
 .. hint::
  If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
  dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. note::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
 For stability, it uses CTC loss for model warm-up and then switches to MMI loss.
 Configurable options
 ~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./zipformer_mmi/train.py --help
 shows you the training options that can be passed from the commandline.
 The following options are used quite often:
  - ``--full-libri``
    If it's True, the training part uses all the training data, i.e.,
    960 hours. Otherwise, the training part uses only the subset
    ``train-clean-100``, which has 100 hours of training data.
    .. CAUTION::
      The training set is perturbed by speed with two factors: 0.9 and 1.1.
      If ``--full-libri`` is True, each epoch actually processes
      ``3x960 == 2880`` hours of data.
  - ``--num-epochs``
    It is the number of epochs to train. For instance,
    ``./zipformer_mmi/train.py --num-epochs 30`` trains for 30 epochs
    and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
    in the folder ``./zipformer_mmi/exp``.
  - ``--start-epoch``
    It's used to resume training.
    ``./zipformer_mmi/train.py --start-epoch 10`` loads the
    checkpoint ``./zipformer_mmi/exp/epoch-9.pt`` and starts
    training from epoch 10, based on the state from epoch 9.
  - ``--world-size``
    It is used for multi-GPU single-machine DDP training.
      - (a) If it is 1, then no DDP training is used.
      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
    The following shows some use cases with it.
      **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
      GPU 2 for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="0,2"
          $ ./zipformer_mmi/train.py --world-size 2
      **Use case 2**: You have 4 GPUs and you want to use all of them
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ ./zipformer_mmi/train.py --world-size 4
      **Use case 3**: You have 4 GPUs but you only want to use GPU 3
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="3"
          $ ./zipformer_mmi/train.py --world-size 1
    .. caution::
      Only multi-GPU single-machine DDP training is implemented at present.
      Multi-GPU multi-machine DDP training will be added later.
  - ``--max-duration``
    It specifies the number of seconds over all utterances in a
    batch, before **padding**.
    If you encounter CUDA OOM, please reduce it.
    .. HINT::
      Due to padding, the number of seconds of all utterances in a
      batch will usually be larger than ``--max-duration``.
      A larger value for ``--max-duration`` may cause OOM during training,
      while a smaller value may increase the training time. You have to
      tune it.
 Pre-configured options
 ~~~~~~~~~~~~~~~~~~~~~~
 There are some training options, e.g., weight decay,
 number of warmup steps, results dir, etc,
 that are not passed from the commandline.
 They are pre-configured by the function ``get_params()`` in
 `zipformer_mmi/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/zipformer_mmi/train.py>`_
 You don't need to change these pre-configured parameters. If you really need to change
 them, please modify ``./zipformer_mmi/train.py`` directly.
 Training logs
 ~~~~~~~~~~~~~
 Training logs and checkpoints are saved in ``zipformer_mmi/exp``.
 You will find the following files in that directory:
  - ``epoch-1.pt``, ``epoch-2.pt``, ...
    These are checkpoint files saved at the end of each epoch, containing model
    ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
      .. code-block:: bash
        $ ./zipformer_mmi/train.py --start-epoch 11
  - ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
    These are checkpoint files saved every ``--save-every-n`` batches,
    containing model ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
      .. code-block:: bash
        $ ./zipformer_mmi/train.py --start-batch 436000
  - ``tensorboard/``
    This folder contains tensorBoard logs. Training loss, validation loss, learning
    rate, etc, are recorded in these logs. You can visualize them by:
      .. code-block:: bash
        $ cd zipformer_mmi/exp/tensorboard
        $ tensorboard dev upload --logdir . --description "Zipformer MMI training for LibriSpeech with icefall"
    It will print something like below:
      .. code-block::
        TensorFlow installation not found - running with reduced feature set.
        Upload started and will continue reading any new data as it's added to the logdir.
        To stop uploading, press Ctrl-C.
        New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/xyOZUKpEQm62HBIlUD4uPA/
    Note there is a URL in the above output. Click it and you will see
    tensorboard.
  .. hint::
    If you don't have access to google, you can use the following command
    to view the tensorboard log locally:
      .. code-block:: bash
        cd zipformer_mmi/exp/tensorboard
        tensorboard --logdir . --port 6008
    It will print the following message:
      .. code-block::
        Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
        TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
    Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
    logs.
  - ``log/log-train-xxxx``
    It is the detailed training log in text format, same as the one
    you saw printed to the console during training.
 Usage example
 ~~~~~~~~~~~~~
 You can use the following command to start the training using 4 GPUs:
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES="0,1,2,3"
  ./zipformer_mmi/train.py \
    --world-size 4 \
    --num-epochs 30 \
    --start-epoch 1 \
    --full-libri 1 \
    --exp-dir zipformer_mmi/exp \
    --max-duration 500 \
    --use-fp16 1 \
    --num-workers 2
 Decoding
 --------
 The decoding part uses checkpoints saved by the training part, so you have
 to run the training part first.
 .. hint::
   There are two kinds of checkpoints:
    - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
      of each epoch. You can pass ``--epoch`` to
      ``zipformer_mmi/decode.py`` to use them.
    - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
      every ``--save-every-n`` batches. You can pass ``--iter`` to
      ``zipformer_mmi/decode.py`` to use them.
    We suggest that you try both types of checkpoints and choose the one
    that produces the lowest WERs.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./zipformer_mmi/decode.py --help
 shows the options for decoding.
 The following shows the example using ``epoch-*.pt``:
 .. code-block:: bash
  for m in nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
    ./zipformer_mmi/decode.py \
      --epoch 30 \
      --avg 10 \
      --exp-dir ./zipformer_mmi/exp/ \
      --max-duration 100 \
      --lang-dir data/lang_bpe_500 \
      --nbest-scale 1.2 \
      --hp-scale 1.0 \
      --decoding-method $m
  done
 Export models
 -------------
 `zipformer_mmi/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/zipformer_mmi/export.py>`_ supports exporting checkpoints from ``zipformer_mmi/exp`` in the following ways.
 Export ``model.state_dict()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Checkpoints saved by ``zipformer_mmi/train.py`` also include
 ``optimizer.state_dict()``. It is useful for resuming training. But after training,
 we are interested only in ``model.state_dict()``. You can use the following
 command to extract ``model.state_dict()``.
 .. code-block:: bash
  ./zipformer_mmi/export.py \
    --exp-dir ./zipformer_mmi/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 9 \
    --jit 0
 It will generate a file ``./zipformer_mmi/exp/pretrained.pt``.
 .. hint::
   To use the generated ``pretrained.pt`` for ``zipformer_mmi/decode.py``,
   you can run:
   .. code-block:: bash
      cd zipformer_mmi/exp
      ln -s pretrained.pt epoch-9999.pt
   And then pass ``--epoch 9999 --avg 1 --use-averaged-model 0`` to
   ``./zipformer_mmi/decode.py``.
 To use the exported model with ``./zipformer_mmi/pretrained.py``, you
 can run:
 .. code-block:: bash
  ./zipformer_mmi/pretrained.py \
    --checkpoint ./zipformer_mmi/exp/pretrained.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method 1best \
    /path/to/foo.wav \
    /path/to/bar.wav
 Export model using ``torch.jit.script()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  ./zipformer_mmi/export.py \
    --exp-dir ./zipformer_mmi/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 9 \
    --jit 1
 It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
 load it by ``torch.jit.load("cpu_jit.pt")``.
 Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
 are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
 To use the generated files with ``./zipformer_mmi/jit_pretrained.py``:
 .. code-block:: bash
  ./zipformer_mmi/jit_pretrained.py \
    --nn-model-filename ./zipformer_mmi/exp/cpu_jit.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method 1best \
    /path/to/foo.wav \
    /path/to/bar.wav
 Download pretrained models
 --------------------------
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
  - `<https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-mmi-2022-12-08>`_
  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models
--- a/docs/source/recipes/Non-streaming-ASR/timit/index.rst
+++ b/docs/source/recipes/Non-streaming-ASR/timit/index.rst
@ -6,4 +6,3 @@ TIMIT
   tdnn_ligru_ctc
   tdnn_lstm_ctc
--- a/docs/source/recipes/Non-streaming-ASR/timit/tdnn_ligru_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/timit/tdnn_ligru_ctc.rst
--- a/docs/source/recipes/Non-streaming-ASR/timit/tdnn_lstm_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/timit/tdnn_lstm_ctc.rst
--- a/docs/source/recipes/Non-streaming-ASR/yesno/images/tdnn-tensorboard-log.png
+++ b/docs/source/recipes/Non-streaming-ASR/yesno/images/tdnn-tensorboard-log.png
--- a/docs/source/recipes/Non-streaming-ASR/yesno/index.rst
+++ b/docs/source/recipes/Non-streaming-ASR/yesno/index.rst
--- a/docs/source/recipes/Non-streaming-ASR/yesno/tdnn.rst
+++ b/docs/source/recipes/Non-streaming-ASR/yesno/tdnn.rst
--- a/docs/source/recipes/Streaming-ASR/index.rst
+++ b/docs/source/recipes/Streaming-ASR/index.rst
@ -0,0 +1,12 @@
 Streaming ASR
 =============
 .. toctree::
   :maxdepth: 1
   introduction
 .. toctree::
   :maxdepth: 2
   librispeech/index
--- a/docs/source/recipes/Streaming-ASR/introduction.rst
+++ b/docs/source/recipes/Streaming-ASR/introduction.rst
@ -0,0 +1,53 @@
 Introduction
 ============
 This page shows you how we implement streaming **X-former transducer** models for ASR.
 .. HINT::
   X-former transducer here means the encoder of the transducer model uses Multi-Head Attention,
   like `Conformer <https://arxiv.org/pdf/2005.08100.pdf>`_, `EmFormer <https://arxiv.org/pdf/2010.10759.pdf>`_ etc.
 Currently we have implemented two types of streaming models: one uses Conformer as the encoder, and the other uses Emformer as the encoder.
 Streaming Conformer
 -------------------
 The main idea of training a streaming model is to make the model see only limited context
 at training time. We achieve this by applying a mask to the output of self-attention.
 In icefall, we implement the streaming conformer in the same way as `WeNet <https://arxiv.org/pdf/2012.05481.pdf>`_ does.
 .. NOTE::
   The conformer-transducer recipes in LibriSpeech datasets, like, `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
   `pruned_transducer_stateless3 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless3>`_,
   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_
   all support streaming.
 .. NOTE::
   Training a streaming conformer model in ``icefall`` is almost the same as training a
   non-streaming model; all you need to do is pass several extra arguments.
   See :doc:`Pruned transducer statelessX <librispeech/pruned_transducer_stateless>` for more details.
 .. HINT::
   If you want to modify a non-streaming conformer recipe to support both streaming and non-streaming, please refer
   to `this pull request <https://github.com/k2-fsa/icefall/pull/454>`_.  After adding the code needed by streaming training,
   you have to re-train it with the extra arguments mentioned in the docs above to get a streaming model.
 Streaming Emformer
 ------------------
 The Emformer model proposed `here <https://arxiv.org/pdf/2010.10759.pdf>`_ uses more
 complicated techniques. It has a memory bank component to memorize history information.
 What's more, it also introduces right context at training time by hard-copying part of
 the input features.
 We have three variants of Emformer models in ``icefall``.
 - ``pruned_stateless_emformer_rnnt2`` using Emformer from torchaudio, see `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2>`_.
 - ``conv_emformer_transducer_stateless`` using ConvEmformer implemented by ourselves. Different from the Emformer in torchaudio,
   ConvEmformer has a convolution in each layer and uses the mechanisms in our reworked conformer model.
   See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless>`_.
 - ``conv_emformer_transducer_stateless2`` using ConvEmformer implemented by ourselves. The only difference from the above one is that
   it uses a simplified memory bank. See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_.
--- a/docs/source/recipes/Streaming-ASR/librispeech/images/librispeech-lstm-transducer-tensorboard-log.png
+++ b/docs/source/recipes/Streaming-ASR/librispeech/images/librispeech-lstm-transducer-tensorboard-log.png
--- a/docs/source/recipes/Streaming-ASR/librispeech/images/streaming-librispeech-pruned-transducer-tensorboard-log.jpg
+++ b/docs/source/recipes/Streaming-ASR/librispeech/images/streaming-librispeech-pruned-transducer-tensorboard-log.jpg
--- a/docs/source/recipes/Streaming-ASR/librispeech/index.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/index.rst
@ -4,6 +4,8 @@ LibriSpeech
 .. toctree::
   :maxdepth: 1
-   tdnn_lstm_ctc
+   pruned_transducer_stateless
-   conformer_ctc
+
   lstm_pruned_stateless_transducer
   zipformer_transducer
--- a/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst
@ -515,10 +515,10 @@ To use the generated files with ``./lstm_transducer_stateless2/jit_pretrained``:
   Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/english/server.html>`_
   for how to use the exported models in ``sherpa``.
-.. _export-model-for-ncnn:
+.. _export-lstm-transducer-model-for-ncnn:
-Export model for ncnn
+Export LSTM transducer models for ncnn
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 We support exporting pretrained LSTM transducer models to
 `ncnn <https://github.com/tencent/ncnn>`_ using
@ -531,16 +531,36 @@ First, let us install a modified version of ``ncnn``:
  git clone https://github.com/csukuangfj/ncnn
  cd ncnn
  git submodule update --recursive --init
-  python3 setup.py bdist_wheel
+
-  ls -lh dist/
+  # Note: We don't use "python setup.py install" or "pip install ." here
-  pip install ./dist/*.whl
+
  mkdir -p build-wheel
  cd build-wheel
  cmake \
    -DCMAKE_BUILD_TYPE=Release \
    -DNCNN_PYTHON=ON \
    -DNCNN_BUILD_BENCHMARK=OFF \
    -DNCNN_BUILD_EXAMPLES=OFF \
    -DNCNN_BUILD_TOOLS=ON \
    ..
  make -j4
  cd ..
  # Note: $PWD here is /path/to/ncnn
  export PYTHONPATH=$PWD/python:$PYTHONPATH
  export PATH=$PWD/tools/pnnx/build/src:$PATH
  export PATH=$PWD/build-wheel/tools/quantize:$PATH
  # now build pnnx
  cd tools/pnnx
  mkdir build
  cd build
  cmake ..
  make -j4
  export PATH=$PWD/src:$PATH
  ./src/pnnx
@ -549,6 +569,9 @@ First, let us install a modified version of ``ncnn``:
   We assume that you have added the path to the binary ``pnnx`` to the
   environment variable ``PATH``.
   We also assume that you have added ``build/tools/quantize`` to the environment
   variable ``PATH`` so that you are able to use ``ncnn2int8`` later.
 Second, let us export the model using ``torch.jit.trace()`` that is suitable
 for ``pnnx``:
@ -634,3 +657,6 @@ by visiting the following links:
 You can find more usages of the pretrained models in
 `<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/index.html>`_
 Export ConvEmformer transducer models for ncnn
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst
@ -0,0 +1,735 @@
 Pruned transducer statelessX
 ============================
 This tutorial shows you how to run a **streaming** conformer transducer model
 with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 .. Note::
   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_.
   We will take pruned_transducer_stateless4 as an example in this tutorial.
 .. HINT::
  We assume you have read the page :ref:`install icefall` and have setup
  the environment for ``icefall``.
 .. HINT::
  We recommend you to use a GPU or several GPUs to run this recipe.
 .. hint::
   Please scroll down to the bottom of this page to find download links
   for pretrained models if you don't want to train a model from scratch.
 We use pruned RNN-T to compute the loss.
 .. note::
   You can find the paper about pruned RNN-T at the following address:
   `<https://arxiv.org/abs/2206.13236>`_
 The transducer model consists of 3 parts:
  - Encoder, a.k.a, the transcription network. We use a Conformer model (the reworked version by Daniel Povey)
  - Decoder, a.k.a, the prediction network. We use a stateless model consisting of
    ``nn.Embedding`` and ``nn.Conv1d``
  - Joiner, a.k.a, the joint network.
 .. caution::
   Contrary to the conventional RNN-T models, we use a stateless decoder.
   That is, it has no recurrent connections.
 Data preparation
 ----------------
 .. hint::
   The data preparation is the same as other recipes on LibriSpeech dataset,
   if you have finished this step, you can skip to ``Training`` directly.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 The data preparation contains several stages, you can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0
 means to run only stage 0.
 To run stage 2 to stage 5, use:
 .. code-block:: bash
  $ ./prepare.sh --stage 2 --stop-stage 5
 .. HINT::
  If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
  dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. NOTE::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
 .. NOTE::
   We put the streaming and non-streaming model in one recipe, to train a streaming model you only
   need to add **4** extra options compared with training a non-streaming model. These options are
   ``--dynamic-chunk-training``, ``--num-left-chunks``, ``--causal-convolution``, ``--short-chunk-size``.
   You can see the configurable options below for their meanings or read https://arxiv.org/pdf/2012.05481.pdf for more details.
 Configurable options
 ~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless4/train.py --help
 shows you the training options that can be passed from the commandline.
 The following options are used quite often:
  - ``--exp-dir``
    The directory to save checkpoints, training logs and tensorboard.
  - ``--full-libri``
    If it's True, the training part uses all the training data, i.e.,
    960 hours. Otherwise, the training part uses only the subset
    ``train-clean-100``, which has 100 hours of training data.
    .. CAUTION::
      The training set is perturbed by speed with two factors: 0.9 and 1.1.
      If ``--full-libri`` is True, each epoch actually processes
      ``3x960 == 2880`` hours of data.
  - ``--num-epochs``
    It is the number of epochs to train. For instance,
    ``./pruned_transducer_stateless4/train.py --num-epochs 30`` trains for 30 epochs
    and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
    in the folder ``./pruned_transducer_stateless4/exp``.
  - ``--start-epoch``
    It's used to resume training.
    ``./pruned_transducer_stateless4/train.py --start-epoch 10`` loads the
    checkpoint ``./pruned_transducer_stateless4/exp/epoch-9.pt`` and starts
    training from epoch 10, based on the state from epoch 9.
  - ``--world-size``
    It is used for multi-GPU single-machine DDP training.
      - (a) If it is 1, then no DDP training is used.
      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
    The following shows some use cases with it.
      **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
      GPU 2 for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="0,2"
          $ ./pruned_transducer_stateless4/train.py --world-size 2
      **Use case 2**: You have 4 GPUs and you want to use all of them
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ ./pruned_transducer_stateless4/train.py --world-size 4
      **Use case 3**: You have 4 GPUs but you only want to use GPU 3
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="3"
          $ ./pruned_transducer_stateless4/train.py --world-size 1
    .. caution::
      Only multi-GPU single-machine DDP training is implemented at present.
      Multi-GPU multi-machine DDP training will be added later.
  - ``--max-duration``
    It specifies the number of seconds over all utterances in a
    batch, before **padding**.
    If you encounter CUDA OOM, please reduce it.
    .. HINT::
      Due to padding, the number of seconds of all utterances in a
      batch will usually be larger than ``--max-duration``.
      A larger value for ``--max-duration`` may cause OOM during training,
      while a smaller value may increase the training time. You have to
      tune it.
  - ``--use-fp16``
    If it is True, the model will train with half precision, from our experiment
    results, by using half precision you can train with two times larger ``--max-duration``
    so as to get almost 2X speed up.
  - ``--dynamic-chunk-training``
    The flag that indicates whether to train a streaming model or not, it
    **MUST** be True if you want to train a streaming model.
  - ``--short-chunk-size``
    When training a streaming attention model with chunk masking, the chunk size
    would be either max sequence length of current batch or uniformly sampled from
    (1, short_chunk_size). The default value is 25, you don't have to change it most of the time.
  - ``--num-left-chunks``
    It indicates how many chunks of left context can be seen when calculating attention.
    The default value is 4, you don't have to change it most of the time.
  - ``--causal-convolution``
    Whether to use causal convolution in the conformer encoder layers. It must
    be True when training a streaming model.
 Pre-configured options
 ~~~~~~~~~~~~~~~~~~~~~~
 There are some training options, e.g., number of encoder layers,
 encoder dimension, decoder dimension, number of warmup steps etc,
 that are not passed from the commandline.
 They are pre-configured by the function ``get_params()`` in
 `pruned_transducer_stateless4/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/train.py>`_
 You don't need to change these pre-configured parameters. If you really need to change
 them, please modify ``./pruned_transducer_stateless4/train.py`` directly.
 .. NOTE::
  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`_ are a little different from
  other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from commandline, so that you can train models with different size with pruned_transducer_stateless5.
 Training logs
 ~~~~~~~~~~~~~
 Training logs and checkpoints are saved in ``--exp-dir`` (e.g. ``pruned_transducer_stateless4/exp``).
 You will find the following files in that directory:
  - ``epoch-1.pt``, ``epoch-2.pt``, ...
    These are checkpoint files saved at the end of each epoch, containing model
    ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless4/train.py --start-epoch 11
  - ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
    These are checkpoint files saved every ``--save-every-n`` batches,
    containing model ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless4/train.py --start-batch 436000
  - ``tensorboard/``
    This folder contains tensorBoard logs. Training loss, validation loss, learning
    rate, etc, are recorded in these logs. You can visualize them by:
      .. code-block:: bash
        $ cd pruned_transducer_stateless4/exp/tensorboard
        $ tensorboard dev upload --logdir . --description "pruned transducer training for LibriSpeech with icefall"
    It will print something like below:
      .. code-block::
        TensorFlow installation not found - running with reduced feature set.
        Upload started and will continue reading any new data as it's added to the logdir.
        To stop uploading, press Ctrl-C.
        New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/97VKXf80Ru61CnP2ALWZZg/
        [2022-11-20T15:50:50] Started scanning logdir.
        Uploading 4468 scalars...
        [2022-11-20T15:53:02] Total uploaded: 210171 scalars, 0 tensors, 0 binary objects
        Listening for new data in logdir...
    Note there is a URL in the above output. Click it and you will see
    the following screenshot:
      .. figure:: images/streaming-librispeech-pruned-transducer-tensorboard-log.jpg
         :width: 600
         :alt: TensorBoard screenshot
         :align: center
         :target: https://tensorboard.dev/experiment/97VKXf80Ru61CnP2ALWZZg/
         TensorBoard screenshot.
  .. hint::
    If you don't have access to google, you can use the following command
    to view the tensorboard log locally:
      .. code-block:: bash
        cd pruned_transducer_stateless4/exp/tensorboard
        tensorboard --logdir . --port 6008
    It will print the following message:
      .. code-block::
        Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
        TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
    Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
    logs.
  - ``log/log-train-xxxx``
    It is the detailed training log in text format, same as the one
    you saw printed to the console during training.
 Usage example
 ~~~~~~~~~~~~~
 You can use the following command to start the training using 4 GPUs:
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES="0,1,2,3"
  ./pruned_transducer_stateless4/train.py \
     --world-size 4 \
     --dynamic-chunk-training 1 \
     --causal-convolution 1 \
     --num-epochs 30 \
     --start-epoch 1 \
     --exp-dir pruned_transducer_stateless4/exp \
     --full-libri 1 \
     --max-duration 300
 .. NOTE::
   Compared with training a non-streaming model, you only need to add two extra options,
   ``--dynamic-chunk-training 1``  and ``--causal-convolution 1`` .
 Decoding
 --------
 The decoding part uses checkpoints saved by the training part, so you have
 to run the training part first.
 .. hint::
   There are two kinds of checkpoints:
    - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
      of each epoch. You can pass ``--epoch`` to
      ``pruned_transducer_stateless4/decode.py`` to use them.
    - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
      every ``--save-every-n`` batches. You can pass ``--iter`` to
      ``pruned_transducer_stateless4/decode.py`` to use them.
    We suggest that you try both types of checkpoints and choose the one
    that produces the lowest WERs.
 .. tip::
    To decode a streaming model, you can use either ``simulate streaming decoding`` in ``decode.py`` or
    ``real streaming decoding`` in ``streaming_decode.py``, the difference between ``decode.py`` and
    ``streaming_decode.py`` is that, ``decode.py`` processes the whole acoustic frames at one time with masking (i.e. same as training),
    but ``streaming_decode.py`` processes the acoustic frames chunk by chunk (so it can only see limited context).
 .. NOTE::
   ``simulate streaming decoding`` in ``decode.py`` and ``real streaming decoding`` in ``streaming_decode.py`` should
   produce almost the same results given the same ``--decode-chunk-size`` and ``--left-context``.
 Simulate streaming decoding
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless4/decode.py --help
 shows the options for decoding.
 The following options are important for streaming models:
  ``--simulate-streaming``
    If you want to decode a streaming model with ``decode.py``, you **MUST** set
    ``--simulate-streaming`` to ``True``. ``simulate`` here means the acoustic frames
    are not processed frame by frame (or chunk by chunk), instead, the whole sequence
    is processed at one time with masking (the same as training).
  ``--causal-convolution``
    If True, the convolution module in encoder layers will be causal convolution.
    This **MUST** be True when decoding with a streaming model.
  ``--decode-chunk-size``
    For streaming models, we will calculate the chunk-wise attention, ``--decode-chunk-size``
    indicates the chunk length (in frames after subsampling) for chunk-wise attention.
    For ``simulate streaming decoding`` the ``decode-chunk-size`` is used to generate
    the attention mask.
  ``--left-context``
    ``--left-context`` indicates how many left context frames (after subsampling) can be seen
    for current chunk when calculating chunk-wise attention. Normally, ``left-context`` should equal
    to ``decode-chunk-size * num-left-chunks``, where ``num-left-chunks`` is the option used
    to train this model. For ``simulate streaming decoding`` the ``left-context`` is used to generate
    the attention mask.
 The following shows two examples (for the two types of checkpoints):
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for epoch in 25 20; do
      for avg in 7 5 3 1; do
        ./pruned_transducer_stateless4/decode.py \
          --epoch $epoch \
          --avg $avg \
          --simulate-streaming 1 \
          --causal-convolution 1 \
          --decode-chunk-size 16 \
          --left-context 64 \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for iter in 474000; do
      for avg in 8 10 12 14 16 18; do
        ./pruned_transducer_stateless4/decode.py \
          --iter $iter \
          --avg $avg \
          --simulate-streaming 1 \
          --causal-convolution 1 \
          --decode-chunk-size 16 \
          --left-context 64 \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 Real streaming decoding
 ~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless4/streaming_decode.py --help
 shows the options for decoding.
 The following options are important for streaming models:
  ``--decode-chunk-size``
    For streaming models, we will calculate the chunk-wise attention, ``--decode-chunk-size``
    indicates the chunk length (in frames after subsampling) for chunk-wise attention.
    For ``real streaming decoding``, we will process ``decode-chunk-size`` acoustic frames at each time.
  ``--left-context``
    ``--left-context`` indicates how many left context frames (after subsampling) can be seen
    for current chunk when calculating chunk-wise attention. Normally, ``left-context`` should equal
    to ``decode-chunk-size * num-left-chunks``, where ``num-left-chunks`` is the option used
    to train this model.
  ``--num-decode-streams``
    The number of decoding streams that can be run in parallel (very similar to the ``batch size``).
    For ``real streaming decoding``, the batches will be packed dynamically. For example, if
    ``num-decode-streams`` equals 10, then sequences 1 to 10 will be decoded first. After a while,
    suppose sequences 1 and 2 are done; then sequences 3 to 12 will be processed in parallel in a batch.
 .. NOTE::
   We also try adding ``--right-context`` in the real streaming decoding, but it seems not to benefit
   the performance for all the models, the reasons might be the training and decoding mismatch. You
   can try decoding with ``--right-context`` to see if it helps. The default value is 0.
 The following shows two examples (for the two types of checkpoints):
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for epoch in 25 20; do
      for avg in 7 5 3 1; do
        ./pruned_transducer_stateless4/streaming_decode.py \
          --epoch $epoch \
          --avg $avg \
          --decode-chunk-size 16 \
          --left-context 64 \
          --num-decode-streams 100 \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for iter in 474000; do
      for avg in 8 10 12 14 16 18; do
        ./pruned_transducer_stateless4/streaming_decode.py \
          --iter $iter \
          --avg $avg \
          --decode-chunk-size 16 \
          --left-context 64 \
          --num-decode-streams 100 \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. tip::
  Supported decoding methods are as follows:
    - ``greedy_search`` : It takes the symbol with largest posterior probability
      of each frame as the decoding result.
    - ``beam_search`` :  It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
      `espnet/nets/beam_search_transducer.py <https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py#L247>`_
      is used as a reference. Basically, it keeps topk states for each frame, and expands the kept states with their own contexts to
      next frame.
    - ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
      runs in batch mode with ``--max-sym-per-frame=1`` being hardcoded.
    - ``fast_beam_search`` : It implements graph composition between the output ``log_probs`` and
      given ``FSAs``. It is hard to describe the details in several lines of texts, you can read
      our paper in https://arxiv.org/pdf/2211.00484.pdf or our `rnnt decode code in k2 <https://github.com/k2-fsa/k2/blob/master/k2/csrc/rnnt_decode.h>`_. ``fast_beam_search`` can decode with ``FSAs`` on GPU efficiently.
    - ``fast_beam_search_LG`` : The same as ``fast_beam_search`` above, ``fast_beam_search`` uses
      a trivial graph that has only one state, while ``fast_beam_search_LG`` uses an LG graph
      (with N-gram LM).
    - ``fast_beam_search_nbest`` : It produces the decoding results as follows:
      - (1) Use ``fast_beam_search`` to get a lattice
      - (2) Select ``num_paths`` paths from the lattice using ``k2.random_paths()``
      - (3) Unique the selected paths
      - (4) Intersect the selected paths with the lattice and compute the
            shortest path from the intersection result
      - (5) The path with the largest score is used as the decoding output.
    - ``fast_beam_search_nbest_LG`` : It implements same logic as ``fast_beam_search_nbest``, the
      only difference is that it uses ``fast_beam_search_LG`` to generate the lattice.
 .. NOTE::
  The decoding methods supported by ``streaming_decode.py`` might be fewer than those in ``decode.py``. If needed,
  you can implement them yourself or file an issue in `icefall <https://github.com/k2-fsa/icefall/issues>`_ .
 Export Model
 ------------
 `pruned_transducer_stateless4/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/export.py>`_ supports exporting checkpoints from ``pruned_transducer_stateless4/exp`` in the following ways.
 Export ``model.state_dict()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Checkpoints saved by ``pruned_transducer_stateless4/train.py`` also include
 ``optimizer.state_dict()``. It is useful for resuming training. But after training,
 we are interested only in ``model.state_dict()``. You can use the following
 command to extract ``model.state_dict()``.
 .. code-block:: bash
  # Assume that --epoch 25 --avg 3 produces the smallest WER
  # (You can get such information after running ./pruned_transducer_stateless4/decode.py)
  epoch=25
  avg=3
  ./pruned_transducer_stateless4/export.py \
    --exp-dir ./pruned_transducer_stateless4/exp \
    --streaming-model 1 \
    --causal-convolution 1 \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch $epoch \
    --avg  $avg
 .. caution::
   ``--streaming-model`` and ``--causal-convolution`` must be True to export
   a streaming model.
 It will generate a file ``./pruned_transducer_stateless4/exp/pretrained.pt``.
 .. hint::
   To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless4/decode.py``,
   you can run:
   .. code-block:: bash
      cd pruned_transducer_stateless4/exp
      ln -s pretrained.pt epoch-999.pt
   And then pass ``--epoch 999 --avg 1 --use-averaged-model 0`` to
   ``./pruned_transducer_stateless4/decode.py``.
 To use the exported model with ``./pruned_transducer_stateless4/pretrained.py``, you
 can run:
 .. code-block:: bash
  ./pruned_transducer_stateless4/pretrained.py \
    --checkpoint ./pruned_transducer_stateless4/exp/pretrained.pt \
    --simulate-streaming 1 \
    --causal-convolution 1 \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method greedy_search \
    /path/to/foo.wav \
    /path/to/bar.wav
 Export model using ``torch.jit.script()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  ./pruned_transducer_stateless4/export.py \
    --exp-dir ./pruned_transducer_stateless4/exp \
    --streaming-model 1 \
    --causal-convolution 1 \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 25 \
    --avg 3 \
    --jit 1
 .. caution::
   ``--streaming-model`` and ``--causal-convolution`` must be True to export
   a streaming model.
 It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
 load it by ``torch.jit.load("cpu_jit.pt")``.
 Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
 are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
 .. NOTE::
   You will need this ``cpu_jit.pt`` when deploying with Sherpa framework.
 Download pretrained models
 --------------------------
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
  - `pruned_transducer_stateless <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless_20220625>`_
  - `pruned_transducer_stateless2 <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless2_20220625>`_
  - `pruned_transducer_stateless4 <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless4_20220625>`_
  - `pruned_transducer_stateless5 <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless5_20220729>`_
  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models
 Deploy with Sherpa
 ------------------
 Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/conformer/index.html#>`_
 for how to deploy the models in ``sherpa``.
--- a/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst
@ -0,0 +1,654 @@
 Zipformer Transducer
 ====================
 This tutorial shows you how to run a **streaming** zipformer transducer model
 with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 .. Note::
   The tutorial is suitable for `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_.
 .. HINT::
  We assume you have read the page :ref:`install icefall` and have setup
  the environment for ``icefall``.
 .. HINT::
  We recommend you to use a GPU or several GPUs to run this recipe.
 .. hint::
   Please scroll down to the bottom of this page to find download links
   for pretrained models if you don't want to train a model from scratch.
 We use pruned RNN-T to compute the loss.
 .. note::
   You can find the paper about pruned RNN-T at the following address:
   `<https://arxiv.org/abs/2206.13236>`_
 The transducer model consists of 3 parts:
  - Encoder, a.k.a, the transcription network. We use a Zipformer model (proposed by Daniel Povey)
  - Decoder, a.k.a, the prediction network. We use a stateless model consisting of
    ``nn.Embedding`` and ``nn.Conv1d``
  - Joiner, a.k.a, the joint network.
 .. caution::
   Contrary to the conventional RNN-T models, we use a stateless decoder.
   That is, it has no recurrent connections.
 Data preparation
 ----------------
 .. hint::
   The data preparation is the same as other recipes on LibriSpeech dataset,
   if you have finished this step, you can skip to ``Training`` directly.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 The data preparation contains several stages, you can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0
 means to run only stage 0.
 To run stage 2 to stage 5, use:
 .. code-block:: bash
  $ ./prepare.sh --stage 2 --stop-stage 5
 .. HINT::
  If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
  dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. NOTE::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
 Configurable options
 ~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless7_streaming/train.py --help
 shows you the training options that can be passed from the commandline.
 The following options are used quite often:
  - ``--exp-dir``
    The directory to save checkpoints, training logs and tensorboard.
  - ``--full-libri``
    If it's True, the training part uses all the training data, i.e.,
    960 hours. Otherwise, the training part uses only the subset
    ``train-clean-100``, which has 100 hours of training data.
    .. CAUTION::
      The training set is perturbed by speed with two factors: 0.9 and 1.1.
      If ``--full-libri`` is True, each epoch actually processes
      ``3x960 == 2880`` hours of data.
  - ``--num-epochs``
    It is the number of epochs to train. For instance,
    ``./pruned_transducer_stateless7_streaming/train.py --num-epochs 30`` trains for 30 epochs
    and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
    in the folder ``./pruned_transducer_stateless7_streaming/exp``.
  - ``--start-epoch``
    It's used to resume training.
    ``./pruned_transducer_stateless7_streaming/train.py --start-epoch 10`` loads the
    checkpoint ``./pruned_transducer_stateless7_streaming/exp/epoch-9.pt`` and starts
    training from epoch 10, based on the state from epoch 9.
  - ``--world-size``
    It is used for multi-GPU single-machine DDP training.
      - (a) If it is 1, then no DDP training is used.
      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
    The following shows some use cases with it.
      **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
      GPU 2 for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="0,2"
          $ ./pruned_transducer_stateless7_streaming/train.py --world-size 2
      **Use case 2**: You have 4 GPUs and you want to use all of them
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ ./pruned_transducer_stateless7_streaming/train.py --world-size 4
      **Use case 3**: You have 4 GPUs but you only want to use GPU 3
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="3"
          $ ./pruned_transducer_stateless7_streaming/train.py --world-size 1
    .. caution::
      Only multi-GPU single-machine DDP training is implemented at present.
      Multi-GPU multi-machine DDP training will be added later.
  - ``--max-duration``
    It specifies the number of seconds over all utterances in a
    batch, before **padding**.
    If you encounter CUDA OOM, please reduce it.
    .. HINT::
      Due to padding, the number of seconds of all utterances in a
      batch will usually be larger than ``--max-duration``.
      A larger value for ``--max-duration`` may cause OOM during training,
      while a smaller value may increase the training time. You have to
      tune it.
  - ``--use-fp16``
    If it is True, the model is trained with half precision. In our experiments,
    half precision lets you train with a two times larger ``--max-duration``,
    which gives an almost 2X speed-up.
    We recommend using ``--use-fp16 True``.
  - ``--short-chunk-size``
    When training a streaming attention model with chunk masking, the chunk size
    would be either max sequence length of current batch or uniformly sampled from
    (1, short_chunk_size). The default value is 50, you don't have to change it most of the time.
  - ``--num-left-chunks``
    It indicates how much left context (in chunks) can be seen when calculating attention.
    The default value is 4, you don't have to change it most of the time.
  - ``--decode-chunk-len``
    The chunk size for decoding (in frames before subsampling). It is used for validation.
    The default value is 32 (i.e., 320ms).
 Pre-configured options
 ~~~~~~~~~~~~~~~~~~~~~~
 There are some training options, e.g., number of encoder layers,
 encoder dimension, decoder dimension, number of warmup steps etc,
 that are not passed from the commandline.
 They are pre-configured by the function ``get_params()`` in
 `pruned_transducer_stateless7_streaming/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/train.py>`_
 You don't need to change these pre-configured parameters. If you really need to change
 them, please modify ``./pruned_transducer_stateless7_streaming/train.py`` directly.
 Training logs
 ~~~~~~~~~~~~~
 Training logs and checkpoints are saved in ``--exp-dir`` (e.g. ``pruned_transducer_stateless7_streaming/exp``).
 You will find the following files in that directory:
  - ``epoch-1.pt``, ``epoch-2.pt``, ...
    These are checkpoint files saved at the end of each epoch, containing model
    ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless7_streaming/train.py --start-epoch 11
  - ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
    These are checkpoint files saved every ``--save-every-n`` batches,
    containing model ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless7_streaming/train.py --start-batch 436000
  - ``tensorboard/``
    This folder contains tensorBoard logs. Training loss, validation loss, learning
    rate, etc, are recorded in these logs. You can visualize them by:
      .. code-block:: bash
        $ cd pruned_transducer_stateless7_streaming/exp/tensorboard
        $ tensorboard dev upload --logdir . --description "pruned transducer training for LibriSpeech with icefall"
  .. hint::
    If you don't have access to google, you can use the following command
    to view the tensorboard log locally:
      .. code-block:: bash
        cd pruned_transducer_stateless7_streaming/exp/tensorboard
        tensorboard --logdir . --port 6008
    It will print the following message:
      .. code-block::
        Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
        TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
    Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
    logs.
  - ``log/log-train-xxxx``
    It is the detailed training log in text format, same as the one
    you saw printed to the console during training.
 Usage example
 ~~~~~~~~~~~~~
 You can use the following command to start the training using 4 GPUs:
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES="0,1,2,3"
  ./pruned_transducer_stateless7_streaming/train.py \
    --world-size 4 \
    --num-epochs 30 \
    --start-epoch 1 \
    --use-fp16 1 \
    --exp-dir pruned_transducer_stateless7_streaming/exp \
    --full-libri 1 \
    --max-duration 550
 Decoding
 --------
 The decoding part uses checkpoints saved by the training part, so you have
 to run the training part first.
 .. hint::
   There are two kinds of checkpoints:
    - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
      of each epoch. You can pass ``--epoch`` to
      ``pruned_transducer_stateless7_streaming/decode.py`` to use them.
    - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
      every ``--save-every-n`` batches. You can pass ``--iter`` to
      ``pruned_transducer_stateless7_streaming/decode.py`` to use them.
    We suggest that you try both types of checkpoints and choose the one
    that produces the lowest WERs.
 .. tip::
    To decode a streaming model, you can use either ``simulate streaming decoding`` in ``decode.py`` or
    ``real chunk-wise streaming decoding`` in ``streaming_decode.py``. The difference between ``decode.py`` and
    ``streaming_decode.py`` is that, ``decode.py`` processes the whole acoustic frames at one time with masking (i.e. same as training),
    but ``streaming_decode.py`` processes the acoustic frames chunk by chunk.
 .. NOTE::
   ``simulate streaming decoding`` in ``decode.py`` and ``real chunk-wise streaming decoding`` in ``streaming_decode.py`` should
   produce almost the same results given the same ``--decode-chunk-len``.
 Simulate streaming decoding
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless7_streaming/decode.py --help
 shows the options for decoding.
 The following options are important for streaming models:
  ``--decode-chunk-len``
    It is the same as in ``train.py``, which specifies the chunk size for decoding (in frames before subsampling).
    The default value is 32 (i.e., 320ms).
 The following shows two examples (for the two types of checkpoints):
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for epoch in 30; do
      for avg in 12 11 10 9 8; do
        ./pruned_transducer_stateless7_streaming/decode.py \
          --epoch $epoch \
          --avg $avg \
          --decode-chunk-len 32 \
          --exp-dir pruned_transducer_stateless7_streaming/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for iter in 474000; do
      for avg in 8 10 12 14 16 18; do
        ./pruned_transducer_stateless7_streaming/decode.py \
          --iter $iter \
          --avg $avg \
          --decode-chunk-len 32 \
          --exp-dir pruned_transducer_stateless7_streaming/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 Real streaming decoding
 ~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless7_streaming/streaming_decode.py --help
 shows the options for decoding.
 The following options are important for streaming models:
  ``--decode-chunk-len``
    It is the same as in ``train.py``, which specifies the chunk size for decoding (in frames before subsampling).
    The default value is 32 (i.e., 320ms).
    For ``real streaming decoding``, we will process ``decode-chunk-len`` acoustic frames at each time.
  ``--num-decode-streams``
    The number of decoding streams that can be run in parallel (very similar to the ``batch size``).
    For ``real streaming decoding``, the batches will be packed dynamically. For example, if
    ``num-decode-streams`` equals 10, then sequences 1 to 10 will be decoded at first; after a while,
    suppose sequences 1 and 2 are done, then sequences 3 to 12 will be processed in parallel in a batch.
 The following shows two examples (for the two types of checkpoints):
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for epoch in 30; do
      for avg in 12 11 10 9 8; do
        ./pruned_transducer_stateless7_streaming/streaming_decode.py \
          --epoch $epoch \
          --avg $avg \
          --decode-chunk-len 32 \
          --num-decode-streams 100 \
          --exp-dir pruned_transducer_stateless7_streaming/exp \
          --decoding-method $m
      done
    done
  done
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for iter in 474000; do
      for avg in 8 10 12 14 16 18; do
        ./pruned_transducer_stateless7_streaming/streaming_decode.py \
          --iter $iter \
          --avg $avg \
          --decode-chunk-len 16 \
          --num-decode-streams 100 \
          --exp-dir pruned_transducer_stateless7_streaming/exp \
          --decoding-method $m
      done
    done
  done
 .. tip::
  Supported decoding methods are as follows:
    - ``greedy_search`` : It takes the symbol with largest posterior probability
      of each frame as the decoding result.
    - ``beam_search`` :  It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
      `espnet/nets/beam_search_transducer.py <https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py#L247>`_
      is used as a reference. Basically, it keeps the top-k states for each frame, and expands the kept states with their own contexts to
      next frame.
    - ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
      runs in batch mode with ``--max-sym-per-frame=1`` being hardcoded.
    - ``fast_beam_search`` : It implements graph composition between the output ``log_probs`` and
      given ``FSAs``. It is hard to describe the details in a few lines of text; you can read
      our paper in https://arxiv.org/pdf/2211.00484.pdf or our `rnnt decode code in k2 <https://github.com/k2-fsa/k2/blob/master/k2/csrc/rnnt_decode.h>`_. ``fast_beam_search`` can decode with ``FSAs`` on GPU efficiently.
    - ``fast_beam_search_LG`` : The same as ``fast_beam_search`` above, except that ``fast_beam_search`` uses
      a trivial graph that has only one state, while ``fast_beam_search_LG`` uses an LG graph
      (with N-gram LM).
    - ``fast_beam_search_nbest`` : It produces the decoding results as follows:
      - (1) Use ``fast_beam_search`` to get a lattice
      - (2) Select ``num_paths`` paths from the lattice using ``k2.random_paths()``
      - (3) Unique the selected paths
      - (4) Intersect the selected paths with the lattice and compute the
            shortest path from the intersection result
      - (5) The path with the largest score is used as the decoding output.
    - ``fast_beam_search_nbest_LG`` : It implements the same logic as ``fast_beam_search_nbest``, the
      only difference is that it uses ``fast_beam_search_LG`` to generate the lattice.
 .. NOTE::
  ``streaming_decode.py`` might support fewer decoding methods than ``decode.py``. If needed,
  you can implement them yourself or file an issue in `icefall <https://github.com/k2-fsa/icefall/issues>`_ .
 Export Model
 ------------
 Currently it supports exporting checkpoints from ``pruned_transducer_stateless7_streaming/exp`` in the following ways.
 Export ``model.state_dict()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Checkpoints saved by ``pruned_transducer_stateless7_streaming/train.py`` also include
 ``optimizer.state_dict()``. It is useful for resuming training. But after training,
 we are interested only in ``model.state_dict()``. You can use the following
 command to extract ``model.state_dict()``.
 .. code-block:: bash
  # Assume that --epoch 30 --avg 9 produces the smallest WER
  # (You can get such information after running ./pruned_transducer_stateless7_streaming/decode.py)
  epoch=30
  avg=9
  ./pruned_transducer_stateless7_streaming/export.py \
    --exp-dir ./pruned_transducer_stateless7_streaming/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch $epoch \
    --avg  $avg \
    --use-averaged-model=True \
    --decode-chunk-len 32
 It will generate a file ``./pruned_transducer_stateless7_streaming/exp/pretrained.pt``.
 .. hint::
   To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless7_streaming/decode.py``,
   you can run:
   .. code-block:: bash
      cd pruned_transducer_stateless7_streaming/exp
      ln -s pretrained.pt epoch-999.pt
   And then pass ``--epoch 999 --avg 1 --use-averaged-model 0`` to
   ``./pruned_transducer_stateless7_streaming/decode.py``.
 To use the exported model with ``./pruned_transducer_stateless7_streaming/pretrained.py``, you
 can run:
 .. code-block:: bash
  ./pruned_transducer_stateless7_streaming/pretrained.py \
    --checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method greedy_search \
    --decode-chunk-len 32 \
    /path/to/foo.wav \
    /path/to/bar.wav
 Export model using ``torch.jit.script()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  ./pruned_transducer_stateless7_streaming/export.py \
    --exp-dir ./pruned_transducer_stateless7_streaming/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 9 \
    --decode-chunk-len 32 \
    --jit 1
 .. caution::
   ``--decode-chunk-len`` is required to export a ScriptModule.
 It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
 load it by ``torch.jit.load("cpu_jit.pt")``.
 Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
 are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
 Export model using ``torch.jit.trace()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  epoch=30
  avg=9
  ./pruned_transducer_stateless7_streaming/jit_trace_export.py \
    --bpe-model data/lang_bpe_500/bpe.model \
    --use-averaged-model=True \
    --decode-chunk-len 32 \
    --exp-dir ./pruned_transducer_stateless7_streaming/exp \
    --epoch $epoch \
    --avg $avg
 .. caution::
   ``--decode-chunk-len`` is required to export a ScriptModule.
 It will generate 3 files:
  - ``./pruned_transducer_stateless7_streaming/exp/encoder_jit_trace.pt``
  - ``./pruned_transducer_stateless7_streaming/exp/decoder_jit_trace.pt``
  - ``./pruned_transducer_stateless7_streaming/exp/joiner_jit_trace.pt``
 To use the generated files with ``./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py``:
 .. code-block:: bash
  ./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py \
    --encoder-model-filename ./pruned_transducer_stateless7_streaming/exp/encoder_jit_trace.pt \
    --decoder-model-filename ./pruned_transducer_stateless7_streaming/exp/decoder_jit_trace.pt \
    --joiner-model-filename ./pruned_transducer_stateless7_streaming/exp/joiner_jit_trace.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --decode-chunk-len 32 \
    /path/to/foo.wav
 Download pretrained models
 --------------------------
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
  - `pruned_transducer_stateless7_streaming <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`_
  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models
 Deploy with Sherpa
 ------------------
 Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/conformer/index.html#>`_
 for how to deploy the models in ``sherpa``.
--- a/docs/source/recipes/index.rst
+++ b/docs/source/recipes/index.rst
@ -13,7 +13,5 @@ We may add recipes for other tasks as well in the future.
   :maxdepth: 2
   :caption: Table of Contents
-   aishell/index
+   Non-streaming-ASR/index
-   librispeech/index
+   Streaming-ASR/index
   timit/index
   yesno/index
--- a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
+++ b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
@ -87,9 +87,7 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
            )
            if "train" in partition:
                cut_set = (
-                    cut_set
+                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                    + cut_set.perturb_speed(0.9)
                    + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
@ -116,9 +114,7 @@ def get_args():
 if __name__ == "__main__":
-    formatter = (
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
--- a/egs/aidatatang_200zh/ASR/local/prepare_char.py
+++ b/egs/aidatatang_200zh/ASR/local/prepare_char.py
@ -86,9 +86,7 @@ def lexicon_to_fst_no_sil(
        cur_state = loop_state
        word = word2id[word]
-        pieces = [
+        pieces = [token2id[i] if i in token2id else token2id["<unk>"] for i in pieces]
            token2id[i] if i in token2id else token2id["<unk>"] for i in pieces
        ]
        for i in range(len(pieces) - 1):
            w = word if i == 0 else eps
@ -142,9 +140,7 @@ def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool:
    return False
-def generate_lexicon(
+def generate_lexicon(token_sym_table: Dict[str, int], words: List[str]) -> Lexicon:
    token_sym_table: Dict[str, int], words: List[str]
 ) -> Lexicon:
    """Generate a lexicon from a word list and token_sym_table.
    Args:
--- a/egs/aidatatang_200zh/ASR/local/prepare_lang.py
+++ b/egs/aidatatang_200zh/ASR/local/prepare_lang.py
@ -317,9 +317,7 @@ def lexicon_to_fst(
 def get_args():
    parser = argparse.ArgumentParser()
-    parser.add_argument(
+    parser.add_argument("--lang-dir", type=str, help="The lang dir, data/lang_phone")
        "--lang-dir", type=str, help="The lang dir, data/lang_phone"
    )
    return parser.parse_args()
--- a/egs/aidatatang_200zh/ASR/local/test_prepare_lang.py
+++ b/egs/aidatatang_200zh/ASR/local/test_prepare_lang.py
@ -88,9 +88,7 @@ def test_read_lexicon(filename: str):
    fsa.aux_labels_sym = k2.SymbolTable.from_file("words.txt")
    fsa.draw("L.pdf", title="L")
-    fsa_disambig = lexicon_to_fst(
+    fsa_disambig = lexicon_to_fst(lexicon_disambig, phone2id=phone2id, word2id=word2id)
        lexicon_disambig, phone2id=phone2id, word2id=word2id
    )
    fsa_disambig.labels_sym = k2.SymbolTable.from_file("phones.txt")
    fsa_disambig.aux_labels_sym = k2.SymbolTable.from_file("words.txt")
    fsa_disambig.draw("L_disambig.pdf", title="L_disambig")
--- a/egs/aidatatang_200zh/ASR/local/text2token.py
+++ b/egs/aidatatang_200zh/ASR/local/text2token.py
@ -56,9 +56,7 @@ def get_parser():
    parser.add_argument(
        "--skip-ncols", "-s", default=0, type=int, help="skip first n columns"
    )
-    parser.add_argument(
+    parser.add_argument("--space", default="<space>", type=str, help="space symbol")
        "--space", default="<space>", type=str, help="space symbol"
    )
    parser.add_argument(
        "--non-lang-syms",
        "-l",
@ -66,9 +64,7 @@ def get_parser():
        type=str,
        help="list of non-linguistic symobles, e.g., <NOISE> etc.",
    )
-    parser.add_argument(
+    parser.add_argument("text", type=str, default=False, nargs="?", help="input text")
        "text", type=str, default=False, nargs="?", help="input text"
    )
    parser.add_argument(
        "--trans_type",
        "-t",
@ -108,8 +104,7 @@ def token2id(
            if token_type == "lazy_pinyin":
                text = lazy_pinyin(chars_list)
                sub_ids = [
-                    token_table[txt] if txt in token_table else oov_id
+                    token_table[txt] if txt in token_table else oov_id for txt in text
                    for txt in text
                ]
                ids.append(sub_ids)
            else:  # token_type = "pinyin"
@ -135,9 +130,7 @@ def main():
    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
-        f = codecs.getreader("utf-8")(
+        f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
            sys.stdin if is_python2 else sys.stdin.buffer
        )
    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer
--- a/egs/aidatatang_200zh/ASR/prepare.sh
+++ b/egs/aidatatang_200zh/ASR/prepare.sh
@ -1,5 +1,8 @@
 #!/usr/bin/env bash
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail
 stage=-1
@ -113,4 +116,3 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
    ./local/prepare_char.py
  fi
 fi
--- a/Show More
+++ b/Show More
`@ -6,4 +6,3 @@ TIMIT`

	`tdnn_ligru_ctc`	`tdnn_ligru_ctc`
	`tdnn_lstm_ctc`	`tdnn_lstm_ctc`