Merge branch 'master' of https://github.com/k2-fsa/icefall

2025-09-12 02:24:20 +00:00 · 2023-01-15 13:44:45 -05:00 · 2023-01-15 13:44:45 -05:00 · b978c6de55
commit b978c6de55
parent 9d922ec2a0 5c8e9628cc
238 changed files with 35545 additions and 1075 deletions
--- a/.flake8
+++ b/.flake8
@ -1,7 +1,7 @@
 [flake8]
 show-source=true
 statistics=true
-max-line-length = 80
+max-line-length = 88
 per-file-ignores =
    # line too long
    icefall/diagnostics.py: E501,
@ -12,6 +12,7 @@ per-file-ignores =
    egs/librispeech/ASR/lstm_transducer_stateless*/*.py: E501, E203
    egs/librispeech/ASR/conv_emformer_transducer_stateless*/*.py: E501, E203
    egs/librispeech/ASR/conformer_ctc*/*py: E501,
    egs/librispeech/ASR/zipformer_mmi/*.py: E501, E203
    egs/librispeech/ASR/RESULTS.md: E999,
    # invalid escape sequence (caused by TeX formulas), W605
--- a/.github/scripts/run-librispeech-conformer-ctc3-2022-11-28.sh
+++ b/.github/scripts/run-librispeech-conformer-ctc3-2022-11-28.sh
@ -13,7 +13,6 @@ cd egs/librispeech/ASR
 repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conformer-ctc3-2022-11-27
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
 repo=$(basename $repo_url)
@ -23,7 +22,12 @@ soxi $repo/test_wavs/*.wav
 ls -lh $repo/test_wavs/*.wav
 pushd $repo/exp
-git lfs pull --include "data/*"
+git lfs pull --include "data/lang_bpe_500/HLG.pt"
 git lfs pull --include "data/lang_bpe_500/L.pt"
 git lfs pull --include "data/lang_bpe_500/LG.pt"
 git lfs pull --include "data/lang_bpe_500/Linv.pt"
 git lfs pull --include "data/lang_bpe_500/bpe.model"
 git lfs pull --include "data/lm/G_4_gram.pt"
 git lfs pull --include "exp/jit_trace.pt"
 git lfs pull --include "exp/pretrained.pt"
 ln -s pretrained.pt epoch-99.pt
--- a/.github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.sh
+++ b/.github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.sh
@ -193,7 +193,7 @@ if [[ x"${GITHUB_EVENT_LABEL_NAME}" == x"shallow-fusion" ]]; then
  ls -lh data
  ls -lh lstm_transducer_stateless2/exp
-  log "Decoding test-clean and test-other"
+  log "Decoding test-clean and test-other with RNN LM"
  ./lstm_transducer_stateless2/decode.py \
    --use-averaged-model 0 \
@ -201,12 +201,14 @@ if [[ x"${GITHUB_EVENT_LABEL_NAME}" == x"shallow-fusion" ]]; then
    --avg 1 \
    --exp-dir lstm_transducer_stateless2/exp \
    --max-duration 600 \
-    --decoding-method modified_beam_search_rnnlm_shallow_fusion \
+    --decoding-method modified_beam_search_lm_shallow_fusion \
    --beam 4 \
-    --rnn-lm-scale 0.3 \
+    --use-shallow-fusion 1 \
-    --rnn-lm-exp-dir $lm_repo/exp \
+    --lm-type rnn \
-    --rnn-lm-epoch 88 \
+    --lm-exp-dir $lm_repo/exp \
-    --rnn-lm-avg 1 \
+    --lm-epoch 88 \
    --lm-avg 1 \
    --lm-scale 0.3 \
    --rnn-lm-num-layers 3 \
    --rnn-lm-tie-weights 1
 fi
@ -245,11 +247,13 @@ if [[ x"${GITHUB_EVENT_LABEL_NAME}" == x"LODR" ]]; then
    --avg 1 \
    --exp-dir lstm_transducer_stateless2/exp \
    --max-duration 600 \
-    --decoding-method modified_beam_search_rnnlm_LODR \
+    --decoding-method modified_beam_search_LODR \
    --beam 4 \
-    --rnn-lm-scale 0.3 \
+    --use-shallow-fusion 1 \
-    --rnn-lm-exp-dir $lm_repo/exp \
+    --lm-type rnn \
-    --rnn-lm-epoch 88 \
+    --lm-exp-dir $lm_repo/exp \
    --lm-scale 0.4 \
    --lm-epoch 88 \
    --rnn-lm-avg 1 \
    --rnn-lm-num-layers 3 \
    --rnn-lm-tie-weights 1 \
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-2022-11-11.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-2022-11-11.sh
@ -30,6 +30,15 @@ ln -s pretrained.pt epoch-99.pt
 ls -lh *.pt
 popd
 log "Test exporting to ONNX format"
 ./pruned_transducer_stateless7/export.py \
  --exp-dir $repo/exp \
  --use-averaged-model false \
  --bpe-model $repo/data/lang_bpe_500/bpe.model \
  --epoch 99 \
  --avg 1 \
  --onnx 1
 log "Export to torchscript model"
 ./pruned_transducer_stateless7/export.py \
  --exp-dir $repo/exp \
@ -41,6 +50,27 @@ log "Export to torchscript model"
 ls -lh $repo/exp/*.pt
 log "Decode with ONNX models"
 ./pruned_transducer_stateless7/onnx_check.py \
  --jit-filename $repo/exp/cpu_jit.pt \
  --onnx-encoder-filename $repo/exp/encoder.onnx \
  --onnx-decoder-filename $repo/exp/decoder.onnx \
  --onnx-joiner-filename $repo/exp/joiner.onnx \
  --onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
  --onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
 ./pruned_transducer_stateless7/onnx_pretrained.py \
  --bpe-model $repo/data/lang_bpe_500/bpe.model \
  --encoder-model-filename $repo/exp/encoder.onnx \
  --decoder-model-filename $repo/exp/decoder.onnx \
  --joiner-model-filename $repo/exp/joiner.onnx \
  --joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
  --joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
  $repo/test_wavs/1089-134686-0001.wav \
  $repo/test_wavs/1221-135766-0001.wav \
  $repo/test_wavs/1221-135766-0002.wav
 log "Decode with models exported by torch.jit.script()"
 ./pruned_transducer_stateless7/jit_pretrained.py \
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-2022-12-01.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-2022-12-01.sh
@ -13,7 +13,6 @@ cd egs/librispeech/ASR
 repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
 repo=$(basename $repo_url)
@ -23,7 +22,12 @@ soxi $repo/test_wavs/*.wav
 ls -lh $repo/test_wavs/*.wav
 pushd $repo/exp
-git lfs pull --include "data/*"
+git lfs pull --include "data/lang_bpe_500/HLG.pt"
 git lfs pull --include "data/lang_bpe_500/L.pt"
 git lfs pull --include "data/lang_bpe_500/LG.pt"
 git lfs pull --include "data/lang_bpe_500/Linv.pt"
 git lfs pull --include "data/lang_bpe_500/bpe.model"
 git lfs pull --include "data/lm/G_4_gram.pt"
 git lfs pull --include "exp/cpu_jit.pt"
 git lfs pull --include "exp/pretrained.pt"
 ln -s pretrained.pt epoch-99.pt
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh
@ -0,0 +1,148 @@
 #!/usr/bin/env bash
 # Stop at the first command that exits with a non-zero status.
 set -e
 log() {
  # Timestamped logger (this function is from espnet): prints the current
  # time, the caller's file name, line number and function name, then the
  # message given as arguments.
  local caller_file=${BASH_SOURCE[1]##*/}
  local caller_line=${BASH_LINENO[0]}
  local caller_func=${FUNCNAME[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # All relative paths below are resolved from the LibriSpeech ASR recipe dir.
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14
 log "Downloading pre-trained model from $repo_url"
 # Clone with LFS smudging skipped; the LFS files that are actually needed
 # are pulled one by one further down.
 GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
 # Local directory name of the clone (last path component of the URL).
 repo=$(basename $repo_url)
 log "Display test files"
 tree $repo/
 soxi $repo/test_wavs/*.wav
 ls -lh $repo/test_wavs/*.wav
 pushd $repo/exp
 # Fetch only the graphs, BPE model and checkpoints used by this script.
 git lfs pull --include "data/lang_bpe_500/HLG.pt"
 git lfs pull --include "data/lang_bpe_500/L.pt"
 git lfs pull --include "data/lang_bpe_500/LG.pt"
 git lfs pull --include "data/lang_bpe_500/Linv.pt"
 git lfs pull --include "data/lang_bpe_500/bpe.model"
 git lfs pull --include "exp/cpu_jit.pt"
 git lfs pull --include "exp/pretrained.pt"
 # export.py is later run with --exp-dir $repo/exp --epoch 99 --avg 1;
 # presumably it looks for epoch-99.pt, hence this alias (confirm in export.py).
 ln -s pretrained.pt epoch-99.pt
 ls -lh *.pt
 popd
 # Export the checkpoint with --jit 1, then list the resulting *.pt files.
 log "Export to torchscript model"
 ./pruned_transducer_stateless7_ctc_bs/export.py \
  --exp-dir $repo/exp \
  --use-averaged-model false \
  --bpe-model $repo/data/lang_bpe_500/bpe.model \
  --epoch 99 \
  --avg 1 \
  --jit 1
 ls -lh $repo/exp/*.pt
 # Decode the three bundled test wavs with the exported cpu_jit.pt.
 log "Decode with models exported by torch.jit.script()"
 ./pruned_transducer_stateless7_ctc_bs/jit_pretrained.py \
  --bpe-model $repo/data/lang_bpe_500/bpe.model \
  --nn-model-filename $repo/exp/cpu_jit.pt \
  $repo/test_wavs/1089-134686-0001.wav \
  $repo/test_wavs/1221-135766-0001.wav \
  $repo/test_wavs/1221-135766-0002.wav
 # CTC decoding of the test wavs with the TorchScript model, once per method.
 for m in ctc-decoding 1best; do
  ./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py \
    --model-filename $repo/exp/cpu_jit.pt \
    --words-file $repo/data/lang_bpe_500/words.txt  \
    --HLG $repo/data/lang_bpe_500/HLG.pt \
    --bpe-model $repo/data/lang_bpe_500/bpe.model \
    --method $m \
    --sample-rate 16000 \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
 done
 # Greedy search from the raw checkpoint with --max-sym-per-frame 1, 2 and 3.
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./pruned_transducer_stateless7_ctc_bs/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
    --bpe-model $repo/data/lang_bpe_500/bpe.model \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
 done
 # Beam-search variants from the raw checkpoint, all with --beam-size 4.
 for method in modified_beam_search beam_search fast_beam_search; do
  log "$method"
  ./pruned_transducer_stateless7_ctc_bs/pretrained.py \
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
    --bpe-model $repo/data/lang_bpe_500/bpe.model \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
 done
 # CTC decoding of the test wavs from the raw checkpoint, once per method.
 for m in ctc-decoding 1best; do
  ./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py \
    --checkpoint $repo/exp/pretrained.pt \
    --words-file $repo/data/lang_bpe_500/words.txt  \
    --HLG $repo/data/lang_bpe_500/HLG.pt \
    --bpe-model $repo/data/lang_bpe_500/bpe.model \
    --method $m \
    --sample-rate 16000 \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 # Decoding of the full test sets only runs for scheduled builds or when the
 # triggering label is "run-decode".
 if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode"  ]]; then
  mkdir -p pruned_transducer_stateless7_ctc_bs/exp
  # Expose the downloaded checkpoint as epoch-999.pt to match --epoch 999 below.
  ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless7_ctc_bs/exp/epoch-999.pt
  ln -s $PWD/$repo/data/lang_bpe_500 data/
  ls -lh data
  ls -lh pruned_transducer_stateless7_ctc_bs/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  # Transducer decoding, one run per method.
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./pruned_transducer_stateless7_ctc_bs/decode.py \
      --decoding-method $method \
      --epoch 999 \
      --avg 1 \
      --use-averaged-model 0 \
      --max-duration $max_duration \
      --exp-dir pruned_transducer_stateless7_ctc_bs/exp
  done
  # CTC decoding, one run per method.
  for m in ctc-decoding 1best; do
    ./pruned_transducer_stateless7_ctc_bs/ctc_decode.py \
        --epoch 999 \
        --avg 1 \
        --exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
        --max-duration $max_duration \
        --use-averaged-model 0 \
        --decoding-method $m \
        --hlg-scale 0.6
  done
  # Drop the checkpoint link from exp/; presumably so that only decoding
  # results remain there for later workflow steps (confirm against workflow).
  rm pruned_transducer_stateless7_ctc_bs/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
@ -0,0 +1,148 @@
 #!/usr/bin/env bash
 # Stop at the first command that exits with a non-zero status.
 set -e
 log() {
  # Timestamped logger (this function is from espnet): prints the current
  # time, the caller's file name, line number and function name, then the
  # message given as arguments.
  local caller_file=${BASH_SOURCE[1]##*/}
  local caller_line=${BASH_LINENO[0]}
  local caller_func=${FUNCNAME[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # All relative paths below are resolved from the LibriSpeech ASR recipe dir.
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 # Clone with LFS smudging skipped; the LFS files that are actually needed
 # are pulled one by one further down.
 GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
 # Local directory name of the clone (last path component of the URL).
 repo=$(basename $repo_url)
 log "Display test files"
 tree $repo/
 soxi $repo/test_wavs/*.wav
 ls -lh $repo/test_wavs/*.wav
 pushd $repo/exp
 # Fetch only the BPE model and the checkpoints used by this script.
 git lfs pull --include "data/lang_bpe_500/bpe.model"
 git lfs pull --include "exp/cpu_jit.pt"
 git lfs pull --include "exp/pretrained.pt"
 git lfs pull --include "exp/encoder_jit_trace.pt"
 git lfs pull --include "exp/decoder_jit_trace.pt"
 git lfs pull --include "exp/joiner_jit_trace.pt"
 # The export scripts are later run with --exp-dir $repo/exp --epoch 99 --avg 1;
 # presumably they look for epoch-99.pt, hence this alias (confirm in export.py).
 ln -s pretrained.pt epoch-99.pt
 ls -lh *.pt
 popd
 # Export the checkpoint with --jit 1, then list the resulting *.pt files.
 log "Export to torchscript model"
 ./pruned_transducer_stateless7_streaming/export.py \
  --exp-dir $repo/exp \
  --use-averaged-model false \
  --bpe-model $repo/data/lang_bpe_500/bpe.model \
  --decode-chunk-len 32 \
  --epoch 99 \
  --avg 1 \
  --jit 1
 ls -lh $repo/exp/*.pt
 # Decode the three bundled test wavs with the exported cpu_jit.pt.
 log "Decode with models exported by torch.jit.script()"
 ./pruned_transducer_stateless7_streaming/jit_pretrained.py \
  --bpe-model $repo/data/lang_bpe_500/bpe.model \
  --nn-model-filename $repo/exp/cpu_jit.pt \
  --decode-chunk-len 32 \
  $repo/test_wavs/1089-134686-0001.wav \
  $repo/test_wavs/1221-135766-0001.wav \
  $repo/test_wavs/1221-135766-0002.wav
 # Second export path: torch.jit.trace() via jit_trace_export.py.
 log "Export to torchscript model by torch.jit.trace()"
 ./pruned_transducer_stateless7_streaming/jit_trace_export.py \
  --exp-dir $repo/exp \
  --use-averaged-model false \
  --bpe-model $repo/data/lang_bpe_500/bpe.model \
  --decode-chunk-len 32 \
  --epoch 99 \
  --avg 1
 # Decode a single test wav with the separate encoder/decoder/joiner traces.
 log "Decode with models exported by torch.jit.trace()"
 ./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py \
  --bpe-model $repo/data/lang_bpe_500/bpe.model \
  --encoder-model-filename $repo/exp/encoder_jit_trace.pt \
  --decoder-model-filename $repo/exp/decoder_jit_trace.pt \
  --joiner-model-filename $repo/exp/joiner_jit_trace.pt \
  --decode-chunk-len 32 \
  $repo/test_wavs/1089-134686-0001.wav
 # Greedy search from the raw checkpoint with --max-sym-per-frame 1, 2 and 3.
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./pruned_transducer_stateless7_streaming/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
    --bpe-model $repo/data/lang_bpe_500/bpe.model \
    --decode-chunk-len 32 \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
 done
 # Beam-search variants from the raw checkpoint, all with --beam-size 4.
 for method in modified_beam_search beam_search fast_beam_search; do
  log "$method"
  ./pruned_transducer_stateless7_streaming/pretrained.py \
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
    --bpe-model $repo/data/lang_bpe_500/bpe.model \
    --decode-chunk-len 32 \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 # Decoding of the full test sets only runs for scheduled builds or when the
 # triggering label is "run-decode".
 if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode"  ]]; then
  mkdir -p pruned_transducer_stateless7_streaming/exp
  # Expose the downloaded checkpoint as epoch-999.pt to match --epoch 999 below.
  ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless7_streaming/exp/epoch-999.pt
  ln -s $PWD/$repo/data/lang_bpe_500 data/
  ls -lh data
  ls -lh pruned_transducer_stateless7_streaming/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  num_decode_stream=200
  # Decoding with decode.py, one run per method.
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "decoding with $method"
    ./pruned_transducer_stateless7_streaming/decode.py \
      --decoding-method $method \
      --epoch 999 \
      --avg 1 \
      --use-averaged-model 0 \
      --max-duration $max_duration \
      --decode-chunk-len 32 \
      --exp-dir pruned_transducer_stateless7_streaming/exp
  done
  # Decoding with streaming_decode.py, one run per method.
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    # Every option line except the last must end with a backslash; without it
    # the shell treats the following "--exp-dir ..." line as a separate
    # command, which fails and (because of set -e) aborts the script.
    ./pruned_transducer_stateless7_streaming/streaming_decode.py \
      --decoding-method $method \
      --epoch 999 \
      --avg 1 \
      --use-averaged-model 0 \
      --decode-chunk-len 32 \
      --num-decode-streams $num_decode_stream \
      --exp-dir pruned_transducer_stateless7_streaming/exp
  done
  # Drop the checkpoint link from exp/; presumably so that only decoding
  # results remain there for later workflow steps (confirm against workflow).
  rm pruned_transducer_stateless7_streaming/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
+++ b/.github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
@ -0,0 +1,103 @@
 #!/usr/bin/env bash
 # Stop at the first command that exits with a non-zero status.
 set -e
 log() {
  # Timestamped logger (this function is from espnet): prints the current
  # time, the caller's file name, line number and function name, then the
  # message given as arguments.
  local caller_file=${BASH_SOURCE[1]##*/}
  local caller_line=${BASH_LINENO[0]}
  local caller_func=${FUNCNAME[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # All relative paths below are resolved from the LibriSpeech ASR recipe dir.
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-mmi-2022-12-08
 log "Downloading pre-trained model from $repo_url"
 # Clone with LFS smudging skipped; the LFS files that are actually needed
 # are pulled one by one further down.
 GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
 # Local directory name of the clone (last path component of the URL).
 repo=$(basename $repo_url)
 log "Display test files"
 tree $repo/
 soxi $repo/test_wavs/*.wav
 ls -lh $repo/test_wavs/*.wav
 pushd $repo/exp
 # Fetch only the LM/lexicon graphs, BPE model and checkpoints used here.
 git lfs pull --include "data/lang_bpe_500/3gram.pt"
 git lfs pull --include "data/lang_bpe_500/4gram.pt"
 git lfs pull --include "data/lang_bpe_500/L.pt"
 git lfs pull --include "data/lang_bpe_500/LG.pt"
 git lfs pull --include "data/lang_bpe_500/Linv.pt"
 git lfs pull --include "data/lang_bpe_500/bpe.model"
 git lfs pull --include "exp/cpu_jit.pt"
 git lfs pull --include "exp/pretrained.pt"
 # export.py is later run with --exp-dir $repo/exp --epoch 99 --avg 1;
 # presumably it looks for epoch-99.pt, hence this alias (confirm in export.py).
 ln -s pretrained.pt epoch-99.pt
 ls -lh *.pt
 popd
 # Export the checkpoint with --jit 1, then list the resulting *.pt files.
 log "Export to torchscript model"
 ./zipformer_mmi/export.py \
  --exp-dir $repo/exp \
  --use-averaged-model false \
  --bpe-model $repo/data/lang_bpe_500/bpe.model \
  --epoch 99 \
  --avg 1 \
  --jit 1
 ls -lh $repo/exp/*.pt
 # Decode the three bundled test wavs with the exported cpu_jit.pt.
 log "Decode with models exported by torch.jit.script()"
 ./zipformer_mmi/jit_pretrained.py \
  --bpe-model $repo/data/lang_bpe_500/bpe.model \
  --nn-model-filename $repo/exp/cpu_jit.pt \
  --lang-dir $repo/data/lang_bpe_500 \
  $repo/test_wavs/1089-134686-0001.wav \
  $repo/test_wavs/1221-135766-0001.wav \
  $repo/test_wavs/1221-135766-0002.wav
 # Decode the test wavs from the raw checkpoint, once per decoding method.
 for method in 1best nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
  log "$method"
  ./zipformer_mmi/pretrained.py \
    --method $method \
    --checkpoint $repo/exp/pretrained.pt \
    --lang-dir $repo/data/lang_bpe_500 \
    --bpe-model $repo/data/lang_bpe_500/bpe.model \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 # Decoding of the full test sets only runs for scheduled builds or when the
 # triggering label is "run-decode".
 if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode"  ]]; then
  mkdir -p zipformer_mmi/exp
  # Expose the downloaded checkpoint as epoch-999.pt to match --epoch 999 below.
  ln -s $PWD/$repo/exp/pretrained.pt zipformer_mmi/exp/epoch-999.pt
  ln -s $PWD/$repo/data/lang_bpe_500 data/
  ls -lh data
  ls -lh zipformer_mmi/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  # One decode.py run per decoding method.
  for method in 1best nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
    log "Decoding with $method"
    ./zipformer_mmi/decode.py \
      --decoding-method $method \
      --epoch 999 \
      --avg 1 \
      --use-averaged-model 0 \
      --nbest-scale 1.2 \
      --hp-scale 1.0 \
      --max-duration $max_duration \
      --lang-dir $repo/data/lang_bpe_500 \
      --exp-dir zipformer_mmi/exp
  done
  # Drop the checkpoint link from exp/; presumably so that only decoding
  # results remain there for later workflow steps (confirm against workflow).
  rm zipformer_mmi/exp/*.pt
 fi
--- a/.github/workflows/run-librispeech-2022-11-11-stateless7.yml
+++ b/.github/workflows/run-librispeech-2022-11-11-stateless7.yml
@ -39,7 +39,7 @@ concurrency:
 jobs:
  run_librispeech_2022_11_11_zipformer:
-    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+    if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
--- a/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml
+++ b/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml
@ -0,0 +1,167 @@
 # Copyright      2022  Zengwei Yao
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-12-08-zipformer-mmi
 # zipformer
 # Triggers: pushes to master, pull requests when a label is added, and a
 # daily cron schedule.
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 # Runs are grouped per git ref; a newer run cancels one still in progress.
 concurrency:
  group: run_librispeech_2022_12_08_zipformer-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_12_08_zipformer:
    # Runs for pushes and scheduled builds, or when the label that triggered
    # the event is 'ready' or 'run-decode'.
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      # Single-entry matrix: one OS and one Python version.
      matrix:
        os: [ubuntu-latest]
        python-version: [3.8]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      # Python with pip caching keyed on the requirements-ci.txt files.
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      # Installs each non-comment line of requirements-ci.txt with its own
      # pip call, then reinstalls protobuf from source (--no-binary).
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      # Restores ~/tmp/kaldifeat from the cache; the install step that follows
      # only runs on a cache miss.
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      # Restores ~/tmp/download from the cache; the download step that follows
      # only runs on a cache miss.
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      # Unconditional: manifests are prepared on every run.
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      # Restores ~/tmp/fbank-libri from the cache; features are only computed
      # by the step that follows on a cache miss.
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      # Links the fbank features into the recipe's data dir, installs
      # git-lfs/tree/sox, puts the repo root and kaldifeat on PYTHONPATH and
      # runs the test script. The event name and label name are exported as
      # environment variables for the script.
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
      # Prints, per decoding method, the log lines containing
      # "best for test-clean" / "best for test-other". Only runs for scheduled
      # builds or the 'run-decode' label. The recipe directory is
      # zipformer_mmi (underscore), matching the artifact upload path.
      - name: Display decoding results for librispeech zipformer-mmi
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./zipformer_mmi/exp
          cd zipformer_mmi
          echo "results for zipformer-mmi"
          echo "===1best==="
          find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===nbest==="
          find exp/nbest -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/nbest -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===nbest-rescoring-LG==="
          find exp/nbest-rescoring-LG -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/nbest-rescoring-LG -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===nbest-rescoring-3-gram==="
          find exp/nbest-rescoring-3-gram -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/nbest-rescoring-3-gram -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===nbest-rescoring-4-gram==="
          find exp/nbest-rescoring-4-gram -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/nbest-rescoring-4-gram -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      # Uploads the recipe's exp/ directory as an artifact, under the same
      # condition as the display step.
      # NOTE(review): the strategy matrix of this job defines only `os` and
      # `python-version`, so `matrix.torch` presumably expands to an empty
      # string; the name also hard-codes ubuntu-18.04 while the job runs on
      # ubuntu-latest. Confirm whether the artifact name should be updated.
      - name: Upload decoding results for librispeech zipformer-mmi
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer_mmi-2022-12-08
          path: egs/librispeech/ASR/zipformer_mmi/exp/
--- a/.github/workflows/run-librispeech-2022-12-15-stateless7-ctc-bs.yml
+++ b/.github/workflows/run-librispeech-2022-12-15-stateless7-ctc-bs.yml
@ -0,0 +1,163 @@
 # Copyright      2022  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-12-15-stateless7-ctc-bs
 # zipformer
 # Triggers: pushes to master, pull requests when a label is added, and a
 # daily cron schedule.
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 jobs:
  run_librispeech_2022_12_15_zipformer_ctc_bs:
    # Runs for pushes and scheduled builds, or when the label that triggered
    # the event is 'ready', 'run-decode' or 'blank-skip'.
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event.label.name == 'blank-skip' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      # Single-entry matrix: one OS and one Python version.
      matrix:
        os: [ubuntu-latest]
        python-version: [3.8]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      # Python with pip caching keyed on the requirements-ci.txt files.
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      # Installs each non-comment line of requirements-ci.txt with its own
      # pip call, then reinstalls protobuf from source (--no-binary).
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      # Restores ~/tmp/kaldifeat from the cache; the install step that follows
      # only runs on a cache miss.
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      # Restores ~/tmp/download from the cache; the download step that follows
      # only runs on a cache miss.
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      # Unconditional: manifests are prepared on every run.
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      # Restores ~/tmp/fbank-libri from the cache; features are only computed
      # by the step that follows on a cache miss.
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      # Links the fbank features into the recipe's data dir, installs
      # git-lfs/tree/sox, puts the repo root and kaldifeat on PYTHONPATH and
      # runs the test script. The event name and label name are exported as
      # environment variables for the script.
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh
      # Prints, per decoding method, the log lines containing
      # "best for test-clean" / "best for test-other". Only runs for scheduled
      # builds or the 'run-decode' label.
      - name: Display decoding results for librispeech pruned_transducer_stateless7_ctc_bs
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./pruned_transducer_stateless7_ctc_bs/exp
          cd pruned_transducer_stateless7_ctc_bs
          echo "results for pruned_transducer_stateless7_ctc_bs"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===ctc decoding==="
          find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===1best==="
          find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      # Uploads the recipe's exp/ directory as an artifact, under the same
      # condition as the display step.
      # NOTE(review): the strategy matrix of this job defines only `os` and
      # `python-version`, so `matrix.torch` presumably expands to an empty
      # string; the name also hard-codes ubuntu-18.04 while the job runs on
      # ubuntu-latest. Confirm whether the artifact name should be updated.
      - name: Upload decoding results for librispeech pruned_transducer_stateless7_ctc_bs
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-ctc-bs-2022-12-15
          path: egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/exp/
--- a/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml
+++ b/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml
@ -0,0 +1,172 @@
 # Copyright      2022  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-12-29-stateless7-streaming
 # zipformer
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 concurrency:
  group: run_librispeech_2022_12_29_zipformer_streaming-${{ github.ref }}
  cancel-in-progress: true
 jobs:
  run_librispeech_2022_12_29_zipformer_streaming:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event.label.name == 'streaming-zipformer' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        python-version: [3.8]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2022-09-25
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
      - name: Display decoding results for librispeech pruned_transducer_stateless7_streaming
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./pruned_transducer_stateless7_streaming/exp
          cd pruned_transducer_stateless7_streaming
          echo "results for pruned_transducer_stateless7_streaming"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===streaming greedy search==="
          find exp/streaming/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/streaming/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===streaming fast_beam_search==="
          find exp/streaming/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/streaming/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===streaming modified beam search==="
          find exp/streaming/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/streaming/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Upload decoding results for librispeech pruned_transducer_stateless7_streaming
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-streaming-2022-12-29
          path: egs/librispeech/ASR/pruned_transducer_stateless7_streaming/exp/
--- a/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
+++ b/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
@ -139,9 +139,10 @@ jobs:
          cd egs/librispeech/ASR
          tree lstm_transducer_stateless2/exp
          cd lstm_transducer_stateless2/exp
-          echo "===modified_beam_search_rnnlm_shallow_fusion==="
+          echo "===modified_beam_search_lm_shallow_fusion==="
-          find modified_beam_search_rnnlm_shallow_fusion  -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+          echo "===Using RNNLM==="
-          find modified_beam_search_rnnlm_shallow_fusion  -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+          find modified_beam_search_lm_shallow_fusion  -name "log-*rnn*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find modified_beam_search_lm_shallow_fusion  -name "log-*rnn*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Display decoding results for lstm_transducer_stateless2
        if: github.event.label.name == 'LODR'
@ -151,8 +152,8 @@ jobs:
          tree lstm_transducer_stateless2/exp
          cd lstm_transducer_stateless2/exp
          echo "===modified_beam_search_rnnlm_LODR==="
-          find modified_beam_search_rnnlm_LODR  -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+          find modified_beam_search_LODR  -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
-          find modified_beam_search_rnnlm_LODR  -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+          find modified_beam_search_LODR  -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Upload decoding results for lstm_transducer_stateless2
        uses: actions/upload-artifact@v2
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -113,6 +113,9 @@ jobs:
          cd ../pruned_transducer_stateless4
          pytest -v -s
          cd ../pruned_transducer_stateless7
          pytest -v -s
          cd ../transducer_stateless
          pytest -v -s
--- a/.gitignore
+++ b/.gitignore
@ -33,3 +33,4 @@ node_modules
 *.param
 *.bin
 .DS_Store
--- a/docs/README.md
+++ b/docs/README.md
@ -0,0 +1,24 @@
 ## Usage
 ```bash
 cd /path/to/icefall/docs
 pip install -r requirements.txt
 make clean
 make html
 cd build/html
 python3 -m http.server 8000
 ```
 It prints:
 ```
 Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ...
 ```
 Open your browser and go to <http://localhost:8000/> to view the generated
 documentation.
 Done!
 **Hint**: You can change the port number when starting the server.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -78,3 +78,12 @@ html_context = {
 }
 todo_include_todos = True
 rst_epilog = """
 .. _sherpa-ncnn: https://github.com/k2-fsa/sherpa-ncnn
 .. _icefall: https://github.com/k2-fsa/icefall
 .. _git-lfs: https://git-lfs.com/
 .. _ncnn: https://github.com/tencent/ncnn
 .. _LibriSpeech: https://www.openslr.org/12
 .. _musan: http://www.openslr.org/17/
 """
--- a/docs/source/faqs.rst
+++ b/docs/source/faqs.rst
@ -0,0 +1,107 @@
 Frequently Asked Questions (FAQs)
 =================================
 In this section, we collect issues reported by users and post the corresponding
 solutions.
 OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
 -----------------------------------------------------------------------------------
 One user is using the following code to install ``torch`` and ``torchaudio``:
 .. code-block:: bash
  pip install \
    torch==1.10.0+cu111 \
    torchvision==0.11.0+cu111 \
    torchaudio==0.10.0 \
    -f https://download.pytorch.org/whl/torch_stable.html
 and it throws the following error when running ``tdnn/train.py``:
 .. code-block::
  OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
 The fix is to specify the CUDA version while installing ``torchaudio``. That
 is, change ``torchaudio==0.10.0`` to ``torchaudio==0.10.0+cu111``. Therefore,
 the correct command is:
 .. code-block:: bash
  pip install \
    torch==1.10.0+cu111 \
    torchvision==0.11.0+cu111 \
    torchaudio==0.10.0+cu111 \
    -f https://download.pytorch.org/whl/torch_stable.html
 AttributeError: module 'distutils' has no attribute 'version'
 -------------------------------------------------------------
 The error log is:
 .. code-block::
  Traceback (most recent call last):
    File "./tdnn/train.py", line 14, in <module>
      from asr_datamodule import YesNoAsrDataModule
    File "/home/xxx/code/next-gen-kaldi/icefall/egs/yesno/ASR/tdnn/asr_datamodule.py", line 34, in <module>
      from icefall.dataset.datamodule import DataModule
    File "/home/xxx/code/next-gen-kaldi/icefall/icefall/__init__.py", line 3, in <module>
      from . import (
    File "/home/xxx/code/next-gen-kaldi/icefall/icefall/decode.py", line 23, in <module>
      from icefall.utils import add_eos, add_sos, get_texts
    File "/home/xxx/code/next-gen-kaldi/icefall/icefall/utils.py", line 39, in <module>
      from torch.utils.tensorboard import SummaryWriter
    File "/home/xxx/tool/miniconda3/envs/yyy/lib/python3.8/site-packages/torch/utils/tensorboard/__init__.py", line 4, in <module>
      LooseVersion = distutils.version.LooseVersion
  AttributeError: module 'distutils' has no attribute 'version'
 The fix is:
 .. code-block:: bash
  pip uninstall setuptools
  pip install setuptools==58.0.4
 ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
 --------------------------------------------------------------------------------------------
 If you are using ``conda`` and encounter the following issue:
 .. code-block::
  Traceback (most recent call last):
    File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 24, in <module>
      from _k2 import DeterminizeWeightPushingType
  ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
  During handling of the above exception, another exception occurred:
  Traceback (most recent call last):
    File "/k2-dev/yangyifan/icefall/egs/librispeech/ASR/./pruned_transducer_stateless7_ctc_bs/decode.py", line 104, in <module>
      import k2
    File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 30, in <module>
      raise ImportError(
  ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
  Note: If you're using anaconda and importing k2 on MacOS,
        you can probably fix this by setting the environment variable:
    export DYLD_LIBRARY_PATH=$CONDA_PREFIX/lib/python3.10/site-packages:$DYLD_LIBRARY_PATH
 Please first try to find where ``libpython3.10.so.1.0`` is located.
 For instance,
 .. code-block:: bash
  cd $CONDA_PREFIX/lib
  find . -name "libpython*"
 If you are able to find it inside ``$CONDA_PREFIX/lib``, please set the
 following environment variable:
 .. code-block:: bash
  export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -21,7 +21,16 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
   :caption: Contents:
   installation/index
   faqs
   model-export/index
 .. toctree::
   :maxdepth: 3
   recipes/index
 .. toctree::
   :maxdepth: 2
   contributing/index
   huggingface/index
--- a/docs/source/model-export/code/export-conv-emformer-transducer-for-ncnn-output.txt
+++ b/docs/source/model-export/code/export-conv-emformer-transducer-for-ncnn-output.txt
@ -0,0 +1,21 @@
 2023-01-11 12:15:38,677 INFO [export-for-ncnn.py:220] device: cpu
 2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:229] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_v
 alid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampl
 ing_factor': 4, 'decoder_dim': 512, 'joiner_dim': 512, 'model_warm_step': 3000, 'env_info': {'k2-version': '1.23.2', 'k2-build-type':
 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'a34171ed85605b0926eebbd0463d059431f4f74a', 'k2-git-date': 'Wed Dec 14 00:06:38 2022',
 'lhotse-version': '1.12.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': False, 'torch-cuda-vers
 ion': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'fix-stateless3-train-2022-12-27', 'icefall-git-sha1': '530e8a1-dirty', '
 icefall-git-date': 'Tue Dec 27 13:59:18 2022', 'icefall-path': '/star-fj/fangjun/open-source/icefall', 'k2-path': '/star-fj/fangjun/op
 en-source/k2/k2/python/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/open-source/lhotse/lhotse/__init__.py', 'hostname': 'de-74279
 -k2-train-3-1220120619-7695ff496b-s9n4w', 'IP address': '127.0.0.1'}, 'epoch': 30, 'iter': 0, 'avg': 1, 'exp_dir': PosixPath('icefa
 ll-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp'), 'bpe_model': './icefall-asr-librispeech-conv-emformer-transdu
 cer-stateless2-2022-07-05//data/lang_bpe_500/bpe.model', 'jit': False, 'context_size': 2, 'use_averaged_model': False, 'encoder_dim':
 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'cnn_module_kernel': 31, 'left_context_length': 32, 'chunk_length'
 : 32, 'right_context_length': 8, 'memory_size': 32, 'blank_id': 0, 'vocab_size': 500}
 2023-01-11 12:15:38,681 INFO [export-for-ncnn.py:231] About to create model
 2023-01-11 12:15:40,053 INFO [checkpoint.py:112] Loading checkpoint from icefall-asr-librispeech-conv-emformer-transducer-stateless2-2
 022-07-05/exp/epoch-30.pt
 2023-01-11 12:15:40,708 INFO [export-for-ncnn.py:315] Number of model parameters: 75490012
 2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:318] Using torch.jit.trace()
 2023-01-11 12:15:41,681 INFO [export-for-ncnn.py:320] Exporting encoder
 2023-01-11 12:15:41,682 INFO [export-for-ncnn.py:149] chunk_length: 32, right_context_length: 8
--- a/docs/source/model-export/code/generate-int-8-scale-table-for-conv-emformer.txt
+++ b/docs/source/model-export/code/generate-int-8-scale-table-for-conv-emformer.txt
@ -0,0 +1,104 @@
 Don't Use GPU. has_gpu: 0, config.use_vulkan_compute: 1
 num encoder conv layers: 88
 num joiner conv layers: 3
 num files: 3
 Processing ../test_wavs/1089-134686-0001.wav
 Processing ../test_wavs/1221-135766-0001.wav
 Processing ../test_wavs/1221-135766-0002.wav
 Processing ../test_wavs/1089-134686-0001.wav
 Processing ../test_wavs/1221-135766-0001.wav
 Processing ../test_wavs/1221-135766-0002.wav
 ----------encoder----------
 conv_87                                  : max = 15.942385        threshold = 15.938493        scale = 7.968131
 conv_88                                  : max = 35.442448        threshold = 15.549335        scale = 8.167552
 conv_89                                  : max = 23.228289        threshold = 8.001738         scale = 15.871552
 linear_90                                : max = 3.976146         threshold = 1.101789         scale = 115.267128
 linear_91                                : max = 6.962030         threshold = 5.162033         scale = 24.602713
 linear_92                                : max = 12.323041        threshold = 3.853959         scale = 32.953129
 linear_94                                : max = 6.905416         threshold = 4.648006         scale = 27.323545
 linear_93                                : max = 6.905416         threshold = 5.474093         scale = 23.200188
 linear_95                                : max = 1.888012         threshold = 1.403563         scale = 90.483986
 linear_96                                : max = 6.856741         threshold = 5.398679         scale = 23.524273
 linear_97                                : max = 9.635942         threshold = 2.613655         scale = 48.590950
 linear_98                                : max = 6.460340         threshold = 5.670146         scale = 22.398010
 linear_99                                : max = 9.532276         threshold = 2.585537         scale = 49.119396
 linear_101                               : max = 6.585871         threshold = 5.719224         scale = 22.205809
 linear_100                               : max = 6.585871         threshold = 5.751382         scale = 22.081648
 linear_102                               : max = 1.593344         threshold = 1.450581         scale = 87.551147
 linear_103                               : max = 6.592681         threshold = 5.705824         scale = 22.257959
 linear_104                               : max = 8.752957         threshold = 1.980955         scale = 64.110489
 linear_105                               : max = 6.696240         threshold = 5.877193         scale = 21.608953
 linear_106                               : max = 9.059659         threshold = 2.643138         scale = 48.048950
 linear_108                               : max = 6.975461         threshold = 4.589567         scale = 27.671457
 linear_107                               : max = 6.975461         threshold = 6.190381         scale = 20.515701
 linear_109                               : max = 3.710759         threshold = 2.305635         scale = 55.082436
 linear_110                               : max = 7.531228         threshold = 5.731162         scale = 22.159557
 linear_111                               : max = 10.528083        threshold = 2.259322         scale = 56.211544
 linear_112                               : max = 8.148807         threshold = 5.500842         scale = 23.087374
 linear_113                               : max = 8.592566         threshold = 1.948851         scale = 65.166611
 linear_115                               : max = 8.437109         threshold = 5.608947         scale = 22.642395
 linear_114                               : max = 8.437109         threshold = 6.193942         scale = 20.503904
 linear_116                               : max = 3.966980         threshold = 3.200896         scale = 39.676392
 linear_117                               : max = 9.451303         threshold = 6.061664         scale = 20.951344
 linear_118                               : max = 12.077262        threshold = 3.965800         scale = 32.023804
 linear_119                               : max = 9.671615         threshold = 4.847613         scale = 26.198460
 linear_120                               : max = 8.625638         threshold = 3.131427         scale = 40.556595
 linear_122                               : max = 10.274080        threshold = 4.888716         scale = 25.978189
 linear_121                               : max = 10.274080        threshold = 5.420480         scale = 23.429659
 linear_123                               : max = 4.826197         threshold = 3.599617         scale = 35.281532
 linear_124                               : max = 11.396383        threshold = 7.325849         scale = 17.335875
 linear_125                               : max = 9.337198         threshold = 3.941410         scale = 32.221970
 linear_126                               : max = 9.699965         threshold = 4.842878         scale = 26.224073
 linear_127                               : max = 8.775370         threshold = 3.884215         scale = 32.696438
 linear_129                               : max = 9.872276         threshold = 4.837319         scale = 26.254213
 linear_128                               : max = 9.872276         threshold = 7.180057         scale = 17.687883
 linear_130                               : max = 4.150427         threshold = 3.454298         scale = 36.765789
 linear_131                               : max = 11.112692        threshold = 7.924847         scale = 16.025545
 linear_132                               : max = 11.852893        threshold = 3.116593         scale = 40.749626
 linear_133                               : max = 11.517084        threshold = 5.024665         scale = 25.275314
 linear_134                               : max = 10.683807        threshold = 3.878618         scale = 32.743618
 linear_136                               : max = 12.421055        threshold = 6.322729         scale = 20.086264
 linear_135                               : max = 12.421055        threshold = 5.309880         scale = 23.917679
 linear_137                               : max = 4.827781         threshold = 3.744595         scale = 33.915554
 linear_138                               : max = 14.422395        threshold = 7.742882         scale = 16.402161
 linear_139                               : max = 8.527538         threshold = 3.866123         scale = 32.849449
 linear_140                               : max = 12.128619        threshold = 4.657793         scale = 27.266134
 linear_141                               : max = 9.839593         threshold = 3.845993         scale = 33.021378
 linear_143                               : max = 12.442304        threshold = 7.099039         scale = 17.889746
 linear_142                               : max = 12.442304        threshold = 5.325038         scale = 23.849592
 linear_144                               : max = 5.929444         threshold = 5.618206         scale = 22.605080
 linear_145                               : max = 13.382126        threshold = 9.321095         scale = 13.625010
 linear_146                               : max = 9.894987         threshold = 3.867645         scale = 32.836517
 linear_147                               : max = 10.915313        threshold = 4.906028         scale = 25.886522
 linear_148                               : max = 9.614287         threshold = 3.908151         scale = 32.496181
 linear_150                               : max = 11.724932        threshold = 4.485588         scale = 28.312899
 linear_149                               : max = 11.724932        threshold = 5.161146         scale = 24.606939
 linear_151                               : max = 7.164453         threshold = 5.847355         scale = 21.719223
 linear_152                               : max = 13.086471        threshold = 5.984121         scale = 21.222834
 linear_153                               : max = 11.099524        threshold = 3.991601         scale = 31.816805
 linear_154                               : max = 10.054585        threshold = 4.489706         scale = 28.286930
 linear_155                               : max = 12.389185        threshold = 3.100321         scale = 40.963501
 linear_157                               : max = 9.982999         threshold = 5.154796         scale = 24.637253
 linear_156                               : max = 9.982999         threshold = 8.537706         scale = 14.875190
 linear_158                               : max = 8.420287         threshold = 6.502287         scale = 19.531588
 linear_159                               : max = 25.014746        threshold = 9.423280         scale = 13.477261
 linear_160                               : max = 45.633553        threshold = 5.715335         scale = 22.220921
 linear_161                               : max = 20.371849        threshold = 5.117830         scale = 24.815203
 linear_162                               : max = 12.492933        threshold = 3.126283         scale = 40.623318
 linear_164                               : max = 20.697504        threshold = 4.825712         scale = 26.317358
 linear_163                               : max = 20.697504        threshold = 5.078367         scale = 25.008038
 linear_165                               : max = 9.023975         threshold = 6.836278         scale = 18.577358
 linear_166                               : max = 34.860619        threshold = 7.259792         scale = 17.493614
 linear_167                               : max = 30.380934        threshold = 5.496160         scale = 23.107042
 linear_168                               : max = 20.691216        threshold = 4.733317         scale = 26.831076
 linear_169                               : max = 9.723948         threshold = 3.952728         scale = 32.129707
 linear_171                               : max = 21.034811        threshold = 5.366547         scale = 23.665123
 linear_170                               : max = 21.034811        threshold = 5.356277         scale = 23.710501
 linear_172                               : max = 10.556884        threshold = 5.729481         scale = 22.166058
 linear_173                               : max = 20.033039        threshold = 10.207264        scale = 12.442120
 linear_174                               : max = 11.597379        threshold = 2.658676         scale = 47.768131
 ----------joiner----------
 linear_2                                 : max = 19.293503        threshold = 14.305265        scale = 8.877850
 linear_1                                 : max = 10.812222        threshold = 8.766452         scale = 14.487047
 linear_3                                 : max = 0.999999         threshold = 0.999755         scale = 127.031174
 ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\(^0^)/...233...
--- a/docs/source/model-export/code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt
+++ b/docs/source/model-export/code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt
@ -0,0 +1,7 @@
 2023-01-11 14:02:12,216 INFO [streaming-ncnn-decode.py:320] {'tokens': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt', 'encoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param', 'encoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin', 'decoder_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param', 'decoder_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin', 'joiner_param_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param', 'joiner_bin_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin', 'sound_filename': './icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav'}
 T 51 32
 2023-01-11 14:02:13,141 INFO [streaming-ncnn-decode.py:328] Constructing Fbank computer
 2023-01-11 14:02:13,151 INFO [streaming-ncnn-decode.py:331] Reading sound files: ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
 2023-01-11 14:02:13,176 INFO [streaming-ncnn-decode.py:336] torch.Size([106000])
 2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:380] ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
 2023-01-11 14:02:17,581 INFO [streaming-ncnn-decode.py:381] AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
--- a/docs/source/model-export/export-ncnn.rst
+++ b/docs/source/model-export/export-ncnn.rst
@ -1,12 +1,771 @@
 Export to ncnn
 ==============
-We support exporting LSTM transducer models to `ncnn <https://github.com/tencent/ncnn>`_.
+We support exporting both
-
+`LSTM transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless2>`_
-Please refer to :ref:`export-model-for-ncnn` for details.
+and
 `ConvEmformer transducer models <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_
 to `ncnn <https://github.com/tencent/ncnn>`_.
 We also provide `<https://github.com/k2-fsa/sherpa-ncnn>`_
 performing speech recognition using ``ncnn`` with exported models.
-It has been tested on Linux, macOS, Windows, and Raspberry Pi. The project is
+It has been tested on Linux, macOS, Windows, ``Android``, and ``Raspberry Pi``.
-self-contained and can be statically linked to produce a binary containing
+
-everything needed.
+`sherpa-ncnn`_ is self-contained and can be statically linked to produce
 a binary containing everything needed. Please refer
 to its documentation for details:
 - `<https://k2-fsa.github.io/sherpa/ncnn/index.html>`_
 Export LSTM transducer models
 -----------------------------
 Please refer to :ref:`export-lstm-transducer-model-for-ncnn` for details.
 Export ConvEmformer transducer models
 -------------------------------------
 We use the pre-trained model from the following repository as an example:
  - `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_
 We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
 .. hint::
  We use ``Ubuntu 18.04``, ``torch 1.10``, and ``Python 3.8`` for testing.
 .. caution::
  Please use a more recent version of PyTorch. For instance, ``torch 1.8``
  may ``not`` work.
 1. Download the pre-trained model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. hint::
  You can also refer to `<https://k2-fsa.github.io/sherpa/cpp/pretrained_models/online_transducer.html#icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_ to download the pre-trained model.
  You have to install `git-lfs`_ before you continue.
 .. code-block:: bash
  cd egs/librispeech/ASR
  GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
  git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
  git lfs pull --include "data/lang_bpe_500/bpe.model"
  cd ..
 .. note::
  We download ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
 In the above code, we download the pre-trained model into the directory
 ``egs/librispeech/ASR/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05``.
 2. Install ncnn and pnnx
 ^^^^^^^^^^^^^^^^^^^^^^^^
 .. code-block:: bash
  # We put ncnn into $HOME/open-source/ncnn
  # You can change it to anywhere you like
  cd $HOME
  mkdir -p open-source
  cd open-source
  git clone https://github.com/csukuangfj/ncnn
  cd ncnn
  git submodule update --recursive --init
  # Note: We don't use "python setup.py install" or "pip install ." here
  mkdir -p build-wheel
  cd build-wheel
  cmake \
    -DCMAKE_BUILD_TYPE=Release \
    -DNCNN_PYTHON=ON \
    -DNCNN_BUILD_BENCHMARK=OFF \
    -DNCNN_BUILD_EXAMPLES=OFF \
    -DNCNN_BUILD_TOOLS=ON \
  ..
  make -j4
  cd ..
  # Note: $PWD here is $HOME/open-source/ncnn
  export PYTHONPATH=$PWD/python:$PYTHONPATH
  export PATH=$PWD/tools/pnnx/build/src:$PATH
  export PATH=$PWD/build-wheel/tools/quantize:$PATH
  # Now build pnnx
  cd tools/pnnx
  mkdir build
  cd build
  cmake ..
  make -j4
  ./src/pnnx
 Congratulations! You have successfully installed the following components:
  - ``pnnx``, which is an executable located in
    ``$HOME/open-source/ncnn/tools/pnnx/build/src``. We will use
    it to convert models exported by ``torch.jit.trace()``.
  - ``ncnn2int8``, which is an executable located in
    ``$HOME/open-source/ncnn/build-wheel/tools/quantize``. We will use
    it to quantize our models to ``int8``.
  - ``ncnn.cpython-38-x86_64-linux-gnu.so``, which is a Python module located
    in ``$HOME/open-source/ncnn/python/ncnn``.
    .. note::
      I am using ``Python 3.8``, so it
      is ``ncnn.cpython-38-x86_64-linux-gnu.so``. If you use a different
      version, say, ``Python 3.9``, the name would be
      ``ncnn.cpython-39-x86_64-linux-gnu.so``.
      Also, if you are not using Linux, the file name would also be different.
      But that does not matter. As long as you can compile it, it should work.
 We have set up ``PYTHONPATH`` so that you can use ``import ncnn`` in your
 Python code. We have also set up ``PATH`` so that you can use
 ``pnnx`` and ``ncnn2int8`` later in your terminal.
 .. caution::
  Please don't use `<https://github.com/tencent/ncnn>`_.
  We have made some modifications to the official `ncnn`_.
  We will synchronize `<https://github.com/csukuangfj/ncnn>`_ periodically
  with the official one.
 3. Export the model via torch.jit.trace()
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 First, let us rename our pre-trained model:
 .. code-block::
  cd egs/librispeech/ASR
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp
  ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-30.pt
  cd ../..
 Next, we use the following code to export our model:
 .. code-block:: bash
  dir=./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/
  ./conv_emformer_transducer_stateless2/export-for-ncnn.py \
    --exp-dir $dir/exp \
    --bpe-model $dir/data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 1 \
    --use-averaged-model 0 \
    \
    --num-encoder-layers 12 \
    --chunk-length 32 \
    --cnn-module-kernel 31 \
    --left-context-length 32 \
    --right-context-length 8 \
    --memory-size 32 \
    --encoder-dim 512
 .. hint::
  We have renamed our model to ``epoch-30.pt`` so that we can use ``--epoch 30``.
  There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
  If you have trained a model by yourself and if you have all checkpoints
  available, please first use ``decode.py`` to tune ``--epoch --avg``
  and select the best combination with ``--use-averaged-model 1``.
 .. note::
  You will see the following log output:
  .. literalinclude:: ./code/export-conv-emformer-transducer-for-ncnn-output.txt
  The log shows the model has ``75490012`` parameters, i.e., ``~75 M``.
  .. code-block::
    ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
    -rw-r--r-- 1 kuangfangjun root 289M Jan 11 12:05 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
  You can see that the file size of the pre-trained model is ``289 MB``, which
  is roughly ``75490012*4/1024/1024 = 287.97 MB``.
 After running ``conv_emformer_transducer_stateless2/export-for-ncnn.py``,
 we will get the following files:
 .. code-block:: bash
  ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*pnnx*
  -rw-r--r-- 1 kuangfangjun root 1010K Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.pt
  -rw-r--r-- 1 kuangfangjun root  283M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.pt
  -rw-r--r-- 1 kuangfangjun root  3.0M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.pt
 .. _conv-emformer-step-3-export-torchscript-model-via-pnnx:
 3. Export torchscript model via pnnx
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. hint::
  Make sure you have set up the ``PATH`` environment variable. Otherwise,
  it will throw an error saying that ``pnnx`` could not be found.
 Now, it's time to export our models to `ncnn`_ via ``pnnx``.
 .. code-block::
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
  pnnx ./encoder_jit_trace-pnnx.pt
  pnnx ./decoder_jit_trace-pnnx.pt
  pnnx ./joiner_jit_trace-pnnx.pt
 It will generate the following files:
 .. code-block:: bash
  ls -lh  icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*ncnn*{bin,param}
  -rw-r--r-- 1 kuangfangjun root 503K Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  437 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
  -rw-r--r-- 1 kuangfangjun root 142M Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  79K Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
  -rw-r--r-- 1 kuangfangjun root 1.5M Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  488 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
 There are two types of files:
 - ``param``: It is a text file containing the model architectures. You can
  use a text editor to view its content.
 - ``bin``: It is a binary file containing the model parameters.
 We compare the file sizes of the models below before and after converting via ``pnnx``:
 .. see https://tableconvert.com/restructuredtext-generator
 +----------------------------------+------------+
 | File name                        | File size  |
 +==================================+============+
 | encoder_jit_trace-pnnx.pt        | 283 MB     |
 +----------------------------------+------------+
 | decoder_jit_trace-pnnx.pt        | 1010 KB    |
 +----------------------------------+------------+
 | joiner_jit_trace-pnnx.pt         | 3.0 MB     |
 +----------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.bin  | 142 MB     |
 +----------------------------------+------------+
 | decoder_jit_trace-pnnx.ncnn.bin  | 503 KB     |
 +----------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.bin   | 1.5 MB     |
 +----------------------------------+------------+
 You can see that the file sizes of the models after conversion are about one half
 of the models before conversion:
  - encoder: 283 MB vs 142 MB
  - decoder: 1010 KB vs 503 KB
  - joiner: 3.0 MB vs 1.5 MB
 The reason is that by default ``pnnx`` converts ``float32`` parameters
 to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
 for ``float16``. Thus, the file size is ``halved`` after conversion.
 .. hint::
  If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
  won't convert ``float32`` to ``float16``.
 4. Test the exported models in icefall
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 .. note::
  We assume you have set up the environment variable ``PYTHONPATH`` when
  building `ncnn`_.
 Now we have successfully converted our pre-trained model to `ncnn`_ format.
 The generated 6 files are what we need. You can use the following code to
 test the converted models:
 .. code-block:: bash
  ./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
    --tokens ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt \
    --encoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param \
    --encoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin \
    --decoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param \
    --decoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin \
    --joiner-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param \
    --joiner-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin \
    ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
 .. hint::
  `ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
  only 1 wave file as input.
 The output is given below:
 .. literalinclude:: ./code/test-stremaing-ncnn-decode-conv-emformer-transducer-libri.txt
 Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
 .. _conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn:
 5. Modify the exported encoder for sherpa-ncnn
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 In order to use the exported models in `sherpa-ncnn`_, we have to modify
 ``encoder_jit_trace-pnnx.ncnn.param``.
 Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
 .. code-block::
  7767517
  1060 1342
  Input                    in0                      0 1 in0
 **Explanation** of the above three lines:
  1. ``7767517``, it is a magic number and should not be changed.
  2. ``1060 1342``, the first number ``1060`` specifies the number of layers
     in this file, while ``1342`` specifies the number of intermediate outputs
     of this file.
  3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
     is the layer name of this layer; ``0`` means this layer has no input;
     ``1`` means this layer has one output; ``in0`` is the output name of
     this layer.
 We need to add 1 extra line and also increment the number of layers.
 The result looks like below:
 .. code-block:: bash
  7767517
  1061 1342
  SherpaMetaData           sherpa_meta_data1        0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
  Input                    in0                      0 1 in0
 **Explanation**
  1. ``7767517``, it is still the same
  2. ``1061 1342``, we have added an extra layer, so we need to update ``1060`` to ``1061``.
     We don't need to change ``1342`` since the newly added layer has no inputs or outputs.
  3. ``SherpaMetaData  sherpa_meta_data1  0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512``
     This line is newly added. Its explanation is given below:
      - ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
      - ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
      - ``0 0`` means this layer has no inputs or outputs. Must be ``0 0``.
      - ``0=1``, 0 is the key and 1 is the value. MUST be ``0=1``
      - ``1=12``, 1 is the key and 12 is the value of the
        parameter ``--num-encoder-layers`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``2=32``, 2 is the key and 32 is the value of the
        parameter ``--memory-size`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``3=31``, 3 is the key and 31 is the value of the
        parameter ``--cnn-module-kernel`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``4=8``, 4 is the key and 8 is the value of the
        parameter ``--left-context-length`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``5=32``, 5 is the key and 32 is the value of the
        parameter ``--chunk-length`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``6=8``, 6 is the key and 8 is the value of the
        parameter ``--right-context-length`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      - ``7=512``, 7 is the key and 512 is the value of the
        parameter ``--encoder-dim`` that you provided when running
        ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
      For ease of reference, we list the key-value pairs that you need to add
      in the following table. If your model has a different setting, please
      change the values for ``SherpaMetaData`` accordingly. Otherwise, you
      will be ``SAD``.
          +------+-----------------------------+
          | key  | value                       |
          +======+=============================+
          | 0    | 1 (fixed)                   |
          +------+-----------------------------+
          | 1    | ``--num-encoder-layers``    |
          +------+-----------------------------+
          | 2    | ``--memory-size``           |
          +------+-----------------------------+
          | 3    | ``--cnn-module-kernel``     |
          +------+-----------------------------+
          | 4    | ``--left-context-length``   |
          +------+-----------------------------+
          | 5    | ``--chunk-length``          |
          +------+-----------------------------+
          | 6    | ``--right-context-length``  |
          +------+-----------------------------+
          | 7    | ``--encoder-dim``           |
          +------+-----------------------------+
  4. ``Input in0 0 1 in0``. No need to change it.
 .. caution::
  When you add a new layer ``SherpaMetaData``, please remember to update the
  number of layers. In our case, update  ``1060`` to ``1061``. Otherwise,
  you will be SAD later.
 .. hint::
  After adding the new layer ``SherpaMetaData``, you cannot use this model
  with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
  supported only in `sherpa-ncnn`_.
 .. hint::
  `ncnn`_ is very flexible. You can add new layers to it just by text-editing
  the ``param`` file! You don't need to change the ``bin`` file.
 Now you can use this model in `sherpa-ncnn`_.
 Please refer to the following documentation:
  - Linux/macOS/Windows/arm/aarch64: `<https://k2-fsa.github.io/sherpa/ncnn/install/index.html>`_
  - Android: `<https://k2-fsa.github.io/sherpa/ncnn/android/index.html>`_
  - Python: `<https://k2-fsa.github.io/sherpa/ncnn/python/index.html>`_
 We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
  - `<https://k2-fsa.github.io/sherpa/ncnn/pretrained_models/index.html>`_
    You can find more usages there.
 6. (Optional) int8 quantization with sherpa-ncnn
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 This step is optional.
 In this step, we describe how to quantize our model with ``int8``.
 Change :ref:`conv-emformer-step-3-export-torchscript-model-via-pnnx` to
 disable ``fp16`` when using ``pnnx``:
 .. code-block::
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
  pnnx ./encoder_jit_trace-pnnx.pt fp16=0
  pnnx ./decoder_jit_trace-pnnx.pt
  pnnx ./joiner_jit_trace-pnnx.pt fp16=0
 .. note::
  We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not
  support quantizing the decoder model yet. We will update this documentation
  once `ncnn`_ supports it. (Maybe this year, 2023).
 It will generate the following files:
 .. code-block:: bash
  ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*_jit_trace-pnnx.ncnn.{param,bin}
  -rw-r--r-- 1 kuangfangjun root 503K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  437 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
  -rw-r--r-- 1 kuangfangjun root 283M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  79K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
  -rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
  -rw-r--r-- 1 kuangfangjun root  488 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
 Let us compare again the file sizes:
 +----------------------------------------+------------+
 | File name                              | File size  |
 +========================================+============+
 | encoder_jit_trace-pnnx.pt              | 283 MB     |
 +----------------------------------------+------------+
 | decoder_jit_trace-pnnx.pt              | 1010 KB    |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.pt               | 3.0 MB     |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB     |
 +----------------------------------------+------------+
 | decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB     |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.bin  (fp16) | 1.5 MB     |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB     |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.bin  (fp32) | 3.0 MB     |
 +----------------------------------------+------------+
 You can see that the file sizes are doubled when we disable ``fp16``.
 .. note::
  You can again use ``streaming-ncnn-decode.py`` to test the exported models.
 Next, follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
 to modify ``encoder_jit_trace-pnnx.ncnn.param``.
 Change
 .. code-block:: bash
  7767517
  1060 1342
  Input                    in0                      0 1 in0
 to
 .. code-block:: bash
  7767517
  1061 1342
  SherpaMetaData           sherpa_meta_data1        0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
  Input                    in0                      0 1 in0
 .. caution::
  Please follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
  to change the values for ``SherpaMetaData`` if your model uses a different setting.
 Next, let us compile `sherpa-ncnn`_ since we will quantize our models within
 `sherpa-ncnn`_.
 .. code-block:: bash
  # We will download sherpa-ncnn to $HOME/open-source/
  # You can change it to anywhere you like.
  cd $HOME
  mkdir -p open-source
  cd open-source
  git clone https://github.com/k2-fsa/sherpa-ncnn
  cd sherpa-ncnn
  mkdir build
  cd build
  cmake ..
  make -j 4
  ./bin/generate-int8-scale-table
  export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH
 The output of the above commands is:
 .. code-block:: bash
  (py38) kuangfangjun:build$ generate-int8-scale-table
  Please provide 10 arg. Currently given: 1
  Usage:
  generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt
  Each line in wave_filenames.txt is a path to some 16k Hz mono wave file.
 We need to create a file ``wave_filenames.txt``, in which we need to put
 some calibration wave files. For testing purposes, we put the ``test_wavs``
 from the pre-trained model repository `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_.
 .. code-block:: bash
  cd egs/librispeech/ASR
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
  cat <<EOF > wave_filenames.txt
  ../test_wavs/1089-134686-0001.wav
  ../test_wavs/1221-135766-0001.wav
  ../test_wavs/1221-135766-0002.wav
  EOF
 Now we can calculate the scales needed for quantization with the calibration data:
 .. code-block:: bash
  cd egs/librispeech/ASR
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
  generate-int8-scale-table \
    ./encoder_jit_trace-pnnx.ncnn.param \
    ./encoder_jit_trace-pnnx.ncnn.bin \
    ./decoder_jit_trace-pnnx.ncnn.param \
    ./decoder_jit_trace-pnnx.ncnn.bin \
    ./joiner_jit_trace-pnnx.ncnn.param \
    ./joiner_jit_trace-pnnx.ncnn.bin \
    ./encoder-scale-table.txt \
    ./joiner-scale-table.txt \
    ./wave_filenames.txt
 The output logs are as follows:
 .. literalinclude:: ./code/generate-int-8-scale-table-for-conv-emformer.txt
 It generates the following two files:
 .. code-block:: bash
  $ ls -lh encoder-scale-table.txt joiner-scale-table.txt
  -rw-r--r-- 1 kuangfangjun root 955K Jan 11 17:28 encoder-scale-table.txt
  -rw-r--r-- 1 kuangfangjun root  18K Jan 11 17:28 joiner-scale-table.txt
 .. caution::
  In practice, you need more calibration data to compute a reliable scale table.
 Finally, let us use the scale table to quantize our models into ``int8``.
 .. code-block:: bash
  ncnn2int8
  usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table]
 First, we quantize the encoder model:
 .. code-block:: bash
  cd egs/librispeech/ASR
  cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
  ncnn2int8 \
    ./encoder_jit_trace-pnnx.ncnn.param \
    ./encoder_jit_trace-pnnx.ncnn.bin \
    ./encoder_jit_trace-pnnx.ncnn.int8.param \
    ./encoder_jit_trace-pnnx.ncnn.int8.bin \
    ./encoder-scale-table.txt
 Next, we quantize the joiner model:
 .. code-block:: bash
  ncnn2int8 \
    ./joiner_jit_trace-pnnx.ncnn.param \
    ./joiner_jit_trace-pnnx.ncnn.bin \
    ./joiner_jit_trace-pnnx.ncnn.int8.param \
    ./joiner_jit_trace-pnnx.ncnn.int8.bin \
    ./joiner-scale-table.txt
 The above two commands generate the following 4 files:
 .. code-block:: bash
  -rw-r--r-- 1 kuangfangjun root  99M Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.bin
  -rw-r--r-- 1 kuangfangjun root  78K Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.param
  -rw-r--r-- 1 kuangfangjun root 774K Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.bin
  -rw-r--r-- 1 kuangfangjun root  496 Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.param
 Congratulations! You have successfully quantized your model from ``float32`` to ``int8``.
 .. caution::
  ``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs.
  You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param``
  and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like.
  For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can
  replace the following invocation:
    .. code-block::
      cd egs/librispeech/ASR
      cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
      sherpa-ncnn \
        ../data/lang_bpe_500/tokens.txt \
        ./encoder_jit_trace-pnnx.ncnn.param \
        ./encoder_jit_trace-pnnx.ncnn.bin \
        ./decoder_jit_trace-pnnx.ncnn.param \
        ./decoder_jit_trace-pnnx.ncnn.bin \
        ./joiner_jit_trace-pnnx.ncnn.param \
        ./joiner_jit_trace-pnnx.ncnn.bin \
        ../test_wavs/1089-134686-0001.wav
  with
    .. code-block::
      cd egs/librispeech/ASR
      cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
      sherpa-ncnn \
        ../data/lang_bpe_500/tokens.txt \
        ./encoder_jit_trace-pnnx.ncnn.int8.param \
        ./encoder_jit_trace-pnnx.ncnn.int8.bin \
        ./decoder_jit_trace-pnnx.ncnn.param \
        ./decoder_jit_trace-pnnx.ncnn.bin \
        ./joiner_jit_trace-pnnx.ncnn.param \
        ./joiner_jit_trace-pnnx.ncnn.bin \
        ../test_wavs/1089-134686-0001.wav
 The following table compares again the file sizes:
 +----------------------------------------+------------+
 | File name                              | File size  |
 +========================================+============+
 | encoder_jit_trace-pnnx.pt              | 283 MB     |
 +----------------------------------------+------------+
 | decoder_jit_trace-pnnx.pt              | 1010 KB    |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.pt               | 3.0 MB     |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB     |
 +----------------------------------------+------------+
 | decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB     |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.bin  (fp16) | 1.5 MB     |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB     |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.bin  (fp32) | 3.0 MB     |
 +----------------------------------------+------------+
 | encoder_jit_trace-pnnx.ncnn.int8.bin   | 99 MB      |
 +----------------------------------------+------------+
 | joiner_jit_trace-pnnx.ncnn.int8.bin    | 774 KB     |
 +----------------------------------------+------------+
 You can see that the file sizes of the model after ``int8`` quantization
 are much smaller.
 .. hint::
    Currently, only linear layers and convolutional layers are quantized
    with ``int8``, so you don't see an exact ``4x`` reduction in file sizes.
 .. note::
  You need to test the recognition accuracy after ``int8`` quantization.
 You can find the speed comparison at `<https://github.com/k2-fsa/sherpa-ncnn/issues/44>`_.
 That's it! Have fun with `sherpa-ncnn`_!
--- a/docs/source/model-export/export-with-torch-jit-script.rst
+++ b/docs/source/model-export/export-with-torch-jit-script.rst
@ -1,7 +1,7 @@
 .. _export-model-with-torch-jit-script:
 Export model with torch.jit.script()
-===================================
+====================================
 In this section, we describe how to export a model via
 ``torch.jit.script()``.
--- a/docs/source/recipes/Non-streaming-ASR/aishell/conformer_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/conformer_ctc.rst
@ -703,7 +703,7 @@ It will show you the following message:
 HLG decoding
-^^^^^^^^^^^^
+~~~~~~~~~~~~
 .. code-block:: bash
--- a/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-conformer-ctc-tensorboard-log.jpg
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-conformer-ctc-tensorboard-log.jpg
--- a/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-tdnn-lstm-ctc-tensorboard-log.jpg
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-tdnn-lstm-ctc-tensorboard-log.jpg
--- a/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png
--- a/docs/source/recipes/Non-streaming-ASR/aishell/index.rst
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/index.rst
--- a/docs/source/recipes/Non-streaming-ASR/aishell/stateless_transducer.rst
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/stateless_transducer.rst
--- a/docs/source/recipes/Non-streaming-ASR/aishell/tdnn_lstm_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/tdnn_lstm_ctc.rst
--- a/docs/source/recipes/Non-streaming-ASR/index.rst
+++ b/docs/source/recipes/Non-streaming-ASR/index.rst
@ -0,0 +1,10 @@
 Non Streaming ASR
 =================
 .. toctree::
   :maxdepth: 2
   aishell/index
   librispeech/index
   timit/index
   yesno/index
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/conformer_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/conformer_ctc.rst
@ -888,7 +888,7 @@ It will show you the following message:
 CTC decoding
-^^^^^^^^^^^^
+~~~~~~~~~~~~
 .. code-block:: bash
@ -926,7 +926,7 @@ Its output is:
  YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
 HLG decoding
-^^^^^^^^^^^^
+~~~~~~~~~~~~
 .. code-block:: bash
@ -966,7 +966,7 @@ The output is:
 HLG decoding + n-gram LM rescoring
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
@ -1012,7 +1012,7 @@ The output is:
 HLG decoding + n-gram LM rescoring + attention decoder rescoring
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
@ -0,0 +1,223 @@
 Distillation with HuBERT
 ========================
 This tutorial shows you how to perform knowledge distillation in `icefall`_
 with the `LibriSpeech`_ dataset. The distillation method
 used here is called "Multi Vector Quantization Knowledge Distillation" (MVQ-KD).
 Please have a look at our paper `Predicting Multi-Codebook Vector Quantization Indexes for Knowledge Distillation <https://arxiv.org/abs/2211.00508>`_
 for more details about MVQ-KD.
 .. note::
    This tutorial is based on recipe
    `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_.
    Currently, we only implement MVQ-KD in this recipe. However, MVQ-KD is theoretically applicable to all recipes
    with only minor changes needed. Feel free to try out MVQ-KD in different recipes. If you
    encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`_.
 .. note::
  We assume you have read the page :ref:`install icefall` and have set up
  the environment for `icefall`_.
 .. HINT::
  We recommend you to use a GPU or several GPUs to run this recipe.
 Data preparation
 ----------------
 We first prepare necessary training data for `LibriSpeech`_.
 This is the same as in :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
 .. hint::
   The data preparation is the same as other recipes on LibriSpeech dataset,
   if you have finished this step, you can skip to :ref:`codebook_index_preparation` directly.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 The data preparation contains several stages, you can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0 # run only stage 0
  $ ./prepare.sh --stage 2 --stop-stage 5 # run from stage 2 to stage 5
 .. HINT::
  If you have pre-downloaded the `LibriSpeech`_
  dataset and the `musan`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. NOTE::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe
   to the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 .. _codebook_index_preparation:
 Codebook index preparation
 --------------------------
 Here, we prepare necessary data for MVQ-KD. This requires the generation
 of codebook indexes (please read our `paper <https://arxiv.org/abs/2211.00508>`_
 if you are interested in details). In this tutorial, we use the pre-computed
 codebook indexes for convenience. The only thing you need to do is to
 run `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_.
 .. note::
  There are 5 stages in total. The first and second stages will be automatically skipped
  when choosing to download the codebook indexes prepared by `icefall`_.
  Of course, you can extract and compute the codebook indexes by yourself. This
  will require you to download a HuBERT-XL model, and it can take a while for
  the extraction of codebook indexes.
 As usual, you can control the stages you want to run by specifying the following
 two options:
  - ``--stage``
  - ``--stop-stage``
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./distillation_with_hubert.sh --stage 0 --stop-stage 0 # run only stage 0
  $ ./distillation_with_hubert.sh --stage 2 --stop-stage 4 # run from stage 2 to stage 4
 Here are a few options in `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_
 you need to know before you proceed.
 - ``--full_libri`` If True, use full 960h data. Otherwise only ``train-clean-100`` will be used
 - ``--use_extracted_codebook`` If True, the first two stages will be skipped and the codebook
  indexes uploaded by us will be downloaded.
 Since we are using the pre-computed codebook indexes, we set
 ``use_extracted_codebook=True``. If you want to do full `LibriSpeech`_
 experiments, please set ``full_libri=True``.
 The following command downloads the pre-computed codebook indexes
 and prepares MVQ-augmented training manifests.
 .. code-block:: bash
  $ ./distillation_with_hubert.sh --stage 2 --stop-stage 2 # run only stage 2
 Please see the
 following screenshot for the output of an example execution.
 .. figure:: ./images/distillation_codebook.png
  :width: 800
  :alt: Downloading codebook indexes and preparing training manifest.
  :align: center
  Downloading codebook indexes and preparing training manifest.
 .. hint::
  The codebook indexes we prepared for you in this tutorial
  are extracted from the 36-th layer of a fine-tuned HuBERT-XL model
  with 8 codebooks. If you want to try other configurations, please
  set ``use_extracted_codebook=False`` and set ``embedding_layer`` and
  ``num_codebooks`` by yourself.
 Now, you should see the following files under the directory ``./data/vq_fbank_layer36_cb8``.
 .. figure:: ./images/distillation_directory.png
  :width: 800
  :alt: MVQ-augmented training manifests
  :align: center
  MVQ-augmented training manifests.
 Voilà! You are ready to perform knowledge distillation training now!
 Training
 --------
 To perform training, please run stage 3 by executing the following command.
 .. code-block:: bash
  $ ./distillation_with_hubert.sh --stage 3 --stop-stage 3 # run MVQ training
 Here is the code snippet for training:
 .. code-block:: bash
  WORLD_SIZE=$(echo ${CUDA_VISIBLE_DEVICES} | awk '{n=split($1, _, ","); print n}')
  ./pruned_transducer_stateless6/train.py \
    --manifest-dir ./data/vq_fbank_layer36_cb8 \
    --master-port 12359 \
    --full-libri $full_libri \
    --spec-aug-time-warp-factor -1 \
    --max-duration 300 \
    --world-size ${WORLD_SIZE} \
    --num-epochs 30 \
    --exp-dir $exp_dir \
    --enable-distillation True \
    --codebook-loss-scale 0.01
 There are a few training arguments in the above
 training command that you should pay attention to.
  - ``--enable-distillation`` If True, knowledge distillation training is enabled.
  - ``--codebook-loss-scale`` The scale of the knowledge distillation loss.
  - ``--manifest-dir`` The path to the MVQ-augmented manifest.
 Decoding
 --------
 After training has finished, you can test the performance using
 the following command.
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES=0
  ./pruned_transducer_stateless6/decode.py \
    --decoding-method "modified_beam_search" \
    --epoch 30 \
    --avg 10 \
    --max-duration 200 \
    --exp-dir $exp_dir \
    --enable-distillation True
 You should get similar results as `here <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS-100hours.md#distillation-with-hubert>`_.
 That's all! Feel free to experiment with your own setups and report your results.
 If you encounter any problems during training, please open up an issue `here <https://github.com/k2-fsa/icefall/issues>`_.
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-conformer-ctc-tensorboard-log.png
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-conformer-ctc-tensorboard-log.png
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-pruned-transducer-tensorboard-log.jpg
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-pruned-transducer-tensorboard-log.jpg
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst
@ -0,0 +1,12 @@
 LibriSpeech
 ===========
 .. toctree::
   :maxdepth: 1
   tdnn_lstm_ctc
   conformer_ctc
   pruned_transducer_stateless
   zipformer_mmi
   zipformer_ctc_blankskip
   distillation
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
@ -0,0 +1,548 @@
 .. _non_streaming_librispeech_pruned_transducer_stateless:
 Pruned transducer statelessX
 ============================
 This tutorial shows you how to run a conformer transducer model
 with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 .. Note::
   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_.
   We will take pruned_transducer_stateless4 as an example in this tutorial.
 .. HINT::
  We assume you have read the page :ref:`install icefall` and have set up
  the environment for ``icefall``.
 .. HINT::
  We recommend you to use a GPU or several GPUs to run this recipe.
 .. hint::
   Please scroll down to the bottom of this page to find download links
   for pretrained models if you don't want to train a model from scratch.
 We use pruned RNN-T to compute the loss.
 .. note::
   You can find the paper about pruned RNN-T at the following address:
   `<https://arxiv.org/abs/2206.13236>`_
 The transducer model consists of 3 parts:
  - Encoder, a.k.a, the transcription network. We use a Conformer model (the reworked version by Daniel Povey)
  - Decoder, a.k.a, the prediction network. We use a stateless model consisting of
    ``nn.Embedding`` and ``nn.Conv1d``
  - Joiner, a.k.a, the joint network.
 .. caution::
   Contrary to the conventional RNN-T models, we use a stateless decoder.
   That is, it has no recurrent connections.
 Data preparation
 ----------------
 .. hint::
   The data preparation is the same as other recipes on LibriSpeech dataset,
   if you have finished this step, you can skip to ``Training`` directly.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 The data preparation contains several stages, you can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0
 means to run only stage 0.
 To run stage 2 to stage 5, use:
 .. code-block:: bash
  $ ./prepare.sh --stage 2 --stop-stage 5
 .. HINT::
  If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
  dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. NOTE::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe
   to the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
 Configurable options
 ~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless4/train.py --help
 shows you the training options that can be passed from the commandline.
 The following options are used quite often:
  - ``--exp-dir``
    The directory to save checkpoints, training logs and tensorboard.
  - ``--full-libri``
    If it's True, the training part uses all the training data, i.e.,
    960 hours. Otherwise, the training part uses only the subset
    ``train-clean-100``, which has 100 hours of training data.
    .. CAUTION::
      The training set is perturbed by speed with two factors: 0.9 and 1.1.
      If ``--full-libri`` is True, each epoch actually processes
      ``3x960 == 2880`` hours of data.
  - ``--num-epochs``
    It is the number of epochs to train. For instance,
    ``./pruned_transducer_stateless4/train.py --num-epochs 30`` trains for 30 epochs
    and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
    in the folder ``./pruned_transducer_stateless4/exp``.
  - ``--start-epoch``
    It's used to resume training.
    ``./pruned_transducer_stateless4/train.py --start-epoch 10`` loads the
    checkpoint ``./pruned_transducer_stateless4/exp/epoch-9.pt`` and starts
    training from epoch 10, based on the state from epoch 9.
  - ``--world-size``
    It is used for multi-GPU single-machine DDP training.
      - (a) If it is 1, then no DDP training is used.
      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
    The following shows some use cases with it.
      **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
      GPU 2 for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="0,2"
          $ ./pruned_transducer_stateless4/train.py --world-size 2
      **Use case 2**: You have 4 GPUs and you want to use all of them
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ ./pruned_transducer_stateless4/train.py --world-size 4
      **Use case 3**: You have 4 GPUs but you only want to use GPU 3
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="3"
          $ ./pruned_transducer_stateless4/train.py --world-size 1
    .. caution::
      Only multi-GPU single-machine DDP training is implemented at present.
      Multi-GPU multi-machine DDP training will be added later.
  - ``--max-duration``
    It specifies the number of seconds over all utterances in a
    batch, before **padding**.
    If you encounter CUDA OOM, please reduce it.
    .. HINT::
      Due to padding, the number of seconds of all utterances in a
      batch will usually be larger than ``--max-duration``.
      A larger value for ``--max-duration`` may cause OOM during training,
      while a smaller value may increase the training time. You have to
      tune it.
  - ``--use-fp16``
    If it is True, the model will be trained with half precision. In our experiments,
    using half precision lets you train with a two times larger ``--max-duration``,
    which gives an almost 2X speed-up.
 Pre-configured options
 ~~~~~~~~~~~~~~~~~~~~~~
 There are some training options, e.g., number of encoder layers,
 encoder dimension, decoder dimension, number of warmup steps etc,
 that are not passed from the commandline.
 They are pre-configured by the function ``get_params()`` in
 `pruned_transducer_stateless4/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/train.py>`_
 You don't need to change these pre-configured parameters. If you really need to change
 them, please modify ``./pruned_transducer_stateless4/train.py`` directly.
 .. NOTE::
  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`_ are a little different from
  other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from the commandline, so that you can train models of different sizes with pruned_transducer_stateless5.
 Training logs
 ~~~~~~~~~~~~~
 Training logs and checkpoints are saved in ``--exp-dir`` (e.g. ``pruned_transducer_stateless4/exp``).
 You will find the following files in that directory:
  - ``epoch-1.pt``, ``epoch-2.pt``, ...
    These are checkpoint files saved at the end of each epoch, containing model
    ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless4/train.py --start-epoch 11
  - ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
    These are checkpoint files saved every ``--save-every-n`` batches,
    containing model ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless4/train.py --start-batch 436000
  - ``tensorboard/``
    This folder contains TensorBoard logs. Training loss, validation loss, learning
    rate, etc, are recorded in these logs. You can visualize them by:
      .. code-block:: bash
        $ cd pruned_transducer_stateless4/exp/tensorboard
        $ tensorboard dev upload --logdir . --description "pruned transducer training for LibriSpeech with icefall"
    It will print something like below:
      .. code-block::
        TensorFlow installation not found - running with reduced feature set.
        Upload started and will continue reading any new data as it's added to the logdir.
        To stop uploading, press Ctrl-C.
        New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/QOGSPBgsR8KzcRMmie9JGw/
        [2022-11-20T15:50:50] Started scanning logdir.
        Uploading 4468 scalars...
        [2022-11-20T15:53:02] Total uploaded: 210171 scalars, 0 tensors, 0 binary objects
        Listening for new data in logdir...
    Note there is a URL in the above output. Click it and you will see
    the following screenshot:
      .. figure:: images/librispeech-pruned-transducer-tensorboard-log.jpg
         :width: 600
         :alt: TensorBoard screenshot
         :align: center
         :target: https://tensorboard.dev/experiment/QOGSPBgsR8KzcRMmie9JGw/
         TensorBoard screenshot.
  .. hint::
    If you don't have access to google, you can use the following command
    to view the tensorboard log locally:
      .. code-block:: bash
        cd pruned_transducer_stateless4/exp/tensorboard
        tensorboard --logdir . --port 6008
    It will print the following message:
      .. code-block::
        Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
        TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
    Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
    logs.
  - ``log/log-train-xxxx``
    It is the detailed training log in text format, same as the one
    you saw printed to the console during training.
 Usage example
 ~~~~~~~~~~~~~
 You can use the following command to start the training using 6 GPUs:
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5"
  ./pruned_transducer_stateless4/train.py \
     --world-size 6 \
     --num-epochs 30 \
     --start-epoch 1 \
     --exp-dir pruned_transducer_stateless4/exp \
     --full-libri 1 \
     --max-duration 300
 Decoding
 --------
 The decoding part uses checkpoints saved by the training part, so you have
 to run the training part first.
 .. hint::
   There are two kinds of checkpoints:
    - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
      of each epoch. You can pass ``--epoch`` to
      ``pruned_transducer_stateless4/decode.py`` to use them.
    - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
      every ``--save-every-n`` batches. You can pass ``--iter`` to
      ``pruned_transducer_stateless4/decode.py`` to use them.
    We suggest that you try both types of checkpoints and choose the one
    that produces the lowest WERs.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless4/decode.py --help
 shows the options for decoding.
 The following shows two examples (for two types of checkpoints):
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for epoch in 25 20; do
      for avg in 7 5 3 1; do
        ./pruned_transducer_stateless4/decode.py \
          --epoch $epoch \
          --avg $avg \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for iter in 474000; do
      for avg in 8 10 12 14 16 18; do
        ./pruned_transducer_stateless4/decode.py \
          --iter $iter \
          --avg $avg \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. Note::
  Supported decoding methods are as follows:
    - ``greedy_search`` : It takes the symbol with largest posterior probability
      of each frame as the decoding result.
    - ``beam_search`` :  It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
      `espnet/nets/beam_search_transducer.py <https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py#L247>`_
      is used as a reference. Basically, it keeps topk states for each frame, and expands the kept states with their own contexts to
      next frame.
    - ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
      runs in batch mode with ``--max-sym-per-frame=1`` being hardcoded.
    - ``fast_beam_search`` : It implements graph composition between the output ``log_probs`` and
      given ``FSAs``. It is hard to describe the details in several lines of texts, you can read
      our paper in https://arxiv.org/pdf/2211.00484.pdf or our `rnnt decode code in k2 <https://github.com/k2-fsa/k2/blob/master/k2/csrc/rnnt_decode.h>`_. ``fast_beam_search`` can decode with ``FSAs`` on GPU efficiently.
    - ``fast_beam_search_LG`` : The same as ``fast_beam_search`` above, except that ``fast_beam_search`` uses
      a trivial graph that has only one state, while ``fast_beam_search_LG`` uses an LG graph
      (with N-gram LM).
    - ``fast_beam_search_nbest`` : It produces the decoding results as follows:
      - (1) Use ``fast_beam_search`` to get a lattice
      - (2) Select ``num_paths`` paths from the lattice using ``k2.random_paths()``
      - (3) Unique the selected paths
      - (4) Intersect the selected paths with the lattice and compute the
            shortest path from the intersection result
      - (5) The path with the largest score is used as the decoding output.
    - ``fast_beam_search_nbest_LG`` : It implements the same logic as ``fast_beam_search_nbest``; the
      only difference is that it uses ``fast_beam_search_LG`` to generate the lattice.
 Export Model
 ------------
 `pruned_transducer_stateless4/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/export.py>`_ supports exporting checkpoints from ``pruned_transducer_stateless4/exp`` in the following ways.
 Export ``model.state_dict()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Checkpoints saved by ``pruned_transducer_stateless4/train.py`` also include
 ``optimizer.state_dict()``. It is useful for resuming training. But after training,
 we are interested only in ``model.state_dict()``. You can use the following
 command to extract ``model.state_dict()``.
 .. code-block:: bash
  # Assume that --epoch 25 --avg 3 produces the smallest WER
  # (You can get such information after running ./pruned_transducer_stateless4/decode.py)
  epoch=25
  avg=3
  ./pruned_transducer_stateless4/export.py \
    --exp-dir ./pruned_transducer_stateless4/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch $epoch \
    --avg  $avg
 It will generate a file ``./pruned_transducer_stateless4/exp/pretrained.pt``.
 .. hint::
   To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless4/decode.py``,
   you can run:
   .. code-block:: bash
      cd pruned_transducer_stateless4/exp
      ln -s pretrained.pt epoch-999.pt
   And then pass ``--epoch 999 --avg 1 --use-averaged-model 0`` to
   ``./pruned_transducer_stateless4/decode.py``.
 To use the exported model with ``./pruned_transducer_stateless4/pretrained.py``, you
 can run:
 .. code-block:: bash
  ./pruned_transducer_stateless4/pretrained.py \
    --checkpoint ./pruned_transducer_stateless4/exp/pretrained.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method greedy_search \
    /path/to/foo.wav \
    /path/to/bar.wav
 Export model using ``torch.jit.script()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  ./pruned_transducer_stateless4/export.py \
    --exp-dir ./pruned_transducer_stateless4/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 25 \
    --avg 3 \
    --jit 1
 It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
 load it by ``torch.jit.load("cpu_jit.pt")``.
 Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
 are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
 .. NOTE::
   You will need this ``cpu_jit.pt`` when deploying with Sherpa framework.
 Download pretrained models
 --------------------------
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
  - `pruned_transducer_stateless <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12>`_
  - `pruned_transducer_stateless2 <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29>`_
  - `pruned_transducer_stateless4 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless4-2022-06-03>`_
  - `pruned_transducer_stateless5 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless5-2022-07-07>`_
  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models.
 Deploy with Sherpa
 ------------------
 Please see `<https://k2-fsa.github.io/sherpa/python/offline_asr/conformer/librispeech.html#>`_
 for how to deploy the models in ``sherpa``.
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/tdnn_lstm_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/tdnn_lstm_ctc.rst
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/zipformer_ctc_blankskip.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/zipformer_ctc_blankskip.rst
@ -0,0 +1,453 @@
 Zipformer CTC Blank Skip
 ========================
 .. hint::
   Please scroll down to the bottom of this page to find download links
   for pretrained models if you don't want to train a model from scratch.
 This tutorial shows you how to train a Zipformer model based on the guidance from 
 a co-trained CTC model using `blank skip method <https://arxiv.org/pdf/2210.16481.pdf>`_
 with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 .. note::
    We use both CTC and RNN-T loss to train. During the forward pass, the encoder output
    is first used to calculate the CTC posterior probability; then for each output frame,
    if its blank posterior is bigger than some threshold, it will be simply discarded
    from the encoder output. To prevent information loss, we also put a convolution module
    similar to the one used in conformer (referred to as “LConv”) before the frame reduction.
 Data preparation
 ----------------
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 .. note::
   We encourage you to read ``./prepare.sh``.
 The data preparation contains several stages. You can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0
 means to run only stage 0.
 To run stage 2 to stage 5, use:
 .. code-block:: bash
  $ ./prepare.sh --stage 2 --stop-stage 5
 .. hint::
  If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
  dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. note::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe
   to the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
 For stability, the blank skip method is not used until model warm-up has finished.
 Configurable options
 ~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless7_ctc_bs/train.py --help
 shows you the training options that can be passed from the commandline.
 The following options are used quite often:
  - ``--full-libri``
    If it's True, the training part uses all the training data, i.e.,
    960 hours. Otherwise, the training part uses only the subset
    ``train-clean-100``, which has 100 hours of training data.
    .. CAUTION::
      The training set is perturbed by speed with two factors: 0.9 and 1.1.
      If ``--full-libri`` is True, each epoch actually processes
      ``3x960 == 2880`` hours of data.
  - ``--num-epochs``
    It is the number of epochs to train. For instance,
    ``./pruned_transducer_stateless7_ctc_bs/train.py --num-epochs 30`` trains for 30 epochs
    and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
    in the folder ``./pruned_transducer_stateless7_ctc_bs/exp``.
  - ``--start-epoch``
    It's used to resume training.
    ``./pruned_transducer_stateless7_ctc_bs/train.py --start-epoch 10`` loads the
    checkpoint ``./pruned_transducer_stateless7_ctc_bs/exp/epoch-9.pt`` and starts
    training from epoch 10, based on the state from epoch 9.
  - ``--world-size``
    It is used for multi-GPU single-machine DDP training.
      - (a) If it is 1, then no DDP training is used.
      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
    The following shows some use cases with it.
      **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
      GPU 2 for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="0,2"
          $ ./pruned_transducer_stateless7_ctc_bs/train.py --world-size 2
      **Use case 2**: You have 4 GPUs and you want to use all of them
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ ./pruned_transducer_stateless7_ctc_bs/train.py --world-size 4
      **Use case 3**: You have 4 GPUs but you only want to use GPU 3
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="3"
          $ ./pruned_transducer_stateless7_ctc_bs/train.py --world-size 1
    .. caution::
      Only multi-GPU single-machine DDP training is implemented at present.
      Multi-GPU multi-machine DDP training will be added later.
  - ``--max-duration``
    It specifies the number of seconds over all utterances in a
    batch, before **padding**.
    If you encounter CUDA OOM, please reduce it.
    .. HINT::
      Due to padding, the number of seconds of all utterances in a
      batch will usually be larger than ``--max-duration``.
      A larger value for ``--max-duration`` may cause OOM during training,
      while a smaller value may increase the training time. You have to
      tune it.
 Pre-configured options
 ~~~~~~~~~~~~~~~~~~~~~~
 There are some training options, e.g., weight decay,
 number of warmup steps, results dir, etc,
 that are not passed from the commandline.
 They are pre-configured by the function ``get_params()`` in
 `pruned_transducer_stateless7_ctc_bs/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/train.py>`_
 You don't need to change these pre-configured parameters. If you really need to change
 them, please modify ``./pruned_transducer_stateless7_ctc_bs/train.py`` directly.
 Training logs
 ~~~~~~~~~~~~~
 Training logs and checkpoints are saved in ``pruned_transducer_stateless7_ctc_bs/exp``.
 You will find the following files in that directory:
  - ``epoch-1.pt``, ``epoch-2.pt``, ...
    These are checkpoint files saved at the end of each epoch, containing model
    ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless7_ctc_bs/train.py --start-epoch 11
  - ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
    These are checkpoint files saved every ``--save-every-n`` batches,
    containing model ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless7_ctc_bs/train.py --start-batch 436000
  - ``tensorboard/``
    This folder contains TensorBoard logs. Training loss, validation loss, learning
    rate, etc, are recorded in these logs. You can visualize them by:
      .. code-block:: bash
        $ cd pruned_transducer_stateless7_ctc_bs/exp/tensorboard
        $ tensorboard dev upload --logdir . --description "Zipformer-CTC co-training using blank skip for LibriSpeech with icefall"
    It will print something like below:
      .. code-block::
        TensorFlow installation not found - running with reduced feature set.
        Upload started and will continue reading any new data as it's added to the logdir.
        To stop uploading, press Ctrl-C.
        New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/xyOZUKpEQm62HBIlUD4uPA/
    Note there is a URL in the above output. Click it and you will see
    tensorboard.
  .. hint::
    If you don't have access to google, you can use the following command
    to view the tensorboard log locally:
      .. code-block:: bash
        cd pruned_transducer_stateless7_ctc_bs/exp/tensorboard
        tensorboard --logdir . --port 6008
    It will print the following message:
      .. code-block::
        Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
        TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
    Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
    logs.
  - ``log/log-train-xxxx``
    It is the detailed training log in text format, same as the one
    you saw printed to the console during training.
 Usage example
 ~~~~~~~~~~~~~
 You can use the following command to start the training using 4 GPUs:
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES="0,1,2,3"
  ./pruned_transducer_stateless7_ctc_bs/train.py \
    --world-size 4 \
    --num-epochs 30 \
    --start-epoch 1 \
    --full-libri 1 \
    --exp-dir pruned_transducer_stateless7_ctc_bs/exp \
    --max-duration 600 \
    --use-fp16 1
 Decoding
 --------
 The decoding part uses checkpoints saved by the training part, so you have
 to run the training part first.
 .. hint::
   There are two kinds of checkpoints:
    - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
      of each epoch. You can pass ``--epoch`` to
      ``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py`` to use them.
    - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
      every ``--save-every-n`` batches. You can pass ``--iter`` to
      ``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py`` to use them.
    We suggest that you try both types of checkpoints and choose the one
    that produces the lowest WERs.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py --help
 shows the options for decoding.
 The following shows the example using ``epoch-*.pt``:
 .. code-block:: bash
    for m in greedy_search fast_beam_search modified_beam_search; do
        ./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py \
            --epoch 30 \
            --avg 13 \
            --exp-dir pruned_transducer_stateless7_ctc_bs/exp \
            --max-duration 600 \
            --decoding-method $m
    done
 To test CTC branch, you can use the following command:
 .. code-block:: bash
    for m in ctc-decoding 1best; do
        ./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py \
            --epoch 30 \
            --avg 13 \
            --exp-dir pruned_transducer_stateless7_ctc_bs/exp \
            --max-duration 600 \
            --decoding-method $m
    done
 Export models
 -------------
 `pruned_transducer_stateless7_ctc_bs/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/export.py>`_ supports exporting checkpoints from ``pruned_transducer_stateless7_ctc_bs/exp`` in the following ways.
 Export ``model.state_dict()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Checkpoints saved by ``pruned_transducer_stateless7_ctc_bs/train.py`` also include
 ``optimizer.state_dict()``. It is useful for resuming training. But after training,
 we are interested only in ``model.state_dict()``. You can use the following
 command to extract ``model.state_dict()``.
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/export.py \
    --exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 13 \
    --jit 0
 It will generate a file ``./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt``.
 .. hint::
   To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py``,
   you can run:
   .. code-block:: bash
      cd pruned_transducer_stateless7_ctc_bs/exp
      ln -s pretrained.pt epoch-9999.pt
   And then pass ``--epoch 9999 --avg 1 --use-averaged-model 0`` to
   ``./pruned_transducer_stateless7_ctc_bs/ctc_guild_decode_bs.py``.
 To use the exported model with ``./pruned_transducer_stateless7_ctc_bs/pretrained.py``, you
 can run:
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/pretrained.py \
    --checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method greedy_search \
    /path/to/foo.wav \
    /path/to/bar.wav
 To test CTC branch using the exported model with ``./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py``:
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py \
    --checkpoint ./pruned_transducer_stateless7_ctc_bs/exp/pretrained.pt \
    --bpe-model data/lang_bpe_500/bpe.model \
    --method ctc-decoding \
    --sample-rate 16000 \
    /path/to/foo.wav \
    /path/to/bar.wav
 Export model using ``torch.jit.script()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/export.py \
    --exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 13 \
    --jit 1
 It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
 load it by ``torch.jit.load("cpu_jit.pt")``.
 Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
 are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
 To use the generated files with ``./pruned_transducer_stateless7_ctc_bs/jit_pretrained.py``:
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/jit_pretrained.py \
    --nn-model-filename ./pruned_transducer_stateless7_ctc_bs/exp/cpu_jit.pt \
    /path/to/foo.wav \
    /path/to/bar.wav
 To test CTC branch using the generated files with ``./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py``:
 .. code-block:: bash
  ./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py \
    --model-filename ./pruned_transducer_stateless7_ctc_bs/exp/cpu_jit.pt \
    --bpe-model data/lang_bpe_500/bpe.model \
    --method ctc-decoding \
    --sample-rate 16000 \
    /path/to/foo.wav \
    /path/to/bar.wav
 Download pretrained models
 --------------------------
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
  - `<https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2022-12-14>`_
  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/zipformer_mmi.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/zipformer_mmi.rst
@ -0,0 +1,422 @@
 Zipformer MMI
 ===============
 .. hint::
   Please scroll down to the bottom of this page to find download links
   for pretrained models if you don't want to train a model from scratch.
 This tutorial shows you how to train a Zipformer MMI model
 with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 We use LF-MMI to compute the loss.
 .. note::
   You can find the document about LF-MMI training at the following address:
   `<https://github.com/k2-fsa/next-gen-kaldi-wechat/blob/master/pdf/LF-MMI-training-and-decoding-in-k2-Part-I.pdf>`_
 Data preparation
 ----------------
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 .. note::
   We encourage you to read ``./prepare.sh``.
 The data preparation contains several stages. You can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0
 means to run only stage 0.
 To run stage 2 to stage 5, use:
 .. code-block:: bash
  $ ./prepare.sh --stage 2 --stop-stage 5
 .. hint::
  If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
  dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. note::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe
   to the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
 For stability, it uses CTC loss for model warm-up and then switches to MMI loss.
 Configurable options
 ~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./zipformer_mmi/train.py --help
 shows you the training options that can be passed from the commandline.
 The following options are used quite often:
  - ``--full-libri``
    If it's True, the training part uses all the training data, i.e.,
    960 hours. Otherwise, the training part uses only the subset
    ``train-clean-100``, which has 100 hours of training data.
    .. CAUTION::
      The training set is perturbed by speed with two factors: 0.9 and 1.1.
      If ``--full-libri`` is True, each epoch actually processes
      ``3x960 == 2880`` hours of data.
  - ``--num-epochs``
    It is the number of epochs to train. For instance,
    ``./zipformer_mmi/train.py --num-epochs 30`` trains for 30 epochs
    and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
    in the folder ``./zipformer_mmi/exp``.
  - ``--start-epoch``
    It's used to resume training.
    ``./zipformer_mmi/train.py --start-epoch 10`` loads the
    checkpoint ``./zipformer_mmi/exp/epoch-9.pt`` and starts
    training from epoch 10, based on the state from epoch 9.
  - ``--world-size``
    It is used for multi-GPU single-machine DDP training.
      - (a) If it is 1, then no DDP training is used.
      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
    The following shows some use cases with it.
      **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
      GPU 2 for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="0,2"
          $ ./zipformer_mmi/train.py --world-size 2
      **Use case 2**: You have 4 GPUs and you want to use all of them
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ ./zipformer_mmi/train.py --world-size 4
      **Use case 3**: You have 4 GPUs but you only want to use GPU 3
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="3"
          $ ./zipformer_mmi/train.py --world-size 1
    .. caution::
      Only multi-GPU single-machine DDP training is implemented at present.
      Multi-GPU multi-machine DDP training will be added later.
  - ``--max-duration``
    It specifies the number of seconds over all utterances in a
    batch, before **padding**.
    If you encounter CUDA OOM, please reduce it.
    .. HINT::
      Due to padding, the number of seconds of all utterances in a
      batch will usually be larger than ``--max-duration``.
      A larger value for ``--max-duration`` may cause OOM during training,
      while a smaller value may increase the training time. You have to
      tune it.
 Pre-configured options
 ~~~~~~~~~~~~~~~~~~~~~~
 There are some training options, e.g., weight decay,
 number of warmup steps, results dir, etc,
 that are not passed from the commandline.
 They are pre-configured by the function ``get_params()`` in
 `zipformer_mmi/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/zipformer_mmi/train.py>`_
 You don't need to change these pre-configured parameters. If you really need to change
 them, please modify ``./zipformer_mmi/train.py`` directly.
 Training logs
 ~~~~~~~~~~~~~
 Training logs and checkpoints are saved in ``zipformer_mmi/exp``.
 You will find the following files in that directory:
  - ``epoch-1.pt``, ``epoch-2.pt``, ...
    These are checkpoint files saved at the end of each epoch, containing model
    ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
      .. code-block:: bash
        $ ./zipformer_mmi/train.py --start-epoch 11
  - ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
    These are checkpoint files saved every ``--save-every-n`` batches,
    containing model ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
      .. code-block:: bash
        $ ./zipformer_mmi/train.py --start-batch 436000
  - ``tensorboard/``
    This folder contains TensorBoard logs. Training loss, validation loss, learning
    rate, etc, are recorded in these logs. You can visualize them by:
      .. code-block:: bash
        $ cd zipformer_mmi/exp/tensorboard
        $ tensorboard dev upload --logdir . --description "Zipformer MMI training for LibriSpeech with icefall"
    It will print something like below:
      .. code-block::
        TensorFlow installation not found - running with reduced feature set.
        Upload started and will continue reading any new data as it's added to the logdir.
        To stop uploading, press Ctrl-C.
        New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/xyOZUKpEQm62HBIlUD4uPA/
    Note there is a URL in the above output. Click it and you will see
    tensorboard.
  .. hint::
    If you don't have access to google, you can use the following command
    to view the tensorboard log locally:
      .. code-block:: bash
        cd zipformer_mmi/exp/tensorboard
        tensorboard --logdir . --port 6008
    It will print the following message:
      .. code-block::
        Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
        TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
    Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
    logs.
  - ``log/log-train-xxxx``
    It is the detailed training log in text format, same as the one
    you saw printed to the console during training.
 Usage example
 ~~~~~~~~~~~~~
 You can use the following command to start the training using 4 GPUs:
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES="0,1,2,3"
  ./zipformer_mmi/train.py \
    --world-size 4 \
    --num-epochs 30 \
    --start-epoch 1 \
    --full-libri 1 \
    --exp-dir zipformer_mmi/exp \
    --max-duration 500 \
    --use-fp16 1 \
    --num-workers 2
 Decoding
 --------
 The decoding part uses checkpoints saved by the training part, so you have
 to run the training part first.
 .. hint::
   There are two kinds of checkpoints:
    - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
      of each epoch. You can pass ``--epoch`` to
      ``zipformer_mmi/decode.py`` to use them.
    - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
      every ``--save-every-n`` batches. You can pass ``--iter`` to
      ``zipformer_mmi/decode.py`` to use them.
    We suggest that you try both types of checkpoints and choose the one
    that produces the lowest WERs.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./zipformer_mmi/decode.py --help
 shows the options for decoding.
 The following shows the example using ``epoch-*.pt``:
 .. code-block:: bash
  for m in nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
    ./zipformer_mmi/decode.py \
      --epoch 30 \
      --avg 10 \
      --exp-dir ./zipformer_mmi/exp/ \
      --max-duration 100 \
      --lang-dir data/lang_bpe_500 \
      --nbest-scale 1.2 \
      --hp-scale 1.0 \
      --decoding-method $m
  done
 Export models
 -------------
 `zipformer_mmi/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/zipformer_mmi/export.py>`_ supports exporting checkpoints from ``zipformer_mmi/exp`` in the following ways.
 Export ``model.state_dict()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Checkpoints saved by ``zipformer_mmi/train.py`` also include
 ``optimizer.state_dict()``. It is useful for resuming training. But after training,
 we are interested only in ``model.state_dict()``. You can use the following
 command to extract ``model.state_dict()``.
 .. code-block:: bash
  ./zipformer_mmi/export.py \
    --exp-dir ./zipformer_mmi/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 9 \
    --jit 0
 It will generate a file ``./zipformer_mmi/exp/pretrained.pt``.
 .. hint::
   To use the generated ``pretrained.pt`` for ``zipformer_mmi/decode.py``,
   you can run:
   .. code-block:: bash
      cd zipformer_mmi/exp
      ln -s pretrained.pt epoch-9999.pt
   And then pass ``--epoch 9999 --avg 1 --use-averaged-model 0`` to
   ``./zipformer_mmi/decode.py``.
 To use the exported model with ``./zipformer_mmi/pretrained.py``, you
 can run:
 .. code-block:: bash
  ./zipformer_mmi/pretrained.py \
    --checkpoint ./zipformer_mmi/exp/pretrained.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method 1best \
    /path/to/foo.wav \
    /path/to/bar.wav
 Export model using ``torch.jit.script()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  ./zipformer_mmi/export.py \
    --exp-dir ./zipformer_mmi/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 9 \
    --jit 1
 It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
 load it by ``torch.jit.load("cpu_jit.pt")``.
 Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
 are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
 To use the generated files with ``./zipformer_mmi/jit_pretrained.py``:
 .. code-block:: bash
  ./zipformer_mmi/jit_pretrained.py \
    --nn-model-filename ./zipformer_mmi/exp/cpu_jit.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method 1best \
    /path/to/foo.wav \
    /path/to/bar.wav
 Download pretrained models
 --------------------------
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
  - `<https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-mmi-2022-12-08>`_
  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models
--- a/docs/source/recipes/Non-streaming-ASR/timit/index.rst
+++ b/docs/source/recipes/Non-streaming-ASR/timit/index.rst
--- a/docs/source/recipes/Non-streaming-ASR/timit/tdnn_ligru_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/timit/tdnn_ligru_ctc.rst
--- a/docs/source/recipes/Non-streaming-ASR/timit/tdnn_lstm_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/timit/tdnn_lstm_ctc.rst
--- a/docs/source/recipes/Non-streaming-ASR/yesno/images/tdnn-tensorboard-log.png
+++ b/docs/source/recipes/Non-streaming-ASR/yesno/images/tdnn-tensorboard-log.png
--- a/docs/source/recipes/Non-streaming-ASR/yesno/index.rst
+++ b/docs/source/recipes/Non-streaming-ASR/yesno/index.rst
--- a/docs/source/recipes/Non-streaming-ASR/yesno/tdnn.rst
+++ b/docs/source/recipes/Non-streaming-ASR/yesno/tdnn.rst
--- a/docs/source/recipes/Streaming-ASR/index.rst
+++ b/docs/source/recipes/Streaming-ASR/index.rst
@ -0,0 +1,12 @@
 Streaming ASR
 =============
 .. toctree::
   :maxdepth: 1
   introduction
 .. toctree::
   :maxdepth: 2
   librispeech/index
--- a/docs/source/recipes/Streaming-ASR/introduction.rst
+++ b/docs/source/recipes/Streaming-ASR/introduction.rst
@ -0,0 +1,52 @@
 Introduction
 ============
 This page shows you how we implement streaming **X-former transducer** models for ASR.
 .. HINT::
   X-former transducer here means the encoder of the transducer model uses Multi-Head Attention,
   like `Conformer <https://arxiv.org/pdf/2005.08100.pdf>`_, `EmFormer <https://arxiv.org/pdf/2010.10759.pdf>`_ etc.
 Currently we have implemented two types of streaming models: one uses Conformer as the encoder, and the other uses Emformer as the encoder.
 Streaming Conformer
 -------------------
 The main idea of training a streaming model is to make the model see only limited context
 at training time. We achieve this by applying a mask to the output of self-attention.
 In icefall, we implement the streaming conformer in the same way as `WeNet <https://arxiv.org/pdf/2012.05481.pdf>`_ did.
 .. NOTE::
   The conformer-transducer recipes for the LibriSpeech dataset, such as `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
   `pruned_transducer_stateless3 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless3>`_,
   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_
   all support streaming.
 .. NOTE::
   Training a streaming conformer model in ``icefall`` is almost the same as training a
   non-streaming model; all you need to do is pass several extra arguments.
   See :doc:`Pruned transducer statelessX <librispeech/pruned_transducer_stateless>` for more details.
 .. HINT::
   If you want to adapt a non-streaming conformer model to be streaming, please refer
   to `this pull request <https://github.com/k2-fsa/icefall/pull/454>`_.
 Streaming Emformer
 ------------------
 The Emformer model proposed `here <https://arxiv.org/pdf/2010.10759.pdf>`_ uses more
 complicated techniques. It has a memory bank component to memorize history information,
 what's more, it also introduces right context in training time by hard-copying part of
 the input features.
 We have three variants of Emformer models in ``icefall``.
 - ``pruned_stateless_emformer_rnnt2`` using Emformer from torchaudio, see `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2>`_.
 - ``conv_emformer_transducer_stateless`` using ConvEmformer implemented by ourselves. Different from the Emformer in torchaudio,
   ConvEmformer has a convolution in each layer and uses the mechanisms in our reworked conformer model.
   See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless>`_.
 - ``conv_emformer_transducer_stateless2`` using ConvEmformer implemented by ourselves. The only difference from the above one is that
   it uses a simplified memory bank. See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_.
--- a/docs/source/recipes/Streaming-ASR/librispeech/images/librispeech-lstm-transducer-tensorboard-log.png
+++ b/docs/source/recipes/Streaming-ASR/librispeech/images/librispeech-lstm-transducer-tensorboard-log.png
--- a/docs/source/recipes/Streaming-ASR/librispeech/images/streaming-librispeech-pruned-transducer-tensorboard-log.jpg
+++ b/docs/source/recipes/Streaming-ASR/librispeech/images/streaming-librispeech-pruned-transducer-tensorboard-log.jpg
--- a/docs/source/recipes/Streaming-ASR/librispeech/index.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/index.rst
@ -4,6 +4,8 @@ LibriSpeech
 .. toctree::
   :maxdepth: 1
-   tdnn_lstm_ctc
+   pruned_transducer_stateless
-   conformer_ctc
+
   lstm_pruned_stateless_transducer
   zipformer_transducer
--- a/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/lstm_pruned_stateless_transducer.rst
@ -515,10 +515,10 @@ To use the generated files with ``./lstm_transducer_stateless2/jit_pretrained``:
   Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/english/server.html>`_
   for how to use the exported models in ``sherpa``.
-.. _export-model-for-ncnn:
+.. _export-lstm-transducer-model-for-ncnn:
-Export model for ncnn
+Export LSTM transducer models for ncnn
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 We support exporting pretrained LSTM transducer models to
 `ncnn <https://github.com/tencent/ncnn>`_ using
@ -531,16 +531,36 @@ First, let us install a modified version of ``ncnn``:
  git clone https://github.com/csukuangfj/ncnn
  cd ncnn
  git submodule update --recursive --init
-  python3 setup.py bdist_wheel
+
-  ls -lh dist/
+  # Note: We don't use "python setup.py install" or "pip install ." here
-  pip install ./dist/*.whl
+
  mkdir -p build-wheel
  cd build-wheel
  cmake \
    -DCMAKE_BUILD_TYPE=Release \
    -DNCNN_PYTHON=ON \
    -DNCNN_BUILD_BENCHMARK=OFF \
    -DNCNN_BUILD_EXAMPLES=OFF \
    -DNCNN_BUILD_TOOLS=ON \
    ..
  make -j4
  cd ..
  # Note: $PWD here is /path/to/ncnn
  export PYTHONPATH=$PWD/python:$PYTHONPATH
  export PATH=$PWD/tools/pnnx/build/src:$PATH
  export PATH=$PWD/build-wheel/tools/quantize:$PATH
  # now build pnnx
  cd tools/pnnx
  mkdir build
  cd build
  cmake ..
  make -j4
  export PATH=$PWD/src:$PATH
  ./src/pnnx
@ -549,6 +569,9 @@ First, let us install a modified version of ``ncnn``:
   We assume that you have added the path to the binary ``pnnx`` to the
   environment variable ``PATH``.
   We also assume that you have added ``build-wheel/tools/quantize`` to the environment
   variable ``PATH`` so that you are able to use ``ncnn2int8`` later.
 Second, let us export the model using ``torch.jit.trace()`` that is suitable
 for ``pnnx``:
@ -634,3 +657,6 @@ by visiting the following links:
 You can find more usages of the pretrained models in
 `<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/index.html>`_
 Export ConvEmformer transducer models for ncnn
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst
@ -0,0 +1,735 @@
 Pruned transducer statelessX
 ============================
 This tutorial shows you how to run a **streaming** conformer transducer model
 with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 .. Note::
   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_,
   We will take pruned_transducer_stateless4 as an example in this tutorial.
 .. HINT::
  We assume you have read the page :ref:`install icefall` and have setup
  the environment for ``icefall``.
 .. HINT::
  We recommend you to use a GPU or several GPUs to run this recipe.
 .. hint::
   Please scroll down to the bottom of this page to find download links
   for pretrained models if you don't want to train a model from scratch.
 We use pruned RNN-T to compute the loss.
 .. note::
   You can find the paper about pruned RNN-T at the following address:
   `<https://arxiv.org/abs/2206.13236>`_
 The transducer model consists of 3 parts:
  - Encoder, a.k.a, the transcription network. We use a Conformer model (the reworked version by Daniel Povey)
  - Decoder, a.k.a, the prediction network. We use a stateless model consisting of
    ``nn.Embedding`` and ``nn.Conv1d``
  - Joiner, a.k.a, the joint network.
 .. caution::
   Contrary to the conventional RNN-T models, we use a stateless decoder.
   That is, it has no recurrent connections.
 Data preparation
 ----------------
 .. hint::
   The data preparation is the same as other recipes on LibriSpeech dataset,
   if you have finished this step, you can skip to ``Training`` directly.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 The data preparation contains several stages, you can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0
 means to run only stage 0.
 To run stage 2 to stage 5, use:
 .. code-block:: bash
  $ ./prepare.sh --stage 2 --stop-stage 5
 .. HINT::
  If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
  dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. NOTE::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
 .. NOTE::
   We put the streaming and non-streaming model in one recipe, to train a streaming model you only
   need to add **4** extra options compared with training a non-streaming model. These options are
   ``--dynamic-chunk-training``, ``--num-left-chunks``, ``--causal-convolution``, ``--short-chunk-size``.
   You can see the configurable options below for their meanings or read https://arxiv.org/pdf/2012.05481.pdf for more details.
 Configurable options
 ~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless4/train.py --help
 shows you the training options that can be passed from the commandline.
 The following options are used quite often:
  - ``--exp-dir``
    The directory to save checkpoints, training logs and tensorboard.
  - ``--full-libri``
    If it's True, the training part uses all the training data, i.e.,
    960 hours. Otherwise, the training part uses only the subset
    ``train-clean-100``, which has 100 hours of training data.
    .. CAUTION::
      The training set is perturbed by speed with two factors: 0.9 and 1.1.
      If ``--full-libri`` is True, each epoch actually processes
      ``3x960 == 2880`` hours of data.
  - ``--num-epochs``
    It is the number of epochs to train. For instance,
    ``./pruned_transducer_stateless4/train.py --num-epochs 30`` trains for 30 epochs
    and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
    in the folder ``./pruned_transducer_stateless4/exp``.
  - ``--start-epoch``
    It's used to resume training.
    ``./pruned_transducer_stateless4/train.py --start-epoch 10`` loads the
    checkpoint ``./pruned_transducer_stateless4/exp/epoch-9.pt`` and starts
    training from epoch 10, based on the state from epoch 9.
  - ``--world-size``
    It is used for multi-GPU single-machine DDP training.
      - (a) If it is 1, then no DDP training is used.
      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
    The following shows some use cases with it.
      **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
      GPU 2 for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="0,2"
          $ ./pruned_transducer_stateless4/train.py --world-size 2
      **Use case 2**: You have 4 GPUs and you want to use all of them
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ ./pruned_transducer_stateless4/train.py --world-size 4
      **Use case 3**: You have 4 GPUs but you only want to use GPU 3
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="3"
          $ ./pruned_transducer_stateless4/train.py --world-size 1
    .. caution::
      Only multi-GPU single-machine DDP training is implemented at present.
      Multi-GPU multi-machine DDP training will be added later.
  - ``--max-duration``
    It specifies the number of seconds over all utterances in a
    batch, before **padding**.
    If you encounter CUDA OOM, please reduce it.
    .. HINT::
      Due to padding, the number of seconds of all utterances in a
      batch will usually be larger than ``--max-duration``.
      A larger value for ``--max-duration`` may cause OOM during training,
      while a smaller value may increase the training time. You have to
      tune it.
  - ``--use-fp16``
    If it is True, the model will train with half precision, from our experiment
    results, by using half precision you can train with two times larger ``--max-duration``
    so as to get almost 2X speed up.
  - ``--dynamic-chunk-training``
    The flag that indicates whether to train a streaming model or not, it
    **MUST** be True if you want to train a streaming model.
  - ``--short-chunk-size``
    When training a streaming attention model with chunk masking, the chunk size
    would be either max sequence length of current batch or uniformly sampled from
    (1, short_chunk_size). The default value is 25, you don't have to change it most of the time.
  - ``--num-left-chunks``
    It indicates how many left-context chunks can be seen when calculating attention.
    The default value is 4, you don't have to change it most of the time.
  - ``--causal-convolution``
    Whether to use causal convolution in conformer encoder layer. This must
    be True when training a streaming model.
 Pre-configured options
 ~~~~~~~~~~~~~~~~~~~~~~
 There are some training options, e.g., number of encoder layers,
 encoder dimension, decoder dimension, number of warmup steps etc,
 that are not passed from the commandline.
 They are pre-configured by the function ``get_params()`` in
 `pruned_transducer_stateless4/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/train.py>`_
 You don't need to change these pre-configured parameters. If you really need to change
 them, please modify ``./pruned_transducer_stateless4/train.py`` directly.
 .. NOTE::
  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`_ are a little different from
  other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from commandline, so that you can train models with different size with pruned_transducer_stateless5.
 Training logs
 ~~~~~~~~~~~~~
 Training logs and checkpoints are saved in ``--exp-dir`` (e.g. ``pruned_transducer_stateless4/exp``).
 You will find the following files in that directory:
  - ``epoch-1.pt``, ``epoch-2.pt``, ...
    These are checkpoint files saved at the end of each epoch, containing model
    ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless4/train.py --start-epoch 11
  - ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
    These are checkpoint files saved every ``--save-every-n`` batches,
    containing model ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless4/train.py --start-batch 436000
  - ``tensorboard/``
    This folder contains tensorBoard logs. Training loss, validation loss, learning
    rate, etc, are recorded in these logs. You can visualize them by:
      .. code-block:: bash
        $ cd pruned_transducer_stateless4/exp/tensorboard
        $ tensorboard dev upload --logdir . --description "pruned transducer training for LibriSpeech with icefall"
    It will print something like below:
      .. code-block::
        TensorFlow installation not found - running with reduced feature set.
        Upload started and will continue reading any new data as it's added to the logdir.
        To stop uploading, press Ctrl-C.
        New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/97VKXf80Ru61CnP2ALWZZg/
        [2022-11-20T15:50:50] Started scanning logdir.
        Uploading 4468 scalars...
        [2022-11-20T15:53:02] Total uploaded: 210171 scalars, 0 tensors, 0 binary objects
        Listening for new data in logdir...
    Note there is a URL in the above output. Click it and you will see
    the following screenshot:
      .. figure:: images/streaming-librispeech-pruned-transducer-tensorboard-log.jpg
         :width: 600
         :alt: TensorBoard screenshot
         :align: center
         :target: https://tensorboard.dev/experiment/97VKXf80Ru61CnP2ALWZZg/
         TensorBoard screenshot.
  .. hint::
    If you don't have access to google, you can use the following command
    to view the tensorboard log locally:
      .. code-block:: bash
        cd pruned_transducer_stateless4/exp/tensorboard
        tensorboard --logdir . --port 6008
    It will print the following message:
      .. code-block::
        Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
        TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
    Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
    logs.
  - ``log/log-train-xxxx``
    It is the detailed training log in text format, same as the one
    you saw printed to the console during training.
 Usage example
 ~~~~~~~~~~~~~
 You can use the following command to start the training using 4 GPUs:
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES="0,1,2,3"
  ./pruned_transducer_stateless4/train.py \
     --world-size 4 \
     --dynamic-chunk-training 1 \
     --causal-convolution 1 \
     --num-epochs 30 \
     --start-epoch 1 \
     --exp-dir pruned_transducer_stateless4/exp \
     --full-libri 1 \
     --max-duration 300
 .. NOTE::
   Compared with training a non-streaming model, you only need to add two extra options,
   ``--dynamic-chunk-training 1``  and ``--causal-convolution 1`` .
 Decoding
 --------
 The decoding part uses checkpoints saved by the training part, so you have
 to run the training part first.
 .. hint::
   There are two kinds of checkpoints:
    - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
      of each epoch. You can pass ``--epoch`` to
      ``pruned_transducer_stateless4/decode.py`` to use them.
    - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
      every ``--save-every-n`` batches. You can pass ``--iter`` to
      ``pruned_transducer_stateless4/decode.py`` to use them.
    We suggest that you try both types of checkpoints and choose the one
    that produces the lowest WERs.
 .. tip::
    To decode a streaming model, you can use either ``simulate streaming decoding`` in ``decode.py`` or
    ``real streaming decoding`` in ``streaming_decode.py``, the difference between ``decode.py`` and
    ``streaming_decode.py`` is that, ``decode.py`` processes the whole acoustic frames at one time with masking (i.e. same as training),
    but ``streaming_decode.py`` processes the acoustic frames chunk by chunk (so it can only see limited context).
 .. NOTE::
   ``simulate streaming decoding`` in ``decode.py`` and ``real streaming decoding`` in ``streaming_decode.py`` should
   produce almost the same results given the same ``--decode-chunk-size`` and ``--left-context``.
 Simulate streaming decoding
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless4/decode.py --help
 shows the options for decoding.
 The following options are important for streaming models:
  ``--simulate-streaming``
    If you want to decode a streaming model with ``decode.py``, you **MUST** set
    ``--simulate-streaming`` to ``True``. ``simulate`` here means the acoustic frames
    are not processed frame by frame (or chunk by chunk), instead, the whole sequence
    is processed at one time with masking (the same as training).
  ``--causal-convolution``
    If True, the convolution module in encoder layers will be causal convolution.
    This **MUST** be True when decoding with a streaming model.
  ``--decode-chunk-size``
    For streaming models, we will calculate the chunk-wise attention, ``--decode-chunk-size``
    indicates the chunk length (in frames after subsampling) for chunk-wise attention.
    For ``simulate streaming decoding`` the ``decode-chunk-size`` is used to generate
    the attention mask.
  ``--left-context``
    ``--left-context`` indicates how many left context frames (after subsampling) can be seen
    for current chunk when calculating chunk-wise attention. Normally, ``left-context`` should equal
    to ``decode-chunk-size * num-left-chunks``, where ``num-left-chunks`` is the option used
    to train this model. For ``simulate streaming decoding`` the ``left-context`` is used to generate
    the attention mask.
 The following shows two examples (for the two types of checkpoints):
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for epoch in 25 20; do
      for avg in 7 5 3 1; do
        ./pruned_transducer_stateless4/decode.py \
          --epoch $epoch \
          --avg $avg \
          --simulate-streaming 1 \
          --causal-convolution 1 \
          --decode-chunk-size 16 \
          --left-context 64 \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for iter in 474000; do
      for avg in 8 10 12 14 16 18; do
        ./pruned_transducer_stateless4/decode.py \
          --iter $iter \
          --avg $avg \
          --simulate-streaming 1 \
          --causal-convolution 1 \
          --decode-chunk-size 16 \
          --left-context 64 \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 Real streaming decoding
 ~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless4/streaming_decode.py --help
 shows the options for decoding.
 The following options are important for streaming models:
  ``--decode-chunk-size``
    For streaming models, we will calculate the chunk-wise attention, ``--decode-chunk-size``
    indicates the chunk length (in frames after subsampling) for chunk-wise attention.
    For ``real streaming decoding``, we will process ``decode-chunk-size`` acoustic frames at each time.
  ``--left-context``
    ``--left-context`` indicates how many left context frames (after subsampling) can be seen
    for current chunk when calculating chunk-wise attention. Normally, ``left-context`` should equal
    to ``decode-chunk-size * num-left-chunks``, where ``num-left-chunks`` is the option used
    to train this model.
  ``--num-decode-streams``
    The number of decoding streams that can be run in parallel (very similar to the ``batch size``).
    For ``real streaming decoding``, the batches will be packed dynamically, for example, if the
    ``num-decode-streams`` equals to 10, then, sequence 1 to 10 will be decoded at first, after a while,
    suppose sequence 1 and 2 are done, so, sequence 3 to 12 will be processed parallelly in a batch.
 .. NOTE::
   We also try adding ``--right-context`` in the real streaming decoding, but it seems not to benefit
   the performance for all the models, the reasons might be the training and decoding mismatch. You
   can try decoding with ``--right-context`` to see if it helps. The default value is 0.
 The following shows two examples (for the two types of checkpoints):
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for epoch in 25 20; do
      for avg in 7 5 3 1; do
        ./pruned_transducer_stateless4/streaming_decode.py \
          --epoch $epoch \
          --avg $avg \
          --decode-chunk-size 16 \
          --left-context 64 \
          --num-decode-streams 100 \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for iter in 474000; do
      for avg in 8 10 12 14 16 18; do
        ./pruned_transducer_stateless4/streaming_decode.py \
          --iter $iter \
          --avg $avg \
          --decode-chunk-size 16 \
          --left-context 64 \
          --num-decode-streams 100 \
          --exp-dir pruned_transducer_stateless4/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. tip::
  Supported decoding methods are as follows:
    - ``greedy_search`` : It takes the symbol with largest posterior probability
      of each frame as the decoding result.
    - ``beam_search`` :  It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
      `espnet/nets/beam_search_transducer.py <https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py#L247>`_
      is used as a reference. Basically, it keeps topk states for each frame, and expands the kept states with their own contexts to
      next frame.
    - ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
      runs in batch mode with ``--max-sym-per-frame=1`` being hardcoded.
    - ``fast_beam_search`` : It implements graph composition between the output ``log_probs`` and
      given ``FSAs``. It is hard to describe the details in several lines of texts, you can read
      our paper in https://arxiv.org/pdf/2211.00484.pdf or our `rnnt decode code in k2 <https://github.com/k2-fsa/k2/blob/master/k2/csrc/rnnt_decode.h>`_. ``fast_beam_search`` can decode with ``FSAs`` on GPU efficiently.
    - ``fast_beam_search_LG`` : The same as ``fast_beam_search`` above, ``fast_beam_search`` uses
      a trivial graph that has only one state, while ``fast_beam_search_LG`` uses an LG graph
      (with N-gram LM).
    - ``fast_beam_search_nbest`` : It produces the decoding results as follows:
      - (1) Use ``fast_beam_search`` to get a lattice
      - (2) Select ``num_paths`` paths from the lattice using ``k2.random_paths()``
      - (3) Unique the selected paths
      - (4) Intersect the selected paths with the lattice and compute the
            shortest path from the intersection result
      - (5) The path with the largest score is used as the decoding output.
    - ``fast_beam_search_nbest_LG`` : It implements same logic as ``fast_beam_search_nbest``, the
      only difference is that it uses ``fast_beam_search_LG`` to generate the lattice.
 .. NOTE::
  The decoding methods supported in ``streaming_decode.py`` might be fewer than those in ``decode.py``. If needed,
  you can implement them by yourself or file an issue in `icefall <https://github.com/k2-fsa/icefall/issues>`_ .
 Export Model
 ------------
 `pruned_transducer_stateless4/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless4/export.py>`_ supports exporting checkpoints from ``pruned_transducer_stateless4/exp`` in the following ways.
 Export ``model.state_dict()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Checkpoints saved by ``pruned_transducer_stateless4/train.py`` also include
 ``optimizer.state_dict()``. It is useful for resuming training. But after training,
 we are interested only in ``model.state_dict()``. You can use the following
 command to extract ``model.state_dict()``.
 .. code-block:: bash
  # Assume that --epoch 25 --avg 3 produces the smallest WER
  # (You can get such information after running ./pruned_transducer_stateless4/decode.py)
  epoch=25
  avg=3
  ./pruned_transducer_stateless4/export.py \
    --exp-dir ./pruned_transducer_stateless4/exp \
    --streaming-model 1 \
    --causal-convolution 1 \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch $epoch \
    --avg  $avg
 .. caution::
   ``--streaming-model`` and ``--causal-convolution`` must be True to export
   a streaming model.
 It will generate a file ``./pruned_transducer_stateless4/exp/pretrained.pt``.
 .. hint::
   To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless4/decode.py``,
   you can run:
   .. code-block:: bash
      cd pruned_transducer_stateless4/exp
      ln -s pretrained.pt epoch-999.pt
   And then pass ``--epoch 999 --avg 1 --use-averaged-model 0`` to
   ``./pruned_transducer_stateless4/decode.py``.
 To use the exported model with ``./pruned_transducer_stateless4/pretrained.py``, you
 can run:
 .. code-block:: bash
  ./pruned_transducer_stateless4/pretrained.py \
    --checkpoint ./pruned_transducer_stateless4/exp/pretrained.pt \
    --simulate-streaming 1 \
    --causal-convolution 1 \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method greedy_search \
    /path/to/foo.wav \
    /path/to/bar.wav
 Export model using ``torch.jit.script()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  ./pruned_transducer_stateless4/export.py \
    --exp-dir ./pruned_transducer_stateless4/exp \
    --streaming-model 1 \
    --causal-convolution 1 \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 25 \
    --avg 3 \
    --jit 1
 .. caution::
   ``--streaming-model`` and ``--causal-convolution`` must be True to export
   a streaming model.
 It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
 load it by ``torch.jit.load("cpu_jit.pt")``.
 Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
 are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
 .. NOTE::
   You will need this ``cpu_jit.pt`` when deploying with Sherpa framework.
 Download pretrained models
 --------------------------
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
  - `pruned_transducer_stateless <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless_20220625>`_
  - `pruned_transducer_stateless2 <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless2_20220625>`_
  - `pruned_transducer_stateless4 <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless4_20220625>`_
  - `pruned_transducer_stateless5 <https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless5_20220729>`_
  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models
 Deploy with Sherpa
 ------------------
 Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/conformer/index.html#>`_
 for how to deploy the models in ``sherpa``.
--- a/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst
@ -0,0 +1,654 @@
 Zipformer Transducer
 ====================
 This tutorial shows you how to run a **streaming** zipformer transducer model
 with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 .. Note::
   The tutorial is suitable for `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_.
 .. HINT::
  We assume you have read the page :ref:`install icefall` and have set up
  the environment for ``icefall``.
 .. HINT::
  We recommend you to use a GPU or several GPUs to run this recipe.
 .. hint::
   Please scroll down to the bottom of this page to find download links
   for pretrained models if you don't want to train a model from scratch.
 We use pruned RNN-T to compute the loss.
 .. note::
   You can find the paper about pruned RNN-T at the following address:
   `<https://arxiv.org/abs/2206.13236>`_
 The transducer model consists of 3 parts:
  - Encoder, a.k.a, the transcription network. We use a Zipformer model (proposed by Daniel Povey)
  - Decoder, a.k.a, the prediction network. We use a stateless model consisting of
    ``nn.Embedding`` and ``nn.Conv1d``
  - Joiner, a.k.a, the joint network.
 .. caution::
   Contrary to the conventional RNN-T models, we use a stateless decoder.
   That is, it has no recurrent connections.
 Data preparation
 ----------------
 .. hint::
   The data preparation is the same as other recipes on LibriSpeech dataset,
   if you have finished this step, you can skip to ``Training`` directly.
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh
 The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
 All you need to do is to run it.
 The data preparation contains several stages, you can use the following two
 options:
  - ``--stage``
  - ``--stop-stage``
 to control which stage(s) should be run. By default, all stages are executed.
 For example,
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./prepare.sh --stage 0 --stop-stage 0
 means to run only stage 0.
 To run stage 2 to stage 5, use:
 .. code-block:: bash
  $ ./prepare.sh --stage 2 --stop-stage 5
 .. HINT::
  If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
  dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
  they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
  the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
  ``./prepare.sh`` won't re-download them.
 .. NOTE::
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
 Configurable options
 ~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless7_streaming/train.py --help
 shows you the training options that can be passed from the commandline.
 The following options are used quite often:
  - ``--exp-dir``
    The directory to save checkpoints, training logs and tensorboard.
  - ``--full-libri``
    If it's True, the training part uses all the training data, i.e.,
    960 hours. Otherwise, the training part uses only the subset
    ``train-clean-100``, which has 100 hours of training data.
    .. CAUTION::
      The training set is perturbed by speed with two factors: 0.9 and 1.1.
      If ``--full-libri`` is True, each epoch actually processes
      ``3x960 == 2880`` hours of data.
  - ``--num-epochs``
    It is the number of epochs to train. For instance,
    ``./pruned_transducer_stateless7_streaming/train.py --num-epochs 30`` trains for 30 epochs
    and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
    in the folder ``./pruned_transducer_stateless7_streaming/exp``.
  - ``--start-epoch``
    It's used to resume training.
    ``./pruned_transducer_stateless7_streaming/train.py --start-epoch 10`` loads the
    checkpoint ``./pruned_transducer_stateless7_streaming/exp/epoch-9.pt`` and starts
    training from epoch 10, based on the state from epoch 9.
  - ``--world-size``
    It is used for multi-GPU single-machine DDP training.
      - (a) If it is 1, then no DDP training is used.
      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
    The following shows some use cases with it.
      **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
      GPU 2 for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="0,2"
          $ ./pruned_transducer_stateless7_streaming/train.py --world-size 2
      **Use case 2**: You have 4 GPUs and you want to use all of them
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ ./pruned_transducer_stateless7_streaming/train.py --world-size 4
      **Use case 3**: You have 4 GPUs but you only want to use GPU 3
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/librispeech/ASR
          $ export CUDA_VISIBLE_DEVICES="3"
          $ ./pruned_transducer_stateless7_streaming/train.py --world-size 1
    .. caution::
      Only multi-GPU single-machine DDP training is implemented at present.
      Multi-GPU multi-machine DDP training will be added later.
  - ``--max-duration``
    It specifies the number of seconds over all utterances in a
    batch, before **padding**.
    If you encounter CUDA OOM, please reduce it.
    .. HINT::
      Due to padding, the number of seconds of all utterances in a
      batch will usually be larger than ``--max-duration``.
      A larger value for ``--max-duration`` may cause OOM during training,
      while a smaller value may increase the training time. You have to
      tune it.
  - ``--use-fp16``
    If it is True, the model is trained with half precision. In our experiments,
    half precision allows a two times larger ``--max-duration``, which gives
    an almost 2X speed-up.
    We recommend using ``--use-fp16 True``.
  - ``--short-chunk-size``
    When training a streaming attention model with chunk masking, the chunk size
    would be either max sequence length of current batch or uniformly sampled from
    (1, short_chunk_size). The default value is 50, you don't have to change it most of the time.
  - ``--num-left-chunks``
    It indicates how many chunks of left context can be seen when calculating attention.
    The default value is 4, you don't have to change it most of the time.
  - ``--decode-chunk-len``
    The chunk size for decoding (in frames before subsampling). It is used for validation.
    The default value is 32 (i.e., 320ms).
 Pre-configured options
 ~~~~~~~~~~~~~~~~~~~~~~
 There are some training options, e.g., number of encoder layers,
 encoder dimension, decoder dimension, number of warmup steps etc,
 that are not passed from the commandline.
 They are pre-configured by the function ``get_params()`` in
 `pruned_transducer_stateless7_streaming/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/train.py>`_
 You don't need to change these pre-configured parameters. If you really need to change
 them, please modify ``./pruned_transducer_stateless7_streaming/train.py`` directly.
 Training logs
 ~~~~~~~~~~~~~
 Training logs and checkpoints are saved in ``--exp-dir`` (e.g. ``pruned_transducer_stateless7_streaming/exp``).
 You will find the following files in that directory:
  - ``epoch-1.pt``, ``epoch-2.pt``, ...
    These are checkpoint files saved at the end of each epoch, containing model
    ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless7_streaming/train.py --start-epoch 11
  - ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
    These are checkpoint files saved every ``--save-every-n`` batches,
    containing model ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
      .. code-block:: bash
        $ ./pruned_transducer_stateless7_streaming/train.py --start-batch 436000
  - ``tensorboard/``
    This folder contains TensorBoard logs. Training loss, validation loss, learning
    rate, etc, are recorded in these logs. You can visualize them by:
      .. code-block:: bash
        $ cd pruned_transducer_stateless7_streaming/exp/tensorboard
        $ tensorboard dev upload --logdir . --description "pruned transducer training for LibriSpeech with icefall"
  .. hint::
    If you don't have access to google, you can use the following command
    to view the tensorboard log locally:
      .. code-block:: bash
        cd pruned_transducer_stateless7_streaming/exp/tensorboard
        tensorboard --logdir . --port 6008
    It will print the following message:
      .. code-block::
        Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
        TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
    Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
    logs.
  - ``log/log-train-xxxx``
    It is the detailed training log in text format, same as the one
    you saw printed to the console during training.
 Usage example
 ~~~~~~~~~~~~~
 You can use the following command to start the training using 4 GPUs:
 .. code-block:: bash
  export CUDA_VISIBLE_DEVICES="0,1,2,3"
  ./pruned_transducer_stateless7_streaming/train.py \
    --world-size 4 \
    --num-epochs 30 \
    --start-epoch 1 \
    --use-fp16 1 \
    --exp-dir pruned_transducer_stateless7_streaming/exp \
    --full-libri 1 \
    --max-duration 550
 Decoding
 --------
 The decoding part uses checkpoints saved by the training part, so you have
 to run the training part first.
 .. hint::
   There are two kinds of checkpoints:
    - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
      of each epoch. You can pass ``--epoch`` to
      ``pruned_transducer_stateless7_streaming/decode.py`` to use them.
    - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
      every ``--save-every-n`` batches. You can pass ``--iter`` to
      ``pruned_transducer_stateless7_streaming/decode.py`` to use them.
    We suggest that you try both types of checkpoints and choose the one
    that produces the lowest WERs.
 .. tip::
    To decode a streaming model, you can use either ``simulate streaming decoding`` in ``decode.py`` or
    ``real chunk-wise streaming decoding`` in ``streaming_decode.py``. The difference between ``decode.py`` and
    ``streaming_decode.py`` is that, ``decode.py`` processes the whole acoustic frames at one time with masking (i.e. same as training),
    but ``streaming_decode.py`` processes the acoustic frames chunk by chunk.
 .. NOTE::
   ``simulate streaming decoding`` in ``decode.py`` and ``real chunk-wise streaming decoding`` in ``streaming_decode.py`` should
   produce almost the same results given the same ``--decode-chunk-len``.
 Simulate streaming decoding
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless7_streaming/decode.py --help
 shows the options for decoding.
 The following options are important for streaming models:
  ``--decode-chunk-len``
    It is the same as in ``train.py``, which specifies the chunk size for decoding (in frames before subsampling).
    The default value is 32 (i.e., 320ms).
 The following shows two examples (for the two types of checkpoints):
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for epoch in 30; do
      for avg in 12 11 10 9 8; do
        ./pruned_transducer_stateless7_streaming/decode.py \
          --epoch $epoch \
          --avg $avg \
          --decode-chunk-len 32 \
          --exp-dir pruned_transducer_stateless7_streaming/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for iter in 474000; do
      for avg in 8 10 12 14 16 18; do
        ./pruned_transducer_stateless7_streaming/decode.py \
          --iter $iter \
          --avg $avg \
          --decode-chunk-len 32 \
          --exp-dir pruned_transducer_stateless7_streaming/exp \
          --max-duration 600 \
          --decoding-method $m
      done
    done
  done
 Real streaming decoding
 ~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  $ cd egs/librispeech/ASR
  $ ./pruned_transducer_stateless7_streaming/streaming_decode.py --help
 shows the options for decoding.
 The following options are important for streaming models:
  ``--decode-chunk-len``
    It is the same as in ``train.py``, which specifies the chunk size for decoding (in frames before subsampling).
    The default value is 32 (i.e., 320ms).
    For ``real streaming decoding``, we will process ``decode-chunk-len`` acoustic frames at each time.
  ``--num-decode-streams``
    The number of decoding streams that can be run in parallel (very similar to the ``batch size``).
    For ``real streaming decoding``, the batches are packed dynamically. For example, if
    ``num-decode-streams`` equals 10, then sequences 1 to 10 are decoded first; after a while,
    suppose sequences 1 and 2 are done, then sequences 3 to 12 are processed in parallel in a batch.
 The following shows two examples (for the two types of checkpoints):
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for epoch in 30; do
      for avg in 12 11 10 9 8; do
        ./pruned_transducer_stateless7_streaming/streaming_decode.py \
          --epoch $epoch \
          --avg $avg \
          --decode-chunk-len 32 \
          --num-decode-streams 100 \
          --exp-dir pruned_transducer_stateless7_streaming/exp \
          --decoding-method $m
      done
    done
  done
 .. code-block:: bash
  for m in greedy_search fast_beam_search modified_beam_search; do
    for iter in 474000; do
      for avg in 8 10 12 14 16 18; do
        ./pruned_transducer_stateless7_streaming/streaming_decode.py \
          --iter $iter \
          --avg $avg \
          --decode-chunk-len 16 \
          --num-decode-streams 100 \
          --exp-dir pruned_transducer_stateless7_streaming/exp \
          --decoding-method $m
      done
    done
  done
 .. tip::
  Supported decoding methods are as follows:
    - ``greedy_search`` : It takes the symbol with largest posterior probability
      of each frame as the decoding result.
    - ``beam_search`` :  It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
      `espnet/nets/beam_search_transducer.py <https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py#L247>`_
      is used as a reference. Basically, it keeps topk states for each frame, and expands the kept states with their own contexts to
      next frame.
    - ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
      runs in batch mode with ``--max-sym-per-frame=1`` being hardcoded.
    - ``fast_beam_search`` : It implements graph composition between the output ``log_probs`` and
      given ``FSAs``. It is hard to describe the details in a few lines of text; you can read
      our paper in https://arxiv.org/pdf/2211.00484.pdf or our `rnnt decode code in k2 <https://github.com/k2-fsa/k2/blob/master/k2/csrc/rnnt_decode.h>`_. ``fast_beam_search`` can decode with ``FSAs`` on GPU efficiently.
    - ``fast_beam_search_LG`` : The same as ``fast_beam_search`` above, ``fast_beam_search`` uses
      a trivial graph that has only one state, while ``fast_beam_search_LG`` uses an LG graph
      (with N-gram LM).
    - ``fast_beam_search_nbest`` : It produces the decoding results as follows:
      - (1) Use ``fast_beam_search`` to get a lattice
      - (2) Select ``num_paths`` paths from the lattice using ``k2.random_paths()``
      - (3) Unique the selected paths
      - (4) Intersect the selected paths with the lattice and compute the
            shortest path from the intersection result
      - (5) The path with the largest score is used as the decoding output.
    - ``fast_beam_search_nbest_LG`` : It implements the same logic as ``fast_beam_search_nbest``, the
      only difference is that it uses ``fast_beam_search_LG`` to generate the lattice.
 .. NOTE::
  The decoding methods supported by ``streaming_decode.py`` may be fewer than those in ``decode.py``. If needed,
  you can implement them yourself or file an issue in `icefall <https://github.com/k2-fsa/icefall/issues>`_ .
 Export Model
 ------------
 Currently it supports exporting checkpoints from ``pruned_transducer_stateless7_streaming/exp`` in the following ways.
 Export ``model.state_dict()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Checkpoints saved by ``pruned_transducer_stateless7_streaming/train.py`` also include
 ``optimizer.state_dict()``. It is useful for resuming training. But after training,
 we are interested only in ``model.state_dict()``. You can use the following
 command to extract ``model.state_dict()``.
 .. code-block:: bash
  # Assume that --epoch 30 --avg 9 produces the smallest WER
  # (You can get such information after running ./pruned_transducer_stateless7_streaming/decode.py)
  epoch=30
  avg=9
  ./pruned_transducer_stateless7_streaming/export.py \
    --exp-dir ./pruned_transducer_stateless7_streaming/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch $epoch \
    --avg  $avg \
    --use-averaged-model=True \
    --decode-chunk-len 32
 It will generate a file ``./pruned_transducer_stateless7_streaming/exp/pretrained.pt``.
 .. hint::
   To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless7_streaming/decode.py``,
   you can run:
   .. code-block:: bash
      cd pruned_transducer_stateless7_streaming/exp
      ln -s pretrained.pt epoch-999.pt
   And then pass ``--epoch 999 --avg 1 --use-averaged-model 0`` to
   ``./pruned_transducer_stateless7_streaming/decode.py``.
 To use the exported model with ``./pruned_transducer_stateless7_streaming/pretrained.py``, you
 can run:
 .. code-block:: bash
  ./pruned_transducer_stateless7_streaming/pretrained.py \
    --checkpoint ./pruned_transducer_stateless7_streaming/exp/pretrained.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --method greedy_search \
    --decode-chunk-len 32 \
    /path/to/foo.wav \
    /path/to/bar.wav
 Export model using ``torch.jit.script()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  ./pruned_transducer_stateless7_streaming/export.py \
    --exp-dir ./pruned_transducer_stateless7_streaming/exp \
    --bpe-model data/lang_bpe_500/bpe.model \
    --epoch 30 \
    --avg 9 \
    --decode-chunk-len 32 \
    --jit 1
 .. caution::
   ``--decode-chunk-len`` is required to export a ScriptModule.
 It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
 load it by ``torch.jit.load("cpu_jit.pt")``.
 Note ``cpu`` in the name ``cpu_jit.pt`` means the parameters when loaded into Python
 are on CPU. You can use ``to("cuda")`` to move them to a CUDA device.
 Export model using ``torch.jit.trace()``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. code-block:: bash
  epoch=30
  avg=9
  ./pruned_transducer_stateless7_streaming/jit_trace_export.py \
    --bpe-model data/lang_bpe_500/bpe.model \
    --use-averaged-model=True \
    --decode-chunk-len 32 \
    --exp-dir ./pruned_transducer_stateless7_streaming/exp \
    --epoch $epoch \
    --avg $avg
 .. caution::
   ``--decode-chunk-len`` is required to export a ScriptModule.
 It will generate 3 files:
  - ``./pruned_transducer_stateless7_streaming/exp/encoder_jit_trace.pt``
  - ``./pruned_transducer_stateless7_streaming/exp/decoder_jit_trace.pt``
  - ``./pruned_transducer_stateless7_streaming/exp/joiner_jit_trace.pt``
 To use the generated files with ``./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py``:
 .. code-block:: bash
  ./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py \
    --encoder-model-filename ./pruned_transducer_stateless7_streaming/exp/encoder_jit_trace.pt \
    --decoder-model-filename ./pruned_transducer_stateless7_streaming/exp/decoder_jit_trace.pt \
    --joiner-model-filename ./pruned_transducer_stateless7_streaming/exp/joiner_jit_trace.pt \
    --bpe-model ./data/lang_bpe_500/bpe.model \
    --decode-chunk-len 32 \
    /path/to/foo.wav
 Download pretrained models
 --------------------------
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
  - `pruned_transducer_stateless7_streaming <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`_
  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models
 Deploy with Sherpa
 ------------------
 Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/conformer/index.html#>`_
 for how to deploy the models in ``sherpa``.
--- a/docs/source/recipes/index.rst
+++ b/docs/source/recipes/index.rst
@ -13,7 +13,5 @@ We may add recipes for other tasks as well in the future.
   :maxdepth: 2
   :caption: Table of Contents
-   aishell/index
+   Non-streaming-ASR/index
-   librispeech/index
+   Streaming-ASR/index
   timit/index
   yesno/index
--- a/egs/alimeeting/ASR_v2/README.md
+++ b/egs/alimeeting/ASR_v2/README.md
@ -0,0 +1,38 @@
 # Introduction
 This recipe trains multi-domain ASR models for AliMeeting. By multi-domain, we mean that
 we train a single model on close-talk and far-field conditions. This recipe optionally
 uses [GSS]-based enhancement for far-field array microphone.
 We pool data in the following 4 ways and train a single model on the pooled data:
 (i) individual headset microphone (IHM)
 (ii) IHM with simulated reverb
 (iii) Single distant microphone (SDM)
 (iv) GSS-enhanced array microphones
 This is different from `alimeeting/ASR` since that recipe trains a model only on the
 far-field audio. Additionally, we use text normalization here similar to the original
 M2MeT challenge, so the results should be more comparable to those from Table 4 of
 the [paper](https://arxiv.org/abs/2110.07393).
 The following additional packages need to be installed to run this recipe:
 * `pip install jieba`
 * `pip install paddlepaddle`
 * `pip install git+https://github.com/desh2608/gss.git`
 [./RESULTS.md](./RESULTS.md) contains the latest results.
 ## Performance Record
 ### pruned_transducer_stateless7
 The following are decoded using `modified_beam_search`:
 | Evaluation set           | eval WER    | test WER |
 |--------------------------|------------|---------|
 | IHM                      |  9.58  | 11.53 |
 | SDM                      |  23.37  | 25.85 |
 | MDM (GSS-enhanced)       |  11.82  | 14.22 |
 See [RESULTS](/egs/alimeeting/ASR_v2/RESULTS.md) for details.
--- a/egs/alimeeting/ASR_v2/RESULTS.md
+++ b/egs/alimeeting/ASR_v2/RESULTS.md
@ -0,0 +1,90 @@
 ## Results (CER)
 #### 2022-12-09
 #### Zipformer (pruned_transducer_stateless7)
 Zipformer encoder + non-recurrent decoder. The decoder
 contains only an embedding layer, a Conv1d (with kernel size 2) and a linear
 layer (to transform tensor dim).
 All the results below are using a single model that is trained by combining the following
 data: IHM, IHM+reverb, SDM, and GSS-enhanced MDM. Speed perturbation and MUSAN noise
 augmentation are applied on top of the pooled data.
 **WERs for IHM:**
 |                           | eval | test | comment                                  |
 |---------------------------|------------|------------|------------------------------------------|
 | greedy search             |  10.13  |  12.21  | --epoch 15 --avg 8 --max-duration 500 |
 | modified beam search      |  9.58  |  11.53  | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 |
 | fast beam search          |  9.92  |  12.07  | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 --max-contexts 4 --max-states 8 |
 **WERs for SDM:**
 |                           | eval | test | comment                                  |
 |---------------------------|------------|------------|------------------------------------------|
 | greedy search             |  23.70  |  26.41  | --epoch 15 --avg 8 --max-duration 500 |
 | modified beam search      |  23.37  |  25.85  | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 |
 | fast beam search          |  23.60  |  26.38  | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 --max-contexts 4 --max-states 8 |
 **WERs for GSS-enhanced MDM:**
 |                           | eval | test | comment                                  |
 |---------------------------|------------|------------|------------------------------------------|
 | greedy search             |  12.24  |  14.99  | --epoch 15 --avg 8 --max-duration 500 |
 | modified beam search      |  11.82  |  14.22  | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 |
 | fast beam search          |  12.30  |  14.98  | --epoch 15 --avg 8 --max-duration 500 --beam-size 4 --max-contexts 4 --max-states 8 |
 The training command for reproducing is given below:
 ```
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 ./pruned_transducer_stateless7/train.py \
  --world-size 4 \
  --num-epochs 15 \
  --exp-dir pruned_transducer_stateless7/exp \
  --max-duration 300 \
  --max-cuts 100 \
  --prune-range 5 \
  --lr-factor 5 \
  --lm-scale 0.25 \
  --use-fp16 True
 ```
 The decoding command is:
 ```
 # greedy search
 ./pruned_transducer_stateless7/decode.py \
        --epoch 15 \
        --avg 8 \
        --exp-dir ./pruned_transducer_stateless7/exp \
        --max-duration 500 \
        --decoding-method greedy_search
 # modified beam search
 ./pruned_transducer_stateless7/decode.py \
        --epoch 15 \
        --avg 8 \
        --exp-dir ./pruned_transducer_stateless7/exp \
        --max-duration 500 \
        --decoding-method modified_beam_search \
        --beam-size 4
 # fast beam search
 ./pruned_transducer_stateless7/decode.py \
        --epoch 15 \
        --avg 8 \
        --exp-dir ./pruned_transducer_stateless7/exp \
        --max-duration 500 \
        --decoding-method fast_beam_search \
        --beam 4 \
        --max-contexts 4 \
        --max-states 8
 ```
 Pretrained model is available at <https://huggingface.co/desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7>
 The tensorboard training log can be found at
 <https://tensorboard.dev/experiment/EzmVahMMTb2YfKWXwQ2dyQ/#scalars>
--- a/egs/alimeeting/ASR_v2/local/init.py
+++ b/egs/alimeeting/ASR_v2/local/init.py
--- a/egs/alimeeting/ASR_v2/local/compute_fbank_alimeeting.py
+++ b/egs/alimeeting/ASR_v2/local/compute_fbank_alimeeting.py
@ -0,0 +1,193 @@
 #!/usr/bin/env python3
 # Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file computes fbank features of the AliMeeting dataset.
 For the training data, we prepare IHM, reverberated IHM, SDM, and GSS-enhanced
 audios. For the test data, we separately prepare IHM, SDM, and GSS-enhanced
 parts (which are the 3 evaluation settings).
 It looks for manifests in the directory data/manifests.
 The generated fbank features are saved in data/fbank.
 """
 import logging
 from pathlib import Path
 import torch
 import torch.multiprocessing
 from lhotse import CutSet, LilcomChunkyWriter
 from lhotse.features.kaldifeat import (
    KaldifeatFbank,
    KaldifeatFbankConfig,
    KaldifeatFrameOptions,
    KaldifeatMelOptions,
 )
 from lhotse.recipes.utils import read_manifests_if_cached
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slows things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 torch.multiprocessing.set_sharing_strategy("file_system")
 def compute_fbank_ami():
    """Compute and store fbank features for all AliMeeting conditions.
    Manifests are read from ``data/manifests`` and features are written to
    ``data/fbank`` with an 80-bin Kaldifeat fbank extractor (16 kHz sampling
    rate) configured to run on ``cuda``. For every split/condition pair a
    ``cuts_{split}_{condition}.jsonl.gz`` path under ``data/manifests`` is
    passed as ``manifest_path`` to the feature computation.
    Training data: IHM, reverberated IHM, SDM and GSS cuts; each set is
    extended with 0.9x and 1.1x speed-perturbed copies before extraction.
    Eval/test data: IHM, SDM and GSS cuts, without any augmentation.
    NOTE(review): the ``_ami`` suffix looks like a leftover from the AMI
    recipe; the name is kept unchanged so existing callers keep working.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    sampling_rate = 16000
    num_mel_bins = 80
    # Extra ``trim_to_supervisions`` options per condition. Dict order is the
    # processing order; only IHM passes ``keep_all_channels=False``.
    trim_options = {
        "ihm": {"keep_all_channels": False},
        "sdm": {},
        "gss": {},
    }
    extractor = KaldifeatFbank(
        KaldifeatFbankConfig(
            frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
            device="cuda",
        )
    )
    logging.info("Reading manifests")
    # One manifest dict per condition, looked up with the prefixes
    # "alimeeting-ihm", "alimeeting-sdm" and "alimeeting-gss".
    manifests = {
        condition: read_manifests_if_cached(
            dataset_parts=["train", "eval", "test"],
            output_dir=src_dir,
            prefix=f"alimeeting-{condition}",
            suffix="jsonl.gz",
        )
        for condition in trim_options
    }
    def _load_cuts(condition: str, split: str) -> CutSet:
        # Build the cuts of one condition/split, trimmed to supervisions.
        return CutSet.from_manifests(
            **manifests[condition][split]
        ).trim_to_supervisions(keep_overlapping=False, **trim_options[condition])
    def _extract_feats(cuts: CutSet, storage_path: Path, manifest_path: Path) -> None:
        # Training only: add 0.9x and 1.1x speed-perturbed copies, then
        # extract features in large batches.
        cuts = cuts + cuts.perturb_speed(0.9) + cuts.perturb_speed(1.1)
        cuts.compute_and_store_features_batch(
            extractor=extractor,
            storage_path=storage_path,
            manifest_path=manifest_path,
            batch_duration=5000,
            num_workers=8,
            storage_type=LilcomChunkyWriter,
        )
    logging.info(
        "Preparing training cuts: IHM + reverberated IHM + SDM + GSS (optional)"
    )
    for condition in trim_options:
        logging.info(f"Processing train split {condition.upper()}")
        # Suffix cut ids with the condition tag ("-ihm", "-sdm", "-gss").
        # ``tag`` is bound as a default so each lambda keeps its own value.
        train_cuts = _load_cuts(condition, "train").modify_ids(
            lambda cut_id, tag=condition: cut_id + "-" + tag
        )
        _extract_feats(
            train_cuts,
            output_dir / f"feats_train_{condition}",
            src_dir / f"cuts_train_{condition}.jsonl.gz",
        )
        if condition == "ihm":
            # The reverberated copy is derived from the IHM training cuts and
            # processed right after them.
            logging.info("Processing train split IHM + reverberated IHM")
            _extract_feats(
                train_cuts.reverb_rir(),
                output_dir / "feats_train_ihm_rvb",
                src_dir / "cuts_train_ihm_rvb.jsonl.gz",
            )
    logging.info("Preparing test cuts: IHM, SDM, GSS (optional)")
    for split in ["eval", "test"]:
        for condition in trim_options:
            logging.info(f"Processing {split} {condition.upper()}")
            # No augmentation for evaluation data. The returned CutSet is
            # intentionally discarded; nothing below uses it.
            _load_cuts(condition, split).compute_and_store_features_batch(
                extractor=extractor,
                storage_path=output_dir / f"feats_{split}_{condition}",
                manifest_path=src_dir / f"cuts_{split}_{condition}.jsonl.gz",
                batch_duration=500,
                num_workers=4,
                storage_type=LilcomChunkyWriter,
            )
 if __name__ == "__main__":
    # Log timestamp, level and source location with every message, then run
    # the feature extraction.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    compute_fbank_ami()
--- a/egs/alimeeting/ASR_v2/local/compute_fbank_musan.py
+++ b/egs/alimeeting/ASR_v2/local/compute_fbank_musan.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/compute_fbank_musan.py
--- a/egs/alimeeting/ASR_v2/local/prepare_alimeeting_enhanced.py
+++ b/egs/alimeeting/ASR_v2/local/prepare_alimeeting_enhanced.py
@ -0,0 +1,158 @@
 #!/usr/local/bin/python
 # -*- coding: utf-8 -*-
 # Data preparation for AliMeeting GSS-enhanced dataset.
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 from lhotse import Recording, RecordingSet, SupervisionSet
 from lhotse.qa import fix_manifests
 from lhotse.recipes.utils import read_manifests_if_cached
 from lhotse.utils import fastcopy
 from tqdm import tqdm
 # Root logger setup: INFO level, padded level name, second-resolution timestamps.
 logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)-8s %(message)s",
 )
 def get_args():
    """
    Parse the command-line arguments of this script.
    Positional arguments:
      manifests_dir: directory containing the AliMeeting manifests.
      enhanced_dir: directory containing the enhanced audio.
    Options:
      --num-jobs / -j: number of parallel jobs (default: 1).
      --min-segment-duration / -d: minimum segment duration in seconds
        (default: 0.0).
    Returns:
      The populated ``argparse.Namespace``.
    """
    import argparse
    # The description previously said "AMI"; this script prepares AliMeeting.
    parser = argparse.ArgumentParser(
        description="AliMeeting enhanced dataset preparation."
    )
    parser.add_argument(
        "manifests_dir",
        type=Path,
        help="Path to directory containing AliMeeting manifests.",
    )
    parser.add_argument(
        "enhanced_dir",
        type=Path,
        help="Path to enhanced data directory.",
    )
    parser.add_argument(
        "--num-jobs",
        "-j",
        type=int,
        default=1,
        help="Number of parallel jobs to run.",
    )
    parser.add_argument(
        "--min-segment-duration",
        "-d",
        type=float,
        default=0.0,
        help="Minimum duration of a segment in seconds.",
    )
    return parser.parse_args()
 def find_recording_and_create_new_supervision(enhanced_dir, supervision):
    """
    Locate the enhanced audio file that corresponds to one supervision segment.
    The file is looked up at
    ``<enhanced_dir>/<recording_id>/<recording_id>-<speaker>-<start>_<end>.flac``,
    where ``<start>`` and ``<end>`` are the supervision start/end times multiplied
    by 100, truncated to an integer and zero-padded to 6 digits.
    Returns:
      ``None`` if the file does not exist or has zero duration (a warning is
      logged in both cases); otherwise a tuple ``(recording, new_supervision)``
      where ``new_supervision`` is a copy of ``supervision`` that refers to the
      enhanced recording, starts at 0 and spans the whole recording.
    """
    start_cs = int(100 * supervision.start)
    end_cs = int(100 * supervision.end)
    file_name = Path(
        f"{supervision.recording_id}-{supervision.speaker}-{start_cs:06d}_{end_cs:06d}.flac"
    )
    save_path = enhanced_dir / f"{supervision.recording_id}" / file_name
    # Guard clauses: nothing to return when the enhanced segment is unusable.
    if not save_path.exists():
        logging.warning(f"{save_path} does not exist.")
        return None
    recording = Recording.from_file(save_path)
    if recording.duration == 0:
        logging.warning(f"Skipping {save_path} which has duration 0 seconds.")
        return None
    # The incoming supervision is relative to the original recording; the new
    # one is relative to the enhanced segment.
    return recording, fastcopy(
        supervision,
        recording_id=recording.id,
        start=0,
        duration=recording.duration,
    )
 def main(args):
    """
    Build recording and supervision manifests for the enhanced segments.
    For each of the train/eval/test parts:
      1. take the cached "alimeeting-sdm" supervisions and keep those whose
         duration is at least ``args.min_segment_duration``;
      2. look up the enhanced file of every supervision in a thread pool of
         ``args.num_jobs`` workers;
      3. drop duplicated recording ids (with a warning), run ``fix_manifests``
         and write ``alimeeting-gss_recordings_<part>.jsonl.gz`` and
         ``alimeeting-gss_supervisions_<part>.jsonl.gz`` to ``args.manifests_dir``.
    Raises:
      ValueError: if no cached SDM manifests are found in ``args.manifests_dir``.
    """
    manifests_dir = args.manifests_dir
    enhanced_dir = args.enhanced_dir
    parts = ("train", "eval", "test")
    # Load manifests from cache if they exist (saves time)
    manifests = read_manifests_if_cached(
        dataset_parts=list(parts),
        output_dir=manifests_dir,
        prefix="alimeeting-sdm",
        suffix="jsonl.gz",
    )
    if not manifests:
        raise ValueError(f"AliMeeting SDM manifests not found in {manifests_dir}")
    with ThreadPoolExecutor(args.num_jobs) as ex:
        for part in parts:
            logging.info(f"Processing {part}...")
            supervisions_orig = manifests[part]["supervisions"].filter(
                lambda s: s.duration >= args.min_segment_duration
            )
            # Submit one lookup task per supervision.
            futures = [
                ex.submit(
                    find_recording_and_create_new_supervision,
                    enhanced_dir,
                    supervision,
                )
                for supervision in tqdm(supervisions_orig, desc="Distributing tasks")
            ]
            # Collect results in submission order, skipping failed lookups.
            recordings = []
            supervisions = []
            for future in tqdm(futures, total=len(futures), desc="Processing tasks"):
                result = future.result()
                if result is None:
                    continue
                recording, new_supervision = result
                recordings.append(recording)
                supervisions.append(new_supervision)
            # Keep only the first recording seen for each id.
            unique_recordings = {}
            for recording in recordings:
                if recording.id in unique_recordings:
                    logging.warning(f"Recording {recording.id} is duplicated.")
                    continue
                unique_recordings[recording.id] = recording
            recordings = RecordingSet.from_recordings(unique_recordings.values())
            supervisions = SupervisionSet.from_segments(supervisions)
            recordings, supervisions = fix_manifests(
                recordings=recordings, supervisions=supervisions
            )
            logging.info(f"Writing {part} enhanced manifests")
            recordings.to_file(
                manifests_dir / f"alimeeting-gss_recordings_{part}.jsonl.gz"
            )
            supervisions.to_file(
                manifests_dir / f"alimeeting-gss_supervisions_{part}.jsonl.gz"
            )
 if __name__ == "__main__":
    # Script entry point: parse the command line and run the preparation.
    main(get_args())
--- a/egs/alimeeting/ASR_v2/local/prepare_alimeeting_gss.sh
+++ b/egs/alimeeting/ASR_v2/local/prepare_alimeeting_gss.sh
@ -0,0 +1,98 @@
 #!/bin/bash
 # This script is used to run GSS-based enhancement on AliMeeting data.
 set -euo pipefail
 # Default option values; they must be defined before parse_options.sh is sourced.
 nj=4
 stage=0
 . shared/parse_options.sh || exit 1
 # Exactly two positional arguments are required: <data-dir> and <exp-dir>.
 if [ $# != 2 ]; then
   echo "Wrong #arguments ($#, expected 2)"
   echo "Usage: local/prepare_alimeeting_gss.sh [options] <data-dir> <exp-dir>"
   echo "e.g. local/prepare_alimeeting_gss.sh data/manifests exp/alimeeting_gss"
   echo "main options (for others, see top of script file)"
   echo "  --nj <nj>                                # number of parallel jobs"
   echo "  --stage <stage>                          # stage to start running from"
   exit 1;
 fi
 # DATA_DIR holds the input manifests; EXP_DIR receives cuts and enhanced audio.
 DATA_DIR=$1
 EXP_DIR=$2
 mkdir -p $EXP_DIR
 log() {
  # This function is from espnet
  # Prints the message ("$*") prefixed with the current date/time and
  # "(<file>:<line>:<function>)" taken from bash's call-stack arrays.
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 if [ $stage -le 1 ]; then
  log "Stage 1: Prepare cut sets"
  # Build one cut manifest per split from the MDM recordings and supervisions.
  for part in train eval test; do
    recordings=$DATA_DIR/alimeeting-mdm_recordings_${part}.jsonl.gz
    supervisions=$DATA_DIR/alimeeting-mdm_supervisions_${part}.jsonl.gz
    lhotse cut simple -r $recordings -s $supervisions \
      $EXP_DIR/cuts_${part}.jsonl.gz
  done
 fi
 if [ $stage -le 2 ]; then
  log "Stage 2: Trim cuts to supervisions (1 cut per supervision segment)"
  # Overlapping supervisions are discarded from each trimmed cut.
  for part in train eval test; do
    src_cuts=$EXP_DIR/cuts_${part}.jsonl.gz
    segment_cuts=$EXP_DIR/cuts_per_segment_${part}.jsonl.gz
    lhotse cut trim-to-supervisions --discard-overlapping $src_cuts $segment_cuts
  done
 fi
 if [ $stage -le 3 ]; then
  log "Stage 3: Split manifests for multi-GPU processing (optional)"
  # Each per-segment manifest is split into $nj pieces, one per job.
  for part in train eval test; do
    split_dir=$EXP_DIR/cuts_per_segment_${part}_split$nj
    gss utils split $nj $EXP_DIR/cuts_per_segment_${part}.jsonl.gz $split_dir
  done
 fi
 if [ $stage -le 4 ]; then
  log "Stage 4: Enhance train segments using GSS (requires GPU)"
  # for train, we use smaller context and larger batches to speed-up processing
  # The loop variable must be expanded (${JOB}); a literal "JOB" in the path only
  # works with queue-style launchers that substitute it, which are not used here.
  # NOTE(review): assumes `gss utils split` names its outputs
  # cuts_per_segment_train.<1..nj>.jsonl.gz without zero padding -- confirm for nj >= 10.
  for JOB in $(seq $nj); do
    gss enhance cuts $EXP_DIR/cuts_train.jsonl.gz \
      $EXP_DIR/cuts_per_segment_train_split$nj/cuts_per_segment_train.${JOB}.jsonl.gz $EXP_DIR/enhanced \
      --bss-iterations 10 \
      --context-duration 5.0 \
      --use-garbage-class \
      --channels 0,1,2,3,4,5,6,7 \
      --min-segment-length 0.05 \
      --max-segment-length 25.0 \
      --max-batch-duration 60.0 \
      --num-buckets 4 \
      --num-workers 4
  done
 fi
 if [ $stage -le 5 ]; then
  log "Stage 5: Enhance eval/test segments using GSS (using GPU)"
  # for eval/test, we use larger context and smaller batches to get better quality
  # The loop variable must be expanded (${JOB}); a literal "JOB" in the path only
  # works with queue-style launchers that substitute it, which are not used here.
  # NOTE(review): assumes `gss utils split` names its outputs
  # cuts_per_segment_<part>.<1..nj>.jsonl.gz without zero padding -- confirm for nj >= 10.
  for part in eval test; do
    for JOB in $(seq $nj); do
      gss enhance cuts $EXP_DIR/cuts_${part}.jsonl.gz \
        $EXP_DIR/cuts_per_segment_${part}_split$nj/cuts_per_segment_${part}.${JOB}.jsonl.gz \
        $EXP_DIR/enhanced \
        --bss-iterations 10 \
        --context-duration 15.0 \
        --use-garbage-class \
        --channels 0,1,2,3,4,5,6,7 \
        --min-segment-length 0.05 \
        --max-segment-length 16.0 \
        --max-batch-duration 45.0 \
        --num-buckets 4 \
        --num-workers 4
    done
  done
 fi
 if [ $stage -le 6 ]; then
  log "Stage 6: Prepare manifests for GSS-enhanced data"
  # Segments shorter than 0.05 s are ignored when building the manifests.
  python local/prepare_alimeeting_enhanced.py \
    $DATA_DIR $EXP_DIR/enhanced \
    -j $nj \
    --min-segment-duration 0.05
 fi
--- a/egs/alimeeting/ASR_v2/local/prepare_char.py
+++ b/egs/alimeeting/ASR_v2/local/prepare_char.py
@ -0,0 +1 @@
 ../../ASR/local/prepare_char.py
--- a/egs/alimeeting/ASR_v2/local/prepare_words.py
+++ b/egs/alimeeting/ASR_v2/local/prepare_words.py
@ -0,0 +1 @@
 ../../ASR/local/prepare_words.py
--- a/egs/alimeeting/ASR_v2/local/text2segments.py
+++ b/egs/alimeeting/ASR_v2/local/text2segments.py
@ -0,0 +1 @@
 ../../ASR/local/text2segments.py
--- a/egs/alimeeting/ASR_v2/local/text2token.py
+++ b/egs/alimeeting/ASR_v2/local/text2token.py
@ -0,0 +1 @@
 ../../ASR/local/text2token.py
--- a/egs/alimeeting/ASR_v2/prepare.sh
+++ b/egs/alimeeting/ASR_v2/prepare.sh
@ -0,0 +1,125 @@
 #!/usr/bin/env bash
 # Data preparation pipeline for the AliMeeting ASR_v2 recipe.
 set -eou pipefail
 # A stage N below runs only when stage <= N <= stop_stage.
 stage=-1
 stop_stage=100
 use_gss=true  # Use GSS-based enhancement with MDM setting
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
 # by this script automatically.
 # NOTE(review): stage 0 below only downloads AliMeeting; no stage in this
 # script downloads MUSAN -- confirm whether it must be provided manually.
 #
 #  - $dl_dir/alimeeting
 #     This directory contains the following files downloaded from
 #       https://openslr.org/62/
 #
 #     - Train_Ali_far.tar.gz
 #     - Train_Ali_near.tar.gz
 #     - Test_Ali.tar.gz
 #     - Eval_Ali.tar.gz
 #
 #  - $dl_dir/musan
 #      This directory contains the following directories downloaded from
 #       http://www.openslr.org/17/
 #
 #     - music
 #     - noise
 #     - speech
 dl_dir=$PWD/download
 # presumably lets command-line options override the variables set above -- TODO confirm
 . shared/parse_options.sh || exit 1
 # All files generated by this script are saved in "data".
 # You can safely remove "data" and rerun this script to regenerate it.
 mkdir -p data
 log() {
  # This function is from espnet
  # Prints the message ("$*") prefixed with the current date/time and
  # "(<file>:<line>:<function>)" taken from bash's call-stack arrays.
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Record the download directory in effect for this run.
 log "dl_dir: $dl_dir"
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"
  # Skip the download when the far-field training archive is already present.
  [ -f $dl_dir/alimeeting/Train_Ali_far.tar.gz ] || \
    lhotse download ali-meeting $dl_dir/alimeeting
 fi
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare alimeeting manifest"
  # The AliMeeting corpus is expected under $dl_dir/alimeeting.
  # Manifests for every microphone setting are written to data/manifests.
  mkdir -p data/manifests/alimeeting
  for mic in ihm sdm mdm; do
    lhotse prepare ali-meeting --mic $mic --save-mono --normalize-text m2met \
      $dl_dir/alimeeting data/manifests
  done
 fi
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # The MUSAN corpus is expected under $dl_dir/musan.
  musan_manifest_dir=data/manifests
  mkdir -p $musan_manifest_dir
  lhotse prepare musan $dl_dir/musan $musan_manifest_dir
 fi
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  # This stage is skipped entirely unless use_gss is "true".
  if [ $use_gss = true ]; then
    log "Stage 3: Apply GSS enhancement on MDM data (this stage requires a GPU)"
    # Requires the GSS package to be installed: https://github.com/desh2608/gss
    local/prepare_alimeeting_gss.sh data/manifests exp/alimeeting_gss
  fi
 fi
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
  # Make sure the feature directory exists before running the extractor.
  fbank_dir=data/fbank
  mkdir -p $fbank_dir
  python local/compute_fbank_musan.py
 fi
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Compute fbank for alimeeting"
  mkdir -p data/fbank
  python local/compute_fbank_alimeeting.py
  log "Combine features from train splits"
  # IHM, reverberated IHM and SDM cuts are always combined. The GSS cuts are
  # added only when GSS enhancement is enabled: with use_gss=false stage 3 is
  # skipped, so requiring cuts_train_gss.jsonl.gz here would make this stage fail.
  train_cuts=(data/manifests/cuts_train_{ihm,ihm_rvb,sdm}.jsonl.gz)
  if [ $use_gss = true ]; then
    train_cuts+=(data/manifests/cuts_train_gss.jsonl.gz)
  fi
  # Shuffle the combined cuts before writing the final training manifest.
  lhotse combine "${train_cuts[@]}" - | shuf |\
    gzip -c > data/manifests/cuts_train_all.jsonl.gz
 fi
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Prepare char based lang"
  lang_char_dir=data/lang_char
  mkdir -p $lang_char_dir
  # Step 1: extract the training transcripts and tokenize them into characters.
  # jq is required; on Linux it can be obtained with:
  # wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
  train_supervisions=data/manifests/alimeeting-sdm_supervisions_train.jsonl.gz
  gunzip -c $train_supervisions \
    | jq ".text" \
    | sed 's/"//g' \
    | ./local/text2token.py -t "char" > $lang_char_dir/text
  # Step 2: segment the text into words.
  python ./local/text2segments.py \
    --input $lang_char_dir/text \
    --output $lang_char_dir/text_words_segmentation
  # Step 3: one word per line, sorted, de-duplicated, empty lines removed.
  sed "s/ /\n/g" < $lang_char_dir/text_words_segmentation \
    | sort -u \
    | sed "/^$/d" \
    | uniq > $lang_char_dir/words_no_ids.txt
  # Step 4: build words.txt unless it already exists.
  if [ ! -f $lang_char_dir/words.txt ]; then
    ./local/prepare_words.py \
      --input-file $lang_char_dir/words_no_ids.txt \
      --output-file $lang_char_dir/words.txt
  fi
  # Step 5: run the char lang preparation unless L_disambig.pt already exists.
  if [ ! -f $lang_char_dir/L_disambig.pt ]; then
    ./local/prepare_char.py
  fi
 fi
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/init.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/init.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/asr_datamodule.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/asr_datamodule.py
@ -0,0 +1,419 @@
 # Copyright      2021  Piotr Żelasko
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import logging
 import re
 from functools import lru_cache
 from pathlib import Path
 from typing import Any, Dict, Optional
 import torch
 from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
 from lhotse.cut import Cut
 from lhotse.dataset import (
    CutConcatenate,
    CutMix,
    DynamicBucketingSampler,
    K2SpeechRecognitionDataset,
    PrecomputedFeatures,
    SpecAugment,
 )
 from lhotse.dataset.input_strategies import OnTheFlyFeatures
 from lhotse.utils import fix_random_seed
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 from icefall.utils import str2bool
 class _SeedWorkers:
    """
    Callable used as the ``worker_init_fn`` of the training DataLoader.
    When called with a worker id, it fixes the random seed of that worker to
    ``seed + worker_id``.
    """
    def __init__(self, seed: int):
        # Base seed shared by all workers.
        self.seed = seed
    def __call__(self, worker_id: int):
        worker_seed = self.seed + worker_id
        fix_random_seed(worker_seed)
 class AlimeetingAsrDataModule:
    """
    DataModule for k2 ASR experiments on AliMeeting.
    It provides one train and one valid dataloader, and a test dataloader
    factory that can be called once per test set (IHM, SDM and, if available,
    GSS-enhanced cuts).
    It contains all the common data pipeline modules used in ASR
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - augmentation,
    - on-the-fly feature extraction
    All options are read from the ``argparse.Namespace`` passed to the
    constructor; see :meth:`add_arguments` for the available options.
    """
    def __init__(self, args: argparse.Namespace):
        self.args = args
    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Register the data-related command-line options on ``parser``."""
        group = parser.add_argument_group(
            title="ASR data related options",
            description=(
                "These options are used for the preparation of "
                "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
                "effective batch sizes, sampling strategies, applied data "
                "augmentations, etc."
            ),
        )
        group.add_argument(
            "--manifest-dir",
            type=Path,
            default=Path("data/manifests"),
            help="Path to directory with train/valid/test cuts.",
        )
        group.add_argument(
            "--enable-musan",
            type=str2bool,
            default=True,
            help=(
                "When enabled, select noise from MUSAN and mix it "
                "with training dataset. "
            ),
        )
        group.add_argument(
            "--concatenate-cuts",
            type=str2bool,
            default=False,
            help=(
                "When enabled, utterances (cuts) will be concatenated "
                "to minimize the amount of padding."
            ),
        )
        group.add_argument(
            "--duration-factor",
            type=float,
            default=1.0,
            help=(
                "Determines the maximum duration of a concatenated cut "
                "relative to the duration of the longest cut in a batch."
            ),
        )
        group.add_argument(
            "--gap",
            type=float,
            default=1.0,
            help=(
                "The amount of padding (in seconds) inserted between "
                "concatenated cuts. This padding is filled with noise when "
                "noise augmentation is used."
            ),
        )
        # The default is a float (100.0), so the option is parsed as a float too;
        # previously `type=int` disagreed with the float default.
        group.add_argument(
            "--max-duration",
            type=float,
            default=100.0,
            help=(
                "Maximum pooled recordings duration (seconds) in a "
                "single batch. You can reduce it if it causes CUDA OOM."
            ),
        )
        group.add_argument(
            "--max-cuts", type=int, default=None, help="Maximum cuts in a single batch."
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=50,
            help=(
                "The number of buckets for the BucketingSampler "
                "(you might want to increase it for larger datasets)."
            ),
        )
        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help=(
                "When enabled, use on-the-fly cut mixing and feature "
                "extraction. Will drop existing precomputed feature manifests "
                "if available."
            ),
        )
        # NOTE(review): --shuffle is parsed but not read anywhere in this class;
        # train_dataloaders() always builds its sampler with shuffle=False.
        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help=(
                "When enabled (=default), the examples will be "
                "shuffled for each epoch."
            ),
        )
        group.add_argument(
            "--num-workers",
            type=int,
            default=8,
            help="The number of training dataloader workers that collect the batches.",
        )
        group.add_argument(
            "--enable-spec-aug",
            type=str2bool,
            default=True,
            help="When enabled, use SpecAugment for training dataset.",
        )
        group.add_argument(
            "--spec-aug-time-warp-factor",
            type=int,
            default=80,
            help=(
                "Used only when --enable-spec-aug is True. "
                "It specifies the factor for time warping in SpecAugment. "
                "Larger values mean more warping. "
                "A value less than 1 means to disable time warp."
            ),
        )
    def train_dataloaders(
        self,
        cuts_train: CutSet,
        sampler_state_dict: Optional[Dict[str, Any]] = None,
    ) -> DataLoader:
        """
        Build the training dataloader.
        Args:
          cuts_train:
            CutSet for training.
          sampler_state_dict:
            The state dict for the training sampler.
        Returns:
          A DataLoader over ``cuts_train`` whose batches are formed by a
          DynamicBucketingSampler.
        """
        logging.info("About to get Musan cuts")
        transforms = []
        if self.args.enable_musan:
            logging.info("Enable MUSAN")
            cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
            transforms.append(
                CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
            )
        else:
            logging.info("Disable MUSAN")
        if self.args.concatenate_cuts:
            logging.info(
                "Using cut concatenation with duration factor "
                f"{self.args.duration_factor} and gap {self.args.gap}."
            )
            # Cut concatenation should be the first transform in the list,
            # so that if we e.g. mix noise in, it will fill the gaps between
            # different utterances.
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms
        input_transforms = []
        if self.args.enable_spec_aug:
            logging.info("Enable SpecAugment")
            logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
            input_transforms.append(
                SpecAugment(
                    time_warp_factor=self.args.spec_aug_time_warp_factor,
                    num_frame_masks=2,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100,
                )
            )
        else:
            logging.info("Disable SpecAugment")
        logging.info("About to create train dataset")
        if self.args.on_the_fly_feats:
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
                input_transforms=input_transforms,
            )
        else:
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_transforms=input_transforms,
            )
        logging.info("Using DynamicBucketingSampler.")
        # NOTE(review): shuffle is hard-coded to False, so --shuffle has no effect
        # here -- confirm this is intended.
        train_sampler = DynamicBucketingSampler(
            cuts_train,
            max_duration=self.args.max_duration,
            max_cuts=self.args.max_cuts,
            shuffle=False,
            num_buckets=self.args.num_buckets,
            drop_last=True,
        )
        logging.info("About to create train dataloader")
        if sampler_state_dict is not None:
            logging.info("Loading sampler state dict")
            train_sampler.load_state_dict(sampler_state_dict)
        # 'seed' is derived from the current random state, which will have
        # previously been set in the main process.
        seed = torch.randint(0, 100000, ()).item()
        worker_init_fn = _SeedWorkers(seed)
        # batch_size=None: batches are produced by the sampler, not by the DataLoader.
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
            worker_init_fn=worker_init_fn,
        )
        return train_dl
    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
        """
        Build the validation dataloader for ``cuts_valid``.
        No MUSAN mixing or SpecAugment is applied; cut concatenation is used
        only when ``--concatenate-cuts`` is enabled.
        """
        transforms = []
        if self.args.concatenate_cuts:
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms
        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
            )
        else:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
            )
        valid_sampler = DynamicBucketingSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        logging.info("About to create dev dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )
        return valid_dl
    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        """
        Build a test dataloader for ``cuts``.
        The dataset is created with ``return_cuts=True``; features are computed
        on the fly when ``--on-the-fly-feats`` is set and read from precomputed
        storage otherwise.
        """
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
            if self.args.on_the_fly_feats
            else PrecomputedFeatures(),
            return_cuts=True,
        )
        sampler = DynamicBucketingSampler(
            cuts, max_duration=self.args.max_duration, shuffle=False
        )
        logging.debug("About to create test dataloader")
        test_dl = DataLoader(
            test,
            batch_size=None,
            sampler=sampler,
            num_workers=self.args.num_workers,
        )
        return test_dl
    def remove_short_cuts(self, cut: Cut) -> bool:
        """
        Return True if ``cut`` is long enough (>= 0.09 s) to be kept.
        See: https://github.com/k2-fsa/icefall/issues/500
        Basically, the zipformer model subsamples the input using the following formula:
        num_out_frames = ((num_in_frames - 7)//2 + 1)//2
        For num_out_frames to be at least 1, num_in_frames must be at least 9.
        """
        # NOTE(review): 0.09 s corresponds to 9 frames presumably assuming a
        # 10 ms frame shift -- confirm against the feature configuration.
        return cut.duration >= 0.09
    @lru_cache()
    def train_cuts(self, sp: Optional[Any] = None) -> CutSet:
        """
        Lazily load ``cuts_train_all.jsonl.gz`` and filter out unusable cuts.
        A cut is dropped when it is shorter than 0.1 s, longer than 25 s, or
        has fewer frames after subsampling than characters in the text of its
        first supervision. ``sp`` is accepted but not used.
        """
        logging.info("About to get AliMeeting train cuts")
        def _remove_short_and_long_utt(c: Cut):
            if c.duration < 0.1 or c.duration > 25.0:
                return False
            # In pruned RNN-T, we require that T >= S
            # where T is the number of feature frames after subsampling
            # and S is the number of tokens in the utterance
            # In ./zipformer.py, the conv module uses the following expression
            # for subsampling
            T = ((c.num_frames - 7) // 2 + 1) // 2
            tokens = c.supervisions[0].text
            return T >= len(tokens)
        cuts_train = load_manifest_lazy(
            self.args.manifest_dir / "cuts_train_all.jsonl.gz"
        )
        return cuts_train.filter(_remove_short_and_long_utt)
    @lru_cache()
    def eval_ihm_cuts(self) -> CutSet:
        """Lazily load the IHM eval cuts, keeping only cuts of at least 0.09 s."""
        logging.info("About to get AliMeeting IHM eval cuts")
        cs = load_manifest_lazy(self.args.manifest_dir / "cuts_eval_ihm.jsonl.gz")
        return cs.filter(self.remove_short_cuts)
    @lru_cache()
    def eval_sdm_cuts(self) -> CutSet:
        """Lazily load the SDM eval cuts, keeping only cuts of at least 0.09 s."""
        logging.info("About to get AliMeeting SDM eval cuts")
        cs = load_manifest_lazy(self.args.manifest_dir / "cuts_eval_sdm.jsonl.gz")
        return cs.filter(self.remove_short_cuts)
    @lru_cache()
    def eval_gss_cuts(self) -> Optional[CutSet]:
        """
        Lazily load the GSS-enhanced eval cuts, keeping only cuts of at least
        0.09 s. Returns ``None`` when ``cuts_eval_gss.jsonl.gz`` does not exist.
        """
        if not (self.args.manifest_dir / "cuts_eval_gss.jsonl.gz").exists():
            logging.info("No GSS dev cuts found")
            return None
        logging.info("About to get AliMeeting GSS-enhanced eval cuts")
        cs = load_manifest_lazy(self.args.manifest_dir / "cuts_eval_gss.jsonl.gz")
        return cs.filter(self.remove_short_cuts)
    @lru_cache()
    def test_ihm_cuts(self) -> CutSet:
        """Lazily load the IHM test cuts, keeping only cuts of at least 0.09 s."""
        logging.info("About to get AliMeeting IHM test cuts")
        cs = load_manifest_lazy(self.args.manifest_dir / "cuts_test_ihm.jsonl.gz")
        return cs.filter(self.remove_short_cuts)
    @lru_cache()
    def test_sdm_cuts(self) -> CutSet:
        """Lazily load the SDM test cuts, keeping only cuts of at least 0.09 s."""
        logging.info("About to get AliMeeting SDM test cuts")
        cs = load_manifest_lazy(self.args.manifest_dir / "cuts_test_sdm.jsonl.gz")
        return cs.filter(self.remove_short_cuts)
    @lru_cache()
    def test_gss_cuts(self) -> Optional[CutSet]:
        """
        Lazily load the GSS-enhanced test cuts, keeping only cuts of at least
        0.09 s. Returns ``None`` when ``cuts_test_gss.jsonl.gz`` does not exist.
        """
        if not (self.args.manifest_dir / "cuts_test_gss.jsonl.gz").exists():
            logging.info("No GSS test cuts found")
            return None
        logging.info("About to get AliMeeting GSS-enhanced test cuts")
        cs = load_manifest_lazy(self.args.manifest_dir / "cuts_test_gss.jsonl.gz")
        return cs.filter(self.remove_short_cuts)
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/beam_search.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/beam_search.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/beam_search.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/decode.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/decode.py
@ -0,0 +1,698 @@
 #!/usr/bin/env python3
 #
 # Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 Usage:
 (1) greedy search
 ./pruned_transducer_stateless7/decode.py \
        --epoch 15 \
        --avg 8 \
        --exp-dir ./pruned_transducer_stateless7/exp \
        --max-duration 500 \
        --decoding-method greedy_search
 (2) modified beam search
 ./pruned_transducer_stateless7/decode.py \
        --epoch 15 \
        --avg 8 \
        --exp-dir ./pruned_transducer_stateless7/exp \
        --max-duration 500 \
        --decoding-method modified_beam_search \
        --beam-size 4
 (3) fast beam search
 ./pruned_transducer_stateless7/decode.py \
        --epoch 15 \
        --avg 8 \
        --exp-dir ./pruned_transducer_stateless7/exp \
        --max-duration 500 \
        --decoding-method fast_beam_search \
        --beam 4 \
        --max-contexts 4 \
        --max-states 8
 """
 import argparse
 import logging
 from collections import defaultdict
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 import k2
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import AlimeetingAsrDataModule
 from beam_search import (
    beam_search,
    fast_beam_search_nbest_LG,
    fast_beam_search_one_best,
    greedy_search,
    greedy_search_batch,
    modified_beam_search,
 )
 from train import add_model_arguments, get_params, get_transducer_model
 from icefall import NgramLm
 from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
 )
 from icefall.lexicon import Lexicon
 from icefall.utils import (
    AttributeDict,
    setup_logger,
    store_transcripts,
    str2bool,
    write_error_stats,
 )
 def get_parser():
    """Build the command-line parser for the decoding script.

    Registers checkpoint selection (--epoch, --iter, --avg,
    --use-averaged-model), paths (--exp-dir, --lang-dir), the decoding method
    and its search options, and finally the model options contributed by
    ``add_model_arguments``.

    Returns:
      The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--epoch",
        type=int,
        default=30,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )
    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=10,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch' and '--iter'",
    )
    parser.add_argument(
        "--use-averaged-model",
        type=str2bool,
        default=True,
        help="Whether to load averaged model. Currently it only supports "
        "using --epoch. If True, it would decode with the averaged model "
        "over the epoch range from `epoch-avg` (excluded) to `epoch`. "
        "Actually only the models with epoch number of `epoch-avg` and "
        "`epoch` are loaded for averaging. ",
    )
    # The default must point at this recipe's own experiment directory (it
    # previously named the pruned_transducer_stateless2 recipe).
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="pruned_transducer_stateless7/exp",
        help="The experiment dir",
    )
    parser.add_argument(
        "--lang-dir",
        type=str,
        default="data/lang_char",
        help="""The lang dir
        It contains language related input files such as
        "lexicon.txt"
        """,
    )
    # Only the methods accepted by main() are advertised here.
    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="""Possible values are:
          - greedy_search
          - beam_search
          - modified_beam_search
          - fast_beam_search
          - fast_beam_search_nbest_LG
        If you use fast_beam_search_nbest_LG, you have to specify
        `--lang-dir`, which should contain `LG.pt`.
        """,
    )
    parser.add_argument(
        "--beam-size",
        type=int,
        default=4,
        help="""An integer indicating how many candidates we will keep for each
        frame. Used only when --decoding-method is beam_search or
        modified_beam_search.""",
    )
    parser.add_argument(
        "--beam",
        type=float,
        default=4,
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
        Used only when --decoding-method is fast_beam_search or
        fast_beam_search_nbest_LG""",
    )
    parser.add_argument(
        "--ngram-lm-scale",
        type=float,
        default=0.01,
        help="""
        Used only when --decoding-method is fast_beam_search_nbest_LG.
        It specifies the scale for n-gram LM scores.
        """,
    )
    parser.add_argument(
        "--max-contexts",
        type=int,
        default=8,
        help="""Used only when --decoding-method is
        fast_beam_search or fast_beam_search_nbest_LG""",
    )
    parser.add_argument(
        "--max-states",
        type=int,
        default=64,
        help="""Used only when --decoding-method is
        fast_beam_search or fast_beam_search_nbest_LG""",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )
    parser.add_argument(
        "--max-sym-per-frame",
        type=int,
        default=1,
        help="""Maximum number of symbols per frame.
        Used only when --decoding-method is greedy_search""",
    )
    parser.add_argument(
        "--num-paths",
        type=int,
        default=200,
        help="""Number of paths for nbest decoding.
        Used only when --decoding-method is fast_beam_search_nbest_LG""",
    )
    parser.add_argument(
        "--nbest-scale",
        type=float,
        default=0.5,
        help="""Scale applied to lattice scores when computing nbest paths.
        Used only when --decoding-method is fast_beam_search_nbest_LG""",
    )
    add_model_arguments(parser)
    return parser
 def decode_one_batch(
    params: AttributeDict,
    model: nn.Module,
    lexicon: Lexicon,
    batch: dict,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[List[str]]]:
    """Run the encoder and the selected search on one batch.

    Args:
      params:
        Decoding configuration. ``decoding_method`` selects the search; the
        matching options (``beam``, ``max_contexts``, ``max_states``,
        ``num_paths``, ``nbest_scale``, ``beam_size``, ``max_sym_per_frame``,
        ``ngram_lm_scale``) are read as needed.
      model:
        The neural model. Its ``device`` attribute and ``encoder`` are used here.
      lexicon:
        Its ``token_table`` converts every hypothesised ID to an output symbol.
      batch:
        A dict with a 3-D feature tensor under "inputs" and, under
        "supervisions", a "num_frames" tensor.
      decoding_graph:
        Passed to the fast beam search variants; unused by the other methods.
    Returns:
      A single-entry dict. The key names the decoding setting
      ("greedy_search", "beam_<beam>_max_contexts_..._max_states_...[...]" or
      "beam_size_<beam_size>"); the value holds one symbol list per utterance,
      in batch order.
    Raises:
      ValueError: If ``params.decoding_method`` is not handled here.
    """

    def ids_to_symbols(ids):
        # Looked up lazily so the table is only touched once IDs exist.
        return [lexicon.token_table[idx] for idx in ids]

    device = model.device
    feature = batch["inputs"]
    assert feature.ndim == 3
    # at entry, feature is (N, T, C)
    feature = feature.to(device)
    feature_lens = batch["supervisions"]["num_frames"].to(device)
    encoder_out, encoder_out_lens = model.encoder(x=feature, x_lens=feature_lens)

    method = params.decoding_method
    batch_size = encoder_out.size(0)

    # Methods with a batched implementation decode all utterances at once.
    batched_ids = None
    decoded_in_batch = True
    if method == "fast_beam_search":
        batched_ids = fast_beam_search_one_best(
            model=model,
            decoding_graph=decoding_graph,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            beam=params.beam,
            max_contexts=params.max_contexts,
            max_states=params.max_states,
        )
    elif method == "fast_beam_search_nbest_LG":
        # NOTE(review): the IDs are mapped through token_table below, which
        # presumably only fits a graph whose output labels are token IDs --
        # confirm against the graph built by the caller.
        batched_ids = fast_beam_search_nbest_LG(
            model=model,
            decoding_graph=decoding_graph,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            beam=params.beam,
            max_contexts=params.max_contexts,
            max_states=params.max_states,
            num_paths=params.num_paths,
            nbest_scale=params.nbest_scale,
        )
    elif method == "greedy_search" and params.max_sym_per_frame == 1:
        batched_ids = greedy_search_batch(
            model=model,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
        )
    elif method == "modified_beam_search":
        batched_ids = modified_beam_search(
            model=model,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            beam=params.beam_size,
        )
    else:
        decoded_in_batch = False

    if decoded_in_batch:
        hyps = [ids_to_symbols(batched_ids[i]) for i in range(batch_size)]
    else:
        # Remaining methods decode one utterance at a time on its valid frames.
        hyps = []
        for i in range(batch_size):
            # fmt: off
            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
            # fmt: on
            if method == "greedy_search":
                hyp = greedy_search(
                    model=model,
                    encoder_out=encoder_out_i,
                    max_sym_per_frame=params.max_sym_per_frame,
                )
            elif method == "beam_search":
                hyp = beam_search(
                    model=model,
                    encoder_out=encoder_out_i,
                    beam=params.beam_size,
                )
            else:
                raise ValueError(
                    f"Unsupported decoding method: {params.decoding_method}"
                )
            hyps.append(ids_to_symbols(hyp))

    if method == "greedy_search":
        return {"greedy_search": hyps}

    if "fast_beam_search" in method:
        key = (
            f"beam_{params.beam}_"
            f"max_contexts_{params.max_contexts}_"
            f"max_states_{params.max_states}"
        )
        if "nbest" in method:
            key += f"_num_paths_{params.num_paths}_"
            key += f"nbest_scale_{params.nbest_scale}"
            if "LG" in method:
                key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
        return {key: hyps}

    return {f"beam_size_{params.beam_size}": hyps}
 def decode_dataset(
    dl: torch.utils.data.DataLoader,
    params: AttributeDict,
    model: nn.Module,
    lexicon: Lexicon,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
    """Decode every batch of a dataloader and collect the results.

    Args:
      dl:
        PyTorch's dataloader containing the dataset to decode.
      params:
        It is returned by :func:`get_params`.
      model:
        The neural model.
      lexicon:
        Forwarded to :func:`decode_one_batch`.
      decoding_graph:
        Forwarded to :func:`decode_one_batch`.
    Returns:
      Return a dict keyed by the setting names produced by
      :func:`decode_one_batch`. Each value is a list of 3-tuples
      ``(cut_id, reference, hypothesis)``. The reference is the supervision
      text with spaces removed and split into single characters; the
      hypothesis is the list of decoded symbols.
    """
    num_cuts = 0
    try:
        num_batches = len(dl)
    except TypeError:
        # The dataloader has no usable length; only affects the log line.
        num_batches = "?"

    log_interval = 100 if params.decoding_method == "greedy_search" else 2

    results = defaultdict(list)
    for batch_idx, batch in enumerate(dl):
        supervisions = batch["supervisions"]
        texts = [list(str(text).replace(" ", "")) for text in supervisions["text"]]
        cut_ids = [cut.id for cut in supervisions["cut"]]

        hyps_dict = decode_one_batch(
            params=params,
            model=model,
            lexicon=lexicon,
            decoding_graph=decoding_graph,
            batch=batch,
        )

        for name, hyps in hyps_dict.items():
            assert len(hyps) == len(texts)
            # Stored as (cut_id, ref, hyp), the layout save_results() expects.
            results[name].extend(zip(cut_ids, texts, hyps))

        num_cuts += len(texts)
        if batch_idx % log_interval == 0:
            batch_str = f"{batch_idx}/{num_batches}"
            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
    return results
 def save_results(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
 ):
    """Write transcripts, error statistics and a WER summary for one test set.

    For every setting in ``results_dict`` a ``recogs-*`` and an ``errs-*`` file
    is written under ``params.res_dir``. One ``wer-summary-*`` file listing the
    settings sorted by the value ``write_error_stats`` returned is written
    afterwards, and the same table is logged.

    Args:
      params:
        Provides ``res_dir`` (output directory) and ``suffix`` (file name tail).
      test_set_name:
        Name of the test set; embedded in file names and log messages.
      results_dict:
        Maps a setting name to its ``(cut_id, reference, hypothesis)`` tuples.
        If it is empty, nothing is written.
    """
    if not results_dict:
        # Without any setting there is no key to build the summary name from.
        logging.warning(f"No decoding results to save for {test_set_name}")
        return

    test_set_wers = dict()
    last_key = None
    for key, results in results_dict.items():
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        results = sorted(results)
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
        # ref/hyp pairs.
        errs_filename = (
            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
                f, f"{test_set_name}-{key}", results, enable_log=True
            )
            test_set_wers[key] = wer
        logging.info(f"Wrote detailed error stats to {errs_filename}")
        last_key = key

    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])

    # The summary file name has always carried the last setting's key; that is
    # kept so existing output paths do not change.
    errs_info = (
        params.res_dir / f"wer-summary-{test_set_name}-{last_key}-{params.suffix}.txt"
    )
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
        for key, val in test_set_wers:
            print(f"{key}\t{val}", file=f)

    s = f"\nFor {test_set_name}, WER of different settings are:\n"
    # Only the first (lowest) entry is tagged as the best one.
    note = f"\tbest for {test_set_name}"
    for key, val in test_set_wers:
        s += f"{key}\t{val}{note}\n"
        note = ""
    logging.info(s)
@torch.no_grad()
 def main():
    parser = get_parser()
    AlimeetingAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)
    params = get_params()
    params.update(vars(args))
    assert params.decoding_method in (
        "greedy_search",
        "beam_search",
        "fast_beam_search",
        "fast_beam_search_nbest_LG",
        "modified_beam_search",
    )
    params.res_dir = params.exp_dir / params.decoding_method
    if params.iter > 0:
        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
    else:
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
    if "fast_beam_search" in params.decoding_method:
        params.suffix += f"-beam-{params.beam}"
        params.suffix += f"-max-contexts-{params.max_contexts}"
        params.suffix += f"-max-states-{params.max_states}"
        if "nbest" in params.decoding_method:
            params.suffix += f"-nbest-scale-{params.nbest_scale}"
            params.suffix += f"-num-paths-{params.num_paths}"
            if "LG" in params.decoding_method:
                params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
    elif "beam_search" in params.decoding_method:
        params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
    else:
        params.suffix += f"-context-{params.context_size}"
        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    logging.info(f"Device: {device}")
    lexicon = Lexicon(params.lang_dir)
    params.blank_id = lexicon.token_table["<blk>"]
    params.vocab_size = max(lexicon.tokens) + 1
    logging.info(params)
    logging.info("About to create model")
    model = get_transducer_model(params)
    if not params.use_averaged_model:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                if i >= 1:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg + 1
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
    model.to(device)
    model.eval()
    model.device = device
    if "fast_beam_search" in params.decoding_method:
        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
    else:
        decoding_graph = None
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
    alimeeting = AlimeetingAsrDataModule(args)
    eval_ihm_cuts = alimeeting.eval_ihm_cuts()
    test_ihm_cuts = alimeeting.test_ihm_cuts()
    eval_sdm_cuts = alimeeting.eval_sdm_cuts()
    test_sdm_cuts = alimeeting.test_sdm_cuts()
    eval_gss_cuts = alimeeting.eval_gss_cuts()
    test_gss_cuts = alimeeting.test_gss_cuts()
    eval_ihm_dl = alimeeting.test_dataloaders(eval_ihm_cuts)
    test_ihm_dl = alimeeting.test_dataloaders(test_ihm_cuts)
    eval_sdm_dl = alimeeting.test_dataloaders(eval_sdm_cuts)
    test_sdm_dl = alimeeting.test_dataloaders(test_sdm_cuts)
    if eval_gss_cuts is not None:
        eval_gss_dl = alimeeting.test_dataloaders(eval_gss_cuts)
    if test_gss_cuts is not None:
        test_gss_dl = alimeeting.test_dataloaders(test_gss_cuts)
    test_sets = {
        "eval_ihm": (eval_ihm_dl, eval_ihm_cuts),
        "test_ihm": (test_ihm_dl, test_ihm_cuts),
        "eval_sdm": (eval_sdm_dl, eval_sdm_cuts),
        "test_sdm": (test_sdm_dl, test_sdm_cuts),
    }
    if eval_gss_cuts is not None:
        test_sets["eval_gss"] = (eval_gss_dl, eval_gss_cuts)
    if test_gss_cuts is not None:
        test_sets["test_gss"] = (test_gss_dl, test_gss_cuts)
    for test_set in test_sets:
        logging.info(f"Decoding {test_set}")
        dl, cuts = test_sets[test_set]
        results_dict = decode_dataset(
            dl=dl,
            params=params,
            model=model,
            lexicon=lexicon,
            decoding_graph=decoding_graph,
        )
        save_results(
            params=params,
            test_set_name=test_set,
            results_dict=results_dict,
        )
    logging.info("Done!")
 if __name__ == "__main__":
    main()
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/decoder.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/decoder.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/decoder.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/encoder_interface.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/encoder_interface.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/encoder_interface.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/export.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/export.py
@ -0,0 +1,320 @@
 #!/usr/bin/env python3
 #
 # Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This script converts several saved checkpoints
 # to a single one using model averaging.
 """
 Usage:
 (1) Export to torchscript model using torch.jit.script()
 ./pruned_transducer_stateless7/export.py \
  --exp-dir ./pruned_transducer_stateless7/exp \
  --lang-dir data/lang_char \
  --epoch 30 \
  --avg 9 \
  --jit 1
 It will generate a file `cpu_jit.pt` in the given `exp_dir`. You can later
 load it by `torch.jit.load("cpu_jit.pt")`.
 Note `cpu` in the name `cpu_jit.pt` means the parameters when loaded into Python
 are on CPU. You can use `to("cuda")` to move them to a CUDA device.
 Check
 https://github.com/k2-fsa/sherpa
 for how to use the exported models outside of icefall.
 (2) Export `model.state_dict()`
 ./pruned_transducer_stateless7/export.py \
  --exp-dir ./pruned_transducer_stateless7/exp \
  --lang-dir data/lang_char \
  --epoch 20 \
  --avg 10
 It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
 load it by `icefall.checkpoint.load_checkpoint()`.
 To use the generated file with `pruned_transducer_stateless7/decode.py`,
 you can do:
    cd /path/to/exp_dir
    ln -s pretrained.pt epoch-9999.pt
    cd /path/to/egs/alimeeting/ASR_v2
    ./pruned_transducer_stateless7/decode.py \
        --exp-dir ./pruned_transducer_stateless7/exp \
        --epoch 9999 \
        --avg 1 \
        --use-averaged-model 0 \
        --max-duration 600 \
        --decoding-method greedy_search \
        --lang-dir data/lang_char
 Check ./pretrained.py for its usage.
 Note: If you don't want to train a model from scratch, we have
 provided one for you. You can get it at
 https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
 with the following commands:
    sudo apt-get install git-lfs
    git lfs install
    git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
    # You will find the pre-trained model in icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11/exp
 """
 import argparse
 import logging
 from pathlib import Path
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 from scaling_converter import convert_scaled_to_non_scaled
 from train import add_model_arguments, get_params, get_transducer_model
 from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
 )
 from icefall.lexicon import Lexicon
 from icefall.utils import str2bool
 def get_parser():
    """Build the command-line parser for the export script.

    Registers checkpoint selection (--epoch, --iter, --avg,
    --use-averaged-model), paths (--exp-dir, --lang-dir), the --jit switch,
    --context-size, and finally the model options contributed by
    ``add_model_arguments``.

    Returns:
      The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    add = parser.add_argument

    # Which checkpoint(s) to export.
    add(
        "--epoch",
        default=15,
        type=int,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )
    add(
        "--iter",
        default=0,
        type=int,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )
    add(
        "--avg",
        default=8,
        type=int,
        help=(
            "Number of checkpoints to average. Automatically select "
            "consecutive checkpoints before the checkpoint specified by "
            "'--epoch' and '--iter'"
        ),
    )
    add(
        "--use-averaged-model",
        default=True,
        type=str2bool,
        help=(
            "Whether to load averaged model. Currently it only supports "
            "using --epoch. If True, it would decode with the averaged model "
            "over the epoch range from `epoch-avg` (excluded) to `epoch`."
            "Actually only the models with epoch number of `epoch-avg` and "
            "`epoch` are loaded for averaging. "
        ),
    )

    # Where to read checkpoints / language files and what to write.
    add(
        "--exp-dir",
        default="pruned_transducer_stateless7/exp",
        type=str,
        help="""It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )
    add("--lang-dir", default="data/lang_char", type=str, help="The lang dir")
    add(
        "--jit",
        default=False,
        type=str2bool,
        help="""True to save a model after applying torch.jit.script.
        It will generate a file named cpu_jit.pt
        Check ./jit_pretrained.py for how to use it.
        """,
    )
    add(
        "--context-size",
        default=2,
        type=int,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )

    add_model_arguments(parser)
    return parser
@torch.no_grad()
 def main():
    args = get_parser().parse_args()
    args.exp_dir = Path(args.exp_dir)
    params = get_params()
    params.update(vars(args))
    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    logging.info(f"device: {device}")
    lexicon = Lexicon(params.lang_dir)
    params.blank_id = 0
    params.vocab_size = max(lexicon.tokens) + 1
    logging.info(params)
    logging.info("About to create model")
    model = get_transducer_model(params)
    model.to(device)
    if not params.use_averaged_model:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                if i >= 1:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg + 1
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
    model.to("cpu")
    model.eval()
    if params.jit is True:
        convert_scaled_to_non_scaled(model, inplace=True)
        logging.info("Using torch.jit.script()")
        # We won't use the forward() method of the model in C++, so just ignore
        # it here.
        # Otherwise, one of its arguments is a ragged tensor and is not
        # torch scriptabe.
        model.__class__.forward = torch.jit.ignore(model.__class__.forward)
        logging.info("Using torch.jit.script")
        model = torch.jit.script(model)
        filename = params.exp_dir / "cpu_jit.pt"
        model.save(str(filename))
        logging.info(f"Saved to {filename}")
    else:
        logging.info("Not using torchscript. Export model.state_dict()")
        # Save it using a format so that it can be loaded
        # by :func:`load_checkpoint`
        filename = params.exp_dir / "pretrained.pt"
        torch.save({"model": model.state_dict()}, str(filename))
        logging.info(f"Saved to {filename}")
 if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/jit_pretrained.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/jit_pretrained.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/jit_pretrained.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/joiner.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/joiner.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/joiner.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/model.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/model.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/model.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/optim.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/optim.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/optim.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/pretrained.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/pretrained.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/pretrained.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/scaling.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/scaling.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/scaling.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/scaling_converter.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/scaling_converter.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/scaling_converter.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/test_model.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/test_model.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/test_model.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/train.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/train.py
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/zipformer.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/zipformer.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless7/zipformer.py
--- a/egs/alimeeting/ASR_v2/shared
+++ b/egs/alimeeting/ASR_v2/shared
@ -0,0 +1 @@
 ../../../egs/aishell/ASR/shared
--- a/egs/gigaspeech/ASR/.gitignore
+++ b/egs/gigaspeech/ASR/.gitignore
@ -1 +1,2 @@
 log-*
 .DS_Store
--- a/egs/librispeech/ASR/.gitignore
+++ b/egs/librispeech/ASR/.gitignore
@ -1 +1,2 @@
 log-*
 .DS_Store
--- a/egs/librispeech/ASR/README.md
+++ b/egs/librispeech/ASR/README.md
@ -19,18 +19,36 @@ The following table lists the differences among them.
 | `pruned_transducer_stateless`         | Conformer           | Embedding + Conv1d | Using k2 pruned RNN-T loss                        |
 | `pruned_transducer_stateless2`        | Conformer(modified) | Embedding + Conv1d | Using k2 pruned RNN-T loss                        |
 | `pruned_transducer_stateless3`        | Conformer(modified) | Embedding + Conv1d | Using k2 pruned RNN-T loss + using GigaSpeech as extra training data |
-| `pruned_transducer_stateless4`        | Conformer(modified) | Embedding + Conv1d | same as pruned_transducer_stateless2 + save averaged models periodically during training                        |
+| `pruned_transducer_stateless4`        | Conformer(modified) | Embedding + Conv1d | same as pruned_transducer_stateless2 + save averaged models periodically during training + delay penalty |
 | `pruned_transducer_stateless5`        | Conformer(modified) | Embedding + Conv1d | same as pruned_transducer_stateless4 + more layers + random combiner|
 | `pruned_transducer_stateless6`        | Conformer(modified) | Embedding + Conv1d | same as pruned_transducer_stateless4 + distillation with hubert|
 | `pruned_transducer_stateless7`        | Zipformer | Embedding + Conv1d | First experiment with Zipformer from Dan|
 | `pruned_transducer_stateless7_ctc`    | Zipformer | Embedding + Conv1d | Same as pruned_transducer_stateless7, but with extra CTC head|
 | `pruned_transducer_stateless7_ctc_bs` | Zipformer | Embedding + Conv1d | pruned_transducer_stateless7_ctc + blank skip |
 | `pruned_transducer_stateless7_streaming` | Streaming Zipformer | Embedding + Conv1d | streaming version of pruned_transducer_stateless7 |
 | `pruned_transducer_stateless8`        | Zipformer | Embedding + Conv1d | Same as pruned_transducer_stateless7, but using extra data from GigaSpeech|
 | `pruned_stateless_emformer_rnnt2`     | Emformer(from torchaudio) | Embedding + Conv1d | Using Emformer from torchaudio for streaming ASR|
 | `conv_emformer_transducer_stateless`  | ConvEmformer | Embedding + Conv1d | Using ConvEmformer for streaming ASR + mechanisms in reworked model |
 | `conv_emformer_transducer_stateless2` | ConvEmformer | Embedding + Conv1d | Using ConvEmformer with simplified memory for streaming ASR + mechanisms in reworked model |
 | `lstm_transducer_stateless`           | LSTM | Embedding + Conv1d | Using LSTM with mechanisms in reworked model |
-| `lstm_transducer_stateless2`           | LSTM | Embedding + Conv1d | Using LSTM with mechanisms in reworked model + gigaspeech (multi-dataset setup) |
+| `lstm_transducer_stateless2`          | LSTM | Embedding + Conv1d | Using LSTM with mechanisms in reworked model + gigaspeech (multi-dataset setup) |
 | `lstm_transducer_stateless3`          | LSTM | Embedding + Conv1d | Using LSTM with mechanisms in reworked model + gradient filter + delay penalty |
 The decoder in `transducer_stateless` is modified from the paper
 [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
 We place an additional Conv1d layer right after the input embedding layer.
 # CTC
 |                              | Encoder            | Comment                      |
 |------------------------------|--------------------|------------------------------|
 | `conformer-ctc`              | Conformer          | Use auxiliary attention head |
 | `conformer-ctc2`             | Reworked Conformer | Use auxiliary attention head |
 | `conformer-ctc3`             | Reworked Conformer | Streaming version + delay penalty |
 # MMI
 |                              | Encoder   | Comment                                           |
 |------------------------------|-----------|---------------------------------------------------|
 | `conformer-mmi`              | Conformer |                                                   |
 | `zipformer-mmi`              | Zipformer | CTC warmup + use HP as decoding graph for decoding |
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@ -1,5 +1,140 @@
 ## Results
 ### Streaming Zipformer-Transducer (Pruned Stateless Transducer + Streaming Zipformer)
 #### [pruned_transducer_stateless7_streaming](./pruned_transducer_stateless7_streaming)
 See <https://github.com/k2-fsa/icefall/pull/787> for more details.
 You can find a pretrained model, training logs, decoding logs, and decoding
 results at:
 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>
 Number of model parameters: 70369391, i.e., 70.37 M
 ##### training on full librispeech
 The WERs are:
 | decoding method      | chunk size | test-clean | test-other | comment             | decoding mode        |
 |----------------------|------------|------------|------------|---------------------|----------------------|
 | greedy search        | 320ms      | 3.15       | 8.09       | --epoch 30 --avg 9  | simulated streaming  |
 | greedy search        | 320ms      | 3.17       | 8.24       | --epoch 30 --avg 9  | chunk-wise           |
 | fast beam search     | 320ms      | 3.2        | 8.04       | --epoch 30 --avg 9  | simulated streaming  |
 | fast beam search     | 320ms      | 3.36       | 8.19       | --epoch 30 --avg 9  | chunk-wise           |
 | modified beam search | 320ms      | 3.11       | 7.93       | --epoch 30 --avg 9  | simulated streaming  |
 | modified beam search | 320ms      | 3.12       | 8.11       | --epoch 30 --avg 9  | chunk-wise           |
 | greedy search        | 640ms      | 2.97       | 7.5        | --epoch 30 --avg 9  | simulated streaming  |
 | greedy search        | 640ms      | 2.98       | 7.67       | --epoch 30 --avg 9  | chunk-wise           |
 | fast beam search     | 640ms      | 3.02       | 7.47       | --epoch 30 --avg 9  | simulated streaming  |
 | fast beam search     | 640ms      | 2.96       | 7.61       | --epoch 30 --avg 9  | chunk-wise           |
 | modified beam search | 640ms      | 2.94       | 7.36       | --epoch 30 --avg 9  | simulated streaming  |
 | modified beam search | 640ms      | 2.95       | 7.53       | --epoch 30 --avg 9  | chunk-wise           |
 Note: `simulated streaming` indicates feeding the full utterance during decoding using `decode.py`,
 while `chunk-wise` indicates feeding a certain number of frames at a time using `streaming_decode.py`.
 The training command is:
 ```bash
 ./pruned_transducer_stateless7_streaming/train.py \
  --world-size 4 \
  --num-epochs 30 \
  --start-epoch 1 \
  --use-fp16 1 \
  --exp-dir pruned_transducer_stateless7_streaming/exp \
  --full-libri 1 \
  --max-duration 750 \
  --master-port 12345
 ```
 The tensorboard log can be found at
 <https://tensorboard.dev/experiment/A46UpqEWQWS7oDi5VcQ8rg/>
 The simulated streaming decoding command (e.g., chunk-size=320ms) is:
 ```bash
 for m in greedy_search fast_beam_search modified_beam_search; do
  ./pruned_transducer_stateless7_streaming/decode.py \
    --epoch 30 \
    --avg 9 \
    --exp-dir ./pruned_transducer_stateless7_streaming/exp \
    --max-duration 600 \
    --decode-chunk-len 32 \
    --decoding-method $m
 done
 ```
 The chunk-wise streaming decoding command (e.g., chunk-size=320ms) is:
 ```bash
 for m in greedy_search modified_beam_search fast_beam_search; do
  ./pruned_transducer_stateless7_streaming/streaming_decode.py \
    --epoch 30 \
    --avg 9 \
    --exp-dir ./pruned_transducer_stateless7_streaming/exp \
    --decoding-method $m \
    --decode-chunk-len 32 \
    --num-decode-streams 2000
 done
 ```
 ### zipformer_mmi (zipformer with mmi loss)
 See <https://github.com/k2-fsa/icefall/pull/746> for more details.
 [zipformer_mmi](./zipformer_mmi)
 The tensorboard log can be found at
 <https://tensorboard.dev/experiment/xyOZUKpEQm62HBIlUD4uPA/>
 You can find a pretrained model, training logs, decoding logs, and decoding
 results at:
 <https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-mmi-2022-12-08>
 Number of model parameters: 69136519, i.e., 69.14 M
 |                          | test-clean | test-other  | comment             |
 |--------------------------|------------|-------------|---------------------|
 | 1best                    | 2.54       | 5.65        | --epoch 30 --avg 10 |
 | nbest                    | 2.54       | 5.66        | --epoch 30 --avg 10 |
 | nbest-rescoring-LG       | 2.49       | 5.42        | --epoch 30 --avg 10 |
 | nbest-rescoring-3-gram   | 2.52       | 5.62        | --epoch 30 --avg 10 |
 | nbest-rescoring-4-gram   | 2.5        | 5.51        | --epoch 30 --avg 10 |
 The training commands are:
 ```bash
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 ./zipformer_mmi/train.py \
  --world-size 4 \
  --master-port 12345 \
  --num-epochs 30 \
  --start-epoch 1 \
  --lang-dir data/lang_bpe_500 \
  --max-duration 500 \
  --full-libri 1 \
  --use-fp16 1 \
  --exp-dir zipformer_mmi/exp
 ```
 The decoding commands are:
 ```bash
 export CUDA_VISIBLE_DEVICES="5"
 for m in nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
  ./zipformer_mmi/decode.py \
    --epoch 30 \
    --avg 10 \
    --exp-dir ./zipformer_mmi/exp/ \
    --max-duration 100 \
    --lang-dir data/lang_bpe_500 \
    --nbest-scale 1.2 \
    --hp-scale 1.0 \
    --decoding-method $m
 done
 ```
 ### pruned_transducer_stateless7_ctc (zipformer with transducer loss and ctc loss)
 See <https://github.com/k2-fsa/icefall/pull/683> for more details.
@ -261,9 +396,13 @@ Number of model parameters: 70369391, i.e., 70.37 M
 |                      | test-clean | test-other  | comment                                |
 |----------------------|------------|-------------|----------------------------------------|
-| greedy search        | 2.17       | 5.23        | --epoch 39 --avg 6 --max-duration 600  |
+| greedy search        | 2.17       | 5.23        | --epoch 30 --avg 9 --max-duration 600  |
-| modified beam search | 2.15       | 5.20        | --epoch 39 --avg 6 --max-duration 600  |
+| modified beam search | 2.15       | 5.20        | --epoch 30 --avg 9 --max-duration 600  |
-| fast beam search     | 2.15       | 5.22        | --epoch 39 --avg 6 --max-duration 600  |
+| modified beam search + RNNLM shallow fusion | 1.99       | 4.73        | --epoch 30 --avg 9 --max-duration 600  |
 | modified beam search + TransformerLM shallow fusion | 1.94       | 4.73        | --epoch 30 --avg 9 --max-duration 600  |
 | modified beam search + RNNLM + LODR | 1.91       | 4.57        | --epoch 30 --avg 9 --max-duration 600  |
 | modified beam search + TransformerLM + LODR | 1.91       | 4.51        | --epoch 30 --avg 9 --max-duration 600  |
 | fast beam search     | 2.15       | 5.22        | --epoch 30 --avg 9 --max-duration 600  |
 The training commands are:
 ```bash
@ -401,7 +540,9 @@ The WERs are:
 | greedy search (max sym per frame 1) | 2.78       | 7.36       | --iter 468000 --avg 16  |
 | modified_beam_search                | 2.73       | 7.15       | --iter 468000 --avg 16  |
 | modified_beam_search + RNNLM shallow fusion   | 2.42     |  6.46      | --iter 468000 --avg 16  |
-| modified_beam_search + RNNLM shallow fusion   | 2.28     |  5.94      | --iter 468000 --avg 16  |
+| modified_beam_search + TransformerLM shallow fusion   | 2.37     |  6.48      | --iter 468000 --avg 16  |
 | modified_beam_search + RNNLM + LODR   | 2.24     |  5.89      | --iter 468000 --avg 16  |
 | modified_beam_search + TransformerLM + LODR   | 2.19     |  5.90      | --iter 468000 --avg 16  |
 | fast_beam_search                    | 2.76       | 7.31       | --iter 468000 --avg 16  |
 | greedy search (max sym per frame 1) | 2.77       | 7.35       | --iter 472000 --avg 18  |
 | modified_beam_search                | 2.75       | 7.08       | --iter 472000 --avg 18  |
@ -456,9 +597,12 @@ for m in greedy_search fast_beam_search modified_beam_search; do
 done
 ```
-To decode with RNNLM shallow fusion, use the following decoding command. A well-trained RNNLM
+You may also decode using shallow fusion with external neural network LM. To do so you need to
-can be found here: <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm/tree/main>
+download a well-trained NN LM:
 RNN LM: <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm/tree/main>
 Transformer LM: <https://huggingface.co/marcoyang/icefall-librispeech-transformer-lm/tree/main>
 ```bash
 for iter in 472000; do
    for avg in 8 10 12 14 16 18; do
        ./lstm_transducer_stateless2/decode.py \
@ -466,23 +610,24 @@ for iter in 472000; do
                --avg $avg \
                --exp-dir ./lstm_transducer_stateless2/exp \
                --max-duration 600 \
-                --decoding-method modified_beam_search_rnnlm_shallow_fusion \
+                --decoding-method modified_beam_search_lm_shallow_fusion \
-                --beam 4 \
+                --use-shallow-fusion 1 \
-                --rnn-lm-scale 0.3 \
+                --lm-type rnn \
-                --rnn-lm-exp-dir /path/to/RNNLM \
+                --lm-exp-dir /ceph-data4/yangxiaoyu/pretrained_models/LM/icefall-librispeech-rnn-lm/exp \
-                --rnn-lm-epoch 99 \
+                --lm-epoch 99 \
-                --rnn-lm-avg 1 \
+                --lm-scale $lm_scale \
-                --rnn-lm-num-layers 3 \
+                --lm-avg 1 \
                --rnn-lm-tie-weights 1
    done
 done
 ```
-You may also decode using LODR + RNNLM shallow fusion. This decoding method is proposed in <https://arxiv.org/pdf/2203.16776.pdf>.
+You may also decode using LODR + LM shallow fusion. This decoding method is proposed in <https://arxiv.org/pdf/2203.16776.pdf>.
 It subtracts the internal language model score during shallow fusion, which is approximated by a bi-gram model. The bi-gram can be
 generated by `generate-lm.sh`, or you may download it from <https://huggingface.co/marcoyang/librispeech_bigram>.
 The decoding command is as follows:
 ```bash
 for iter in 472000; do
    for avg in 8 10 12 14 16 18; do
        ./lstm_transducer_stateless2/decode.py \
@ -490,18 +635,22 @@ for iter in 472000; do
                --avg $avg \
                --exp-dir ./lstm_transducer_stateless2/exp \
                --max-duration 600 \
-                --decoding-method modified_beam_search_rnnlm_LODR \
+                --decoding-method modified_beam_search_LODR \
                --beam 4 \
-                --rnn-lm-scale 0.4 \
+                --max-contexts 4 \
-                --rnn-lm-exp-dir /path/to/RNNLM \
+                --use-shallow-fusion 1 \
-                --rnn-lm-epoch 99 \
+                --lm-type rnn \
-                --rnn-lm-avg 1 \
+                --lm-exp-dir /ceph-data4/yangxiaoyu/pretrained_models/LM/icefall-librispeech-rnn-lm/exp \
-                --rnn-lm-num-layers 3 \
+                --lm-epoch 99 \
-                --rnn-lm-tie-weights 1 \
+                --lm-scale 0.4 \
-                --token-ngram 2 \
+                --lm-avg 1 \
                --tokens-ngram 2 \
                --ngram-lm-scale -0.16
    done
 done
 ```
 Note that you can also set `--lm-type transformer` to use transformer LM during LODR. But it will be slower
 because it has not been optimized. The pre-trained transformer LM is available at <https://huggingface.co/marcoyang/icefall-librispeech-transformer-lm/tree/main>
 Pretrained models, training logs, decoding logs, and decoding results
 are available at
@ -1660,6 +1809,9 @@ layers (24 v.s 12) but a narrower model (1536 feedforward dim and 384 encoder di
 | greedy search (max sym per frame 1) | 2.54       | 5.72       | --epoch 30 --avg 10  --max-duration 600 |
 | modified beam search                | 2.47       | 5.71       | --epoch 30 --avg 10  --max-duration 600 |
 | modified beam search + RNNLM shallow fusion     | 2.27       | 5.24      | --epoch 30 --avg 10  --max-duration 600 |
 | modified beam search + RNNLM + LODR     | 2.23       | 5.17      | --epoch 30 --avg 10  --max-duration 600 |
 | modified beam search + TransformerLM shallow fusion     | 2.27       | 5.26      | --epoch 30 --avg 10  --max-duration 600 |
 | modified beam search + TransformerLM + LODR     | 2.22       | 5.11      | --epoch 30 --avg 10  --max-duration 600 |
 | fast beam search                    | 2.5        | 5.72       | --epoch 30 --avg 10  --max-duration 600 |
 ```bash
@ -2023,7 +2175,8 @@ subset so that the gigaspeech dataloader never exhausts.
 | greedy search (max sym per frame 1) | 2.03       | 4.70       | --iter 1224000 --avg 14  --max-duration 600 |
 | modified beam search                | 2.00       | 4.63       | --iter 1224000 --avg 14  --max-duration 600 |
 | modified beam search + rnnlm shallow fusion  | 1.94     |  4.2    | --iter 1224000 --avg 14  --max-duration 600 |
-| modified beam search + LODR         | 1.83       | 4.03       | --iter 1224000 --avg 14  --max-duration 600 |
+| modified beam search + rnnlm + LODR         | 1.77       | 3.99       | --iter 1224000 --avg 14  --max-duration 600 |
 | modified beam search + TransformerLM + LODR    | 1.75       | 3.94       | --iter 1224000 --avg 14  --max-duration 600 |
 | fast beam search                    | 2.10       | 4.68       | --iter 1224000 --avg 14 --max-duration 600 |
 The training commands are:
@ -2069,8 +2222,10 @@ for iter in 1224000; do
  done
 done
 ```
-You may also decode using shallow fusion with external RNNLM. To do so you need to
+You may also decode using shallow fusion with external neural network LM. To do so you need to
-download a well-trained RNNLM from this link <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm/tree/main>
+download a well-trained NN LM:
 RNN LM: <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm/tree/main>
 Transformer LM: <https://huggingface.co/marcoyang/icefall-librispeech-transformer-lm/tree/main>
 ```bash
 rnn_lm_scale=0.3
--- a/egs/librispeech/ASR/conformer_ctc/label_smoothing.py
+++ b/egs/librispeech/ASR/conformer_ctc/label_smoothing.py
@ -44,7 +44,8 @@ class LabelSmoothingLoss(torch.nn.Module):
            mean of the output is taken. (3) "sum": the output will be summed.
        """
        super().__init__()
-        assert 0.0 <= label_smoothing < 1.0
+        assert 0.0 <= label_smoothing < 1.0, f"{label_smoothing}"
        assert reduction in ("none", "sum", "mean"), reduction
        self.ignore_index = ignore_index
        self.label_smoothing = label_smoothing
        self.reduction = reduction
--- a/egs/librispeech/ASR/conformer_ctc2/subsampling.py
+++ b/egs/librispeech/ASR/conformer_ctc2/subsampling.py
@ -24,10 +24,9 @@ from scaling import (
    ScaledConv2d,
    ScaledLinear,
 )
 from torch import nn
-class Conv2dSubsampling(nn.Module):
+class Conv2dSubsampling(torch.nn.Module):
    """Convolutional 2D subsampling (to 1/4 length).
    Convert an input of shape (N, T, idim) to an output
@ -61,7 +60,7 @@ class Conv2dSubsampling(nn.Module):
        assert in_channels >= 7
        super().__init__()
-        self.conv = nn.Sequential(
+        self.conv = torch.nn.Sequential(
            ScaledConv2d(
                in_channels=1,
                out_channels=layer1_channels,
--- a/egs/librispeech/ASR/conformer_ctc3/jit_pretrained.py
+++ b/egs/librispeech/ASR/conformer_ctc3/jit_pretrained.py
@ -291,7 +291,10 @@ def main():
    batch_size = nnet_output.shape[0]
    supervision_segments = torch.tensor(
-        [[i, 0, nnet_output.shape[1]] for i in range(batch_size)],
+        [
            [i, 0, feature_lengths[i] // params.subsampling_factor]
            for i in range(batch_size)
        ],
        dtype=torch.int32,
    )
--- a/egs/librispeech/ASR/conformer_ctc3/pretrained.py
+++ b/egs/librispeech/ASR/conformer_ctc3/pretrained.py
@ -339,7 +339,10 @@ def main():
    batch_size = nnet_output.shape[0]
    supervision_segments = torch.tensor(
-        [[i, 0, nnet_output.shape[1]] for i in range(batch_size)],
+        [
            [i, 0, feature_lengths[i] // params.subsampling_factor]
            for i in range(batch_size)
        ],
        dtype=torch.int32,
    )
--- a/egs/librispeech/ASR/conformer_mmi/decode.py
+++ b/egs/librispeech/ASR/conformer_mmi/decode.py
@ -660,14 +660,22 @@ def main():
    # we need cut ids to display recognition results.
    args.return_cuts = True
    librispeech = LibriSpeechAsrDataModule(args)
    test_clean_cuts = librispeech.test_clean_cuts()
    test_other_cuts = librispeech.test_other_cuts()
    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
    test_other_dl = librispeech.test_dataloaders(test_other_cuts)
    # CAUTION: `test_sets` is for displaying only.
    # If you want to skip test-clean, you have to skip
    # it inside the for loop. That is, use
    #
    #   if test_set == 'test-clean': continue
    #
    test_sets = ["test-clean", "test-other"]
-    for test_set, test_dl in zip(test_sets, librispeech.test_dataloaders()):
+    test_dls = [test_clean_dl, test_other_dl]
    for test_set, test_dl in zip(test_sets, test_dls):
        results_dict = decode_dataset(
            dl=test_dl,
            params=params,
--- a/egs/librispeech/ASR/conformer_mmi/train-with-attention.py
+++ b/egs/librispeech/ASR/conformer_mmi/train-with-attention.py
@ -30,6 +30,8 @@ import torch.multiprocessing as mp
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
 from conformer import Conformer
 from lhotse.cut import Cut
 from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.nn.utils import clip_grad_norm_
@ -100,6 +102,41 @@ def get_parser():
        """,
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="conformer_mmi/exp-attn",
        help="""The experiment dir.
        It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )
    parser.add_argument(
        "--lang-dir",
        type=str,
        default="data/lang_bpe_500",
        help="""The lang dir
        It contains language related input files such as
        "lexicon.txt"
        """,
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="The seed for random generators intended for reproducibility",
    )
    parser.add_argument(
        "--use-pruned-intersect",
        type=str2bool,
        default=False,
        help="""Whether to use `intersect_dense_pruned` to get denominator
        lattice.""",
    )
    return parser
@ -114,12 +151,6 @@ def get_params() -> AttributeDict:
    Explanation of options saved in `params`:
        - exp_dir: It specifies the directory where all training related
                   files, e.g., checkpoints, log, etc, are saved
        - lang_dir: It contains language related input files such as
                    "lexicon.txt"
        - best_train_loss: Best training loss so far. It is used to select
                           the model that has the lowest training loss. It is
                           updated during the training.
@ -164,8 +195,6 @@ def get_params() -> AttributeDict:
    """
    params = AttributeDict(
        {
            "exp_dir": Path("conformer_mmi/exp_500_with_attention"),
            "lang_dir": Path("data/lang_bpe_500"),
            "best_train_loss": float("inf"),
            "best_valid_loss": float("inf"),
            "best_train_epoch": -1,
@ -184,15 +213,12 @@ def get_params() -> AttributeDict:
            "beam_size": 6,  # will change it to 8 after some batches (see code)
            "reduction": "sum",
            "use_double_scores": True,
            #  "att_rate": 0.0,
            #  "num_decoder_layers": 0,
            "att_rate": 0.7,
            "num_decoder_layers": 6,
            # parameters for Noam
            "weight_decay": 1e-6,
            "lr_factor": 5.0,
            "warm_step": 80000,
            "use_pruned_intersect": False,
            "den_scale": 1.0,
            # use alignments before this number of batches
            "use_ali_until": 13000,
@ -661,7 +687,7 @@ def run(rank, world_size, args):
    params = get_params()
    params.update(vars(args))
-    fix_random_seed(42)
+    fix_random_seed(params.seed)
    if world_size > 1:
        setup_dist(rank, world_size, params.master_port)
@ -745,8 +771,29 @@ def run(rank, world_size, args):
        valid_ali = None
    librispeech = LibriSpeechAsrDataModule(args)
-    train_dl = librispeech.train_dataloaders()
+    train_cuts = librispeech.train_clean_100_cuts()
-    valid_dl = librispeech.valid_dataloaders()
+    if params.full_libri:
        train_cuts += librispeech.train_clean_360_cuts()
        train_cuts += librispeech.train_other_500_cuts()
    def remove_short_and_long_utt(c: Cut):
        """Return True if cut ``c`` should be kept for training.

        Despite the name, this is a *keep* predicate: it is passed to
        ``train_cuts.filter`` and evaluates to True only when ``c.duration``
        lies in the closed interval [1.0, 20.0] (seconds).
        """
        # Keep only utterances with duration between 1 second and 20 seconds
        #
        # Caution: There is a reason to select 20.0 here. Please see
        # ../local/display_manifest_statistics.py
        #
        # You should use ../local/display_manifest_statistics.py to get
        # an utterance duration distribution for your dataset to select
        # the threshold
        return 1.0 <= c.duration <= 20.0
    train_cuts = train_cuts.filter(remove_short_and_long_utt)
    train_dl = librispeech.train_dataloaders(train_cuts)
    valid_cuts = librispeech.dev_clean_cuts()
    valid_cuts += librispeech.dev_other_cuts()
    valid_dl = librispeech.valid_dataloaders(valid_cuts)
    for epoch in range(params.start_epoch, params.num_epochs):
        train_dl.sampler.set_epoch(epoch)
@ -796,6 +843,7 @@ def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)
    world_size = args.world_size
    assert world_size >= 1
--- a/egs/librispeech/ASR/conformer_mmi/train.py
+++ b/egs/librispeech/ASR/conformer_mmi/train.py
@ -30,6 +30,8 @@ import torch.multiprocessing as mp
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
 from conformer import Conformer
 from lhotse.cut import Cut
 from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.nn.utils import clip_grad_norm_
@ -100,6 +102,26 @@ def get_parser():
        """,
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="conformer_mmi/exp",
        help="""The experiment dir.
        It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )
    parser.add_argument(
        "--lang-dir",
        type=str,
        default="data/lang_bpe_500",
        help="""The lang dir
        It contains language related input files such as
        "lexicon.txt"
        """,
    )
    parser.add_argument(
        "--seed",
        type=int,
@ -107,6 +129,14 @@ def get_parser():
        help="The seed for random generators intended for reproducibility",
    )
    parser.add_argument(
        "--use-pruned-intersect",
        type=str2bool,
        default=False,
        help="""Whether to use `intersect_dense_pruned` to get denominator
        lattice.""",
    )
    return parser
@ -121,12 +151,6 @@ def get_params() -> AttributeDict:
    Explanation of options saved in `params`:
        - exp_dir: It specifies the directory where all training related
                   files, e.g., checkpoints, log, etc, are saved
        - lang_dir: It contains language related input files such as
                    "lexicon.txt"
        - best_train_loss: Best training loss so far. It is used to select
                           the model that has the lowest training loss. It is
                           updated during the training.
@ -171,8 +195,6 @@ def get_params() -> AttributeDict:
    """
    params = AttributeDict(
        {
            "exp_dir": Path("conformer_mmi/exp_500"),
            "lang_dir": Path("data/lang_bpe_500"),
            "best_train_loss": float("inf"),
            "best_valid_loss": float("inf"),
            "best_train_epoch": -1,
@ -193,13 +215,10 @@ def get_params() -> AttributeDict:
            "use_double_scores": True,
            "att_rate": 0.0,
            "num_decoder_layers": 0,
            #  "att_rate": 0.7,
            #  "num_decoder_layers": 6,
            # parameters for Noam
            "weight_decay": 1e-6,
            "lr_factor": 5.0,
            "warm_step": 80000,
            "use_pruned_intersect": False,
            "den_scale": 1.0,
            # use alignments before this number of batches
            "use_ali_until": 13000,
@ -752,8 +771,29 @@ def run(rank, world_size, args):
        valid_ali = None
    librispeech = LibriSpeechAsrDataModule(args)
-    train_dl = librispeech.train_dataloaders()
+    train_cuts = librispeech.train_clean_100_cuts()
-    valid_dl = librispeech.valid_dataloaders()
+    if params.full_libri:
        train_cuts += librispeech.train_clean_360_cuts()
        train_cuts += librispeech.train_other_500_cuts()
    def remove_short_and_long_utt(c: Cut):
        """Return True if cut ``c`` should be kept for training.

        Despite the name, this is a *keep* predicate: it is passed to
        ``train_cuts.filter`` and evaluates to True only when ``c.duration``
        lies in the closed interval [1.0, 20.0] (seconds).
        """
        # Keep only utterances with duration between 1 second and 20 seconds
        #
        # Caution: There is a reason to select 20.0 here. Please see
        # ../local/display_manifest_statistics.py
        #
        # You should use ../local/display_manifest_statistics.py to get
        # an utterance duration distribution for your dataset to select
        # the threshold
        return 1.0 <= c.duration <= 20.0
    train_cuts = train_cuts.filter(remove_short_and_long_utt)
    train_dl = librispeech.train_dataloaders(train_cuts)
    valid_cuts = librispeech.dev_clean_cuts()
    valid_cuts += librispeech.dev_other_cuts()
    valid_dl = librispeech.valid_dataloaders(valid_cuts)
    for epoch in range(params.start_epoch, params.num_epochs):
        fix_random_seed(params.seed + epoch)
@ -804,6 +844,7 @@ def main():
    parser = get_parser()
    LibriSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)
    world_size = args.world_size
    assert world_size >= 1
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer2.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/emformer2.py
@ -1435,7 +1435,7 @@ class EmformerEncoder(nn.Module):
        self,
        x: torch.Tensor,
        states: List[torch.Tensor],
-    ) -> Tuple[torch.Tensor, List[torch.Tensor],]:
+    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward pass for streaming inference.
        B: batch size;
@ -1512,24 +1512,6 @@ class EmformerEncoder(nn.Module):
            )
        return states
        attn_caches = [
            [
                torch.zeros(self.memory_size, self.d_model, device=device),
                torch.zeros(self.left_context_length, self.d_model, device=device),
                torch.zeros(self.left_context_length, self.d_model, device=device),
            ]
            for _ in range(self.num_encoder_layers)
        ]
        conv_caches = [
            torch.zeros(self.d_model, self.cnn_module_kernel - 1, device=device)
            for _ in range(self.num_encoder_layers)
        ]
        states: Tuple[List[List[torch.Tensor]], List[torch.Tensor]] = (
            attn_caches,
            conv_caches,
        )
        return states
 class Emformer(EncoderInterface):
    def __init__(
@ -1640,7 +1622,7 @@ class Emformer(EncoderInterface):
        self,
        x: torch.Tensor,
        states: List[torch.Tensor],
-    ) -> Tuple[torch.Tensor, List[torch.Tensor],]:
+    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """Forward pass for streaming inference.
        B: batch size;
--- a/Show More
+++ b/Show More
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/compute_fbank_musan.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/pruned_transducer_stateless7/beam_search.py`