Merge latest commit 'b0f70c9' on k2-fsa/icefall

I needed this in order to pull in unreleased fixes. The last tagged version was too old (dating back to July 2023) and is not compatible with recent lhotse releases.
2025-08-26 18:24:18 +00:00 · 2023-12-18 15:08:41 +09:00 · 2023-12-18 15:08:41 +09:00 · 16c02cfcc2
commit 16c02cfcc2
parent 6fd674312c b0f70c9d04
911 changed files with 102247 additions and 3662 deletions
--- a/.flake8
+++ b/.flake8
@ -15,7 +15,7 @@ per-file-ignores =
    egs/librispeech/ASR/zipformer_mmi/*.py: E501, E203
    egs/librispeech/ASR/zipformer/*.py: E501, E203
    egs/librispeech/ASR/RESULTS.md: E999,
-
+    egs/ljspeech/TTS/vits/*.py: E501, E203
    # invalid escape sequence (cause by tex formular), W605
    icefall/utils.py: E501, W605

@ -24,6 +24,7 @@ exclude =
  **/data/**,
  icefall/shared/make_kn_lm.py,
  icefall/__init__.py
+  icefall/ctc/__init__.py

 ignore =
  # E203 white space before ":"
--- a/.github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
+++ b/.github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
@ -18,8 +18,8 @@ log "Downloading pre-commputed fbank from $fbank_url"
 git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
 ln -s $PWD/aishell-test-dev-manifests/data .

-log "Downloading pre-trained model from $repo_url"
 repo_url=https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
+log "Downloading pre-trained model from $repo_url"
 git clone $repo_url
 repo=$(basename $repo_url)

--- a/.github/scripts/run-aishell-zipformer-2023-10-24.sh
+++ b/.github/scripts/run-aishell-zipformer-2023-10-24.sh
@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/aishell/ASR
+
+git lfs install
+
+fbank_url=https://huggingface.co/csukuangfj/aishell-test-dev-manifests
+log "Downloading pre-commputed fbank from $fbank_url"
+
+git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
+ln -s $PWD/aishell-test-dev-manifests/data .
+
+log "======================="
+log "CI testing large model"
+repo_url=https://huggingface.co/zrjin/icefall-asr-aishell-zipformer-large-2023-10-24/
+log "Downloading pre-trained model from $repo_url"
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+for method in modified_beam_search greedy_search fast_beam_search; do
+  log "$method"
+
+  ./zipformer/pretrained.py \
+    --method $method \
+    --context-size 1 \
+    --checkpoint $repo/exp/pretrained.pt \
+    --tokens $repo/data/lang_char/tokens.txt \
+    --num-encoder-layers 2,2,4,5,4,2 \
+    --feedforward-dim 512,768,1536,2048,1536,768 \
+    --encoder-dim 192,256,512,768,512,256 \
+    --encoder-unmasked-dim 192,192,256,320,256,192 \
+    $repo/test_wavs/BAC009S0764W0121.wav \
+    $repo/test_wavs/BAC009S0764W0122.wav \
+    $repo/test_wavs/BAC009S0764W0123.wav
+done
+
+log "======================="
+log "CI testing medium model"
+repo_url=https://huggingface.co/zrjin/icefall-asr-aishell-zipformer-2023-10-24/
+log "Downloading pre-trained model from $repo_url"
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+
+for method in modified_beam_search greedy_search fast_beam_search; do
+  log "$method"
+
+  ./zipformer/pretrained.py \
+    --method $method \
+    --context-size 1 \
+    --checkpoint $repo/exp/pretrained.pt \
+    --tokens $repo/data/lang_char/tokens.txt \
+    $repo/test_wavs/BAC009S0764W0121.wav \
+    $repo/test_wavs/BAC009S0764W0122.wav \
+    $repo/test_wavs/BAC009S0764W0123.wav
+done
+
+
+log "======================="
+log "CI testing small model"
+repo_url=https://huggingface.co/zrjin/icefall-asr-aishell-zipformer-small-2023-10-24/
+log "Downloading pre-trained model from $repo_url"
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+
+for method in modified_beam_search greedy_search fast_beam_search; do
+  log "$method"
+
+  ./zipformer/pretrained.py \
+    --method $method \
+    --context-size 1 \
+    --checkpoint $repo/exp/pretrained.pt \
+    --tokens $repo/data/lang_char/tokens.txt \
+    --num-encoder-layers 2,2,2,2,2,2 \
+    --feedforward-dim 512,768,768,768,768,768 \
+    --encoder-dim 192,256,256,256,256,256 \
+    --encoder-unmasked-dim 192,192,192,192,192,192 \
+    $repo/test_wavs/BAC009S0764W0121.wav \
+    $repo/test_wavs/BAC009S0764W0122.wav \
+    $repo/test_wavs/BAC009S0764W0123.wav
+done
+
--- a/.github/scripts/run-gigaspeech-pruned-transducer-stateless2-2022-05-12.sh
+++ b/.github/scripts/run-gigaspeech-pruned-transducer-stateless2-2022-05-12.sh
@ -29,6 +29,9 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" ==
  ls -lh data/fbank
  ls -lh pruned_transducer_stateless2/exp

+  ln -s data/fbank/cuts_DEV.jsonl.gz data/fbank/gigaspeech_cuts_DEV.jsonl.gz
+  ln -s data/fbank/cuts_TEST.jsonl.gz data/fbank/gigaspeech_cuts_TEST.jsonl.gz
+
  log "Decoding dev and test"

  # use a small value for decoding with CPU
--- a/.github/scripts/run-gigaspeech-zipformer-2023-10-17.sh
+++ b/.github/scripts/run-gigaspeech-zipformer-2023-10-17.sh
@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/gigaspeech/ASR
+
+repo_url=https://huggingface.co/yfyeung/icefall-asr-gigaspeech-zipformer-2023-10-17
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "data/lang_bpe_500/tokens.txt"
+git lfs pull --include "exp/jit_script.pt"
+git lfs pull --include "exp/pretrained.pt"
+ln -s pretrained.pt epoch-99.pt
+ls -lh *.pt
+popd
+
+log "Export to torchscript model"
+./zipformer/export.py \
+  --exp-dir $repo/exp \
+  --use-averaged-model false \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  --epoch 99 \
+  --avg 1 \
+  --jit 1
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.script()"
+
+./zipformer/jit_pretrained.py \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  --nn-model-filename $repo/exp/jit_script.pt \
+  $repo/test_wavs/1089-134686-0001.wav \
+  $repo/test_wavs/1221-135766-0001.wav \
+  $repo/test_wavs/1221-135766-0002.wav
+
+for method in greedy_search modified_beam_search fast_beam_search; do
+  log "$method"
+
+  ./zipformer/pretrained.py \
+    --method $method \
+    --beam-size 4 \
+    --checkpoint $repo/exp/pretrained.pt \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
+    $repo/test_wavs/1089-134686-0001.wav \
+    $repo/test_wavs/1221-135766-0001.wav \
+    $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode"  ]]; then
+  mkdir -p zipformer/exp
+  ln -s $PWD/$repo/exp/pretrained.pt zipformer/exp/epoch-999.pt
+  ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+  ls -lh data
+  ls -lh zipformer/exp
+
+  log "Decoding test-clean and test-other"
+
+  # use a small value for decoding with CPU
+  max_duration=100
+
+  for method in greedy_search fast_beam_search modified_beam_search; do
+    log "Decoding with $method"
+
+    ./zipformer/decode.py \
+      --decoding-method $method \
+      --epoch 999 \
+      --avg 1 \
+      --use-averaged-model 0 \
+      --max-duration $max_duration \
+      --exp-dir zipformer/exp
+  done
+
+  rm zipformer/exp/*.pt
+fi
--- a/.github/scripts/run-librispeech-conformer-ctc3-2022-11-28.sh
+++ b/.github/scripts/run-librispeech-conformer-ctc3-2022-11-28.sh
@ -38,7 +38,7 @@ log "Decode with models exported by torch.jit.trace()"
 for m in ctc-decoding 1best; do
  ./conformer_ctc3/jit_pretrained.py \
    --model-filename $repo/exp/jit_trace.pt \
-    --words-file $repo/data/lang_bpe_500/words.txt  \
+    --words-file $repo/data/lang_bpe_500/words.txt \
    --HLG $repo/data/lang_bpe_500/HLG.pt \
    --bpe-model $repo/data/lang_bpe_500/bpe.model \
    --G $repo/data/lm/G_4_gram.pt \
@ -53,7 +53,7 @@ log "Export to torchscript model"

 ./conformer_ctc3/export.py \
  --exp-dir $repo/exp \
-  --lang-dir $repo/data/lang_bpe_500 \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --jit-trace 1 \
  --epoch 99 \
  --avg 1 \
@ -80,9 +80,9 @@ done
 for m in ctc-decoding 1best; do
  ./conformer_ctc3/pretrained.py \
    --checkpoint $repo/exp/pretrained.pt \
-    --words-file $repo/data/lang_bpe_500/words.txt  \
+    --words-file $repo/data/lang_bpe_500/words.txt \
    --HLG $repo/data/lang_bpe_500/HLG.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    --G $repo/data/lm/G_4_gram.pt \
    --method $m \
    --sample-rate 16000 \
@ -93,7 +93,7 @@ done

 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
-if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode"  ]]; then
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
  mkdir -p conformer_ctc3/exp
  ln -s $PWD/$repo/exp/pretrained.pt conformer_ctc3/exp/epoch-999.pt
  ln -s $PWD/$repo/data/lang_bpe_500 data/
--- a/.github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.sh
+++ b/.github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.sh
@ -31,7 +31,7 @@ log "Test exporting with torch.jit.trace()"

 ./lstm_transducer_stateless2/export.py \
  --exp-dir $repo/exp \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --use-averaged-model 0 \
@ -55,7 +55,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -68,7 +68,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh
@ -28,7 +28,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -41,7 +41,7 @@ for method in fast_beam_search modified_beam_search beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
@ -36,7 +36,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -49,7 +49,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
@ -35,7 +35,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -48,7 +48,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-05-13.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-05-13.sh
@ -30,14 +30,14 @@ popd
 log "Export to torchscript model"
 ./pruned_transducer_stateless3/export.py \
  --exp-dir $repo/exp \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --jit 1

 ./pruned_transducer_stateless3/export.py \
  --exp-dir $repo/exp \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --jit-trace 1
@ -74,7 +74,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -87,7 +87,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless5-2022-05-13.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless5-2022-05-13.sh
@ -32,7 +32,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    --num-encoder-layers 18 \
    --dim-feedforward 2048 \
    --nhead 8 \
@ -51,7 +51,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav \
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-2022-11-11.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-2022-11-11.sh
@ -33,7 +33,7 @@ log "Export to torchscript model"
 ./pruned_transducer_stateless7/export.py \
  --exp-dir $repo/exp \
  --use-averaged-model false \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --jit 1
@ -56,7 +56,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -69,7 +69,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-2022-12-01.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-2022-12-01.sh
@ -37,7 +37,7 @@ log "Export to torchscript model"
 ./pruned_transducer_stateless7_ctc/export.py \
  --exp-dir $repo/exp \
  --use-averaged-model false \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --jit 1
@ -74,7 +74,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -87,7 +87,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh
@ -21,9 +21,9 @@ tree $repo/
 ls -lh $repo/test_wavs/*.wav

 pushd $repo/exp
-git lfs pull --include "data/lang_bpe_500/HLG.pt"
 git lfs pull --include "data/lang_bpe_500/L.pt"
 git lfs pull --include "data/lang_bpe_500/LG.pt"
+git lfs pull --include "data/lang_bpe_500/HLG.pt"
 git lfs pull --include "data/lang_bpe_500/Linv.pt"
 git lfs pull --include "data/lang_bpe_500/bpe.model"
 git lfs pull --include "exp/cpu_jit.pt"
@ -36,7 +36,7 @@ log "Export to torchscript model"
 ./pruned_transducer_stateless7_ctc_bs/export.py \
  --exp-dir $repo/exp \
  --use-averaged-model false \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --jit 1
@ -72,7 +72,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -85,7 +85,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
@ -37,7 +37,7 @@ log "Export to torchscript model"
 ./pruned_transducer_stateless7_streaming/export.py \
  --exp-dir $repo/exp \
  --use-averaged-model false \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --decode-chunk-len 32 \
  --epoch 99 \
  --avg 1 \
@ -81,7 +81,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    --decode-chunk-len 32 \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
@ -95,7 +95,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    --decode-chunk-len 32 \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless8-2022-11-14.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless8-2022-11-14.sh
@ -41,7 +41,7 @@ log "Decode with models exported by torch.jit.script()"
 log "Export to torchscript model"
 ./pruned_transducer_stateless8/export.py \
  --exp-dir $repo/exp \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --use-averaged-model false \
  --epoch 99 \
  --avg 1 \
@ -65,7 +65,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -78,7 +78,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-librispeech-streaming-pruned-transducer-stateless2-2022-06-26.sh
+++ b/.github/scripts/run-librispeech-streaming-pruned-transducer-stateless2-2022-06-26.sh
@ -32,7 +32,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    --simulate-streaming 1 \
    --causal-convolution 1 \
    $repo/test_wavs/1089-134686-0001.wav \
@ -47,7 +47,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    --simulate-streaming 1 \
    --causal-convolution 1 \
    $repo/test_wavs/1089-134686-0001.wav \
--- a/.github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
+++ b/.github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
@ -28,7 +28,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -41,7 +41,7 @@ for method in fast_beam_search modified_beam_search beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
+++ b/.github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
@ -37,7 +37,7 @@ log "Export to torchscript model"
 ./zipformer_mmi/export.py \
  --exp-dir $repo/exp \
  --use-averaged-model false \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --jit 1
@ -61,7 +61,7 @@ for method in 1best nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescor
    --method $method \
    --checkpoint $repo/exp/pretrained.pt \
    --lang-dir $repo/data/lang_bpe_500 \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-multi-corpora-zipformer.sh
+++ b/.github/scripts/run-multi-corpora-zipformer.sh
@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/multi_zh-hans/ASR
+
+log "==== Test icefall-asr-multi-zh-hans-zipformer-2023-9-2 ===="
+repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-2023-9-2/
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s epoch-20.pt epoch-99.pt
+popd
+
+ls -lh $repo/exp/*.pt
+
+
+./zipformer/pretrained.py \
+  --checkpoint $repo/exp/epoch-99.pt \
+  --tokens $repo/data/lang_bpe_2000/tokens.txt \
+  --method greedy_search \
+$repo/test_wavs/DEV_T0000000000.wav \
+$repo/test_wavs/DEV_T0000000001.wav \
+$repo/test_wavs/DEV_T0000000002.wav
+
+for method in modified_beam_search fast_beam_search; do
+  log "$method"
+
+  ./zipformer/pretrained.py \
+    --method $method \
+    --beam-size 4 \
+    --checkpoint $repo/exp/epoch-99.pt \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+  $repo/test_wavs/DEV_T0000000000.wav \
+  $repo/test_wavs/DEV_T0000000001.wav \
+  $repo/test_wavs/DEV_T0000000002.wav
+done
+
+rm -rf $repo
+
+log "==== Test icefall-asr-multi-zh-hans-zipformer-ctc-2023-10-24 ===="
+repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-2023-10-24/
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s epoch-20.pt epoch-99.pt
+popd
+
+ls -lh $repo/exp/*.pt
+
+
+./zipformer/pretrained.py \
+  --checkpoint $repo/exp/epoch-99.pt \
+  --tokens $repo/data/lang_bpe_2000/tokens.txt \
+  --use-ctc 1 \
+  --method greedy_search \
+$repo/test_wavs/DEV_T0000000000.wav \
+$repo/test_wavs/DEV_T0000000001.wav \
+$repo/test_wavs/DEV_T0000000002.wav
+
+for method in modified_beam_search fast_beam_search; do
+  log "$method"
+
+  ./zipformer/pretrained.py \
+    --method $method \
+    --beam-size 4 \
+    --use-ctc 1 \
+    --checkpoint $repo/exp/epoch-99.pt \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+  $repo/test_wavs/DEV_T0000000000.wav \
+  $repo/test_wavs/DEV_T0000000001.wav \
+  $repo/test_wavs/DEV_T0000000002.wav
+done
+
+rm -rf $repo
+
+cd ../../../egs/multi_zh_en/ASR
+log "==== Test icefall-asr-zipformer-multi-zh-en-2023-11-22 ===="
+repo_url=https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+./zipformer/pretrained.py \
+  --checkpoint $repo/exp/pretrained.pt \
+  --bpe-model $repo/data/lang_bbpe_2000/bbpe.model \
+  --method greedy_search \
+$repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_29.wav \
+$repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_55.wav \
+$repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_75.wav
+
+for method in modified_beam_search fast_beam_search; do
+  log "$method"
+
+  ./zipformer/pretrained.py \
+    --method $method \
+    --beam-size 4 \
+    --checkpoint $repo/exp/pretrained.pt \
+    --bpe-model $repo/data/lang_bbpe_2000/bbpe.model \
+  $repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_29.wav \
+  $repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_55.wav \
+  $repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_75.wav
+done
+
+rm -rf $repo
--- a/.github/scripts/run-pre-trained-conformer-ctc.sh
+++ b/.github/scripts/run-pre-trained-conformer-ctc.sh
@ -1,46 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-log() {
-  # This function is from espnet
-  local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
-}
-
-cd egs/librispeech/ASR
-
-repo_url=https://github.com/csukuangfj/icefall-asr-conformer-ctc-bpe-500
-git lfs install
-
-log "Downloading pre-trained model from $repo_url"
-git clone $repo_url
-repo=$(basename $repo_url)
-
-log "Display test files"
-tree $repo/
-ls -lh $repo/test_wavs/*.flac
-
-log "CTC decoding"
-
-./conformer_ctc/pretrained.py \
-  --method ctc-decoding \
-  --num-classes 500 \
-  --checkpoint $repo/exp/pretrained.pt \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
-  $repo/test_wavs/1089-134686-0001.flac \
-  $repo/test_wavs/1221-135766-0001.flac \
-  $repo/test_wavs/1221-135766-0002.flac
-
-log "HLG decoding"
-
-./conformer_ctc/pretrained.py \
-  --method 1best \
-  --num-classes 500 \
-  --checkpoint $repo/exp/pretrained.pt \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
-  --words-file $repo/data/lang_bpe_500/words.txt \
-  --HLG $repo/data/lang_bpe_500/HLG.pt \
-  $repo/test_wavs/1089-134686-0001.flac \
-  $repo/test_wavs/1221-135766-0001.flac \
-  $repo/test_wavs/1221-135766-0002.flac
--- a/.github/scripts/run-pre-trained-ctc.sh
+++ b/.github/scripts/run-pre-trained-ctc.sh
@ -0,0 +1,240 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+  # Timestamped logger (from espnet): prints "<date> (<caller file>:<call line>:<caller function>) <message>".
+  local fname=${BASH_SOURCE[1]##*/}  # caller's source file with the directory part stripped
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+pushd egs/librispeech/ASR
+# Part 1: ONNX zipformer CTC model -- greedy search, then decoding with H, HL and HLG.
+repo_url=https://huggingface.co/csukuangfj/sherpa-onnx-zipformer-ctc-en-2023-10-02
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone "$repo_url"
+repo=$(basename "$repo_url")  # git clones into a directory named after the last URL component
+
+log "Display test files"
+tree "$repo"/
+ls -lh "$repo"/test_wavs/*.wav
+
+log "CTC greedy search"
+
+./zipformer/onnx_pretrained_ctc.py \
+  --nn-model "$repo/model.onnx" \
+  --tokens "$repo/tokens.txt" \
+  "$repo/test_wavs/0.wav" \
+  "$repo/test_wavs/1.wav" \
+  "$repo/test_wavs/2.wav"
+
+log "CTC H decoding"
+
+./zipformer/onnx_pretrained_ctc_H.py \
+  --nn-model "$repo/model.onnx" \
+  --tokens "$repo/tokens.txt" \
+  --H "$repo/H.fst" \
+  "$repo/test_wavs/0.wav" \
+  "$repo/test_wavs/1.wav" \
+  "$repo/test_wavs/2.wav"
+
+log "CTC HL decoding"
+
+./zipformer/onnx_pretrained_ctc_HL.py \
+  --nn-model "$repo/model.onnx" \
+  --words "$repo/words.txt" \
+  --HL "$repo/HL.fst" \
+  "$repo/test_wavs/0.wav" \
+  "$repo/test_wavs/1.wav" \
+  "$repo/test_wavs/2.wav"
+
+log "CTC HLG decoding"
+
+./zipformer/onnx_pretrained_ctc_HLG.py \
+  --nn-model "$repo/model.onnx" \
+  --words "$repo/words.txt" \
+  --HLG "$repo/HLG.fst" \
+  "$repo/test_wavs/0.wav" \
+  "$repo/test_wavs/1.wav" \
+  "$repo/test_wavs/2.wav"
+
+rm -rf -- "${repo:?}"  # :? aborts the script rather than running rm with an empty path
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
+log "Downloading pre-trained model from $repo_url"
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+pushd $repo
+
+git lfs pull --include "exp/pretrained.pt"
+git lfs pull --include "data/lang_bpe_500/HLG.pt"
+git lfs pull --include "data/lang_bpe_500/L.pt"
+git lfs pull --include "data/lang_bpe_500/L_disambig.pt"
+git lfs pull --include "data/lang_bpe_500/Linv.pt"
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "data/lang_bpe_500/lexicon.txt"
+git lfs pull --include "data/lang_bpe_500/lexicon_disambig.txt"
+git lfs pull --include "data/lang_bpe_500/tokens.txt"
+git lfs pull --include "data/lang_bpe_500/words.txt"
+git lfs pull --include "data/lm/G_3_gram.fst.txt"
+
+popd
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+log "CTC decoding"
+
+./conformer_ctc/pretrained.py \
+  --method ctc-decoding \
+  --num-classes 500 \
+  --checkpoint $repo/exp/pretrained.pt \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  $repo/test_wavs/1089-134686-0001.wav \
+  $repo/test_wavs/1221-135766-0001.wav \
+  $repo/test_wavs/1221-135766-0002.wav
+
+log "HLG decoding"
+
+./conformer_ctc/pretrained.py \
+  --method 1best \
+  --num-classes 500 \
+  --checkpoint $repo/exp/pretrained.pt \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  --words-file $repo/data/lang_bpe_500/words.txt \
+  --HLG $repo/data/lang_bpe_500/HLG.pt \
+  $repo/test_wavs/1089-134686-0001.wav \
+  $repo/test_wavs/1221-135766-0001.wav \
+  $repo/test_wavs/1221-135766-0002.wav
+
+log "CTC decoding on CPU with kaldi decoders using OpenFst"
+
+log "Exporting model with torchscript"
+
+pushd $repo/exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+./conformer_ctc/export.py \
+  --epoch 99 \
+  --avg 1 \
+  --exp-dir $repo/exp \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  --jit 1
+
+ls -lh $repo/exp
+
+
+log "Generating H.fst, HL.fst"
+
+./local/prepare_lang_fst.py  --lang-dir $repo/data/lang_bpe_500 --ngram-G $repo/data/lm/G_3_gram.fst.txt
+
+ls -lh $repo/data/lang_bpe_500
+
+log "Decoding with H on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_H.py \
+  --nn-model $repo/exp/cpu_jit.pt \
+  --H $repo/data/lang_bpe_500/H.fst \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  $repo/test_wavs/1089-134686-0001.wav \
+  $repo/test_wavs/1221-135766-0001.wav \
+  $repo/test_wavs/1221-135766-0002.wav
+
+log "Decoding with HL on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_HL.py \
+  --nn-model $repo/exp/cpu_jit.pt \
+  --HL $repo/data/lang_bpe_500/HL.fst \
+  --words $repo/data/lang_bpe_500/words.txt \
+  $repo/test_wavs/1089-134686-0001.wav \
+  $repo/test_wavs/1221-135766-0001.wav \
+  $repo/test_wavs/1221-135766-0002.wav
+
+log "Decoding with HLG on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_HLG.py \
+  --nn-model $repo/exp/cpu_jit.pt \
+  --HLG $repo/data/lang_bpe_500/HLG.fst \
+  --words $repo/data/lang_bpe_500/words.txt \
+  $repo/test_wavs/1089-134686-0001.wav \
+  $repo/test_wavs/1221-135766-0001.wav \
+  $repo/test_wavs/1221-135766-0002.wav
+
+rm -rf $repo
+
+popd
+
+log "Test aishell"
+
+pushd egs/aishell/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall_asr_aishell_conformer_ctc
+log "Downloading pre-trained model from $repo_url"
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+pushd $repo
+
+git lfs pull --include "exp/pretrained.pt"
+git lfs pull --include "data/lang_char/H.fst"
+git lfs pull --include "data/lang_char/HL.fst"
+git lfs pull --include "data/lang_char/HLG.fst"
+
+popd
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+log "CTC decoding"
+
+log "Exporting model with torchscript"
+
+pushd $repo/exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+./conformer_ctc/export.py \
+  --epoch 99 \
+  --avg 1 \
+  --exp-dir $repo/exp \
+  --tokens $repo/data/lang_char/tokens.txt \
+  --jit 1
+
+ls -lh $repo/exp
+
+ls -lh $repo/data/lang_char
+
+log "Decoding with H on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_H.py \
+  --nn-model $repo/exp/cpu_jit.pt \
+  --H $repo/data/lang_char/H.fst \
+  --tokens $repo/data/lang_char/tokens.txt \
+  $repo/test_wavs/0.wav \
+  $repo/test_wavs/1.wav \
+  $repo/test_wavs/2.wav
+
+log "Decoding with HL on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_HL.py \
+  --nn-model $repo/exp/cpu_jit.pt \
+  --HL $repo/data/lang_char/HL.fst \
+  --words $repo/data/lang_char/words.txt \
+  $repo/test_wavs/0.wav \
+  $repo/test_wavs/1.wav \
+  $repo/test_wavs/2.wav
+
+log "Decoding with HLG on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_HLG.py \
+  --nn-model $repo/exp/cpu_jit.pt \
+  --HLG $repo/data/lang_char/HLG.fst \
+  --words $repo/data/lang_char/words.txt \
+  $repo/test_wavs/0.wav \
+  $repo/test_wavs/1.wav \
+  $repo/test_wavs/2.wav
+
+rm -rf $repo
--- a/.github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh
+++ b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh
@ -28,7 +28,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -41,7 +41,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh
+++ b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh
@ -28,7 +28,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -41,7 +41,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-pre-trained-transducer-stateless.sh
+++ b/.github/scripts/run-pre-trained-transducer-stateless.sh
@ -28,7 +28,7 @@ for sym in 1 2 3; do
    --method greedy_search \
    --max-sym-per-frame $sym \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
@ -41,7 +41,7 @@ for method in fast_beam_search modified_beam_search beam_search; do
    --method $method \
    --beam-size 4 \
    --checkpoint $repo/exp/pretrained.pt \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    $repo/test_wavs/1089-134686-0001.wav \
    $repo/test_wavs/1221-135766-0001.wav \
    $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-pre-trained-transducer.sh
+++ b/.github/scripts/run-pre-trained-transducer.sh
@ -27,7 +27,7 @@ log "Beam search decoding"
  --method beam_search \
  --beam-size 4 \
  --checkpoint $repo/exp/pretrained.pt \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  $repo/test_wavs/1089-134686-0001.wav \
  $repo/test_wavs/1221-135766-0001.wav \
  $repo/test_wavs/1221-135766-0002.wav
--- a/.github/scripts/run-swbd-conformer-ctc-2023-08-26.sh
+++ b/.github/scripts/run-swbd-conformer-ctc-2023-08-26.sh
@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+  # Timestamped logger (from espnet): prints "<date> (<caller file>:<call line>:<caller function>) <message>".
+  local fname=${BASH_SOURCE[1]##*/}  # caller's source file with the directory part stripped
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/swbd/ASR
+
+repo_url=https://huggingface.co/zrjin/icefall-asr-swbd-conformer-ctc-2023-8-26
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone "$repo_url"
+repo=$(basename "$repo_url")  # git clones into a directory named after the last URL component
+
+# Show what was downloaded, then alias epoch-98.pt as epoch-99.pt, the checkpoint name used below.
+log "Display test files"
+tree "$repo"/
+ls -lh "$repo"/test_wavs/*.wav
+
+pushd "$repo/exp"
+ln -s epoch-98.pt epoch-99.pt
+popd
+
+ls -lh "$repo"/exp/*.pt
+
+for method in ctc-decoding 1best; do
+  log "$method"
+
+  ./conformer_ctc/pretrained.py \
+    --method "$method" \
+    --checkpoint "$repo/exp/epoch-99.pt" \
+    --tokens "$repo/data/lang_bpe_500/tokens.txt" \
+    --words-file "$repo/data/lang_bpe_500/words.txt" \
+    --HLG "$repo/data/lang_bpe_500/HLG.pt" \
+    --G "$repo/data/lm/G_4_gram.pt" \
+    "$repo/test_wavs/1089-134686-0001.wav" \
+    "$repo/test_wavs/1221-135766-0001.wav" \
+    "$repo/test_wavs/1221-135766-0002.wav"
+done
--- a/.github/scripts/run-wenetspeech-pruned-transducer-stateless2.sh
+++ b/.github/scripts/run-wenetspeech-pruned-transducer-stateless2.sh
@ -17,7 +17,6 @@ git lfs install
 git clone $repo_url
 repo=$(basename $repo_url)

-
 log "Display test files"
 tree $repo/
 ls -lh $repo/test_wavs/*.wav
@ -29,12 +28,11 @@ popd

 log "Test exporting to ONNX format"

-./pruned_transducer_stateless2/export.py \
+./pruned_transducer_stateless2/export-onnx.py \
  --exp-dir $repo/exp \
  --lang-dir $repo/data/lang_char \
  --epoch 99 \
-  --avg 1 \
-  --onnx 1
+  --avg 1

 log "Export to torchscript model"

@ -59,19 +57,17 @@ log "Decode with ONNX models"

 ./pruned_transducer_stateless2/onnx_check.py \
  --jit-filename $repo/exp/cpu_jit.pt \
-  --onnx-encoder-filename $repo/exp/encoder.onnx \
-  --onnx-decoder-filename $repo/exp/decoder.onnx \
-  --onnx-joiner-filename $repo/exp/joiner.onnx \
-  --onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
-  --onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
+  --onnx-encoder-filename $repo/exp/encoder-epoch-10-avg-2.onnx \
+  --onnx-decoder-filename $repo/exp/decoder-epoch-10-avg-2.onnx \
+  --onnx-joiner-filename $repo/exp/joiner-epoch-10-avg-2.onnx \
+  --onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj-epoch-10-avg-2.onnx \
+  --onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj-epoch-10-avg-2.onnx

 ./pruned_transducer_stateless2/onnx_pretrained.py \
  --tokens $repo/data/lang_char/tokens.txt \
-  --encoder-model-filename $repo/exp/encoder.onnx \
-  --decoder-model-filename $repo/exp/decoder.onnx \
-  --joiner-model-filename $repo/exp/joiner.onnx \
-  --joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
-  --joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
+  --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+  --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+  --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
  $repo/test_wavs/DEV_T0000000000.wav \
  $repo/test_wavs/DEV_T0000000001.wav \
  $repo/test_wavs/DEV_T0000000002.wav
@ -104,9 +100,9 @@ for sym in 1 2 3; do
    --lang-dir $repo/data/lang_char \
    --decoding-method greedy_search \
    --max-sym-per-frame $sym \
-  $repo/test_wavs/DEV_T0000000000.wav \
-  $repo/test_wavs/DEV_T0000000001.wav \
-  $repo/test_wavs/DEV_T0000000002.wav
+    $repo/test_wavs/DEV_T0000000000.wav \
+    $repo/test_wavs/DEV_T0000000001.wav \
+    $repo/test_wavs/DEV_T0000000002.wav
 done

 for method in modified_beam_search beam_search fast_beam_search; do
@ -117,7 +113,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
    --beam-size 4 \
    --checkpoint $repo/exp/epoch-99.pt \
    --lang-dir $repo/data/lang_char \
-  $repo/test_wavs/DEV_T0000000000.wav \
-  $repo/test_wavs/DEV_T0000000001.wav \
-  $repo/test_wavs/DEV_T0000000002.wav
+    $repo/test_wavs/DEV_T0000000000.wav \
+    $repo/test_wavs/DEV_T0000000001.wav \
+    $repo/test_wavs/DEV_T0000000002.wav
 done
--- a/.github/scripts/test-ncnn-export.sh
+++ b/.github/scripts/test-ncnn-export.sh
@ -45,7 +45,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
 repo=$(basename $repo_url)

 pushd $repo
-git lfs pull --include "data/lang_bpe_500/bpe.model"
 git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"

 cd exp
@ -56,11 +55,10 @@ log "Export via torch.jit.trace()"

 ./conv_emformer_transducer_stateless2/export-for-ncnn.py \
  --exp-dir $repo/exp \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
  --epoch 99 \
  --avg 1 \
  --use-averaged-model 0 \
-  \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --num-encoder-layers 12 \
  --chunk-length 32 \
  --cnn-module-kernel 31 \
@ -91,7 +89,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
 repo=$(basename $repo_url)

 pushd $repo
-git lfs pull --include "data/lang_bpe_500/bpe.model"
 git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"

 cd exp
@ -102,7 +99,7 @@ log "Export via torch.jit.trace()"

 ./lstm_transducer_stateless2/export-for-ncnn.py \
  --exp-dir $repo/exp \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --use-averaged-model 0
@ -140,7 +137,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
 repo=$(basename $repo_url)

 pushd $repo
-git lfs pull --include "data/lang_bpe_500/bpe.model"
 git lfs pull --include "exp/pretrained.pt"

 cd exp
@ -148,7 +144,7 @@ ln -s pretrained.pt epoch-99.pt
 popd

 ./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --exp-dir $repo/exp \
  --use-averaged-model 0 \
  --epoch 99 \
@ -199,7 +195,7 @@ ln -s pretrained.pt epoch-9999.pt
 popd

 ./pruned_transducer_stateless7_streaming/export-for-ncnn-zh.py \
-  --lang-dir $repo/data/lang_char_bpe \
+  --tokens $repo/data/lang_char_bpe/tokens.txt \
  --exp-dir $repo/exp \
  --use-averaged-model 0 \
  --epoch 9999 \
--- a/.github/scripts/test-onnx-export.sh
+++ b/.github/scripts/test-onnx-export.sh
@ -10,7 +10,123 @@ log() {

 cd egs/librispeech/ASR

+log "=========================================================================="
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)

+pushd $repo
+git lfs pull --include "exp/pretrained.pt"
+cd exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+log "Export via torch.jit.script()"
+./zipformer/export.py \
+  --exp-dir $repo/exp \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  --epoch 99 \
+  --avg 1 \
+  --jit 1
+
+log "Test export to ONNX format"
+./zipformer/export-onnx.py \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  --use-averaged-model 0 \
+  --epoch 99 \
+  --avg 1 \
+  --exp-dir $repo/exp \
+  --num-encoder-layers "2,2,3,4,3,2" \
+  --downsampling-factor "1,2,4,8,4,2" \
+  --feedforward-dim "512,768,1024,1536,1024,768" \
+  --num-heads "4,4,4,8,4,4" \
+  --encoder-dim "192,256,384,512,384,256" \
+  --query-head-dim 32 \
+  --value-head-dim 12 \
+  --pos-head-dim 4 \
+  --pos-dim 48 \
+  --encoder-unmasked-dim "192,192,256,256,256,192" \
+  --cnn-module-kernel "31,31,15,15,15,31" \
+  --decoder-dim 512 \
+  --joiner-dim 512 \
+  --causal False \
+  --chunk-size "16,32,64,-1" \
+  --left-context-frames "64,128,256,-1"
+
+ls -lh $repo/exp
+
+log "Run onnx_check.py"
+
+./zipformer/onnx_check.py \
+  --jit-filename $repo/exp/jit_script.pt \
+  --onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+  --onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+  --onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
+
+log "Run onnx_pretrained.py"
+
+./zipformer/onnx_pretrained.py \
+  --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+  --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+  --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  $repo/test_wavs/1089-134686-0001.wav
+
+rm -rf $repo
+
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "exp/pretrained.pt"
+
+cd exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+log "Test export streaming model to ONNX format"
+./zipformer/export-onnx-streaming.py \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  --use-averaged-model 0 \
+  --epoch 99 \
+  --avg 1 \
+  --exp-dir $repo/exp \
+  --num-encoder-layers "2,2,3,4,3,2" \
+  --downsampling-factor "1,2,4,8,4,2" \
+  --feedforward-dim "512,768,1024,1536,1024,768" \
+  --num-heads "4,4,4,8,4,4" \
+  --encoder-dim "192,256,384,512,384,256" \
+  --query-head-dim 32 \
+  --value-head-dim 12 \
+  --pos-head-dim 4 \
+  --pos-dim 48 \
+  --encoder-unmasked-dim "192,192,256,256,256,192" \
+  --cnn-module-kernel "31,31,15,15,15,31" \
+  --decoder-dim 512 \
+  --joiner-dim 512 \
+  --causal True \
+  --chunk-size 16 \
+  --left-context-frames 64
+
+ls -lh $repo/exp
+
+log "Run onnx_pretrained-streaming.py"
+
+./zipformer/onnx_pretrained-streaming.py \
+  --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1-chunk-16-left-64.onnx \
+  --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1-chunk-16-left-64.onnx \
+  --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1-chunk-16-left-64.onnx \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  $repo/test_wavs/1089-134686-0001.wav
+
+rm -rf $repo
+
+log "--------------------------------------------------------------------------"

 log "=========================================================================="
 repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
@ -39,7 +155,7 @@ log "Export via torch.jit.trace()"
 log "Test exporting to ONNX format"

 ./pruned_transducer_stateless7_streaming/export-onnx.py \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --use-averaged-model 0 \
  --epoch 99 \
  --avg 1 \
@ -88,7 +204,7 @@ popd
 log "Export via torch.jit.script()"

 ./pruned_transducer_stateless3/export.py \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 9999 \
  --avg 1 \
  --exp-dir $repo/exp/ \
@ -97,7 +213,7 @@ log "Export via torch.jit.script()"
 log "Test exporting to ONNX format"

 ./pruned_transducer_stateless3/export-onnx.py \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 9999 \
  --avg 1 \
  --exp-dir $repo/exp/
@ -126,7 +242,6 @@ log "Run onnx_pretrained.py"
 rm -rf $repo
 log "--------------------------------------------------------------------------"

-
 log "=========================================================================="
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13
 GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
@ -143,7 +258,7 @@ popd
 log "Export via torch.jit.script()"

 ./pruned_transducer_stateless5/export.py \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --use-averaged-model 0 \
@ -159,7 +274,7 @@ log "Export via torch.jit.script()"
 log "Test exporting to ONNX format"

 ./pruned_transducer_stateless5/export-onnx.py \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --use-averaged-model 0 \
@ -205,7 +320,6 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
 repo=$(basename $repo_url)

 pushd $repo
-git lfs pull --include "data/lang_bpe_500/bpe.model"
 git lfs pull --include "exp/pretrained.pt"

 cd exp
@ -215,7 +329,7 @@ popd
 log "Export via torch.jit.script()"

 ./pruned_transducer_stateless7/export.py \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --use-averaged-model 0 \
  --epoch 99 \
  --avg 1 \
@ -226,7 +340,7 @@ log "Export via torch.jit.script()"
 log "Test exporting to ONNX format"

 ./pruned_transducer_stateless7/export-onnx.py \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --use-averaged-model 0 \
  --epoch 99 \
  --avg 1 \
@ -270,7 +384,7 @@ popd
 log "Test exporting to ONNX format"

 ./conv_emformer_transducer_stateless2/export-onnx.py \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --use-averaged-model 0 \
  --epoch 99 \
  --avg 1 \
@ -310,7 +424,7 @@ popd
 log "Export via torch.jit.trace()"

 ./lstm_transducer_stateless2/export.py \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --use-averaged-model 0 \
  --epoch 99 \
  --avg 1 \
@ -320,7 +434,7 @@ log "Export via torch.jit.trace()"
 log "Test exporting to ONNX format"

 ./lstm_transducer_stateless2/export-onnx.py \
-  --bpe-model $repo/data/lang_bpe_500/bpe.model \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
  --use-averaged-model 0 \
  --epoch 99 \
  --avg 1 \
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@ -0,0 +1,52 @@
+# see also
+# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages
+name: Build docker image
+on:
+  workflow_dispatch:
+
+concurrency:
+  group: build_docker-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-docker-image:  # one job per matrix image: build docker/<image>.dockerfile and push it as k2fsa/icefall:<image>
+    name: ${{ matrix.image }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false  # keep building the other images when one of them fails
+      matrix:
+        os: [ubuntu-latest]
+        image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+
+    steps:
+      # refer to https://github.com/actions/checkout
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0  # 0 = fetch the complete history instead of a shallow clone
+
+      - name: Rename  # move the selected dockerfile to ./Dockerfile, the path given to the build step
+        shell: bash
+        run: |
+          image=${{ matrix.image }}
+          mv -v ./docker/$image.dockerfile ./Dockerfile
+
+      - name: Free space  # delete /opt/hostedtoolcache and print disk usage before and after
+        shell: bash
+        run: |
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+
+      - name: Log in to Docker Hub  # credentials come from repository secrets
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: k2fsa/icefall:${{ matrix.image }}
--- a/.github/workflows/run-aishell-2022-06-20.yml
+++ b/.github/workflows/run-aishell-2022-06-20.yml
@ -45,7 +45,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-aishell-zipformer-2023-10-24.yml
+++ b/.github/workflows/run-aishell-zipformer-2023-10-24.yml
@ -0,0 +1,95 @@
+# Copyright      2023  Zengrui Jin (Xiaomi Corp.)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-aishell-zipformer-2023-10-24
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [labeled]
+
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 15:50 UTC time every day
+    - cron: "50 15 * * *"
+
+concurrency:
+  group: run_aishell_zipformer_2023_10_24-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  run_aishell_zipformer_2023_10_24:  # runs on push and schedule events, or when the event label is ready, zipformer or run-decode
+    if: github.event.label.name == 'ready' || github.event.label.name == 'zipformer' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: [3.8]
+
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0  # 0 = fetch the complete history instead of a shallow clone
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'
+          cache-dependency-path: '**/requirements-ci.txt'
+
+      - name: Install Python dependencies  # installs requirements-ci.txt entries, then replaces protobuf with a 3.20.* source build
+        run: |
+          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf protobuf==3.20.*
+
+      - name: Cache kaldifeat
+        id: my-cache  # referenced by the next step's `if` to detect a cache hit
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/tmp/kaldifeat
+          key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+      - name: Install kaldifeat  # skipped when the cache step restored ~/tmp/kaldifeat
+        if: steps.my-cache.outputs.cache-hit != 'true'
+        shell: bash
+        run: |
+          .github/scripts/install-kaldifeat.sh
+
+      - name: Inference with pre-trained model  # puts the repo and the kaldifeat build on PYTHONPATH, then runs the test script
+        shell: bash
+        env:
+          GITHUB_EVENT_NAME: ${{ github.event_name }}
+          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+        run: |
+          sudo apt-get -qq install git-lfs tree
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+          .github/scripts/run-aishell-zipformer-2023-10-24.sh
+
+      
--- a/.github/workflows/run-docker-image.yml
+++ b/.github/workflows/run-docker-image.yml
@ -0,0 +1,105 @@
+name: Run docker image
+on:
+  workflow_dispatch:
+
+concurrency:
+  group: run_docker_image-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  run-docker-image:
+    name: ${{ matrix.image }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+    steps:
+      # refer to https://github.com/actions/checkout
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Run the build process with Docker  # smoke test: print CUDA/torch/k2/lhotse/kaldifeat info, then run the yesno recipe
+        uses: addnab/docker-run-action@v3
+        with:
+            image: k2fsa/icefall:${{ matrix.image }}
+            shell: bash
+            run: |
+              uname -a
+              cat /etc/*release
+
+              find / -name 'libcuda*' 2>/dev/null  # pattern quoted so find, not the shell, expands it
+
+              ls -lh /usr/local/
+              ls -lh /usr/local/cuda*
+
+              nvcc --version
+
+              ls -lh /usr/local/cuda-*/compat/*
+              # The compat dir of every supported CUDA version is prepended; no check is made for which one this image has.
+              # For torch1.9.0-cuda10.2
+              export LD_LIBRARY_PATH=/usr/local/cuda-10.2/compat:$LD_LIBRARY_PATH
+
+              # For torch1.12.1-cuda11.3
+              export LD_LIBRARY_PATH=/usr/local/cuda-11.3/compat:$LD_LIBRARY_PATH
+
+              # For torch2.0.0-cuda11.7
+              export LD_LIBRARY_PATH=/usr/local/cuda-11.7/compat:$LD_LIBRARY_PATH
+
+              # For torch2.1.0-cuda11.8
+              export LD_LIBRARY_PATH=/usr/local/cuda-11.8/compat:$LD_LIBRARY_PATH
+
+              # For torch2.1.0-cuda12.1
+              export LD_LIBRARY_PATH=/usr/local/cuda-12.1/compat:$LD_LIBRARY_PATH
+
+
+              which nvcc
+              cuda_dir=$(dirname "$(which nvcc)")
+              echo "cuda_dir: $cuda_dir"
+
+              find "$cuda_dir" -name 'libcuda.so*'
+              echo "--------------------"
+
+              find / -name 'libcuda.so*' 2>/dev/null
+
+              # for torch1.13.0-cuda11.6
+              if [ -e /opt/conda/lib/stubs/libcuda.so ]; then
+                cd /opt/conda/lib/stubs && ln -s libcuda.so libcuda.so.1 && cd -
+                export LD_LIBRARY_PATH=/opt/conda/lib/stubs:$LD_LIBRARY_PATH
+              fi
+
+              find / -name 'libcuda.so*' 2>/dev/null
+              echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
+
+              python3 --version
+              which python3
+
+              python3 -m pip list
+
+              echo "----------torch----------"
+              python3 -m torch.utils.collect_env
+
+              echo "----------k2----------"
+              python3 -c "import k2; print(k2.__file__)"
+              python3 -c "import k2; print(k2.__dev_version__)"
+              python3 -m k2.version
+
+              echo "----------lhotse----------"
+              python3 -c "import lhotse; print(lhotse.__file__)"
+              python3 -c "import lhotse; print(lhotse.__version__)"
+
+              echo "----------kaldifeat----------"
+              python3 -c "import kaldifeat; print(kaldifeat.__file__)"
+              python3 -c "import kaldifeat; print(kaldifeat.__version__)"
+
+              echo "Test yesno recipe"
+
+              cd egs/yesno/ASR
+
+              ./prepare.sh
+
+              ./tdnn/train.py
+
+              ./tdnn/decode.py
--- a/.github/workflows/run-gigaspeech-2022-05-13.yml
+++ b/.github/workflows/run-gigaspeech-2022-05-13.yml
@ -44,7 +44,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-gigaspeech-zipformer-2023-10-17.yml
+++ b/.github/workflows/run-gigaspeech-zipformer-2023-10-17.yml
@ -0,0 +1,126 @@
+# Copyright      2022  Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-gigaspeech-zipformer-2023-10-17
+# zipformer
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [labeled]  # pull requests trigger this workflow only when a label is added
+
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 15:50 UTC time every day
+    - cron: "50 15 * * *"
+
+concurrency:
+  group: run_gigaspeech_2023_10_17_zipformer-${{ github.ref }}
+  cancel-in-progress: true  # a newer run for the same ref cancels the one still in flight
+
+jobs:
+  run_gigaspeech_2023_10_17_zipformer:  # gated on the PR label or the event type checked in `if:` below
+    if: github.event.label.name == 'zipformer' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8"]  # quoted so YAML keeps it a string instead of parsing a float
+
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0  # 0 fetches the full history instead of a shallow clone
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'  # turn on setup-python's built-in pip download cache
+          cache-dependency-path: '**/requirements-ci.txt'  # files hashed to build the pip cache key
+
+      - name: Install Python dependencies  # one pip call per requirements line, then pin protobuf 3.20.x
+        run: |
+          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install  # -L 1 = one input line per pip call; the old -n 1 was overridden by -L and only produced a warning
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf 'protobuf==3.20.*'  # quoted so the shell can never glob-expand the version spec
+
+      - name: Cache kaldifeat
+        id: my-cache  # referenced by the next step through steps.my-cache.outputs.cache-hit
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/tmp/kaldifeat
+          key: cache-tmp-${{ matrix.python-version }}-2023-05-22  # key changes only with the Python version; the date suffix is a fixed literal
+
+      - name: Install kaldifeat
+        if: steps.my-cache.outputs.cache-hit != 'true'  # run the install script only when the cache step reported no hit
+        shell: bash
+        run: |
+          .github/scripts/install-kaldifeat.sh
+
+      - name: Inference with pre-trained model
+        shell: bash
+        env:  # event metadata made available to the commands below as environment variables
+          GITHUB_EVENT_NAME: ${{ github.event_name }}
+          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+        run: |
+          mkdir -p egs/gigaspeech/ASR/data
+          ln -sfv ~/tmp/fbank-libri egs/gigaspeech/ASR/data/fbank  # NOTE(review): no step visible in this workflow creates ~/tmp/fbank-libri, so this link may dangle - confirm it is needed
+          ls -lh egs/gigaspeech/ASR/data/*
+
+          sudo apt-get -qq install git-lfs tree
+          export PYTHONPATH=$PWD:$PYTHONPATH  # put the repository root on the Python import path
+          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+          .github/scripts/run-gigaspeech-zipformer-2023-10-17.sh
+
+      - name: Display decoding results for gigaspeech zipformer
+        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'  # only for scheduled runs or PRs labeled run-decode
+        shell: bash
+        run: |
+          cd egs/gigaspeech/ASR/
+          tree ./zipformer/exp
+
+          cd zipformer  # GigaSpeech is scored on its dev/test sets, so the greps below look for those names (not LibriSpeech's test-clean/test-other)
+          echo "results for zipformer"
+          echo "===greedy search==="
+          find exp/greedy_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
+          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
+
+          echo "===fast_beam_search==="
+          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
+          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
+
+          echo "===modified beam search==="
+          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
+          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
+
+      - name: Upload decoding results for gigaspeech zipformer
+        uses: actions/upload-artifact@v2
+        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'  # same gate as the display step
+        with:
+          name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-gigaspeech-zipformer-2023-10-17  # the matrix defines no `torch` key, so it is dropped; suffix now names this recipe and date
+          path: egs/gigaspeech/ASR/zipformer/exp/
--- a/.github/workflows/run-librispeech-2022-03-12.yml
+++ b/.github/workflows/run-librispeech-2022-03-12.yml
@ -44,7 +44,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-librispeech-2022-04-29.yml
+++ b/.github/workflows/run-librispeech-2022-04-29.yml
@ -44,7 +44,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-librispeech-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-2022-05-13.yml
@ -44,7 +44,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-librispeech-2023-01-29-stateless7-ctc-bs.yml
+++ b/.github/workflows/run-librispeech-2023-01-29-stateless7-ctc-bs.yml
@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-name: run-librispeech-2022-12-15-stateless7-ctc-bs
+name: run-librispeech-2023-01-29-stateless7-ctc-bs
 # zipformer

 on:
@ -34,7 +34,7 @@ on:
    - cron: "50 15 * * *"

 jobs:
-  run_librispeech_2022_12_15_zipformer_ctc_bs:
+  run_librispeech_2023_01_29_zipformer_ctc_bs:
    if: github.event.label.name == 'run-decode' || github.event.label.name == 'blank-skip' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
@ -124,7 +124,7 @@ jobs:
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

-          .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh
+          .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh

      - name: Display decoding results for librispeech pruned_transducer_stateless7_ctc_bs
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
@ -159,5 +159,5 @@ jobs:
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-ctc-bs-2022-12-15
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-ctc-bs-2023-01-29
          path: egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/exp/
--- a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
@ -44,7 +44,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml
+++ b/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml
@ -44,7 +44,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
+++ b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
@ -44,7 +44,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-multi-corpora-zipformer.yml
+++ b/.github/workflows/run-multi-corpora-zipformer.yml
@ -0,0 +1,84 @@
+# Copyright      2023   Xiaomi Corp.    (author: Zengrui Jin)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-multi-corpora-zipformer
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [labeled]  # pull requests trigger this workflow only when a label is added
+
+concurrency:
+  group: run_multi-corpora_zipformer-${{ github.ref }}
+  cancel-in-progress: true  # a newer run for the same ref cancels the one still in flight
+
+jobs:
+  run_multi-corpora_zipformer:  # gated on the PR label or the push event checked in `if:` below
+    if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'multi-zh_hans' || github.event.label.name == 'zipformer' || github.event.label.name == 'multi-corpora'
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8"]  # quoted so YAML keeps it a string instead of parsing a float
+
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0  # 0 fetches the full history instead of a shallow clone
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'  # turn on setup-python's built-in pip download cache
+          cache-dependency-path: '**/requirements-ci.txt'  # files hashed to build the pip cache key
+
+      - name: Install Python dependencies  # one pip call per requirements line, then pin protobuf 3.20.x
+        run: |
+          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install  # -L 1 = one input line per pip call; the old -n 1 was overridden by -L and only produced a warning
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf 'protobuf==3.20.*'  # quoted so the shell can never glob-expand the version spec
+
+      - name: Cache kaldifeat
+        id: my-cache  # referenced by the next step through steps.my-cache.outputs.cache-hit
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/tmp/kaldifeat
+          key: cache-tmp-${{ matrix.python-version }}-2023-05-22  # key changes only with the Python version; the date suffix is a fixed literal
+
+      - name: Install kaldifeat
+        if: steps.my-cache.outputs.cache-hit != 'true'  # run the install script only when the cache step reported no hit
+        shell: bash
+        run: |
+          .github/scripts/install-kaldifeat.sh
+
+      - name: Inference with pre-trained model
+        shell: bash
+        env:  # event metadata made available to the commands below as environment variables
+          GITHUB_EVENT_NAME: ${{ github.event_name }}
+          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+        run: |
+          sudo apt-get -qq install git-lfs tree
+          export PYTHONPATH=$PWD:$PYTHONPATH  # put the repository root on the Python import path
+          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+          .github/scripts/run-multi-corpora-zipformer.sh
--- a/.github/workflows/run-pretrained-conformer-ctc.yml
+++ b/.github/workflows/run-pretrained-conformer-ctc.yml
@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-name: run-pre-trained-conformer-ctc
+name: run-pre-trained-ctc

 on:
  push:
@ -23,18 +23,25 @@ on:
  pull_request:
    types: [labeled]

+  workflow_dispatch:
+    inputs:
+      test-run:
+        description: 'Test (y/n)?'
+        required: true
+        default: 'y'
+
 concurrency:
-  group: run_pre_trained_conformer_ctc-${{ github.ref }}
+  group: run_pre_trained_ctc-${{ github.ref }}
  cancel-in-progress: true

 jobs:
-  run_pre_trained_conformer_ctc:
-    if: github.event.label.name == 'ready' || github.event_name == 'push'
+  run_pre_trained_ctc:
+    if: github.event.label.name == 'ready' || github.event_name == 'push' || github.event.inputs.test-run == 'y' || github.event.label.name == 'ctc'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

@ -77,4 +84,4 @@ jobs:
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
-          .github/scripts/run-pre-trained-conformer-ctc.sh
+          .github/scripts/run-pre-trained-ctc.sh
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
@ -43,7 +43,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
@ -43,7 +43,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
@ -34,7 +34,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
@ -34,7 +34,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-pretrained-transducer-stateless.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless.yml
@ -43,7 +43,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-pretrained-transducer.yml
+++ b/.github/workflows/run-pretrained-transducer.yml
@ -34,7 +34,7 @@ jobs:
    strategy:
      matrix:
        os: [ubuntu-latest]
-        python-version: [3.7, 3.8, 3.9]
+        python-version: [3.8]

      fail-fast: false

--- a/.github/workflows/run-swbd-conformer-ctc.yml
+++ b/.github/workflows/run-swbd-conformer-ctc.yml
@ -0,0 +1,84 @@
+# Copyright      2023   Xiaomi Corp.    (author: Zengrui Jin)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-swbd-conformer_ctc
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [labeled]  # pull requests trigger this workflow only when a label is added
+
+concurrency:
+  group: run-swbd-conformer_ctc-${{ github.ref }}
+  cancel-in-progress: true  # a newer run for the same ref cancels the one still in flight
+
+jobs:
+  run-swbd-conformer_ctc:  # gated on the PR label or the push event checked in `if:` below
+    if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'swbd'
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ["3.8"]  # quoted so YAML keeps it a string instead of parsing a float
+
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0  # 0 fetches the full history instead of a shallow clone
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: 'pip'  # turn on setup-python's built-in pip download cache
+          cache-dependency-path: '**/requirements-ci.txt'  # files hashed to build the pip cache key
+
+      - name: Install Python dependencies  # one pip call per requirements line, then pin protobuf 3.20.x
+        run: |
+          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install  # -L 1 = one input line per pip call; the old -n 1 was overridden by -L and only produced a warning
+          pip uninstall -y protobuf
+          pip install --no-binary protobuf 'protobuf==3.20.*'  # quoted so the shell can never glob-expand the version spec
+
+      - name: Cache kaldifeat
+        id: my-cache  # referenced by the next step through steps.my-cache.outputs.cache-hit
+        uses: actions/cache@v2
+        with:
+          path: |
+            ~/tmp/kaldifeat
+          key: cache-tmp-${{ matrix.python-version }}-2023-05-22  # key changes only with the Python version; the date suffix is a fixed literal
+
+      - name: Install kaldifeat
+        if: steps.my-cache.outputs.cache-hit != 'true'  # run the install script only when the cache step reported no hit
+        shell: bash
+        run: |
+          .github/scripts/install-kaldifeat.sh
+
+      - name: Inference with pre-trained model
+        shell: bash
+        env:  # event metadata made available to the commands below as environment variables
+          GITHUB_EVENT_NAME: ${{ github.event_name }}
+          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+        run: |
+          sudo apt-get -qq install git-lfs tree
+          export PYTHONPATH=$PWD:$PYTHONPATH  # put the repository root on the Python import path
+          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+          .github/scripts/run-swbd-conformer-ctc-2023-08-26.sh
--- a/.github/workflows/run-yesno-recipe.yml
+++ b/.github/workflows/run-yesno-recipe.yml
@ -44,11 +44,6 @@ jobs:
        with:
          fetch-depth: 0

-      - name: Install graphviz
-        shell: bash
-        run: |
-          sudo apt-get -qq install graphviz
-
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
@ -65,11 +60,12 @@ jobs:

      - name: Install Python dependencies
        run: |
-          grep -v '^#' ./requirements-ci.txt  | grep -v kaldifst | xargs -n 1 -L 1 pip install
+          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf==3.20.*

-          pip install --no-deps --force-reinstall https://huggingface.co/csukuangfj/k2/resolve/main/cpu/k2-1.24.3.dev20230508+cpu.torch1.13.1-cp38-cp38-linux_x86_64.whl
+          pip install --no-deps --force-reinstall k2==1.24.4.dev20231021+cpu.torch1.13.1 -f https://k2-fsa.github.io/k2/cpu.html
+          pip install kaldifeat==1.25.1.dev20231022+cpu.torch1.13.1 -f https://csukuangfj.github.io/kaldifeat/cpu.html

      - name: Run yesno recipe
        shell: bash
@ -78,9 +74,112 @@ jobs:
          export PYTHONPATH=$PWD:$PYTHONPATH
          echo $PYTHONPATH

-
          cd egs/yesno/ASR
          ./prepare.sh
          python3 ./tdnn/train.py
          python3 ./tdnn/decode.py
-          # TODO: Check that the WER is less than some value
+
+      - name: Test exporting to pretrained.pt
+        shell: bash
+        working-directory: ${{github.workspace}}
+        run: |
+          export PYTHONPATH=$PWD:$PYTHONPATH  # put the repository root on the Python import path
+          echo $PYTHONPATH
+
+          cd egs/yesno/ASR
+          python3 ./tdnn/export.py --epoch 14 --avg 2  # presumably writes ./tdnn/exp/pretrained.pt, which is loaded below - confirm
+
+          python3 ./tdnn/pretrained.py \
+            --checkpoint ./tdnn/exp/pretrained.pt \
+            --HLG ./data/lang_phone/HLG.pt \
+            --words-file ./data/lang_phone/words.txt \
+            download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+            download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+      - name: Test exporting to torchscript
+        shell: bash
+        working-directory: ${{github.workspace}}
+        run: |
+          export PYTHONPATH=$PWD:$PYTHONPATH  # put the repository root on the Python import path
+          echo $PYTHONPATH
+
+          cd egs/yesno/ASR
+          python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1  # presumably writes ./tdnn/exp/cpu_jit.pt, which is loaded below - confirm
+
+          python3 ./tdnn/jit_pretrained.py \
+            --nn-model ./tdnn/exp/cpu_jit.pt \
+            --HLG ./data/lang_phone/HLG.pt \
+            --words-file ./data/lang_phone/words.txt \
+            download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+            download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+      - name: Test exporting to onnx
+        shell: bash
+        working-directory: ${{github.workspace}}
+        run: |
+          export PYTHONPATH=$PWD:$PYTHONPATH  # put the repository root on the Python import path
+          echo $PYTHONPATH
+
+          cd egs/yesno/ASR
+          python3 ./tdnn/export_onnx.py --epoch 14 --avg 2  # presumably writes both the float32 and the int8 .onnx files used below - confirm
+
+          echo "Test float32 model"
+          python3 ./tdnn/onnx_pretrained.py \
+            --nn-model ./tdnn/exp/model-epoch-14-avg-2.onnx \
+            --HLG ./data/lang_phone/HLG.pt \
+            --words-file ./data/lang_phone/words.txt \
+            download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+            download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+
+          echo "Test int8 model"
+          python3 ./tdnn/onnx_pretrained.py \
+            --nn-model ./tdnn/exp/model-epoch-14-avg-2.int8.onnx \
+            --HLG ./data/lang_phone/HLG.pt \
+            --words-file ./data/lang_phone/words.txt \
+            download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+            download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+      - name: Test decoding with H
+        shell: bash
+        working-directory: ${{github.workspace}}
+        run: |
+          export PYTHONPATH=$PWD:$PYTHONPATH  # put the repository root on the Python import path
+          echo $PYTHONPATH
+
+          cd egs/yesno/ASR
+          python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1  # NOTE(review): same --jit export command as the torchscript step; repeated so this step does not depend on it
+
+          python3 ./tdnn/jit_pretrained_decode_with_H.py \
+              --nn-model ./tdnn/exp/cpu_jit.pt \
+              --H ./data/lang_phone/H.fst \
+              --tokens ./data/lang_phone/tokens.txt \
+              ./download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+              ./download/waves_yesno/0_0_1_0_0_0_1_0.wav \
+              ./download/waves_yesno/0_0_1_0_0_1_1_1.wav
+
+      - name: Test decoding with HL
+        shell: bash
+        working-directory: ${{github.workspace}}
+        run: |
+          export PYTHONPATH=$PWD:$PYTHONPATH  # put the repository root on the Python import path
+          echo $PYTHONPATH
+
+          cd egs/yesno/ASR
+          python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1  # NOTE(review): same --jit export command as the torchscript step; repeated so this step does not depend on it
+
+          python3 ./tdnn/jit_pretrained_decode_with_HL.py \
+              --nn-model ./tdnn/exp/cpu_jit.pt \
+              --HL ./data/lang_phone/HL.fst \
+              --words ./data/lang_phone/words.txt \
+              ./download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+              ./download/waves_yesno/0_0_1_0_0_0_1_0.wav \
+              ./download/waves_yesno/0_0_1_0_0_1_1_1.wav
+
+      - name: Show generated files
+        shell: bash
+        working-directory: ${{github.workspace}}
+        run: |
+          cd egs/yesno/ASR
+          ls -lh tdnn/exp  # list the experiment directory the export steps above point at
+          ls -lh data/lang_phone  # list the lang directory holding the HLG.pt / H.fst / HL.fst paths used above
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -35,9 +35,9 @@ jobs:
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.8"]
-        torch: ["1.10.0"]
-        torchaudio: ["0.10.0"]
-        k2-version: ["1.23.2.dev20221201"]
+        torch: ["1.13.0"]
+        torchaudio: ["0.13.0"]
+        k2-version: ["1.24.3.dev20230719"]

      fail-fast: false

@ -66,14 +66,14 @@ jobs:
          pip install torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html

-          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
+          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html
          pip install git+https://github.com/lhotse-speech/lhotse
          # icefall requirements
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf==3.20.*

          pip install kaldifst
-          pip install onnxruntime
+          pip install onnxruntime matplotlib
          pip install -r requirements.txt

      - name: Install graphviz
@ -83,13 +83,6 @@ jobs:
          python3 -m pip install -qq graphviz
          sudo apt-get -qq install graphviz

-      - name: Install graphviz
-        if: startsWith(matrix.os, 'macos')
-        shell: bash
-        run: |
-          python3 -m pip install -qq graphviz
-          brew install -q graphviz
-
      - name: Run tests
        if: startsWith(matrix.os, 'ubuntu')
        run: |
@ -129,40 +122,10 @@ jobs:
          cd ../transducer_lstm
          pytest -v -s

-      - name: Run tests
-        if: startsWith(matrix.os, 'macos')
-        run: |
-          ls -lh
-          export PYTHONPATH=$PWD:$PWD/lhotse:$PYTHONPATH
-          lib_path=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")
-          echo "lib_path: $lib_path"
-          export DYLD_LIBRARY_PATH=$lib_path:$DYLD_LIBRARY_PATH
-          pytest -v -s ./test
-
-          # run tests for conformer ctc
-          cd egs/librispeech/ASR/conformer_ctc
+          cd ../zipformer
          pytest -v -s

-          cd ../pruned_transducer_stateless
-          pytest -v -s
-
-          cd ../pruned_transducer_stateless2
-          pytest -v -s
-
-          cd ../pruned_transducer_stateless3
-          pytest -v -s
-
-          cd ../pruned_transducer_stateless4
-          pytest -v -s
-
-          cd ../transducer_stateless
-          pytest -v -s
-
-          # cd ../transducer
-          # pytest -v -s
-
-          cd ../transducer_stateless2
-          pytest -v -s
-
-          cd ../transducer_lstm
-          pytest -v -s
+      - uses: actions/upload-artifact@v2
+        with:
+          path: egs/librispeech/ASR/zipformer/swoosh.pdf  # NOTE(review): presumably produced by the zipformer pytest run in the preceding step - confirm
+          name: swoosh.pdf
--- a/.gitignore
+++ b/.gitignore
@ -34,3 +34,5 @@ node_modules
 *.param
 *.bin
 .DS_Store
+*.fst
+*.arpa
--- a/README.md
+++ b/README.md
@ -29,6 +29,7 @@ We provide the following recipes:
  - [yesno][yesno]
  - [LibriSpeech][librispeech]
  - [GigaSpeech][gigaspeech]
+  - [AMI][ami]
  - [Aishell][aishell]
  - [Aishell2][aishell2]
  - [Aishell4][aishell4]
@ -37,6 +38,7 @@ We provide the following recipes:
  - [Aidatatang_200zh][aidatatang_200zh]
  - [WenetSpeech][wenetspeech]
  - [Alimeeting][alimeeting]
+  - [Switchboard][swbd]
  - [TAL_CSASR][tal_csasr]

 ### yesno
@ -116,11 +118,12 @@ We provide a Colab notebook to run a pre-trained transducer conformer + stateles

 #### k2 pruned RNN-T

-| Encoder         | Params | test-clean | test-other |
-|-----------------|--------|------------|------------|
-| zipformer       | 65.5M  | 2.21       | 4.91       |
-| zipformer-small | 23.2M  | 2.46       | 5.83       |
-| zipformer-large | 148.4M | 2.11       | 4.77       |
+| Encoder         | Params | test-clean | test-other | epochs  | devices    |
+|-----------------|--------|------------|------------|---------|------------|
+| zipformer       | 65.5M  | 2.21       | 4.79       | 50      | 4 32G-V100 |
+| zipformer-small | 23.2M  | 2.42       | 5.73       | 50      | 2 32G-V100 |
+| zipformer-large | 148.4M | 2.06       | 4.63       | 50      | 4 32G-V100 |
+| zipformer-large | 148.4M | 2.00       | 4.38       | 174     | 8 80G-A100 |

 Note: No auxiliary losses are used in the training and no LMs are used
 in the decoding.
@ -146,8 +149,11 @@ in the decoding.

 ### GigaSpeech

-We provide two models for this recipe: [Conformer CTC model][GigaSpeech_conformer_ctc]
-and [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][GigaSpeech_pruned_transducer_stateless2].
+We provide three models for this recipe:
+
+- [Conformer CTC model][GigaSpeech_conformer_ctc]
+- [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][GigaSpeech_pruned_transducer_stateless2]
+- [Transducer: Zipformer encoder + Embedding decoder][GigaSpeech_zipformer]

 #### Conformer CTC

@ -163,6 +169,14 @@ and [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned R
 |   fast beam search   | 10.50 | 10.69 |
 | modified beam search | 10.40 | 10.51 |

+#### Transducer: Zipformer encoder + Embedding decoder
+
+|                      |  Dev  | Test  |
+|----------------------|-------|-------|
+|    greedy search     | 10.31 | 10.50 |
+|   fast beam search   | 10.26 | 10.48 |
+| modified beam search | 10.25 | 10.38 |
+

 ### Aishell

@ -338,7 +352,7 @@ We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder

 #### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss

-The best results for Chinese CER(%) and English WER(%) respectivly (zh: Chinese, en: English):
+The best results for Chinese CER(%) and English WER(%) respectively (zh: Chinese, en: English):
 |decoding-method | dev | dev_zh | dev_en | test | test_zh | test_en |
 |--|--|--|--|--|--|--|
 |greedy_search| 7.30 | 6.48 | 19.19 |7.39| 6.66 | 19.13|
@ -353,7 +367,7 @@ Once you have trained a model in icefall, you may want to deploy it with C++,
 without Python dependencies.

 Please refer to the documentation
-<https://icefall.readthedocs.io/en/latest/recipes/librispeech/conformer_ctc.html#deployment-with-c>
+<https://icefall.readthedocs.io/en/latest/recipes/Non-streaming-ASR/librispeech/conformer_ctc.html#deployment-with-c>
 for how to do this.

 We also provide a Colab notebook, showing you how to run a torch scripted model in [k2][k2] with C++.
@ -376,6 +390,7 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
 [TED-LIUM3_pruned_transducer_stateless]: egs/tedlium3/ASR/pruned_transducer_stateless
 [GigaSpeech_conformer_ctc]: egs/gigaspeech/ASR/conformer_ctc
 [GigaSpeech_pruned_transducer_stateless2]: egs/gigaspeech/ASR/pruned_transducer_stateless2
+[GigaSpeech_zipformer]: egs/gigaspeech/ASR/zipformer
 [Aidatatang_200zh_pruned_transducer_stateless2]: egs/aidatatang_200zh/ASR/pruned_transducer_stateless2
 [WenetSpeech_pruned_transducer_stateless2]: egs/wenetspeech/ASR/pruned_transducer_stateless2
 [WenetSpeech_pruned_transducer_stateless5]: egs/wenetspeech/ASR/pruned_transducer_stateless5
@ -393,4 +408,6 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
 [wenetspeech]: egs/wenetspeech/ASR
 [alimeeting]: egs/alimeeting/ASR
 [tal_csasr]: egs/tal_csasr/ASR
+[ami]: egs/ami
+[swbd]: egs/swbd/ASR
 [k2]: https://github.com/k2-fsa/k2
--- a/contributing.md
+++ b/contributing.md
@ -1,39 +1,37 @@
+# Contributing to Our Project

-## Pre-commit hooks
+Thank you for your interest in contributing to our project! We use Git pre-commit hooks to ensure code quality and consistency. Before contributing, please follow these guidelines to enable and use the pre-commit hooks.

-We use [git][git] [pre-commit][pre-commit] [hooks][hooks] to check that files
-going to be committed:
+## Pre-Commit Hooks

-  - contain no trailing spaces
-  - are formatted with [black][black]
-  - are compatible to [PEP8][PEP8] (checked by [flake8][flake8])
-  - end in a newline and only a newline
-  - contain sorted `imports` (checked by [isort][isort])
+We have set up pre-commit hooks to check that the files you're committing meet our coding and formatting standards. These checks include:

-These hooks are disabled by default. Please use the following commands to enable them:
+- Ensuring there are no trailing spaces.
+- Formatting code with [black](https://github.com/psf/black).
+- Checking compliance with PEP8 using [flake8](https://flake8.pycqa.org/).
+- Verifying that files end with a newline character (and only a newline).
+- Sorting imports using [isort](https://pycqa.github.io/isort/).

-```bash
-pip install pre-commit  # run it only once
-pre-commit install      # run it only once, it will install all hooks
+Please note that these hooks are disabled by default. To enable them, follow these steps:

-# modify some files
-git add <some files>
-git commit              # It runs all hooks automatically.
+### Installation (Run only once)

-# If all hooks run successfully, you can write the commit message now. Done!
-#
-# If any hook failed, your commit was not successful.
-# Please read the error messages and make changes accordingly.
-# And rerun
+1. Install the `pre-commit` package using pip:
+   ```bash
+   pip install pre-commit
+   ```
+1. Install the Git hooks using:
+   ```bash
+   pre-commit install
+   ```
+### Making a Commit
+Once you have enabled the pre-commit hooks, follow these steps when making a commit:
+1. Make your changes to the codebase.
+2. Stage your changes by using `git add` for the files you modified.
+3. Commit your changes using `git commit`. The pre-commit hooks will run automatically at this point.
+4. If all hooks run successfully, you can write your commit message, and your changes will be successfully committed.
+5. If any hook fails, your commit will not be successful. Please read and follow the error messages provided, make the necessary changes, and then re-run `git add` and `git commit`.

-git add <some files>
-git commit
-```
+### Your Contribution
+Your contributions are valuable to us, and by following these guidelines, you help maintain code consistency and quality in our project. We appreciate your dedication to ensuring high-quality code. If you have questions or need assistance, feel free to reach out to us. Thank you for being part of our open-source community!

-[git]: https://git-scm.com/book/en/v2/Customizing-Git-Git-Hooks
-[flake8]: https://github.com/PyCQA/flake8
-[PEP8]: https://www.python.org/dev/peps/pep-0008/
-[black]: https://github.com/psf/black
-[hooks]: https://github.com/pre-commit/pre-commit-hooks
-[pre-commit]: https://github.com/pre-commit/pre-commit
-[isort]: https://github.com/PyCQA/isort
--- a/docker/README.md
+++ b/docker/README.md
@ -1,5 +1,20 @@
 # icefall dockerfile

+## Download from dockerhub
+
+You can find pre-built docker images for icefall at the following address:
+
+  <https://hub.docker.com/r/k2fsa/icefall/tags>
+
+Example usage:
+
+```bash
+docker run --gpus all --rm -it  k2fsa/icefall:torch1.13.0-cuda11.6 /bin/bash
+```
+
+
+## Build from dockerfile
+
 2 sets of configuration are provided - (a) Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8, and (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8.

 If your NVIDIA driver supports CUDA Version: 11.3, please go for case (a) Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8.
--- a/docker/torch1.12.1-cuda11.3.dockerfile
+++ b/docker/torch1.12.1-cuda11.3.dockerfile
@ -0,0 +1,70 @@
+FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.7
+ARG K2_VERSION="1.24.4.dev20230725+cuda11.3.torch1.12.1"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.3.torch1.12.1"
+ARG TORCHAUDIO_VERSION="0.12.1+cu113"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies ('>=' specifiers are quoted so /bin/sh does not parse '>' as an output redirection)
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch1.13.0-cuda11.6.dockerfile
+++ b/docker/torch1.13.0-cuda11.6.dockerfile
@ -0,0 +1,72 @@
+FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.9
+ARG K2_VERSION="1.24.4.dev20231021+cuda11.6.torch1.13.0"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.6.torch1.13.0"
+ARG TORCHAUDIO_VERSION="0.13.0+cu116"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies ('>=' specifiers are quoted so /bin/sh does not parse '>' as an output redirection)
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+ENV LD_LIBRARY_PATH /opt/conda/lib/stubs:$LD_LIBRARY_PATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch1.9.0-cuda10.2.dockerfile
+++ b/docker/torch1.9.0-cuda10.2.dockerfile
@ -0,0 +1,86 @@
+FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.7
+ARG K2_VERSION="1.24.3.dev20230726+cuda10.2.torch1.9.0"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda10.2.torch1.9.0"
+ARG TORCHAUDIO_VERSION="0.9.0"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+# see https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/
+
+RUN rm /etc/apt/sources.list.d/cuda.list && \
+	rm /etc/apt/sources.list.d/nvidia-ml.list && \
+	apt-key del 7fa2af80
+
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb && \
+    rm -v cuda-keyring_1.0-1_all.deb && \
+    apt-get update && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install dependencies ('>=' specifiers are quoted so /bin/sh does not parse '>' as an output redirection)
+RUN pip uninstall -y tqdm && \
+    pip install -U --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz \
+      "tqdm>=4.63.0"
+
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch2.0.0-cuda11.7.dockerfile
+++ b/docker/torch2.0.0-cuda11.7.dockerfile
@ -0,0 +1,70 @@
+FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20231021+cuda11.7.torch2.0.0"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.7.torch2.0.0"
+ARG TORCHAUDIO_VERSION="2.0.0+cu117"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies ('>=' specifiers are quoted so /bin/sh does not parse '>' as an output redirection)
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch2.1.0-cuda11.8.dockerfile
+++ b/docker/torch2.1.0-cuda11.8.dockerfile
@ -0,0 +1,70 @@
+FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20231021+cuda11.8.torch2.1.0"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.8.torch2.1.0"
+ARG TORCHAUDIO_VERSION="2.1.0+cu118"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies ('>=' specifiers are quoted so /bin/sh does not parse '>' as an output redirection)
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch2.1.0-cuda12.1.dockerfile
+++ b/docker/torch2.1.0-cuda12.1.dockerfile
@ -0,0 +1,70 @@
+FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20231021+cuda12.1.torch2.1.0"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda12.1.torch2.1.0"
+ARG TORCHAUDIO_VERSION="2.1.0+cu121"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies ('>=' specifiers are quoted so /bin/sh does not parse '>' as an output redirection)
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -86,7 +86,16 @@ rst_epilog = """
 .. _git-lfs: https://git-lfs.com/
 .. _ncnn: https://github.com/tencent/ncnn
 .. _LibriSpeech: https://www.openslr.org/12
+.. _Gigaspeech: https://github.com/SpeechColab/GigaSpeech
 .. _musan: http://www.openslr.org/17/
 .. _ONNX: https://github.com/onnx/onnx
 .. _onnxruntime: https://github.com/microsoft/onnxruntime
+.. _torch: https://github.com/pytorch/pytorch
+.. _torchaudio: https://github.com/pytorch/audio
+.. _k2: https://github.com/k2-fsa/k2
+.. _lhotse: https://github.com/lhotse-speech/lhotse
+.. _yesno: https://www.openslr.org/1/
+.. _Next-gen Kaldi: https://github.com/k2-fsa
+.. _Kaldi: https://github.com/kaldi-asr/kaldi
+.. _lilcom: https://github.com/danpovey/lilcom
 """
--- a/docs/source/contributing/code-style.rst
+++ b/docs/source/contributing/code-style.rst
@ -38,7 +38,7 @@ Please fix any issues reported by the check tools.
 .. HINT::

  Some of the check tools, i.e., ``black`` and ``isort`` will modify
-  the files to be commited **in-place**. So please run ``git status``
+  the files to be committed **in-place**. So please run ``git status``
  after failure to see which file has been modified by the tools
  before you make any further changes.

--- a/docs/source/contributing/how-to-create-a-recipe.rst
+++ b/docs/source/contributing/how-to-create-a-recipe.rst
@ -3,7 +3,7 @@ How to create a recipe

 .. HINT::

-  Please read :ref:`follow the code style` to adjust your code sytle.
+  Please read :ref:`follow the code style` to adjust your code style.

 .. CAUTION::

--- a/docs/source/decoding-with-langugage-models/LODR.rst
+++ b/docs/source/decoding-with-langugage-models/LODR.rst
@ -0,0 +1,187 @@
+.. _LODR:
+
+LODR for RNN Transducer
+=======================
+
+
+As a type of E2E model, neural transducers are usually considered as having an internal
+language model, which learns the language level information on the training corpus.
+In real-life scenarios, there is often a mismatch between the training corpus and the target corpus.
+This mismatch can be a problem when decoding neural transducer models with external language models, as the internal
+language model can act "against" the external LM. In this tutorial, we show how to use
+`Low-order Density Ratio <https://arxiv.org/abs/2203.16776>`_ to alleviate this effect and further improve the performance
+of language model integration.
+
+.. note::
+
+    This tutorial is based on the recipe
+    `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
+    which is a streaming transducer model trained on `LibriSpeech`_.
+    However, you can easily apply LODR to other recipes.
+    If you encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`__.
+
+
+.. note::
+
+    For simplicity, the training and testing corpus in this tutorial are the same (`LibriSpeech`_). However,
+    you can change the testing set to any other domain (e.g., `GigaSpeech`_) and prepare the language models
+    using that corpus.
+
+First, let's have a look at some background information. As the predecessor of LODR, Density Ratio (DR) was first proposed `here <https://arxiv.org/abs/2002.11268>`_
+to address the language information mismatch between the training
+corpus (source domain) and the testing corpus (target domain). Assuming that the source domain and the test domain
+are acoustically similar, DR derives the following formula for decoding with Bayes' theorem:
+
+.. math::
+
+    \text{score}\left(y_u|\mathit{x},y\right) =
+    \log p\left(y_u|\mathit{x},y_{1:u-1}\right) +
+    \lambda_1 \log p_{\text{Target LM}}\left(y_u|\mathit{x},y_{1:u-1}\right) -
+    \lambda_2 \log p_{\text{Source LM}}\left(y_u|\mathit{x},y_{1:u-1}\right)
+
+
+where :math:`\lambda_1` and :math:`\lambda_2` are the weights of LM scores for target domain and source domain respectively.
+Here, the source domain LM is trained on the training corpus. The only difference in the above formula compared to
+shallow fusion is the subtraction of the source domain LM.
+
+Some works treat the predictor and the joiner of the neural transducer as its internal LM. However, the LM is
+considered to be weak and can only capture low-level language information. Therefore, `LODR <https://arxiv.org/abs/2203.16776>`__ proposed to use
+a low-order n-gram LM as an approximation of the ILM of the neural transducer. This leads to the following formula
+during decoding for transducer model:
+
+.. math::
+
+    \text{score}\left(y_u|\mathit{x},y\right) =
+    \log p_{rnnt}\left(y_u|\mathit{x},y_{1:u-1}\right) +
+    \lambda_1 \log p_{\text{Target LM}}\left(y_u|\mathit{x},y_{1:u-1}\right) -
+    \lambda_2 \log p_{\text{bi-gram}}\left(y_u|\mathit{x},y_{1:u-1}\right)
+
+In LODR, an additional bi-gram LM estimated on the source domain (e.g., the training corpus) is required. Compared to DR,
+the only difference lies in the choice of source domain LM. According to the original `paper <https://arxiv.org/abs/2203.16776>`_,
+LODR achieves similar performance compared to DR in both intra-domain and cross-domain settings.
+As a bi-gram is much faster to evaluate, LODR is usually much faster.
+
+Now, we will show you how to use LODR in ``icefall``.
+For illustration purposes, we will use a pre-trained ASR model from this `link <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`_.
+If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
+The testing scenario here is intra-domain (we decode the model trained on `LibriSpeech`_ on `LibriSpeech`_ testing sets).
+
+As the initial step, let's download the pre-trained model.
+
+.. code-block:: bash
+
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+    $ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
+    $ cd ../data/lang_bpe_500
+    $ git lfs pull --include bpe.model
+    $ cd ../../..
+
+To test the model, let's have a look at the decoding results **without** using LM. This can be done via the following command:
+
+.. code-block:: bash
+
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --exp-dir $exp_dir \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search
+
+The following WERs are achieved on test-clean and test-other:
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	3.11	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.93	best for test-other
+
+Then, we download the external language model and bi-gram LM that are necessary for LODR.
+Note that the bi-gram is estimated on the LibriSpeech 960 hours' text.
+
+.. code-block:: bash
+
+    $ # download the external LM
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm
+    $ # create a symbolic link so that the checkpoint can be loaded
+    $ pushd icefall-librispeech-rnn-lm/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt
+    $ popd
+    $
+    $ # download the bi-gram
+    $ git lfs install
+    $ git clone https://huggingface.co/marcoyang/librispeech_bigram
+    $ pushd data/lang_bpe_500
+    $ ln -s ../../librispeech_bigram/2gram.fst.txt .
+    $ popd
+
+Then, we perform LODR decoding by setting ``--decoding-method`` to ``modified_beam_search_LODR``:
+
+.. code-block:: bash
+
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ lm_dir=./icefall-librispeech-rnn-lm/exp
+    $ lm_scale=0.42
+    $ LODR_scale=-0.24
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --beam-size 4 \
+        --exp-dir $exp_dir \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search_LODR \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --use-shallow-fusion 1 \
+        --lm-type rnn \
+        --lm-exp-dir $lm_dir \
+        --lm-epoch 99 \
+        --lm-scale $lm_scale \
+        --lm-avg 1 \
+        --rnn-lm-embedding-dim 2048 \
+        --rnn-lm-hidden-dim 2048 \
+        --rnn-lm-num-layers 3 \
+        --lm-vocab-size 500 \
+        --tokens-ngram 2 \
+        --ngram-lm-scale $LODR_scale
+
+There are two extra arguments that need to be given when doing LODR. ``--tokens-ngram`` specifies the order of n-gram. As we
+are using a bi-gram, we set it to 2. ``--ngram-lm-scale`` is the scale of the bi-gram; it should be a negative number
+as we are subtracting the bi-gram's score during decoding.
+
+The decoding results obtained with the above command are shown below:
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	2.61	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	6.74	best for test-other
+
+Recall that the lowest WER we obtained in :ref:`shallow_fusion` with beam size of 4 is ``2.77/7.08``, LODR
+indeed **further improves** the WER. We can do even better if we increase ``--beam-size``:
+
+.. list-table:: WER of LODR with different beam sizes
+   :widths: 25 25 50
+   :header-rows: 1
+
+   * - Beam size
+     - test-clean
+     - test-other
+   * - 4
+     - 2.61
+     - 6.74
+   * - 8
+     - 2.45
+     - 6.38
+   * - 12
+     - 2.4
+     - 6.23
--- a/docs/source/decoding-with-langugage-models/index.rst
+++ b/docs/source/decoding-with-langugage-models/index.rst
@ -0,0 +1,34 @@
+Decoding with language models
+=============================
+
+This section describes how to use external language models
+during decoding to improve the WER of transducer models. To train an external language model,
+please refer to this tutorial: :ref:`train_nnlm`.
+
+The following decoding methods with external language models are available:
+
+
+.. list-table:: 
+   :widths: 25 50
+   :header-rows: 1
+
+   * - Decoding method
+     - Description
+   * - ``modified_beam_search``
+     - Beam search (i.e. really n-best decoding, the "beam" is the value of n), similar to the original RNN-T paper. Note, this method does not use a language model.
+   * - ``modified_beam_search_lm_shallow_fusion``
+     - As ``modified_beam_search``, but interpolate RNN-T scores with language model scores, also known as shallow fusion
+   * - ``modified_beam_search_LODR``
+     - As ``modified_beam_search_lm_shallow_fusion``, but subtract score of a (BPE-symbol-level) bigram backoff language model used as an approximation to the internal language model of RNN-T.
+   * - ``modified_beam_search_lm_rescore``
+     - As ``modified_beam_search``, but rescore the n-best hypotheses with external language model (e.g. RNNLM) and re-rank them.
+   * - ``modified_beam_search_lm_rescore_LODR``
+     - As ``modified_beam_search_lm_rescore``, but also subtract the score of a (BPE-symbol-level) bigram backoff language model during re-ranking.
+
+
+.. toctree::
+   :maxdepth: 2
+
+   shallow-fusion
+   LODR
+   rescoring
--- a/docs/source/decoding-with-langugage-models/rescoring.rst
+++ b/docs/source/decoding-with-langugage-models/rescoring.rst
@ -0,0 +1,255 @@
+.. _rescoring:
+
+LM rescoring for Transducer
+=================================
+
+LM rescoring is a commonly used approach to incorporate external LM information. Unlike shallow-fusion-based
+methods (see :ref:`shallow_fusion`, :ref:`LODR`), rescoring is usually performed to re-rank the n-best hypotheses after beam search.
+Rescoring is usually more efficient than shallow fusion since less computation is performed on the external LM.
+In this tutorial, we will show you how to use external LM to rescore the n-best hypotheses decoded from neural transducer models in
+`icefall <https://github.com/k2-fsa/icefall>`__.
+
+.. note::
+
+    This tutorial is based on the recipe 
+    `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
+    which is a streaming transducer model trained on `LibriSpeech`_. 
+    However, you can easily apply LM rescoring to other recipes.
+    If you encounter any problems, please open an issue `here <https://github.com/k2-fsa/icefall/issues>`_.
+
+.. note::
+
+    For simplicity, the training and testing corpora in this tutorial are the same (`LibriSpeech`_). However, you can change the testing set
+    to any other domain (e.g., `GigaSpeech`_) and use an external LM trained on that domain.
+
+.. HINT::
+
+  We recommend you to use a GPU for decoding.
+
+For illustration purpose, we will use a pre-trained ASR model from this `link <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`__.
+If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
+
+As the initial step, let's download the pre-trained model.
+
+.. code-block:: bash
+
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+    $ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
+    $ cd ../data/lang_bpe_500
+    $ git lfs pull --include bpe.model
+    $ cd ../../..
+
+As usual, we first test the model's performance without external LM. This can be done via the following command:
+
+.. code-block:: bash
+
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --exp-dir $exp_dir \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search
+
+The following WERs are achieved on test-clean and test-other:
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	3.11	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.93	best for test-other
+
+Now, we will try to improve the above WER numbers via external LM rescoring. We will download 
+a pre-trained LM from this `link <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm>`__.
+
+.. note::
+
+    This is an RNN LM trained on the LibriSpeech text corpus, so it might not be ideal for other corpora.
+    You may also train an RNN LM from scratch. Please refer to this `script <https://github.com/k2-fsa/icefall/blob/master/icefall/rnn_lm/train.py>`__
+    for training an RNN LM and this `script <https://github.com/k2-fsa/icefall/blob/master/icefall/transformer_lm/train.py>`__ to train a transformer LM.
+
+.. code-block:: bash
+
+    $ # download the external LM
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm 
+    $ # create a symbolic link so that the checkpoint can be loaded
+    $ pushd icefall-librispeech-rnn-lm/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt 
+    $ popd
+
+
+With the RNNLM available, we can rescore the n-best hypotheses generated from `modified_beam_search`. Here,
+`n` should be the number of beams, i.e ``--beam-size``. The command for LM rescoring is
+as follows. Note that the ``--decoding-method`` is set to `modified_beam_search_lm_rescore` and ``--use-shallow-fusion``
+is set to `False`.
+
+.. code-block:: bash
+    
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ lm_dir=./icefall-librispeech-rnn-lm/exp
+    $ lm_scale=0.43
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --beam-size 4 \
+        --exp-dir $exp_dir \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search_lm_rescore \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --use-shallow-fusion 0 \
+        --lm-type rnn \
+        --lm-exp-dir $lm_dir \
+        --lm-epoch 99 \
+        --lm-scale $lm_scale \
+        --lm-avg 1 \
+        --rnn-lm-embedding-dim 2048 \
+        --rnn-lm-hidden-dim 2048 \
+        --rnn-lm-num-layers 3 \
+        --lm-vocab-size 500
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	2.93	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.6	best for test-other
+
+Great! We made some improvements! Increasing the size of the n-best hypotheses will further boost the performance,
+see the following table:
+
+.. list-table:: WERs of LM rescoring with different beam sizes
+   :widths: 25 25 25
+   :header-rows: 1
+
+   * - Beam size
+     - test-clean
+     - test-other
+   * - 4
+     - 2.93
+     - 7.6
+   * - 8
+     - 2.67
+     - 7.11
+   * - 12
+     - 2.59
+     - 6.86
+
+In fact, we can also apply LODR (see :ref:`LODR`) when doing LM rescoring. To do so, we need to 
+download the bi-gram required by LODR:
+
+.. code-block:: bash
+
+    $ # download the bi-gram
+    $ git lfs install
+    $ git clone https://huggingface.co/marcoyang/librispeech_bigram
+    $ pushd data/lang_bpe_500
+    $ ln -s ../../librispeech_bigram/2gram.arpa .
+    $ popd
+
+Then we can perform LM rescoring + LODR by changing the decoding method to `modified_beam_search_lm_rescore_LODR`.
+
+.. note:: 
+
+    This decoding method requires the dependency of `kenlm <https://github.com/kpu/kenlm>`_. You can install it
+    via this command: `pip install https://github.com/kpu/kenlm/archive/master.zip`. 
+
+.. code-block:: bash
+    
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ lm_dir=./icefall-librispeech-rnn-lm/exp
+    $ lm_scale=0.43
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --beam-size 4 \
+        --exp-dir $exp_dir \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search_lm_rescore_LODR \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --use-shallow-fusion 0 \
+        --lm-type rnn \
+        --lm-exp-dir $lm_dir \
+        --lm-epoch 99 \
+        --lm-scale $lm_scale \
+        --lm-avg 1 \
+        --rnn-lm-embedding-dim 2048 \
+        --rnn-lm-hidden-dim 2048 \
+        --rnn-lm-num-layers 3 \
+        --lm-vocab-size 500
+
+You should see the following WERs after executing the commands above:
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	2.9	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.57	best for test-other
+
+It's slightly better than LM rescoring. If we further increase the beam size, we will see
+further improvements from LM rescoring + LODR:
+
+.. list-table:: WERs of LM rescoring + LODR with different beam sizes
+   :widths: 25 25 25
+   :header-rows: 1
+
+   * - Beam size
+     - test-clean
+     - test-other
+   * - 4
+     - 2.9
+     - 7.57
+   * - 8
+     - 2.63
+     - 7.04
+   * - 12
+     - 2.52
+     - 6.73
+
+As mentioned earlier, LM rescoring is usually faster than shallow-fusion based methods.
+Here, we benchmark the WERs and decoding speed of both:
+
+.. list-table:: LM-rescoring-based methods vs shallow-fusion-based methods (The numbers in each field are WER on test-clean, WER on test-other and decoding time on test-clean)
+   :widths: 25 25 25 25
+   :header-rows: 1
+
+   * - Decoding method
+     - beam=4
+     - beam=8
+     - beam=12
+   * - ``modified_beam_search``
+     - 3.11/7.93; 132s
+     - 3.1/7.95; 177s
+     - 3.1/7.96; 210s
+   * - ``modified_beam_search_lm_shallow_fusion``
+     - 2.77/7.08; 262s
+     - 2.62/6.65; 352s
+     - 2.58/6.65; 488s
+   * - ``modified_beam_search_LODR``
+     - 2.61/6.74; 400s
+     - 2.45/6.38; 610s
+     - 2.4/6.23; 870s
+   * - ``modified_beam_search_lm_rescore``
+     - 2.93/7.6; 156s
+     - 2.67/7.11; 203s
+     - 2.59/6.86; 255s
+   * - ``modified_beam_search_lm_rescore_LODR``
+     - 2.9/7.57; 160s
+     - 2.63/7.04; 203s
+     - 2.52/6.73; 263s
+
+.. note::
+
+    Decoding is performed with a single 32G V100, we set ``--max-duration`` to 600. 
+    Decoding time here is only for reference and it may vary.
--- a/docs/source/decoding-with-langugage-models/shallow-fusion.rst
+++ b/docs/source/decoding-with-langugage-models/shallow-fusion.rst
@ -0,0 +1,179 @@
+.. _shallow_fusion:
+
+Shallow fusion for Transducer
+=================================
+
+External language models (LM) are commonly used to improve WERs for E2E ASR models.
+This tutorial shows you how to perform ``shallow fusion`` with an external LM
+to improve the word-error-rate of a transducer model.
+
+.. note::
+
+    This tutorial is based on the recipe 
+    `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
+    which is a streaming transducer model trained on `LibriSpeech`_. 
+    However, you can easily apply shallow fusion to other recipes.
+    If you encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`_.
+
+.. note::
+
+    For simplicity, the training and testing corpora in this tutorial are the same (`LibriSpeech`_). However, you can change the testing set
+    to any other domain (e.g., `GigaSpeech`_) and use an external LM trained on that domain.
+
+.. HINT::
+
+  We recommend you to use a GPU for decoding.
+
+For illustration purpose, we will use a pre-trained ASR model from this `link <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`__.
+If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
+
+As the initial step, let's download the pre-trained model.
+
+.. code-block:: bash
+
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+    $ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
+    $ cd ../data/lang_bpe_500
+    $ git lfs pull --include bpe.model
+    $ cd ../../..
+
+To test the model, let's have a look at the decoding results without using LM. This can be done via the following command:
+
+.. code-block:: bash
+
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --exp-dir $exp_dir \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search
+
+The following WERs are achieved on test-clean and test-other:
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	3.11	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.93	best for test-other
+
+These are already good numbers! But we can further improve them by using shallow fusion with an external LM.
+Since training a language model usually takes a long time, we can download a pre-trained LM from this `link <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm>`__.
+
+.. code-block:: bash
+
+    $ # download the external LM
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm 
+    $ # create a symbolic link so that the checkpoint can be loaded
+    $ pushd icefall-librispeech-rnn-lm/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt 
+    $ popd
+
+.. note::
+
+    This is an RNN LM trained on the LibriSpeech text corpus, so it might not be ideal for other corpora.
+    You may also train an RNN LM from scratch. Please refer to this `script <https://github.com/k2-fsa/icefall/blob/master/icefall/rnn_lm/train.py>`__
+    for training an RNN LM and this `script <https://github.com/k2-fsa/icefall/blob/master/icefall/transformer_lm/train.py>`__ to train a transformer LM.
+
+To use shallow fusion for decoding, we can execute the following command:
+
+.. code-block:: bash
+    
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ lm_dir=./icefall-librispeech-rnn-lm/exp
+    $ lm_scale=0.29
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --beam-size 4 \
+        --exp-dir $exp_dir \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search_lm_shallow_fusion \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --use-shallow-fusion 1 \
+        --lm-type rnn \
+        --lm-exp-dir $lm_dir \
+        --lm-epoch 99 \
+        --lm-scale $lm_scale \
+        --lm-avg 1 \
+        --rnn-lm-embedding-dim 2048 \
+        --rnn-lm-hidden-dim 2048 \
+        --rnn-lm-num-layers 3 \
+        --lm-vocab-size 500
+
+Note that we set ``--decoding-method modified_beam_search_lm_shallow_fusion`` and ``--use-shallow-fusion 1``
+to use shallow fusion. ``--lm-type`` specifies the type of neural LM we are going to use; you can choose
+either ``rnn`` or ``transformer``. The following three arguments are associated with the RNN LM:
+
+- ``--rnn-lm-embedding-dim``
+    The embedding dimension of the RNN LM
+
+- ``--rnn-lm-hidden-dim``
+    The hidden dimension of the RNN LM
+
+- ``--rnn-lm-num-layers``
+    The number of RNN layers in the RNN LM.
+
+
+The decoding results obtained with the above command are shown below.
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	2.77	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.08	best for test-other
+
+The improvement of shallow fusion is very obvious! The relative WER reduction on test-other is around 10.5%. 
+A few parameters can be tuned to further boost the performance of shallow fusion:
+
+- ``--lm-scale`` 
+
+    Controls the scale of the LM. If too small, the external language model may not be fully utilized; if too large, 
+    the LM score may dominate during decoding, leading to bad WER. A typical value is around 0.3.
+
+- ``--beam-size`` 
+    
+    The number of active paths in the search beam. It controls the trade-off between decoding efficiency and accuracy.
+
+Here, we also show how ``--beam-size`` affects the WER and decoding time:
+
+.. list-table:: WERs and decoding time (on test-clean) of shallow fusion with different beam sizes
+   :widths: 25 25 25 25
+   :header-rows: 1
+
+   * - Beam size
+     - test-clean
+     - test-other
+     - Decoding time on test-clean (s)
+   * - 4
+     - 2.77
+     - 7.08
+     - 262
+   * - 8
+     - 2.62
+     - 6.65
+     - 352
+   * - 12
+     - 2.58
+     - 6.65
+     - 488
+
+As we see, a larger beam size during shallow fusion improves the WER, but is also slower.
+
+
+
+
+
+
+
+ 
--- a/docs/source/docker/img/docker-hub.png
+++ b/docs/source/docker/img/docker-hub.png
--- a/docs/source/docker/index.rst
+++ b/docs/source/docker/index.rst
@ -0,0 +1,17 @@
+.. _icefall_docker:
+
+Docker
+======
+
+This section describes how to use pre-built docker images to run `icefall`_.
+
+.. hint::
+
+   If you only have CPUs available, you can still use the pre-built docker
+   images.
+
+.. toctree::
+   :maxdepth: 2
+
+   ./intro.rst
+
--- a/docs/source/docker/intro.rst
+++ b/docs/source/docker/intro.rst
@ -0,0 +1,173 @@
+Introduction
+=============
+
+We have pre-built docker images hosted at the following address:
+
+  `<https://hub.docker.com/repository/docker/k2fsa/icefall/general>`_
+
+.. figure:: img/docker-hub.png
+   :width: 600
+   :align: center
+
+You can find the ``Dockerfile`` at `<https://github.com/k2-fsa/icefall/tree/master/docker>`_.
+
+We describe the following items in this section:
+
+  - How to view available tags
+  - How to download pre-built docker images
+  - How to run the `yesno`_ recipe within a docker container on ``CPU``
+
+View available tags
+===================
+
+You can use the following command to view available tags:
+
+.. code-block:: bash
+
+   curl -s 'https://registry.hub.docker.com/v2/repositories/k2fsa/icefall/tags/'|jq '."results"[]["name"]'
+
+which will give you something like below:
+
+.. code-block:: bash
+
+  "torch2.1.0-cuda12.1"
+  "torch2.1.0-cuda11.8"
+  "torch2.0.0-cuda11.7"
+  "torch1.12.1-cuda11.3"
+  "torch1.9.0-cuda10.2"
+  "torch1.13.0-cuda11.6"
+
+.. hint::
+
+   Available tags will be updated when there are new releases of `torch`_.
+
+Please select an appropriate combination of `torch`_ and  CUDA.
+
+Download a docker image
+=======================
+
+Suppose that you select the tag ``torch1.13.0-cuda11.6``, you can use
+the following command to download it:
+
+.. code-block:: bash
+
+   sudo docker image pull k2fsa/icefall:torch1.13.0-cuda11.6
+
+Run a docker image with GPU
+===========================
+
+.. code-block:: bash
+
+  sudo docker run --gpus all --rm -it k2fsa/icefall:torch1.13.0-cuda11.6 /bin/bash
+
+Run a docker image with CPU
+===========================
+
+.. code-block:: bash
+
+  sudo docker run --rm -it k2fsa/icefall:torch1.13.0-cuda11.6 /bin/bash
+
+Run yesno within a docker container
+===================================
+
+After starting the container, the following interface is presented:
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall#
+
+It shows the current user is ``root`` and the current working directory
+is ``/workspace/icefall``.
+
+Update the code
+---------------
+
+Please first run:
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall# git pull
+
+so that your local copy contains the latest code.
+
+Data preparation
+----------------
+
+Now we can use
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall# cd egs/yesno/ASR/
+
+to switch to the ``yesno`` recipe and run
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./prepare.sh
+
+.. hint::
+
+   If you are running without GPU, it may report the following error:
+
+    .. code-block:: bash
+
+        File "/opt/conda/lib/python3.9/site-packages/k2/__init__.py", line 23, in <module>
+          from _k2 import DeterminizeWeightPushingType
+        ImportError: libcuda.so.1: cannot open shared object file: No such file or directory
+
+  We can use the following command to fix it:
+
+    .. code-block:: bash
+
+      root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ln -s /opt/conda/lib/stubs/libcuda.so /opt/conda/lib/stubs/libcuda.so.1
+
+The logs of running ``./prepare.sh`` are listed below:
+
+.. literalinclude:: ./log/log-preparation.txt
+
+Training
+--------
+
+After preparing the data, we can start training with the following command
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./tdnn/train.py
+
+All of the training logs are given below:
+
+.. hint::
+
+   It is running on CPU and it takes only 16 seconds for this run.
+
+.. literalinclude:: ./log/log-train-2023-08-01-01-55-27
+
+
+Decoding
+--------
+
+After training, we can decode the trained model with
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./tdnn/decode.py
+
+The decoding logs are given below:
+
+.. code-block:: bash
+
+  2023-08-01 02:06:22,400 INFO [decode.py:263] Decoding started
+  2023-08-01 02:06:22,400 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d663.clean', 'torch-version': '1.13.0', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.9', 'icefall-git-branch': 'master', 'icefall-git-sha1': '375520d-clean', 'icefall-git-date': 'Fri Jul 28 07:43:08 2023', 'icefall-path': '/workspace/icefall', 'k2-path': '/opt/conda/lib/python3.9/site-packages/k2/__init__.py', 'lhotse-path': '/opt/conda/lib/python3.9/site-packages/lhotse/__init__.py', 'hostname': '60c947eac59c', 'IP address': '172.17.0.2'}}
+  2023-08-01 02:06:22,401 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+  2023-08-01 02:06:22,403 INFO [decode.py:273] device: cpu
+  2023-08-01 02:06:22,406 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+  2023-08-01 02:06:22,424 INFO [asr_datamodule.py:218] About to get test cuts
+  2023-08-01 02:06:22,425 INFO [asr_datamodule.py:252] About to get test cuts
+  2023-08-01 02:06:22,504 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
+  [W NNPACK.cpp:53] Could not initialize NNPACK! Reason: Unsupported hardware.
+  2023-08-01 02:06:22,687 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
+  2023-08-01 02:06:22,688 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
+  2023-08-01 02:06:22,690 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
+  2023-08-01 02:06:22,690 INFO [decode.py:316] Done!
+
+Congratulations! You have successfully finished running `icefall`_ within a docker container.
--- a/docs/source/for-dummies/data-preparation.rst
+++ b/docs/source/for-dummies/data-preparation.rst
@ -0,0 +1,180 @@
+.. _dummies_tutorial_data_preparation:
+
+Data Preparation
+================
+
+After :ref:`dummies_tutorial_environment_setup`, we can start preparing the
+data for training and decoding.
+
+The first step is to prepare the data for training. We have already provided
+`prepare.sh <https://github.com/k2-fsa/icefall/blob/master/egs/yesno/ASR/prepare.sh>`_
+that would prepare everything required for training.
+
+.. code-block::
+
+   cd /tmp/icefall
+   export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+   cd egs/yesno/ASR
+
+   ./prepare.sh
+
+Note that in each recipe from `icefall`_, there exists a file ``prepare.sh``,
+which you should run before you run anything else.
+
+That is all you need for data preparation.
+
+For the more curious
+--------------------
+
+If you are wondering how to prepare your own dataset, please refer to the following
+URLs for more details:
+
+  - `<https://github.com/lhotse-speech/lhotse/tree/master/lhotse/recipes>`_
+
+    It contains recipes for a variety of datasets. If you want to add your own
+    dataset, please read recipes in this folder first.
+
+  - `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/yesno.py>`_
+
+    The `yesno`_ recipe in `lhotse`_.
+
+If you already have a `Kaldi`_ dataset directory, which contains files like
+``wav.scp``, ``feats.scp``, then you can refer to `<https://lhotse.readthedocs.io/en/latest/kaldi.html#example>`_.
+
+A quick look at the generated files
+-----------------------------------
+
+``./prepare.sh`` puts generated files into two directories:
+
+  - ``download``
+  - ``data``
+
+download
+^^^^^^^^
+
+The ``download`` directory contains downloaded dataset files:
+
+.. code-block:: bash
+
+    tree -L 1 ./download/
+
+    ./download/
+    |-- waves_yesno
+    `-- waves_yesno.tar.gz
+
+.. hint::
+
+   Please refer to `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/recipes/yesno.py#L41>`_
+   for how the data is downloaded and extracted.
+
+data
+^^^^
+
+.. code-block:: bash
+
+    tree ./data/
+
+    ./data/
+    |-- fbank
+    |   |-- yesno_cuts_test.jsonl.gz
+    |   |-- yesno_cuts_train.jsonl.gz
+    |   |-- yesno_feats_test.lca
+    |   `-- yesno_feats_train.lca
+    |-- lang_phone
+    |   |-- HLG.pt
+    |   |-- L.pt
+    |   |-- L_disambig.pt
+    |   |-- Linv.pt
+    |   |-- lexicon.txt
+    |   |-- lexicon_disambig.txt
+    |   |-- tokens.txt
+    |   `-- words.txt
+    |-- lm
+    |   |-- G.arpa
+    |   `-- G.fst.txt
+    `-- manifests
+        |-- yesno_recordings_test.jsonl.gz
+        |-- yesno_recordings_train.jsonl.gz
+        |-- yesno_supervisions_test.jsonl.gz
+        `-- yesno_supervisions_train.jsonl.gz
+
+    4 directories, 18 files
+
+**data/manifests**:
+
+  This directory contains manifests. They are used to generate files in
+  ``data/fbank``.
+
+  To give you an idea of what it contains, we examine the first few lines of
+  the manifests related to the ``train`` dataset.
+
+  .. code-block:: bash
+
+      cd data/manifests
+      gunzip -c  yesno_recordings_train.jsonl.gz  | head -n 3
+
+  The output is given below:
+
+    .. code-block:: bash
+
+      {"id": "0_0_0_0_1_1_1_1", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_0_1_1_1_1.wav"}], "sampling_rate": 8000, "num_samples": 50800, "duration": 6.35, "channel_ids": [0]}
+      {"id": "0_0_0_1_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_1_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48880, "duration": 6.11, "channel_ids": [0]}
+      {"id": "0_0_1_0_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_1_0_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48160, "duration": 6.02, "channel_ids": [0]}
+
+  Please refer to `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/audio.py#L300>`_
+  for the meaning of each field per line.
+
+  .. code-block:: bash
+
+      gunzip -c  yesno_supervisions_train.jsonl.gz  | head -n 3
+
+  The output is given below:
+
+  .. code-block:: bash
+
+      {"id": "0_0_0_0_1_1_1_1", "recording_id": "0_0_0_0_1_1_1_1", "start": 0.0, "duration": 6.35, "channel": 0, "text": "NO NO NO NO YES YES YES YES", "language": "Hebrew"}
+      {"id": "0_0_0_1_0_1_1_0", "recording_id": "0_0_0_1_0_1_1_0", "start": 0.0, "duration": 6.11, "channel": 0, "text": "NO NO NO YES NO YES YES NO", "language": "Hebrew"}
+      {"id": "0_0_1_0_0_1_1_0", "recording_id": "0_0_1_0_0_1_1_0", "start": 0.0, "duration": 6.02, "channel": 0, "text": "NO NO YES NO NO YES YES NO", "language": "Hebrew"}
+
+  Please refer to `<https://github.com/lhotse-speech/lhotse/blob/master/lhotse/supervision.py#L510>`_
+  for the meaning of each field per line.
+
+**data/fbank**:
+
+  This directory contains everything from ``data/manifests``. Furthermore, it also contains features
+  for training.
+
+  ``data/fbank/yesno_feats_train.lca`` contains the features for the train dataset.
+  Features are compressed using `lilcom`_.
+
+  ``data/fbank/yesno_cuts_train.jsonl.gz`` stores the `CutSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/cut/set.py#L72>`_,
+  which stores `RecordingSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/audio.py#L928>`_,
+  `SupervisionSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/supervision.py#L510>`_,
+  and `FeatureSet <https://github.com/lhotse-speech/lhotse/blob/master/lhotse/features/base.py#L593>`_.
+
+  To give you an idea about what it looks like, we can run the following command:
+
+    .. code-block:: bash
+
+        cd data/fbank
+
+        gunzip -c yesno_cuts_train.jsonl.gz | head -n 3
+
+  The output is given below:
+
+    .. code-block:: bash
+
+      {"id": "0_0_0_0_1_1_1_1-0", "start": 0, "duration": 6.35, "channel": 0, "supervisions": [{"id": "0_0_0_0_1_1_1_1", "recording_id": "0_0_0_0_1_1_1_1", "start": 0.0, "duration": 6.35, "channel": 0, "text": "NO NO NO NO YES YES YES YES", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 635, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.35, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "0,13000,3570", "channels": 0}, "recording": {"id": "0_0_0_0_1_1_1_1", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_0_1_1_1_1.wav"}], "sampling_rate": 8000, "num_samples": 50800, "duration": 6.35, "channel_ids": [0]}, "type": "MonoCut"}
+      {"id": "0_0_0_1_0_1_1_0-1", "start": 0, "duration": 6.11, "channel": 0, "supervisions": [{"id": "0_0_0_1_0_1_1_0", "recording_id": "0_0_0_1_0_1_1_0", "start": 0.0, "duration": 6.11, "channel": 0, "text": "NO NO NO YES NO YES YES NO", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 611, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.11, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "16570,12964,2929", "channels": 0}, "recording": {"id": "0_0_0_1_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_1_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48880, "duration": 6.11, "channel_ids": [0]}, "type": "MonoCut"}
+      {"id": "0_0_1_0_0_1_1_0-2", "start": 0, "duration": 6.02, "channel": 0, "supervisions": [{"id": "0_0_1_0_0_1_1_0", "recording_id": "0_0_1_0_0_1_1_0", "start": 0.0, "duration": 6.02, "channel": 0, "text": "NO NO YES NO NO YES YES NO", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 602, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.02, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "32463,12936,2696", "channels": 0}, "recording": {"id": "0_0_1_0_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_1_0_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48160, "duration": 6.02, "channel_ids": [0]}, "type": "MonoCut"}
+
+  Note that ``yesno_cuts_train.jsonl.gz`` only stores the information about how to read the features.
+  The actual features are stored separately in ``data/fbank/yesno_feats_train.lca``.
+
+**data/lang_phone**:
+
+  This directory contains the lexicon.
+
+**data/lm**:
+
+  This directory contains language models.
--- a/docs/source/for-dummies/decoding.rst
+++ b/docs/source/for-dummies/decoding.rst
@ -0,0 +1,39 @@
+.. _dummies_tutorial_decoding:
+
+Decoding
+========
+
+After :ref:`dummies_tutorial_training`, we can start decoding.
+
+The command to start the decoding is quite simple:
+
+.. code-block:: bash
+
+   cd /tmp/icefall
+   export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+   cd egs/yesno/ASR
+
+   # We use CPU for decoding by setting the following environment variable
+   export CUDA_VISIBLE_DEVICES=""
+
+   ./tdnn/decode.py
+
+The output logs are given below:
+
+.. literalinclude:: ./code/decoding-yesno.txt
+
+For the more curious
+--------------------
+
+.. code-block:: bash
+
+   ./tdnn/decode.py --help
+
+will print the usage information about ``./tdnn/decode.py``. For instance, you
+can specify:
+
+  - ``--epoch`` to select which checkpoint to use for decoding
+  - ``--avg`` to select how many checkpoints to use for model averaging
+
+You usually try different combinations of ``--epoch`` and ``--avg`` and select
+one that leads to the lowest WER (`Word Error Rate <https://en.wikipedia.org/wiki/Word_error_rate>`_).
--- a/docs/source/for-dummies/environment-setup.rst
+++ b/docs/source/for-dummies/environment-setup.rst
@ -0,0 +1,121 @@
+.. _dummies_tutorial_environment_setup:
+
+Environment setup
+=================
+
+We will create an environment for `Next-gen Kaldi`_ that runs on ``CPU``
+in this tutorial.
+
+.. note::
+
+   Since the `yesno`_ dataset used in this tutorial is very tiny, training on
+   ``CPU`` works very well for it.
+
+   If your dataset is very large, e.g., hundreds or thousands of hours of
+   training data, please follow :ref:`install icefall` to install `icefall`_
+   that works with ``GPU``.
+
+
+Create a virtual environment
+----------------------------
+
+.. code-block:: bash
+
+  virtualenv -p python3 /tmp/icefall_env
+
+The above command creates a virtual environment in the directory ``/tmp/icefall_env``.
+You can select any directory you want.
+
+The output of the above command is given below:
+
+.. code-block:: bash
+
+  Already using interpreter /usr/bin/python3
+  Using base prefix '/usr'
+  New python executable in /tmp/icefall_env/bin/python3
+  Also creating executable in /tmp/icefall_env/bin/python
+  Installing setuptools, pkg_resources, pip, wheel...done.
+
+Now we can activate the environment using:
+
+.. code-block:: bash
+
+  source /tmp/icefall_env/bin/activate
+
+Install dependencies
+--------------------
+
+.. warning::
+
+   Remember to activate your virtual environment before you continue!
+
+After activating the virtual environment, we can use the following command
+to install dependencies of `icefall`_:
+
+.. hint::
+
+   Remember that we will run this tutorial on ``CPU``, so we install
+   dependencies required only by running on ``CPU``.
+
+.. code-block:: bash
+
+   # Caution: Installation order matters!
+
+   # We use torch 2.0.0 and torchaudio 2.0.0 in this tutorial.
+   # Other versions should also work.
+
+   pip install torch==2.0.0+cpu torchaudio==2.0.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+
+   # If you are using macOS or Windows, please use the following command to install torch and torchaudio
+   # pip install torch==2.0.0 torchaudio==2.0.0 -f https://download.pytorch.org/whl/torch_stable.html
+
+   # Now install k2
+   # Please refer to https://k2-fsa.github.io/k2/installation/from_wheels.html#linux-cpu-example
+
+   pip install k2==1.24.3.dev20230726+cpu.torch2.0.0 -f https://k2-fsa.github.io/k2/cpu.html
+
+   # Install the latest version of lhotse
+
+   pip install git+https://github.com/lhotse-speech/lhotse
+
+
+Install icefall
+---------------
+
+We will put the source code of `icefall`_ into the directory ``/tmp``.
+You can select any directory you want.
+
+.. code-block:: bash
+
+   cd /tmp
+   git clone https://github.com/k2-fsa/icefall
+   cd icefall
+   pip install -r ./requirements.txt
+
+.. code-block:: bash
+
+   # Anytime we want to use icefall, we have to set the following
+   # environment variable
+
+   export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+
+.. hint::
+
+   If you get the following error during this tutorial:
+
+   .. code-block:: bash
+
+      ModuleNotFoundError: No module named 'icefall'
+
+   please set the above environment variable to fix it.
+
+
+Congratulations! You have installed `icefall`_ successfully.
+
+For the more curious
+--------------------
+
+`icefall`_ contains a collection of Python scripts and you don't need to
+use ``python3 setup.py install`` or ``pip install icefall`` to install it.
+All you need to do is to download the code and set the environment variable
+``PYTHONPATH``.
--- a/docs/source/for-dummies/index.rst
+++ b/docs/source/for-dummies/index.rst
@ -0,0 +1,34 @@
+Icefall for dummies tutorial
+============================
+
+This tutorial walks you step by step through how to create a simple
+ASR (`Automatic Speech Recognition <https://en.wikipedia.org/wiki/Speech_recognition>`_)
+system with `Next-gen Kaldi`_.
+
+We use the `yesno`_ dataset for demonstration. We select it for two reasons:
+
+  - It is quite tiny, containing only about 12 minutes of data
+  - The training can be finished within 20 seconds on ``CPU``.
+
+That also means you don't need a ``GPU`` to run this tutorial.
+
+Let's get started!
+
+Please follow items below **sequentially**.
+
+.. note::
+
+   The :ref:`dummies_tutorial_data_preparation` runs only on Linux and on macOS.
+   All other parts run on Linux, macOS, and Windows.
+
+   Help from the community is appreciated to port the :ref:`dummies_tutorial_data_preparation`
+   to Windows.
+
+.. toctree::
+   :maxdepth: 2
+
+   ./environment-setup.rst
+   ./data-preparation.rst
+   ./training.rst
+   ./decoding.rst
+   ./model-export.rst
--- a/docs/source/for-dummies/model-export.rst
+++ b/docs/source/for-dummies/model-export.rst
@ -0,0 +1,310 @@
+Model Export
+============
+
+There are three ways to export a pre-trained model.
+
+  - Export the model parameters via `model.state_dict() <https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.state_dict>`_
+  - Export via `torchscript <https://pytorch.org/docs/stable/jit.html>`_: either `torch.jit.script() <https://pytorch.org/docs/stable/generated/torch.jit.script.html#torch.jit.script>`_ or `torch.jit.trace() <https://pytorch.org/docs/stable/generated/torch.jit.trace.html>`_
+  - Export to `ONNX`_ via `torch.onnx.export() <https://pytorch.org/docs/stable/onnx.html>`_
+
+Each method is explained below in detail.
+
+Export the model parameters via model.state_dict()
+---------------------------------------------------
+
+The command for this kind of export is
+
+.. code-block:: bash
+
+   cd /tmp/icefall
+   export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+   cd egs/yesno/ASR
+
+   # assume that "--epoch 14 --avg 2" produces the lowest WER.
+
+   ./tdnn/export.py --epoch 14 --avg 2
+
+The output logs are given below:
+
+.. code-block:: bash
+
+  2023-08-16 20:42:03,912 INFO [export.py:76] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'jit': False}
+  2023-08-16 20:42:03,913 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+  2023-08-16 20:42:03,950 INFO [export.py:93] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+  2023-08-16 20:42:03,971 INFO [export.py:106] Not using torch.jit.script
+  2023-08-16 20:42:03,974 INFO [export.py:111] Saved to tdnn/exp/pretrained.pt
+
+We can see from the logs that the exported model is saved to the file ``tdnn/exp/pretrained.pt``.
+
+To give you an idea of what ``tdnn/exp/pretrained.pt`` contains, we can use the following command:
+
+.. code-block:: python3
+
+    >>> import torch
+    >>> m = torch.load("tdnn/exp/pretrained.pt")
+    >>> list(m.keys())
+    ['model']
+    >>> list(m["model"].keys())
+    ['tdnn.0.weight', 'tdnn.0.bias', 'tdnn.2.running_mean', 'tdnn.2.running_var', 'tdnn.2.num_batches_tracked', 'tdnn.3.weight', 'tdnn.3.bias', 'tdnn.5.running_mean', 'tdnn.5.running_var', 'tdnn.5.num_batches_tracked', 'tdnn.6.weight', 'tdnn.6.bias', 'tdnn.8.running_mean', 'tdnn.8.running_var', 'tdnn.8.num_batches_tracked', 'output_linear.weight', 'output_linear.bias']
+
+We can use ``tdnn/exp/pretrained.pt`` in the following way with ``./tdnn/decode.py``:
+
+.. code-block:: bash
+
+   cd tdnn/exp
+   ln -s pretrained.pt epoch-99.pt
+   cd ../..
+
+   ./tdnn/decode.py --epoch 99 --avg 1
+
+The output logs of the above command are given below:
+
+.. code-block:: bash
+
+    2023-08-16 20:45:48,089 INFO [decode.py:262] Decoding started
+    2023-08-16 20:45:48,090 INFO [decode.py:263] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 99, 'avg': 1, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': False, 'k2-git-sha1': 'ad79f1c699c684de9785ed6ca5edb805a41f78c3', 'k2-git-date': 'Wed Jul 26 09:30:42 2023', 'lhotse-version': '1.16.0.dev+git.aa073f6.clean', 'torch-version': '2.0.0', 'torch-cuda-available': False, 'torch-cuda-version': None, 'python-version': '3.1', 'icefall-git-branch': 'master', 'icefall-git-sha1': '9a47c08-clean', 'icefall-git-date': 'Mon Aug 14 22:10:50 2023', 'icefall-path': '/private/tmp/icefall', 'k2-path': '/private/tmp/icefall_env/lib/python3.11/site-packages/k2/__init__.py', 'lhotse-path': '/private/tmp/icefall_env/lib/python3.11/site-packages/lhotse/__init__.py', 'hostname': 'fangjuns-MacBook-Pro.local', 'IP address': '127.0.0.1'}}
+    2023-08-16 20:45:48,092 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+    2023-08-16 20:45:48,103 INFO [decode.py:272] device: cpu
+    2023-08-16 20:45:48,109 INFO [checkpoint.py:112] Loading checkpoint from tdnn/exp/epoch-99.pt
+    2023-08-16 20:45:48,115 INFO [asr_datamodule.py:218] About to get test cuts
+    2023-08-16 20:45:48,115 INFO [asr_datamodule.py:253] About to get test cuts
+    2023-08-16 20:45:50,386 INFO [decode.py:203] batch 0/?, cuts processed until now is 4
+    2023-08-16 20:45:50,556 INFO [decode.py:240] The transcripts are stored in tdnn/exp/recogs-test_set.txt
+    2023-08-16 20:45:50,557 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
+    2023-08-16 20:45:50,558 INFO [decode.py:248] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
+    2023-08-16 20:45:50,559 INFO [decode.py:315] Done!
+
+We can see that it produces the same WER as before.
+
+We can also use it to decode files with the following command:
+
+.. code-block:: bash
+
+  # ./tdnn/pretrained.py requires kaldifeat
+  #
+  # Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
+  # for how to install kaldifeat
+
+  pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
+
+  ./tdnn/pretrained.py \
+    --checkpoint ./tdnn/exp/pretrained.pt \
+    --HLG ./data/lang_phone/HLG.pt \
+    --words-file ./data/lang_phone/words.txt \
+    download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+    download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+The output is given below:
+
+.. code-block:: bash
+
+  2023-08-16 20:53:19,208 INFO [pretrained.py:136] {'feature_dim': 23, 'num_classes': 4, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './tdnn/exp/pretrained.pt', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
+  2023-08-16 20:53:19,208 INFO [pretrained.py:142] device: cpu
+  2023-08-16 20:53:19,208 INFO [pretrained.py:144] Creating model
+  2023-08-16 20:53:19,212 INFO [pretrained.py:156] Loading HLG from ./data/lang_phone/HLG.pt
+  2023-08-16 20:53:19,213 INFO [pretrained.py:160] Constructing Fbank computer
+  2023-08-16 20:53:19,213 INFO [pretrained.py:170] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
+  2023-08-16 20:53:19,224 INFO [pretrained.py:176] Decoding started
+  2023-08-16 20:53:19,304 INFO [pretrained.py:212]
+  download/waves_yesno/0_0_0_1_0_0_0_1.wav:
+  NO NO NO YES NO NO NO YES
+
+  download/waves_yesno/0_0_1_0_0_0_1_0.wav:
+  NO NO YES NO NO NO YES NO
+
+
+  2023-08-16 20:53:19,304 INFO [pretrained.py:214] Decoding Done
+
+
+Export via torch.jit.script()
+-----------------------------
+
+The command for this kind of export is
+
+.. code-block:: bash
+
+   cd /tmp/icefall
+   export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+   cd egs/yesno/ASR
+
+   # assume that "--epoch 14 --avg 2" produces the lowest WER.
+
+   ./tdnn/export.py --epoch 14 --avg 2 --jit true
+
+The output logs are given below:
+
+.. code-block:: bash
+
+  2023-08-16 20:47:44,666 INFO [export.py:76] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'jit': True}
+  2023-08-16 20:47:44,667 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+  2023-08-16 20:47:44,670 INFO [export.py:93] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+  2023-08-16 20:47:44,677 INFO [export.py:100] Using torch.jit.script
+  2023-08-16 20:47:44,843 INFO [export.py:104] Saved to tdnn/exp/cpu_jit.pt
+
+From the output logs we can see that the generated file is saved to ``tdnn/exp/cpu_jit.pt``.
+
+Don't be confused by the name ``cpu_jit.pt``. The ``cpu`` part means the model is moved to
+CPU before exporting. That means, when you load it with:
+
+.. code-block:: python3
+
+   torch.jit.load()
+
+you don't need to specify the argument `map_location <https://pytorch.org/docs/stable/generated/torch.jit.load.html#torch.jit.load>`_
+and it resides on CPU by default.
+
+To use ``tdnn/exp/cpu_jit.pt`` with `icefall`_ to decode files, we can use:
+
+.. code-block:: bash
+
+  # ./tdnn/jit_pretrained.py requires kaldifeat
+  #
+  # Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
+  # for how to install kaldifeat
+
+  pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
+
+
+  ./tdnn/jit_pretrained.py \
+    --nn-model ./tdnn/exp/cpu_jit.pt \
+    --HLG ./data/lang_phone/HLG.pt \
+    --words-file ./data/lang_phone/words.txt \
+    download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+    download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+The output is given below:
+
+.. code-block:: bash
+
+  2023-08-16 20:56:00,603 INFO [jit_pretrained.py:121] {'feature_dim': 23, 'num_classes': 4, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'nn_model': './tdnn/exp/cpu_jit.pt', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
+  2023-08-16 20:56:00,603 INFO [jit_pretrained.py:127] device: cpu
+  2023-08-16 20:56:00,603 INFO [jit_pretrained.py:129] Loading torchscript model
+  2023-08-16 20:56:00,640 INFO [jit_pretrained.py:134] Loading HLG from ./data/lang_phone/HLG.pt
+  2023-08-16 20:56:00,641 INFO [jit_pretrained.py:138] Constructing Fbank computer
+  2023-08-16 20:56:00,641 INFO [jit_pretrained.py:148] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
+  2023-08-16 20:56:00,642 INFO [jit_pretrained.py:154] Decoding started
+  2023-08-16 20:56:00,727 INFO [jit_pretrained.py:190]
+  download/waves_yesno/0_0_0_1_0_0_0_1.wav:
+  NO NO NO YES NO NO NO YES
+
+  download/waves_yesno/0_0_1_0_0_0_1_0.wav:
+  NO NO YES NO NO NO YES NO
+
+
+  2023-08-16 20:56:00,727 INFO [jit_pretrained.py:192] Decoding Done
+
+.. hint::
+
+   We provide only code for ``torch.jit.script()``. You can try ``torch.jit.trace()``
+   if you want.
+
+Export via torch.onnx.export()
+------------------------------
+
+The command for this kind of export is
+
+.. code-block:: bash
+
+   cd /tmp/icefall
+   export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+   cd egs/yesno/ASR
+
+   # tdnn/export_onnx.py requires onnx and onnxruntime
+   pip install onnx onnxruntime
+
+   # assume that "--epoch 14 --avg 2" produces the lowest WER.
+
+   ./tdnn/export_onnx.py \
+     --epoch 14 \
+     --avg 2
+
+The output logs are given below:
+
+.. code-block:: bash
+
+  2023-08-16 20:59:20,888 INFO [export_onnx.py:83] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2}
+  2023-08-16 20:59:20,888 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+  2023-08-16 20:59:20,892 INFO [export_onnx.py:100] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+  ================ Diagnostic Run torch.onnx.export version 2.0.0 ================
+  verbose: False, log level: Level.ERROR
+  ======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================
+
+  2023-08-16 20:59:21,047 INFO [export_onnx.py:127] Saved to tdnn/exp/model-epoch-14-avg-2.onnx
+  2023-08-16 20:59:21,047 INFO [export_onnx.py:136] meta_data: {'model_type': 'tdnn', 'version': '1', 'model_author': 'k2-fsa', 'comment': 'non-streaming tdnn for the yesno recipe', 'vocab_size': 4}
+  2023-08-16 20:59:21,049 INFO [export_onnx.py:140] Generate int8 quantization models
+  2023-08-16 20:59:21,075 INFO [onnx_quantizer.py:538] Quantization parameters for tensor:"/Transpose_1_output_0" not specified
+  2023-08-16 20:59:21,081 INFO [export_onnx.py:151] Saved to tdnn/exp/model-epoch-14-avg-2.int8.onnx
+
+We can see from the logs that it generates two files:
+
+  - ``tdnn/exp/model-epoch-14-avg-2.onnx`` (ONNX model with ``float32`` weights)
+  - ``tdnn/exp/model-epoch-14-avg-2.int8.onnx`` (ONNX model with ``int8`` weights)
+
+To use the generated ONNX model files for decoding with `onnxruntime`_, we can use
+
+.. code-block:: bash
+
+  # ./tdnn/onnx_pretrained.py requires kaldifeat
+  #
+  # Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
+  # for how to install kaldifeat
+
+  pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
+
+  ./tdnn/onnx_pretrained.py \
+    --nn-model ./tdnn/exp/model-epoch-14-avg-2.onnx \
+    --HLG ./data/lang_phone/HLG.pt \
+    --words-file ./data/lang_phone/words.txt \
+    download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+    download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+The output is given below:
+
+.. code-block:: bash
+
+  2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:166] {'feature_dim': 23, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'nn_model': './tdnn/exp/model-epoch-14-avg-2.onnx', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
+  2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:171] device: cpu
+  2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:173] Loading onnx model ./tdnn/exp/model-epoch-14-avg-2.onnx
+  2023-08-16 21:03:24,267 INFO [onnx_pretrained.py:176] Loading HLG from ./data/lang_phone/HLG.pt
+  2023-08-16 21:03:24,270 INFO [onnx_pretrained.py:180] Constructing Fbank computer
+  2023-08-16 21:03:24,273 INFO [onnx_pretrained.py:190] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
+  2023-08-16 21:03:24,279 INFO [onnx_pretrained.py:196] Decoding started
+  2023-08-16 21:03:24,318 INFO [onnx_pretrained.py:232]
+  download/waves_yesno/0_0_0_1_0_0_0_1.wav:
+  NO NO NO YES NO NO NO YES
+
+  download/waves_yesno/0_0_1_0_0_0_1_0.wav:
+  NO NO YES NO NO NO YES NO
+
+
+  2023-08-16 21:03:24,318 INFO [onnx_pretrained.py:234] Decoding Done
+
+.. note::
+
+   To use the ``int8`` ONNX model for decoding, please use:
+
+   .. code-block:: bash
+
+      ./tdnn/onnx_pretrained.py \
+        --nn-model ./tdnn/exp/model-epoch-14-avg-2.int8.onnx \
+        --HLG ./data/lang_phone/HLG.pt \
+        --words-file ./data/lang_phone/words.txt \
+        download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+        download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+For the more curious
+--------------------
+
+If you are wondering how to deploy the model without ``torch``, please
+continue reading. We will show how to use `sherpa-onnx`_ to run the
+exported ONNX models, which depends only on `onnxruntime`_ and does not
+depend on ``torch``.
+
+In this tutorial, we will only demonstrate the usage of `sherpa-onnx`_ with the
+pre-trained model of the `yesno`_ recipe. There are also two other frameworks
+available:
+
+  - `sherpa`_. It works with torchscript models.
+  - `sherpa-ncnn`_. It works with models exported using :ref:`icefall_export_to_ncnn` with `ncnn`_
+
+Please see `<https://k2-fsa.github.io/sherpa/>`_ for further details.
--- a/docs/source/for-dummies/training.rst
+++ b/docs/source/for-dummies/training.rst
@ -0,0 +1,39 @@
+.. _dummies_tutorial_training:
+
+Training
+========
+
+After :ref:`dummies_tutorial_data_preparation`, we can start training.
+
+The command to start the training is quite simple:
+
+.. code-block:: bash
+
+   cd /tmp/icefall
+   export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+   cd egs/yesno/ASR
+
+   # We use CPU for training by setting the following environment variable
+   export CUDA_VISIBLE_DEVICES=""
+
+   ./tdnn/train.py
+
+That's it!
+
+You can find the training logs below:
+
+.. literalinclude:: ./code/train-yesno.txt
+
+For the more curious
+--------------------
+
+.. code-block:: bash
+
+   ./tdnn/train.py --help
+
+will print the usage information about ``./tdnn/train.py``. For instance, you
+can specify the number of epochs to train and the location to save the training
+results.
+
+The training text logs are saved in ``tdnn/exp/log`` while the tensorboard
+logs are in ``tdnn/exp/tensorboard``.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -20,10 +20,13 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
   :maxdepth: 2
   :caption: Contents:

+   for-dummies/index.rst
   installation/index
+   docker/index
   faqs
   model-export/index

+
 .. toctree::
   :maxdepth: 3

@ -34,3 +37,8 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.

   contributing/index
   huggingface/index
+
+.. toctree::
+   :maxdepth: 2
+   
+   decoding-with-langugage-models/index
--- a/docs/source/installation/index.rst
+++ b/docs/source/installation/index.rst
@ -3,40 +3,28 @@
 Installation
 ============

+.. hint::

+   We also provide :ref:`icefall_docker` support, which has already set up
+   the environment for you.

-``icefall`` depends on `k2 <https://github.com/k2-fsa/k2>`_ and
-`lhotse <https://github.com/lhotse-speech/lhotse>`_.
+.. hint::
+
+  We have a colab notebook guiding you step by step to set up the environment.
+
+  |yesno colab notebook|
+
+  .. |yesno colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+     :target: https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing
+
+`icefall`_ depends on `k2`_ and `lhotse`_.

 We recommend that you use the following steps to install the dependencies.

 - (0) Install CUDA toolkit and cuDNN
- (1) Install PyTorch and torchaudio
- (2) Install k2
- (3) Install lhotse
-
-.. caution::
-
-   99% users who have issues about the installation are using conda.
-
-.. caution::
-
-   99% users who have issues about the installation are using conda.
-
-.. caution::
-
-   99% users who have issues about the installation are using conda.
-
-.. hint::
-
-   We suggest that you use ``pip install`` to install PyTorch.
-
-   You can use the following command to create a virutal environment in Python:
-
-    .. code-block:: bash
-
-        python3 -m venv ./my_env
-        source ./my_env/bin/activate
+- (1) Install `torch`_ and `torchaudio`_
+- (2) Install `k2`_
+- (3) Install `lhotse`_

 .. caution::

@ -50,27 +38,20 @@ Please refer to
 to install CUDA and cuDNN.


-(1) Install PyTorch and torchaudio
----------------------------------
+(1) Install torch and torchaudio
+--------------------------------

-Please refer `<https://pytorch.org/>`_ to install PyTorch
-and torchaudio.
-
-.. hint::
-
-   You can also go to  `<https://download.pytorch.org/whl/torch_stable.html>`_
-   to download pre-compiled wheels and install them.
+Please refer to `<https://pytorch.org/>`_ to install `torch`_ and `torchaudio`_.

 .. caution::

   Please install torch and torchaudio at the same time.

-
 (2) Install k2
 --------------

 Please refer to `<https://k2-fsa.github.io/k2/installation/index.html>`_
-to install ``k2``.
+to install `k2`_.

 .. caution::

@ -78,21 +59,18 @@ to install ``k2``.

 .. note::

-   We suggest that you install k2 from source by following
-   `<https://k2-fsa.github.io/k2/installation/from_source.html>`_
-   or
-   `<https://k2-fsa.github.io/k2/installation/for_developers.html>`_.
+   We suggest that you install k2 from pre-compiled wheels by following
+   `<https://k2-fsa.github.io/k2/installation/from_wheels.html>`_

 .. hint::

-   Please always install the latest version of k2.
+   Please always install the latest version of `k2`_.

 (3) Install lhotse
 ------------------

 Please refer to `<https://lhotse.readthedocs.io/en/latest/getting-started.html#installation>`_
-to install ``lhotse``.
-
+to install `lhotse`_.

 .. hint::

@ -100,17 +78,16 @@ to install ``lhotse``.

      pip install git+https://github.com/lhotse-speech/lhotse

-    to install the latest version of lhotse.
+    to install the latest version of `lhotse`_.

 (4) Download icefall
 --------------------

-``icefall`` is a collection of Python scripts; what you need is to download it
+`icefall`_ is a collection of Python scripts; what you need is to download it
 and set the environment variable ``PYTHONPATH`` to point to it.

-Assume you want to place ``icefall`` in the folder ``/tmp``. The
-following commands show you how to setup ``icefall``:
-
+Assume you want to place `icefall`_ in the folder ``/tmp``. The
+following commands show you how to set up `icefall`_:

 .. code-block:: bash

@ -122,285 +99,334 @@ following commands show you how to setup ``icefall``:

 .. HINT::

-  You can put several versions of ``icefall`` in the same virtual environment.
-  To switch among different versions of ``icefall``, just set ``PYTHONPATH``
+  You can put several versions of `icefall`_ in the same virtual environment.
+  To switch among different versions of `icefall`_, just set ``PYTHONPATH``
  to point to the version you want.

-
 Installation example
 --------------------

 The following shows an example about setting up the environment.

-
 (1) Create a virtual environment
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. code-block:: bash

-  $ virtualenv -p python3.8  test-icefall
+   kuangfangjun:~$ virtualenv -p python3.8 test-icefall
+   created virtual environment CPython3.8.0.final.0-64 in 9422ms
+     creator CPython3Posix(dest=/star-fj/fangjun/test-icefall, clear=False, no_vcs_ignore=False, global=False)
+     seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/star-fj/fangjun/.local/share/virtualenv)
+       added seed packages: pip==22.3.1, setuptools==65.6.3, wheel==0.38.4
+     activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator

-  created virtual environment CPython3.8.6.final.0-64 in 1540ms
-    creator CPython3Posix(dest=/ceph-fj/fangjun/test-icefall, clear=False, no_vcs_ignore=False, global=False)
-    seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/root/fangjun/.local/share/v
-  irtualenv)
-      added seed packages: pip==21.1.3, setuptools==57.4.0, wheel==0.36.2
-    activators BashActivator,CShellActivator,FishActivator,PowerShellActivator,PythonActivator,XonshActivator
+   kuangfangjun:~$ source test-icefall/bin/activate

+   (test-icefall) kuangfangjun:~$

-(2) Activate your virtual environment
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+(2) Install CUDA toolkit and cuDNN
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You need to determine the version of CUDA toolkit to install.

 .. code-block:: bash

-  $ source test-icefall/bin/activate
+   (test-icefall) kuangfangjun:~$ nvidia-smi | head -n 4

-(3) Install k2
+   Wed Jul 26 21:57:49 2023
+   +-----------------------------------------------------------------------------+
+   | NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
+   |-------------------------------+----------------------+----------------------+
+
+You can choose any CUDA version that is ``not`` greater than the version printed by ``nvidia-smi``.
+In our case, we can choose any version ``<= 11.6``.
+
+We will use ``CUDA 11.6`` in this example. Please follow
+`<https://k2-fsa.github.io/k2/installation/cuda-cudnn.html#cuda-11-6>`_
+to install CUDA toolkit and cuDNN if you have not done that before.
+
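+After installing CUDA toolkit, you can use the following command to verify it (the sample output below was captured with CUDA 10.2; with CUDA 11.6 installed, ``nvcc`` should report ``release 11.6``):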
+
+.. code-block:: bash
+
+  (test-icefall) kuangfangjun:~$ nvcc --version
+
+  nvcc: NVIDIA (R) Cuda compiler driver
+  Copyright (c) 2005-2019 NVIDIA Corporation
+  Built on Wed_Oct_23_19:24:38_PDT_2019
+  Cuda compilation tools, release 10.2, V10.2.89
+
+(3) Install torch and torchaudio
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since we have selected CUDA toolkit ``11.6``, we have to install a version of `torch`_
+that is compiled against CUDA ``11.6``. We select ``torch 1.13.0+cu116`` in this
+example.
+
+After selecting the version of `torch`_ to install, we need to also install
+a compatible version of `torchaudio`_, which is ``0.13.0+cu116`` in our case.
+
+Please refer to `<https://pytorch.org/audio/stable/installation.html#compatibility-matrix>`_
+to select an appropriate version of `torchaudio`_ to install if you use a different
+version of `torch`_.
+
+.. code-block:: bash
+
+  (test-icefall) kuangfangjun:~$ pip install torch==1.13.0+cu116 torchaudio==0.13.0+cu116 -f https://download.pytorch.org/whl/torch_stable.html
+
+  Looking in links: https://download.pytorch.org/whl/torch_stable.html
+  Collecting torch==1.13.0+cu116
+    Downloading https://download.pytorch.org/whl/cu116/torch-1.13.0%2Bcu116-cp38-cp38-linux_x86_64.whl (1983.0 MB)
+       ________________________________________ 2.0/2.0 GB 764.4 kB/s eta 0:00:00
+  Collecting torchaudio==0.13.0+cu116
+    Downloading https://download.pytorch.org/whl/cu116/torchaudio-0.13.0%2Bcu116-cp38-cp38-linux_x86_64.whl (4.2 MB)
+       ________________________________________ 4.2/4.2 MB 1.3 MB/s eta 0:00:00
+  Requirement already satisfied: typing-extensions in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from torch==1.13.0+cu116) (4.7.1)
+  Installing collected packages: torch, torchaudio
+  Successfully installed torch-1.13.0+cu116 torchaudio-0.13.0+cu116
+
+Verify that `torch`_ and `torchaudio`_ are successfully installed:
+
+.. code-block:: bash
+
+  (test-icefall) kuangfangjun:~$ python3 -c "import torch; print(torch.__version__)"
+
+  1.13.0+cu116
+
+  (test-icefall) kuangfangjun:~$ python3 -c "import torchaudio; print(torchaudio.__version__)"
+
+  0.13.0+cu116
+
+(4) Install k2
 ~~~~~~~~~~~~~~

+We will install `k2`_ from pre-compiled wheels by following
+`<https://k2-fsa.github.io/k2/installation/from_wheels.html>`_.
+
 .. code-block:: bash

-  $ pip install k2==1.4.dev20210822+cpu.torch1.9.0 -f https://k2-fsa.org/nightly/index.html
+  (test-icefall) kuangfangjun:~$ pip install k2==1.24.3.dev20230725+cuda11.6.torch1.13.0 -f https://k2-fsa.github.io/k2/cuda.html

-  Looking in links: https://k2-fsa.org/nightly/index.html
-  Collecting k2==1.4.dev20210822+cpu.torch1.9.0
-    Downloading https://k2-fsa.org/nightly/whl/k2-1.4.dev20210822%2Bcpu.torch1.9.0-cp38-cp38-linux_x86_64.whl (1.6 MB)
-       |________________________________| 1.6 MB 185 kB/s
+  Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
+  Looking in links: https://k2-fsa.github.io/k2/cuda.html
+  Collecting k2==1.24.3.dev20230725+cuda11.6.torch1.13.0
+    Downloading https://huggingface.co/csukuangfj/k2/resolve/main/ubuntu-cuda/k2-1.24.3.dev20230725%2Bcuda11.6.torch1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (104.3 MB)
+       ________________________________________ 104.3/104.3 MB 5.1 MB/s eta 0:00:00
+  Requirement already satisfied: torch==1.13.0 in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from k2==1.24.3.dev20230725+cuda11.6.torch1.13.0) (1.13.0+cu116)
  Collecting graphviz
-    Downloading graphviz-0.17-py3-none-any.whl (18 kB)
-  Collecting torch==1.9.0
-    Using cached torch-1.9.0-cp38-cp38-manylinux1_x86_64.whl (831.4 MB)
-  Collecting typing-extensions
-    Using cached typing_extensions-3.10.0.0-py3-none-any.whl (26 kB)
-  Installing collected packages: typing-extensions, torch, graphviz, k2
-  Successfully installed graphviz-0.17 k2-1.4.dev20210822+cpu.torch1.9.0 torch-1.9.0 typing-extensions-3.10.0.0
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/de/5e/fcbb22c68208d39edff467809d06c9d81d7d27426460ebc598e55130c1aa/graphviz-0.20.1-py3-none-any.whl (47 kB)
+  Requirement already satisfied: typing-extensions in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from torch==1.13.0->k2==1.24.3.dev20230725+cuda11.6.torch1.13.0) (4.7.1)
+  Installing collected packages: graphviz, k2
+  Successfully installed graphviz-0.20.1 k2-1.24.3.dev20230725+cuda11.6.torch1.13.0

-.. WARNING::
+.. hint::

-  We choose to install a CPU version of k2 for testing. You would probably want to install
-  a CUDA version of k2.
+   Please refer to `<https://k2-fsa.github.io/k2/cuda.html>`_ for the available
+   pre-compiled wheels for `k2`_.

+Verify that `k2`_ has been installed successfully:

-(4) Install lhotse
+.. code-block:: bash
+
+  (test-icefall) kuangfangjun:~$ python3 -m k2.version
+
+  Collecting environment information...
+
+  k2 version: 1.24.3
+  Build type: Release
+  Git SHA1: 4c05309499a08454997adf500b56dcc629e35ae5
+  Git date: Tue Jul 25 16:23:36 2023
+  Cuda used to build k2: 11.6
+  cuDNN used to build k2: 8.3.2
+  Python version used to build k2: 3.8
+  OS used to build k2: CentOS Linux release 7.9.2009 (Core)
+  CMake version: 3.27.0
+  GCC version: 9.3.1
+  CMAKE_CUDA_FLAGS:  -Wno-deprecated-gpu-targets   -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_35,code=sm_35  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_50,code=sm_50  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_60,code=sm_60  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_61,code=sm_61  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_70,code=sm_70  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_75,code=sm_75  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_80,code=sm_80  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_86,code=sm_86 -DONNX_NAMESPACE=onnx_c2 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=integer_sign_change,--diag_suppress=useless_using_declaration,--diag_suppress=set_but_not_used,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=implicit_return_from_non_void_function,--diag_suppress=unsigned_compare_with_zero,--diag_suppress=declared_but_not_referenced,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda -D_GLIBCXX_USE_CXX11_ABI=0 --compiler-options -Wall  --compiler-options -Wno-strict-overflow  --compiler-options -Wno-unknown-pragmas
+  CMAKE_CXX_FLAGS:  -D_GLIBCXX_USE_CXX11_ABI=0 -Wno-unused-variable  -Wno-strict-overflow
+  PyTorch version used to build k2: 1.13.0+cu116
+  PyTorch is using Cuda: 11.6
+  NVTX enabled: True
+  With CUDA: True
+  Disable debug: True
+  Sync kernels : False
+  Disable checks: False
+  Max cpu memory allocate: 214748364800 bytes (or 200.0 GB)
+  k2 abort: False
+  __file__: /star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/version/version.py
+  _k2.__file__: /star-fj/fangjun/test-icefall/lib/python3.8/site-packages/_k2.cpython-38-x86_64-linux-gnu.so
+
+(5) Install lhotse
 ~~~~~~~~~~~~~~~~~~

-.. code-block::
+.. code-block:: bash

-  $ pip install git+https://github.com/lhotse-speech/lhotse
+  (test-icefall) kuangfangjun:~$ pip install git+https://github.com/lhotse-speech/lhotse

  Collecting git+https://github.com/lhotse-speech/lhotse
-    Cloning https://github.com/lhotse-speech/lhotse to /tmp/pip-req-build-7b1b76ge
-    Running command git clone -q https://github.com/lhotse-speech/lhotse /tmp/pip-req-build-7b1b76ge
-  Collecting audioread>=2.1.9
-    Using cached audioread-2.1.9-py3-none-any.whl
-  Collecting SoundFile>=0.10
-    Using cached SoundFile-0.10.3.post1-py2.py3-none-any.whl (21 kB)
-  Collecting click>=7.1.1
-    Using cached click-8.0.1-py3-none-any.whl (97 kB)
+    Cloning https://github.com/lhotse-speech/lhotse to /tmp/pip-req-build-vq12fd5i
+    Running command git clone --filter=blob:none --quiet https://github.com/lhotse-speech/lhotse /tmp/pip-req-build-vq12fd5i
+    Resolved https://github.com/lhotse-speech/lhotse to commit 7640d663469b22cd0b36f3246ee9b849cd25e3b7
+    Installing build dependencies ... done
+    Getting requirements to build wheel ... done
+    Preparing metadata (pyproject.toml) ... done
  Collecting cytoolz>=0.10.1
-    Using cached cytoolz-0.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.9 MB)
-  Collecting dataclasses
-    Using cached dataclasses-0.6-py3-none-any.whl (14 kB)
-  Collecting h5py>=2.10.0
-    Downloading h5py-3.4.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.5 MB)
-       |________________________________| 4.5 MB 684 kB/s
-  Collecting intervaltree>=3.1.0
-    Using cached intervaltree-3.1.0-py2.py3-none-any.whl
-  Collecting lilcom>=1.1.0
-    Using cached lilcom-1.1.1-cp38-cp38-linux_x86_64.whl
-  Collecting numpy>=1.18.1
-    Using cached numpy-1.21.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.8 MB)
-  Collecting packaging
-    Using cached packaging-21.0-py3-none-any.whl (40 kB)
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/1e/3b/a7828d575aa17fb7acaf1ced49a3655aa36dad7e16eb7e6a2e4df0dda76f/cytoolz-0.12.2-cp38-cp38-
+  manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
+       ________________________________________ 2.0/2.0 MB 33.2 MB/s eta 0:00:00
  Collecting pyyaml>=5.3.1
-    Using cached PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl (662 kB)
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c8/6b/6600ac24725c7388255b2f5add93f91e58a5d7efaf4af244fdbcc11a541b/PyYAML-6.0.1-cp38-cp38-ma
+  nylinux_2_17_x86_64.manylinux2014_x86_64.whl (736 kB)
+       ________________________________________ 736.6/736.6 kB 38.6 MB/s eta 0:00:00
+  Collecting dataclasses
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/26/2f/1095cdc2868052dd1e64520f7c0d5c8c550ad297e944e641dbf1ffbb9a5d/dataclasses-0.6-py3-none-
+  any.whl (14 kB)
+  Requirement already satisfied: torchaudio in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (0.13.0+cu116)
+  Collecting lilcom>=1.1.0
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a8/65/df0a69c52bd085ca1ad4e5c4c1a5c680e25f9477d8e49316c4ff1e5084a4/lilcom-1.7-cp38-cp38-many
+  linux_2_17_x86_64.manylinux2014_x86_64.whl (87 kB)
+       ________________________________________ 87.1/87.1 kB 8.7 MB/s eta 0:00:00
  Collecting tqdm
-    Downloading tqdm-4.62.1-py2.py3-none-any.whl (76 kB)
-       |________________________________| 76 kB 2.7 MB/s
-  Collecting torchaudio==0.9.0
-    Downloading torchaudio-0.9.0-cp38-cp38-manylinux1_x86_64.whl (1.9 MB)
-       |________________________________| 1.9 MB 73.1 MB/s
-  Requirement already satisfied: torch==1.9.0 in ./test-icefall/lib/python3.8/site-packages (from torchaudio==0.9.0->lhotse===0.8.0.dev
-  -2a1410b-clean) (1.9.0)
-  Requirement already satisfied: typing-extensions in ./test-icefall/lib/python3.8/site-packages (from torch==1.9.0->torchaudio==0.9.0-
-  >lhotse===0.8.0.dev-2a1410b-clean) (3.10.0.0)
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/e6/02/a2cff6306177ae6bc73bc0665065de51dfb3b9db7373e122e2735faf0d97/tqdm-4.65.0-py3-none-any
+  .whl (77 kB)
+  Requirement already satisfied: numpy>=1.18.1 in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (1.24.4)
+  Collecting audioread>=2.1.9
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/5d/cb/82a002441902dccbe427406785db07af10182245ee639ea9f4d92907c923/audioread-3.0.0.tar.gz (
+  377 kB)
+    Preparing metadata (setup.py) ... done
+  Collecting tabulate>=0.8.1
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-
+  any.whl (35 kB)
+  Collecting click>=7.1.1
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/1a/70/e63223f8116931d365993d4a6b7ef653a4d920b41d03de7c59499962821f/click-8.1.6-py3-none-any.
+  whl (97 kB)
+       ________________________________________ 97.9/97.9 kB 8.4 MB/s eta 0:00:00
+  Collecting packaging
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/ab/c3/57f0601a2d4fe15de7a553c00adbc901425661bf048f2a22dfc500caf121/packaging-23.1-py3-none-
+  any.whl (48 kB)
+  Collecting intervaltree>=3.1.0
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/50/fb/396d568039d21344639db96d940d40eb62befe704ef849b27949ded5c3bb/intervaltree-3.1.0.tar.gz
+   (32 kB)
+    Preparing metadata (setup.py) ... done
+  Requirement already satisfied: torch in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (1.13.0+cu116)
+  Collecting SoundFile>=0.10
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/ad/bd/0602167a213d9184fc688b1086dc6d374b7ae8c33eccf169f9b50ce6568c/soundfile-0.12.1-py2.py3-
+  none-manylinux_2_17_x86_64.whl (1.3 MB)
+       ________________________________________ 1.3/1.3 MB 46.5 MB/s eta 0:00:00
  Collecting toolz>=0.8.0
-    Using cached toolz-0.11.1-py3-none-any.whl (55 kB)
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/7f/5c/922a3508f5bda2892be3df86c74f9cf1e01217c2b1f8a0ac4841d903e3e9/toolz-0.12.0-py3-none-any.whl (55 kB)
  Collecting sortedcontainers<3.0,>=2.0
-    Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
  Collecting cffi>=1.0
-    Using cached cffi-1.14.6-cp38-cp38-manylinux1_x86_64.whl (411 kB)
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/b7/8b/06f30caa03b5b3ac006de4f93478dbd0239e2a16566d81a106c322dc4f79/cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (442 kB)
+  Requirement already satisfied: typing-extensions in ./test-icefall/lib/python3.8/site-packages (from torch->lhotse==1.16.0.dev0+git.7640d66.clean) (4.7.1)
  Collecting pycparser
-    Using cached pycparser-2.20-py2.py3-none-any.whl (112 kB)
-  Collecting pyparsing>=2.0.2
-    Using cached pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)
-  Building wheels for collected packages: lhotse
-    Building wheel for lhotse (setup.py) ... done
-    Created wheel for lhotse: filename=lhotse-0.8.0.dev_2a1410b_clean-py3-none-any.whl size=342242 sha256=f683444afa4dc0881133206b4646a
-  9d0f774224cc84000f55d0a67f6e4a37997
-    Stored in directory: /tmp/pip-ephem-wheel-cache-ftu0qysz/wheels/7f/7a/8e/a0bf241336e2e3cb573e1e21e5600952d49f5162454f2e612f
-    WARNING: Built wheel for lhotse is invalid: Metadata 1.2 mandates PEP 440 version, but '0.8.0.dev-2a1410b-clean' is not
-  Failed to build lhotse
-  Installing collected packages: pycparser, toolz, sortedcontainers, pyparsing, numpy, cffi, tqdm, torchaudio, SoundFile, pyyaml, packa
-  ging, lilcom, intervaltree, h5py, dataclasses, cytoolz, click, audioread, lhotse
-      Running setup.py install for lhotse ... done
-    DEPRECATION: lhotse was installed using the legacy 'setup.py install' method, because a wheel could not be built for it. A possible
-   replacement is to fix the wheel build issue reported above. You can find discussion regarding this at https://github.com/pypa/pip/is
-  sues/8368.
-  Successfully installed SoundFile-0.10.3.post1 audioread-2.1.9 cffi-1.14.6 click-8.0.1 cytoolz-0.11.0 dataclasses-0.6 h5py-3.4.0 inter
-  valtree-3.1.0 lhotse-0.8.0.dev-2a1410b-clean lilcom-1.1.1 numpy-1.21.2 packaging-21.0 pycparser-2.20 pyparsing-2.4.7 pyyaml-5.4.1 sor
-  tedcontainers-2.4.0 toolz-0.11.1 torchaudio-0.9.0 tqdm-4.62.1
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/62/d5/5f610ebe421e85889f2e55e33b7f9a6795bd982198517d912eb1c76e1a53/pycparser-2.21-py2.py3-none-any.whl (118 kB)
+  Building wheels for collected packages: lhotse, audioread, intervaltree
+    Building wheel for lhotse (pyproject.toml) ... done
+    Created wheel for lhotse: filename=lhotse-1.16.0.dev0+git.7640d66.clean-py3-none-any.whl size=687627 sha256=cbf0a4d2d0b639b33b91637a4175bc251d6a021a069644ecb1a9f2b3a83d072a
+    Stored in directory: /tmp/pip-ephem-wheel-cache-wwtk90_m/wheels/7f/7a/8e/a0bf241336e2e3cb573e1e21e5600952d49f5162454f2e612f
+    Building wheel for audioread (setup.py) ... done
+    Created wheel for audioread: filename=audioread-3.0.0-py3-none-any.whl size=23704 sha256=5e2d3537c96ce9cf0f645a654c671163707bf8cb8d9e358d0e2b0939a85ff4c2
+    Stored in directory: /star-fj/fangjun/.cache/pip/wheels/e2/c3/9c/f19ae5a03f8862d9f0776b0c0570f1fdd60a119d90954e3f39
+    Building wheel for intervaltree (setup.py) ... done
+    Created wheel for intervaltree: filename=intervaltree-3.1.0-py2.py3-none-any.whl size=26098 sha256=2604170976cfffe0d2f678cb1a6e5b525f561cd50babe53d631a186734fec9f9
+    Stored in directory: /star-fj/fangjun/.cache/pip/wheels/f3/ed/2b/c179ebfad4e15452d6baef59737f27beb9bfb442e0620f7271
+  Successfully built lhotse audioread intervaltree
+  Installing collected packages: sortedcontainers, dataclasses, tqdm, toolz, tabulate, pyyaml, pycparser, packaging, lilcom, intervaltree, click, audioread, cytoolz, cffi, SoundFile, lhotse
+  Successfully installed SoundFile-0.12.1 audioread-3.0.0 cffi-1.15.1 click-8.1.6 cytoolz-0.12.2 dataclasses-0.6 intervaltree-3.1.0 lhotse-1.16.0.dev0+git.7640d66.clean lilcom-1.7 packaging-23.1 pycparser-2.21 pyyaml-6.0.1 sortedcontainers-2.4.0 tabulate-0.9.0 toolz-0.12.0 tqdm-4.65.0

-(5) Download icefall
+
+Verify that `lhotse`_ has been installed successfully:
+
+.. code-block:: bash
+
+  (test-icefall) kuangfangjun:~$ python3 -c "import lhotse; print(lhotse.__version__)"
+
+  1.16.0.dev+git.7640d66.clean
+
+(6) Download icefall
 ~~~~~~~~~~~~~~~~~~~~

-.. code-block::
+.. code-block:: bash

-  $ cd /tmp
-  $ git clone https://github.com/k2-fsa/icefall
+  (test-icefall) kuangfangjun:~$ cd /tmp/
+
+  (test-icefall) kuangfangjun:tmp$ git clone https://github.com/k2-fsa/icefall

  Cloning into 'icefall'...
-  remote: Enumerating objects: 500, done.
-  remote: Counting objects: 100% (500/500), done.
-  remote: Compressing objects: 100% (308/308), done.
-  remote: Total 500 (delta 263), reused 307 (delta 102), pack-reused 0
-  Receiving objects: 100% (500/500), 172.49 KiB | 385.00 KiB/s, done.
-  Resolving deltas: 100% (263/263), done.
+  remote: Enumerating objects: 12942, done.
+  remote: Counting objects: 100% (67/67), done.
+  remote: Compressing objects: 100% (56/56), done.
+  remote: Total 12942 (delta 17), reused 35 (delta 6), pack-reused 12875
+  Receiving objects: 100% (12942/12942), 14.77 MiB | 9.29 MiB/s, done.
+  Resolving deltas: 100% (8835/8835), done.

-  $ cd icefall
-  $ pip install -r requirements.txt
-
-  Collecting kaldilm
-    Downloading kaldilm-1.8.tar.gz (48 kB)
-       |________________________________| 48 kB 574 kB/s
-  Collecting kaldialign
-    Using cached kaldialign-0.2-cp38-cp38-linux_x86_64.whl
-  Collecting sentencepiece>=0.1.96
-    Using cached sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
-  Collecting tensorboard
-    Using cached tensorboard-2.6.0-py3-none-any.whl (5.6 MB)
-  Requirement already satisfied: setuptools>=41.0.0 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r
-  requirements.txt (line 4)) (57.4.0)
-  Collecting absl-py>=0.4
-    Using cached absl_py-0.13.0-py3-none-any.whl (132 kB)
-  Collecting google-auth-oauthlib<0.5,>=0.4.1
-    Using cached google_auth_oauthlib-0.4.5-py2.py3-none-any.whl (18 kB)
-  Collecting grpcio>=1.24.3
-    Using cached grpcio-1.39.0-cp38-cp38-manylinux2014_x86_64.whl (4.3 MB)
-  Requirement already satisfied: wheel>=0.26 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r require
-  ments.txt (line 4)) (0.36.2)
-  Requirement already satisfied: numpy>=1.12.0 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r requi
-  rements.txt (line 4)) (1.21.2)
-  Collecting protobuf>=3.6.0
-    Using cached protobuf-3.17.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
-  Collecting werkzeug>=0.11.15
-    Using cached Werkzeug-2.0.1-py3-none-any.whl (288 kB)
-  Collecting tensorboard-data-server<0.7.0,>=0.6.0
-    Using cached tensorboard_data_server-0.6.1-py3-none-manylinux2010_x86_64.whl (4.9 MB)
-  Collecting google-auth<2,>=1.6.3
-    Downloading google_auth-1.35.0-py2.py3-none-any.whl (152 kB)
-       |________________________________| 152 kB 1.4 MB/s
-  Collecting requests<3,>=2.21.0
-    Using cached requests-2.26.0-py2.py3-none-any.whl (62 kB)
-  Collecting tensorboard-plugin-wit>=1.6.0
-    Using cached tensorboard_plugin_wit-1.8.0-py3-none-any.whl (781 kB)
-  Collecting markdown>=2.6.8
-    Using cached Markdown-3.3.4-py3-none-any.whl (97 kB)
-  Collecting six
-    Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
-  Collecting cachetools<5.0,>=2.0.0
-    Using cached cachetools-4.2.2-py3-none-any.whl (11 kB)
-  Collecting rsa<5,>=3.1.4
-    Using cached rsa-4.7.2-py3-none-any.whl (34 kB)
-  Collecting pyasn1-modules>=0.2.1
-    Using cached pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)
-  Collecting requests-oauthlib>=0.7.0
-    Using cached requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)
-  Collecting pyasn1<0.5.0,>=0.4.6
-    Using cached pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)
-  Collecting urllib3<1.27,>=1.21.1
-    Using cached urllib3-1.26.6-py2.py3-none-any.whl (138 kB)
-  Collecting certifi>=2017.4.17
-    Using cached certifi-2021.5.30-py2.py3-none-any.whl (145 kB)
-  Collecting charset-normalizer~=2.0.0
-    Using cached charset_normalizer-2.0.4-py3-none-any.whl (36 kB)
-  Collecting idna<4,>=2.5
-    Using cached idna-3.2-py3-none-any.whl (59 kB)
-  Collecting oauthlib>=3.0.0
-    Using cached oauthlib-3.1.1-py2.py3-none-any.whl (146 kB)
-  Building wheels for collected packages: kaldilm
-    Building wheel for kaldilm (setup.py) ... done
-    Created wheel for kaldilm: filename=kaldilm-1.8-cp38-cp38-linux_x86_64.whl size=897233 sha256=eccb906cafcd45bf9a7e1a1718e4534254bfb
-  f4c0d0cbc66eee6c88d68a63862
-    Stored in directory: /root/fangjun/.cache/pip/wheels/85/7d/63/f2dd586369b8797cb36d213bf3a84a789eeb92db93d2e723c9
-  Successfully built kaldilm
-  Installing collected packages: urllib3, pyasn1, idna, charset-normalizer, certifi, six, rsa, requests, pyasn1-modules, oauthlib, cach
-  etools, requests-oauthlib, google-auth, werkzeug, tensorboard-plugin-wit, tensorboard-data-server, protobuf, markdown, grpcio, google
-  -auth-oauthlib, absl-py, tensorboard, sentencepiece, kaldilm, kaldialign
-  Successfully installed absl-py-0.13.0 cachetools-4.2.2 certifi-2021.5.30 charset-normalizer-2.0.4 google-auth-1.35.0 google-auth-oaut
-  hlib-0.4.5 grpcio-1.39.0 idna-3.2 kaldialign-0.2 kaldilm-1.8 markdown-3.3.4 oauthlib-3.1.1 protobuf-3.17.3 pyasn1-0.4.8 pyasn1-module
-  s-0.2.8 requests-2.26.0 requests-oauthlib-1.3.0 rsa-4.7.2 sentencepiece-0.1.96 six-1.16.0 tensorboard-2.6.0 tensorboard-data-server-0
-  .6.1 tensorboard-plugin-wit-1.8.0 urllib3-1.26.6 werkzeug-2.0.1
+  (test-icefall) kuangfangjun:tmp$ cd icefall/

+  (test-icefall) kuangfangjun:icefall$ pip install -r ./requirements.txt

 Test Your Installation
 ----------------------

 To test that your installation is successful, let us run
 the `yesno recipe <https://github.com/k2-fsa/icefall/tree/master/egs/yesno/ASR>`_
-on CPU.
+on ``CPU``.

 Data preparation
 ~~~~~~~~~~~~~~~~

 .. code-block:: bash

-  $ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
-  $ cd /tmp/icefall
-  $ cd egs/yesno/ASR
-  $ ./prepare.sh
+  (test-icefall) kuangfangjun:icefall$ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+
+  (test-icefall) kuangfangjun:icefall$ cd /tmp/icefall
+
+  (test-icefall) kuangfangjun:icefall$ cd egs/yesno/ASR
+
+  (test-icefall) kuangfangjun:ASR$ ./prepare.sh
+

 The log of running ``./prepare.sh`` is:

 .. code-block::

-   2023-05-12 17:55:21 (prepare.sh:27:main) dl_dir: /tmp/icefall/egs/yesno/ASR/download
-   2023-05-12 17:55:21 (prepare.sh:30:main) Stage 0: Download data
-   /tmp/icefall/egs/yesno/ASR/download/waves_yesno.tar.gz: 100%|_______________________________________________________________| 4.70M/4.70M [06:54<00:00, 11.4kB/s]
-   2023-05-12 18:02:19 (prepare.sh:39:main) Stage 1: Prepare yesno manifest
-   2023-05-12 18:02:21 (prepare.sh:45:main) Stage 2: Compute fbank for yesno
-   2023-05-12 18:02:23,199 INFO [compute_fbank_yesno.py:65] Processing train
-   Extracting and storing features: 100%|_______________________________________________________________| 90/90 [00:00<00:00, 212.60it/s]
-   2023-05-12 18:02:23,640 INFO [compute_fbank_yesno.py:65] Processing test
-   Extracting and storing features: 100%|_______________________________________________________________| 30/30 [00:00<00:00, 304.53it/s]
-   2023-05-12 18:02:24 (prepare.sh:51:main) Stage 3: Prepare lang
-   2023-05-12 18:02:26 (prepare.sh:66:main) Stage 4: Prepare G
-   /project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):79
-   [I] Reading \data\ section.
-   /project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):140
-   [I] Reading \1-grams: section.
-   2023-05-12 18:02:26 (prepare.sh:92:main) Stage 5: Compile HLG
-   2023-05-12 18:02:28,581 INFO [compile_hlg.py:124] Processing data/lang_phone
-   2023-05-12 18:02:28,582 INFO [lexicon.py:171] Converting L.pt to Linv.pt
-   2023-05-12 18:02:28,609 INFO [compile_hlg.py:48] Building ctc_topo. max_token_id: 3
-   2023-05-12 18:02:28,610 INFO [compile_hlg.py:52] Loading G.fst.txt
-   2023-05-12 18:02:28,611 INFO [compile_hlg.py:62] Intersecting L and G
-   2023-05-12 18:02:28,613 INFO [compile_hlg.py:64] LG shape: (4, None)
-   2023-05-12 18:02:28,613 INFO [compile_hlg.py:66] Connecting LG
-   2023-05-12 18:02:28,614 INFO [compile_hlg.py:68] LG shape after k2.connect: (4, None)
-   2023-05-12 18:02:28,614 INFO [compile_hlg.py:70] <class 'torch.Tensor'>
-   2023-05-12 18:02:28,614 INFO [compile_hlg.py:71] Determinizing LG
-   2023-05-12 18:02:28,615 INFO [compile_hlg.py:74] <class '_k2.ragged.RaggedTensor'>
-   2023-05-12 18:02:28,615 INFO [compile_hlg.py:76] Connecting LG after k2.determinize
-   2023-05-12 18:02:28,615 INFO [compile_hlg.py:79] Removing disambiguation symbols on LG
-   2023-05-12 18:02:28,616 INFO [compile_hlg.py:91] LG shape after k2.remove_epsilon: (6, None)
-   2023-05-12 18:02:28,617 INFO [compile_hlg.py:96] Arc sorting LG
-   2023-05-12 18:02:28,617 INFO [compile_hlg.py:99] Composing H and LG
-   2023-05-12 18:02:28,619 INFO [compile_hlg.py:106] Connecting LG
-   2023-05-12 18:02:28,619 INFO [compile_hlg.py:109] Arc sorting LG
-   2023-05-12 18:02:28,619 INFO [compile_hlg.py:111] HLG.shape: (8, None)
-   2023-05-12 18:02:28,619 INFO [compile_hlg.py:127] Saving HLG.pt to data/lang_phone
-
+  2023-07-27 12:41:39 (prepare.sh:27:main) dl_dir: /tmp/icefall/egs/yesno/ASR/download
+  2023-07-27 12:41:39 (prepare.sh:30:main) Stage 0: Download data
+  /tmp/icefall/egs/yesno/ASR/download/waves_yesno.tar.gz: 100%|___________________________________________________| 4.70M/4.70M [00:00<00:00, 11.1MB/s]
+  2023-07-27 12:41:46 (prepare.sh:39:main) Stage 1: Prepare yesno manifest
+  2023-07-27 12:41:50 (prepare.sh:45:main) Stage 2: Compute fbank for yesno
+  2023-07-27 12:41:55,718 INFO [compute_fbank_yesno.py:65] Processing train
+  Extracting and storing features: 100%|_______________________________________________________________________________| 90/90 [00:01<00:00, 87.82it/s]
+  2023-07-27 12:41:56,778 INFO [compute_fbank_yesno.py:65] Processing test
+  Extracting and storing features: 100%|______________________________________________________________________________| 30/30 [00:00<00:00, 256.92it/s]
+  2023-07-27 12:41:57 (prepare.sh:51:main) Stage 3: Prepare lang
+  2023-07-27 12:42:02 (prepare.sh:66:main) Stage 4: Prepare G
+  /project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):79
+  [I] Reading \data\ section.
+  /project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):140
+  [I] Reading \1-grams: section.
+  2023-07-27 12:42:02 (prepare.sh:92:main) Stage 5: Compile HLG
+  2023-07-27 12:42:07,275 INFO [compile_hlg.py:124] Processing data/lang_phone
+  2023-07-27 12:42:07,276 INFO [lexicon.py:171] Converting L.pt to Linv.pt
+  2023-07-27 12:42:07,309 INFO [compile_hlg.py:48] Building ctc_topo. max_token_id: 3
+  2023-07-27 12:42:07,310 INFO [compile_hlg.py:52] Loading G.fst.txt
+  2023-07-27 12:42:07,314 INFO [compile_hlg.py:62] Intersecting L and G
+  2023-07-27 12:42:07,323 INFO [compile_hlg.py:64] LG shape: (4, None)
+  2023-07-27 12:42:07,323 INFO [compile_hlg.py:66] Connecting LG
+  2023-07-27 12:42:07,323 INFO [compile_hlg.py:68] LG shape after k2.connect: (4, None)
+  2023-07-27 12:42:07,323 INFO [compile_hlg.py:70] <class 'torch.Tensor'>
+  2023-07-27 12:42:07,323 INFO [compile_hlg.py:71] Determinizing LG
+  2023-07-27 12:42:07,341 INFO [compile_hlg.py:74] <class '_k2.ragged.RaggedTensor'>
+  2023-07-27 12:42:07,341 INFO [compile_hlg.py:76] Connecting LG after k2.determinize
+  2023-07-27 12:42:07,341 INFO [compile_hlg.py:79] Removing disambiguation symbols on LG
+  2023-07-27 12:42:07,354 INFO [compile_hlg.py:91] LG shape after k2.remove_epsilon: (6, None)
+  2023-07-27 12:42:07,445 INFO [compile_hlg.py:96] Arc sorting LG
+  2023-07-27 12:42:07,445 INFO [compile_hlg.py:99] Composing H and LG
+  2023-07-27 12:42:07,446 INFO [compile_hlg.py:106] Connecting LG
+  2023-07-27 12:42:07,446 INFO [compile_hlg.py:109] Arc sorting LG
+  2023-07-27 12:42:07,447 INFO [compile_hlg.py:111] HLG.shape: (8, None)
+  2023-07-27 12:42:07,447 INFO [compile_hlg.py:127] Saving HLG.pt to data/lang_phone

 Training
 ~~~~~~~~
@ -409,12 +435,13 @@ Now let us run the training part:

 .. code-block::

-  $ export CUDA_VISIBLE_DEVICES=""
-  $ ./tdnn/train.py
+  (test-icefall) kuangfangjun:ASR$ export CUDA_VISIBLE_DEVICES=""
+
+  (test-icefall) kuangfangjun:ASR$ ./tdnn/train.py

 .. CAUTION::

-  We use ``export CUDA_VISIBLE_DEVICES=""`` so that ``icefall`` uses CPU
+  We use ``export CUDA_VISIBLE_DEVICES=""`` so that `icefall`_ uses CPU
  even if there are GPUs available.

 .. hint::
@ -432,53 +459,52 @@ The training log is given below:

 .. code-block::

-   2023-05-12 18:04:59,759 INFO [train.py:481] Training started
-   2023-05-12 18:04:59,759 INFO [train.py:482] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 
-   'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 
-   'reduction': 'sum', 'use_double_scores': True, 'world_size': 1, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'seed': 42, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0,
-   'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 
-   'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '3b7f09fa35e72589914f67089c0da9f196a92ca4', 'k2-git-date': 'Mon May 8 22:58:45 2023', 
-   'lhotse-version': '1.15.0.dev+git.6fcfced.clean', 'torch-version': '2.0.0+cu118', 'torch-cuda-available': False, 'torch-cuda-version': '11.8', 'python-version': '3.1', 'icefall-git-branch': 'master', 
-   'icefall-git-sha1': '30bde4b-clean', 'icefall-git-date': 'Thu May 11 17:37:47 2023', 'icefall-path': '/tmp/icefall', 
-   'k2-path': 'tmp/lib/python3.10/site-packages/k2-1.24.3.dev20230512+cuda11.8.torch2.0.0-py3.10-linux-x86_64.egg/k2/__init__.py', 
-   'lhotse-path': 'tmp/lib/python3.10/site-packages/lhotse/__init__.py', 'hostname': 'host', 'IP address': '0.0.0.0'}}
-   2023-05-12 18:04:59,761 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
-   2023-05-12 18:04:59,764 INFO [train.py:495] device: cpu
-   2023-05-12 18:04:59,791 INFO [asr_datamodule.py:146] About to get train cuts
-   2023-05-12 18:04:59,791 INFO [asr_datamodule.py:244] About to get train cuts
-   2023-05-12 18:04:59,852 INFO [asr_datamodule.py:149] About to create train dataset
-   2023-05-12 18:04:59,852 INFO [asr_datamodule.py:199] Using SingleCutSampler.
-   2023-05-12 18:04:59,852 INFO [asr_datamodule.py:205] About to create train dataloader
-   2023-05-12 18:04:59,853 INFO [asr_datamodule.py:218] About to get test cuts
-   2023-05-12 18:04:59,853 INFO [asr_datamodule.py:252] About to get test cuts
-   2023-05-12 18:04:59,986 INFO [train.py:422] Epoch 0, batch 0, loss[loss=1.065, over 2436.00 frames. ], tot_loss[loss=1.065, over 2436.00 frames. ], batch size: 4
-   2023-05-12 18:05:00,352 INFO [train.py:422] Epoch 0, batch 10, loss[loss=0.4561, over 2828.00 frames. ], tot_loss[loss=0.7076, over 22192.90 frames. ], batch size: 4
-   2023-05-12 18:05:00,691 INFO [train.py:444] Epoch 0, validation loss=0.9002, over 18067.00 frames.
-   2023-05-12 18:05:00,996 INFO [train.py:422] Epoch 0, batch 20, loss[loss=0.2555, over 2695.00 frames. ], tot_loss[loss=0.484, over 34971.47 frames. ], batch size: 5
-   2023-05-12 18:05:01,217 INFO [train.py:444] Epoch 0, validation loss=0.4688, over 18067.00 frames.
-   2023-05-12 18:05:01,251 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-0.pt
-   2023-05-12 18:05:01,389 INFO [train.py:422] Epoch 1, batch 0, loss[loss=0.2532, over 2436.00 frames. ], tot_loss[loss=0.2532, over 2436.00 frames. ], batch size: 4
-   2023-05-12 18:05:01,637 INFO [train.py:422] Epoch 1, batch 10, loss[loss=0.1139, over 2828.00 frames. ], tot_loss[loss=0.1592, over 22192.90 frames. ], batch size: 4
-   2023-05-12 18:05:01,859 INFO [train.py:444] Epoch 1, validation loss=0.1629, over 18067.00 frames.
-   2023-05-12 18:05:02,094 INFO [train.py:422] Epoch 1, batch 20, loss[loss=0.0767, over 2695.00 frames. ], tot_loss[loss=0.118, over 34971.47 frames. ], batch size: 5
-   2023-05-12 18:05:02,350 INFO [train.py:444] Epoch 1, validation loss=0.06778, over 18067.00 frames.
-   2023-05-12 18:05:02,395 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-1.pt
+    2023-07-27 12:50:51,936 INFO [train.py:481] Training started
+    2023-07-27 12:50:51,936 INFO [train.py:482] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'world_size': 1, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'seed': 42, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d66.clean', 'torch-version': '1.13.0+cu116', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '3fb0a43-clean', 'icefall-git-date': 'Thu Jul 27 12:36:05 2023', 'icefall-path': '/tmp/icefall', 'k2-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-1-1220091118-57c4d55446-sph26', 'IP address': '10.177.77.20'}}
+    2023-07-27 12:50:51,941 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+    2023-07-27 12:50:51,949 INFO [train.py:495] device: cpu
+    2023-07-27 12:50:51,965 INFO [asr_datamodule.py:146] About to get train cuts
+    2023-07-27 12:50:51,965 INFO [asr_datamodule.py:244] About to get train cuts
+    2023-07-27 12:50:51,967 INFO [asr_datamodule.py:149] About to create train dataset
+    2023-07-27 12:50:51,967 INFO [asr_datamodule.py:199] Using SingleCutSampler.
+    2023-07-27 12:50:51,967 INFO [asr_datamodule.py:205] About to create train dataloader
+    2023-07-27 12:50:51,968 INFO [asr_datamodule.py:218] About to get test cuts
+    2023-07-27 12:50:51,968 INFO [asr_datamodule.py:252] About to get test cuts
+    2023-07-27 12:50:52,565 INFO [train.py:422] Epoch 0, batch 0, loss[loss=1.065, over 2436.00 frames. ], tot_loss[loss=1.065, over 2436.00 frames. ], batch size: 4
+    2023-07-27 12:50:53,681 INFO [train.py:422] Epoch 0, batch 10, loss[loss=0.4561, over 2828.00 frames. ], tot_loss[loss=0.7076, over 22192.90 frames.], batch size: 4
+    2023-07-27 12:50:54,167 INFO [train.py:444] Epoch 0, validation loss=0.9002, over 18067.00 frames.
+    2023-07-27 12:50:55,011 INFO [train.py:422] Epoch 0, batch 20, loss[loss=0.2555, over 2695.00 frames. ], tot_loss[loss=0.484, over 34971.47 frames. ], batch size: 5
+    2023-07-27 12:50:55,331 INFO [train.py:444] Epoch 0, validation loss=0.4688, over 18067.00 frames.
+    2023-07-27 12:50:55,368 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-0.pt
+    2023-07-27 12:50:55,633 INFO [train.py:422] Epoch 1, batch 0, loss[loss=0.2532, over 2436.00 frames. ], tot_loss[loss=0.2532, over 2436.00 frames. ],
+     batch size: 4
+    2023-07-27 12:50:56,242 INFO [train.py:422] Epoch 1, batch 10, loss[loss=0.1139, over 2828.00 frames. ], tot_loss[loss=0.1592, over 22192.90 frames.], batch size: 4
+    2023-07-27 12:50:56,522 INFO [train.py:444] Epoch 1, validation loss=0.1627, over 18067.00 frames.
+    2023-07-27 12:50:57,209 INFO [train.py:422] Epoch 1, batch 20, loss[loss=0.07055, over 2695.00 frames. ], tot_loss[loss=0.1175, over 34971.47 frames.], batch size: 5
+    2023-07-27 12:50:57,600 INFO [train.py:444] Epoch 1, validation loss=0.07091, over 18067.00 frames.
+    2023-07-27 12:50:57,640 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-1.pt
+    2023-07-27 12:50:57,847 INFO [train.py:422] Epoch 2, batch 0, loss[loss=0.07731, over 2436.00 frames. ], tot_loss[loss=0.07731, over 2436.00 frames.], batch size: 4
+    2023-07-27 12:50:58,427 INFO [train.py:422] Epoch 2, batch 10, loss[loss=0.04391, over 2828.00 frames. ], tot_loss[loss=0.05341, over 22192.90 frames. ], batch size: 4
+    2023-07-27 12:50:58,884 INFO [train.py:444] Epoch 2, validation loss=0.04384, over 18067.00 frames.
+    2023-07-27 12:50:59,387 INFO [train.py:422] Epoch 2, batch 20, loss[loss=0.03458, over 2695.00 frames. ], tot_loss[loss=0.04616, over 34971.47 frames. ], batch size: 5
+    2023-07-27 12:50:59,707 INFO [train.py:444] Epoch 2, validation loss=0.03379, over 18067.00 frames.
+    2023-07-27 12:50:59,758 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-2.pt

-  ... ...
+      ... ...

-   2023-05-12 18:05:14,789 INFO [train.py:422] Epoch 13, batch 0, loss[loss=0.01056, over 2436.00 frames. ], tot_loss[loss=0.01056, over 2436.00 frames. ], batch size: 4
-   2023-05-12 18:05:15,016 INFO [train.py:422] Epoch 13, batch 10, loss[loss=0.009022, over 2828.00 frames. ], tot_loss[loss=0.009985, over 22192.90 frames. ], batch size: 4
-   2023-05-12 18:05:15,271 INFO [train.py:444] Epoch 13, validation loss=0.01088, over 18067.00 frames.
-   2023-05-12 18:05:15,497 INFO [train.py:422] Epoch 13, batch 20, loss[loss=0.01174, over 2695.00 frames. ], tot_loss[loss=0.01077, over 34971.47 frames. ], batch size: 5
-   2023-05-12 18:05:15,747 INFO [train.py:444] Epoch 13, validation loss=0.01087, over 18067.00 frames.
-   2023-05-12 18:05:15,783 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-13.pt
-   2023-05-12 18:05:15,921 INFO [train.py:422] Epoch 14, batch 0, loss[loss=0.01045, over 2436.00 frames. ], tot_loss[loss=0.01045, over 2436.00 frames. ], batch size: 4
-   2023-05-12 18:05:16,146 INFO [train.py:422] Epoch 14, batch 10, loss[loss=0.008957, over 2828.00 frames. ], tot_loss[loss=0.009903, over 22192.90 frames. ], batch size: 4
-   2023-05-12 18:05:16,374 INFO [train.py:444] Epoch 14, validation loss=0.01092, over 18067.00 frames.
-   2023-05-12 18:05:16,598 INFO [train.py:422] Epoch 14, batch 20, loss[loss=0.01169, over 2695.00 frames. ], tot_loss[loss=0.01065, over 34971.47 frames. ], batch size: 5
-   2023-05-12 18:05:16,824 INFO [train.py:444] Epoch 14, validation loss=0.01077, over 18067.00 frames.
-   2023-05-12 18:05:16,862 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-14.pt
-   2023-05-12 18:05:16,865 INFO [train.py:555] Done!
+    2023-07-27 12:51:23,433 INFO [train.py:422] Epoch 13, batch 0, loss[loss=0.01054, over 2436.00 frames. ], tot_loss[loss=0.01054, over 2436.00 frames. ], batch size: 4
+    2023-07-27 12:51:23,980 INFO [train.py:422] Epoch 13, batch 10, loss[loss=0.009014, over 2828.00 frames. ], tot_loss[loss=0.009974, over 22192.90 frames. ], batch size: 4
+    2023-07-27 12:51:24,489 INFO [train.py:444] Epoch 13, validation loss=0.01085, over 18067.00 frames.
+    2023-07-27 12:51:25,258 INFO [train.py:422] Epoch 13, batch 20, loss[loss=0.01172, over 2695.00 frames. ], tot_loss[loss=0.01055, over 34971.47 frames. ], batch size: 5
+    2023-07-27 12:51:25,621 INFO [train.py:444] Epoch 13, validation loss=0.01074, over 18067.00 frames.
+    2023-07-27 12:51:25,699 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-13.pt
+    2023-07-27 12:51:25,866 INFO [train.py:422] Epoch 14, batch 0, loss[loss=0.01044, over 2436.00 frames. ], tot_loss[loss=0.01044, over 2436.00 frames. ], batch size: 4
+    2023-07-27 12:51:26,844 INFO [train.py:422] Epoch 14, batch 10, loss[loss=0.008942, over 2828.00 frames. ], tot_loss[loss=0.01, over 22192.90 frames. ], batch size: 4
+    2023-07-27 12:51:27,221 INFO [train.py:444] Epoch 14, validation loss=0.01082, over 18067.00 frames.
+    2023-07-27 12:51:27,970 INFO [train.py:422] Epoch 14, batch 20, loss[loss=0.01169, over 2695.00 frames. ], tot_loss[loss=0.01054, over 34971.47 frames. ], batch size: 5
+    2023-07-27 12:51:28,247 INFO [train.py:444] Epoch 14, validation loss=0.01073, over 18067.00 frames.
+    2023-07-27 12:51:28,323 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-14.pt
+    2023-07-27 12:51:28,326 INFO [train.py:555] Done!

 Decoding
 ~~~~~~~~
@ -487,42 +513,32 @@ Let us use the trained model to decode the test set:

 .. code-block::

-  $ ./tdnn/decode.py
+  (test-icefall) kuangfangjun:ASR$ ./tdnn/decode.py

-The decoding log is:
+  2023-07-27 12:55:12,840 INFO [decode.py:263] Decoding started
+  2023-07-27 12:55:12,840 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d66.clean', 'torch-version': '1.13.0+cu116', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '3fb0a43-clean', 'icefall-git-date': 'Thu Jul 27 12:36:05 2023', 'icefall-path': '/tmp/icefall', 'k2-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-1-1220091118-57c4d55446-sph26', 'IP address': '10.177.77.20'}}
+  2023-07-27 12:55:12,841 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+  2023-07-27 12:55:12,855 INFO [decode.py:273] device: cpu
+  2023-07-27 12:55:12,868 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+  2023-07-27 12:55:12,882 INFO [asr_datamodule.py:218] About to get test cuts
+  2023-07-27 12:55:12,883 INFO [asr_datamodule.py:252] About to get test cuts
+  2023-07-27 12:55:13,157 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
+  2023-07-27 12:55:13,701 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
+  2023-07-27 12:55:13,702 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
+  2023-07-27 12:55:13,704 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
+  2023-07-27 12:55:13,704 INFO [decode.py:316] Done!

-.. code-block::

-   2023-05-12 18:08:30,482 INFO [decode.py:263] Decoding started
-   2023-05-12 18:08:30,483 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 
-   'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'), 
-   'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 
-   'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '3b7f09fa35e72589914f67089c0da9f196a92ca4', 'k2-git-date': 'Mon May 8 22:58:45 2023', 
-   'lhotse-version': '1.15.0.dev+git.6fcfced.clean', 'torch-version': '2.0.0+cu118', 'torch-cuda-available': False, 'torch-cuda-version': '11.8', 'python-version': '3.1', 'icefall-git-branch': 'master', 
-   'icefall-git-sha1': '30bde4b-clean', 'icefall-git-date': 'Thu May 11 17:37:47 2023', 'icefall-path': '/tmp/icefall', 
-   'k2-path': '/tmp/lib/python3.10/site-packages/k2-1.24.3.dev20230512+cuda11.8.torch2.0.0-py3.10-linux-x86_64.egg/k2/__init__.py', 
-   'lhotse-path': '/tmp/lib/python3.10/site-packages/lhotse/__init__.py', 'hostname': 'host', 'IP address': '0.0.0.0'}}
-   2023-05-12 18:08:30,483 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
-   2023-05-12 18:08:30,487 INFO [decode.py:273] device: cpu
-   2023-05-12 18:08:30,513 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
-   2023-05-12 18:08:30,521 INFO [asr_datamodule.py:218] About to get test cuts
-   2023-05-12 18:08:30,521 INFO [asr_datamodule.py:252] About to get test cuts
-   2023-05-12 18:08:30,675 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
-   2023-05-12 18:08:30,923 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
-   2023-05-12 18:08:30,924 INFO [utils.py:558] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
-   2023-05-12 18:08:30,925 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
-   2023-05-12 18:08:30,925 INFO [decode.py:316] Done!
-
-**Congratulations!** You have successfully setup the environment and have run the first recipe in ``icefall``.
+**Congratulations!** You have successfully set up the environment and have run the first recipe in `icefall`_.

 Have fun with ``icefall``!

 YouTube Video
 -------------

-We provide the following YouTube video showing how to install ``icefall``.
+We provide the following YouTube video showing how to install `icefall`_.
 It also shows how to debug various problems that you may encounter while
-using ``icefall``.
+using `icefall`_.

 .. note::

--- a/docs/source/model-export/export-model-state-dict.rst
+++ b/docs/source/model-export/export-model-state-dict.rst
@ -41,7 +41,7 @@ as an example.

  ./pruned_transducer_stateless3/export.py \
    --exp-dir ./pruned_transducer_stateless3/exp \
-    --bpe-model data/lang_bpe_500/bpe.model \
+    --tokens data/lang_bpe_500/tokens.txt \
    --epoch 20 \
    --avg 10

@ -78,7 +78,7 @@ In each recipe, there is also a file ``pretrained.py``, which can use

   ./pruned_transducer_stateless3/pretrained.py \
      --checkpoint ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/pretrained-iter-1224000-avg-14.pt \
-      --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model \
+      --tokens ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \
      --method greedy_search \
      ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \
      ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \
--- a/docs/source/model-export/export-ncnn-conv-emformer.rst
+++ b/docs/source/model-export/export-ncnn-conv-emformer.rst
@ -125,7 +125,7 @@ Python code. We have also set up ``PATH`` so that you can use
 .. caution::

  Please don't use `<https://github.com/tencent/ncnn>`_.
-  We have made some modifications to the offical `ncnn`_.
+  We have made some modifications to the official `ncnn`_.

  We will synchronize `<https://github.com/csukuangfj/ncnn>`_ periodically
  with the official one.
@ -153,11 +153,10 @@ Next, we use the following code to export our model:

  ./conv_emformer_transducer_stateless2/export-for-ncnn.py \
    --exp-dir $dir/exp \
-    --bpe-model $dir/data/lang_bpe_500/bpe.model \
+    --tokens $dir/data/lang_bpe_500/tokens.txt \
    --epoch 30 \
    --avg 1 \
    --use-averaged-model 0 \
-    \
    --num-encoder-layers 12 \
    --chunk-length 32 \
    --cnn-module-kernel 31 \
--- a/docs/source/model-export/export-ncnn-lstm.rst
+++ b/docs/source/model-export/export-ncnn-lstm.rst
@ -73,7 +73,7 @@ Next, we use the following code to export our model:

  ./lstm_transducer_stateless2/export-for-ncnn.py \
    --exp-dir $dir/exp \
-    --bpe-model $dir/data/lang_bpe_500/bpe.model \
+    --tokens $dir/data/lang_bpe_500/tokens.txt \
    --epoch 99 \
    --avg 1 \
    --use-averaged-model 0 \
--- a/docs/source/model-export/export-ncnn-zipformer.rst
+++ b/docs/source/model-export/export-ncnn-zipformer.rst
@ -72,12 +72,11 @@ Next, we use the following code to export our model:
  dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29

  ./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
-    --bpe-model $dir/data/lang_bpe_500/bpe.model \
+    --tokens $dir/data/lang_bpe_500/tokens.txt \
    --exp-dir $dir/exp \
    --use-averaged-model 0 \
    --epoch 99 \
    --avg 1 \
-    \
    --decode-chunk-len 32 \
    --num-left-chunks 4 \
    --num-encoder-layers "2,4,3,2,4" \
--- a/docs/source/model-export/export-ncnn.rst
+++ b/docs/source/model-export/export-ncnn.rst
@ -1,3 +1,5 @@
+.. _icefall_export_to_ncnn:
+
 Export to ncnn
 ==============

--- a/docs/source/model-export/export-onnx.rst
+++ b/docs/source/model-export/export-onnx.rst
@ -71,7 +71,7 @@ Export the model to ONNX
 .. code-block:: bash

  ./pruned_transducer_stateless7_streaming/export-onnx.py \
-    --bpe-model $repo/data/lang_bpe_500/bpe.model \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
    --use-averaged-model 0 \
    --epoch 99 \
    --avg 1 \
--- a/docs/source/model-export/export-with-torch-jit-script.rst
+++ b/docs/source/model-export/export-with-torch-jit-script.rst
@ -32,7 +32,7 @@ as an example in the following.

    ./pruned_transducer_stateless3/export.py \
      --exp-dir ./pruned_transducer_stateless3/exp \
-      --bpe-model data/lang_bpe_500/bpe.model \
+      --tokens data/lang_bpe_500/tokens.txt \
      --epoch $epoch \
      --avg $avg \
      --jit 1
--- a/docs/source/model-export/export-with-torch-jit-trace.rst
+++ b/docs/source/model-export/export-with-torch-jit-trace.rst
@ -33,7 +33,7 @@ as an example in the following.

    ./lstm_transducer_stateless2/export.py \
      --exp-dir ./lstm_transducer_stateless2/exp \
-      --bpe-model data/lang_bpe_500/bpe.model \
+      --tokens data/lang_bpe_500/tokens.txt \
      --iter $iter \
      --avg  $avg \
      --jit-trace 1
--- a/docs/source/recipes/Non-streaming-ASR/aishell/conformer_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/conformer_ctc.rst
@ -67,7 +67,7 @@ To run stage 2 to stage 5, use:
 .. HINT::

  A 3-gram language model will be downloaded from huggingface; we assume you have
-  intalled and initialized ``git-lfs``. If not, you could install ``git-lfs`` by
+  installed and initialized ``git-lfs``. If not, you could install ``git-lfs`` by

  .. code-block:: bash

--- a/docs/source/recipes/Non-streaming-ASR/aishell/tdnn_lstm_ctc.rst
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/tdnn_lstm_ctc.rst
@ -67,7 +67,7 @@ To run stage 2 to stage 5, use:
 .. HINT::

  A 3-gram language model will be downloaded from huggingface; we assume you have
-  intalled and initialized ``git-lfs``. If not, you could install ``git-lfs`` by
+  installed and initialized ``git-lfs``. If not, you could install ``git-lfs`` by

  .. code-block:: bash

--- a/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
@ -1,7 +1,7 @@
 Distillation with HuBERT
 ========================

-This tutorial shows you how to perform knowledge distillation in `icefall`_
+This tutorial shows you how to perform knowledge distillation in `icefall <https://github.com/k2-fsa/icefall>`_
 with the `LibriSpeech`_ dataset. The distillation method
 used here is called "Multi Vector Quantization Knowledge Distillation" (MVQ-KD).
 Please have a look at our paper `Predicting Multi-Codebook Vector Quantization Indexes for Knowledge Distillation <https://arxiv.org/abs/2211.00508>`_
@ -13,7 +13,7 @@ for more details about MVQ-KD.
    `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_.
    Currently, we only implement MVQ-KD in this recipe. However, MVQ-KD is theoretically applicable to all recipes
    with only minor changes needed. Feel free to try out MVQ-KD in different recipes. If you
-    encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`_.
+    encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`__.

 .. note::

@ -47,7 +47,7 @@ The data preparation contains several stages, you can use the following two
 options:

  - ``--stage``
-  - ``--stop-stage``
+  - ``--stop_stage``

 to control which stage(s) should be run. By default, all stages are executed.

@ -56,8 +56,8 @@ For example,
 .. code-block:: bash

  $ cd egs/librispeech/ASR
-  $ ./prepare.sh --stage 0 --stop-stage 0 # run only stage 0
-  $ ./prepare.sh --stage 2 --stop-stage 5 # run from stage 2 to stage 5
+  $ ./prepare.sh --stage 0 --stop_stage 0 # run only stage 0
+  $ ./prepare.sh --stage 2 --stop_stage 5 # run from stage 2 to stage 5

 .. HINT::

@ -108,15 +108,15 @@ As usual, you can control the stages you want to run by specifying the following
 two options:

  - ``--stage``
-  - ``--stop-stage``
+  - ``--stop_stage``

 For example,

 .. code-block:: bash

  $ cd egs/librispeech/ASR
-  $ ./distillation_with_hubert.sh --stage 0 --stop-stage 0 # run only stage 0
-  $ ./distillation_with_hubert.sh --stage 2 --stop-stage 4 # run from stage 2 to stage 5
+  $ ./distillation_with_hubert.sh --stage 0 --stop_stage 0 # run only stage 0
+  $ ./distillation_with_hubert.sh --stage 2 --stop_stage 4 # run from stage 2 to stage 4

 Here are a few options in `./distillation_with_hubert.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/distillation_with_hubert.sh>`_
 you need to know before you proceed.
@ -134,7 +134,7 @@ and prepares MVQ-augmented training manifests.

 .. code-block:: bash

-  $ ./distillation_with_hubert.sh --stage 2 --stop-stage 2 # run only stage 2
+  $ ./distillation_with_hubert.sh --stage 2 --stop_stage 2 # run only stage 2

 Please see the
 following screenshot for the output of an example execution.
@ -172,7 +172,7 @@ To perform training, please run stage 3 by executing the following command.

 .. code-block:: bash

-  $ ./prepare.sh --stage 3 --stop-stage 3 # run MVQ training
+  $ ./distillation_with_hubert.sh --stage 3 --stop_stage 3 # run MVQ training

 Here is the code snippet for training:

@ -217,7 +217,7 @@ the following command.
    --exp-dir $exp_dir \
    --enable-distillation True

-You should get similar results as `here <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS-100hours.md#distillation-with-hubert>`_.
+You should get similar results as `here <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS-100hours.md#distillation-with-hubert>`__.

 That's all! Feel free to experiment with your own setups and report your results.
-If you encounter any problems during training, please open up an issue `here <https://github.com/k2-fsa/icefall/issues>`_.
+If you encounter any problems during training, please open up an issue `here <https://github.com/k2-fsa/icefall/issues>`__.
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
@ -8,10 +8,10 @@ with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.

 .. Note::

-   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
-   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
-   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
-   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_,
+   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`__,
+   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`__,
+   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`__,
+   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`__.
   We will take pruned_transducer_stateless4 as an example in this tutorial.

 .. HINT::
@ -237,7 +237,7 @@ them, please modify ``./pruned_transducer_stateless4/train.py`` directly.

 .. NOTE::

-  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`_ are a little different from
+  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`__ are a little different from
  other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from the command line, so that you can train models of different sizes with pruned_transducer_stateless5.


@ -418,7 +418,7 @@ The following shows two examples (for two types of checkpoints):

    - ``beam_search`` :  It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
      `espnet/nets/beam_search_transducer.py <https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py#L247>`_
-      is used as a reference. Basicly, it keeps topk states for each frame, and expands the kept states with their own contexts to
+      is used as a reference. Basically, it keeps topk states for each frame, and expands the kept states with their own contexts to
      next frame.

    - ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
@ -529,13 +529,13 @@ Download pretrained models
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:

-  - `pruned_transducer_stateless <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12>`_
+  - `pruned_transducer_stateless <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12>`__

-  - `pruned_transducer_stateless2 <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29>`_
+  - `pruned_transducer_stateless2 <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29>`__

-  - `pruned_transducer_stateless4 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless4-2022-06-03>`_
+  - `pruned_transducer_stateless4 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless4-2022-06-03>`__

-  - `pruned_transducer_stateless5 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless5-2022-07-07>`_
+  - `pruned_transducer_stateless5 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless5-2022-07-07>`__

  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models
--- a/docs/source/recipes/RNN-LM/index.rst
+++ b/docs/source/recipes/RNN-LM/index.rst
@ -0,0 +1,7 @@
+RNN-LM
+======
+
+.. toctree::
+   :maxdepth: 2
+
+   librispeech/lm-training
--- a/docs/source/recipes/RNN-LM/librispeech/lm-training.rst
+++ b/docs/source/recipes/RNN-LM/librispeech/lm-training.rst
@ -0,0 +1,104 @@
+.. _train_nnlm:
+
+Train an RNN language model
+======================================
+
+If you have enough text data, you can train a neural network language model (NNLM) to improve
+the WER of your E2E ASR system. This tutorial shows you how to train an RNNLM from 
+scratch.
+
+.. HINT::
+
+    For how to use an NNLM during decoding, please refer to the following tutorials:
+    :ref:`shallow_fusion`, :ref:`LODR`, :ref:`rescoring`
+
+.. note::
+
+    This tutorial is based on the LibriSpeech recipe. Please check it out for the necessary
+    python scripts for this tutorial. We use the LibriSpeech LM-corpus as the LM training set 
+    for illustration purposes. You can also collect your own data. The data format is quite simple:
+    each line should contain a complete sentence, and words should be separated by space.
+
+First, let's download the training data for the RNNLM. This can be done via the 
+following command:
+
+.. code-block:: bash
+
+    $ wget https://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz 
+    $ gzip -d librispeech-lm-norm.txt.gz
+
+As we are training a BPE-level RNNLM, we need to tokenize the training text, which requires a
+BPE tokenizer. This can be achieved by executing the following command:
+
+.. code-block:: bash
+    
+    $ # if you don't have the BPE
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+    $ cd icefall-asr-librispeech-zipformer-2023-05-15/data/lang_bpe_500
+    $ git lfs pull --include bpe.model
+    $ cd ../../..
+
+    $ ./local/prepare_lm_training_data.py \
+        --bpe-model icefall-asr-librispeech-zipformer-2023-05-15/data/lang_bpe_500/bpe.model \
+        --lm-data librispeech-lm-norm.txt \
+        --lm-archive data/lang_bpe_500/lm_data.pt
+
+Now, you should have a file named ``lm_data.pt`` stored under the directory ``data/lang_bpe_500``.
+This is the packed training data for the RNNLM. We then sort the training data according to its
+sentence length.
+
+.. code-block:: bash
+
+    $ # This could take a while (~ 20 minutes), feel free to grab a cup of coffee :)
+    $ ./local/sort_lm_training_data.py \
+        --in-lm-data data/lang_bpe_500/lm_data.pt \
+        --out-lm-data data/lang_bpe_500/sorted_lm_data.pt \
+        --out-statistics data/lang_bpe_500/lm_data_stats.txt
+
+
+The aforementioned steps can be repeated to create a validation set for your RNNLM. Let's say
+you have a validation set in ``valid.txt``, you can just set ``--lm-data valid.txt`` 
+and ``--lm-archive data/lang_bpe_500/lm-data-valid.pt`` when calling ``./local/prepare_lm_training_data.py``.
+
+After completing the previous steps, the training and validation sets for the RNNLM are ready.
+The next step is to train the RNNLM model. The training command is as follows:
+
+.. code-block:: bash
+
+    $ # assume you are in the icefall root directory
+    $ cd rnn_lm
+    $ ln -s ../../egs/librispeech/ASR/data .
+    $ cd ..
+    $ ./rnn_lm/train.py \
+        --world-size 4 \
+        --exp-dir ./rnn_lm/exp \
+        --start-epoch 0 \
+        --num-epochs 10 \
+        --use-fp16 0 \
+        --tie-weights 1 \
+        --embedding-dim 2048 \
+        --hidden-dim 2048 \
+        --num-layers 3 \
+        --batch-size 300 \
+        --lm-data rnn_lm/data/lang_bpe_500/sorted_lm_data.pt \
+        --lm-data-valid rnn_lm/data/lang_bpe_500/sorted_lm_data.pt
+
+
+.. note::
+
+    You can adjust the RNNLM hyper parameters to control the size of the RNNLM,
+    such as embedding dimension and hidden state dimension. For more details, please
+    run ``./rnn_lm/train.py --help``.
+
+.. note::
+
+    The training of RNNLM can take a long time (usually a couple of days).
+
+
+
+
+
+
+
+
+
--- a/docs/source/recipes/Streaming-ASR/introduction.rst
+++ b/docs/source/recipes/Streaming-ASR/introduction.rst
@ -32,7 +32,7 @@ In icefall, we implement the streaming conformer the way just like what `WeNet <
 .. HINT::
   If you want to modify a non-streaming conformer recipe to support both streaming and non-streaming, please refer
   to `this pull request <https://github.com/k2-fsa/icefall/pull/454>`_.  After adding the code needed by streaming training,
-   you have to re-train it with the extra arguments metioned in the docs above to get a streaming model.
+   you have to re-train it with the extra arguments mentioned in the docs above to get a streaming model.


 Streaming Emformer
@ -45,9 +45,9 @@ the input features.

 We have three variants of Emformer models in ``icefall``.

- - ``pruned_stateless_emformer_rnnt2`` using Emformer from torchaudio, see `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2>`_.
+ - ``pruned_stateless_emformer_rnnt2`` using Emformer from torchaudio, see `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2>`__.
 - ``conv_emformer_transducer_stateless`` using ConvEmformer implemented by ourself. Different from the Emformer in torchaudio,
   ConvEmformer has a convolution in each layer and uses the mechanisms in our reworked conformer model.
-   See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless>`_.
+   See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless>`__.
 - ``conv_emformer_transducer_stateless2`` using ConvEmformer implemented by ourself. The only difference from the above one is that
   it uses a simplified memory bank. See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_.
--- a/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst
@ -6,10 +6,10 @@ with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.

 .. Note::

-   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
-   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
-   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
-   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_,
+   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`__,
+   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`__,
+   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`__,
+   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`__.
   We will take pruned_transducer_stateless4 as an example in this tutorial.

 .. HINT::
@ -264,7 +264,7 @@ them, please modify ``./pruned_transducer_stateless4/train.py`` directly.

 .. NOTE::

-  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`_ are a little different from
+  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`__ are a little different from
  other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from commandline, so that you can train models with different size with pruned_transducer_stateless5.


@ -584,7 +584,7 @@ The following shows two examples (for the two types of checkpoints):

    - ``beam_search`` :  It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
      `espnet/nets/beam_search_transducer.py <https://github.com/espnet/espnet/blob/master/espnet/nets/beam_search_transducer.py#L247>`_
-      is used as a reference. Basicly, it keeps topk states for each frame, and expands the kept states with their own contexts to
+      is used as a reference. Basically, it keeps topk states for each frame, and expands the kept states with their own contexts to
      next frame.

    - ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
@ -648,7 +648,7 @@ command to extract ``model.state_dict()``.
 .. caution::

   ``--streaming-model`` and ``--causal-convolution`` require to be True to export
-   a streaming mdoel.
+   a streaming model.

 It will generate a file ``./pruned_transducer_stateless4/exp/pretrained.pt``.

@ -697,7 +697,7 @@ Export model using ``torch.jit.script()``
 .. caution::

   ``--streaming-model`` and ``--causal-convolution`` require to be True to export
-   a streaming mdoel.
+   a streaming model.

 It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
 load it by ``torch.jit.load("cpu_jit.pt")``.
--- a/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst
@ -6,7 +6,7 @@ with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.

 .. Note::

-   The tutorial is suitable for `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
+   The tutorial is suitable for `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`__,

 .. HINT::

@ -642,7 +642,7 @@ Download pretrained models
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:

-  - `pruned_transducer_stateless7_streaming <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`_
+  - `pruned_transducer_stateless7_streaming <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`__

  See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  for the details of the above pretrained models
--- a/Show More
+++ b/Show More