Merge branch 'k2-fsa:master' into master

This commit is contained in:
Zengwei Yao 2022-10-18 19:58:43 +08:00 committed by GitHub
commit 6b7e467e01
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
176 changed files with 13443 additions and 640 deletions

View File

@ -9,7 +9,7 @@ per-file-ignores =
egs/*/ASR/pruned_transducer_stateless*/*.py: E501,
egs/*/ASR/*/optim.py: E501,
egs/*/ASR/*/scaling.py: E501,
egs/librispeech/ASR/lstm_transducer_stateless/*.py: E501, E203
egs/librispeech/ASR/lstm_transducer_stateless*/*.py: E501, E203
egs/librispeech/ASR/conv_emformer_transducer_stateless*/*.py: E501, E203
egs/librispeech/ASR/conformer_ctc2/*py: E501,
egs/librispeech/ASR/RESULTS.md: E999,

View File

@ -4,6 +4,8 @@
# The computed features are saved to ~/tmp/fbank-libri and are
# cached for later runs
set -e
export PYTHONPATH=$PWD:$PYTHONPATH
echo $PYTHONPATH

View File

@ -6,6 +6,8 @@
# You will find directories `~/tmp/giga-dev-dataset-fbank` after running
# this script.
set -e
mkdir -p ~/tmp
cd ~/tmp

View File

@ -7,6 +7,8 @@
# You will find directories ~/tmp/download/LibriSpeech after running
# this script.
set -e
mkdir ~/tmp/download
cd egs/librispeech/ASR
ln -s ~/tmp/download .

View File

@ -3,6 +3,8 @@
# This script installs kaldifeat into the directory ~/tmp/kaldifeat
# which is cached by GitHub actions for later runs.
set -e
mkdir -p ~/tmp
cd ~/tmp
git clone https://github.com/csukuangfj/kaldifeat

View File

@ -4,6 +4,8 @@
# to egs/librispeech/ASR/download/LibriSpeech and generates manifest
# files in egs/librispeech/ASR/data/manifests
set -e
cd egs/librispeech/ASR
[ ! -e download ] && ln -s ~/tmp/download .
mkdir -p data/manifests

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
@ -40,7 +42,7 @@ for sym in 1 2 3; do
--lang-dir $repo/data/lang_char \
$repo/test_wavs/BAC009S0764W0121.wav \
$repo/test_wavs/BAC009S0764W0122.wav \
$rep/test_wavs/BAC009S0764W0123.wav
$repo/test_wavs/BAC009S0764W0123.wav
done
for method in modified_beam_search beam_search fast_beam_search; do
@ -53,7 +55,7 @@ for method in modified_beam_search beam_search fast_beam_search; do
--lang-dir $repo/data/lang_char \
$repo/test_wavs/BAC009S0764W0121.wav \
$repo/test_wavs/BAC009S0764W0122.wav \
$rep/test_wavs/BAC009S0764W0123.wav
$repo/test_wavs/BAC009S0764W0123.wav
done
echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -0,0 +1,203 @@
#!/usr/bin/env bash
#
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/librispeech/ASR
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
log "Downloading pre-trained model from $repo_url"
git lfs install
git clone $repo_url
repo=$(basename $repo_url)
log "Display test files"
tree $repo/
soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
ln -s pretrained-iter-468000-avg-16.pt pretrained.pt
ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
popd
log "Install ncnn and pnnx"
# We are using a modified ncnn here. Will try to merge it to the official repo
# of ncnn
git clone https://github.com/csukuangfj/ncnn
pushd ncnn
git submodule init
git submodule update python/pybind11
python3 setup.py bdist_wheel
ls -lh dist/
pip install dist/*.whl
cd tools/pnnx
mkdir build
cd build
cmake ..
make -j4 pnnx
./src/pnnx || echo "pass"
popd
log "Test exporting to pnnx format"
./lstm_transducer_stateless2/export.py \
--exp-dir $repo/exp \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--use-averaged-model 0 \
--pnnx 1
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/encoder_jit_trace-pnnx.pt
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/decoder_jit_trace-pnnx.pt
./ncnn/tools/pnnx/build/src/pnnx $repo/exp/joiner_jit_trace-pnnx.pt
./lstm_transducer_stateless2/ncnn-decode.py \
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
$repo/test_wavs/1089-134686-0001.wav
./lstm_transducer_stateless2/streaming-ncnn-decode.py \
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
--encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
$repo/test_wavs/1089-134686-0001.wav
log "Test exporting with torch.jit.trace()"
./lstm_transducer_stateless2/export.py \
--exp-dir $repo/exp \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--use-averaged-model 0 \
--jit-trace 1
log "Decode with models exported by torch.jit.trace()"
./lstm_transducer_stateless2/jit_pretrained.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder_jit_trace.pt \
--decoder-model-filename $repo/exp/decoder_jit_trace.pt \
--joiner-model-filename $repo/exp/joiner_jit_trace.pt \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
log "Test exporting to ONNX"
./lstm_transducer_stateless2/export.py \
--exp-dir $repo/exp \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--epoch 99 \
--avg 1 \
--use-averaged-model 0 \
--onnx 1
log "Decode with ONNX models "
./lstm_transducer_stateless2/streaming-onnx-decode.py \
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder.onnx \
--decoder-model-filename $repo/exp/decoder.onnx \
--joiner-model-filename $repo/exp/joiner.onnx \
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
$repo/test_wavs/1089-134686-0001.wav
./lstm_transducer_stateless2/streaming-onnx-decode.py \
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder.onnx \
--decoder-model-filename $repo/exp/decoder.onnx \
--joiner-model-filename $repo/exp/joiner.onnx \
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
$repo/test_wavs/1221-135766-0001.wav
./lstm_transducer_stateless2/streaming-onnx-decode.py \
--bpe-model-filename $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder.onnx \
--decoder-model-filename $repo/exp/decoder.onnx \
--joiner-model-filename $repo/exp/joiner.onnx \
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
$repo/test_wavs/1221-135766-0002.wav
for sym in 1 2 3; do
log "Greedy search with --max-sym-per-frame $sym"
./lstm_transducer_stateless2/pretrained.py \
--method greedy_search \
--max-sym-per-frame $sym \
--checkpoint $repo/exp/pretrained.pt \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
for method in modified_beam_search beam_search fast_beam_search; do
log "$method"
./lstm_transducer_stateless2/pretrained.py \
--method $method \
--beam-size 4 \
--checkpoint $repo/exp/pretrained.pt \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
done
echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then
mkdir -p lstm_transducer_stateless2/exp
ln -s $PWD/$repo/exp/pretrained.pt lstm_transducer_stateless2/exp/epoch-999.pt
ln -s $PWD/$repo/data/lang_bpe_500 data/
ls -lh data
ls -lh lstm_transducer_stateless2/exp
log "Decoding test-clean and test-other"
# use a small value for decoding with CPU
max_duration=100
for method in greedy_search fast_beam_search modified_beam_search; do
log "Decoding with $method"
./lstm_transducer_stateless2/decode.py \
--decoding-method $method \
--epoch 999 \
--avg 1 \
--use-averaged-model 0 \
--max-duration $max_duration \
--exp-dir lstm_transducer_stateless2/exp
done
rm lstm_transducer_stateless2/exp/*.pt
fi

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
@ -11,10 +13,14 @@ cd egs/librispeech/ASR
repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29
log "Downloading pre-trained model from $repo_url"
git lfs install
git clone $repo_url
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
git lfs pull --include "data/lang_bpe_500/bpe.model"
git lfs pull --include "exp/pretrained-epoch-38-avg-10.pt"
popd
log "Display test files"
tree $repo/
soxi $repo/test_wavs/*.wav

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
@ -58,17 +60,17 @@ log "Decode with ONNX models"
--jit-filename $repo/exp/cpu_jit.pt \
--onnx-encoder-filename $repo/exp/encoder.onnx \
--onnx-decoder-filename $repo/exp/decoder.onnx \
--onnx-joiner-filename $repo/exp/joiner.onnx
./pruned_transducer_stateless3/onnx_check_all_in_one.py \
--jit-filename $repo/exp/cpu_jit.pt \
--onnx-all-in-one-filename $repo/exp/all_in_one.onnx
--onnx-joiner-filename $repo/exp/joiner.onnx \
--onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
--onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
./pruned_transducer_stateless3/onnx_pretrained.py \
--bpe-model $repo/data/lang_bpe_500/bpe.model \
--encoder-model-filename $repo/exp/encoder.onnx \
--decoder-model-filename $repo/exp/decoder.onnx \
--joiner-model-filename $repo/exp/joiner.onnx \
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
@ -10,7 +12,6 @@ cd egs/librispeech/ASR
repo_url=https://github.com/csukuangfj/icefall-asr-conformer-ctc-bpe-500
git lfs install
git clone $repo
log "Downloading pre-trained model from $repo_url"
git clone $repo_url

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -1,5 +1,7 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}

View File

@ -0,0 +1,124 @@
#!/usr/bin/env bash
set -e
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/wenetspeech/ASR
repo_url=https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2
log "Downloading pre-trained model from $repo_url"
git lfs install
git clone $repo_url
repo=$(basename $repo_url)
log "Display test files"
tree $repo/
soxi $repo/test_wavs/*.wav
ls -lh $repo/test_wavs/*.wav
pushd $repo/exp
ln -s pretrained_epoch_10_avg_2.pt pretrained.pt
ln -s pretrained_epoch_10_avg_2.pt epoch-99.pt
popd
log "Test exporting to ONNX format"
./pruned_transducer_stateless2/export.py \
--exp-dir $repo/exp \
--lang-dir $repo/data/lang_char \
--epoch 99 \
--avg 1 \
--onnx 1
log "Export to torchscript model"
./pruned_transducer_stateless2/export.py \
--exp-dir $repo/exp \
--lang-dir $repo/data/lang_char \
--epoch 99 \
--avg 1 \
--jit 1
./pruned_transducer_stateless2/export.py \
--exp-dir $repo/exp \
--lang-dir $repo/data/lang_char \
--epoch 99 \
--avg 1 \
--jit-trace 1
ls -lh $repo/exp/*.onnx
ls -lh $repo/exp/*.pt
log "Decode with ONNX models"
./pruned_transducer_stateless2/onnx_check.py \
--jit-filename $repo/exp/cpu_jit.pt \
--onnx-encoder-filename $repo/exp/encoder.onnx \
--onnx-decoder-filename $repo/exp/decoder.onnx \
--onnx-joiner-filename $repo/exp/joiner.onnx \
--onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj.onnx \
--onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj.onnx
./pruned_transducer_stateless2/onnx_pretrained.py \
--tokens $repo/data/lang_char/tokens.txt \
--encoder-model-filename $repo/exp/encoder.onnx \
--decoder-model-filename $repo/exp/decoder.onnx \
--joiner-model-filename $repo/exp/joiner.onnx \
--joiner-encoder-proj-model-filename $repo/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename $repo/exp/joiner_decoder_proj.onnx \
$repo/test_wavs/DEV_T0000000000.wav \
$repo/test_wavs/DEV_T0000000001.wav \
$repo/test_wavs/DEV_T0000000002.wav
log "Decode with models exported by torch.jit.trace()"
./pruned_transducer_stateless2/jit_pretrained.py \
--tokens $repo/data/lang_char/tokens.txt \
--encoder-model-filename $repo/exp/encoder_jit_trace.pt \
--decoder-model-filename $repo/exp/decoder_jit_trace.pt \
--joiner-model-filename $repo/exp/joiner_jit_trace.pt \
$repo/test_wavs/DEV_T0000000000.wav \
$repo/test_wavs/DEV_T0000000001.wav \
$repo/test_wavs/DEV_T0000000002.wav
./pruned_transducer_stateless2/jit_pretrained.py \
--tokens $repo/data/lang_char/tokens.txt \
--encoder-model-filename $repo/exp/encoder_jit_script.pt \
--decoder-model-filename $repo/exp/decoder_jit_script.pt \
--joiner-model-filename $repo/exp/joiner_jit_script.pt \
$repo/test_wavs/DEV_T0000000000.wav \
$repo/test_wavs/DEV_T0000000001.wav \
$repo/test_wavs/DEV_T0000000002.wav
for sym in 1 2 3; do
log "Greedy search with --max-sym-per-frame $sym"
./pruned_transducer_stateless2/pretrained.py \
--checkpoint $repo/exp/epoch-99.pt \
--lang-dir $repo/data/lang_char \
--decoding-method greedy_search \
--max-sym-per-frame $sym \
$repo/test_wavs/DEV_T0000000000.wav \
$repo/test_wavs/DEV_T0000000001.wav \
$repo/test_wavs/DEV_T0000000002.wav
done
for method in modified_beam_search beam_search fast_beam_search; do
log "$method"
./pruned_transducer_stateless2/pretrained.py \
--decoding-method $method \
--beam-size 4 \
--checkpoint $repo/exp/epoch-99.pt \
--lang-dir $repo/data/lang_char \
$repo/test_wavs/DEV_T0000000000.wav \
$repo/test_wavs/DEV_T0000000001.wav \
$repo/test_wavs/DEV_T0000000002.wav
done

View File

@ -69,7 +69,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -68,7 +68,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -68,7 +68,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -68,7 +68,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -68,7 +68,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -0,0 +1,136 @@
name: run-librispeech-lstm-transducer2-2022-09-03
on:
push:
branches:
- master
pull_request:
types: [labeled]
schedule:
# minute (0-59)
# hour (0-23)
# day of the month (1-31)
# month (1-12)
# day of the week (0-6)
# nightly build at 15:50 UTC time every day
- cron: "50 15 * * *"
jobs:
run_librispeech_lstm_transducer_stateless2_2022_09_03:
if: github.event.label.name == 'ready' || github.event.label.name == 'ncnn' || github.event.label.name == 'onnx' || github.event_name == 'push' || github.event_name == 'schedule'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-18.04]
python-version: [3.8]
fail-fast: false
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: '**/requirements-ci.txt'
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
uses: actions/cache@v2
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/install-kaldifeat.sh
- name: Cache LibriSpeech test-clean and test-other datasets
id: libri-test-clean-and-test-other-data
uses: actions/cache@v2
with:
path: |
~/tmp/download
key: cache-libri-test-clean-and-test-other
- name: Download LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
- name: Prepare manifests for LibriSpeech test-clean and test-other
shell: bash
run: |
.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
- name: Cache LibriSpeech test-clean and test-other fbank features
id: libri-test-clean-and-test-other-fbank
uses: actions/cache@v2
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
- name: Inference with pre-trained model
shell: bash
env:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
mkdir -p egs/librispeech/ASR/data
ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
ls -lh egs/librispeech/ASR/data/*
sudo apt-get -qq install git-lfs tree sox
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.sh
- name: Display decoding results for lstm_transducer_stateless2
if: github.event_name == 'schedule'
shell: bash
run: |
cd egs/librispeech/ASR
tree lstm_transducer_stateless2/exp
cd lstm_transducer_stateless2/exp
echo "===greedy search==="
find greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===fast_beam_search==="
find fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
echo "===modified beam search==="
find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
- name: Upload decoding results for lstm_transducer_stateless2
uses: actions/upload-artifact@v2
if: github.event_name == 'schedule'
with:
name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-lstm_transducer_stateless2-2022-09-03
path: egs/librispeech/ASR/lstm_transducer_stateless2/exp/

View File

@ -68,7 +68,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -68,7 +68,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -68,7 +68,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -58,7 +58,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -67,7 +67,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -67,7 +67,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -58,7 +58,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -58,7 +58,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -67,7 +67,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -58,7 +58,7 @@ jobs:
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'

View File

@ -0,0 +1,80 @@
# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
# See ../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: run-wenetspeech-pruned-transducer-stateless2
on:
push:
branches:
- master
pull_request:
types: [labeled]
jobs:
run_wenetspeech_pruned_transducer_stateless2:
if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'wenetspeech'
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-18.04]
python-version: [3.8]
fail-fast: false
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
cache-dependency-path: '**/requirements-ci.txt'
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
uses: actions/cache@v2
with:
path: |
~/tmp/kaldifeat
key: cache-tmp-${{ matrix.python-version }}-2022-09-25
- name: Install kaldifeat
if: steps.my-cache.outputs.cache-hit != 'true'
shell: bash
run: |
.github/scripts/install-kaldifeat.sh
- name: Inference with pre-trained model
shell: bash
env:
GITHUB_EVENT_NAME: ${{ github.event_name }}
GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
run: |
sudo apt-get -qq install git-lfs tree sox
export PYTHONPATH=$PWD:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
.github/scripts/run-wenetspeech-pruned-transducer-stateless2.sh

View File

@ -29,8 +29,8 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-18.04, macos-latest]
python-version: [3.7, 3.9]
os: [ubuntu-latest]
python-version: [3.8]
fail-fast: false
steps:

.gitignore vendored
View File

@ -11,3 +11,5 @@ log
*.bak
*-bak
*bak.py
*.param
*.bin

View File

@ -1,24 +1,114 @@
# icefall dockerfile
We provide a dockerfile for some users, the configuration of dockerfile is : Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8-python3.8. You can use the dockerfile by following the steps:
Two sets of configurations are provided: (a) Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8, and (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8.
## Building images locally
If your NVIDIA driver supports CUDA 11.3, please go for case (a) Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8.
Otherwise, since the older PyTorch images have not been updated with the [apt-key rotation by NVIDIA](https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key), you have to go for case (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8. Ensure that your NVIDIA driver supports at least CUDA 11.0.
You can check the highest CUDA version within your NVIDIA driver's support with the `nvidia-smi` command below. In this example, the highest CUDA version is 11.0, i.e. case (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8.
```bash
cd docker/Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8
docker build -t icefall/pytorch1.7.1:latest -f ./Dockerfile ./
$ nvidia-smi
Tue Sep 20 00:26:13 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.119.03 Driver Version: 450.119.03 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 TITAN RTX On | 00000000:03:00.0 Off | N/A |
| 41% 31C P8 4W / 280W | 16MiB / 24219MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 TITAN RTX On | 00000000:04:00.0 Off | N/A |
| 41% 30C P8 11W / 280W | 6MiB / 24220MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 2085 G /usr/lib/xorg/Xorg 9MiB |
| 0 N/A N/A 2240 G /usr/bin/gnome-shell 4MiB |
| 1 N/A N/A 2085 G /usr/lib/xorg/Xorg 4MiB |
+-----------------------------------------------------------------------------+
```
## Using built images
Sample usage of the GPU based images:
## Building images locally
If your environment requires a proxy to access the Internet, remember to add those information into the Dockerfile directly.
For most cases, you can uncomment these lines in the Dockerfile and add in your proxy details.
```dockerfile
ENV http_proxy=http://aaa.bb.cc.net:8080 \
https_proxy=http://aaa.bb.cc.net:8080
```
Then, proceed with these commands.
### If you are case (a), i.e. your NVIDIA driver supports CUDA version >= 11.3:
```bash
cd docker/Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8
docker build -t icefall/pytorch1.12.1 .
```
### If you are case (b), i.e. your NVIDIA driver can only support CUDA versions 11.0 <= x < 11.3:
```bash
cd docker/Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8
docker build -t icefall/pytorch1.7.1 .
```
## Running your built local image
Sample usage of the GPU based images. These commands are written with case (a) in mind, so please make the necessary changes to your image name if you are case (b).
Note: use [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) to run the GPU images.
```bash
docker run -it --runtime=nvidia --name=icefall_username --gpus all icefall/pytorch1.7.1:latest
docker run -it --runtime=nvidia --shm-size=2gb --name=icefall --gpus all icefall/pytorch1.12.1
```
Sample usage of the CPU based images:
### Tips:
1. Since your data and models most probably won't be inside the docker image, you must use the -v flag to access the host machine. Do this by specifying `-v {/path/in/host/machine}:{/path/in/docker}` (Docker expects the host path first, followed by the path inside the container).
2. Also, if your environment requires a proxy, this would be a good time to add it in too: `-e http_proxy=http://aaa.bb.cc.net:8080 -e https_proxy=http://aaa.bb.cc.net:8080`.
Overall, your docker run command should look like this.
```bash
docker run -it icefall/pytorch1.7.1:latest /bin/bash
```
docker run -it --runtime=nvidia --shm-size=2gb --name=icefall --gpus all -v {/path/in/host/machine}:{/path/in/docker} -e http_proxy=http://aaa.bb.cc.net:8080 -e https_proxy=http://aaa.bb.cc.net:8080 icefall/pytorch1.12.1
```
You can explore more docker run options [here](https://docs.docker.com/engine/reference/commandline/run/) to suit your environment.
### Linking to icefall in your host machine
If you already have icefall downloaded onto your host machine, you can use that repository instead so that changes in your code are visible inside and outside of the container.
Note: Remember to set the -v flag above during the first run of the container, as that is the only way for your container to access your host machine.
Warning: Check that the icefall in your host machine is visible from within your container before proceeding to the commands below.
Use these commands once you are inside the container.
```bash
rm -r /workspace/icefall
ln -s {/path/in/docker/to/icefall} /workspace/icefall
```
## Starting another session in the same running container.
```bash
docker exec -it icefall /bin/bash
```
## Restarting a killed container that has been run before.
```bash
docker start -ai icefall
```
## Sample usage of the CPU based images:
```bash
docker run -it icefall /bin/bash
```

View File

@ -0,0 +1,72 @@
FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel
# ENV http_proxy=http://aaa.bbb.cc.net:8080 \
# https_proxy=http://aaa.bbb.cc.net:8080
# install normal source
RUN apt-get update && \
apt-get install -y --no-install-recommends \
g++ \
make \
automake \
autoconf \
bzip2 \
unzip \
wget \
sox \
libtool \
git \
subversion \
zlib1g-dev \
gfortran \
ca-certificates \
patch \
ffmpeg \
valgrind \
libssl-dev \
vim \
curl
# cmake
RUN wget -P /opt https://cmake.org/files/v3.18/cmake-3.18.0.tar.gz && \
cd /opt && \
tar -zxvf cmake-3.18.0.tar.gz && \
cd cmake-3.18.0 && \
./bootstrap && \
make && \
make install && \
rm -rf cmake-3.18.0.tar.gz && \
find /opt/cmake-3.18.0 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
cd -
# flac
RUN wget -P /opt https://downloads.xiph.org/releases/flac/flac-1.3.2.tar.xz && \
cd /opt && \
xz -d flac-1.3.2.tar.xz && \
tar -xvf flac-1.3.2.tar && \
cd flac-1.3.2 && \
./configure && \
make && make install && \
rm -rf flac-1.3.2.tar && \
find /opt/flac-1.3.2 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
cd -
RUN pip install kaldiio graphviz && \
conda install -y -c pytorch torchaudio
#install k2 from source
RUN git clone https://github.com/k2-fsa/k2.git /opt/k2 && \
cd /opt/k2 && \
python3 setup.py install && \
cd -
# install lhotse
RUN pip install git+https://github.com/lhotse-speech/lhotse
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
cd /workspace/icefall && \
pip install -r requirements.txt
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
WORKDIR /workspace/icefall

View File

@ -1,7 +1,13 @@
FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-devel
# install normal source
# ENV http_proxy=http://aaa.bbb.cc.net:8080 \
# https_proxy=http://aaa.bbb.cc.net:8080
RUN rm /etc/apt/sources.list.d/cuda.list && \
rm /etc/apt/sources.list.d/nvidia-ml.list && \
apt-key del 7fa2af80
# install normal source
RUN apt-get update && \
apt-get install -y --no-install-recommends \
g++ \
@ -21,20 +27,25 @@ RUN apt-get update && \
patch \
ffmpeg \
valgrind \
libssl-dev \
vim && \
rm -rf /var/lib/apt/lists/*
libssl-dev \
vim \
curl
RUN mv /opt/conda/lib/libcufft.so.10 /opt/libcufft.so.10.bak && \
# Add new keys and reupdate
RUN curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub | apt-key add - && \
curl -fsSL https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
rm -rf /var/lib/apt/lists/* && \
mv /opt/conda/lib/libcufft.so.10 /opt/libcufft.so.10.bak && \
mv /opt/conda/lib/libcurand.so.10 /opt/libcurand.so.10.bak && \
mv /opt/conda/lib/libcublas.so.11 /opt/libcublas.so.11.bak && \
mv /opt/conda/lib/libnvrtc.so.11.0 /opt/libnvrtc.so.11.1.bak && \
mv /opt/conda/lib/libnvToolsExt.so.1 /opt/libnvToolsExt.so.1.bak && \
mv /opt/conda/lib/libcudart.so.11.0 /opt/libcudart.so.11.0.bak
# mv /opt/conda/lib/libnvToolsExt.so.1 /opt/libnvToolsExt.so.1.bak && \
mv /opt/conda/lib/libcudart.so.11.0 /opt/libcudart.so.11.0.bak && \
apt-get update && apt-get -y upgrade
# cmake
RUN wget -P /opt https://cmake.org/files/v3.18/cmake-3.18.0.tar.gz && \
cd /opt && \
tar -zxvf cmake-3.18.0.tar.gz && \
@ -45,11 +56,7 @@ RUN wget -P /opt https://cmake.org/files/v3.18/cmake-3.18.0.tar.gz && \
rm -rf cmake-3.18.0.tar.gz && \
find /opt/cmake-3.18.0 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
cd -
#kaldiio
RUN pip install kaldiio
# flac
RUN wget -P /opt https://downloads.xiph.org/releases/flac/flac-1.3.2.tar.xz && \
cd /opt && \
@ -62,15 +69,8 @@ RUN wget -P /opt https://downloads.xiph.org/releases/flac/flac-1.3.2.tar.xz &&
find /opt/flac-1.3.2 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
cd -
# graphviz
RUN pip install graphviz
# kaldifeat
RUN git clone https://github.com/csukuangfj/kaldifeat.git /opt/kaldifeat && \
cd /opt/kaldifeat && \
python setup.py install && \
cd -
RUN pip install kaldiio graphviz && \
conda install -y -c pytorch torchaudio=0.7.1
#install k2 from source
RUN git clone https://github.com/k2-fsa/k2.git /opt/k2 && \
@ -79,14 +79,13 @@ RUN git clone https://github.com/k2-fsa/k2.git /opt/k2 && \
cd -
# install lhotse
RUN pip install torchaudio==0.7.2
RUN pip install git+https://github.com/lhotse-speech/lhotse
#RUN pip install lhotse
RUN pip install git+https://github.com/lhotse-speech/lhotse
# install icefall
RUN git clone https://github.com/k2-fsa/icefall && \
cd icefall && \
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
cd /workspace/icefall && \
pip install -r requirements.txt
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
WORKDIR /workspace/icefall

View File

@ -74,7 +74,7 @@ html_context = {
"github_user": "k2-fsa",
"github_repo": "icefall",
"github_version": "master",
"conf_py_path": "/icefall/docs/source/",
"conf_py_path": "/docs/source/",
}
todo_include_todos = True

View File

@ -21,6 +21,7 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
:caption: Contents:
installation/index
model-export/index
recipes/index
contributing/index
huggingface/index

View File

@ -0,0 +1,21 @@
2022-10-13 19:09:02,233 INFO [pretrained.py:265] {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'encoder_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'decoder_dim': 512, 'joiner_dim': 512, 'model_warm_step': 3000, 'env_info': {'k2-version': '1.21', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4810e00d8738f1a21278b0156a42ff396a2d40ac', 'k2-git-date': 'Fri Oct 7 19:35:03 2022', 'lhotse-version': '1.3.0.dev+missing.version.file', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': False, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'onnx-doc-1013', 'icefall-git-sha1': 'c39cba5-dirty', 'icefall-git-date': 'Thu Oct 13 15:17:20 2022', 'icefall-path': '/k2-dev/fangjun/open-source/icefall-master', 'k2-path': '/k2-dev/fangjun/open-source/k2-master/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-jsonl/lhotse/__init__.py', 'hostname': 'de-74279-k2-test-4-0324160024-65bfd8b584-jjlbn', 'IP address': '10.177.74.203'}, 'checkpoint': './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/pretrained-iter-1224000-avg-14.pt', 'bpe_model': './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model', 'method': 'greedy_search', 'sound_files': ['./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav'], 'sample_rate': 16000, 'beam_size': 4, 'beam': 4, 'max_contexts': 4, 'max_states': 8, 'context_size': 2, 'max_sym_per_frame': 1, 'simulate_streaming': False, 'decode_chunk_size': 16, 'left_context': 64, 'dynamic_chunk_training': False, 'causal_convolution': False, 'short_chunk_size': 25, 'num_left_chunks': 4, 'blank_id': 0, 'unk_id': 2, 'vocab_size': 500}
2022-10-13 19:09:02,233 INFO [pretrained.py:271] device: cpu
2022-10-13 19:09:02,233 INFO [pretrained.py:273] Creating model
2022-10-13 19:09:02,612 INFO [train.py:458] Disable giga
2022-10-13 19:09:02,623 INFO [pretrained.py:277] Number of model parameters: 78648040
2022-10-13 19:09:02,951 INFO [pretrained.py:285] Constructing Fbank computer
2022-10-13 19:09:02,952 INFO [pretrained.py:295] Reading sound files: ['./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav']
2022-10-13 19:09:02,957 INFO [pretrained.py:301] Decoding started
2022-10-13 19:09:06,700 INFO [pretrained.py:329] Using greedy_search
2022-10-13 19:09:06,912 INFO [pretrained.py:388]
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav:
AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav:
GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav:
YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
2022-10-13 19:09:06,912 INFO [pretrained.py:390] Decoding Done

View File

@ -0,0 +1,135 @@
Export model.state_dict()
=========================
When to use it
--------------
During model training, we save checkpoints periodically to disk.
A checkpoint contains the following information:
- ``model.state_dict()``
- ``optimizer.state_dict()``
- and some other information related to training
When we need to resume the training process from some point, we need a checkpoint.
However, if we want to publish the model for inference, then only
``model.state_dict()`` is needed. In this case, we need to strip all other information
except ``model.state_dict()`` to reduce the file size of the published model.
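Conceptually, the stripping step just re-saves the ``"model"`` entry of a checkpoint. The snippet below is a minimal sketch of that idea only; the file names and the assumption that the training checkpoint stores its weights under the key ``"model"`` are illustrative, and the real ``export.py`` does more work (e.g., averaging several checkpoints).
.. code-block:: python
import torch
# Load a full training checkpoint (illustrative file name).
checkpoint = torch.load("epoch-20.pt", map_location="cpu")
# Keep only the model weights; optimizer state etc. are dropped.
torch.save({"model": checkpoint["model"]}, "pretrained.pt")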
How to export
-------------
Every recipe contains a file ``export.py`` that you can use to
export ``model.state_dict()`` by taking some checkpoints as inputs.
.. hint::
Each ``export.py`` contains well-documented usage information.
In the following, we use
`<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless3/export.py>`_
as an example.
.. note::
The steps for other recipes are almost the same.
.. code-block:: bash
cd egs/librispeech/ASR
./pruned_transducer_stateless3/export.py \
--exp-dir ./pruned_transducer_stateless3/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch 20 \
--avg 10
will generate a file ``pruned_transducer_stateless3/exp/pretrained.pt``, which
is a dict containing ``{"model": model.state_dict()}`` saved by ``torch.save()``.
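You can inspect the exported file directly if you are curious. The following is a minimal sketch; loading the weights back requires constructing the model with the same hyperparameters as in training, which is recipe specific and therefore only hinted at in the comments.
.. code-block:: python
import torch
state = torch.load("pruned_transducer_stateless3/exp/pretrained.pt", map_location="cpu")
print(list(state.keys()))   # ['model']
print(len(state["model"]))  # number of parameter tensors
# To reuse the weights, build the model exactly as in training and then:
#   model.load_state_dict(state["model"])
#   model.eval()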
How to use the exported model
-----------------------------
For each recipe, we provide pretrained models hosted on huggingface.
You can find links to pretrained models in ``RESULTS.md`` of each dataset.
In the following, we demonstrate how to use the pretrained model from
`<https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13>`_.
.. code-block:: bash
cd egs/librispeech/ASR
git lfs install
git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
After cloning the repo with ``git lfs``, you will find several files in the folder
``icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp``
that have a prefix ``pretrained-``. Those files contain ``model.state_dict()``
exported by the above ``export.py``.
In each recipe, there is also a file ``pretrained.py``, which can use
``pretrained-xxx.pt`` to decode waves. The following is an example:
.. code-block:: bash
cd egs/librispeech/ASR
./pruned_transducer_stateless3/pretrained.py \
--checkpoint ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/pretrained-iter-1224000-avg-14.pt \
--bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/bpe.model \
--method greedy_search \
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \
./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav
The above commands show how to use the exported model with ``pretrained.py`` to
decode multiple sound files. Its output is given as follows for reference:
.. literalinclude:: ./code/export-model-state-dict-pretrained-out.txt
Use the exported model to run decode.py
---------------------------------------
When we publish the model, we always note down its WERs on some test
dataset in ``RESULTS.md``. This section describes how to use the
pretrained model to reproduce the WER.
.. code-block:: bash
cd egs/librispeech/ASR
git lfs install
git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
cd icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp
ln -s pretrained-iter-1224000-avg-14.pt epoch-9999.pt
cd ../..
We create a symlink with name ``epoch-9999.pt`` to ``pretrained-iter-1224000-avg-14.pt``,
so that we can pass ``--epoch 9999 --avg 1`` to ``decode.py`` in the following
command:
.. code-block:: bash
./pruned_transducer_stateless3/decode.py \
--epoch 9999 \
--avg 1 \
--exp-dir ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp \
--lang-dir ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500 \
--max-duration 600 \
--decoding-method greedy_search
You will find the decoding results in
``./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/greedy_search``.
.. caution::
For some recipes, you also need to pass ``--use-averaged-model False``
to ``decode.py``. The reason is that the exported pretrained model is already
the averaged one.
.. hint::
Before running ``decode.py``, we assume that you have already run
``prepare.sh`` to prepare the test dataset.

View File

@ -0,0 +1,12 @@
Export to ncnn
==============
We support exporting LSTM transducer models to `ncnn <https://github.com/tencent/ncnn>`_.
Please refer to :ref:`export-model-for-ncnn` for details.
We also provide `<https://github.com/k2-fsa/sherpa-ncnn>`_,
which performs speech recognition using ``ncnn`` with exported models.
It has been tested on Linux, macOS, Windows, and Raspberry Pi. The project is
self-contained and can be statically linked to produce a binary containing
everything needed.

View File

@ -0,0 +1,69 @@
Export to ONNX
==============
In this section, we describe how to export models to ONNX.
.. hint::
Only non-streaming conformer transducer models are tested.
When to use it
--------------
If you want to use an inference framework that supports ONNX
to run the pretrained model.
How to export
-------------
We use
`<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless3>`_
as an example in the following.
.. code-block:: bash
cd egs/librispeech/ASR
epoch=14
avg=2
./pruned_transducer_stateless3/export.py \
--exp-dir ./pruned_transducer_stateless3/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch $epoch \
--avg $avg \
--onnx 1
It will generate the following files inside ``pruned_transducer_stateless3/exp``:
- ``encoder.onnx``
- ``decoder.onnx``
- ``joiner.onnx``
- ``joiner_encoder_proj.onnx``
- ``joiner_decoder_proj.onnx``
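Before decoding, you can optionally verify that the exported files load in ``onnxruntime``. The following is a minimal sketch; it assumes ``onnxruntime`` is installed and is run from inside ``pruned_transducer_stateless3/exp``.
.. code-block:: python
import onnxruntime as ort
for name in ["encoder.onnx", "decoder.onnx", "joiner.onnx",
             "joiner_encoder_proj.onnx", "joiner_decoder_proj.onnx"]:
    session = ort.InferenceSession(name, providers=["CPUExecutionProvider"])
    # Print the expected input names and shapes of each exported model.
    print(name, [(i.name, i.shape) for i in session.get_inputs()])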
You can use ``./pruned_transducer_stateless3/onnx_pretrained.py`` to decode
waves with the generated files:
.. code-block:: bash
./pruned_transducer_stateless3/onnx_pretrained.py \
--bpe-model ./data/lang_bpe_500/bpe.model \
--encoder-model-filename ./pruned_transducer_stateless3/exp/encoder.onnx \
--decoder-model-filename ./pruned_transducer_stateless3/exp/decoder.onnx \
--joiner-model-filename ./pruned_transducer_stateless3/exp/joiner.onnx \
--joiner-encoder-proj-model-filename ./pruned_transducer_stateless3/exp/joiner_encoder_proj.onnx \
--joiner-decoder-proj-model-filename ./pruned_transducer_stateless3/exp/joiner_decoder_proj.onnx \
/path/to/foo.wav \
/path/to/bar.wav \
/path/to/baz.wav
How to use the exported model
-----------------------------
We also provide `<https://github.com/k2-fsa/sherpa-onnx>`_,
which performs speech recognition using `onnxruntime <https://github.com/microsoft/onnxruntime>`_
with exported models.
It has been tested on Linux, macOS, and Windows.

View File

@ -0,0 +1,58 @@
.. _export-model-with-torch-jit-script:
Export model with torch.jit.script()
=====================================
In this section, we describe how to export a model via
``torch.jit.script()``.
When to use it
--------------
If we want to use our trained model with torchscript,
we can use ``torch.jit.script()``.
.. hint::
See :ref:`export-model-with-torch-jit-trace`
if you want to use ``torch.jit.trace()``.
How to export
-------------
We use
`<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless3>`_
as an example in the following.
.. code-block:: bash
cd egs/librispeech/ASR
epoch=14
avg=1
./pruned_transducer_stateless3/export.py \
--exp-dir ./pruned_transducer_stateless3/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--epoch $epoch \
--avg $avg \
--jit 1
It will generate a file ``cpu_jit.pt`` in ``pruned_transducer_stateless3/exp``.
.. caution::
Don't be confused by ``cpu`` in ``cpu_jit.pt``. We move all parameters
to CPU before saving it into a ``pt`` file; that's why we use ``cpu``
in the filename.
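Since the file is a TorchScript model, it can be loaded without the Python model definition. The following is a minimal sketch (the path is illustrative):
.. code-block:: python
import torch
# No icefall model code is needed to load the scripted model.
model = torch.jit.load("pruned_transducer_stateless3/exp/cpu_jit.pt")
model.eval()
print(model)  # prints the structure of the scripted module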
How to use the exported model
-----------------------------
Please refer to the following pages for usage:
- `<https://k2-fsa.github.io/sherpa/python/streaming_asr/emformer/index.html>`_
- `<https://k2-fsa.github.io/sherpa/python/streaming_asr/conv_emformer/index.html>`_
- `<https://k2-fsa.github.io/sherpa/python/streaming_asr/conformer/index.html>`_
- `<https://k2-fsa.github.io/sherpa/python/offline_asr/conformer/index.html>`_
- `<https://k2-fsa.github.io/sherpa/cpp/offline_asr/gigaspeech.html>`_
- `<https://k2-fsa.github.io/sherpa/cpp/offline_asr/wenetspeech.html>`_

View File

@ -0,0 +1,69 @@
.. _export-model-with-torch-jit-trace:
Export model with torch.jit.trace()
===================================
In this section, we describe how to export a model via
``torch.jit.trace()``.
When to use it
--------------
If we want to use our trained model with torchscript,
we can use ``torch.jit.trace()``.
.. hint::
See :ref:`export-model-with-torch-jit-script`
if you want to use ``torch.jit.script()``.
How to export
-------------
We use
`<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless2>`_
as an example in the following.
.. code-block:: bash
iter=468000
avg=16
cd egs/librispeech/ASR
./lstm_transducer_stateless2/export.py \
--exp-dir ./lstm_transducer_stateless2/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--iter $iter \
--avg $avg \
--jit-trace 1
It will generate three files inside ``lstm_transducer_stateless2/exp``:
- ``encoder_jit_trace.pt``
- ``decoder_jit_trace.pt``
- ``joiner_jit_trace.pt``
You can use
`<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/lstm_transducer_stateless2/jit_pretrained.py>`_
to decode sound files with the following commands:
.. code-block:: bash
cd egs/librispeech/ASR
./lstm_transducer_stateless2/jit_pretrained.py \
--bpe-model ./data/lang_bpe_500/bpe.model \
--encoder-model-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace.pt \
--decoder-model-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace.pt \
--joiner-model-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace.pt \
/path/to/foo.wav \
/path/to/bar.wav \
/path/to/baz.wav
How to use the exported models
------------------------------
Please refer to
`<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/index.html>`_
for its usage in `sherpa <https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/index.html>`_.
You can also find pretrained models there.

View File

@ -0,0 +1,14 @@
Model export
============
In this section, we describe various ways to export models.
.. toctree::
export-model-state-dict
export-with-torch-jit-trace
export-with-torch-jit-script
export-onnx
export-ncnn

View File

@ -422,7 +422,7 @@ The information of the test sound files is listed below:
.. code-block:: bash
$ soxi tmp/icefall_asr_aishell_conformer_ctc/test_wavs/*.wav
$ soxi tmp/icefall_asr_aishell_conformer_ctc/test_waves/*.wav
Input File : 'tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav'
Channels : 1
@ -485,9 +485,9 @@ The command to run CTC decoding is:
--checkpoint ./tmp/icefall_asr_aishell_conformer_ctc/exp/pretrained.pt \
--tokens-file ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/tokens.txt \
--method ctc-decoding \
./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0121.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0122.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0123.wav
./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav
The output is given below:
@ -529,9 +529,9 @@ The command to run HLG decoding is:
--words-file ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/words.txt \
--HLG ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt \
--method 1best \
./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0121.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0122.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0123.wav
./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav
The output is given below:
@ -575,9 +575,9 @@ The command to run HLG decoding + attention decoder rescoring is:
--words-file ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/words.txt \
--HLG ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt \
--method attention-decoder \
./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0121.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0122.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_wavs/BAC009S0764W0123.wav
./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav \
./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav
The output is below:

View File

@ -402,7 +402,7 @@ The information of the test sound files is listed below:
.. code-block:: bash
$ soxi tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_wavs/*.wav
$ soxi tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/*.wav
Input File : 'tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0121.wav'
Channels : 1
@ -461,9 +461,9 @@ The command to run HLG decoding is:
--words-file ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/data/lang_phone/words.txt \
--HLG ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/data/lang_phone/HLG.pt \
--method 1best \
./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_wavs/BAC009S0764W0121.wav \
./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_wavs/BAC009S0764W0122.wav \
./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_wavs/BAC009S0764W0123.wav
./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0121.wav \
./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0122.wav \
./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0123.wav
The output is given below:

Binary file not shown.


View File

@ -6,3 +6,4 @@ LibriSpeech
tdnn_lstm_ctc
conformer_ctc
lstm_pruned_stateless_transducer

View File

@ -0,0 +1,636 @@
LSTM Transducer
===============
.. hint::
Please scroll down to the bottom of this page to find download links
for pretrained models if you don't want to train a model from scratch.
This tutorial shows you how to train an LSTM transducer model
with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
We use pruned RNN-T to compute the loss.
.. note::
You can find the paper about pruned RNN-T at the following address:
`<https://arxiv.org/abs/2206.13236>`_
The transducer model consists of 3 parts:
- Encoder, a.k.a. the transcription network. We use an LSTM model.
- Decoder, a.k.a. the prediction network. We use a stateless model consisting of
``nn.Embedding`` and ``nn.Conv1d``.
- Joiner, a.k.a. the joint network.
.. caution::
Contrary to the conventional RNN-T models, we use a stateless decoder.
That is, it has no recurrent connections.
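For illustration, below is a minimal sketch of such a stateless decoder. The vocabulary size, embedding dimension, and context size are made-up values, and the actual decoder in the recipe differs in details.
.. code-block:: python
import torch
import torch.nn as nn
class StatelessDecoder(nn.Module):
    """Prediction network without recurrence: an embedding followed by a
    1-D convolution over the last ``context_size`` predicted tokens."""
    def __init__(self, vocab_size: int = 500, embed_dim: int = 512,
                 context_size: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # A convolution over a fixed, short context replaces the RNN state.
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=context_size)
    def forward(self, y: torch.Tensor) -> torch.Tensor:
        # y: (batch, context_size) token IDs -> (batch, embed_dim)
        emb = self.embedding(y).permute(0, 2, 1)  # (batch, embed_dim, context_size)
        return self.conv(emb).squeeze(-1)
decoder = StatelessDecoder()
out = decoder(torch.randint(0, 500, (4, 2)))  # -> torch.Size([4, 512])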
.. hint::
Since the encoder model is an LSTM, not Transformer/Conformer, the
resulting model is suitable for streaming/online ASR.
Which model to use
------------------
Currently, there are two recipes for LSTM stateless transducer training:
- ``(1)`` `<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless>`_
This recipe uses only LibriSpeech during training.
- ``(2)`` `<https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/lstm_transducer_stateless2>`_
This recipe uses GigaSpeech + LibriSpeech during training.
``(1)`` and ``(2)`` use the same model architecture. The only difference is that ``(2)`` supports
multi-dataset training. Since ``(2)`` uses more data, it achieves a lower WER than ``(1)``, but it
needs more training time.
We use ``lstm_transducer_stateless2`` as an example below.
.. note::
You need to download the `GigaSpeech <https://github.com/SpeechColab/GigaSpeech>`_ dataset
to run ``(2)``. If you only have the ``LibriSpeech`` dataset available, feel free to use ``(1)``.
Data preparation
----------------
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh
# If you use (1), you can **skip** the following command
$ ./prepare_giga_speech.sh
The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
All you need to do is to run it.
.. note::
We encourage you to read ``./prepare.sh``.
The data preparation contains several stages. You can use the following two
options:
- ``--stage``
- ``--stop-stage``
to control which stage(s) should be run. By default, all stages are executed.
For example,
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./prepare.sh --stage 0 --stop-stage 0
means to run only stage 0.
To run stage 2 to stage 5, use:
.. code-block:: bash
$ ./prepare.sh --stage 2 --stop-stage 5
.. hint::
If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
``./prepare.sh`` won't re-download them.
.. note::
All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
are saved in ``./data`` directory.
We provide the following YouTube video showing how to run ``./prepare.sh``.
.. note::
To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
`<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
.. youtube:: ofEIoJL-mGM
Training
--------
Configurable options
~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./lstm_transducer_stateless2/train.py --help
shows you the training options that can be passed from the commandline.
The following options are used quite often:
- ``--full-libri``
If it's True, the training part uses all the training data, i.e.,
960 hours. Otherwise, the training part uses only the subset
``train-clean-100``, which has 100 hours of training data.
.. CAUTION::
The training set is perturbed by speed with two factors: 0.9 and 1.1.
If ``--full-libri`` is True, each epoch actually processes
``3x960 == 2880`` hours of data.
- ``--num-epochs``
It is the number of epochs to train. For instance,
``./lstm_transducer_stateless2/train.py --num-epochs 30`` trains for 30 epochs
and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
in the folder ``./lstm_transducer_stateless2/exp``.
- ``--start-epoch``
It's used to resume training.
``./lstm_transducer_stateless2/train.py --start-epoch 10`` loads the
checkpoint ``./lstm_transducer_stateless2/exp/epoch-9.pt`` and starts
training from epoch 10, based on the state from epoch 9.
- ``--world-size``
It is used for multi-GPU single-machine DDP training.
- (a) If it is 1, then no DDP training is used.
- (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
The following shows some use cases with it.
**Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
GPU 2 for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="0,2"
$ ./lstm_transducer_stateless2/train.py --world-size 2
**Use case 2**: You have 4 GPUs and you want to use all of them
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./lstm_transducer_stateless2/train.py --world-size 4
**Use case 3**: You have 4 GPUs but you only want to use GPU 3
for training. You can do the following:
.. code-block:: bash
$ cd egs/librispeech/ASR
$ export CUDA_VISIBLE_DEVICES="3"
$ ./lstm_transducer_stateless2/train.py --world-size 1
.. caution::
Only multi-GPU single-machine DDP training is implemented at present.
Multi-GPU multi-machine DDP training will be added later.
- ``--max-duration``
It specifies the total duration in seconds, summed over all utterances in a
batch, before **padding**.
If you encounter CUDA OOM, please reduce it.
.. HINT::
Due to padding, the number of seconds of all utterances in a
batch will usually be larger than ``--max-duration``.
A larger value for ``--max-duration`` may cause OOM during training,
while a smaller value may increase the training time. You have to
tune it. (A conceptual sketch of duration-based batching is given after this list.)
- ``--giga-prob``
The probability to select a batch from the ``GigaSpeech`` dataset.
Note: It is available only for ``(2)``.
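As promised above, here is a conceptual sketch of how batching by total duration
(before padding) works. It is only an illustration; the actual sampler used by the
recipe lives in ``lhotse`` and is more sophisticated.

.. code-block:: python

def batch_by_duration(durations, max_duration=500.0):
    """Group utterance durations (in seconds) into batches whose total
    duration, before padding, does not exceed max_duration."""
    batches, current, total = [], [], 0.0
    for dur in durations:
        if current and total + dur > max_duration:
            batches.append(current)
            current, total = [], 0.0
        current.append(dur)
        total += dur
    if current:
        batches.append(current)
    return batches

print(batch_by_duration([7.5, 3.2, 12.0, 6.1, 9.8], max_duration=20.0))
# [[7.5, 3.2], [12.0, 6.1], [9.8]]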
Pre-configured options
~~~~~~~~~~~~~~~~~~~~~~
There are some training options, e.g., weight decay,
number of warmup steps, results dir, etc,
that are not passed from the commandline.
They are pre-configured by the function ``get_params()`` in
`lstm_transducer_stateless2/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/lstm_transducer_stateless2/train.py>`_
You don't need to change these pre-configured parameters. If you really need to change
them, please modify ``./lstm_transducer_stateless2/train.py`` directly.
Training logs
~~~~~~~~~~~~~
Training logs and checkpoints are saved in ``lstm_transducer_stateless2/exp``.
You will find the following files in that directory:
- ``epoch-1.pt``, ``epoch-2.pt``, ...
These are checkpoint files saved at the end of each epoch, containing model
``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
.. code-block:: bash
$ ./lstm_transducer_stateless2/train.py --start-epoch 11
- ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
These are checkpoint files saved every ``--save-every-n`` batches,
containing model ``state_dict`` and optimizer ``state_dict``.
To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
.. code-block:: bash
$ ./lstm_transducer_stateless2/train.py --start-batch 436000
- ``tensorboard/``
This folder contains TensorBoard logs. Training loss, validation loss, learning
rate, etc., are recorded in these logs. You can visualize them by:
.. code-block:: bash
$ cd lstm_transducer_stateless2/exp/tensorboard
$ tensorboard dev upload --logdir . --description "LSTM transducer training for LibriSpeech with icefall"
It will print something like below:
.. code-block::
TensorFlow installation not found - running with reduced feature set.
Upload started and will continue reading any new data as it's added to the logdir.
To stop uploading, press Ctrl-C.
New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/cj2vtPiwQHKN9Q1tx6PTpg/
[2022-09-20T15:50:50] Started scanning logdir.
Uploading 4468 scalars...
[2022-09-20T15:53:02] Total uploaded: 210171 scalars, 0 tensors, 0 binary objects
Listening for new data in logdir...
Note there is a URL in the above output. Click it and you will see
the following screenshot:
.. figure:: images/librispeech-lstm-transducer-tensorboard-log.png
:width: 600
:alt: TensorBoard screenshot
:align: center
:target: https://tensorboard.dev/experiment/lzGnETjwRxC3yghNMd4kPw/
TensorBoard screenshot.
.. hint::
If you don't have access to Google, you can use the following command
to view the tensorboard log locally:
.. code-block:: bash
cd lstm_transducer_stateless2/exp/tensorboard
tensorboard --logdir . --port 6008
It will print the following message:
.. code-block::
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
Now start your browser and go to `<http://localhost:6008>`_ to view the tensorboard
logs.
- ``log/log-train-xxxx``
It is the detailed training log in text format, same as the one
you saw printed to the console during training.
Usage example
~~~~~~~~~~~~~
You can use the following command to start the training using 8 GPUs:
.. code-block:: bash
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
./lstm_transducer_stateless2/train.py \
--world-size 8 \
--num-epochs 35 \
--start-epoch 1 \
--full-libri 1 \
--exp-dir lstm_transducer_stateless2/exp \
--max-duration 500 \
--use-fp16 0 \
--lr-epochs 10 \
--num-workers 2 \
--giga-prob 0.9
Decoding
--------
The decoding part uses checkpoints saved by the training part, so you have
to run the training part first.
.. hint::
There are two kinds of checkpoints:
- (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
of each epoch. You can pass ``--epoch`` to
``lstm_transducer_stateless2/decode.py`` to use them.
- (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
every ``--save-every-n`` batches. You can pass ``--iter`` to
``lstm_transducer_stateless2/decode.py`` to use them.
We suggest that you try both types of checkpoints and choose the one
that produces the lowest WERs.
.. code-block:: bash
$ cd egs/librispeech/ASR
$ ./lstm_transducer_stateless2/decode.py --help
shows the options for decoding.
The following shows two examples:
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for epoch in 17; do
for avg in 1 2; do
./lstm_transducer_stateless2/decode.py \
--epoch $epoch \
--avg $avg \
--exp-dir lstm_transducer_stateless2/exp \
--max-duration 600 \
--num-encoder-layers 12 \
--rnn-hidden-size 1024 \
--decoding-method $m \
--use-averaged-model True \
--beam 4 \
--max-contexts 4 \
--max-states 8 \
--beam-size 4
done
done
done
.. code-block:: bash
for m in greedy_search fast_beam_search modified_beam_search; do
for iter in 474000; do
for avg in 8 10 12 14 16 18; do
./lstm_transducer_stateless2/decode.py \
--iter $iter \
--avg $avg \
--exp-dir lstm_transducer_stateless2/exp \
--max-duration 600 \
--num-encoder-layers 12 \
--rnn-hidden-size 1024 \
--decoding-method $m \
--use-averaged-model True \
--beam 4 \
--max-contexts 4 \
--max-states 8 \
--beam-size 4
done
done
done
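The ``--avg`` option averages the parameters of several checkpoints before decoding.
The sketch below shows the basic idea only; the actual ``--use-averaged-model`` logic
in icefall is more involved, and the ``"model"`` key layout is an assumption here. The
sketch also assumes all entries in the state dict are floating-point tensors.

.. code-block:: python

import torch

def average_checkpoints(filenames):
    """Element-wise average of the model parameters stored in `filenames`."""
    avg = None
    for f in filenames:
        state = torch.load(f, map_location="cpu")["model"]
        if avg is None:
            avg = {k: v.clone().to(torch.float64) for k, v in state.items()}
        else:
            for k in avg:
                avg[k] += state[k].to(torch.float64)
    n = len(filenames)
    return {k: (v / n).to(torch.float32) for k, v in avg.items()}

# averaged = average_checkpoints(["exp/epoch-16.pt", "exp/epoch-17.pt"])
# model.load_state_dict(averaged)  # once you have constructed `model`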
Export models
-------------
`lstm_transducer_stateless2/export.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/lstm_transducer_stateless2/export.py>`_ supports exporting checkpoints from ``lstm_transducer_stateless2/exp`` in the following ways.
Export ``model.state_dict()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Checkpoints saved by ``lstm_transducer_stateless2/train.py`` also include
``optimizer.state_dict()``. It is useful for resuming training. But after training,
we are interested only in ``model.state_dict()``. You can use the following
command to extract ``model.state_dict()``.
.. code-block:: bash
# Assume that --iter 468000 --avg 16 produces the smallest WER
# (You can get such information after running ./lstm_transducer_stateless2/decode.py)
iter=468000
avg=16
./lstm_transducer_stateless2/export.py \
--exp-dir ./lstm_transducer_stateless2/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--iter $iter \
--avg $avg
It will generate a file ``./lstm_transducer_stateless2/exp/pretrained.pt``.
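If you just want to inspect or load ``pretrained.pt`` from your own Python code,
a minimal sketch is given below. The ``"model"`` key is an assumption about how the
exported checkpoint is laid out; adjust it if your file differs.

.. code-block:: python

import torch

checkpoint = torch.load(
    "lstm_transducer_stateless2/exp/pretrained.pt", map_location="cpu"
)
state_dict = checkpoint["model"]
num_params = sum(t.numel() for t in state_dict.values())
print(f"Number of model parameters: {num_params}")
# model.load_state_dict(state_dict)  # once you have constructed `model`
# (see lstm_transducer_stateless2/train.py for how the model is built)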
.. hint::
To use the generated ``pretrained.pt`` for ``lstm_transducer_stateless2/decode.py``,
you can run:
.. code-block:: bash
cd lstm_transducer_stateless2/exp
ln -s pretrained.pt epoch-9999.pt
And then pass ``--epoch 9999 --avg 1 --use-averaged-model 0`` to
``./lstm_transducer_stateless2/decode.py``.
To use the exported model with ``./lstm_transducer_stateless2/pretrained.py``, you
can run:
.. code-block:: bash
./lstm_transducer_stateless2/pretrained.py \
--checkpoint ./lstm_transducer_stateless2/exp/pretrained.pt \
--bpe-model ./data/lang_bpe_500/bpe.model \
--method greedy_search \
/path/to/foo.wav \
/path/to/bar.wav
Export model using ``torch.jit.trace()``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: bash
iter=468000
avg=16
./lstm_transducer_stateless2/export.py \
--exp-dir ./lstm_transducer_stateless2/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--iter $iter \
--avg $avg \
--jit-trace 1
It will generate 3 files:
- ``./lstm_transducer_stateless2/exp/encoder_jit_trace.pt``
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace.pt``
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace.pt``
To use the generated files with ``./lstm_transducer_stateless2/jit_pretrained.py``:
.. code-block:: bash
./lstm_transducer_stateless2/jit_pretrained.py \
--bpe-model ./data/lang_bpe_500/bpe.model \
--encoder-model-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace.pt \
--decoder-model-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace.pt \
--joiner-model-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace.pt \
/path/to/foo.wav \
/path/to/bar.wav
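You can also load the traced modules directly with ``torch.jit.load`` to experiment
with them. This is only a sketch: the exact input signatures (features, feature
lengths, LSTM states) are not spelled out here, so please refer to
``lstm_transducer_stateless2/jit_pretrained.py`` for the authoritative usage.

.. code-block:: python

import torch

encoder = torch.jit.load("lstm_transducer_stateless2/exp/encoder_jit_trace.pt")
decoder = torch.jit.load("lstm_transducer_stateless2/exp/decoder_jit_trace.pt")
joiner = torch.jit.load("lstm_transducer_stateless2/exp/joiner_jit_trace.pt")
for m in (encoder, decoder, joiner):
    m.eval()
print(encoder)  # prints the traced module structure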
.. hint::
Please see `<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/english/server.html>`_
for how to use the exported models in ``sherpa``.
.. _export-model-for-ncnn:
Export model for ncnn
~~~~~~~~~~~~~~~~~~~~~
We support exporting pretrained LSTM transducer models to
`ncnn <https://github.com/tencent/ncnn>`_ using
`pnnx <https://github.com/Tencent/ncnn/tree/master/tools/pnnx>`_.
First, let us install a modified version of ``ncnn``:
.. code-block:: bash
git clone https://github.com/csukuangfj/ncnn
cd ncnn
git submodule update --recursive --init
python3 setup.py bdist_wheel
ls -lh dist/
pip install ./dist/*.whl
# now build pnnx
cd tools/pnnx
mkdir build
cd build
cmake ..
make -j4
export PATH=$PWD/src:$PATH
./src/pnnx
.. note::
We assume that you have added the path to the binary ``pnnx`` to the
environment variable ``PATH``.
Second, let us export the model using ``torch.jit.trace()`` that is suitable
for ``pnnx``:
.. code-block:: bash
iter=468000
avg=16
./lstm_transducer_stateless2/export.py \
--exp-dir ./lstm_transducer_stateless2/exp \
--bpe-model data/lang_bpe_500/bpe.model \
--iter $iter \
--avg $avg \
--pnnx 1
It will generate 3 files:
- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.pt``
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.pt``
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.pt``
Third, convert the TorchScript models to ``ncnn`` format:
.. code-block::
pnnx ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.pt
pnnx ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.pt
pnnx ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.pt
It will generate the following files:
- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param``
- ``./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin``
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param``
- ``./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin``
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param``
- ``./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin``
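If you would like to poke at these files from Python, the sketch below loads one of
them with the ``ncnn`` Python bindings. The models contain custom layers, so it is
assumed you are using the modified ``ncnn`` built above rather than a stock
installation; see ``ncnn-decode.py`` below for the full decoding logic.

.. code-block:: python

import ncnn

net = ncnn.Net()
net.load_param("lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param")
net.load_model("lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin")
ex = net.create_extractor()
# ex.input(...) / ex.extract(...) follow; see ncnn-decode.py for the details.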
To use the above generated files, run:
.. code-block:: bash
./lstm_transducer_stateless2/ncnn-decode.py \
--bpe-model-filename ./data/lang_bpe_500/bpe.model \
--encoder-param-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin \
/path/to/foo.wav
.. code-block:: bash
./lstm_transducer_stateless2/streaming-ncnn-decode.py \
--bpe-model-filename ./data/lang_bpe_500/bpe.model \
--encoder-param-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.param \
--encoder-bin-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace-pnnx.ncnn.bin \
--decoder-param-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.param \
--decoder-bin-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace-pnnx.ncnn.bin \
--joiner-param-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.param \
--joiner-bin-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace-pnnx.ncnn.bin \
/path/to/foo.wav
To use the above generated files in C++, please see
`<https://github.com/k2-fsa/sherpa-ncnn>`_.
It can generate a statically linked executable that runs on Linux, Windows,
macOS, Raspberry Pi, etc., without external dependencies.
Download pretrained models
--------------------------
If you don't want to train from scratch, you can download the pretrained models
by visiting the following links:
- `<https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03>`_
- `<https://huggingface.co/Zengwei/icefall-asr-librispeech-lstm-transducer-stateless-2022-08-18>`_
See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
for the details of the above pretrained models.
You can find more usages of the pretrained models in
`<https://k2-fsa.github.io/sherpa/python/streaming_asr/lstm/index.html>`_

View File

@ -248,7 +248,9 @@ class ConformerEncoderLayer(nn.Module):
residual = src
if self.normalize_before:
src = self.norm_conv(src)
src = residual + self.dropout(self.conv_module(src))
src = residual + self.dropout(
self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
)
if not self.normalize_before:
src = self.norm_conv(src)
@ -879,11 +881,16 @@ class ConvolutionModule(nn.Module):
)
self.activation = Swish()
def forward(self, x: Tensor) -> Tensor:
def forward(
self,
x: Tensor,
src_key_padding_mask: Optional[Tensor] = None,
) -> Tensor:
"""Compute convolution module.
Args:
x: Input tensor (#time, batch, channels).
src_key_padding_mask: the mask for the src keys per batch (optional).
Returns:
Tensor: Output tensor (#time, batch, channels).
@ -897,6 +904,8 @@ class ConvolutionModule(nn.Module):
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
# 1D Depthwise Conv
if src_key_padding_mask is not None:
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
x = self.depthwise_conv(x)
x = self.activation(self.norm(x))
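# Illustration only (not part of the diff above): the mask is applied before the
# depthwise convolution so that padding values cannot leak into neighbouring
# frames through the convolution's receptive field. Shapes below are arbitrary
# examples chosen for this sketch.
import torch

x = torch.randn(2, 8, 6)  # (batch, channels, time)
src_key_padding_mask = torch.tensor(
    [[False] * 6, [False] * 4 + [True] * 2]  # True marks padded frames
)
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
assert (x[1, :, 4:] == 0).all()  # padded positions are now zero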

View File

@ -335,7 +335,7 @@ def decode_dataset(
lexicon: Lexicon,
sos_id: int,
eos_id: int,
) -> Dict[str, List[Tuple[List[int], List[int]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -410,7 +410,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
if params.method == "attention-decoder":
# Set it to False since there are too many logs.
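# Illustration only (an interpretation of the updated annotation, not part of the
# diff): each result entry now carries an utterance/cut ID alongside the reference
# and hypothesis word lists. The concrete values below are made up.
from typing import Dict, List, Tuple

example_results: Dict[str, List[Tuple[str, List[str], List[str]]]] = {
    "1best": [
        ("cut-id-0001", ["HELLO", "WORLD"], ["HELLO", "WORD"]),  # (cut_id, ref, hyp)
    ]
}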

View File

@ -248,7 +248,9 @@ class ConformerEncoderLayer(nn.Module):
residual = src
if self.normalize_before:
src = self.norm_conv(src)
src = residual + self.dropout(self.conv_module(src))
src = residual + self.dropout(
self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
)
if not self.normalize_before:
src = self.norm_conv(src)
@ -879,11 +881,16 @@ class ConvolutionModule(nn.Module):
)
self.activation = Swish()
def forward(self, x: Tensor) -> Tensor:
def forward(
self,
x: Tensor,
src_key_padding_mask: Optional[Tensor] = None,
) -> Tensor:
"""Compute convolution module.
Args:
x: Input tensor (#time, batch, channels).
src_key_padding_mask: the mask for the src keys per batch (optional).
Returns:
Tensor: Output tensor (#time, batch, channels).
@ -897,6 +904,8 @@ class ConvolutionModule(nn.Module):
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
# 1D Depthwise Conv
if src_key_padding_mask is not None:
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
x = self.depthwise_conv(x)
x = self.activation(self.norm(x))

View File

@ -347,7 +347,7 @@ def decode_dataset(
lexicon: Lexicon,
sos_id: int,
eos_id: int,
) -> Dict[str, List[Tuple[List[int], List[int]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -422,7 +422,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
if params.method == "attention-decoder":
# Set it to False since there are too many logs.

View File

@ -326,7 +326,7 @@ def decode_dataset(
model: nn.Module,
token_table: k2.SymbolTable,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -396,7 +396,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -340,7 +340,7 @@ def decode_dataset(
model: nn.Module,
token_table: k2.SymbolTable,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -410,7 +410,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -208,7 +208,7 @@ def decode_dataset(
model: nn.Module,
HLG: k2.Fsa,
lexicon: Lexicon,
) -> Dict[str, List[Tuple[List[int], List[int]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -274,7 +274,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -246,7 +246,9 @@ class ConformerEncoderLayer(nn.Module):
residual = src
if self.normalize_before:
src = self.norm_conv(src)
src = residual + self.dropout(self.conv_module(src))
src = residual + self.dropout(
self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
)
if not self.normalize_before:
src = self.norm_conv(src)
@ -877,11 +879,16 @@ class ConvolutionModule(nn.Module):
)
self.activation = Swish()
def forward(self, x: Tensor) -> Tensor:
def forward(
self,
x: Tensor,
src_key_padding_mask: Optional[Tensor] = None,
) -> Tensor:
"""Compute convolution module.
Args:
x: Input tensor (#time, batch, channels).
src_key_padding_mask: the mask for the src keys per batch (optional).
Returns:
Tensor: Output tensor (#time, batch, channels).
@ -895,6 +902,8 @@ class ConvolutionModule(nn.Module):
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
# 1D Depthwise Conv
if src_key_padding_mask is not None:
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
x = self.depthwise_conv(x)
# x is (batch, channels, time)
x = x.permute(0, 2, 1)

View File

@ -264,7 +264,7 @@ def decode_dataset(
params: AttributeDict,
model: nn.Module,
lexicon: Lexicon,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -328,7 +328,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -304,7 +304,7 @@ def decode_dataset(
model: nn.Module,
token_table: k2.SymbolTable,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -374,7 +374,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -308,7 +308,7 @@ def decode_dataset(
model: nn.Module,
token_table: k2.SymbolTable,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -378,7 +378,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -478,7 +478,7 @@ def decode_dataset(
lexicon: Lexicon,
graph_compiler: CharCtcTrainingGraphCompiler,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -547,7 +547,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -342,7 +342,7 @@ def decode_dataset(
model: nn.Module,
lexicon: Lexicon,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -410,7 +410,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -331,7 +331,7 @@ def decode_dataset(
model: nn.Module,
lexicon: Lexicon,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -399,7 +399,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

7
egs/csj/ASR/.gitignore vendored Normal file
View File

@ -0,0 +1,7 @@
librispeech_*.*
todelete*
lang*
notify_tg.py
finetune_*
misc.ini
.vscode/*

View File

@ -0,0 +1,173 @@
#!/usr/bin/env python3
# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
from itertools import islice
from pathlib import Path
from random import Random
from typing import List, Tuple
import torch
from lhotse import (
CutSet,
Fbank,
FbankConfig,
# fmt: off
# See the following for why LilcomChunkyWriter is preferred
# https://github.com/k2-fsa/icefall/pull/404
# https://github.com/lhotse-speech/lhotse/pull/527
# fmt: on
LilcomChunkyWriter,
RecordingSet,
SupervisionSet,
)
ARGPARSE_DESCRIPTION = """
This script follows the espnet method of splitting the remaining core+noncore
utterances into valid and train cutsets at an index which is by default 4000.
In other words, the core+noncore utterances are shuffled, where 4000 utterances
of the shuffled set go to the `valid` cutset and are not subject to speed
perturbation. The remaining utterances become the `train` cutset and are speed-
perturbed (0.9x, 1.0x, 1.1x).
"""
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
RNG_SEED = 42
def make_cutset_blueprints(
manifest_dir: Path,
split: int,
) -> List[Tuple[str, CutSet]]:
cut_sets = []
# Create eval datasets
logging.info("Creating eval cuts.")
for i in range(1, 4):
cut_set = CutSet.from_manifests(
recordings=RecordingSet.from_file(
manifest_dir / f"csj_recordings_eval{i}.jsonl.gz"
),
supervisions=SupervisionSet.from_file(
manifest_dir / f"csj_supervisions_eval{i}.jsonl.gz"
),
)
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
cut_sets.append((f"eval{i}", cut_set))
# Create train and valid cuts
logging.info(
"Loading, trimming, and shuffling the remaining core+noncore cuts."
)
recording_set = RecordingSet.from_file(
manifest_dir / "csj_recordings_core.jsonl.gz"
) + RecordingSet.from_file(manifest_dir / "csj_recordings_noncore.jsonl.gz")
supervision_set = SupervisionSet.from_file(
manifest_dir / "csj_supervisions_core.jsonl.gz"
) + SupervisionSet.from_file(
manifest_dir / "csj_supervisions_noncore.jsonl.gz"
)
cut_set = CutSet.from_manifests(
recordings=recording_set,
supervisions=supervision_set,
)
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
cut_set = cut_set.shuffle(Random(RNG_SEED))
logging.info(
"Creating valid and train cuts from core and noncore,"
f"split at {split}."
)
valid_set = CutSet.from_cuts(islice(cut_set, 0, split))
train_set = CutSet.from_cuts(islice(cut_set, split, None))
train_set = (
train_set + train_set.perturb_speed(0.9) + train_set.perturb_speed(1.1)
)
cut_sets.extend([("valid", valid_set), ("train", train_set)])
return cut_sets
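# Illustration only (not called by this script): a quick way to sanity-check one of
# the generated manifests after running this script. The path is an example.
def _verify_cuts(manifest_path: str = "data/manifests/csj_cuts_eval1.jsonl.gz") -> None:
    cuts = CutSet.from_file(manifest_path)
    cut = next(iter(cuts))
    feats = cut.load_features()
    logging.info(f"{cut.id}: features with shape {feats.shape}")  # (num_frames, 80)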
def get_args():
parser = argparse.ArgumentParser(
description=ARGPARSE_DESCRIPTION,
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--manifest-dir", type=Path, help="Path to save manifests"
)
parser.add_argument(
"--fbank-dir", type=Path, help="Path to save fbank features"
)
parser.add_argument(
"--split", type=int, default=4000, help="Split at this index"
)
return parser.parse_args()
def main():
args = get_args()
extractor = Fbank(FbankConfig(num_mel_bins=80))
num_jobs = min(16, os.cpu_count())
formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO)
if (args.fbank_dir / ".done").exists():
logging.info(
"Previous fbank computed for CSJ found. "
f"Delete {args.fbank_dir / '.done'} to allow recomputing fbank."
)
return
else:
cut_sets = make_cutset_blueprints(args.manifest_dir, args.split)
for part, cut_set in cut_sets:
logging.info(f"Processing {part}")
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
num_jobs=num_jobs,
storage_path=(args.fbank_dir / f"feats_{part}").as_posix(),
storage_type=LilcomChunkyWriter,
)
cut_set.to_file(args.manifest_dir / f"csj_cuts_{part}.jsonl.gz")
logging.info("All fbank computed for CSJ.")
(args.fbank_dir / ".done").touch()
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@
../../../librispeech/ASR/local/compute_fbank_musan.py

View File

@ -0,0 +1,321 @@
; # This section is ignored if this file is not supplied as the first config file to
; # lhotse prepare csj
[SEGMENTS]
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
gap = 0.5
; # Maximum length of segment (s).
maxlen = 10
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
minlen = 0.02
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
; # If you intend to use a multicharacter string for gap_sym, remember to register the
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
gap_sym =
[CONSTANTS]
; # Name of this mode
MODE = disfluent
; # Suffixes to use after the word surface (no longer used)
MORPH = pos1 cForm cType2 pos2
; # Used to differentiate between A tag and A_num tag
JPN_NUM = ゼロ 零 一 二 三 四 五 六 七 八 九 十 百 千
; # Dummy character to delineate multiline words
PLUS = ＋
[DECISIONS]
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
; # The PLUS (fullwidth) sign '＋' marks line boundaries for multiline entries
; # フィラー、感情表出系感動詞
; # 0 to remain, 1 to delete
; # Example: '(F ぎょっ)'
F = 0
; # Example: '(L (F ン))', '比べ(F えー)る'
F^ = 0
; # 言い直し、いいよどみなどによる語断片
; # 0 to remain, 1 to delete
; # Example: '(D だ)(D だいが) 大学の学部の会議'
D = 0
; # Example: '(L (D ドゥ)(D ヒ))'
D^ = 0
; # 助詞、助動詞、接辞の言い直し
; # 0 to remain, 1 to delete
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
D2 = 0
; # Example: '(X (D2 ))'
D2^ = 0
; # 聞き取りや語彙の判断に自信がない場合
; # 0 to remain, 1 to delete
; # Example: (? 字数) の
; # If no option: empty string is returned regardless of output
; # Example: '(?) で'
? = 0
; # Example: '(D (? すー))+そう+です+よ+ね'
?^ = 0
; # タグ?で、値は複数の候補が想定される場合
; # 0 for main guess with matching morph info, 1 for second guess
; # Example: '(? 次数, 実数)', '(? これ,ここで)(? 説明+し+た+方+が+いい+か+な)'
?, = 0
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
?,^ = 0
; # 音や言葉に関するメタ的な引用
; # 0 to remain, 1 to delete
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
M = 0
; # Example: '(L (M ヒ)(M ヒ))', '(L (M (? ヒ+ヒ)))'
M^ = 0
; # 外国語や古語、方言など
; # 0 to remain, 1 to delete
; # Example: '(O ザッツファイン)'
O = 0
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
O^ = 0
; # 講演者の名前、差別語、誹謗中傷など
; # 0 to remain, 1 to delete
; # Example: '国語研の (R ××) です'
R = 0
R^ = 0
; # 非朗読対象発話(朗読における言い間違い等)
; # 0 to remain, 1 to delete
; # Example: '(X 実際は) 実際には'
X = 0
; # Example: '(L (X (D2 ニ)))'
X^ = 0
; # アルファベットや算用数字、記号の表記
; # 0 to use Japanese form, 1 to use alphabet form
; # Example: '(A シーディーアール;)'
A = 1
; # Example: 'スモール(A エヌ;)', 'ラージ(A キュー;)', '(A ティーエフ;)(A アイディーエフ;)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
A^ = 1
; # タグAで、単語は算用数字の場合
; # 0 to use Japanese form, 1 to use Arabic numerals
; # Example: (A 二千;)
A_num = eval:self.notag
A_num^ = eval:self.notag
; # 何らかの原因で漢字表記できなくなった場合
; # 0 to use broken form, 1 to use orthodox form
; # Example: '(K たち (F えー) ばな;橘)'
K = 1
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
K^ = 1
; # 転訛、発音の怠けなど、一時的な発音エラー
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(W ギーツ;ギジュツ)'
W = 1
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
W^ = 1
; # 語の読みに関する知識レベルのいい間違い
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(B シブタイ;ジュータイ)'
B = 0
; # Example: 'データー(B カズ;スー)'
B^ = 0
; # 笑いながら発話
; # 0 to remain, 1 to delete
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
笑 = 0
; # Example: 'コク(笑 サイ+(D オン))',
笑^ = 0
; # 泣きながら発話
; # 0 to remain, 1 to delete
; # Example: '(泣 ドンナニ)'
泣 = 0
泣^ = 0
; # 咳をしながら発話
; # 0 to remain, 1 to delete
; # Example: 'シャ(咳 リン) '
咳 = 0
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
咳^ = 0
; # ささやき声や独り言などの小さな声
; # 0 to remain, 1 to delete
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
L = 0
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
L^ = 0
[REPLACEMENTS]
; # ボーカルフライなどで母音が同定できない場合
<FV> =
; # 「うん/うーん/ふーん」の音の特定が困難な場合
<VN> =
; # 非語彙的な母音の引き延ばし
<H> =
; # 非語彙的な子音の引き延ばし
<Q> =
; # 言語音と独立に講演者の笑いが生じている場合
<笑> =
; # 言語音と独立に講演者の咳が生じている場合
<咳> =
; # 言語音と独立に講演者の息が生じている場合
<息> =
; # 講演者の泣き声
<泣> =
; # 聴衆(司会者なども含む)の発話
<フロア発話> =
; # 聴衆の笑い
<フロア笑> =
; # 聴衆の拍手
<拍手> =
; # 講演者が発表中に用いたデモンストレーションの音声
<デモ> =
; # 学会講演に発表時間を知らせるためにならすベルの音
<ベル> =
; # 転記単位全体が再度読み直された場合
<朗読間違い> =
; # 上記以外の音で特に目立った音
<雑音> =
; # 0.2秒以上のポーズ
<P> =
; # Redacted information, for R
; # It is \x00D7 multiplication sign, not your normal 'x'
× = ×
[FIELDS]
; # Time information for segment
time = 3
; # Word surface
surface = 5
; # Word surface root form without CSJ tags
notag = 9
; # Part Of Speech
pos1 = 11
; # Conjugated Form
cForm = 12
; # Conjugation Type
cType1 = 13
; # Subcategory of POS
pos2 = 14
; # Euphonic Change / Subcategory of Conjugation Type
cType2 = 15
; # Other information
other = 16
; # Pronunciation for lexicon
pron = 10
; # Speaker ID
spk_id = 2
[KATAKANA2ROMAJI]
= 'a
= 'i
= 'u
= 'e
= 'o
= ka
= ki
= ku
= ke
= ko
= ga
= gi
= gu
= ge
= go
= sa
= si
= su
= se
= so
= za
= zi
= zu
= ze
= zo
= ta
= ti
= tu
= te
= to
= da
= di
= du
= de
= do
= na
= ni
= nu
= ne
= no
= ha
= hi
= hu
= he
= ho
= ba
= bi
= bu
= be
= bo
= pa
= pi
= pu
= pe
= po
= ma
= mi
= mu
= me
= mo
= ya
= yu
= yo
= ra
= ri
= ru
= re
= ro
= wa
= we
= wi
= wo
= ŋ
= q
= -
キャ = kǐa
キュ = kǐu
キョ = kǐo
ギャ = gǐa
ギュ = gǐu
ギョ = gǐo
シャ = sǐa
シュ = sǐu
ショ = sǐo
ジャ = zǐa
ジュ = zǐu
ジョ = zǐo
チャ = tǐa
チュ = tǐu
チョ = tǐo
ヂャ = dǐa
ヂュ = dǐu
ヂョ = dǐo
ニャ = nǐa
ニュ = nǐu
ニョ = nǐo
ヒャ = hǐa
ヒュ = hǐu
ヒョ = hǐo
ビャ = bǐa
ビュ = bǐu
ビョ = bǐo
ピャ = pǐa
ピュ = pǐu
ピョ = pǐo
ミャ = mǐa
ミュ = mǐu
ミョ = mǐo
リャ = rǐa
リュ = rǐu
リョ = rǐo
= a
= i
= u
= e
= o
= ʍ
= vu
= ǐa
= ǐu
= ǐo

View File

@ -0,0 +1,321 @@
; # This section is ignored if this file is not supplied as the first config file to
; # lhotse prepare csj
[SEGMENTS]
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
gap = 0.5
; # Maximum length of segment (s).
maxlen = 10
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
minlen = 0.02
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
; # If you intend to use a multicharacter string for gap_sym, remember to register the
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
gap_sym =
[CONSTANTS]
; # Name of this mode
MODE = fluent
; # Suffixes to use after the word surface (no longer used)
MORPH = pos1 cForm cType2 pos2
; # Used to differentiate between A tag and A_num tag
JPN_NUM = ゼロ 零 一 二 三 四 五 六 七 八 九 十 百 千
; # Dummy character to delineate multiline words
PLUS = ＋
[DECISIONS]
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
; # The PLUS (fullwidth) sign '＋' marks line boundaries for multiline entries
; # フィラー、感情表出系感動詞
; # 0 to remain, 1 to delete
; # Example: '(F ぎょっ)'
F = 1
; # Example: '(L (F ン))', '比べ(F えー)る'
F^ = 1
; # 言い直し、いいよどみなどによる語断片
; # 0 to remain, 1 to delete
; # Example: '(D だ)(D だいが) 大学の学部の会議'
D = 1
; # Example: '(L (D ドゥ)(D ヒ))'
D^ = 1
; # 助詞、助動詞、接辞の言い直し
; # 0 to remain, 1 to delete
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
D2 = 1
; # Example: '(X (D2 ))'
D2^ = 1
; # 聞き取りや語彙の判断に自信がない場合
; # 0 to remain, 1 to delete
; # Example: (? 字数) の
; # If no option: empty string is returned regardless of output
; # Example: '(?) で'
? = 0
; # Example: '(D (? すー))+そう+です+よ+ね'
?^ = 0
; # タグ?で、値は複数の候補が想定される場合
; # 0 for main guess with matching morph info, 1 for second guess
; # Example: '(? 次数, 実数)', '(? これ,ここで)(? 説明+し+た+方+が+いい+か+な)'
?, = 0
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
?,^ = 0
; # 音や言葉に関するメタ的な引用
; # 0 to remain, 1 to delete
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
M = 0
; # Example: '(L (M ヒ)(M ヒ))', '(L (M (? ヒ+ヒ)))'
M^ = 0
; # 外国語や古語、方言など
; # 0 to remain, 1 to delete
; # Example: '(O ザッツファイン)'
O = 0
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
O^ = 0
; # 講演者の名前、差別語、誹謗中傷など
; # 0 to remain, 1 to delete
; # Example: '国語研の (R ××) です'
R = 0
R^ = 0
; # 非朗読対象発話(朗読における言い間違い等)
; # 0 to remain, 1 to delete
; # Example: '(X 実際は) 実際には'
X = 0
; # Example: '(L (X (D2 ニ)))'
X^ = 0
; # アルファベットや算用数字、記号の表記
; # 0 to use Japanese form, 1 to use alphabet form
; # Example: '(A シーディーアール;)'
A = 1
; # Example: 'スモール(A エヌ;)', 'ラージ(A キュー;)', '(A ティーエフ;)(A アイディーエフ;)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
A^ = 1
; # タグAで、単語は算用数字の場合
; # 0 to use Japanese form, 1 to use Arabic numerals
; # Example: (A 二千;)
A_num = eval:self.notag
A_num^ = eval:self.notag
; # 何らかの原因で漢字表記できなくなった場合
; # 0 to use broken form, 1 to use orthodox form
; # Example: '(K たち (F えー) ばな;橘)'
K = 1
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
K^ = 1
; # 転訛、発音の怠けなど、一時的な発音エラー
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(W ギーツ;ギジュツ)'
W = 1
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
W^ = 1
; # 語の読みに関する知識レベルのいい間違い
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(B シブタイ;ジュータイ)'
B = 0
; # Example: 'データー(B カズ;スー)'
B^ = 0
; # 笑いながら発話
; # 0 to remain, 1 to delete
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
笑 = 0
; # Example: 'コク(笑 サイ+(D オン))',
笑^ = 0
; # 泣きながら発話
; # 0 to remain, 1 to delete
; # Example: '(泣 ドンナニ)'
泣 = 0
泣^ = 0
; # 咳をしながら発話
; # 0 to remain, 1 to delete
; # Example: 'シャ(咳 リン) '
咳 = 0
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
咳^ = 0
; # ささやき声や独り言などの小さな声
; # 0 to remain, 1 to delete
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
L = 0
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
L^ = 0
[REPLACEMENTS]
; # ボーカルフライなどで母音が同定できない場合
<FV> =
; # 「うん/うーん/ふーん」の音の特定が困難な場合
<VN> =
; # 非語彙的な母音の引き延ばし
<H> =
; # 非語彙的な子音の引き延ばし
<Q> =
; # 言語音と独立に講演者の笑いが生じている場合
<笑> =
; # 言語音と独立に講演者の咳が生じている場合
<咳> =
; # 言語音と独立に講演者の息が生じている場合
<息> =
; # 講演者の泣き声
<泣> =
; # 聴衆(司会者なども含む)の発話
<フロア発話> =
; # 聴衆の笑い
<フロア笑> =
; # 聴衆の拍手
<拍手> =
; # 講演者が発表中に用いたデモンストレーションの音声
<デモ> =
; # 学会講演に発表時間を知らせるためにならすベルの音
<ベル> =
; # 転記単位全体が再度読み直された場合
<朗読間違い> =
; # 上記以外の音で特に目立った音
<雑音> =
; # 0.2秒以上のポーズ
<P> =
; # Redacted information, for R
; # It is \x00D7 multiplication sign, not your normal 'x'
× = ×
[FIELDS]
; # Time information for segment
time = 3
; # Word surface
surface = 5
; # Word surface root form without CSJ tags
notag = 9
; # Part Of Speech
pos1 = 11
; # Conjugated Form
cForm = 12
; # Conjugation Type
cType1 = 13
; # Subcategory of POS
pos2 = 14
; # Euphonic Change / Subcategory of Conjugation Type
cType2 = 15
; # Other information
other = 16
; # Pronunciation for lexicon
pron = 10
; # Speaker ID
spk_id = 2
[KATAKANA2ROMAJI]
= 'a
= 'i
= 'u
= 'e
= 'o
= ka
= ki
= ku
= ke
= ko
= ga
= gi
= gu
= ge
= go
= sa
= si
= su
= se
= so
= za
= zi
= zu
= ze
= zo
= ta
= ti
= tu
= te
= to
= da
= di
= du
= de
= do
= na
= ni
= nu
= ne
= no
= ha
= hi
= hu
= he
= ho
= ba
= bi
= bu
= be
= bo
= pa
= pi
= pu
= pe
= po
= ma
= mi
= mu
= me
= mo
= ya
= yu
= yo
= ra
= ri
= ru
= re
= ro
= wa
= we
= wi
= wo
= ŋ
= q
= -
キャ = kǐa
キュ = kǐu
キョ = kǐo
ギャ = gǐa
ギュ = gǐu
ギョ = gǐo
シャ = sǐa
シュ = sǐu
ショ = sǐo
ジャ = zǐa
ジュ = zǐu
ジョ = zǐo
チャ = tǐa
チュ = tǐu
チョ = tǐo
ヂャ = dǐa
ヂュ = dǐu
ヂョ = dǐo
ニャ = nǐa
ニュ = nǐu
ニョ = nǐo
ヒャ = hǐa
ヒュ = hǐu
ヒョ = hǐo
ビャ = bǐa
ビュ = bǐu
ビョ = bǐo
ピャ = pǐa
ピュ = pǐu
ピョ = pǐo
ミャ = mǐa
ミュ = mǐu
ミョ = mǐo
リャ = rǐa
リュ = rǐu
リョ = rǐo
= a
= i
= u
= e
= o
= ʍ
= vu
= ǐa
= ǐu
= ǐo

View File

@ -0,0 +1,321 @@
; # This section is ignored if this file is not supplied as the first config file to
; # lhotse prepare csj
[SEGMENTS]
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
gap = 0.5
; # Maximum length of segment (s).
maxlen = 10
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
minlen = 0.02
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
; # If you intend to use a multicharacter string for gap_sym, remember to register the
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
gap_sym =
[CONSTANTS]
; # Name of this mode
MODE = number
; # Suffixes to use after the word surface (no longer used)
MORPH = pos1 cForm cType2 pos2
; # Used to differentiate between A tag and A_num tag
JPN_NUM = ゼロ 零 一 二 三 四 五 六 七 八 九 十 百 千
; # Dummy character to delineate multiline words
PLUS = ＋
[DECISIONS]
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
; # The PLUS (fullwidth) sign '＋' marks line boundaries for multiline entries
; # フィラー、感情表出系感動詞
; # 0 to remain, 1 to delete
; # Example: '(F ぎょっ)'
F = 1
; # Example: '(L (F ン))', '比べ(F えー)る'
F^ = 1
; # 言い直し、いいよどみなどによる語断片
; # 0 to remain, 1 to delete
; # Example: '(D だ)(D だいが) 大学の学部の会議'
D = 1
; # Example: '(L (D ドゥ)(D ヒ))'
D^ = 1
; # 助詞、助動詞、接辞の言い直し
; # 0 to remain, 1 to delete
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
D2 = 1
; # Example: '(X (D2 ))'
D2^ = 1
; # 聞き取りや語彙の判断に自信がない場合
; # 0 to remain, 1 to delete
; # Example: (? 字数) の
; # If no option: empty string is returned regardless of output
; # Example: '(?) で'
? = 0
; # Example: '(D (? すー))+そう+です+よ+ね'
?^ = 0
; # タグ?で、値は複数の候補が想定される場合
; # 0 for main guess with matching morph info, 1 for second guess
; # Example: '(? 次数, 実数)', '(? これ,ここで)(? 説明+し+た+方+が+いい+か+な)'
?, = 0
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
?,^ = 0
; # 音や言葉に関するメタ的な引用
; # 0 to remain, 1 to delete
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
M = 0
; # Example: '(L (M ヒ)(M ヒ))', '(L (M (? ヒ+ヒ)))'
M^ = 0
; # 外国語や古語、方言など
; # 0 to remain, 1 to delete
; # Example: '(O ザッツファイン)'
O = 0
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
O^ = 0
; # 講演者の名前、差別語、誹謗中傷など
; # 0 to remain, 1 to delete
; # Example: '国語研の (R ××) です'
R = 0
R^ = 0
; # 非朗読対象発話(朗読における言い間違い等)
; # 0 to remain, 1 to delete
; # Example: '(X 実際は) 実際には'
X = 0
; # Example: '(L (X (D2 ニ)))'
X^ = 0
; # アルファベットや算用数字、記号の表記
; # 0 to use Japanese form, 1 to use alphabet form
; # Example: '(A シーディーアール;)'
A = 1
; # Example: 'スモール(A エヌ;)', 'ラージ(A キュー;)', '(A ティーエフ;)(A アイディーエフ;)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
A^ = 1
; # タグAで、単語は算用数字の場合
; # 0 to use Japanese form, 1 to use Arabic numerals
; # Example: (A 二千;)
A_num = 1
A_num^ = 1
; # 何らかの原因で漢字表記できなくなった場合
; # 0 to use broken form, 1 to use orthodox form
; # Example: '(K たち (F えー) ばな;橘)'
K = 1
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
K^ = 1
; # 転訛、発音の怠けなど、一時的な発音エラー
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(W ギーツ;ギジュツ)'
W = 1
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
W^ = 1
; # 語の読みに関する知識レベルのいい間違い
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(B シブタイ;ジュータイ)'
B = 0
; # Example: 'データー(B カズ;スー)'
B^ = 0
; # 笑いながら発話
; # 0 to remain, 1 to delete
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
笑 = 0
; # Example: 'コク(笑 サイ+(D オン))',
笑^ = 0
; # 泣きながら発話
; # 0 to remain, 1 to delete
; # Example: '(泣 ドンナニ)'
泣 = 0
泣^ = 0
; # 咳をしながら発話
; # 0 to remain, 1 to delete
; # Example: 'シャ(咳 リン) '
咳 = 0
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
咳^ = 0
; # ささやき声や独り言などの小さな声
; # 0 to remain, 1 to delete
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
L = 0
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
L^ = 0
[REPLACEMENTS]
; # ボーカルフライなどで母音が同定できない場合
<FV> =
; # 「うん/うーん/ふーん」の音の特定が困難な場合
<VN> =
; # 非語彙的な母音の引き延ばし
<H> =
; # 非語彙的な子音の引き延ばし
<Q> =
; # 言語音と独立に講演者の笑いが生じている場合
<笑> =
; # 言語音と独立に講演者の咳が生じている場合
<咳> =
; # 言語音と独立に講演者の息が生じている場合
<息> =
; # 講演者の泣き声
<泣> =
; # 聴衆(司会者なども含む)の発話
<フロア発話> =
; # 聴衆の笑い
<フロア笑> =
; # 聴衆の拍手
<拍手> =
; # 講演者が発表中に用いたデモンストレーションの音声
<デモ> =
; # 学会講演に発表時間を知らせるためにならすベルの音
<ベル> =
; # 転記単位全体が再度読み直された場合
<朗読間違い> =
; # 上記以外の音で特に目立った音
<雑音> =
; # 0.2秒以上のポーズ
<P> =
; # Redacted information, for R
; # It is \x00D7 multiplication sign, not your normal 'x'
× = ×
[FIELDS]
; # Time information for segment
time = 3
; # Word surface
surface = 5
; # Word surface root form without CSJ tags
notag = 9
; # Part Of Speech
pos1 = 11
; # Conjugated Form
cForm = 12
; # Conjugation Type
cType1 = 13
; # Subcategory of POS
pos2 = 14
; # Euphonic Change / Subcategory of Conjugation Type
cType2 = 15
; # Other information
other = 16
; # Pronunciation for lexicon
pron = 10
; # Speaker ID
spk_id = 2
[KATAKANA2ROMAJI]
= 'a
= 'i
= 'u
= 'e
= 'o
= ka
= ki
= ku
= ke
= ko
= ga
= gi
= gu
= ge
= go
= sa
= si
= su
= se
= so
= za
= zi
= zu
= ze
= zo
= ta
= ti
= tu
= te
= to
= da
= di
= du
= de
= do
= na
= ni
= nu
= ne
= no
= ha
= hi
= hu
= he
= ho
= ba
= bi
= bu
= be
= bo
= pa
= pi
= pu
= pe
= po
= ma
= mi
= mu
= me
= mo
= ya
= yu
= yo
= ra
= ri
= ru
= re
= ro
= wa
= we
= wi
= wo
= ŋ
= q
= -
キャ = kǐa
キュ = kǐu
キョ = kǐo
ギャ = gǐa
ギュ = gǐu
ギョ = gǐo
シャ = sǐa
シュ = sǐu
ショ = sǐo
ジャ = zǐa
ジュ = zǐu
ジョ = zǐo
チャ = tǐa
チュ = tǐu
チョ = tǐo
ヂャ = dǐa
ヂュ = dǐu
ヂョ = dǐo
ニャ = nǐa
ニュ = nǐu
ニョ = nǐo
ヒャ = hǐa
ヒュ = hǐu
ヒョ = hǐo
ビャ = bǐa
ビュ = bǐu
ビョ = bǐo
ピャ = pǐa
ピュ = pǐu
ピョ = pǐo
ミャ = mǐa
ミュ = mǐu
ミョ = mǐo
リャ = rǐa
リュ = rǐu
リョ = rǐo
= a
= i
= u
= e
= o
= ʍ
= vu
= ǐa
= ǐu
= ǐo

View File

@ -0,0 +1,322 @@
; # This section is ignored if this file is not supplied as the first config file to
; # lhotse prepare csj
[SEGMENTS]
; # Allowed period of nonverbal noise. If exceeded, a new segment is created.
gap = 0.5
; # Maximum length of segment (s).
maxlen = 10
; # Minimum length of segment (s). Segments shorter than `minlen` will be dropped silently.
minlen = 0.02
; # Use this symbol to represent a period of allowed nonverbal noise, i.e. `gap`.
; # Pass an empty string to avoid adding any symbol. It was "<sp>" in kaldi.
; # If you intend to use a multicharacter string for gap_sym, remember to register the
; # multicharacter string as part of userdef-string in prepare_lang_char.py.
gap_sym =
[CONSTANTS]
; # Name of this mode
; # See https://www.isca-speech.org/archive/pdfs/interspeech_2022/horii22_interspeech.pdf
MODE = symbol
; # Suffixes to use after the word surface (no longer used)
MORPH = pos1 cForm cType2 pos2
; # Used to differentiate between A tag and A_num tag
JPN_NUM = ゼロ 零 一 二 三 四 五 六 七 八 九 十 百 千
; # Dummy character to delineate multiline words
PLUS = ＋
[DECISIONS]
; # TAG+'^'とは、タグが一つの転記単位に独立していない場合
; # The PLUS (fullwidth) sign '＋' marks line boundaries for multiline entries
; # フィラー、感情表出系感動詞
; # 0 to remain, 1 to delete
; # Example: '(F ぎょっ)'
F =
; # Example: '(L (F ン))', '比べ(F えー)る'
F^ =
; # 言い直し、いいよどみなどによる語断片
; # 0 to remain, 1 to delete
; # Example: '(D だ)(D だいが) 大学の学部の会議'
D =
; # Example: '(L (D ドゥ)(D ヒ))'
D^ =
; # 助詞、助動詞、接辞の言い直し
; # 0 to remain, 1 to delete
; # Example: '西洋 (D2 的)(F えー)(D ふ) 風というか'
D2 =
; # Example: '(X (D2 ))'
D2^ =
; # 聞き取りや語彙の判断に自信がない場合
; # 0 to remain, 1 to delete
; # Example: (? 字数) の
; # If no option: empty string is returned regardless of output
; # Example: '(?) で'
? = 0
; # Example: '(D (? すー))+そう+です+よ+ね'
?^ = 0
; # タグ?で、値は複数の候補が想定される場合
; # 0 for main guess with matching morph info, 1 for second guess
; # Example: '(? 次数, 実数)', '(? これ,ここで)(? 説明+し+た+方+が+いい+か+な)'
?, = 0
; # Example: '(W (? テユクー);(? ケッキョク,テユウコトデ))', '(W マシ;(? マシ+タ,マス))'
?,^ = 0
; # 音や言葉に関するメタ的な引用
; # 0 to remain, 1 to delete
; # Example: '助詞の (M は) は (M は) と書くが発音は (M わ)'
M = 0
; # Example: '(L (M ヒ)(M ヒ))', '(L (M (? ヒ+ヒ)))'
M^ = 0
; # 外国語や古語、方言など
; # 0 to remain, 1 to delete
; # Example: '(O ザッツファイン)'
O = 0
; # Example: '(笑 (O エクスキューズ+ミー))', '(笑 メダッ+テ+(O ナンボ))'
O^ = 0
; # 講演者の名前、差別語、誹謗中傷など
; # 0 to remain, 1 to delete
; # Example: '国語研の (R ××) です'
R = 0
R^ = 0
; # 非朗読対象発話(朗読における言い間違い等)
; # 0 to remain, 1 to delete
; # Example: '(X 実際は) 実際には'
X = 0
; # Example: '(L (X (D2 ニ)))'
X^ = 0
; # アルファベットや算用数字、記号の表記
; # 0 to use Japanese form, 1 to use alphabet form
; # Example: '(A シーディーアール;)'
A = 1
; # Example: 'スモール(A エヌ;)', 'ラージ(A キュー;)', '(A ティーエフ;)(A アイディーエフ;)' (Strung together by pron: '(W (? ティーワイド);ティーエフ+アイディーエフ)')
A^ = 1
; # タグAで、単語は算用数字の場合
; # 0 to use Japanese form, 1 to use Arabic numerals
; # Example: (A 二千;)
A_num = eval:self.notag
A_num^ = eval:self.notag
; # 何らかの原因で漢字表記できなくなった場合
; # 0 to use broken form, 1 to use orthodox form
; # Example: '(K たち (F えー) ばな;橘)'
K = 1
; # Example: '合(K か(?)く;格)', '宮(K ま(?)え;前)'
K^ = 1
; # 転訛、発音の怠けなど、一時的な発音エラー
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(W ギーツ;ギジュツ)'
W = 1
; # Example: '(F (W エド;エト))', 'イベント(W リレーティッド;リレーテッド)'
W^ = 1
; # 語の読みに関する知識レベルのいい間違い
; # 0 to use wrong form, 1 to use orthodox form
; # Example: '(B シブタイ;ジュータイ)'
B = 0
; # Example: 'データー(B カズ;スー)'
B^ = 0
; # 笑いながら発話
; # 0 to remain, 1 to delete
; # Example: '(笑 ナニガ)', '(笑 (F エー)+ソー+イッ+タ+ヨー+ナ)'
笑 = 0
; # Example: 'コク(笑 サイ+(D オン))',
笑^ = 0
; # 泣きながら発話
; # 0 to remain, 1 to delete
; # Example: '(泣 ドンナニ)'
泣 = 0
泣^ = 0
; # 咳をしながら発話
; # 0 to remain, 1 to delete
; # Example: 'シャ(咳 リン) '
咳 = 0
; # Example: 'イッ(咳 パン)', 'ワズ(咳 カ)'
咳^ = 0
; # ささやき声や独り言などの小さな声
; # 0 to remain, 1 to delete
; # Example: '(L アレコレナンダッケ)', '(L (W コデ;(? コレ,ココデ))(? セツメー+シ+タ+ホー+ガ+イー+カ+ナ))'
L = 0
; # Example: 'デ(L ス)', 'ッ(L テ+コ)ト'
L^ = 0
[REPLACEMENTS]
; # ボーカルフライなどで母音が同定できない場合
<FV> =
; # 「うん/うーん/ふーん」の音の特定が困難な場合
<VN> =
; # 非語彙的な母音の引き延ばし
<H> =
; # 非語彙的な子音の引き延ばし
<Q> =
; # 言語音と独立に講演者の笑いが生じている場合
<笑> =
; # 言語音と独立に講演者の咳が生じている場合
<咳> =
; # 言語音と独立に講演者の息が生じている場合
<息> =
; # 講演者の泣き声
<泣> =
; # 聴衆(司会者なども含む)の発話
<フロア発話> =
; # 聴衆の笑い
<フロア笑> =
; # 聴衆の拍手
<拍手> =
; # 講演者が発表中に用いたデモンストレーションの音声
<デモ> =
; # 学会講演に発表時間を知らせるためにならすベルの音
<ベル> =
; # 転記単位全体が再度読み直された場合
<朗読間違い> =
; # 上記以外の音で特に目立った音
<雑音> =
; # 0.2秒以上のポーズ
<P> =
; # Redacted information, for R
; # It is \x00D7 multiplication sign, not your normal 'x'
× = ×
[FIELDS]
; # Time information for segment
time = 3
; # Word surface
surface = 5
; # Word surface root form without CSJ tags
notag = 9
; # Part Of Speech
pos1 = 11
; # Conjugated Form
cForm = 12
; # Conjugation Type
cType1 = 13
; # Subcategory of POS
pos2 = 14
; # Euphonic Change / Subcategory of Conjugation Type
cType2 = 15
; # Other information
other = 16
; # Pronunciation for lexicon
pron = 10
; # Speaker ID
spk_id = 2
[KATAKANA2ROMAJI]
= 'a
= 'i
= 'u
= 'e
= 'o
= ka
= ki
= ku
= ke
= ko
= ga
= gi
= gu
= ge
= go
= sa
= si
= su
= se
= so
= za
= zi
= zu
= ze
= zo
= ta
= ti
= tu
= te
= to
= da
= di
= du
= de
= do
= na
= ni
= nu
= ne
= no
= ha
= hi
= hu
= he
= ho
= ba
= bi
= bu
= be
= bo
= pa
= pi
= pu
= pe
= po
= ma
= mi
= mu
= me
= mo
= ya
= yu
= yo
= ra
= ri
= ru
= re
= ro
= wa
= we
= wi
= wo
= ŋ
= q
= -
キャ = kǐa
キュ = kǐu
キョ = kǐo
ギャ = gǐa
ギュ = gǐu
ギョ = gǐo
シャ = sǐa
シュ = sǐu
ショ = sǐo
ジャ = zǐa
ジュ = zǐu
ジョ = zǐo
チャ = tǐa
チュ = tǐu
チョ = tǐo
ヂャ = dǐa
ヂュ = dǐu
ヂョ = dǐo
ニャ = nǐa
ニュ = nǐu
ニョ = nǐo
ヒャ = hǐa
ヒュ = hǐu
ヒョ = hǐo
ビャ = bǐa
ビュ = bǐu
ビョ = bǐo
ピャ = pǐa
ピュ = pǐu
ピョ = pǐo
ミャ = mǐa
ミュ = mǐu
ミョ = mǐo
リャ = rǐa
リュ = rǐu
リョ = rǐo
= a
= i
= u
= e
= o
= ʍ
= vu
= ǐa
= ǐu
= ǐo

View File

@ -0,0 +1,182 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
# 2022 The University of Electro-Communications (author: Teo Wen Shen) # noqa
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
from lhotse import CutSet, load_manifest
ARGPARSE_DESCRIPTION = """
This file displays duration statistics of utterances in a manifest.
You can use the displayed value to choose minimum/maximum duration
to remove short and long utterances during the training.
See the function `remove_short_and_long_utt()` in
pruned_transducer_stateless5/train.py for usage.
"""
def get_parser():
parser = argparse.ArgumentParser(
description=ARGPARSE_DESCRIPTION,
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--manifest-dir", type=Path, help="Path to cutset manifests"
)
return parser.parse_args()
def main():
args = get_parser()
for path in args.manifest_dir.glob("csj_cuts_*.jsonl.gz"):
cuts: CutSet = load_manifest(path)
print("\n---------------------------------\n")
print(path.name + ":")
cuts.describe()
if __name__ == "__main__":
main()
"""
## eval1
Cuts count: 1272
Total duration (hh:mm:ss): 01:50:07
Speech duration (hh:mm:ss): 01:50:07 (100.0%)
Duration statistics (seconds):
mean 5.2
std 3.9
min 0.2
25% 1.9
50% 4.0
75% 8.1
99% 14.3
99.5% 14.7
99.9% 16.0
max 16.9
Recordings available: 1272
Features available: 1272
Supervisions available: 1272
SUPERVISION custom fields:
- fluent (in 1272 cuts)
- disfluent (in 1272 cuts)
- number (in 1272 cuts)
- symbol (in 1272 cuts)
## eval2
Cuts count: 1292
Total duration (hh:mm:ss): 01:56:50
Speech duration (hh:mm:ss): 01:56:50 (100.0%)
Duration statistics (seconds):
mean 5.4
std 3.9
min 0.1
25% 2.1
50% 4.6
75% 8.6
99% 14.1
99.5% 15.2
99.9% 16.1
max 16.9
Recordings available: 1292
Features available: 1292
Supervisions available: 1292
SUPERVISION custom fields:
- fluent (in 1292 cuts)
- number (in 1292 cuts)
- symbol (in 1292 cuts)
- disfluent (in 1292 cuts)
## eval3
Cuts count: 1385
Total duration (hh:mm:ss): 01:19:21
Speech duration (hh:mm:ss): 01:19:21 (100.0%)
Duration statistics (seconds):
mean 3.4
std 3.0
min 0.2
25% 1.2
50% 2.5
75% 4.6
99% 12.7
99.5% 13.7
99.9% 15.0
max 15.9
Recordings available: 1385
Features available: 1385
Supervisions available: 1385
SUPERVISION custom fields:
- number (in 1385 cuts)
- symbol (in 1385 cuts)
- fluent (in 1385 cuts)
- disfluent (in 1385 cuts)
## valid
Cuts count: 4000
Total duration (hh:mm:ss): 05:08:09
Speech duration (hh:mm:ss): 05:08:09 (100.0%)
Duration statistics (seconds):
mean 4.6
std 3.8
min 0.1
25% 1.5
50% 3.4
75% 7.0
99% 13.8
99.5% 14.8
99.9% 16.0
max 17.3
Recordings available: 4000
Features available: 4000
Supervisions available: 4000
SUPERVISION custom fields:
- fluent (in 4000 cuts)
- symbol (in 4000 cuts)
- disfluent (in 4000 cuts)
- number (in 4000 cuts)
## train
Cuts count: 1291134
Total duration (hh:mm:ss): 1596:37:27
Speech duration (hh:mm:ss): 1596:37:27 (100.0%)
Duration statistics (seconds):
mean 4.5
std 3.6
min 0.0
25% 1.6
50% 3.3
75% 6.4
99% 14.0
99.5% 14.8
99.9% 16.6
max 27.8
Recordings available: 1291134
Features available: 1291134
Supervisions available: 1291134
SUPERVISION custom fields:
- disfluent (in 1291134 cuts)
- fluent (in 1291134 cuts)
- symbol (in 1291134 cuts)
- number (in 1291134 cuts)
"""

View File

@ -0,0 +1,155 @@
#!/usr/bin/env python3
# Copyright 2022 The University of Electro-Communications (Author: Teo Wen Shen) # noqa
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pathlib import Path
from lhotse import CutSet
ARGPARSE_DESCRIPTION = """
This script gathers all training transcripts of the specified {trans_mode} type
and produces a token list that becomes the output set of the ASR system.
It splits each transcript by whitespace into a list of words; every word that
does not appear in the list of user-defined multicharacter strings is further
split into individual characters, which are added to the output token set.
It outputs 4 files into the lang directory:
- trans_mode: the name of the transcript mode. If trans_mode was not specified,
this will be an empty file.
- userdef_string: a list of user-defined strings that should not be split
further into individual characters. By default, it contains "<unk>", "<blk>",
"<sos/eos>".
- words_len: the total number of tokens in the output set.
- words.txt: a list of tokens in the output set. The length matches words_len.
"""
def get_args():
parser = argparse.ArgumentParser(
description=ARGPARSE_DESCRIPTION,
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--train-cut", type=Path, required=True, help="Path to the train cut"
)
parser.add_argument(
"--trans-mode",
type=str,
default=None,
help=(
"Name of the transcript mode to use. "
"If lang-dir is not set, this will also name the lang-dir"
),
)
parser.add_argument(
"--lang-dir",
type=Path,
default=None,
help=(
"Name of lang dir. "
"If not set, this will default to lang_char_{trans-mode}"
),
)
parser.add_argument(
"--userdef-string",
type=Path,
default=None,
help="Multicharacter strings that do not need to be split",
)
return parser.parse_args()
def main():
args = get_args()
logging.basicConfig(
format=(
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] " "%(message)s"
),
level=logging.INFO,
)
if not args.lang_dir:
p = "lang_char"
if args.trans_mode:
p += f"_{args.trans_mode}"
args.lang_dir = Path(p)
if args.userdef_string:
args.userdef_string = set(args.userdef_string.read_text().split())
else:
args.userdef_string = set()
sysdef_string = ["<blk>", "<unk>", "<sos/eos>"]
args.userdef_string.update(sysdef_string)
train_set: CutSet = CutSet.from_file(args.train_cut)
words = set()
logging.info(
f"Creating vocabulary from {args.train_cut.name}"
f" at {args.trans_mode} mode."
)
for cut in train_set:
try:
text: str = (
cut.supervisions[0].custom[args.trans_mode]
if args.trans_mode
else cut.supervisions[0].text
)
except KeyError:
raise KeyError(
f"Could not find {args.trans_mode} in "
f"{cut.supervisions[0].custom}"
)
for t in text.split():
if t in args.userdef_string:
words.add(t)
else:
words.update(c for c in list(t))
words -= set(sysdef_string)
words = sorted(words)
words = ["<blk>"] + words + ["<unk>", "<sos/eos>"]
args.lang_dir.mkdir(parents=True, exist_ok=True)
(args.lang_dir / "words.txt").write_text(
"\n".join(f"{word}\t{i}" for i, word in enumerate(words))
)
(args.lang_dir / "words_len").write_text(f"{len(words)}")
(args.lang_dir / "userdef_string").write_text(
"\n".join(args.userdef_string)
)
(args.lang_dir / "trans_mode").write_text(args.trans_mode)
logging.info("Done.")
if __name__ == "__main__":
main()
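For illustration only, a downstream component could read the generated `words.txt` back into a token-to-id mapping as sketched below; the helper name is invented here, and the actual lexicon handling in the recipe may differ:

```python
from pathlib import Path

def read_token_table(lang_dir: Path) -> dict:
    # words.txt is written above as "<token>\t<id>", one token per line.
    token2id = {}
    for line in (lang_dir / "words.txt").read_text().splitlines():
        token, idx = line.split("\t")
        token2id[token] = int(idx)
    return token2id

token2id = read_token_table(Path("lang_char_disfluent"))
assert token2id["<blk>"] == 0  # the script always places <blk> first
```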

View File

@ -0,0 +1,98 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script checks the following assumptions of the generated manifest:
- Single supervision per cut
- Supervision time bounds are within cut time bounds
We will add more checks later if needed.
Usage example:
python3 ./local/validate_manifest.py \
--manifest ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
"""
import argparse
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
from lhotse.cut import Cut
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--manifest",
type=Path,
help="Path to the manifest file",
)
return parser.parse_args()
def validate_one_supervision_per_cut(c: Cut):
if len(c.supervisions) != 1:
raise ValueError(f"{c.id} has {len(c.supervisions)} supervisions")
def validate_supervision_and_cut_time_bounds(c: Cut):
s = c.supervisions[0]
# Removed because, when cuts are trimmed from supervisions,
# the start time of a supervision can be less than the cut start time.
# https://github.com/lhotse-speech/lhotse/issues/813
# if s.start < c.start:
# raise ValueError(
# f"{c.id}: Supervision start time {s.start} is less "
# f"than cut start time {c.start}"
# )
if s.end > c.end:
raise ValueError(
f"{c.id}: Supervision end time {s.end} is larger "
f"than cut end time {c.end}"
)
def main():
args = get_args()
manifest = Path(args.manifest)
logging.info(f"Validating {manifest}")
assert manifest.is_file(), f"{manifest} does not exist"
cut_set = load_manifest(manifest)
assert isinstance(cut_set, CutSet)
for c in cut_set:
validate_one_supervision_per_cut(c)
validate_supervision_and_cut_time_bounds(c)
if __name__ == "__main__":
formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO)
main()

130
egs/csj/ASR/prepare.sh Executable file
View File

@ -0,0 +1,130 @@
#!/usr/bin/env bash
# We assume the following directories are downloaded.
#
# - $csj_dir
# CSJ is assumed to be the USB-type directory, which should contain the following subdirectories:
# - DATA (not used in this script)
# - DOC (not used in this script)
# - MODEL (not used in this script)
# - MORPH
# - LDB (not used in this script)
# - SUWDIC (not used in this script)
# - SDB
# - core
# - ...
# - noncore
# - ...
# - PLABEL (not used in this script)
# - SUMMARY (not used in this script)
# - TOOL (not used in this script)
# - WAV
# - core
# - ...
# - noncore
# - ...
# - XML (not used in this script)
#
# - $musan_dir
# This directory contains the following directories downloaded from
# http://www.openslr.org/17/
# - music
# - noise
# - speech
#
# By default, this script produces the original transcript, as Kaldi and ESPnet do. Optionally, you
# can generate other transcript formats by supplying your own config files. A few examples of these
# config files can be found in local/conf.
set -eou pipefail
nj=8
stage=-1
stop_stage=100
csj_dir=/mnt/minami_data_server/t2131178/corpus/CSJ
musan_dir=/mnt/minami_data_server/t2131178/corpus/musan/musan
trans_dir=$csj_dir/retranscript
csj_fbank_dir=/mnt/host/csj_data/fbank
musan_fbank_dir=$musan_dir/fbank
csj_manifest_dir=data/manifests
musan_manifest_dir=$musan_dir/manifests
. shared/parse_options.sh || exit 1
mkdir -p data
log() {
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare CSJ manifest"
# If you want to generate more transcript modes, append the paths of those config files with the -c option.
# Example: lhotse prepare csj $csj_dir $trans_dir $csj_manifest_dir -c local/conf/disfluent.ini
# NOTE: In case multiple config files are supplied, the second config file and onwards will inherit
# the segment boundaries of the first config file.
if [ ! -e $csj_manifest_dir/.csj.done ]; then
lhotse prepare csj $csj_dir $trans_dir $csj_manifest_dir -j 4
touch $csj_manifest_dir/.csj.done
fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Prepare musan manifest"
mkdir -p $musan_manifest_dir
if [ ! -e $musan_manifest_dir/.musan.done ]; then
lhotse prepare musan $musan_dir $musan_manifest_dir
touch $musan_manifest_dir/.musan.done
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Compute CSJ fbank"
if [ ! -e $csj_fbank_dir/.csj-validated.done ]; then
python local/compute_fbank_csj.py --manifest-dir $csj_manifest_dir \
--fbank-dir $csj_fbank_dir
parts=(
train
valid
eval1
eval2
eval3
)
for part in ${parts[@]}; do
python local/validate_manifest.py --manifest $csj_manifest_dir/csj_cuts_$part.jsonl.gz
done
touch $csj_fbank_dir/.csj-validated.done
fi
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Prepare CSJ lang"
modes=disfluent
# If you want to prepare the lang directory for other transcript modes, just append
# the names of those modes. An example is shown below:
# modes="$modes fluent symbol number"
for mode in ${modes[@]}; do
python local/prepare_lang_char.py --trans-mode $mode \
--train-cut $csj_manifest_dir/csj_cuts_train.jsonl.gz \
--lang-dir lang_char_$mode
done
fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Compute fbank for musan"
mkdir -p $musan_fbank_dir
if [ ! -e $musan_fbank_dir/.musan.done ]; then
python local/compute_fbank_musan.py --manifest-dir $musan_manifest_dir --fbank-dir $musan_fbank_dir
touch $musan_fbank_dir/.musan.done
fi
fi
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Show manifest statistics"
python local/display_manifest_statistics.py --manifest-dir $csj_manifest_dir > $csj_manifest_dir/manifest_statistics.txt
cat $csj_manifest_dir/manifest_statistics.txt
fi
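As an illustrative sanity check (not part of the recipe), the cuts validated in stage 3 can be inspected directly from Python; the path assumes the default `csj_manifest_dir=data/manifests` used above:

```python
from lhotse import CutSet

cuts = CutSet.from_file("data/manifests/csj_cuts_eval1.jsonl.gz")
first = next(iter(cuts))
print(first.duration)
print(first.supervisions[0].custom.keys())  # e.g. fluent, disfluent, number, symbol
```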

1
egs/csj/ASR/shared Symbolic link
View File

@ -0,0 +1 @@
../../../icefall/shared/

View File

@ -253,7 +253,9 @@ class ConformerEncoderLayer(nn.Module):
residual = src
if self.normalize_before:
src = self.norm_conv(src)
src = residual + self.dropout(self.conv_module(src))
src = residual + self.dropout(
self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
)
if not self.normalize_before:
src = self.norm_conv(src)
@ -890,11 +892,16 @@ class ConvolutionModule(nn.Module):
)
self.activation = Swish()
def forward(self, x: Tensor) -> Tensor:
def forward(
self,
x: Tensor,
src_key_padding_mask: Optional[Tensor] = None,
) -> Tensor:
"""Compute convolution module.
Args:
x: Input tensor (#time, batch, channels).
src_key_padding_mask: the mask for the src keys per batch (optional).
Returns:
Tensor: Output tensor (#time, batch, channels).
@ -908,6 +915,8 @@ class ConvolutionModule(nn.Module):
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
# 1D Depthwise Conv
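# Zero out the padded positions before the depthwise convolution so that
# padding cannot leak into neighbouring frames through the convolution kernel.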
if src_key_padding_mask is not None:
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
x = self.depthwise_conv(x)
if self.use_batchnorm:
x = self.norm(x)

View File

@ -173,13 +173,13 @@ def get_params() -> AttributeDict:
def post_processing(
results: List[Tuple[List[str], List[str]]],
) -> List[Tuple[List[str], List[str]]]:
results: List[Tuple[str, List[str], List[str]]],
) -> List[Tuple[str, List[str], List[str]]]:
new_results = []
for ref, hyp in results:
for key, ref, hyp in results:
new_ref = asr_text_post_processing(" ".join(ref)).split()
new_hyp = asr_text_post_processing(" ".join(hyp)).split()
new_results.append((new_ref, new_hyp))
new_results.append((key, new_ref, new_hyp))
return new_results
@ -408,7 +408,7 @@ def decode_dataset(
sos_id: int,
eos_id: int,
G: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -502,7 +502,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[str], List[str]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
if params.method == "attention-decoder":
# Set it to False since there are too many logs.

View File

@ -203,13 +203,13 @@ def get_parser():
def post_processing(
results: List[Tuple[List[str], List[str]]],
) -> List[Tuple[List[str], List[str]]]:
results: List[Tuple[str, List[str], List[str]]],
) -> List[Tuple[str, List[str], List[str]]]:
new_results = []
for ref, hyp in results:
for key, ref, hyp in results:
new_ref = asr_text_post_processing(" ".join(ref)).split()
new_hyp = asr_text_post_processing(" ".join(hyp)).split()
new_results.append((new_ref, new_hyp))
new_results.append((key, new_ref, new_hyp))
return new_results
@ -340,7 +340,7 @@ def decode_dataset(
model: nn.Module,
sp: spm.SentencePieceProcessor,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -407,7 +407,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -1,12 +1,100 @@
## Results
#### LibriSpeech BPE training results (Pruned Stateless LSTM RNN-T + multi-dataset)
### LibriSpeech BPE training results (Pruned Stateless LSTM RNN-T + gradient filter)
[lstm_transducer_stateless2](./lstm_transducer_stateless2)
#### [lstm_transducer_stateless3](./lstm_transducer_stateless3)
It implements an LSTM-based model with mechanisms from the reworked model for streaming ASR.
A gradient filter is applied inside each LSTM module to stabilize the training.
See <https://github.com/k2-fsa/icefall/pull/564> for more details.
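For intuition only, the sketch below shows the general idea of norm-based gradient filtering: a gradient whose norm is far larger than a running average is suppressed. It is not the implementation used in `lstm_transducer_stateless3` (see the PR above for the actual `--grad-norm-threshold` mechanism); the function name and the averaging scheme are invented for illustration.

```python
import torch

def filter_grad_by_norm(
    grad: torch.Tensor, avg_norm: float, threshold: float = 25.0
) -> torch.Tensor:
    """Zero out a gradient whose norm exceeds `threshold` times a running
    average of past gradient norms; otherwise pass it through unchanged."""
    if grad.norm() > threshold * avg_norm:
        return torch.zeros_like(grad)
    return grad
```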
##### training on full librispeech
This model contains 12 encoder layers (LSTM module + Feedforward module). The number of model parameters is 84689496.
The WERs are:
| | test-clean | test-other | comment | decoding mode |
|-------------------------------------|------------|------------|----------------------|----------------------|
| greedy search (max sym per frame 1) | 3.66 | 9.51 | --epoch 40 --avg 15 | simulated streaming |
| greedy search (max sym per frame 1) | 3.66 | 9.48 | --epoch 40 --avg 15 | streaming |
| fast beam search | 3.55 | 9.33 | --epoch 40 --avg 15 | simulated streaming |
| fast beam search | 3.57 | 9.25 | --epoch 40 --avg 15 | streaming |
| modified beam search | 3.55 | 9.28 | --epoch 40 --avg 15 | simulated streaming |
| modified beam search | 3.54 | 9.25 | --epoch 40 --avg 15 | streaming |
Note: `simulated streaming` indicates feeding the full utterance during decoding, while `streaming` indicates feeding a certain number of frames at a time.
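For intuition only, streaming decoding amounts to feeding the encoder a chunk of frames at a time and carrying its states across chunks. The loop below is a generic sketch, not the recipe's `streaming_decode.py`; `encoder` is assumed to take `(chunk, states)` and return `(output, new_states)`.

```python
import torch

def run_streaming(encoder, feats: torch.Tensor, chunk_size: int = 32):
    # feats: (T, feature_dim); states start as None and are carried across chunks.
    states = None
    outputs = []
    for start in range(0, feats.size(0), chunk_size):
        out, states = encoder(feats[start : start + chunk_size], states)
        outputs.append(out)
    return torch.cat(outputs, dim=0)
```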
The training command is:
```bash
./lstm_transducer_stateless3/train.py \
--world-size 4 \
--num-epochs 40 \
--start-epoch 1 \
--exp-dir lstm_transducer_stateless3/exp \
--full-libri 1 \
--max-duration 500 \
--master-port 12325 \
--num-encoder-layers 12 \
--grad-norm-threshold 25.0 \
--rnn-hidden-size 1024
```
The tensorboard log can be found at
<https://tensorboard.dev/experiment/caNPyr5lT8qAl9qKsXEeEQ/>
The simulated streaming decoding command using greedy search, fast beam search, and modified beam search is:
```bash
for decoding_method in greedy_search fast_beam_search modified_beam_search; do
./lstm_transducer_stateless3/decode.py \
--epoch 40 \
--avg 15 \
--exp-dir lstm_transducer_stateless3/exp \
--max-duration 600 \
--num-encoder-layers 12 \
--rnn-hidden-size 1024 \
--decoding-method $decoding_method \
--use-averaged-model True \
--beam 4 \
--max-contexts 4 \
--max-states 8 \
--beam-size 4
done
```
The streaming decoding command using greedy search, fast beam search, and modified beam search is:
```bash
for decoding_method in greedy_search fast_beam_search modified_beam_search; do
./lstm_transducer_stateless3/streaming_decode.py \
--epoch 40 \
--avg 15 \
--exp-dir lstm_transducer_stateless3/exp \
--max-duration 600 \
--num-encoder-layers 12 \
--rnn-hidden-size 1024 \
--decoding-method $decoding_method \
--use-averaged-model True \
--beam 4 \
--max-contexts 4 \
--max-states 8 \
--beam-size 4
done
```
Pretrained models, training logs, decoding logs, and decoding results
are available at
<https://huggingface.co/Zengwei/icefall-asr-librispeech-lstm-transducer-stateless3-2022-09-28>
### LibriSpeech BPE training results (Pruned Stateless LSTM RNN-T + multi-dataset)
#### [lstm_transducer_stateless2](./lstm_transducer_stateless2)
See <https://github.com/k2-fsa/icefall/pull/558> for more details.
The WERs are:
| | test-clean | test-other | comment |
@ -18,6 +106,7 @@ The WERs are:
| modified_beam_search | 2.75 | 7.08 | --iter 472000 --avg 18 |
| fast_beam_search | 2.77 | 7.29 | --iter 472000 --avg 18 |
The training command is:
```bash
@ -70,15 +159,16 @@ Pretrained models, training logs, decoding logs, and decoding results
are available at
<https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03>
#### LibriSpeech BPE training results (Pruned Stateless LSTM RNN-T)
[lstm_transducer_stateless](./lstm_transducer_stateless)
### LibriSpeech BPE training results (Pruned Stateless LSTM RNN-T)
#### [lstm_transducer_stateless](./lstm_transducer_stateless)
It implements LSTM model with mechanisms in reworked model for streaming ASR.
See <https://github.com/k2-fsa/icefall/pull/479> for more details.
#### training on full librispeech
##### training on full librispeech
This model contains 12 encoder layers (LSTM module + Feedforward module). The number of model parameters is 84689496.
@ -165,7 +255,7 @@ It is modified from [torchaudio](https://github.com/pytorch/audio).
See <https://github.com/k2-fsa/icefall/pull/440> for more details.
#### With lower latency setup, training on full librispeech
##### With lower latency setup, training on full librispeech
In this model, the lengths of chunk and right context are 32 frames (i.e., 0.32s) and 8 frames (i.e., 0.08s), respectively.
@ -316,7 +406,7 @@ Pretrained models, training logs, decoding logs, and decoding results
are available at
<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>
#### With higher latency setup, training on full librispeech
##### With higher latency setup, training on full librispeech
In this model, the lengths of chunk and right context are 64 frames (i.e., 0.64s) and 16 frames (i.e., 0.16s), respectively.
@ -851,14 +941,14 @@ Pre-trained models, training and decoding logs, and decoding results are availab
### LibriSpeech BPE training results (Pruned Stateless Conv-Emformer RNN-T)
[conv_emformer_transducer_stateless](./conv_emformer_transducer_stateless)
#### [conv_emformer_transducer_stateless](./conv_emformer_transducer_stateless)
It implements [Emformer](https://arxiv.org/abs/2010.10759) augmented with convolution module for streaming ASR.
It is modified from [torchaudio](https://github.com/pytorch/audio).
See <https://github.com/k2-fsa/icefall/pull/389> for more details.
#### Training on full librispeech
##### Training on full librispeech
In this model, the lengths of chunk and right context are 32 frames (i.e., 0.32s) and 8 frames (i.e., 0.08s), respectively.
@ -1011,7 +1101,7 @@ are available at
### LibriSpeech BPE training results (Pruned Stateless Emformer RNN-T)
[pruned_stateless_emformer_rnnt2](./pruned_stateless_emformer_rnnt2)
#### [pruned_stateless_emformer_rnnt2](./pruned_stateless_emformer_rnnt2)
Use <https://github.com/k2-fsa/icefall/pull/390>.
@ -1079,7 +1169,7 @@ results at:
### LibriSpeech BPE training results (Pruned Stateless Transducer 5)
[pruned_transducer_stateless5](./pruned_transducer_stateless5)
#### [pruned_transducer_stateless5](./pruned_transducer_stateless5)
Same as `Pruned Stateless Transducer 2` but with more layers.
@ -1092,7 +1182,7 @@ The notations `large` and `medium` below are from the [Conformer](https://arxiv.
paper, where the large model has about 118 M parameters and the medium model
has 30.8 M parameters.
#### Large
##### Large
Number of model parameters 118129516 (i.e, 118.13 M).
@ -1152,7 +1242,7 @@ results at:
<https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless5-2022-07-07>
#### Medium
##### Medium
Number of model parameters 30896748 (i.e, 30.9 M).
@ -1212,7 +1302,7 @@ results at:
<https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless5-M-2022-07-07>
#### Baseline-2
##### Baseline-2
It has 88.98 M parameters. Compared to the model in pruned_transducer_stateless2, it has more
layers (24 vs. 12) but is a narrower model (1536 feedforward dim and 384 encoder dim vs. 2048 feedforward dim and 512 encoder dim).
@ -1273,13 +1363,13 @@ results at:
### LibriSpeech BPE training results (Pruned Stateless Transducer 4)
[pruned_transducer_stateless4](./pruned_transducer_stateless4)
#### [pruned_transducer_stateless4](./pruned_transducer_stateless4)
This version saves averaged model during training, and decodes with averaged model.
See <https://github.com/k2-fsa/icefall/issues/337> for details about the idea of model averaging.
#### Training on full librispeech
##### Training on full librispeech
See <https://github.com/k2-fsa/icefall/pull/344>
@ -1355,7 +1445,7 @@ Pretrained models, training logs, decoding logs, and decoding results
are available at
<https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless4-2022-06-03>
#### Training on train-clean-100
##### Training on train-clean-100
See <https://github.com/k2-fsa/icefall/pull/344>
@ -1392,7 +1482,7 @@ The tensorboard log can be found at
### LibriSpeech BPE training results (Pruned Stateless Transducer 3, 2022-04-29)
[pruned_transducer_stateless3](./pruned_transducer_stateless3)
#### [pruned_transducer_stateless3](./pruned_transducer_stateless3)
Same as `Pruned Stateless Transducer 2` but using the XL subset from
[GigaSpeech](https://github.com/SpeechColab/GigaSpeech) as extra training data.
@ -1606,10 +1696,10 @@ can be found at
### LibriSpeech BPE training results (Pruned Transducer 2)
[pruned_transducer_stateless2](./pruned_transducer_stateless2)
#### [pruned_transducer_stateless2](./pruned_transducer_stateless2)
This is with a reworked version of the conformer encoder, with many changes.
#### Training on fulll librispeech
##### Training on full librispeech
Using commit `34aad74a2c849542dd5f6359c9e6b527e8782fd6`.
See <https://github.com/k2-fsa/icefall/pull/288>
@ -1658,7 +1748,7 @@ can be found at
<https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29>
#### Training on train-clean-100:
##### Training on train-clean-100:
Trained with 1 job:
```

View File

@ -253,7 +253,9 @@ class ConformerEncoderLayer(nn.Module):
residual = src
if self.normalize_before:
src = self.norm_conv(src)
src = residual + self.dropout(self.conv_module(src))
src = residual + self.dropout(
self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
)
if not self.normalize_before:
src = self.norm_conv(src)
@ -890,11 +892,16 @@ class ConvolutionModule(nn.Module):
)
self.activation = Swish()
def forward(self, x: Tensor) -> Tensor:
def forward(
self,
x: Tensor,
src_key_padding_mask: Optional[Tensor] = None,
) -> Tensor:
"""Compute convolution module.
Args:
x: Input tensor (#time, batch, channels).
src_key_padding_mask: the mask for the src keys per batch (optional).
Returns:
Tensor: Output tensor (#time, batch, channels).
@ -908,6 +915,8 @@ class ConvolutionModule(nn.Module):
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
# 1D Depthwise Conv
if src_key_padding_mask is not None:
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
x = self.depthwise_conv(x)
if self.use_batchnorm:
x = self.norm(x)

View File

@ -480,7 +480,7 @@ def decode_dataset(
sos_id: int,
eos_id: int,
G: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -577,7 +577,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[int], List[int]]]],
):
if params.method in ("attention-decoder", "rnn-lm"):
# Set it to False since there are too many logs.

View File

@ -268,7 +268,9 @@ class ConformerEncoderLayer(nn.Module):
src = src + self.dropout(src_att)
# convolution module
src = src + self.dropout(self.conv_module(src))
src = src + self.dropout(
self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
)
# feed forward module
src = src + self.dropout(self.feed_forward(src))
@ -921,11 +923,16 @@ class ConvolutionModule(nn.Module):
initial_scale=0.25,
)
def forward(self, x: Tensor) -> Tensor:
def forward(
self,
x: Tensor,
src_key_padding_mask: Optional[Tensor] = None,
) -> Tensor:
"""Compute convolution module.
Args:
x: Input tensor (#time, batch, channels).
src_key_padding_mask: the mask for the src keys per batch (optional).
Returns:
Tensor: Output tensor (#time, batch, channels).
@ -941,6 +948,8 @@ class ConvolutionModule(nn.Module):
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
# 1D Depthwise Conv
if src_key_padding_mask is not None:
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
x = self.depthwise_conv(x)
x = self.deriv_balancer2(x)

View File

@ -587,7 +587,7 @@ def decode_dataset(
sos_id: int,
eos_id: int,
G: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -684,7 +684,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
if params.method in ("attention-decoder", "rnn-lm"):
# Set it to False since there are too many logs.

View File

@ -247,7 +247,9 @@ class ConformerEncoderLayer(nn.Module):
residual = src
if self.normalize_before:
src = self.norm_conv(src)
src = residual + self.dropout(self.conv_module(src))
src = residual + self.dropout(
self.conv_module(src, src_key_padding_mask=src_key_padding_mask)
)
if not self.normalize_before:
src = self.norm_conv(src)
@ -878,11 +880,16 @@ class ConvolutionModule(nn.Module):
)
self.activation = Swish()
def forward(self, x: Tensor) -> Tensor:
def forward(
self,
x: Tensor,
src_key_padding_mask: Optional[Tensor] = None,
) -> Tensor:
"""Compute convolution module.
Args:
x: Input tensor (#time, batch, channels).
src_key_padding_mask: the mask for the src keys per batch (optional).
Returns:
Tensor: Output tensor (#time, batch, channels).
@ -896,6 +903,8 @@ class ConvolutionModule(nn.Module):
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
# 1D Depthwise Conv
if src_key_padding_mask is not None:
x.masked_fill_(src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0)
x = self.depthwise_conv(x)
x = self.activation(self.norm(x))

View File

@ -404,7 +404,7 @@ def decode_dataset(
sos_id: int,
eos_id: int,
G: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -487,7 +487,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
if params.method == "attention-decoder":
# Set it to False since there are too many logs.

View File

@ -366,7 +366,7 @@ def decode_dataset(
model: nn.Module,
sp: spm.SentencePieceProcessor,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -436,7 +436,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -366,7 +366,7 @@ def decode_dataset(
model: nn.Module,
sp: spm.SentencePieceProcessor,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -436,7 +436,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -496,7 +496,7 @@ def decode_dataset(
sp: spm.SentencePieceProcessor,
word_table: Optional[k2.SymbolTable] = None,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
@ -570,7 +570,7 @@ def decode_dataset(
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():

View File

@ -116,6 +116,8 @@ class RNN(EncoderInterface):
Period of auxiliary layers used for random combiner during training.
If set to 0, will not use the random combiner (Default).
You can set a positive integer to use the random combiner, e.g., 3.
is_pnnx:
True to make this class exportable via PNNX.
"""
def __init__(
@ -129,6 +131,7 @@ class RNN(EncoderInterface):
dropout: float = 0.1,
layer_dropout: float = 0.075,
aux_layer_period: int = 0,
is_pnnx: bool = False,
) -> None:
super(RNN, self).__init__()
@ -142,7 +145,13 @@ class RNN(EncoderInterface):
# That is, it does two things simultaneously:
# (1) subsampling: T -> T//subsampling_factor
# (2) embedding: num_features -> d_model
self.encoder_embed = Conv2dSubsampling(num_features, d_model)
self.encoder_embed = Conv2dSubsampling(
num_features,
d_model,
is_pnnx=is_pnnx,
)
self.is_pnnx = is_pnnx
self.num_encoder_layers = num_encoder_layers
self.d_model = d_model
@ -209,7 +218,13 @@ class RNN(EncoderInterface):
# lengths = ((x_lens - 3) // 2 - 1) // 2 # issue an warning
#
# Note: rounding_mode in torch.div() is available only in torch >= 1.8.0
lengths = (((x_lens - 3) >> 1) - 1) >> 1
if not self.is_pnnx:
lengths = (((x_lens - 3) >> 1) - 1) >> 1
else:
lengths1 = torch.floor((x_lens - 3) / 2)
lengths = torch.floor((lengths1 - 1) / 2)
lengths = lengths.to(x_lens)
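# Both branches compute the same subsampled lengths; e.g. for x_lens = 100,
# (((100 - 3) >> 1) - 1) >> 1 = 23 and floor((floor(97 / 2) - 1) / 2) = 23.
# The floor-division form is used here presumably because it is easier for
# PNNX/ncnn to trace than the bit-shift operators.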
if not torch.jit.is_tracing():
assert x.size(0) == lengths.max().item()
@ -359,7 +374,7 @@ class RNNEncoderLayer(nn.Module):
# for cell state
assert states[1].shape == (1, src.size(1), self.rnn_hidden_size)
src_lstm, new_states = self.lstm(src, states)
src = src + self.dropout(src_lstm)
src = self.dropout(src_lstm) + src
# feed forward module
src = src + self.dropout(self.feed_forward(src))
@ -505,6 +520,7 @@ class Conv2dSubsampling(nn.Module):
layer1_channels: int = 8,
layer2_channels: int = 32,
layer3_channels: int = 128,
is_pnnx: bool = False,
) -> None:
"""
Args:
@ -517,6 +533,9 @@ class Conv2dSubsampling(nn.Module):
Number of channels in layer1
layer2_channels:
Number of channels in layer2
is_pnnx:
True if we are converting the model to PNNX format.
False otherwise.
"""
assert in_channels >= 9
super().__init__()
@ -559,6 +578,10 @@ class Conv2dSubsampling(nn.Module):
channel_dim=-1, min_positive=0.45, max_positive=0.55
)
# ncnn supports only batch size == 1
self.is_pnnx = is_pnnx
self.conv_out_dim = self.out.weight.shape[1]
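# conv_out_dim is the in_features of self.out, i.e. channels * frequency bins
# after the convolution stack; it is used below to reshape with a fixed
# batch size of 1 for ncnn.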
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""Subsample x.
@ -572,9 +595,15 @@ class Conv2dSubsampling(nn.Module):
# On entry, x is (N, T, idim)
x = x.unsqueeze(1) # (N, T, idim) -> (N, 1, T, idim) i.e., (N, C, H, W)
x = self.conv(x)
# Now x is of shape (N, odim, ((T-3)//2-1)//2, ((idim-3)//2-1)//2)
b, c, t, f = x.size()
x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
if torch.jit.is_tracing() and self.is_pnnx:
x = x.permute(0, 2, 1, 3).reshape(1, -1, self.conv_out_dim)
x = self.out(x)
else:
# Now x is of shape (N, odim, ((T-3)//2-1)//2, ((idim-3)//2-1)//2)
b, c, t, f = x.size()
x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
# Now x is of shape (N, ((T-3)//2-1))//2, odim)
x = self.out_norm(x)
x = self.out_balancer(x)

Some files were not shown because too many files have changed in this diff.