Merge branch 'k2-fsa:master' into master

2025-09-19 05:54:20 +00:00 · 2022-07-28 22:13:15 +08:00 · 2022-07-28 22:13:15 +08:00 · 0cffe78485
commit 0cffe78485
parent b304db1661 389f9c77e5
757 changed files with 120927 additions and 3893 deletions
--- a/.flake8
+++ b/.flake8
@ -4,10 +4,20 @@ statistics=true
 max-line-length = 80
 per-file-ignores =
    # line too long
-    egs/librispeech/ASR/*/conformer.py: E501,
+    icefall/diagnostics.py: E501,
-    egs/aishell/ASR/*/conformer.py: E501,
+    egs/*/ASR/*/conformer.py: E501,
    egs/*/ASR/pruned_transducer_stateless*/*.py: E501,
    egs/*/ASR/*/optim.py: E501,
    egs/*/ASR/*/scaling.py: E501,
    egs/librispeech/ASR/conv_emformer_transducer_stateless*/*.py: E501, E203,
    egs/librispeech/ASR/conformer_ctc2/*py: E501,
    egs/librispeech/ASR/RESULTS.md: E999,
    # invalid escape sequence (caused by TeX formulas), W605
    icefall/utils.py: E501, W605
 exclude =
  .git,
  **/data/**,
-  icefall/shared/make_kn_lm.py
+  icefall/shared/make_kn_lm.py,
  icefall/__init__.py
--- a/.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+++ b/.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
@ -0,0 +1,17 @@
 #!/usr/bin/env bash
 # This script computes fbank features for the test-clean and test-other datasets.
 # The computed features are saved to ~/tmp/fbank-libri and are
 # cached for later runs
 #
 # Relative paths assume the current directory contains egs/librispeech/ASR.
 #
 # Exit on the first failing command so that a failed step is reported
 # instead of being hidden by the commands that follow it.
 set -e
 export PYTHONPATH=$PWD:$PYTHONPATH
 echo "$PYTHONPATH"
 # -p: also create ~/tmp when it is missing and do not fail when the
 # (cached) directory already exists.
 mkdir -p ~/tmp/fbank-libri
 cd egs/librispeech/ASR
 mkdir -p data
 cd data
 # data/fbank points at the cache directory; keep an existing entry as is.
 if [ ! -e fbank ]; then
   ln -s ~/tmp/fbank-libri fbank
 fi
 cd ..
 ./local/compute_fbank_librispeech.py
 ls -lh data/fbank/
--- a/.github/scripts/download-gigaspeech-dev-test-dataset.sh
+++ b/.github/scripts/download-gigaspeech-dev-test-dataset.sh
@ -0,0 +1,15 @@
 #!/usr/bin/env bash
 # This script downloads the pre-computed fbank features for
 # dev and test datasets of GigaSpeech.
 #
 # You will find the directory `~/tmp/giga-dev-dataset-fbank` after running
 # this script.
 #
 # Exit on the first failing command; otherwise a failed `cd` would make
 # git clone into whatever directory the script was started from.
 set -e
 mkdir -p ~/tmp
 cd ~/tmp
 git lfs install
 git clone https://huggingface.co/csukuangfj/giga-dev-dataset-fbank
 ls -lh giga-dev-dataset-fbank/data/fbank
--- a/.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+++ b/.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
@ -0,0 +1,23 @@
 #!/usr/bin/env bash
 # This script downloads the test-clean and test-other datasets
 # of LibriSpeech and extracts them into the folder ~/tmp/download,
 # which is cached by GitHub actions for later runs.
 #
 # You will find the directory ~/tmp/download/LibriSpeech after running
 # this script.
 #
 # Relative paths assume the current directory contains egs/librispeech/ASR.
 #
 # Exit on the first failing command so that a failed download or
 # extraction is not silently ignored.
 set -e
 # -p: also create ~/tmp when it is missing and tolerate an existing directory.
 mkdir -p ~/tmp/download
 cd egs/librispeech/ASR
 # Only create the symlink when there is no `download` entry yet.
 if [ ! -e download ]; then
   ln -s ~/tmp/download .
 fi
 cd download
 # NOTE(review): --no-check-certificate disables TLS certificate verification.
 # It is kept to preserve the existing download behavior; consider removing it.
 for part in test-clean test-other; do
   wget -q --no-check-certificate "https://www.openslr.org/resources/12/${part}.tar.gz"
   tar xf "${part}.tar.gz"
   rm "${part}.tar.gz"
 done
 pwd
 ls -lh
 ls -lh LibriSpeech
--- a/.github/scripts/install-kaldifeat.sh
+++ b/.github/scripts/install-kaldifeat.sh
@ -0,0 +1,13 @@
 #!/usr/bin/env bash
 # This script installs kaldifeat into the directory ~/tmp/kaldifeat
 # which is cached by GitHub actions for later runs.
 #
 # Exit on the first failing command so that a failed clone or build
 # is reported instead of being ignored.
 set -e
 mkdir -p ~/tmp
 cd ~/tmp
 git clone https://github.com/csukuangfj/kaldifeat
 cd kaldifeat
 # -p: do not fail if the build directory already exists.
 mkdir -p build
 cd build
 cmake -DCMAKE_BUILD_TYPE=Release ..
 # Only the _kaldifeat target is built, with two parallel jobs.
 make -j2 _kaldifeat
--- a/.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+++ b/.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
@ -0,0 +1,11 @@
 #!/usr/bin/env bash
 # This script assumes that test-clean and test-other are downloaded
 # to egs/librispeech/ASR/download/LibriSpeech and generates manifest
 # files in egs/librispeech/ASR/data/manifests
 #
 # Exit on the first failing command so that a failed `cd` or a failed
 # `lhotse prepare` run is reported instead of being ignored.
 set -e
 cd egs/librispeech/ASR
 # Link the download directory from ~/tmp unless an entry already exists.
 if [ ! -e download ]; then
   ln -s ~/tmp/download .
 fi
 mkdir -p data/manifests
 lhotse prepare librispeech -j 2 -p test-clean -p test-other ./download/LibriSpeech data/manifests
 ls -lh data/manifests
--- a/.github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
+++ b/.github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
@ -0,0 +1,86 @@
 #!/usr/bin/env bash
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # location (file name, line number, calling function).
  # This function is from espnet.
  local caller_path=${BASH_SOURCE[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Downloads the aishell test/dev manifests and a pre-trained model, runs
 # pretrained.py on three bundled test waves with every search method and,
 # when GITHUB_EVENT_NAME is "schedule" or GITHUB_EVENT_LABEL_NAME is
 # "run-decode", additionally runs decode.py.
 #
 # Exit on the first failing command: without this a failed clone or a
 # failed pretrained.py/decode.py run is ignored and the script can still
 # exit with status 0, hiding the failure from CI.
 set -e
 cd egs/aishell/ASR
 git lfs install
 fbank_url=https://huggingface.co/csukuangfj/aishell-test-dev-manifests
 log "Downloading pre-computed fbank from $fbank_url"
 git clone "$fbank_url"
 ln -s "$PWD/aishell-test-dev-manifests/data" .
 # repo_url has to be assigned before it is logged.
 repo_url=https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
 log "Downloading pre-trained model from $repo_url"
 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 # Give the checkpoint the generic name used by the commands below.
 pushd "$repo/exp"
 ln -s pretrained-epoch-29-avg-5-torch-1.10.0.pt pretrained.pt
 popd
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./pruned_transducer_stateless3/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --lang-dir "$repo/data/lang_char" \
    "$repo/test_wavs/BAC009S0764W0121.wav" \
    "$repo/test_wavs/BAC009S0764W0122.wav" \
    "$repo/test_wavs/BAC009S0764W0123.wav"
 done
 for method in modified_beam_search beam_search fast_beam_search; do
  log "$method"
  ./pruned_transducer_stateless3/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --lang-dir "$repo/data/lang_char" \
    "$repo/test_wavs/BAC009S0764W0121.wav" \
    "$repo/test_wavs/BAC009S0764W0122.wav" \
    "$repo/test_wavs/BAC009S0764W0123.wav"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p pruned_transducer_stateless3/exp
  # Expose the checkpoint as epoch-999.pt, matching `--epoch 999 --avg 1` below.
  ln -s "$PWD/$repo/exp/pretrained.pt" pruned_transducer_stateless3/exp/epoch-999.pt
  ln -s "$PWD/$repo/data/lang_char" data/
  ls -lh data
  ls -lh pruned_transducer_stateless3/exp
  log "Decoding test and dev"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./pruned_transducer_stateless3/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir pruned_transducer_stateless3/exp
  done
  rm pruned_transducer_stateless3/exp/*.pt
 fi
--- a/.github/scripts/run-gigaspeech-pruned-transducer-stateless2-2022-05-12.sh
+++ b/.github/scripts/run-gigaspeech-pruned-transducer-stateless2-2022-05-12.sh
@ -0,0 +1,49 @@
 #!/usr/bin/env bash
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # location (file name, line number, calling function).
  # This function is from espnet.
  local caller_path=${BASH_SOURCE[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Downloads a pre-trained GigaSpeech model and, when GITHUB_EVENT_NAME is
 # "schedule" or GITHUB_EVENT_LABEL_NAME is "run-decode", runs decode.py
 # with it.
 #
 # Exit on the first failing command: without this a failed clone or a
 # failed decode.py run is ignored and the script can still exit with
 # status 0, hiding the failure from CI.
 set -e
 cd egs/gigaspeech/ASR
 repo_url=https://huggingface.co/wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 repo=$(basename "$repo_url")
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p pruned_transducer_stateless2/exp
  # Expose the checkpoint as epoch-999.pt, matching `--epoch 999 --avg 1` below.
  ln -s "$PWD/$repo/exp/pretrained-iter-3488000-avg-20.pt" pruned_transducer_stateless2/exp/epoch-999.pt
  ln -s "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh data/lang_bpe_500
  ls -lh data/fbank
  ls -lh pruned_transducer_stateless2/exp
  log "Decoding dev and test"
  # use a small value for decoding with CPU
  max_duration=100
  # Test only greedy_search to reduce CI running time
  # for method in greedy_search fast_beam_search modified_beam_search; do
  for method in greedy_search; do
    log "Decoding with $method"
    ./pruned_transducer_stateless2/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir pruned_transducer_stateless2/exp
  done
  rm pruned_transducer_stateless2/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh
@ -0,0 +1,76 @@
 #!/usr/bin/env bash
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # location (file name, line number, calling function).
  # This function is from espnet.
  local caller_path=${BASH_SOURCE[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Downloads a pre-trained model, runs pretrained.py on three bundled test
 # waves with every search method and, when GITHUB_EVENT_NAME is "schedule"
 # or GITHUB_EVENT_LABEL_NAME is "run-decode", additionally runs decode.py.
 #
 # Exit on the first failing command: without this a failed clone or a
 # failed pretrained.py/decode.py run is ignored and the script can still
 # exit with status 0, hiding the failure from CI.
 set -e
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./pruned_transducer_stateless/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 for method in fast_beam_search modified_beam_search beam_search; do
  log "$method"
  ./pruned_transducer_stateless/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p pruned_transducer_stateless/exp
  # Expose the checkpoint as epoch-999.pt, matching `--epoch 999 --avg 1` below.
  ln -s "$PWD/$repo/exp/pretrained.pt" pruned_transducer_stateless/exp/epoch-999.pt
  ln -s "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh pruned_transducer_stateless/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./pruned_transducer_stateless/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir pruned_transducer_stateless/exp
  done
  rm pruned_transducer_stateless/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
@ -0,0 +1,80 @@
 #!/usr/bin/env bash
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # location (file name, line number, calling function).
  # This function is from espnet.
  local caller_path=${BASH_SOURCE[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Downloads a pre-trained model, runs pretrained.py on three bundled test
 # waves with every search method and, when GITHUB_EVENT_NAME is "schedule"
 # or GITHUB_EVENT_LABEL_NAME is "run-decode", additionally runs decode.py.
 #
 # Exit on the first failing command: without this a failed clone or a
 # failed pretrained.py/decode.py run is ignored and the script can still
 # exit with status 0, hiding the failure from CI.
 set -e
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 # Give the checkpoint the generic name used by the commands below.
 pushd "$repo/exp"
 ln -s pretrained-epoch-38-avg-10.pt pretrained.pt
 popd
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./pruned_transducer_stateless2/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 for method in modified_beam_search beam_search fast_beam_search; do
  log "$method"
  ./pruned_transducer_stateless2/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p pruned_transducer_stateless2/exp
  # Expose the checkpoint as epoch-999.pt, matching `--epoch 999 --avg 1` below.
  ln -s "$PWD/$repo/exp/pretrained.pt" pruned_transducer_stateless2/exp/epoch-999.pt
  ln -s "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh pruned_transducer_stateless2/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./pruned_transducer_stateless2/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir pruned_transducer_stateless2/exp
  done
  rm pruned_transducer_stateless2/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
@ -0,0 +1,80 @@
 #!/usr/bin/env bash
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # location (file name, line number, calling function).
  # This function is from espnet.
  local caller_path=${BASH_SOURCE[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Downloads a pre-trained model, runs pretrained.py on three bundled test
 # waves with every search method and, when GITHUB_EVENT_NAME is "schedule"
 # or GITHUB_EVENT_LABEL_NAME is "run-decode", additionally runs decode.py.
 #
 # Exit on the first failing command: without this a failed clone or a
 # failed pretrained.py/decode.py run is ignored and the script can still
 # exit with status 0, hiding the failure from CI.
 set -e
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-04-29
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 # Give the checkpoint the generic name used by the commands below.
 pushd "$repo/exp"
 ln -s pretrained-epoch-25-avg-6.pt pretrained.pt
 popd
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./pruned_transducer_stateless3/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 for method in modified_beam_search beam_search fast_beam_search; do
  log "$method"
  ./pruned_transducer_stateless3/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p pruned_transducer_stateless3/exp
  # Expose the checkpoint as epoch-999.pt, matching `--epoch 999 --avg 1` below.
  ln -s "$PWD/$repo/exp/pretrained.pt" pruned_transducer_stateless3/exp/epoch-999.pt
  ln -s "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh pruned_transducer_stateless3/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./pruned_transducer_stateless3/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir pruned_transducer_stateless3/exp
  done
  rm pruned_transducer_stateless3/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-05-13.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-05-13.sh
@ -0,0 +1,80 @@
 #!/usr/bin/env bash
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # location (file name, line number, calling function).
  # This function is from espnet.
  local caller_path=${BASH_SOURCE[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Downloads a pre-trained model, runs pretrained.py on three bundled test
 # waves with every search method and, when GITHUB_EVENT_NAME is "schedule"
 # or GITHUB_EVENT_LABEL_NAME is "run-decode", additionally runs decode.py.
 #
 # Exit on the first failing command: without this a failed clone or a
 # failed pretrained.py/decode.py run is ignored and the script can still
 # exit with status 0, hiding the failure from CI.
 set -e
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 # Give the checkpoint the generic name used by the commands below.
 pushd "$repo/exp"
 ln -s pretrained-iter-1224000-avg-14.pt pretrained.pt
 popd
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./pruned_transducer_stateless3/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 for method in modified_beam_search beam_search fast_beam_search; do
  log "$method"
  ./pruned_transducer_stateless3/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p pruned_transducer_stateless3/exp
  # Expose the checkpoint as epoch-999.pt, matching `--epoch 999 --avg 1` below.
  ln -s "$PWD/$repo/exp/pretrained.pt" pruned_transducer_stateless3/exp/epoch-999.pt
  ln -s "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh pruned_transducer_stateless3/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./pruned_transducer_stateless3/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir pruned_transducer_stateless3/exp
  done
  rm pruned_transducer_stateless3/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless5-2022-05-13.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless5-2022-05-13.sh
@ -0,0 +1,99 @@
 #!/usr/bin/env bash
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # location (file name, line number, calling function).
  # This function is from espnet.
  local caller_path=${BASH_SOURCE[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Downloads a pre-trained model, runs pretrained.py on three bundled test
 # waves with every search method and, when GITHUB_EVENT_NAME is "schedule"
 # or GITHUB_EVENT_LABEL_NAME is "run-decode", additionally runs decode.py.
 # The same model-size options (--num-encoder-layers ... --joiner-dim) are
 # passed to every pretrained.py and decode.py invocation.
 #
 # Exit on the first failing command: without this a failed clone or a
 # failed pretrained.py/decode.py run is ignored and the script can still
 # exit with status 0, hiding the failure from CI.
 set -e
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 # Give the checkpoint the generic name used by the pretrained.py runs below.
 pushd "$repo/exp"
 ln -s pretrained-epoch-39-avg-7.pt pretrained.pt
 popd
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./pruned_transducer_stateless5/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    --num-encoder-layers 18 \
    --dim-feedforward 2048 \
    --nhead 8 \
    --encoder-dim 512 \
    --decoder-dim 512 \
    --joiner-dim 512 \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 for method in modified_beam_search beam_search fast_beam_search; do
  log "$method"
  ./pruned_transducer_stateless5/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav" \
    --num-encoder-layers 18 \
    --dim-feedforward 2048 \
    --nhead 8 \
    --encoder-dim 512 \
    --decoder-dim 512 \
    --joiner-dim 512
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p pruned_transducer_stateless5/exp
  # Expose the checkpoint as epoch-999.pt, matching `--epoch 999 --avg 1` below.
  ln -s "$PWD/$repo/exp/pretrained-epoch-39-avg-7.pt" pruned_transducer_stateless5/exp/epoch-999.pt
  ln -s "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh pruned_transducer_stateless5/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./pruned_transducer_stateless5/decode.py \
      --decoding-method "$method" \
      --use-averaged-model 0 \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir pruned_transducer_stateless5/exp \
      --num-encoder-layers 18 \
      --dim-feedforward 2048 \
      --nhead 8 \
      --encoder-dim 512 \
      --decoder-dim 512 \
      --joiner-dim 512
  done
  rm pruned_transducer_stateless5/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-streaming-pruned-transducer-stateless2-2022-06-26.sh
+++ b/.github/scripts/run-librispeech-streaming-pruned-transducer-stateless2-2022-06-26.sh
@ -0,0 +1,100 @@
 #!/usr/bin/env bash
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # location (file name, line number, calling function).
  # This function is from espnet.
  local caller_path=${BASH_SOURCE[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Downloads a pre-trained streaming model, runs pretrained.py on three
 # bundled test waves with every search method and, when GITHUB_EVENT_NAME
 # is "schedule" or GITHUB_EVENT_LABEL_NAME is "run-decode", additionally
 # runs decode.py (simulated streaming) and streaming_decode.py.
 #
 # Exit on the first failing command: without this a failed clone or a
 # failed pretrained.py/decode.py run is ignored and the script can still
 # exit with status 0, hiding the failure from CI.
 set -e
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless2_20220625
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 # Give the checkpoint the generic name used by the pretrained.py runs below.
 pushd "$repo/exp"
 ln -s pretrained-epoch-24-avg-10.pt pretrained.pt
 popd
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./pruned_transducer_stateless2/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    --simulate-streaming 1 \
    --causal-convolution 1 \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 for method in modified_beam_search beam_search fast_beam_search; do
  log "$method"
  ./pruned_transducer_stateless2/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    --simulate-streaming 1 \
    --causal-convolution 1 \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p pruned_transducer_stateless2/exp
  # Expose the checkpoint as epoch-999.pt, matching `--epoch 999 --avg 1` below.
  ln -s "$PWD/$repo/exp/pretrained-epoch-24-avg-10.pt" pruned_transducer_stateless2/exp/epoch-999.pt
  ln -s "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh pruned_transducer_stateless2/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Simulate streaming decoding with $method"
    ./pruned_transducer_stateless2/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir pruned_transducer_stateless2/exp \
      --simulate-streaming 1 \
      --causal-convolution 1
  done
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Real streaming decoding with $method"
    ./pruned_transducer_stateless2/streaming_decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --num-decode-streams 100 \
      --exp-dir pruned_transducer_stateless2/exp \
      --left-context 32 \
      --decode-chunk-size 8 \
      --right-context 0
  done
  rm pruned_transducer_stateless2/exp/*.pt
 fi
--- a/.github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
+++ b/.github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
@ -0,0 +1,76 @@
 #!/usr/bin/env bash
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # location (file name, line number, calling function).
  # This function is from espnet.
  local caller_path=${BASH_SOURCE[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Downloads a pre-trained model, runs pretrained.py on three bundled test
 # waves with every search method and, when GITHUB_EVENT_NAME is "schedule"
 # or GITHUB_EVENT_LABEL_NAME is "run-decode", additionally runs decode.py.
 #
 # Exit on the first failing command: without this a failed clone or a
 # failed pretrained.py/decode.py run is ignored and the script can still
 # exit with status 0, hiding the failure from CI.
 set -e
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless2-torchaudio-2022-04-19
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./transducer_stateless2/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 for method in fast_beam_search modified_beam_search beam_search; do
  log "$method"
  ./transducer_stateless2/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p transducer_stateless2/exp
  # Expose the checkpoint as epoch-999.pt, matching `--epoch 999 --avg 1` below.
  ln -s "$PWD/$repo/exp/pretrained.pt" transducer_stateless2/exp/epoch-999.pt
  ln -s "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh transducer_stateless2/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./transducer_stateless2/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir transducer_stateless2/exp
  done
  rm transducer_stateless2/exp/*.pt
 fi
--- a/.github/scripts/run-pre-trained-conformer-ctc.sh
+++ b/.github/scripts/run-pre-trained-conformer-ctc.sh
@ -0,0 +1,46 @@
 #!/usr/bin/env bash
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # location (file name, line number, calling function).
  # This function is from espnet.
  local caller_path=${BASH_SOURCE[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Downloads a pre-trained conformer CTC model and runs pretrained.py on
 # three bundled test files, first with CTC decoding and then with HLG
 # (1best) decoding.
 #
 # Exit on the first failing command: without this a failed clone or a
 # failed pretrained.py run is ignored by the commands that follow.
 set -e
 cd egs/librispeech/ASR
 repo_url=https://github.com/csukuangfj/icefall-asr-conformer-ctc-bpe-500
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 # Clone exactly once, after repo_url is known. (A stray `git clone $repo`
 # used to run here before `repo` was assigned and always failed.)
 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.flac
 ls -lh "$repo"/test_wavs/*.flac
 log "CTC decoding"
 ./conformer_ctc/pretrained.py \
  --method ctc-decoding \
  --num-classes 500 \
  --checkpoint "$repo/exp/pretrained.pt" \
  --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
  "$repo/test_wavs/1089-134686-0001.flac" \
  "$repo/test_wavs/1221-135766-0001.flac" \
  "$repo/test_wavs/1221-135766-0002.flac"
 log "HLG decoding"
 ./conformer_ctc/pretrained.py \
  --method 1best \
  --num-classes 500 \
  --checkpoint "$repo/exp/pretrained.pt" \
  --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
  --words-file "$repo/data/lang_bpe_500/words.txt" \
  --HLG "$repo/data/lang_bpe_500/HLG.pt" \
  "$repo/test_wavs/1089-134686-0001.flac" \
  "$repo/test_wavs/1221-135766-0001.flac" \
  "$repo/test_wavs/1221-135766-0002.flac"
--- a/.github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh
+++ b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh
@ -0,0 +1,76 @@
 #!/usr/bin/env bash
 log() {
  # Print the given message prefixed with a timestamp and the caller's
  # location (file name, line number, calling function).
  # This function is from espnet.
  local caller_path=${BASH_SOURCE[1]}
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_path##*/}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 # Downloads a pre-trained model, runs pretrained.py on three bundled test
 # waves with every search method and, when GITHUB_EVENT_NAME is "schedule"
 # or GITHUB_EVENT_LABEL_NAME is "run-decode", additionally runs decode.py.
 #
 # Exit on the first failing command: without this a failed clone or a
 # failed pretrained.py/decode.py run is ignored and the script can still
 # exit with status 0, hiding the failure from CI.
 set -e
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 repo=$(basename "$repo_url")
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./transducer_stateless_multi_datasets/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 for method in modified_beam_search beam_search fast_beam_search; do
  log "$method"
  ./transducer_stateless_multi_datasets/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "$repo/test_wavs/1089-134686-0001.wav" \
    "$repo/test_wavs/1221-135766-0001.wav" \
    "$repo/test_wavs/1221-135766-0002.wav"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p transducer_stateless_multi_datasets/exp
  # Expose the checkpoint as epoch-999.pt, matching `--epoch 999 --avg 1` below.
  ln -s "$PWD/$repo/exp/pretrained.pt" transducer_stateless_multi_datasets/exp/epoch-999.pt
  ln -s "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh transducer_stateless_multi_datasets/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./transducer_stateless_multi_datasets/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir transducer_stateless_multi_datasets/exp
  done
  rm transducer_stateless_multi_datasets/exp/*.pt
 fi
--- a/.github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh
+++ b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh
@ -0,0 +1,76 @@
 #!/usr/bin/env bash
 log() {
  # Timestamped logger (taken from espnet). Writes one line to stdout:
  #   <YYYY-mm-dd HH:MM:SS> (<caller file>:<caller line>:<caller function>) <message>
  # All arguments form the message; backslash escapes in it are expanded
  # because the line is emitted with `echo -e`.
  local caller_file caller_line caller_func stamp
  caller_file=${BASH_SOURCE[1]##*/}
  caller_line=${BASH_LINENO[0]}
  caller_func=${FUNCNAME[1]}
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${stamp} (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # Abort at the first failing command. Without this, a failed download,
 # inference or decoding run was masked by the commands that follow it and
 # the script still exited with status 0.
 set -e
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 # `git clone` creates a directory named after the last URL component.
 repo=$(basename "$repo_url")
 # Test utterances shipped with the model; decoded by every method below.
 test_wavs=(
  "$repo/test_wavs/1089-134686-0001.wav"
  "$repo/test_wavs/1221-135766-0001.wav"
  "$repo/test_wavs/1221-135766-0002.wav"
 )
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./transducer_stateless_multi_datasets/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "${test_wavs[@]}"
 done
 for method in modified_beam_search beam_search fast_beam_search; do
  log "$method"
  ./transducer_stateless_multi_datasets/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "${test_wavs[@]}"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 # Decoding the full test sets only happens for scheduled runs or when the
 # event label is "run-decode".
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p transducer_stateless_multi_datasets/exp data
  # decode.py is invoked with --epoch 999 --avg 1, so the pre-trained
  # checkpoint is exposed as epoch-999.pt (presumably the name it looks up).
  # -f keeps both links re-runnable now that any failure aborts the script.
  ln -sf "$PWD/$repo/exp/pretrained.pt" transducer_stateless_multi_datasets/exp/epoch-999.pt
  ln -sf "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh transducer_stateless_multi_datasets/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./transducer_stateless_multi_datasets/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir transducer_stateless_multi_datasets/exp
  done
  # Drop the checkpoint link(s) again; the decoding logs stay in exp/.
  rm transducer_stateless_multi_datasets/exp/*.pt
 fi
--- a/.github/scripts/run-pre-trained-transducer-stateless-modified-2-aishell.sh
+++ b/.github/scripts/run-pre-trained-transducer-stateless-modified-2-aishell.sh
@ -0,0 +1,47 @@
 #!/usr/bin/env bash
 log() {
  # Timestamped logger (taken from espnet). Writes one line to stdout:
  #   <YYYY-mm-dd HH:MM:SS> (<caller file>:<caller line>:<caller function>) <message>
  # All arguments form the message; backslash escapes in it are expanded
  # because the line is emitted with `echo -e`.
  local caller_file caller_line caller_func stamp
  caller_file=${BASH_SOURCE[1]##*/}
  caller_line=${BASH_LINENO[0]}
  caller_func=${FUNCNAME[1]}
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${stamp} (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # Abort at the first failing command. Without this, only the status of the
 # very last inference run decided the script's exit code, so earlier
 # failures (clone, greedy search, ...) went unnoticed.
 set -e
 cd egs/aishell/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-aishell-transducer-stateless-modified-2-2022-03-01
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 # `git clone` creates a directory named after the last URL component.
 repo=$(basename "$repo_url")
 # Test utterances shipped with the model; decoded by every method below.
 test_wavs=(
  "$repo/test_wavs/BAC009S0764W0121.wav"
  "$repo/test_wavs/BAC009S0764W0122.wav"
  "$repo/test_wavs/BAC009S0764W0123.wav"
 )
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./transducer_stateless_modified-2/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --lang-dir "$repo/data/lang_char" \
    "${test_wavs[@]}"
 done
 for method in modified_beam_search beam_search; do
  log "$method"
  ./transducer_stateless_modified-2/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --lang-dir "$repo/data/lang_char" \
    "${test_wavs[@]}"
 done
--- a/.github/scripts/run-pre-trained-transducer-stateless-modified-aishell.sh
+++ b/.github/scripts/run-pre-trained-transducer-stateless-modified-aishell.sh
@ -0,0 +1,47 @@
 #!/usr/bin/env bash
 log() {
  # Timestamped logger (taken from espnet). Writes one line to stdout:
  #   <YYYY-mm-dd HH:MM:SS> (<caller file>:<caller line>:<caller function>) <message>
  # All arguments form the message; backslash escapes in it are expanded
  # because the line is emitted with `echo -e`.
  local caller_file caller_line caller_func stamp
  caller_file=${BASH_SOURCE[1]##*/}
  caller_line=${BASH_LINENO[0]}
  caller_func=${FUNCNAME[1]}
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${stamp} (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # Abort at the first failing command. Without this, only the status of the
 # very last inference run decided the script's exit code, so earlier
 # failures (clone, greedy search, ...) went unnoticed.
 set -e
 cd egs/aishell/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-aishell-transducer-stateless-modified-2022-03-01
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 # `git clone` creates a directory named after the last URL component.
 repo=$(basename "$repo_url")
 # Test utterances shipped with the model; decoded by every method below.
 test_wavs=(
  "$repo/test_wavs/BAC009S0764W0121.wav"
  "$repo/test_wavs/BAC009S0764W0122.wav"
  "$repo/test_wavs/BAC009S0764W0123.wav"
 )
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./transducer_stateless_modified/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --lang-dir "$repo/data/lang_char" \
    "${test_wavs[@]}"
 done
 for method in modified_beam_search beam_search; do
  log "$method"
  ./transducer_stateless_modified/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --lang-dir "$repo/data/lang_char" \
    "${test_wavs[@]}"
 done
--- a/.github/scripts/run-pre-trained-transducer-stateless.sh
+++ b/.github/scripts/run-pre-trained-transducer-stateless.sh
@ -0,0 +1,76 @@
 #!/usr/bin/env bash
 log() {
  # Timestamped logger (taken from espnet). Writes one line to stdout:
  #   <YYYY-mm-dd HH:MM:SS> (<caller file>:<caller line>:<caller function>) <message>
  # All arguments form the message; backslash escapes in it are expanded
  # because the line is emitted with `echo -e`.
  local caller_file caller_line caller_func stamp
  caller_file=${BASH_SOURCE[1]##*/}
  caller_line=${BASH_LINENO[0]}
  caller_func=${FUNCNAME[1]}
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${stamp} (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 # Abort at the first failing command. Without this, a failed download,
 # inference or decoding run was masked by the commands that follow it and
 # the script still exited with status 0.
 set -e
 cd egs/librispeech/ASR
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 # `git clone` creates a directory named after the last URL component.
 repo=$(basename "$repo_url")
 # Test utterances shipped with the model; decoded by every method below.
 test_wavs=(
  "$repo/test_wavs/1089-134686-0001.wav"
  "$repo/test_wavs/1221-135766-0001.wav"
  "$repo/test_wavs/1221-135766-0002.wav"
 )
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 for sym in 1 2 3; do
  log "Greedy search with --max-sym-per-frame $sym"
  ./transducer_stateless/pretrained.py \
    --method greedy_search \
    --max-sym-per-frame "$sym" \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "${test_wavs[@]}"
 done
 for method in fast_beam_search modified_beam_search beam_search; do
  log "$method"
  ./transducer_stateless/pretrained.py \
    --method "$method" \
    --beam-size 4 \
    --checkpoint "$repo/exp/pretrained.pt" \
    --bpe-model "$repo/data/lang_bpe_500/bpe.model" \
    "${test_wavs[@]}"
 done
 echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
 echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
 # Decoding the full test sets only happens for scheduled runs or when the
 # event label is "run-decode".
 if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
  mkdir -p transducer_stateless/exp data
  # decode.py is invoked with --epoch 999 --avg 1, so the pre-trained
  # checkpoint is exposed as epoch-999.pt (presumably the name it looks up).
  # -f keeps both links re-runnable now that any failure aborts the script.
  ln -sf "$PWD/$repo/exp/pretrained.pt" transducer_stateless/exp/epoch-999.pt
  ln -sf "$PWD/$repo/data/lang_bpe_500" data/
  ls -lh data
  ls -lh transducer_stateless/exp
  log "Decoding test-clean and test-other"
  # use a small value for decoding with CPU
  max_duration=100
  for method in greedy_search fast_beam_search modified_beam_search; do
    log "Decoding with $method"
    ./transducer_stateless/decode.py \
      --decoding-method "$method" \
      --epoch 999 \
      --avg 1 \
      --max-duration "$max_duration" \
      --exp-dir transducer_stateless/exp
  done
  # Drop the checkpoint link(s) again; the decoding logs stay in exp/.
  rm transducer_stateless/exp/*.pt
 fi
--- a/.github/scripts/run-pre-trained-transducer.sh
+++ b/.github/scripts/run-pre-trained-transducer.sh
@ -0,0 +1,32 @@
 #!/usr/bin/env bash
 log() {
  # Timestamped logger (taken from espnet). Writes one line to stdout:
  #   <YYYY-mm-dd HH:MM:SS> (<caller file>:<caller line>:<caller function>) <message>
  # All arguments form the message; backslash escapes in it are expanded
  # because the line is emitted with `echo -e`.
  local caller_file caller_line caller_func stamp
  caller_file=${BASH_SOURCE[1]##*/}
  caller_line=${BASH_LINENO[0]}
  caller_func=${FUNCNAME[1]}
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${stamp} (${caller_file}:${caller_line}:${caller_func}) $*"
 }
 cd egs/librispeech/ASR
 # Hugging Face repository that hosts the pre-trained transducer model.
 repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-bpe-500-2021-12-23
 # Name of the directory `git clone` creates: the last component of the URL.
 repo=${repo_url##*/}
 log "Downloading pre-trained model from $repo_url"
 git lfs install
 git clone "$repo_url"
 log "Display test files"
 tree "$repo/"
 soxi "$repo"/test_wavs/*.wav
 ls -lh "$repo"/test_wavs/*.wav
 log "Beam search decoding"
 # Options first, then the three test utterances shipped with the model.
 pretrained_args=(
  --method beam_search
  --beam-size 4
  --checkpoint "$repo/exp/pretrained.pt"
  --bpe-model "$repo/data/lang_bpe_500/bpe.model"
  "$repo/test_wavs/1089-134686-0001.wav"
  "$repo/test_wavs/1221-135766-0001.wav"
  "$repo/test_wavs/1221-135766-0002.wav"
 )
 ./transducer/pretrained.py "${pretrained_args[@]}"
--- a/.github/workflows/build-doc.yml
+++ b/.github/workflows/build-doc.yml
@ -0,0 +1,65 @@
 # Copyright      2022  Xiaomi Corp.       (author: Fangjun Kuang)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # refer to https://github.com/actions/starter-workflows/pull/47/files
 # You can access it at https://k2-fsa.github.io/icefall/
 name: Generate doc
 on:
  # Pushes to the two branches listed below.
  push:
    branches: [master, doc]
  # A label being added to a pull request (the job's `if` checks which one).
  pull_request:
    types:
      - labeled
 jobs:
  build-doc:
    # Run for pushes, or when the label that was just added is 'doc'.
    if: github.event.label.name == 'doc' || github.event_name == 'push'
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os:
          - ubuntu-latest
        python-version:
          - "3.8"
    steps:
      # refer to https://github.com/actions/checkout
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Display Python version
        run: python -c "import sys; print(sys.version)"
      # Installs the doc requirements, builds the HTML site with `make html`
      # and drops an empty .nojekyll marker into the output directory
      # (presumably so GitHub Pages serves the files without Jekyll).
      - name: Build doc
        shell: bash
        run: |
          cd docs
          python3 -m pip install -r ./requirements.txt
          make html
          touch build/html/.nojekyll
      # Publishes docs/build/html to the gh-pages branch.
      # NOTE(review): this step has no `if`, so it also runs for PRs labeled
      # 'doc' - confirm that deploying from such PRs is intended.
      - name: Deploy
        uses: peaceiris/actions-gh-pages@v3
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./docs/build/html
          publish_branch: gh-pages
--- a/.github/workflows/run-aishell-2022-06-20.yml
+++ b/.github/workflows/run-aishell-2022-06-20.yml
@ -0,0 +1,119 @@
 # Copyright      2022  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-aishell-2022-06-20
 # Model under test: pruned RNN-T + reworked model with random combiner
 # https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
 on:
  # Pushes to master.
  push:
    branches: [master]
  # A label being added to a pull request (the job's `if` checks which one).
  pull_request:
    types:
      - labeled
  # Nightly build at 15:50 UTC every day.
  # Cron fields, left to right: minute (0-59), hour (0-23),
  # day of the month (1-31), month (1-12), day of the week (0-6).
  schedule:
    - cron: "50 15 * * *"
 jobs:
  run_aishell_2022_06_20:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      # Installs each non-comment line of requirements-ci.txt with its own
      # `pip install` call, then replaces protobuf with a build that is not
      # taken from a binary wheel (--no-binary).
      # xargs gets only -L 1: GNU xargs treats -n and -L as mutually exclusive
      # and honours just the last one given, so the former "-n 1" had no
      # effect (newer findutils releases warn about the combination).
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
      - name: Display decoding results for aishell pruned_transducer_stateless3
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/aishell/ASR/
          tree ./pruned_transducer_stateless3/exp
          cd pruned_transducer_stateless3
          echo "results for pruned_transducer_stateless3"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
          echo "===modified beam search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
      # Uploads the whole exp/ directory (decoding logs and results) as a
      # build artifact; only for scheduled runs or the 'run-decode' label.
      # The artifact name no longer contains `${{ matrix.torch }}`: this job's
      # matrix defines only `os` and `python-version`, so that expression
      # expanded to an empty string ("aishell-torch--python-...").
      - name: Upload decoding results for aishell pruned_transducer_stateless3
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: aishell-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless3-2022-06-20
          path: egs/aishell/ASR/pruned_transducer_stateless3/exp/
--- a/.github/workflows/run-gigaspeech-2022-05-13.yml
+++ b/.github/workflows/run-gigaspeech-2022-05-13.yml
@ -0,0 +1,122 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-gigaspeech-2022-05-13
 # Model under test: stateless transducer + k2 pruned rnnt-loss + reworked conformer
 on:
  # Pushes to master.
  push:
    branches: [master]
  # A label being added to a pull request (the job's `if` checks which one).
  pull_request:
    types:
      - labeled
  # Nightly build at 15:50 UTC every day.
  # Cron fields, left to right: minute (0-59), hour (0-23),
  # day of the month (1-31), month (1-12), day of the week (0-6).
  schedule:
    - cron: "50 15 * * *"
 jobs:
  run_gigaspeech_2022_05_13:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      # Installs each non-comment line of requirements-ci.txt with its own
      # `pip install` call, then replaces protobuf with a build that is not
      # taken from a binary wheel (--no-binary).
      # xargs gets only -L 1: GNU xargs treats -n and -L as mutually exclusive
      # and honours just the last one given, so the former "-n 1" had no
      # effect (newer findutils releases warn about the combination).
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Download GigaSpeech dev/test dataset
        shell: bash
        run: |
          sudo apt-get install -y -q git-lfs
          .github/scripts/download-gigaspeech-dev-test-dataset.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          ln -s ~/tmp/giga-dev-dataset-fbank/data egs/gigaspeech/ASR/
          ls -lh egs/gigaspeech/ASR/data/fbank
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-gigaspeech-pruned-transducer-stateless2-2022-05-12.sh
      # Prints the exp/ directory tree and the "best for dev/test" lines
      # found in the greedy-search decoding logs.
      # `tree` is installed first: it used to be invoked one line before the
      # apt-get call that provides it, and the earlier steps visible in this
      # workflow only install git-lfs.
      - name: Display decoding results for gigaspeech pruned_transducer_stateless2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          sudo apt-get -qq install tree
          cd egs/gigaspeech/ASR/
          tree ./pruned_transducer_stateless2/exp
          cd pruned_transducer_stateless2
          echo "results for pruned_transducer_stateless2"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
      # Uploads the whole exp/ directory (decoding logs and results) as a
      # build artifact; only for scheduled runs or the 'run-decode' label.
      # The artifact name no longer contains `${{ matrix.torch }}`: this job's
      # matrix defines only `os` and `python-version`, so that expression
      # expanded to an empty string ("torch--python-...").
      - name: Upload decoding results for gigaspeech pruned_transducer_stateless2
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: python-${{ matrix.python-version }}-ubuntu-18.04-cpu-gigaspeech-pruned_transducer_stateless2-2022-05-12
          path: egs/gigaspeech/ASR/pruned_transducer_stateless2/exp/
--- a/.github/workflows/run-librispeech-2022-03-12.yml
+++ b/.github/workflows/run-librispeech-2022-03-12.yml
@ -0,0 +1,155 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-03-12
 # Model under test: stateless transducer + k2 pruned rnnt-loss
 on:
  # Pushes to master.
  push:
    branches: [master]
  # A label being added to a pull request (the job's `if` checks which one).
  pull_request:
    types:
      - labeled
  # Nightly build at 15:50 UTC every day.
  # Cron fields, left to right: minute (0-59), hour (0-23),
  # day of the month (1-31), month (1-12), day of the week (0-6).
  schedule:
    - cron: "50 15 * * *"
 jobs:
  run_librispeech_2022_03_12:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      # Installs each non-comment line of requirements-ci.txt with its own
      # `pip install` call, then replaces protobuf with a build that is not
      # taken from a binary wheel (--no-binary).
      # xargs gets only -L 1: GNU xargs treats -n and -L as mutually exclusive
      # and honours just the last one given, so the former "-n 1" had no
      # effect (newer findutils releases warn about the combination).
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh
      - name: Display decoding results for pruned_transducer_stateless
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./pruned_transducer_stateless/exp
          cd pruned_transducer_stateless
          echo "results for pruned_transducer_stateless"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      # Uploads the whole exp/ directory (decoding logs and results) as a
      # build artifact; only for scheduled runs or the 'run-decode' label.
      # The artifact name no longer contains `${{ matrix.torch }}`: this job's
      # matrix defines only `os` and `python-version`, so that expression
      # expanded to an empty string ("torch--python-...").
      - name: Upload decoding results for pruned_transducer_stateless
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless-2022-03-12
          path: egs/librispeech/ASR/pruned_transducer_stateless/exp/
--- a/.github/workflows/run-librispeech-2022-04-29.yml
+++ b/.github/workflows/run-librispeech-2022-04-29.yml
@ -0,0 +1,181 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-04-29
 # Model under test: stateless pruned transducer (reworked model) + giga speech
 on:
  # Pushes to master.
  push:
    branches: [master]
  # A label being added to a pull request (the job's `if` checks which one).
  pull_request:
    types:
      - labeled
  # Nightly build at 15:50 UTC every day.
  # Cron fields, left to right: minute (0-59), hour (0-23),
  # day of the month (1-31), month (1-12), day of the week (0-6).
  schedule:
    - cron: "50 15 * * *"
 jobs:
  run_librispeech_2022_04_29:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      # Installs each non-comment line of requirements-ci.txt with its own
      # `pip install` call, then replaces protobuf with a build that is not
      # taken from a binary wheel (--no-binary).
      # xargs gets only -L 1: GNU xargs treats -n and -L as mutually exclusive
      # and honours just the last one given, so the former "-n 1" had no
      # effect (newer findutils releases warn about the combination).
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
          .github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
      - name: Display decoding results for pruned_transducer_stateless2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR
          tree pruned_transducer_stateless2/exp
          cd pruned_transducer_stateless2/exp
          echo "===greedy search==="
          find greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Display decoding results for pruned_transducer_stateless3
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR
          tree pruned_transducer_stateless3/exp
          cd pruned_transducer_stateless3/exp
          echo "===greedy search==="
          find greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      # Uploads the whole exp/ directory (decoding logs and results) as a
      # build artifact; only for scheduled runs or the 'run-decode' label.
      # The artifact name no longer contains `${{ matrix.torch }}`: this job's
      # matrix defines only `os` and `python-version`, so that expression
      # expanded to an empty string ("torch--python-...").
      - name: Upload decoding results for pruned_transducer_stateless2
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless2-2022-04-29
          path: egs/librispeech/ASR/pruned_transducer_stateless2/exp/
      # Uploads the whole exp/ directory (decoding logs and results) as a
      # build artifact; only for scheduled runs or the 'run-decode' label.
      # The artifact name no longer contains `${{ matrix.torch }}`: this job's
      # matrix defines only `os` and `python-version`, so that expression
      # expanded to an empty string ("torch--python-...").
      - name: Upload decoding results for pruned_transducer_stateless3
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless3-2022-04-29
          path: egs/librispeech/ASR/pruned_transducer_stateless3/exp/
--- a/.github/workflows/run-librispeech-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-2022-05-13.yml
@ -0,0 +1,155 @@
 # Copyright      2022  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-05-13
 # stateless transducer + k2 pruned rnnt-loss + deeper model
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 jobs:
  run_librispeech_2022_05_13:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          # Run one `pip install` per non-comment line of requirements-ci.txt.
          # `-L 1` alone does this; `-n 1 -L 1` are mutually exclusive and GNU
          # xargs just warns and ignores `-n`.
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
          # Reinstall protobuf from its source distribution instead of a wheel.
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-pruned-transducer-stateless5-2022-05-13.sh
      - name: Display decoding results for librispeech pruned_transducer_stateless5
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./pruned_transducer_stateless5/exp
          cd pruned_transducer_stateless5
          echo "results for pruned_transducer_stateless5"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Upload decoding results for librispeech pruned_transducer_stateless5
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          # The job matrix only defines `os` and `python-version`; a reference
          # to `matrix.torch` would expand to an empty string, so the name is
          # built from the keys that actually exist.
          name: python-${{ matrix.python-version }}-${{ matrix.os }}-cpu-pruned_transducer_stateless5-2022-05-13
          path: egs/librispeech/ASR/pruned_transducer_stateless5/exp/
--- a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
@ -0,0 +1,153 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-pruned-transducer-stateless3-2022-05-13
 # stateless pruned transducer (reworked model) + giga speech
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 jobs:
  run_librispeech_pruned_transducer_stateless3_2022_05_13:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          # Run one `pip install` per non-comment line of requirements-ci.txt.
          # `-L 1` alone does this; `-n 1 -L 1` are mutually exclusive and GNU
          # xargs just warns and ignores `-n`.
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
          # Reinstall protobuf from its source distribution instead of a wheel.
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-pruned-transducer-stateless3-2022-05-13.sh
      - name: Display decoding results for pruned_transducer_stateless3
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR
          tree pruned_transducer_stateless3/exp
          cd pruned_transducer_stateless3/exp
          echo "===greedy search==="
          find greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Upload decoding results for pruned_transducer_stateless3
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          # The job matrix only defines `os` and `python-version`; a reference
          # to `matrix.torch` would expand to an empty string. The date suffix
          # matches this workflow (2022-05-13) rather than the 2022-04-29 one.
          name: python-${{ matrix.python-version }}-${{ matrix.os }}-cpu-pruned_transducer_stateless3-2022-05-13
          path: egs/librispeech/ASR/pruned_transducer_stateless3/exp/
--- a/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml
+++ b/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml
@ -0,0 +1,155 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-streaming-2022-06-26
 # streaming conformer stateless transducer2
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 jobs:
  run_librispeech_streaming_2022_06_26:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          # Run one `pip install` per non-comment line of requirements-ci.txt.
          # `-L 1` alone does this; `-n 1 -L 1` are mutually exclusive and GNU
          # xargs just warns and ignores `-n`.
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
          # Reinstall protobuf from its source distribution instead of a wheel.
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-streaming-pruned-transducer-stateless2-2022-06-26.sh
      - name: Display decoding results
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./pruned_transducer_stateless2/exp
          cd pruned_transducer_stateless2
          echo "results for pruned_transducer_stateless2"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified_beam_search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Upload decoding results for pruned_transducer_stateless2
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          # The job matrix only defines `os` and `python-version`; a reference
          # to `matrix.torch` would expand to an empty string, so the name is
          # built from the keys that actually exist.
          name: python-${{ matrix.python-version }}-${{ matrix.os }}-cpu-pruned_transducer_stateless2-2022-06-26
          path: egs/librispeech/ASR/pruned_transducer_stateless2/exp/
--- a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
+++ b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
@ -0,0 +1,155 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-librispeech-2022-04-19
 # stateless transducer + torchaudio rnn-t loss
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 jobs:
  run_librispeech_2022_04_19:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          # Run one `pip install` per non-comment line of requirements-ci.txt.
          # `-L 1` alone does this; `-n 1 -L 1` are mutually exclusive and GNU
          # xargs just warns and ignores `-n`.
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
          # Reinstall protobuf from its source distribution instead of a wheel.
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
      - name: Display decoding results
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./transducer_stateless2/exp
          cd transducer_stateless2
          echo "results for transducer_stateless2"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified_beam_search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Upload decoding results for transducer_stateless2
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          # The job matrix only defines `os` and `python-version`; a reference
          # to `matrix.torch` would expand to an empty string, so the name is
          # built from the keys that actually exist.
          name: python-${{ matrix.python-version }}-${{ matrix.os }}-cpu-transducer_stateless2-2022-04-19
          path: egs/librispeech/ASR/transducer_stateless2/exp/
--- a/.github/workflows/run-pretrained-conformer-ctc.yml
+++ b/.github/workflows/run-pretrained-conformer-ctc.yml
@ -31,9 +31,6 @@ jobs:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
@ -43,67 +40,37 @@ jobs:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
-          python3 -m pip install --upgrade pip pytest
+          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
-          # numpy 1.20.x does not support python 3.6
+          pip uninstall -y protobuf
-          pip install numpy==1.19
+          pip install --no-binary protobuf protobuf
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
-          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
+      - name: Cache kaldifeat
-          python3 -m pip install kaldifeat
+        id: my-cache
-          # We are in ./icefall and there is a file: requirements.txt in it
+        uses: actions/cache@v2
-          pip install -r requirements.txt
+        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
-      - name: Install graphviz
+      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
-          python3 -m pip install -qq graphviz
+          .github/scripts/install-kaldifeat.sh
          sudo apt-get -qq install graphviz
-      - name: Download pre-trained model
+      - name: Inference with pre-trained model
        shell: bash
        run: |
          sudo apt-get -qq install git-lfs tree sox
          cd egs/librispeech/ASR
          mkdir tmp
          cd tmp
          git lfs install
          git clone https://github.com/csukuangfj/icefall-asr-conformer-ctc-bpe-500
          cd ..
          tree tmp
          soxi tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/*.flac
          ls -lh tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/*.flac
      - name: Run CTC decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:PYTHONPATH
          cd egs/librispeech/ASR
          ./conformer_ctc/pretrained.py \
            --num-classes 500 \
            --checkpoint ./tmp/icefall-asr-conformer-ctc-bpe-500/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-conformer-ctc-bpe-500/data/lang_bpe_500/bpe.model \
            --method ctc-decoding \
            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1089-134686-0001.flac \
            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1221-135766-0001.flac \
            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1221-135766-0002.flac
      - name: Run HLG decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
-          cd egs/librispeech/ASR
+          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
-          ./conformer_ctc/pretrained.py \
+          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
-            --num-classes 500 \
+          .github/scripts/run-pre-trained-conformer-ctc.sh
            --checkpoint ./tmp/icefall-asr-conformer-ctc-bpe-500/exp/pretrained.pt \
            --words-file ./tmp/icefall-asr-conformer-ctc-bpe-500/data/lang_bpe_500/words.txt \
            --HLG ./tmp/icefall-asr-conformer-ctc-bpe-500/data/lang_bpe_500/HLG.pt \
            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1089-134686-0001.flac \
            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1221-135766-0001.flac \
            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1221-135766-0002.flac
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
@ -0,0 +1,154 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-pre-trained-transducer-stateless-multi-datasets-librispeech-100h
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 jobs:
  run_pre_trained_transducer_stateless_multi_datasets_librispeech_100h:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          # Run one `pip install` per non-comment line of requirements-ci.txt.
          # `-L 1` alone does this; `-n 1 -L 1` are mutually exclusive and GNU
          # xargs just warns and ignores `-n`.
          grep -v '^#' ./requirements-ci.txt | xargs -L 1 pip install
          # Reinstall protobuf from its source distribution instead of a wheel.
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh
      - name: Display decoding results for transducer_stateless_multi_datasets
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./transducer_stateless_multi_datasets/exp
          cd transducer_stateless_multi_datasets
          echo "results for transducer_stateless_multi_datasets"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Upload decoding results for transducer_stateless_multi_datasets
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          # The job matrix only defines `os` and `python-version`; a reference
          # to `matrix.torch` would expand to an empty string, so the name is
          # built from the keys that actually exist.
          name: python-${{ matrix.python-version }}-${{ matrix.os }}-cpu-transducer_stateless_multi_datasets-100h-2022-02-21
          path: egs/librispeech/ASR/transducer_stateless_multi_datasets/exp/
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
@ -0,0 +1,154 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-pre-trained-transducer-stateless-multi-datasets-librispeech-960h
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 jobs:
  run_pre_trained_transducer_stateless_multi_datasets_librispeech_960h:
    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh
      - name: Display decoding results for transducer_stateless_multi_datasets
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        shell: bash
        run: |
          cd egs/librispeech/ASR/
          tree ./transducer_stateless_multi_datasets/exp
          cd transducer_stateless_multi_datasets
          echo "results for transducer_stateless_multi_datasets"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Upload decoding results for transducer_stateless_multi_datasets
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with: # NOTE(review): this job's matrix defines no `torch` entry, so matrix.torch below expands to an empty string - confirm
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless_multi_datasets-960h-2022-03-01
          path: egs/librispeech/ASR/transducer_stateless_multi_datasets/exp/
--- a/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
@ -0,0 +1,76 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-pre-trained-transducer-stateless-modified-2-aishell
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
 jobs:
  run_pre_trained_transducer_stateless_modified_2_aishell:
    if: github.event.label.name == 'ready' || github.event_name == 'push'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Inference with pre-trained model
        shell: bash
        run: |
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-pre-trained-transducer-stateless-modified-2-aishell.sh
--- a/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
@ -0,0 +1,76 @@
 # Copyright      2021  Fangjun Kuang (csukuangfj@gmail.com)
 # See ../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 name: run-pre-trained-transducer-stateless-modified-aishell
 on:
  push:
    branches:
      - master
  pull_request:
    types: [labeled]
 jobs:
  run_pre_trained_transducer_stateless_modified_aishell:
    if: github.event.label.name == 'ready' || github.event_name == 'push'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
      fail-fast: false
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
      - name: Inference with pre-trained model
        shell: bash
        run: |
          sudo apt-get -qq install git-lfs tree sox
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
          .github/scripts/run-pre-trained-transducer-stateless-modified-aishell.sh
--- a/.github/workflows/run-pretrained-transducer-stateless.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless.yml
@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-name: run-pre-trained-trandsucer-stateless
+name: run-pre-trained-transducer-stateless
 on:
  push:
@ -23,17 +23,23 @@ on:
  pull_request:
    types: [labeled]
  schedule:
    # minute (0-59)
    # hour (0-23)
    # day of the month (1-31)
    # month (1-12)
    # day of the week (0-6)
    # nightly build at 15:50 UTC time every day
    - cron: "50 15 * * *"
 jobs:
  run_pre_trained_transducer_stateless:
-    if: github.event.label.name == 'ready' || github.event_name == 'push'
+    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
@ -43,66 +49,106 @@ jobs:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
-          python3 -m pip install --upgrade pip pytest
+          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
-          # numpy 1.20.x does not support python 3.6
+          pip uninstall -y protobuf
-          pip install numpy==1.19
+          pip install --no-binary protobuf protobuf
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
-          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
+      - name: Cache kaldifeat
-          python3 -m pip install kaldifeat
+        id: my-cache
-          # We are in ./icefall and there is a file: requirements.txt in it
+        uses: actions/cache@v2
-          pip install -r requirements.txt
+        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
-      - name: Install graphviz
+      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
-          python3 -m pip install -qq graphviz
+          .github/scripts/install-kaldifeat.sh
          sudo apt-get -qq install graphviz
-      - name: Download pre-trained model
+      - name: Cache LibriSpeech test-clean and test-other datasets
        id: libri-test-clean-and-test-other-data
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/download
          key: cache-libri-test-clean-and-test-other
      - name: Download LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
      - name: Prepare manifests for LibriSpeech test-clean and test-other
        shell: bash
        run: |
          .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
      - name: Cache LibriSpeech test-clean and test-other fbank features
        id: libri-test-clean-and-test-other-fbank
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/fbank-libri
          key: cache-libri-fbank-test-clean-and-test-other-v2
      - name: Compute fbank for LibriSpeech test-clean and test-other
        if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
        shell: bash
        run: |
          .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
      - name: Inference with pre-trained model
        shell: bash
        env:
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
        run: |
          mkdir -p egs/librispeech/ASR/data
          ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
          ls -lh egs/librispeech/ASR/data/*
          sudo apt-get -qq install git-lfs tree sox
          cd egs/librispeech/ASR
          mkdir tmp
          cd tmp
          git lfs install
          git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27
          cd ..
          tree tmp
          soxi tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/test_wavs/*.wav
          ls -lh tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/test_wavs/*.wav
      - name: Run greedy search decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH  # prepend the repo root, keeping any existing entries
          cd egs/librispeech/ASR
          ./transducer_stateless/pretrained.py \
            --method greedy_search \
            --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/test_wavs/1221-135766-0002.wav
      - name: Run beam search decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
-          cd egs/librispeech/ASR
+          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
-          ./transducer_stateless/pretrained.py \
+          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
-            --method beam_search \
+
-            --beam-size 4 \
+          .github/scripts/run-pre-trained-transducer-stateless.sh
-            --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/exp/pretrained.pt \
+
-            --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/data/lang_bpe_500/bpe.model \
+      - name: Display decoding results for transducer_stateless
-            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/test_wavs/1089-134686-0001.wav \
+        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
-            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/test_wavs/1221-135766-0001.wav \
+        shell: bash
-            ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2021-12-27/test_wavs/1221-135766-0002.wav
+        run: |
          cd egs/librispeech/ASR/
          tree ./transducer_stateless/exp
          cd transducer_stateless
          echo "results for transducer_stateless"
          echo "===greedy search==="
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===fast_beam_search==="
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
          echo "===modified beam search==="
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
      - name: Upload decoding results for transducer_stateless
        uses: actions/upload-artifact@v2
        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
        with:
          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless-2022-02-07
          path: egs/librispeech/ASR/transducer_stateless/exp/
--- a/.github/workflows/run-pretrained-transducer.yml
+++ b/.github/workflows/run-pretrained-transducer.yml
@ -31,9 +31,6 @@ jobs:
      matrix:
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8, 3.9]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
@ -43,67 +40,37 @@ jobs:
          fetch-depth: 0
      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install Python dependencies
        run: |
-          python3 -m pip install --upgrade pip pytest
+          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
-          # numpy 1.20.x does not support python 3.6
+          pip uninstall -y protobuf
-          pip install numpy==1.19
+          pip install --no-binary protobuf protobuf
          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
-          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
+      - name: Cache kaldifeat
-          python3 -m pip install kaldifeat
+        id: my-cache
-          # We are in ./icefall and there is a file: requirements.txt in it
+        uses: actions/cache@v2
-          pip install -r requirements.txt
+        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}
-      - name: Install graphviz
+      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
        shell: bash
        run: |
-          python3 -m pip install -qq graphviz
+          make -j2 _kaldifeat
          sudo apt-get -qq install graphviz
-      - name: Download pre-trained model
+      - name: Inference with pre-trained model
        shell: bash
        run: |
          sudo apt-get -qq install git-lfs tree sox
          cd egs/librispeech/ASR
          mkdir tmp
          cd tmp
          git lfs install
          git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-bpe-500-2021-12-23
          cd ..
          tree tmp
          soxi tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/*.wav
          ls -lh tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/*.wav
      - name: Run greedy search decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH  # prepend the repo root, keeping any existing entries
          cd egs/librispeech/ASR
          ./transducer/pretrained.py \
            --method greedy_search \
            --checkpoint ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1221-135766-0002.wav
      - name: Run beam search decoding
        shell: bash
        run: |
          export PYTHONPATH=$PWD:$PYTHONPATH
-          cd egs/librispeech/ASR
+          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
-          ./transducer/pretrained.py \
+          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
-            --method beam_search \
+          .github/scripts/run-pre-trained-transducer.sh
            --beam-size 4 \
            --checkpoint ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/exp/pretrained.pt \
            --bpe-model ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/data/lang_bpe_500/bpe.model \
            ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1089-134686-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1221-135766-0001.wav \
            ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1221-135766-0002.wav
--- a/.github/workflows/run-yesno-recipe.yml
+++ b/.github/workflows/run-yesno-recipe.yml
@ -33,9 +33,6 @@ jobs:
        # TODO: enable macOS for CPU testing
        os: [ubuntu-18.04]
        python-version: [3.8]
        torch: ["1.10.0"]
        torchaudio: ["0.10.0"]
        k2-version: ["1.9.dev20211101"]
      fail-fast: false
    steps:
@ -43,10 +40,17 @@ jobs:
        with:
          fetch-depth: 0
      - name: Install graphviz
        shell: bash
        run: |
          sudo apt-get -qq install graphviz
      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'
      - name: Install libsndfile and libsox
        if: startsWith(matrix.os, 'ubuntu')
@ -57,13 +61,9 @@ jobs:
      - name: Install Python dependencies
        run: |
-          python3 -m pip install -U pip
+          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
-          pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+          pip uninstall -y protobuf
-          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
+          pip install --no-binary protobuf protobuf
          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
          # We are in ./icefall and there is a file: requirements.txt in it
          python3 -m pip install -r requirements.txt
      - name: Run yesno recipe
        shell: bash
--- a/.github/workflows/style_check.yml
+++ b/.github/workflows/style_check.yml
@ -29,7 +29,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
-        os: [ubuntu-18.04, macos-10.15]
+        os: [ubuntu-18.04, macos-latest]
        python-version: [3.7, 3.9]
      fail-fast: false
@ -45,7 +45,9 @@ jobs:
      - name: Install Python dependencies
        run: |
-          python3 -m pip install --upgrade pip black==21.6b0 flake8==3.9.2
+          python3 -m pip install --upgrade pip black==21.6b0 flake8==3.9.2 click==8.0.4
          # See https://github.com/psf/black/issues/2964
          # The version of click should be selected from 8.0.0, 8.0.1, 8.0.2, 8.0.3, and 8.0.4
      - name: Run flake8
        shell: bash
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -33,13 +33,13 @@ jobs:
        # disable macOS test for now.
        os: [ubuntu-18.04]
        python-version: [3.7, 3.8]
-        torch: ["1.8.0", "1.10.0"]
+        torch: ["1.8.0", "1.11.0"]
-        torchaudio: ["0.8.0", "0.10.0"]
+        torchaudio: ["0.8.0", "0.11.0"]
-        k2-version: ["1.9.dev20211101"]
+        k2-version: ["1.15.1.dev20220427"]
        exclude:
          - torch: "1.8.0"
-            torchaudio: "0.10.0"
+            torchaudio: "0.11.0"
-          - torch: "1.10.0"
+          - torch: "1.11.0"
            torchaudio: "0.8.0"
      fail-fast: false
@ -67,7 +67,7 @@ jobs:
          # numpy 1.20.x does not support python 3.6
          pip install numpy==1.19
          pip install torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
-          if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then
+          if [[ ${{ matrix.torchaudio }} == "0.11.0" ]]; then
            pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
          else
            pip install torchaudio==${{ matrix.torchaudio }}
@ -76,6 +76,9 @@ jobs:
          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
          pip install git+https://github.com/lhotse-speech/lhotse
          # icefall requirements
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf
          pip install -r requirements.txt
      - name: Install graphviz
@ -103,11 +106,26 @@ jobs:
          cd egs/librispeech/ASR/conformer_ctc
          pytest -v -s
          cd ../pruned_transducer_stateless
          pytest -v -s
          cd ../pruned_transducer_stateless2
          pytest -v -s
          cd ../pruned_transducer_stateless3
          pytest -v -s
          cd ../pruned_transducer_stateless4
          pytest -v -s
          cd ../transducer_stateless
          pytest -v -s
          if [[ ${{ matrix.torchaudio }} == "0.11.0" ]]; then  # must name the newest torchaudio in the matrix, else this group never runs
            cd ../transducer
            pytest -v -s
-            cd ../transducer_stateless
+            cd ../transducer_stateless2
            pytest -v -s
            cd ../transducer_lstm
@ -128,13 +146,28 @@ jobs:
          cd egs/librispeech/ASR/conformer_ctc
          pytest -v -s
-          if [[ ${{ matrix.torchaudio }} == "0.10.0" ]]; then
+          cd ../pruned_transducer_stateless
-            cd ../transducer
+          pytest -v -s
          cd ../pruned_transducer_stateless2
          pytest -v -s
          cd ../pruned_transducer_stateless3
          pytest -v -s
          cd ../pruned_transducer_stateless4
          pytest -v -s
          cd ../transducer_stateless
          pytest -v -s
          if [[ ${{ matrix.torchaudio }} == "0.11.0" ]]; then  # must name the newest torchaudio in the matrix, else this group never runs
            cd ../transducer
            pytest -v -s
            cd ../transducer_stateless2
            pytest -v -s
            cd ../transducer_lstm
            pytest -v -s
          fi
--- a/.gitignore
+++ b/.gitignore
@ -6,6 +6,8 @@ exp
 exp*/
 *.pt
 download
 dask-worker-space
 log
 *.bak
 *-bak
 *bak.py
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -4,6 +4,8 @@ repos:
    hooks:
      - id: black
        args: [--line-length=80]
        additional_dependencies: ['click==8.0.1']
        exclude: icefall\/__init__\.py
  - repo: https://github.com/PyCQA/flake8
    rev: 3.9.2
--- a/README.md
+++ b/README.md
@ -2,6 +2,18 @@
 <img src="https://raw.githubusercontent.com/k2-fsa/icefall/master/docs/source/_static/logo.png" width=168>
 </div>
 ## Introduction
 icefall contains ASR recipes for various datasets
 using <https://github.com/k2-fsa/k2>.
 You can use <https://github.com/k2-fsa/sherpa> to deploy models
 trained with icefall.
 You can try pre-trained models from within your browser without the need
 to download or install anything by visiting <https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>
 See <https://k2-fsa.github.io/icefall/huggingface/spaces.html> for more details.
 ## Installation
 Please refer to <https://icefall.readthedocs.io/en/latest/installation/index.html>
@ -12,12 +24,19 @@ for installation.
 Please refer to <https://icefall.readthedocs.io/en/latest/recipes/index.html>
 for more information.
-We provide four recipes at present:
+We provide the following recipes:
  - [yesno][yesno]
  - [LibriSpeech][librispeech]
  - [Aishell][aishell]
  - [TIMIT][timit]
  - [TED-LIUM3][tedlium3]
  - [GigaSpeech][gigaspeech]
  - [Aidatatang_200zh][aidatatang_200zh]
  - [WenetSpeech][wenetspeech]
  - [Alimeeting][alimeeting]
  - [Aishell4][aishell4]
  - [TAL_CSASR][tal_csasr]
 ### yesno
@ -34,6 +53,9 @@ We do provide a Colab notebook for this recipe.
 ### LibriSpeech
 Please see <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>
 for the **latest** results.
 We provide 4 models for this recipe:
 - [conformer CTC model][LibriSpeech_conformer_ctc]
@ -80,16 +102,30 @@ We provide a Colab notebook to run a pre-trained RNN-T conformer model: [![Open
 Using Conformer as encoder. The decoder consists of 1 embedding layer
 and 1 convolutional layer.
-The best WER using beam search with beam size 4 is:
+The best WER using modified beam search with beam size 4 is:
 |     | test-clean | test-other |
 |-----|------------|------------|
-| WER | 2.83       | 7.19       |
+| WER | 2.56       | 6.27       |
 Note: No auxiliary losses are used in the training and no LMs are used
 in the decoding.
-We provide a Colab notebook to run a pre-trained transducer conformer + stateless decoder model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Lm37sNajIpkV4HTzMDF7sn9l0JpfmekN?usp=sharing)
+We provide a Colab notebook to run a pre-trained transducer conformer + stateless decoder model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CO1bXJ-2khDckZIW8zjOPHGSKLHpTDlp?usp=sharing)
 #### k2 pruned RNN-T
 |     | test-clean | test-other |
 |-----|------------|------------|
 | WER | 2.57       | 5.95       |
 #### k2 pruned RNN-T + GigaSpeech
 |     | test-clean | test-other |
 |-----|------------|------------|
 | WER | 2.00       | 4.63       |
 ### Aishell
@ -105,7 +141,7 @@ The best CER we currently have is:
 | CER | 4.26 |
-We provide a Colab notebook to run a pre-trained conformer CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1WnG17io5HEZ0Gn_cnh_VzK5QYOoiiklC?usp=sharing)
+We provide a Colab notebook to run a pre-trained conformer CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1WnG17io5HEZ0Gn_cnh_VzK5QYOoiiklC?usp=sharing)
 #### Transducer Stateless Model
@ -113,7 +149,7 @@ The best CER we currently have is:
 |     | test |
 |-----|------|
-| CER | 5.7 |
+| CER | 4.68 |
 We provide a Colab notebook to run a pre-trained TransducerStateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14XaT2MhnBkK-3_RqqWq3K90Xlbin-GZC?usp=sharing)
@ -153,6 +189,130 @@ The PER for this model is:
 We provide a Colab notebook to run a pre-trained TDNN LiGRU CTC model:  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/11IT-k4HQIgQngXz1uvWsEYktjqQt7Tmb?usp=sharing)
 ### TED-LIUM3
 We provide two models for this recipe: [Transducer Stateless: Conformer encoder + Embedding decoder][TED-LIUM3_transducer_stateless] and [Pruned Transducer Stateless: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][TED-LIUM3_pruned_transducer_stateless].
 #### Transducer Stateless:  Conformer encoder + Embedding decoder
 The best WER using modified beam search with beam size 4 is:
 |     |  dev  |  test  |
 |-----|-------|--------|
 | WER |  6.91 |  6.33  |
 Note: No auxiliary losses are used in the training and no LMs are used in the decoding.
 We provide a Colab notebook to run a pre-trained Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1MmY5bBxwvKLNT4A2DJnwiqRXhdchUqPN?usp=sharing)
 #### Pruned Transducer Stateless: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
 The best WER using modified beam search with beam size 4 is:
 |     |  dev  |  test  |
 |-----|-------|--------|
 | WER |  6.77 |  6.14  |
 We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1je_1zGrOkGVVd4WLzgkXRHxl-I27yWtz?usp=sharing)
 ### GigaSpeech
 We provide two models for this recipe: [Conformer CTC model][GigaSpeech_conformer_ctc]
 and [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][GigaSpeech_pruned_transducer_stateless2].
 #### Conformer CTC
 |     |  Dev  | Test  |
 |-----|-------|-------|
 | WER | 10.47 | 10.58 |
 #### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
 |                      |  Dev  | Test  |
 |----------------------|-------|-------|
 |    greedy search     | 10.51 | 10.73 |
 |   fast beam search   | 10.50 | 10.69 |
 | modified beam search | 10.40 | 10.51 |
 ### Aidatatang_200zh
 We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Aidatatang_200zh_pruned_transducer_stateless2].
 #### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
 |                      |  Dev  | Test  |
 |----------------------|-------|-------|
 |    greedy search     | 5.53  | 6.59  |
 |   fast beam search   | 5.30  | 6.34  |
 | modified beam search | 5.27  | 6.33  |
 We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wNSnSj3T5oOctbh5IGCa393gKOoQw2GH?usp=sharing)
 ### WenetSpeech
 We provide two models for this recipe: [Pruned stateless RNN-T_2: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless2] and [Pruned stateless RNN-T_5: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless5].
 #### Pruned stateless RNN-T_2: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with L subset, offline ASR)
 |                      |  Dev  | Test-Net | Test-Meeting |
 |----------------------|-------|----------|--------------|
 |    greedy search     | 7.80  |  8.75    |  13.49       |
 |   fast beam search   | 7.94  |  8.74    |  13.80       |
 | modified beam search | 7.76  |  8.71    |  13.41       |
 #### Pruned stateless RNN-T_5: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with L subset)
 **Streaming**:
 |                      |  Dev  | Test-Net | Test-Meeting |
 |----------------------|-------|----------|--------------|
 | greedy_search | 8.78 | 10.12 | 16.16 |
 | modified_beam_search | 8.53| 9.95 | 15.81 |
 | fast_beam_search| 9.01 | 10.47 | 16.28 |
 We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless2 model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1EV4e1CHa1GZgEF-bZgizqI9RyFFehIiN?usp=sharing)
 ### Alimeeting
 We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Alimeeting_pruned_transducer_stateless2].
 #### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with far subset)
 |                      |  Eval  | Test-Net |
 |----------------------|--------|----------|
 |    greedy search     | 31.77  |  34.66   |
 |   fast beam search   | 31.39  |  33.02   |
 | modified beam search | 30.38  |  34.25   |
 We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tKr3f0mL17uO_ljdHGKtR7HOmthYHwJG?usp=sharing)
 ### Aishell4
 We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Aishell4_pruned_transducer_stateless5].
 #### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with all subsets)
 The best CER(%) results:
 |                      |  test  |
 |----------------------|--------|
 |    greedy search     | 29.89  |
 |   fast beam search   | 28.91  |
 | modified beam search | 29.08  |
 We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)
 ### TAL_CSASR
 We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][TAL_CSASR_pruned_transducer_stateless5].
 #### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
 The best results for Chinese CER(%) and English WER(%) respectively (zh: Chinese, en: English):
 |decoding-method | dev | dev_zh | dev_en | test | test_zh | test_en |
 |--|--|--|--|--|--|--|
 |greedy_search| 7.30 | 6.48 | 19.19 |7.39| 6.66 | 19.13|
 |modified_beam_search| 7.15 | 6.35 | 18.95 | 7.22| 6.50 | 18.70 |
 |fast_beam_search| 7.18 | 6.39| 18.90 |  7.27| 6.55 | 18.77|
 We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1DmIx-NloI1CMU5GdZrlse7TRu4y3Dpf8?usp=sharing)
 ## Deployment with C++
@ -175,8 +335,25 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
 [Aishell_conformer_ctc]: egs/aishell/ASR/conformer_ctc
 [TIMIT_tdnn_lstm_ctc]: egs/timit/ASR/tdnn_lstm_ctc
 [TIMIT_tdnn_ligru_ctc]: egs/timit/ASR/tdnn_ligru_ctc
 [TED-LIUM3_transducer_stateless]: egs/tedlium3/ASR/transducer_stateless
 [TED-LIUM3_pruned_transducer_stateless]: egs/tedlium3/ASR/pruned_transducer_stateless
 [GigaSpeech_conformer_ctc]: egs/gigaspeech/ASR/conformer_ctc
 [GigaSpeech_pruned_transducer_stateless2]: egs/gigaspeech/ASR/pruned_transducer_stateless2
 [Aidatatang_200zh_pruned_transducer_stateless2]: egs/aidatatang_200zh/ASR/pruned_transducer_stateless2
 [WenetSpeech_pruned_transducer_stateless2]: egs/wenetspeech/ASR/pruned_transducer_stateless2
 [WenetSpeech_pruned_transducer_stateless5]: egs/wenetspeech/ASR/pruned_transducer_stateless5
 [Alimeeting_pruned_transducer_stateless2]: egs/alimeeting/ASR/pruned_transducer_stateless2
 [Aishell4_pruned_transducer_stateless5]: egs/aishell4/ASR/pruned_transducer_stateless5
 [TAL_CSASR_pruned_transducer_stateless5]: egs/tal_csasr/ASR/pruned_transducer_stateless5
 [yesno]: egs/yesno/ASR
 [librispeech]: egs/librispeech/ASR
 [aishell]: egs/aishell/ASR
 [timit]: egs/timit/ASR
 [tedlium3]: egs/tedlium3/ASR
 [gigaspeech]: egs/gigaspeech/ASR
 [aidatatang_200zh]: egs/aidatatang_200zh/ASR
 [wenetspeech]: egs/wenetspeech/ASR
 [alimeeting]: egs/alimeeting/ASR
 [aishell4]: egs/aishell4/ASR
 [tal_csasr]: egs/tal_csasr/ASR
 [k2]: https://github.com/k2-fsa/k2
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@ -1,2 +1,3 @@
 sphinx_rtd_theme
 sphinx
 sphinxcontrib-youtube==1.1.0
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -32,7 +32,9 @@ release = "0.1"
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
    "sphinx.ext.todo",
    "sphinx_rtd_theme",
    "sphinxcontrib.youtube",
 ]
 # Add any paths that contain templates here, relative to this directory.
@ -74,3 +76,5 @@ html_context = {
    "github_version": "master",
    "conf_py_path": "/icefall/docs/source/",
 }
 todo_include_todos = True
--- a/docs/source/huggingface/index.rst
+++ b/docs/source/huggingface/index.rst
@ -0,0 +1,13 @@
 Huggingface
 ===========
 This section describes how to find pre-trained models.
 It also demonstrates how to try them from within your browser
 without installing anything by using
 `Huggingface spaces <https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_.
 .. toctree::
   :maxdepth: 2
   pretrained-models
   spaces
--- a/docs/source/huggingface/pic/hugging-face-sherpa-2.png
+++ b/docs/source/huggingface/pic/hugging-face-sherpa-2.png
--- a/docs/source/huggingface/pic/hugging-face-sherpa-3.png
+++ b/docs/source/huggingface/pic/hugging-face-sherpa-3.png
--- a/docs/source/huggingface/pic/hugging-face-sherpa.png
+++ b/docs/source/huggingface/pic/hugging-face-sherpa.png
--- a/docs/source/huggingface/pretrained-models.rst
+++ b/docs/source/huggingface/pretrained-models.rst
@ -0,0 +1,17 @@
 Pre-trained models
 ==================
 We have uploaded pre-trained models for all recipes in ``icefall``
 to `<https://huggingface.co/>`_.
 You can find them by visiting the following link:
 `<https://huggingface.co/models?search=icefall>`_.
 You can also find links to pre-trained models for a specific recipe
 by looking at the corresponding ``RESULTS.md``. For instance:
  - `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
  - `<https://github.com/k2-fsa/icefall/blob/master/egs/aishell/ASR/RESULTS.md>`_
  - `<https://github.com/k2-fsa/icefall/blob/master/egs/gigaspeech/ASR/RESULTS.md>`_
  - `<https://github.com/k2-fsa/icefall/blob/master/egs/wenetspeech/ASR/RESULTS.md>`_
--- a/docs/source/huggingface/spaces.rst
+++ b/docs/source/huggingface/spaces.rst
@ -0,0 +1,65 @@
 Huggingface spaces
 ==================
 We have integrated the server framework
 `sherpa <http://github.com/k2-fsa/sherpa>`_
 with `Huggingface spaces <https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_
 so that you can try pre-trained models from within your browser
 without the need to download or install anything.
 All you need is a browser, which can be run on Windows, macOS, Linux, or even on your
 iPad and your phone.
 Start your browser and visit the following address:
 `<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_
 and you will see a page like the following screenshot:
 .. image:: ./pic/hugging-face-sherpa.png
   :alt: screenshot of `<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_
   :target: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition
 You can:
  1. Select a language for recognition. Currently, we provide pre-trained models
     from ``icefall`` for the following languages: ``Chinese``, ``English``, and
     ``Chinese+English``.
  2. After selecting the target language, you can select a pre-trained model
     corresponding to the language.
  3. Select the decoding method. Currently, it provides ``greedy search``
     and ``modified_beam_search``.
  4. If you selected ``modified_beam_search``, you can choose the number of
     active paths during the search.
  5. Either upload a file or record your speech for recognition.
  6. Click the button ``Submit for recognition``.
  7. Wait for a moment and you will get the recognition results.
 The following screenshot shows an example when selecting ``Chinese+English``:
 .. image:: ./pic/hugging-face-sherpa-3.png
   :alt: screenshot of `<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_
   :target: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition
 In the bottom part of the page, you can find a table of examples. You can click
 one of them and then click ``Submit for recognition``.
 .. image:: ./pic/hugging-face-sherpa-2.png
   :alt: screenshot of `<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_
   :target: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition
 YouTube Video
 -------------
 We provide the following YouTube video demonstrating how to use
 `<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>`_.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ElN3r9dkKE4
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -23,3 +23,4 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
   installation/index
   recipes/index
   contributing/index
   huggingface/index
--- a/docs/source/installation/images/README.md
+++ b/docs/source/installation/images/README.md
@ -0,0 +1,4 @@
 # Introduction
 <https://shields.io/> is used to generate files in this directory.
--- a/docs/source/installation/images/k2-gt-v1.9-blueviolet.svg
+++ b/docs/source/installation/images/k2-gt-v1.9-blueviolet.svg
@ -0,0 +1 @@
 <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="80" height="20" role="img" aria-label="k2: &gt;= v1.9"><title>k2: &gt;= v1.9</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="80" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="23" height="20" fill="#555"/><rect x="23" width="57" height="20" fill="blueviolet"/><rect width="80" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="125" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="130">k2</text><text x="125" y="140" transform="scale(.1)" fill="#fff" textLength="130">k2</text><text aria-hidden="true" x="505" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">&gt;= v1.9</text><text x="505" y="140" transform="scale(.1)" fill="#fff" textLength="470">&gt;= v1.9</text></g></svg>
--- a/docs/source/installation/images/k2-v1.9-blueviolet.svg
+++ b/docs/source/installation/images/k2-v1.9-blueviolet.svg
@ -1 +0,0 @@
 <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="58" height="20" role="img" aria-label="k2: v1.9"><title>k2: v1.9</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="58" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="23" height="20" fill="#555"/><rect x="23" width="35" height="20" fill="blueviolet"/><rect width="58" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="125" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="130">k2</text><text x="125" y="140" transform="scale(.1)" fill="#fff" textLength="130">k2</text><text aria-hidden="true" x="395" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="250">v1.9</text><text x="395" y="140" transform="scale(.1)" fill="#fff" textLength="250">v1.9</text></g></svg>
--- a/docs/source/installation/images/python-3.6_3.7_3.8_3.9-blue.svg
+++ b/docs/source/installation/images/python-3.6_3.7_3.8_3.9-blue.svg
@ -1 +0,0 @@
 <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="170" height="20" role="img" aria-label="python: 3.6 | 3.7 | 3.8 | 3.9"><title>python: 3.6 | 3.7 | 3.8 | 3.9</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="170" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="49" height="20" fill="#555"/><rect x="49" width="121" height="20" fill="#007ec6"/><rect width="170" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="255" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="390">python</text><text x="255" y="140" transform="scale(.1)" fill="#fff" textLength="390">python</text><text aria-hidden="true" x="1085" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="1110">3.6 | 3.7 | 3.8 | 3.9</text><text x="1085" y="140" transform="scale(.1)" fill="#fff" textLength="1110">3.6 | 3.7 | 3.8 | 3.9</text></g></svg>
--- a/docs/source/installation/images/python-gt-v3.6-blue.svg
+++ b/docs/source/installation/images/python-gt-v3.6-blue.svg
@ -0,0 +1 @@
 <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="98" height="20" role="img" aria-label="python: &gt;= 3.6"><title>python: &gt;= 3.6</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="98" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="49" height="20" fill="#555"/><rect x="49" width="49" height="20" fill="#007ec6"/><rect width="98" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="255" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="390">python</text><text x="255" y="140" transform="scale(.1)" fill="#fff" textLength="390">python</text><text aria-hidden="true" x="725" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="390">&gt;= 3.6</text><text x="725" y="140" transform="scale(.1)" fill="#fff" textLength="390">&gt;= 3.6</text></g></svg>
--- a/docs/source/installation/images/torch-1.6.0_1.7.0_1.7.1_1.8.0_1.8.1_1.9.0-green.svg
+++ b/docs/source/installation/images/torch-1.6.0_1.7.0_1.7.1_1.8.0_1.8.1_1.9.0-green.svg
@ -1 +0,0 @@
 <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="286" height="20" role="img" aria-label="torch: 1.6.0 | 1.7.0 | 1.7.1 | 1.8.0 | 1.8.1 | 1.9.0"><title>torch: 1.6.0 | 1.7.0 | 1.7.1 | 1.8.0 | 1.8.1 | 1.9.0</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="286" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="39" height="20" fill="#555"/><rect x="39" width="247" height="20" fill="#97ca00"/><rect width="286" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="205" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="290">torch</text><text x="205" y="140" transform="scale(.1)" fill="#fff" textLength="290">torch</text><text aria-hidden="true" x="1615" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="2370">1.6.0 | 1.7.0 | 1.7.1 | 1.8.0 | 1.8.1 | 1.9.0</text><text x="1615" y="140" transform="scale(.1)" fill="#fff" textLength="2370">1.6.0 | 1.7.0 | 1.7.1 | 1.8.0 | 1.8.1 | 1.9.0</text></g></svg>
--- a/docs/source/installation/images/torch-gt-v1.6.0-green.svg
+++ b/docs/source/installation/images/torch-gt-v1.6.0-green.svg
@ -0,0 +1 @@
 <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="100" height="20" role="img" aria-label="torch: &gt;= 1.6.0"><title>torch: &gt;= 1.6.0</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="100" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="39" height="20" fill="#555"/><rect x="39" width="61" height="20" fill="#97ca00"/><rect width="100" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="205" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="290">torch</text><text x="205" y="140" transform="scale(.1)" fill="#fff" textLength="290">torch</text><text aria-hidden="true" x="685" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="510">&gt;= 1.6.0</text><text x="685" y="140" transform="scale(.1)" fill="#fff" textLength="510">&gt;= 1.6.0</text></g></svg>
--- a/docs/source/installation/index.rst
+++ b/docs/source/installation/index.rst
@ -15,21 +15,33 @@ Installation
 .. |device| image:: ./images/device-CPU_CUDA-orange.svg
  :alt: Supported devices
-.. |python_versions| image:: ./images/python-3.6_3.7_3.8_3.9-blue.svg
+.. |python_versions| image:: ./images/python-gt-v3.6-blue.svg
  :alt: Supported python versions
-.. |torch_versions| image:: ./images/torch-1.6.0_1.7.0_1.7.1_1.8.0_1.8.1_1.9.0-green.svg
+.. |torch_versions| image:: ./images/torch-gt-v1.6.0-green.svg
  :alt: Supported PyTorch versions
-.. |k2_versions| image:: ./images/k2-v1.9-blueviolet.svg
+.. |k2_versions| image:: ./images/k2-gt-v1.9-blueviolet.svg
  :alt: Supported k2 versions
 ``icefall`` depends on `k2 <https://github.com/k2-fsa/k2>`_ and
 `lhotse <https://github.com/lhotse-speech/lhotse>`_.
-We recommend you to install ``k2`` first, as ``k2`` is bound to
+We recommend that you use the following steps to install the dependencies.
-a specific version of PyTorch after compilation. Install ``k2`` also
+
-installs its dependency PyTorch, which can be reused by ``lhotse``.
+- (0) Install PyTorch and torchaudio
 - (1) Install k2
 - (2) Install lhotse
 .. caution::
  Installation order matters.
 (0) Install PyTorch and torchaudio
 ----------------------------------
 Please refer to `<https://pytorch.org/>`_ to install PyTorch
 and torchaudio.
 (1) Install k2
@ -54,14 +66,15 @@ to install ``k2``.
 Please refer to `<https://lhotse.readthedocs.io/en/latest/getting-started.html#installation>`_
 to install ``lhotse``.
 .. HINT::
-  Install ``lhotse`` also installs its dependency `torchaudio <https://github.com/pytorch/audio>`_.
+.. hint::
-.. CAUTION::
+    We strongly recommend that you use::
      pip install git+https://github.com/lhotse-speech/lhotse
    to install the latest version of lhotse.
  If you have installed ``torchaudio``, please consider uninstalling it before
  installing ``lhotse``. Otherwise, it may update your already installed PyTorch.
 (3) Download icefall
 --------------------
@ -461,3 +474,19 @@ The decoding log is:
 **Congratulations!** You have successfully set up the environment and have run the first recipe in ``icefall``.
 Have fun with ``icefall``!
 YouTube Video
 -------------
 We provide the following YouTube video showing how to install ``icefall``.
 It also shows how to debug various problems that you may encounter while
 using ``icefall``.
 .. note::
   To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: LVmrBD0tLfE
--- a/docs/source/recipes/aishell.rst
+++ b/docs/source/recipes/aishell.rst
@ -1,10 +0,0 @@
 Aishell
 =======
 We provide the following models for the Aishell dataset:
 .. toctree::
   :maxdepth: 2
   aishell/conformer_ctc
   aishell/tdnn_lstm_ctc
--- a/docs/source/recipes/aishell/conformer_ctc.rst
+++ b/docs/source/recipes/aishell/conformer_ctc.rst
@ -1,4 +1,4 @@
-Confromer CTC
+Conformer CTC
 =============
 This tutorial shows you how to run a conformer ctc model
--- a/docs/source/recipes/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png
+++ b/docs/source/recipes/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png
--- a/docs/source/recipes/aishell/index.rst
+++ b/docs/source/recipes/aishell/index.rst
@ -0,0 +1,22 @@
 aishell
 =======
 Aishell is an open-source Chinese Mandarin speech corpus published by Beijing
 Shell Shell Technology Co.,Ltd.
 400 people from different accent areas in China are invited to participate in
 the recording, which is conducted in a quiet indoor environment using high
 fidelity microphone and downsampled to 16kHz. The manual transcription accuracy
 is above 95%, through professional speech annotation and strict quality
 inspection. The data is free for academic use. We hope to provide moderate
 amount of data for new researchers in the field of speech recognition.
 It can be downloaded from `<https://www.openslr.org/33/>`_
 .. toctree::
   :maxdepth: 1
   tdnn_lstm_ctc
   conformer_ctc
   stateless_transducer
--- a/docs/source/recipes/aishell/stateless_transducer.rst
+++ b/docs/source/recipes/aishell/stateless_transducer.rst
@ -0,0 +1,714 @@
 Stateless Transducer
 ====================
 This tutorial shows you how to do transducer training in ``icefall``.
 .. HINT::
  Instead of using RNN-T or RNN transducer, we only use transducer
  here. As you will see, there are no RNNs in the model.
 .. HINT::
  We assume you have read the page :ref:`install icefall` and have set up
  the environment for ``icefall``.
 .. HINT::
  We recommend you to use a GPU or several GPUs to run this recipe.
 In this tutorial, you will learn:
  - (1) What does the transducer model look like
  - (2) How to prepare data for training and decoding
  - (3) How to start the training, either with a single GPU or with multiple GPUs
  - (4) How to do decoding after training, with greedy search, beam search, and **modified beam search**
  - (5) How to use a pre-trained model provided by us to transcribe sound files
 The Model
 ---------
 The transducer model consists of 3 parts:
 - **Encoder**: It is a conformer encoder with the following parameters
    - Number of heads: 8
    - Attention dim: 512
    - Number of layers: 12
    - Feedforward dim: 2048
 - **Decoder**: We use a stateless model consisting of:
    - An embedding layer with embedding dim 512
    - A Conv1d layer with a default kernel size 2 (i.e. it sees 2
      symbols of left-context by default)
 - **Joiner**: It consists of a ``nn.Tanh()`` and a ``nn.Linear()``.
 .. Caution::
  The decoder is stateless and very simple. It is borrowed from
  `<https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9054419>`_
  (Rnn-Transducer with Stateless Prediction Network)
  We make one modification to it: Place a Conv1d layer right after
  the embedding layer.
 When using Chinese characters as modelling unit, whose vocabulary size
 is 4336 in this specific dataset,
 the number of parameters of the model is ``87939824``, i.e., about ``88 M``.
 The Loss
 --------
 We are using `<https://github.com/csukuangfj/optimized_transducer>`_
 to compute the transducer loss, which removes extra paddings
 in loss computation to save memory.
 .. Hint::
  ``optimized_transducer`` implements the techniques proposed
  in `Improving RNN Transducer Modeling for End-to-End Speech Recognition <https://arxiv.org/abs/1909.12415>`_ to save memory.
  Furthermore, it supports ``modified transducer``, limiting the maximum
  number of symbols that can be emitted per frame to 1, which simplifies
  the decoding process significantly. Also, the experiment results
  show that it does not degrade the performance.
  See `<https://github.com/csukuangfj/optimized_transducer#modified-transducer>`_
  for what exactly modified transducer is.
  `<https://github.com/csukuangfj/transducer-loss-benchmarking>`_ shows that
  in the unpruned case ``optimized_transducer`` has an advantage in minimizing
  memory usage.
 .. todo::
  Add tutorial about ``pruned_transducer_stateless`` that uses k2
  pruned transducer loss.
 .. hint::
  You can use::
    pip install optimized_transducer
  to install ``optimized_transducer``. Refer to
  `<https://github.com/csukuangfj/optimized_transducer>`_ for other
  alternatives.
 Data Preparation
 ----------------
 To prepare the data for training, please use the following commands:
 .. code-block:: bash
  cd egs/aishell/ASR
  ./prepare.sh --stop-stage 4
  ./prepare.sh --stage 6 --stop-stage 6
 .. note::
  You can use ``./prepare.sh``, though it will generate FSTs that
  are not used in transducer training.
 When you finish running the script, you will get the following two folders:
  - ``data/fbank``: It saves the pre-computed features
  - ``data/lang_char``: It contains tokens that will be used in the training
 Training
 --------
 .. code-block:: bash
  cd egs/aishell/ASR
  ./transducer_stateless_modified/train.py --help
 shows you the training options that can be passed from the commandline.
 The following options are used quite often:
  - ``--exp-dir``
    The experiment folder to save logs and model checkpoints,
    defaults to ``./transducer_stateless_modified/exp``.
  - ``--num-epochs``
    It is the number of epochs to train. For instance,
    ``./transducer_stateless_modified/train.py --num-epochs 30`` trains for 30
    epochs and generates ``epoch-0.pt``, ``epoch-1.pt``, ..., ``epoch-29.pt``
    in the folder set by ``--exp-dir``.
  - ``--start-epoch``
    It's used to resume training.
    ``./transducer_stateless_modified/train.py --start-epoch 10`` loads the
    checkpoint from ``exp_dir/epoch-9.pt`` and starts
    training from epoch 10, based on the state from epoch 9.
  - ``--world-size``
    It is used for single-machine multi-GPU DDP training.
      - (a) If it is 1, then no DDP training is used.
      - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
    The following shows some use cases with it.
      **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
      GPU 2 for training. You can do the following:
        .. code-block:: bash
          $ cd egs/aishell/ASR
          $ export CUDA_VISIBLE_DEVICES="0,2"
          $ ./transducer_stateless_modified/train.py --world-size 2
      **Use case 2**: You have 4 GPUs and you want to use all of them
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/aishell/ASR
          $ ./transducer_stateless_modified/train.py --world-size 4
      **Use case 3**: You have 4 GPUs but you only want to use GPU 3
      for training. You can do the following:
        .. code-block:: bash
          $ cd egs/aishell/ASR
          $ export CUDA_VISIBLE_DEVICES="3"
          $ ./transducer_stateless_modified/train.py --world-size 1
    .. CAUTION::
      Only single-machine multi-GPU DDP training is implemented at present.
      There is an on-going PR `<https://github.com/k2-fsa/icefall/pull/63>`_
      that adds support for multi-machine multi-GPU DDP training.
  - ``--max-duration``
    It specifies the number of seconds over all utterances in a
    batch **before padding**.
    If you encounter CUDA OOM, please reduce it. For instance, if
    you are using an NVIDIA V100 GPU with 32 GB RAM, we recommend you
    to set it to ``300`` when the vocabulary size is 500.
    .. HINT::
      Due to padding, the number of seconds of all utterances in a
      batch will usually be larger than ``--max-duration``.
      A larger value for ``--max-duration`` may cause OOM during training,
      while a smaller value may increase the training time. You have to
      tune it.
  - ``--lr-factor``
    It controls the learning rate. If you use a single GPU for training, you
    may want to use a small value for it. If you use multiple GPUs for training,
    you may increase it.
  - ``--context-size``
    It specifies the kernel size in the decoder. The default value 2 means it
    functions as a tri-gram LM.
  - ``--modified-transducer-prob``
    It specifies the probability to use modified transducer loss.
    If it is 0, then no modified transducer is used; if it is 1,
    then it uses modified transducer loss for all batches. If it is
    ``p``, it applies modified transducer with probability ``p``.
 There are some training options, e.g.,
 number of warmup steps,
 that are not passed from the commandline.
 They are pre-configured by the function ``get_params()`` in
 `transducer_stateless_modified/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/aishell/ASR/transducer_stateless_modified/train.py#L162>`_
 If you need to change them, please modify ``./transducer_stateless_modified/train.py`` directly.
 .. CAUTION::
  The training set is perturbed by speed with two factors: 0.9 and 1.1.
  Each epoch actually processes ``3x150 == 450`` hours of data.
 Training logs
 ~~~~~~~~~~~~~
 Training logs and checkpoints are saved in the folder set by ``--exp-dir``
 (defaults to ``transducer_stateless_modified/exp``). You will find the following files in that directory:
  - ``epoch-0.pt``, ``epoch-1.pt``, ...
    These are checkpoint files, containing model ``state_dict`` and optimizer ``state_dict``.
    To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
      .. code-block:: bash
        $ ./transducer_stateless_modified/train.py --start-epoch 11
  - ``tensorboard/``
    This folder contains TensorBoard logs. Training loss, validation loss, learning
    rate, etc, are recorded in these logs. You can visualize them by:
      .. code-block:: bash
        $ cd transducer_stateless_modified/exp/tensorboard
        $ tensorboard dev upload --logdir . --name "Aishell transducer training with icefall" --description "Training modified transducer, see https://github.com/k2-fsa/icefall/pull/219"
    It will print something like below:
      .. code-block::
        TensorFlow installation not found - running with reduced feature set.
        Upload started and will continue reading any new data as it's added to the logdir.
        To stop uploading, press Ctrl-C.
        New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/laGZ6HrcQxOigbFD5E0Y3Q/
        [2022-03-03T14:29:45] Started scanning logdir.
        [2022-03-03T14:29:48] Total uploaded: 8477 scalars, 0 tensors, 0 binary objects
        Listening for new data in logdir...
    Note there is a `URL <https://tensorboard.dev/experiment/laGZ6HrcQxOigbFD5E0Y3Q/>`_ in the
    above output, click it and you will see the following screenshot:
      .. figure:: images/aishell-transducer_stateless_modified-tensorboard-log.png
         :width: 600
         :alt: TensorBoard screenshot
         :align: center
         :target: https://tensorboard.dev/experiment/laGZ6HrcQxOigbFD5E0Y3Q
         TensorBoard screenshot.
  - ``log/log-train-xxxx``
    It is the detailed training log in text format, same as the one
    you saw printed to the console during training.
 Usage examples
 ~~~~~~~~~~~~~~
 The following shows typical use cases:
 **Case 1**
 ^^^^^^^^^^
 .. code-block:: bash
  $ cd egs/aishell/ASR
  $ ./transducer_stateless_modified/train.py --max-duration 250
 It uses ``--max-duration`` of 250 to avoid OOM.
 **Case 2**
 ^^^^^^^^^^
 .. code-block:: bash
  $ cd egs/aishell/ASR
  $ export CUDA_VISIBLE_DEVICES="0,3"
  $ ./transducer_stateless_modified/train.py --world-size 2
 It uses GPU 0 and GPU 3 for DDP training.
 **Case 3**
 ^^^^^^^^^^
 .. code-block:: bash
  $ cd egs/aishell/ASR
  $ ./transducer_stateless_modified/train.py --num-epochs 10 --start-epoch 3
 It loads checkpoint ``./transducer_stateless_modified/exp/epoch-2.pt`` and starts
 training from epoch 3. Also, it trains for 10 epochs.
 Decoding
 --------
 The decoding part uses checkpoints saved by the training part, so you have
 to run the training part first.
 .. code-block:: bash
  $ cd egs/aishell/ASR
  $ ./transducer_stateless_modified/decode.py --help
 shows the options for decoding.
 The commonly used options are:
  - ``--method``
    This specifies the decoding method. Currently, it supports:
      - **greedy_search**. You can provide the commandline option ``--max-sym-per-frame``
        to limit the maximum number of symbols that can be emitted per frame.
      - **beam_search**. You can provide the commandline option ``--beam-size``.
      - **modified_beam_search**. You can also provide the commandline option ``--beam-size``.
        To use this method, we assume that you have trained your model with modified transducer,
        i.e., used the option ``--modified-transducer-prob`` in the training.
    The following command uses greedy search for decoding
    .. code-block::
      $ cd egs/aishell/ASR
      $ ./transducer_stateless_modified/decode.py \
              --epoch 64 \
              --avg 33 \
              --exp-dir ./transducer_stateless_modified/exp \
              --max-duration 100 \
              --decoding-method greedy_search \
              --max-sym-per-frame 1
    The following command uses beam search for decoding
    .. code-block::
      $ cd egs/aishell/ASR
      $ ./transducer_stateless_modified/decode.py \
              --epoch 64 \
              --avg 33 \
              --exp-dir ./transducer_stateless_modified/exp \
              --max-duration 100 \
              --decoding-method beam_search \
              --beam-size 4
    The following command uses ``modified`` beam search for decoding
    .. code-block::
      $ cd egs/aishell/ASR
      $ ./transducer_stateless_modified/decode.py \
              --epoch 64 \
              --avg 33 \
              --exp-dir ./transducer_stateless_modified/exp \
              --max-duration 100 \
              --decoding-method modified_beam_search \
              --beam-size 4
  - ``--max-duration``
    It has the same meaning as the one used in training. A larger
    value may cause OOM.
  - ``--epoch``
    It specifies which epoch's checkpoint should be used for decoding.
  - ``--avg``
    It specifies the number of models to average. For instance, if it is 3 and if
    ``--epoch=10``, then it averages the checkpoints ``epoch-8.pt``, ``epoch-9.pt``,
    and ``epoch-10.pt`` and the averaged checkpoint is used for decoding.
 After decoding, you can find the decoding logs and results in ``exp_dir/log/<decoding_method>``, e.g.,
 ``exp_dir/log/greedy_search``.
 Pre-trained Model
 -----------------
 We have uploaded a pre-trained model to
 `<https://huggingface.co/csukuangfj/icefall-aishell-transducer-stateless-modified-2022-03-01>`_
 We describe how to use the pre-trained model to transcribe a sound file or
 multiple sound files in the following.
 Install kaldifeat
 ~~~~~~~~~~~~~~~~~
 `kaldifeat <https://github.com/csukuangfj/kaldifeat>`_ is used to
 extract features for a single sound file or multiple sound files
 at the same time.
 Please refer to `<https://github.com/csukuangfj/kaldifeat>`_ for installation.
 Download the pre-trained model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 The following commands describe how to download the pre-trained model:
 .. code-block::
  $ cd egs/aishell/ASR
  $ mkdir tmp
  $ cd tmp
  $ git lfs install
  $ git clone https://huggingface.co/csukuangfj/icefall-aishell-transducer-stateless-modified-2022-03-01
 .. CAUTION::
  You have to use ``git lfs`` to download the pre-trained model.
 After downloading, you will have the following files:
 .. code-block:: bash
  $ cd egs/aishell/ASR
  $ tree tmp/icefall-aishell-transducer-stateless-modified-2022-03-01
 .. code-block:: bash
  tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/
  |-- README.md
  |-- data
  |   `-- lang_char
  |       |-- L.pt
  |       |-- lexicon.txt
  |       |-- tokens.txt
  |       `-- words.txt
  |-- exp
  |   `-- pretrained.pt
  |-- log
  |   |-- errs-test-beam_4-epoch-64-avg-33-beam-4.txt
  |   |-- errs-test-greedy_search-epoch-64-avg-33-context-2-max-sym-per-frame-1.txt
  |   |-- log-decode-epoch-64-avg-33-beam-4-2022-03-02-12-05-03
  |   |-- log-decode-epoch-64-avg-33-context-2-max-sym-per-frame-1-2022-02-28-18-13-07
  |   |-- recogs-test-beam_4-epoch-64-avg-33-beam-4.txt
  |   `-- recogs-test-greedy_search-epoch-64-avg-33-context-2-max-sym-per-frame-1.txt
  `-- test_wavs
      |-- BAC009S0764W0121.wav
      |-- BAC009S0764W0122.wav
      |-- BAC009S0764W0123.wav
      `-- transcript.txt
  5 directories, 16 files
 **File descriptions**:
  - ``data/lang_char``
    It contains language related files. You can find the vocabulary size in ``tokens.txt``.
  - ``exp/pretrained.pt``
      It contains pre-trained model parameters, obtained by averaging
      checkpoints from ``epoch-32.pt`` to ``epoch-64.pt``.
      Note: We have removed optimizer ``state_dict`` to reduce file size.
  - ``log``
      It contains decoding logs and decoded results.
  - ``test_wavs``
      It contains some test sound files from Aishell ``test`` dataset.
 The information of the test sound files is listed below:
 .. code-block:: bash
  $ soxi tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/*.wav
  Input File     : 'tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav'
  Channels       : 1
  Sample Rate    : 16000
  Precision      : 16-bit
  Duration       : 00:00:04.20 = 67263 samples ~ 315.295 CDDA sectors
  File Size      : 135k
  Bit Rate       : 256k
  Sample Encoding: 16-bit Signed Integer PCM
  Input File     : 'tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav'
  Channels       : 1
  Sample Rate    : 16000
  Precision      : 16-bit
  Duration       : 00:00:04.12 = 65840 samples ~ 308.625 CDDA sectors
  File Size      : 132k
  Bit Rate       : 256k
  Sample Encoding: 16-bit Signed Integer PCM
  Input File     : 'tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'
  Channels       : 1
  Sample Rate    : 16000
  Precision      : 16-bit
  Duration       : 00:00:04.00 = 64000 samples ~ 300 CDDA sectors
  File Size      : 128k
  Bit Rate       : 256k
  Sample Encoding: 16-bit Signed Integer PCM
  Total Duration of 3 files: 00:00:12.32
 Usage
 ~~~~~
 .. code-block::
  $ cd egs/aishell/ASR
  $ ./transducer_stateless_modified/pretrained.py --help
 displays the help information.
 It supports three decoding methods:
  - greedy search
  - beam search
  - modified beam search
 .. note::
  In modified beam search, it limits the maximum number of symbols that can be
  emitted per frame to 1. To use this method, you have to ensure that your model
  has been trained with the option ``--modified-transducer-prob``. Otherwise,
  it may give you poor results.
 Greedy search
 ^^^^^^^^^^^^^
 The command to run greedy search is given below:
 .. code-block:: bash
  $ cd egs/aishell/ASR
  $ ./transducer_stateless_modified/pretrained.py \
      --checkpoint ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt \
      --lang-dir ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char \
      --method greedy_search \
      ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav \
      ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav \
      ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav
 The output is as follows:
 .. code-block::
  2022-03-03 15:35:26,531 INFO [pretrained.py:239] device: cuda:0
  2022-03-03 15:35:26,994 INFO [lexicon.py:176] Loading pre-compiled tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char/Linv.pt
  2022-03-03 15:35:27,027 INFO [pretrained.py:246] {'feature_dim': 80, 'encoder_out_dim': 512, 'subsampling_factor': 4, 'attention_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'vgg_frontend': False, 'env_info': {'k2-version': '1.13', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f4fefe4882bc0ae59af951da3f47335d5495ef71', 'k2-git-date': 'Thu Feb 10 15:16:02 2022', 'lhotse-version': '1.0.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '50d2281-clean', 'icefall-git-date': 'Wed Mar 2 16:02:38 2022', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-aishell', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-multi-datasets/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-aishell/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-2-0815224919-75d558775b-mmnv8', 'IP address': '10.177.72.138'}, 'sample_rate': 16000, 'checkpoint': './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt', 'lang_dir': PosixPath('tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char'), 'method': 'greedy_search', 'sound_files': ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'], 'beam_size': 4, 'context_size': 2, 'max_sym_per_frame': 3, 'blank_id': 0, 'vocab_size': 4336}
  2022-03-03 15:35:27,027 INFO [pretrained.py:248] About to create model
  2022-03-03 15:35:36,878 INFO [pretrained.py:257] Constructing Fbank computer
  2022-03-03 15:35:36,880 INFO [pretrained.py:267] Reading sound files: ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav']
  2022-03-03 15:35:36,891 INFO [pretrained.py:273] Decoding started
  /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/conformer.py:113: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
    lengths = ((x_lens - 1) // 2 - 1) // 2
  2022-03-03 15:35:37,163 INFO [pretrained.py:320]
  ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav:
  甚 至 出 现 交 易 几 乎 停 滞 的 情 况
  ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav:
  一 二 线 城 市 虽 然 也 处 于 调 整 中
  ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav:
  但 因 为 聚 集 了 过 多 公 共 资 源
  2022-03-03 15:35:37,163 INFO [pretrained.py:322] Decoding Done
 Beam search
 ^^^^^^^^^^^
 The command to run beam search is given below:
 .. code-block:: bash
  $ cd egs/aishell/ASR
  $ ./transducer_stateless_modified/pretrained.py \
      --checkpoint ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt \
      --lang-dir ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char \
      --method beam_search \
      --beam-size 4 \
      ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav \
      ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav \
      ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav
 The output is as follows:
 .. code-block::
  2022-03-03 15:39:09,285 INFO [pretrained.py:239] device: cuda:0
  2022-03-03 15:39:09,708 INFO [lexicon.py:176] Loading pre-compiled tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char/Linv.pt
  2022-03-03 15:39:09,759 INFO [pretrained.py:246] {'feature_dim': 80, 'encoder_out_dim': 512, 'subsampling_factor': 4, 'attention_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'vgg_frontend': False, 'env_info': {'k2-version': '1.13', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f4fefe4882bc0ae59af951da3f47335d5495ef71', 'k2-git-date': 'Thu Feb 10 15:16:02 2022', 'lhotse-version': '1.0.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '50d2281-clean', 'icefall-git-date': 'Wed Mar 2 16:02:38 2022', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-aishell', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-multi-datasets/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-aishell/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-2-0815224919-75d558775b-mmnv8', 'IP address': '10.177.72.138'}, 'sample_rate': 16000, 'checkpoint': './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt', 'lang_dir': PosixPath('tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char'), 'method': 'beam_search', 'sound_files': ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'], 'beam_size': 4, 'context_size': 2, 'max_sym_per_frame': 3, 'blank_id': 0, 'vocab_size': 4336}
  2022-03-03 15:39:09,760 INFO [pretrained.py:248] About to create model
  2022-03-03 15:39:18,919 INFO [pretrained.py:257] Constructing Fbank computer
  2022-03-03 15:39:18,922 INFO [pretrained.py:267] Reading sound files: ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav']
  2022-03-03 15:39:18,929 INFO [pretrained.py:273] Decoding started
  /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/conformer.py:113: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
    lengths = ((x_lens - 1) // 2 - 1) // 2
  2022-03-03 15:39:21,046 INFO [pretrained.py:320]
  ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav:
  甚 至 出 现 交 易 几 乎 停 滞 的 情 况
  ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav:
  一 二 线 城 市 虽 然 也 处 于 调 整 中
  ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav:
  但 因 为 聚 集 了 过 多 公 共 资 源
  2022-03-03 15:39:21,047 INFO [pretrained.py:322] Decoding Done
 Modified Beam search
 ^^^^^^^^^^^^^^^^^^^^
 The command to run modified beam search is given below:
 .. code-block:: bash
  $ cd egs/aishell/ASR
  $ ./transducer_stateless_modified/pretrained.py \
      --checkpoint ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt \
      --lang-dir ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char \
      --method modified_beam_search \
      --beam-size 4 \
      ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav \
      ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav \
      ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav
 The output is as follows:
 .. code-block::
  2022-03-03 15:41:23,319 INFO [pretrained.py:239] device: cuda:0
  2022-03-03 15:41:23,798 INFO [lexicon.py:176] Loading pre-compiled tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char/Linv.pt
  2022-03-03 15:41:23,831 INFO [pretrained.py:246] {'feature_dim': 80, 'encoder_out_dim': 512, 'subsampling_factor': 4, 'attention_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'vgg_frontend': False, 'env_info': {'k2-version': '1.13', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f4fefe4882bc0ae59af951da3f47335d5495ef71', 'k2-git-date': 'Thu Feb 10 15:16:02 2022', 'lhotse-version': '1.0.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '50d2281-clean', 'icefall-git-date': 'Wed Mar 2 16:02:38 2022', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-aishell', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-multi-datasets/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-aishell/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-2-0815224919-75d558775b-mmnv8', 'IP address': '10.177.72.138'}, 'sample_rate': 16000, 'checkpoint': './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt', 'lang_dir': PosixPath('tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char'), 'method': 'modified_beam_search', 'sound_files': ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'], 'beam_size': 4, 'context_size': 2, 'max_sym_per_frame': 3, 'blank_id': 0, 'vocab_size': 4336}
  2022-03-03 15:41:23,831 INFO [pretrained.py:248] About to create model
  2022-03-03 15:41:32,214 INFO [pretrained.py:257] Constructing Fbank computer
  2022-03-03 15:41:32,215 INFO [pretrained.py:267] Reading sound files: ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav']
  2022-03-03 15:41:32,220 INFO [pretrained.py:273] Decoding started
  /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/conformer.py:113: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
    lengths = ((x_lens - 1) // 2 - 1) // 2
  /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/beam_search.py:402: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
    topk_hyp_indexes = topk_indexes // logits.size(-1)
  2022-03-03 15:41:32,583 INFO [pretrained.py:320]
  ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav:
  甚 至 出 现 交 易 几 乎 停 滞 的 情 况
  ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav:
  一 二 线 城 市 虽 然 也 处 于 调 整 中
  ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav:
  但 因 为 聚 集 了 过 多 公 共 资 源
  2022-03-03 15:41:32,583 INFO [pretrained.py:322] Decoding Done
 Colab notebook
 --------------
 We provide a colab notebook for this recipe showing how to use a pre-trained model to
 transcribe sound files.
 |aishell asr stateless modified transducer colab notebook|
 .. |aishell asr stateless modified transducer colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
   :target: https://colab.research.google.com/drive/12jpTxJB44vzwtcmJl2DTdznW0OawPb9H?usp=sharing
--- a/docs/source/recipes/index.rst
+++ b/docs/source/recipes/index.rst
@ -10,12 +10,10 @@ We may add recipes for other tasks as well in the future.
 .. Other recipes are listed in alphabetical order.
 .. toctree::
-   :maxdepth: 3
+   :maxdepth: 2
   :caption: Table of Contents
-   yesno
+   aishell/index
-
+   librispeech/index
-   librispeech
+   timit/index
-
+   yesno/index
   aishell
   timit
--- a/docs/source/recipes/librispeech.rst
+++ b/docs/source/recipes/librispeech.rst
@ -1,10 +0,0 @@
 LibriSpeech
 ===========
 We provide the following models for the LibriSpeech dataset:
 .. toctree::
   :maxdepth: 2
   librispeech/tdnn_lstm_ctc
   librispeech/conformer_ctc
--- a/docs/source/recipes/librispeech/conformer_ctc.rst
+++ b/docs/source/recipes/librispeech/conformer_ctc.rst
@ -70,6 +70,17 @@ To run stage 2 to stage 5, use:
  All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
  are saved in ``./data`` directory.
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news about `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
--- a/docs/source/recipes/librispeech/index.rst
+++ b/docs/source/recipes/librispeech/index.rst
@ -0,0 +1,8 @@
 LibriSpeech
 ===========
 .. toctree::
   :maxdepth: 1
   tdnn_lstm_ctc
   conformer_ctc
--- a/docs/source/recipes/librispeech/tdnn_lstm_ctc.rst
+++ b/docs/source/recipes/librispeech/tdnn_lstm_ctc.rst
@ -45,6 +45,16 @@ To run stage 2 to stage 5, use:
  $ ./prepare.sh --stage 2 --stop-stage 5
 We provide the following YouTube video showing how to run ``./prepare.sh``.
 .. note::
   To get the latest news about `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
   the following YouTube channel by `Nadira Povey <https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_:
      `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
 ..  youtube:: ofEIoJL-mGM
 Training
 --------
--- a/docs/source/recipes/timit.rst
+++ b/docs/source/recipes/timit.rst
@ -1,10 +0,0 @@
 TIMIT
 ===========
 We provide the following models for the TIMIT dataset:
 .. toctree::
   :maxdepth: 2
   timit/tdnn_lstm_ctc
   timit/tdnn_ligru_ctc
--- a/docs/source/recipes/timit/index.rst
+++ b/docs/source/recipes/timit/index.rst
@ -0,0 +1,9 @@
 TIMIT
 =====
 .. toctree::
   :maxdepth: 1
   tdnn_ligru_ctc
   tdnn_lstm_ctc
--- a/docs/source/recipes/timit/tdnn_ligru_ctc.rst
+++ b/docs/source/recipes/timit/tdnn_ligru_ctc.rst
@ -1,5 +1,5 @@
 TDNN-LiGRU-CTC
-=============
+==============
 This tutorial shows you how to run a TDNN-LiGRU-CTC model with the `TIMIT <https://data.deepai.org/timit.zip>`_ dataset.
--- a/docs/source/recipes/yesno/images/tdnn-tensorboard-log.png
+++ b/docs/source/recipes/yesno/images/tdnn-tensorboard-log.png
--- a/docs/source/recipes/yesno/index.rst
+++ b/docs/source/recipes/yesno/index.rst
@ -0,0 +1,7 @@
 YesNo
 =====
 .. toctree::
   :maxdepth: 1
   tdnn
--- a/docs/source/recipes/yesno/tdnn.rst
+++ b/docs/source/recipes/yesno/tdnn.rst
@ -1,5 +1,5 @@
-yesno
+TDNN-CTC
-=====
+========
 This page shows you how to run the `yesno <https://www.openslr.org/1>`_ recipe. It contains:
@ -145,7 +145,7 @@ In ``tdnn/exp``, you will find the following files:
    Note there is a URL in the above output, click it and you will see
    the following screenshot:
-      .. figure:: images/yesno-tdnn-tensorboard-log.png
+      .. figure:: images/tdnn-tensorboard-log.png
         :width: 600
         :alt: TensorBoard screenshot
         :align: center
--- a/egs/aidatatang_200zh/ASR/README.md
+++ b/egs/aidatatang_200zh/ASR/README.md
@ -0,0 +1,38 @@
 Note: This recipe is trained with the codes from this PR https://github.com/k2-fsa/icefall/pull/375
 # Pre-trained Transducer-Stateless2 models for the Aidatatang_200zh dataset with icefall.
 The model was trained on full [Aidatatang_200zh](https://www.openslr.org/62) with the scripts in [icefall](https://github.com/k2-fsa/icefall) based on the latest version k2.
 ## Training procedure
 The main repositories are listed below. We will update the training and decoding scripts as new versions are released.
 k2: https://github.com/k2-fsa/k2
 icefall: https://github.com/k2-fsa/icefall
 lhotse: https://github.com/lhotse-speech/lhotse
 * Install k2 and lhotse. For the k2 installation guide, refer to https://k2.readthedocs.io/en/latest/installation/index.html, and for lhotse, refer to https://lhotse.readthedocs.io/en/latest/getting-started.html#installation. The latest versions should work. Please also install the requirements listed in icefall.
 * Clone icefall (https://github.com/k2-fsa/icefall) and check out the commit shown above.
 ```
 git clone https://github.com/k2-fsa/icefall
 cd icefall
 ```
 * Preparing data.
 ```
 cd egs/aidatatang_200zh/ASR
 bash ./prepare.sh
 ```
 * Training
 ```
 export CUDA_VISIBLE_DEVICES="0,1"
 ./pruned_transducer_stateless2/train.py \
                  --world-size 2 \
                  --num-epochs 30 \
                  --start-epoch 0 \
                  --exp-dir pruned_transducer_stateless2/exp \
                  --lang-dir data/lang_char \
                  --max-duration 250
 ```
 ## Evaluation results
 The decoding results (WER%) on Aidatatang_200zh (dev and test) are listed below. We got these results by averaging models from epoch 11 to 29.
 The WERs are
 |                                    |     dev    |    test    | comment                                  |
 |------------------------------------|------------|------------|------------------------------------------|
 |          greedy search             | 5.53       | 6.59       | --epoch 29, --avg 19, --max-duration 100 |
 | modified beam search (beam size 4) | 5.27       | 6.33       | --epoch 29, --avg 19, --max-duration 100 |
 | fast beam search (set as default)  | 5.30       | 6.34       | --epoch 29, --avg 19, --max-duration 1500|
--- a/egs/aidatatang_200zh/ASR/RESULTS.md
+++ b/egs/aidatatang_200zh/ASR/RESULTS.md
@ -0,0 +1,72 @@
 ## Results
 ### Aidatatang_200zh Char training results (Pruned Transducer Stateless2)
 #### 2022-05-16
 Using the codes from this PR https://github.com/k2-fsa/icefall/pull/375.
 The WERs are
 |                                    |     dev    |    test    | comment                                  |
 |------------------------------------|------------|------------|------------------------------------------|
 |          greedy search             | 5.53       | 6.59       | --epoch 29, --avg 19, --max-duration 100 |
 | modified beam search (beam size 4) | 5.27       | 6.33       | --epoch 29, --avg 19, --max-duration 100 |
 | fast beam search (set as default)  | 5.30       | 6.34       | --epoch 29, --avg 19, --max-duration 1500|
 The training command for reproducing is given below:
 ```
 export CUDA_VISIBLE_DEVICES="0,1"
 ./pruned_transducer_stateless2/train.py \
  --world-size 2 \
  --num-epochs 30 \
  --start-epoch 0 \
  --exp-dir pruned_transducer_stateless2/exp \
  --lang-dir data/lang_char \
  --max-duration 250 \
  --save-every-n 1000
 ```
 The tensorboard training log can be found at
 https://tensorboard.dev/experiment/xS7kgYf2RwyDpQAOdS8rAA/#scalars
 The decoding command is:
 ```
 epoch=29
 avg=19
 ## greedy search
 ./pruned_transducer_stateless2/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir pruned_transducer_stateless2/exp \
  --lang-dir ./data/lang_char \
  --max-duration 100
 ## modified beam search
 ./pruned_transducer_stateless2/decode.py \
  --epoch $epoch \
  --avg $avg \
  --exp-dir pruned_transducer_stateless2/exp \
  --lang-dir ./data/lang_char \
  --max-duration 100 \
  --decoding-method modified_beam_search \
  --beam-size 4
 ## fast beam search
 ./pruned_transducer_stateless2/decode.py \
        --epoch $epoch \
        --avg $avg \
        --exp-dir ./pruned_transducer_stateless2/exp \
        --lang-dir ./data/lang_char \
        --max-duration 1500 \
        --decoding-method fast_beam_search \
        --beam 4 \
        --max-contexts 4 \
        --max-states 8
 ```
 A pre-trained model and decoding logs can be found at <https://huggingface.co/luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2>
--- a/egs/aidatatang_200zh/ASR/local/init.py
+++ b/egs/aidatatang_200zh/ASR/local/init.py
--- a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
+++ b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
@ -0,0 +1,119 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file computes fbank features of the aidatatang_200zh dataset.
 It looks for manifests in the directory data/manifests.
 The generated fbank features are saved in data/fbank.
 """
 import argparse
 import logging
 import os
 from pathlib import Path
 import torch
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 from icefall.utils import get_executor
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slows things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
    """Compute fbank features for the aidatatang_200zh train/dev/test parts.
    Manifests are read from data/manifests. Features and the resulting cut
    manifests are written to data/fbank. A partition whose cut manifest
    already exists in data/fbank is skipped. For partitions whose name
    contains "train", 0.9x and 1.1x speed-perturbed copies of the cuts are
    added before the features are computed.
    Args:
      num_mel_bins:
        The number of mel bins passed to FbankConfig.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    # os.cpu_count() returns None when the number of CPUs cannot be
    # determined; fall back to one job instead of raising in min().
    num_jobs = min(15, os.cpu_count() or 1)
    dataset_parts = (
        "train",
        "dev",
        "test",
    )
    prefix = "aidatatang"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None
    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            # The cut manifest is written last, so its presence marks the
            # partition as already processed.
            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            # Tag every supervision with the corpus it comes from.
            for sup in m["supervisions"]:
                sup.custom = {"origin": "aidatatang_200zh"}
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if "train" in partition:
                # Speed perturbation is applied to training data only.
                cut_set = (
                    cut_set
                    + cut_set.perturb_speed(0.9)
                    + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
 def get_args():
    """Parse the command-line options of this script.
    Returns:
      The namespace produced by argparse. Its `num_mel_bins` attribute
      holds the integer given via --num-mel-bins (80 if omitted).
    """
    arg_parser = argparse.ArgumentParser()
    mel_bins_option = dict(
        type=int,
        default=80,
        help="The number of mel bins for Fbank",
    )
    arg_parser.add_argument("--num-mel-bins", **mel_bins_option)
    namespace = arg_parser.parse_args()
    return namespace
 if __name__ == "__main__":
    # Log lines carry a timestamp, the level and the source location.
    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s "
            "[%(filename)s:%(lineno)d] %(message)s"
        ),
        level=logging.INFO,
    )
    compute_fbank_aidatatang_200zh(num_mel_bins=get_args().num_mel_bins)
--- a/egs/aidatatang_200zh/ASR/local/compute_fbank_musan.py
+++ b/egs/aidatatang_200zh/ASR/local/compute_fbank_musan.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/local/compute_fbank_musan.py
--- a/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py
+++ b/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py
@ -0,0 +1,96 @@
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang
 # 						                           Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file displays duration statistics of utterances in a manifest.
 You can use the displayed value to choose minimum/maximum duration
 to remove short and long utterances during the training.
 See the function `remove_short_and_long_utt()`
 in ../../../librispeech/ASR/transducer/train.py
 for usage.
 """
 from lhotse import load_manifest_lazy
 def main():
    """Print statistics of the train, dev and test cut manifests.
    Each manifest is loaded lazily from ./data/fbank and summarized with
    its `describe()` method.
    """
    fbank_dir = "./data/fbank"
    for part in ("train", "dev", "test"):
        path = f"{fbank_dir}/aidatatang_cuts_{part}.jsonl.gz"
        print(f"Starting display the statistics for {path}")
        load_manifest_lazy(path).describe()
 if __name__ == "__main__":
    main()
 """
 Starting display the statistics for ./data/fbank/aidatatang_cuts_train.jsonl.gz
 Cuts count: 494715
 Total duration (hours): 422.6
 Speech duration (hours): 422.6 (100.0%)
 ***
 Duration statistics (seconds):
 mean    3.1
 std     1.2
 min     1.0
 25%     2.3
 50%     2.7
 75%     3.5
 99%     7.2
 99.5%   8.0
 99.9%   9.5
 max     18.1
 Starting display the statistics for ./data/fbank/aidatatang_cuts_dev.jsonl.gz
 Cuts count: 24216
 Total duration (hours): 20.2
 Speech duration (hours): 20.2 (100.0%)
 ***
 Duration statistics (seconds):
 mean    3.0
 std     1.0
 min     1.2
 25%     2.3
 50%     2.7
 75%     3.4
 99%     6.7
 99.5%   7.3
 99.9%   8.8
 max     11.3
 Starting display the statistics for ./data/fbank/aidatatang_cuts_test.jsonl.gz
 Cuts count: 48144
 Total duration (hours): 40.2
 Speech duration (hours): 40.2 (100.0%)
 ***
 Duration statistics (seconds):
 mean    3.0
 std     1.1
 min     0.9
 25%     2.3
 50%     2.6
 75%     3.4
 99%     6.9
 99.5%   7.5
 99.9%   9.0
 max     21.8
 """
--- a/egs/aidatatang_200zh/ASR/local/prepare_char.py
+++ b/egs/aidatatang_200zh/ASR/local/prepare_char.py
@ -0,0 +1,248 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
 #                                                  Wei Kang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input `lang_dir`, which should contain::
    - lang_dir/text,
    - lang_dir/words.txt
 and generates the following files in the directory `lang_dir`:
    - lexicon.txt
    - lexicon_disambig.txt
    - L.pt
    - L_disambig.pt
    - tokens.txt
 """
 import re
 from pathlib import Path
 from typing import Dict, List
 import k2
 import torch
 from prepare_lang import (
    Lexicon,
    add_disambig_symbols,
    add_self_loops,
    write_lexicon,
    write_mapping,
 )
 def lexicon_to_fst_no_sil(
    lexicon: Lexicon,
    token2id: Dict[str, int],
    word2id: Dict[str, int],
    need_self_loops: bool = False,
 ) -> k2.Fsa:
    """Build the lexicon FST (in k2 format) without optional silence.
    Every lexicon entry becomes a path that leaves state 0 and returns to
    it. The word ID is the output label of the first arc of the path and
    epsilon is the output label of the remaining arcs. A token that is not
    in `token2id` is replaced by `token2id["<unk>"]`.
    Args:
      lexicon:
        The input lexicon, a list of (word, tokens) pairs.
      token2id:
        A dict mapping tokens to IDs.
      word2id:
        A dict mapping words to IDs.
      need_self_loops:
        If True, add self-loops via :func:`add_self_loops`. The input label
        of such a loop is `token2id["#0"]` and the output label is
        `word2id["#0"]`.
    Returns:
      Return an instance of `k2.Fsa` representing the given lexicon.
    """
    loop_state = 0  # every word path starts and ends here
    next_state = 1  # first state ID that has not been handed out yet
    eps = 0
    arcs = []
    # In this recipe generate_tokens() assigns ID 0 to <blk>.
    assert token2id["<blk>"] == 0
    assert word2id["<eps>"] == 0
    for word, pieces in lexicon:
        assert len(pieces) > 0, f"{word} has no pronunciations"
        word_id = word2id[word]
        piece_ids = [
            token2id[p] if p in token2id else token2id["<unk>"] for p in pieces
        ]
        *leading, final_piece = piece_ids
        src = loop_state
        for pos, piece in enumerate(leading):
            out_label = word_id if pos == 0 else eps
            arcs.append([src, next_state, piece, out_label, 0])
            src = next_state
            next_state += 1
        # The last piece closes the path; it carries the word ID only when
        # the pronunciation consists of a single piece.
        arcs.append(
            [src, loop_state, final_piece, eps if leading else word_id, 0]
        )
    if need_self_loops:
        arcs = add_self_loops(
            arcs,
            disambig_token=token2id["#0"],
            disambig_word=word2id["#0"],
        )
    final_state = next_state
    arcs.append([loop_state, final_state, -1, -1, 0])
    arcs.append([final_state])
    # Stable sort by source state keeps the insertion order within a state.
    arcs.sort(key=lambda arc: arc[0])
    fsa_text = "\n".join(" ".join(map(str, arc)) for arc in arcs)
    return k2.Fsa.from_str(fsa_text, acceptor=False)
 def contain_oov(token_sym_table: Dict[str, int], tokens: List[str]) -> bool:
    """Tell whether any of the given tokens is out of vocabulary.
    Args:
      token_sym_table:
        Token symbol table that contains all the valid tokens.
      tokens:
        A list of tokens.
    Returns:
      Return True if at least one token is missing from token_sym_table,
      otherwise False (also for an empty list).
    """
    return any(tok not in token_sym_table for tok in tokens)
 def generate_lexicon(
    token_sym_table: Dict[str, int], words: List[str]
 ) -> Lexicon:
    """Generate a character-level lexicon from a word list.
    Each word is split into its characters after stripping leading and
    trailing spaces and tabs. Words containing a character that is not in
    token_sym_table are left out.
    Args:
      token_sym_table:
        Token symbol table mapping tokens to token ids.
      words:
        A list of strings representing words.
    Returns:
      Return a list of (word, tokens) pairs. The last entry maps the OOV
      word <UNK> to the token <unk>.
    """
    candidates = ((word, list(word.strip(" \t"))) for word in words)
    lexicon = [
        (word, chars)
        for word, chars in candidates
        if not contain_oov(token_sym_table, chars)
    ]
    # The OOV word is <UNK>
    lexicon.append(("<UNK>", ["<unk>"]))
    return lexicon
 def generate_tokens(text_file: str) -> Dict[str, int]:
    """Build the token table from the characters of a text file.
    IDs 0, 1 and 2 are reserved for <blk>, <sos/eos> and <unk>. Every other
    non-whitespace character gets the next free ID the first time it is
    seen.
    Args:
      text_file:
        A UTF-8 text file whose lines provide the characters.
    Returns:
      Return a dict whose keys are tokens and values are token ids ranged
      from 0 to len(keys) - 1.
    """
    token2id: Dict[str, int] = {"<blk>": 0, "<sos/eos>": 1, "<unk>": 2}
    whitespace = re.compile(r"([ \t\r\n]+)")
    with open(text_file, "r", encoding="utf-8") as f:
        for line in f:
            for char in whitespace.sub("", line):
                # setdefault leaves the ID of an already known char alone.
                token2id.setdefault(char, len(token2id))
    return token2id
 def main():
    """Generate the character-based lang files in data/lang_char.
    Reads `text` and `words.txt` from data/lang_char and writes tokens.txt,
    lexicon.txt, lexicon_disambig.txt, L.pt and L_disambig.pt there.
    """
    lang_dir = Path("data/lang_char")
    word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
    words = word_sym_table.symbols
    # Special symbols never get a character-level pronunciation.
    for special in (
        "<eps>", "!SIL", "<SPOKEN_NOISE>", "<UNK>", "#0", "<s>", "</s>"
    ):
        if special in words:
            words.remove(special)
    token_sym_table = generate_tokens(lang_dir / "text")
    lexicon = generate_lexicon(token_sym_table, words)
    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
    # Disambiguation symbols #0..#max_disambig take the IDs that follow the
    # largest ID currently in the token table.
    first_disambig_id = max(token_sym_table.values()) + 1
    for offset in range(max_disambig + 1):
        disambig = f"#{offset}"
        assert disambig not in token_sym_table
        token_sym_table[disambig] = first_disambig_id + offset
    for extra_word in ("#0", "<s>", "</s>"):
        word_sym_table.add(extra_word)
    write_mapping(lang_dir / "tokens.txt", token_sym_table)
    write_lexicon(lang_dir / "lexicon.txt", lexicon)
    write_lexicon(lang_dir / "lexicon_disambig.txt", lexicon_disambig)
    symbol_tables = dict(token2id=token_sym_table, word2id=word_sym_table)
    L = lexicon_to_fst_no_sil(lexicon, **symbol_tables)
    L_disambig = lexicon_to_fst_no_sil(
        lexicon_disambig, need_self_loops=True, **symbol_tables
    )
    # Both FSTs are built before anything is saved.
    torch.save(L.as_dict(), lang_dir / "L.pt")
    torch.save(L_disambig.as_dict(), lang_dir / "L_disambig.pt")
 if __name__ == "__main__":
    main()
--- a/egs/aidatatang_200zh/ASR/local/prepare_lang.py
+++ b/egs/aidatatang_200zh/ASR/local/prepare_lang.py
@ -0,0 +1,390 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input a lexicon file "data/lang_phone/lexicon.txt"
 consisting of words and tokens (i.e., phones) and does the following:
 1. Add disambiguation symbols to the lexicon and generate lexicon_disambig.txt
 2. Generate tokens.txt, the token table mapping a token to a unique integer.
 3. Generate words.txt, the word table mapping a word to a unique integer.
 4. Generate L.pt, in k2 format. It can be loaded by
        d = torch.load("L.pt")
        lexicon = k2.Fsa.from_dict(d)
 5. Generate L_disambig.pt, in k2 format.
 """
 import argparse
 import math
 from collections import defaultdict
 from pathlib import Path
 from typing import Any, Dict, List, Tuple
 import k2
 import torch
 from icefall.lexicon import read_lexicon, write_lexicon
 Lexicon = List[Tuple[str, List[str]]]
 def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
    """Save a symbol-to-ID mapping as a text file.
    One line of the form "<symbol> <id>" is written per entry, in the
    iteration order of the dict. The file is encoded as UTF-8.
    Note:
      No need to implement `read_mapping` as it can be done
      through :func:`k2.SymbolTable.from_file`.
    Args:
      filename:
        Filename to save the mapping.
      sym2id:
        A dict mapping symbols to IDs.
    Returns:
      Return None.
    """
    with open(filename, "w", encoding="utf-8") as out:
        out.writelines(f"{sym} {idx}\n" for sym, idx in sym2id.items())
 def get_tokens(lexicon: Lexicon) -> List[str]:
    """Collect the distinct tokens used by a lexicon.
    Args:
      lexicon:
        It is the return value of :func:`read_lexicon`.
    Returns:
      Return a sorted list of unique tokens.
    """
    unique_tokens = {tok for _, toks in lexicon for tok in toks}
    return sorted(unique_tokens)
 def get_words(lexicon: Lexicon) -> List[str]:
    """Collect the distinct words listed in a lexicon.
    Args:
      lexicon:
        It is the return value of :func:`read_lexicon`.
    Returns:
      Return a sorted list of unique words.
    """
    return sorted({word for word, _ in lexicon})
 def add_disambig_symbols(lexicon: Lexicon) -> Tuple[Lexicon, int]:
    """Append disambiguation symbols #1, #2, ... to ambiguous entries.
    An entry is ambiguous if its token sequence is shared with another
    entry or is a proper prefix of another entry's token sequence. After
    the symbols are appended all pronunciations are different and none is
    a prefix of another.
    See also add_lex_disambig.pl from kaldi.
    Args:
      lexicon:
        It is returned by :func:`read_lexicon`.
    Returns:
      Return a tuple with two elements:
        - The output lexicon with disambiguation symbols
        - The ID of the max disambiguation symbol that appears
          in the lexicon
    """
    # Pass 1: count how many entries share each token sequence.
    seq_count = {}
    for _, tokens in lexicon:
        key = " ".join(tokens)
        seq_count[key] = seq_count.get(key, 0) + 1
    # Pass 2: record every proper prefix of every token sequence.
    proper_prefixes = set()
    for _, tokens in lexicon:
        remaining = tokens.copy()
        remaining.pop()  # the full sequence is not a proper prefix
        while remaining:
            proper_prefixes.add(" ".join(remaining))
            remaining.pop()
    # Pass 3: emit the entries, tagging the ambiguous ones. Numbering
    # starts at #1 because #0 has its own purpose.
    max_disambig = 0
    last_used = {}  # token sequence -> last symbol number given to it
    tagged = []
    for word, tokens in lexicon:
        key = " ".join(tokens)
        assert key != ""
        if key not in proper_prefixes and seq_count[key] == 1:
            tagged.append((word, tokens))
            continue
        number = last_used.get(key, 0) + 1
        last_used[key] = number
        max_disambig = max(max_disambig, number)
        tagged.append((word, f"{key} #{number}".split()))
    return tagged, max_disambig
 def generate_id_map(symbols: List[str]) -> Dict[str, int]:
    """Assign to each symbol its position in the given list.
    Args:
      symbols:
        A list of unique symbols.
    Returns:
      A dict containing the mapping between symbols and IDs.
    """
    sym2id: Dict[str, int] = {}
    for idx, sym in enumerate(symbols):
        sym2id[sym] = idx
    return sym2id
 def add_self_loops(
    arcs: List[List[Any]], disambig_token: int, disambig_word: int
 ) -> List[List[Any]]:
    """Add disambiguation self-loops to the arcs of an FST.
    A self-loop is added to each state that has at least one outgoing arc
    with a non-epsilon output symbol. Its input label is `disambig_token`
    and its output label is `disambig_word`.
    See also fstaddselfloops.pl from Kaldi. One difference is that
    Kaldi uses OpenFst style FSTs and it has multiple final states.
    This function uses k2 style FSTs and it does not need to add self-loops
    to the final state.
    Args:
      arcs:
        A list-of-list. The sublist contains
        `[src_state, dest_state, label, aux_label, score]`
      disambig_token:
        It is the token ID of the symbol `#0`.
      disambig_word:
        It is the word ID of the symbol `#0`.
    Return:
      Return a new list holding the input arcs followed by the self-loops.
    """
    loop_states = {
        src for src, _dst, _ilabel, olabel, _score in arcs if olabel != 0
    }
    self_loops = [
        [state, state, disambig_token, disambig_word, 0]
        for state in loop_states
    ]
    return arcs + self_loops
 def lexicon_to_fst(
    lexicon: Lexicon,
    token2id: Dict[str, int],
    word2id: Dict[str, int],
    sil_token: str = "SIL",
    sil_prob: float = 0.5,
    need_self_loops: bool = False,
 ) -> k2.Fsa:
    """Build the lexicon FST (in k2 format) with optional silence.
    Silence may occur at the beginning of the utterance and after each
    word. Arc scores are log-probabilities: log(sil_prob) for taking the
    silence branch and log(1 - sil_prob) for skipping it.
    Args:
      lexicon:
        The input lexicon. See also :func:`read_lexicon`
      token2id:
        A dict mapping tokens to IDs.
      word2id:
        A dict mapping words to IDs.
      sil_token:
        The silence token.
      sil_prob:
        The probability for adding a silence at the beginning and end
        of the word. It has to be strictly between 0 and 1.
      need_self_loops:
        If True, add self-loops via :func:`add_self_loops`. The input label
        of such a loop is `token2id["#0"]` and the output label is
        `word2id["#0"]`.
    Returns:
      Return an instance of `k2.Fsa` representing the given lexicon.
    """
    assert 0.0 < sil_prob < 1.0
    # CAUTION: we use score, i.e, negative cost.
    sil_score = math.log(sil_prob)
    no_sil_score = math.log(1.0 - sil_prob)
    # State 0 is the start state. Words enter and leave from state 1.
    # State 2 is where words end when followed by silence; it has a
    # silence transition to state 1.
    start_state, loop_state, sil_state = 0, 1, 2
    next_state = 3  # first state ID that has not been handed out yet
    assert token2id["<eps>"] == 0
    assert word2id["<eps>"] == 0
    eps = 0
    sil_token = token2id[sil_token]
    arcs = [
        [start_state, loop_state, eps, eps, no_sil_score],
        [start_state, sil_state, eps, eps, sil_score],
        [sil_state, loop_state, sil_token, eps, 0],
    ]
    for word, tokens in lexicon:
        assert len(tokens) > 0, f"{word} has no pronunciations"
        word_id = word2id[word]
        *leading, final_token = [token2id[t] for t in tokens]
        src = loop_state
        for pos, token in enumerate(leading):
            out_label = word_id if pos == 0 else eps
            arcs.append([src, next_state, token, out_label, 0])
            src = next_state
            next_state += 1
        # The last token has two out-going arcs, one to the loop state,
        # the other one to the sil_state. It carries the word ID only when
        # the pronunciation consists of a single token.
        out_label = eps if leading else word_id
        arcs.append([src, loop_state, final_token, out_label, no_sil_score])
        arcs.append([src, sil_state, final_token, out_label, sil_score])
    if need_self_loops:
        arcs = add_self_loops(
            arcs,
            disambig_token=token2id["#0"],
            disambig_word=word2id["#0"],
        )
    final_state = next_state
    arcs.append([loop_state, final_state, -1, -1, 0])
    arcs.append([final_state])
    # Stable sort by source state keeps the insertion order within a state.
    arcs.sort(key=lambda arc: arc[0])
    fsa_text = "\n".join(" ".join(map(str, arc)) for arc in arcs)
    return k2.Fsa.from_str(fsa_text, acceptor=False)
 def get_args():
    """Parse the command-line options of this script.
    --lang-dir is mandatory: main() builds every input and output path
    from it, so a missing value is reported by argparse as a usage error
    instead of surfacing later as a TypeError from Path(None).
    Returns:
      The namespace produced by argparse; `lang_dir` holds the value of
      --lang-dir as a string.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang-dir",
        type=str,
        required=True,
        help="The lang dir, data/lang_phone",
    )
    return parser.parse_args()
 def main():
    """Generate the lang files from <lang-dir>/lexicon.txt.
    Writes tokens.txt, words.txt, lexicon_disambig.txt, L.pt and
    L_disambig.pt into the directory given by --lang-dir.
    """
    out_dir = Path(get_args().lang_dir)
    sil_token = "SIL"
    sil_prob = 0.5
    lexicon = read_lexicon(out_dir / "lexicon.txt")
    tokens = get_tokens(lexicon)
    words = get_words(lexicon)
    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
    # The disambiguation symbols #0..#max_disambig go after the real tokens.
    for i in range(max_disambig + 1):
        disambig = f"#{i}"
        assert disambig not in tokens
        tokens.append(disambig)
    # <eps> has to be the first token so that it gets ID 0.
    assert "<eps>" not in tokens
    tokens.insert(0, "<eps>")
    for reserved in ("<eps>", "#0", "<s>", "</s>"):
        assert reserved not in words
    words = ["<eps>", *words, "#0", "<s>", "</s>"]
    token2id = generate_id_map(tokens)
    word2id = generate_id_map(words)
    write_mapping(out_dir / "tokens.txt", token2id)
    write_mapping(out_dir / "words.txt", word2id)
    write_lexicon(out_dir / "lexicon_disambig.txt", lexicon_disambig)
    fst_options = dict(
        token2id=token2id,
        word2id=word2id,
        sil_token=sil_token,
        sil_prob=sil_prob,
    )
    L = lexicon_to_fst(lexicon, **fst_options)
    L_disambig = lexicon_to_fst(
        lexicon_disambig, need_self_loops=True, **fst_options
    )
    # Both FSTs are built before anything is saved.
    torch.save(L.as_dict(), out_dir / "L.pt")
    torch.save(L_disambig.as_dict(), out_dir / "L_disambig.pt")
    if False:
        # Just for debugging, will remove it
        token_table = k2.SymbolTable.from_file(out_dir / "tokens.txt")
        word_table = k2.SymbolTable.from_file(out_dir / "words.txt")
        L.labels_sym = token_table
        L.aux_labels_sym = word_table
        L_disambig.labels_sym = L.labels_sym
        L_disambig.aux_labels_sym = L.aux_labels_sym
        L.draw(out_dir / "L.png", title="L")
        L_disambig.draw(out_dir / "L_disambig.png", title="L_disambig")
 if __name__ == "__main__":
    main()
--- a/egs/aidatatang_200zh/ASR/local/prepare_words.py
+++ b/egs/aidatatang_200zh/ASR/local/prepare_words.py
@ -0,0 +1,84 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # Copyright    2021  Xiaomi Corp.        (authors: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This script takes as input words.txt without ids:
    - words_no_ids.txt
 and generates the new words.txt with related ids.
    - words.txt
 """
 import argparse
 import logging
 from tqdm import tqdm
 def get_parser():
    """Create the command-line parser for this script.

    Returns:
      An ``argparse.ArgumentParser`` that accepts ``--input-file`` (the word
      list without ids) and ``--output-file`` (the word list with ids).
    """
    parser = argparse.ArgumentParser(
        description="Prepare words.txt",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # The help texts used to name WenetSpeech, the recipe this script was
    # copied from; this copy lives in the aidatatang_200zh recipe.
    parser.add_argument(
        "--input-file",
        default="data/lang_char/words_no_ids.txt",
        type=str,
        help="the words file without ids for aidatatang_200zh",
    )
    parser.add_argument(
        "--output-file",
        default="data/lang_char/words.txt",
        type=str,
        help="the words file with ids for aidatatang_200zh",
    )
    return parser
 def main():
    """Assign an integer id to every word and write the result as words.txt.

    The output starts with four fixed entries (``<eps> 0``, ``!SIL 1``,
    ``<SPOKEN_NOISE> 2``, ``<UNK> 3``); each line of the input file then
    receives the next id, starting at 4, in input order.
    """
    parser = get_parser()
    args = parser.parse_args()
    input_file = args.input_file
    output_file = args.output_file
    # Context managers make sure both files are closed (and the output
    # flushed) even when an error interrupts processing.
    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()
    new_lines = ["<eps> 0", "!SIL 1", "<SPOKEN_NOISE> 2", "<UNK> 3"]
    # Ids for the input words continue right after the fixed entries.
    first_id = len(new_lines)
    logging.info("Starting reading the input file")
    for offset, x in enumerate(tqdm(lines)):
        new_lines.append(x.strip("\n") + " " + str(first_id + offset))
    logging.info("Starting writing the words.txt")
    with open(output_file, "w", encoding="utf-8") as f_out:
        for line in new_lines:
            f_out.write(line)
            f_out.write("\n")
 # Script entry point: main() runs only when this file is executed directly.
 if __name__ == "__main__":
    main()
--- a/egs/aidatatang_200zh/ASR/local/test_prepare_lang.py
+++ b/egs/aidatatang_200zh/ASR/local/test_prepare_lang.py
@ -0,0 +1,106 @@
 #!/usr/bin/env python3
 # Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Copyright (c)  2021  Xiaomi Corporation (authors: Fangjun Kuang)
 import os
 import tempfile
 import k2
 from prepare_lang import (
    add_disambig_symbols,
    generate_id_map,
    get_phones,
    get_words,
    lexicon_to_fst,
    read_lexicon,
    write_lexicon,
    write_mapping,
 )
 def generate_lexicon_file() -> str:
    """Write a small toy lexicon to a freshly created temporary file.

    Returns:
      The path of the temporary file. The file is not removed here; the
      caller is responsible for deleting it.
    """
    lexicon_text = """
    !SIL SIL
    <SPOKEN_NOISE> SPN
    <UNK> SPN
    f f
    a a
    foo f o o
    bar b a r
    bark b a r k
    food f o o d
    food2 f o o d
    fo  f o
    """.strip()
    handle, path = tempfile.mkstemp()
    # Wrap the descriptor returned by mkstemp instead of closing it and
    # reopening the file by name.
    with os.fdopen(handle, "w") as out:
        out.write(lexicon_text)
    return path
 def test_read_lexicon(filename: str):
    """Run the lexicon helpers end to end on ``filename`` and print the results.

    Side effects: writes phones.txt, words.txt, a.txt, a_disambig.txt, L.pdf
    and L_disambig.pdf into the current working directory.

    Args:
      filename:
        Path of the lexicon file to read.
    """
    lexicon = read_lexicon(filename)
    phones = get_phones(lexicon)
    words = get_words(lexicon)
    for item in (lexicon, phones, words):
        print(item)
    lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
    print(lexicon_disambig)
    print("max disambig:", f"#{max_disambig}")
    # Symbol tables: fixed symbols first, then the lexicon phones, then the
    # disambiguation symbols #0 .. #max_disambig.
    phones = ["<eps>", "SIL", "SPN"] + phones
    phones.extend(f"#{i}" for i in range(max_disambig + 1))
    words = ["<eps>"] + words
    phone2id = generate_id_map(phones)
    word2id = generate_id_map(words)
    print(phone2id)
    print(word2id)
    write_mapping("phones.txt", phone2id)
    write_mapping("words.txt", word2id)
    write_lexicon("a.txt", lexicon)
    write_lexicon("a_disambig.txt", lexicon_disambig)
    # Build and draw the plain FST first, then the disambiguated one.
    drawings = (
        ("L.pdf", "L", lexicon),
        ("L_disambig.pdf", "L_disambig", lexicon_disambig),
    )
    for pdf_name, title, entries in drawings:
        fst = lexicon_to_fst(entries, phone2id=phone2id, word2id=word2id)
        fst.labels_sym = k2.SymbolTable.from_file("phones.txt")
        fst.aux_labels_sym = k2.SymbolTable.from_file("words.txt")
        fst.draw(pdf_name, title=title)
 def main():
    """Create a toy lexicon file, run the smoke test on it, then delete it."""
    filename = generate_lexicon_file()
    try:
        test_read_lexicon(filename)
    finally:
        # Remove the temporary lexicon even when the test raises, so failed
        # runs do not leave files behind in the temp directory.
        os.remove(filename)
 # Script entry point: main() runs only when this file is executed directly.
 if __name__ == "__main__":
    main()
--- a/egs/aidatatang_200zh/ASR/local/text2token.py
+++ b/egs/aidatatang_200zh/ASR/local/text2token.py
@ -0,0 +1,195 @@
 #!/usr/bin/env python3
 # Copyright    2017  Johns Hopkins University   (authors: Shinji Watanabe)
 #              2022  Xiaomi Corp.               (authors: Mingshuang Luo)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import codecs
 import re
 import sys
 from typing import List
 from pypinyin import lazy_pinyin, pinyin
 is_python2 = sys.version_info[0] == 2
 def exist_or_not(i, match_pos):
    """Find the first span in ``match_pos`` that contains index ``i``.

    Args:
      i:
        The character index to look up.
      match_pos:
        A sequence of ``[start, end)`` pairs.
    Returns:
      ``(start, end)`` of the first pair with ``start <= i < end``, or
      ``(None, None)`` when no pair contains ``i``.
    """
    hit = next((span for span in match_pos if span[0] <= i < span[1]), None)
    if hit is None:
        return None, None
    return hit[0], hit[1]
 def get_parser():
    """Build the argument parser for the text-to-token converter.

    Returns:
      An ``argparse.ArgumentParser`` with the options ``--nchar``,
      ``--skip-ncols``, ``--space``, ``--non-lang-syms``, ``--trans_type``
      and an optional positional ``text`` (the input file).
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="convert raw text to tokenized text",
    )
    cli.add_argument(
        "--nchar",
        "-n",
        type=int,
        default=1,
        help="number of characters to split, i.e., \
                        aabb -> a a b b with -n 1 and aa bb with -n 2",
    )
    cli.add_argument(
        "--skip-ncols",
        "-s",
        type=int,
        default=0,
        help="skip first n columns",
    )
    cli.add_argument(
        "--space",
        type=str,
        default="<space>",
        help="space symbol",
    )
    cli.add_argument(
        "--non-lang-syms",
        "-l",
        type=str,
        default=None,
        help="list of non-linguistic symobles, e.g., <NOISE> etc.",
    )
    # Optional positional argument; it stays False when no file is given.
    cli.add_argument(
        "text",
        nargs="?",
        type=str,
        default=False,
        help="input text",
    )
    cli.add_argument(
        "--trans_type",
        "-t",
        choices=["char", "pinyin", "lazy_pinyin"],
        type=str,
        default="char",
        help="""Transcript type. char/pinyin/lazy_pinyin""",
    )
    return cli
 def token2id(
    texts, token_table, token_type: str = "lazy_pinyin", oov: str = "<unk>"
 ) -> List[List[int]]:
    """Map each text to the ids of its pinyin tokens.

    Args:
      texts:
        The input texts (Chinese text is expected here).
      token_table:
        The token table, built from "data/lang_xxx/token.txt". It must
        support ``in`` and ``[]`` lookups by token string.
      token_type:
        "lazy_pinyin" converts with ``lazy_pinyin``; any other value converts
        with ``pinyin`` and keeps the first entry returned per character.
      oov:
        Out of vocabulary token. A token that is not in ``token_table`` is
        replaced with the id of ``oov``.
    Returns:
      One list of ids per input text.
    Raises:
      ValueError: if ``texts`` is None.
    """
    if texts is None:
        raise ValueError("texts can't be None!")
    oov_id = token_table[oov]
    use_lazy = token_type == "lazy_pinyin"
    ids: List[List[int]] = []
    for text in texts:
        chars_list = list(str(text))
        if use_lazy:
            syllables = lazy_pinyin(chars_list)
        else:
            # pinyin() returns one list per character; use its first entry.
            syllables = [entry[0] for entry in pinyin(chars_list)]
        ids.append(
            [
                token_table[syl] if syl in token_table else oov_id
                for syl in syllables
            ]
        )
    return ids
 def main():
    """Convert each input line to tokens and print one output line per line.

    Input comes from the positional ``text`` file, or from stdin when no file
    is given. For every line, the first ``--skip-ncols`` columns are printed
    unchanged followed by a space; the rest is optionally split around the
    non-linguistic symbols, optionally converted to pinyin, regrouped into
    chunks of ``--nchar`` items and printed with the chunks concatenated.
    """
    parser = get_parser()
    args = parser.parse_args()
    rs = []
    if args.non_lang_syms is not None:
        with codecs.open(args.non_lang_syms, "r", encoding="utf-8") as f:
            nls = [x.rstrip() for x in f.readlines()]
            # Skip blank lines: an empty pattern matches with zero width at
            # every position, so a scan that restarts at the end of each
            # match would never advance.
            rs = [re.compile(re.escape(x)) for x in nls if x]
    if args.text:
        f = codecs.open(args.text, encoding="utf-8")
    else:
        f = codecs.getreader("utf-8")(
            sys.stdin if is_python2 else sys.stdin.buffer
        )
    sys.stdout = codecs.getwriter("utf-8")(
        sys.stdout if is_python2 else sys.stdout.buffer
    )
    n = args.nchar
    try:
        line = f.readline()
        while line:
            x = line.split()
            print(" ".join(x[: args.skip_ncols]), end=" ")
            a = " ".join(x[args.skip_ncols :])  # noqa E203
            # Collect the [start, end) span of every non-overlapping
            # occurrence of each non-linguistic symbol.
            match_pos = [
                [m.start(), m.end()] for r in rs for m in r.finditer(a)
            ]
            if len(match_pos) > 0:
                # Split into single characters, keeping each matched symbol
                # together as one item.
                chars = []
                i = 0
                while i < len(a):
                    start_pos, end_pos = exist_or_not(i, match_pos)
                    if start_pos is not None:
                        chars.append(a[start_pos:end_pos])
                        i = end_pos
                    else:
                        chars.append(a[i])
                        i += 1
                a = chars
            # NOTE(review): when symbols matched above, `a` is a list and
            # str(a) is its repr; confirm whether the pinyin modes are meant
            # to be combined with --non-lang-syms.
            if args.trans_type == "pinyin":
                a = pinyin(list(str(a)))
                a = [one[0] for one in a]
            if args.trans_type == "lazy_pinyin":
                a = lazy_pinyin(list(str(a)))
            a = [a[j : j + n] for j in range(0, len(a), n)]  # noqa E203
            a_flat = []
            for z in a:
                a_flat.append("".join(z))
            a_chars = "".join(a_flat)
            print(a_chars)
            line = f.readline()
    finally:
        # Close only the file opened here; the stdin reader is left alone.
        if args.text:
            f.close()
 # Script entry point: main() runs only when this file is executed directly.
 if __name__ == "__main__":
    main()
--- a/egs/aidatatang_200zh/ASR/prepare.sh
+++ b/egs/aidatatang_200zh/ASR/prepare.sh
@ -0,0 +1,118 @@
 #!/usr/bin/env bash
 # Data preparation for the aidatatang_200zh ASR recipe.
 # The work is split into numbered stages (0-6); a stage runs only when its
 # number lies in the range [stage, stop_stage].
 # Abort on command failure (-e), on use of an unset variable (-u) and when
 # any stage of a pipeline fails (-o pipefail).
 set -eou pipefail
 # Default stage range: run every stage.
 stage=-1
 stop_stage=100
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
 # by this script automatically.
 #
 #  - $dl_dir/aidatatang_200zh
 #      You can find "corpus" and "transcript" inside it.
 #      You can download it at
 #       https://openslr.org/62/
 dl_dir=$PWD/download
 # Sourced after the defaults above are set.
 # NOTE(review): presumably this lets command-line options such as --stage
 # override those defaults - confirm in shared/parse_options.sh.
 . shared/parse_options.sh || exit 1
 # All files generated by this script are saved in "data".
 # You can safely remove "data" and rerun this script to regenerate it.
 mkdir -p data
 log() {
  # Print a message prefixed with a timestamp and the caller's location
  # (file name, line number, function name). Adapted from espnet.
  # Arguments: the message; all arguments are joined with "$*" and written
  #            with `echo -e`, so backslash escapes in them are interpreted.
  # Outputs:   one line on stdout.
  local caller_file=${BASH_SOURCE[1]##*/}
  local origin="${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}"
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${origin}) $*"
 }
 log "dl_dir: $dl_dir"
 # Stage 0: download the corpus with lhotse unless the transcript file is
 # already present under $dl_dir.
 # Expansions are quoted because dl_dir is derived from $PWD, which may
 # contain spaces.
 if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Download data"
  if [ ! -f "$dl_dir/aidatatang_200zh/transcript/aidatatang_200_zh_transcript.txt" ]; then
    lhotse download aidatatang-200zh "$dl_dir"
  fi
 fi
 # Stage 1: build the lhotse manifests for the corpus. A marker file records
 # completion so that reruns skip this stage.
 # Expansions are quoted because dl_dir is derived from $PWD, which may
 # contain spaces.
 if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare aidatatang_200zh manifest"
  # We assume that you have downloaded the aidatatang_200zh corpus
  # to $dl_dir/aidatatang_200zh
  if [ ! -f data/manifests/aidatatang_200zh/.manifests.done ]; then
    mkdir -p data/manifests/aidatatang_200zh
    lhotse prepare aidatatang-200zh "$dl_dir" data/manifests/aidatatang_200zh
    touch data/manifests/aidatatang_200zh/.manifests.done
  fi
 fi
 # Stage 2: creates data/fbank/aidatatang_200zh and runs `lhotse prepare`;
 # a marker file records completion.
 # NOTE(review): the lhotse command is identical to the one in stage 1 and
 # writes to data/manifests, not data/fbank - confirm this stage is still
 # needed.
 # Expansions are quoted because dl_dir is derived from $PWD, which may
 # contain spaces.
 if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Process aidatatang_200zh"
  if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then
    mkdir -p data/fbank/aidatatang_200zh
    lhotse prepare aidatatang-200zh "$dl_dir" data/manifests/aidatatang_200zh
    touch data/fbank/aidatatang_200zh/.fbank.done
  fi
 fi
 # Stage 3: build the lhotse manifests for the MUSAN noise corpus. A marker
 # file records completion so that reruns skip this stage.
 # Expansions are quoted because dl_dir is derived from $PWD, which may
 # contain spaces.
 if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
  if [ ! -f data/manifests/.musan_manifests.done ]; then
    log "It may take 6 minutes"
    mkdir -p data/manifests
    lhotse prepare musan "$dl_dir/musan" data/manifests
    touch data/manifests/.musan_manifests.done
  fi
 fi
 # Stage 4: compute fbank features for MUSAN; a marker file records
 # completion so that reruns skip this stage.
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
  # The marker name is spelled ".msuan.done"; that spelling is kept so that
  # markers left by earlier runs are still recognised.
  musan_done=data/fbank/.msuan.done
  if ! [ -f "$musan_done" ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_musan.py
    touch "$musan_done"
  fi
 fi
 # Stage 5: compute fbank features for aidatatang_200zh; a marker file
 # records completion so that reruns skip this stage.
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Compute fbank for aidatatang_200zh"
  corpus_done=data/fbank/.aidatatang_200zh.done
  if ! [ -f "$corpus_done" ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_aidatatang_200zh.py
    touch "$corpus_done"
  fi
 fi
 # Stage 6: build the character-based lang directory: the training text,
 # the word list with ids, and the lexicon FSTs.
 if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
  log "Stage 6: Prepare char based lang"
  lang_char_dir=data/lang_char
  mkdir -p "$lang_char_dir"
  # Prepare text.
  # The transcripts are taken from the "text" lines of the JSON manifest.
  # The first sed deletes every '"', 't', 'e', 'x', ':', tab and space
  # character on the line; the second one deletes commas.
  # NOTE(review): this also strips the letters t/e/x from any Latin-script
  # transcript - confirm that is acceptable or use a JSON parser.
  grep "\"text\":" data/manifests/aidatatang_200zh/supervisions_train.json \
    | sed -e 's/["text:\t ]*//g' | sed 's/,//g' \
    | ./local/text2token.py -t "char" > "$lang_char_dir/text"
  # Prepare words.txt
  # Same extraction, except that spaces are kept: they are turned into line
  # breaks below to split the text into words.
  grep "\"text\":" data/manifests/aidatatang_200zh/supervisions_train.json \
    | sed -e 's/["text:\t]*//g' | sed 's/,//g' \
    | ./local/text2token.py -t "char" > "$lang_char_dir/text_words"
  sed 's/ /\n/g' "$lang_char_dir/text_words" | sort -u | sed '/^$/d' \
    | uniq > "$lang_char_dir/words_no_ids.txt"
  if [ ! -f "$lang_char_dir/words.txt" ]; then
    # Every line but the last needs a trailing backslash; without it the
    # --output-file line runs as a separate command and aborts the script.
    ./local/prepare_words.py \
      --input-file "$lang_char_dir/words_no_ids.txt" \
      --output-file "$lang_char_dir/words.txt"
  fi
  if [ ! -f "$lang_char_dir/L_disambig.pt" ]; then
    ./local/prepare_char.py
  fi
 fi
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/init.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/init.py
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py
@ -0,0 +1,420 @@
 # Copyright      2021  Piotr Żelasko
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import inspect
 import logging
 from functools import lru_cache
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 import torch
 from lhotse import (
    CutSet,
    Fbank,
    FbankConfig,
    load_manifest,
    load_manifest_lazy,
    set_caching_enabled,
 )
 from lhotse.dataset import (
    CutConcatenate,
    CutMix,
    DynamicBucketingSampler,
    K2SpeechRecognitionDataset,
    PrecomputedFeatures,
    SingleCutSampler,
    SpecAugment,
 )
 from lhotse.dataset.input_strategies import OnTheFlyFeatures
 from lhotse.utils import fix_random_seed
 from torch.utils.data import DataLoader
 from icefall.utils import str2bool
 # Module-level settings applied as soon as this file is imported.
 # NOTE(review): presumably this turns off lhotse's internal caching -
 # confirm against the lhotse documentation.
 set_caching_enabled(False)
 # Limit PyTorch to a single intra-op thread in this process.
 torch.set_num_threads(1)
 class _SeedWorkers:
    def __init__(self, seed: int):
        self.seed = seed
    def __call__(self, worker_id: int):
        fix_random_seed(self.seed + worker_id)
 class Aidatatang_200zhAsrDataModule:
    """
    DataModule for k2 ASR experiments.
    It assumes there is always one train and valid dataloader,
    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
    and test-other).
    It contains all the common data pipeline modules used in ASR
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - augmentation,
    - on-the-fly feature extraction
    This class should be derived for specific corpora used in ASR tasks.
    """
    def __init__(self, args: argparse.Namespace):
        """
        Args:
          args:
            Parsed command-line options; the attributes read by this class
            are the ones registered in :meth:`add_arguments`.
        """
        self.args = args
    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Register the data-related command-line options on ``parser``.

        Args:
          parser:
            The parser to extend; the options are placed in an argument
            group titled "ASR data related options".
        """
        group = parser.add_argument_group(
            title="ASR data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )
        group.add_argument(
            "--manifest-dir",
            type=Path,
            default=Path("data/fbank"),
            help="Path to directory with train/dev/test cuts.",
        )
        # A duration in seconds: parsed as float so that the declared type
        # agrees with the float default and fractional values are accepted.
        group.add_argument(
            "--max-duration",
            type=float,
            default=200.0,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )
        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=300,
            help="The number of buckets for the DynamicBucketingSampler "
            "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
            "--concatenate-cuts",
            type=str2bool,
            default=False,
            help="When enabled, utterances (cuts) will be concatenated "
            "to minimize the amount of padding.",
        )
        group.add_argument(
            "--duration-factor",
            type=float,
            default=1.0,
            help="Determines the maximum duration of a concatenated cut "
            "relative to the duration of the longest cut in a batch.",
        )
        group.add_argument(
            "--gap",
            type=float,
            default=1.0,
            help="The amount of padding (in seconds) inserted between "
            "concatenated cuts. This padding is filled with noise when "
            "noise augmentation is used.",
        )
        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )
        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
        )
        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=True,
            help="When enabled, each batch will have the "
            "field: batch['supervisions']['cut'] with the cuts that "
            "were used to construct it.",
        )
        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="The number of training dataloader workers that "
            "collect the batches.",
        )
        group.add_argument(
            "--enable-spec-aug",
            type=str2bool,
            default=True,
            help="When enabled, use SpecAugment for training dataset.",
        )
        group.add_argument(
            "--spec-aug-time-warp-factor",
            type=int,
            default=80,
            help="Used only when --enable-spec-aug is True. "
            "It specifies the factor for time warping in SpecAugment. "
            "Larger values mean more warping. "
            "A value less than 1 means to disable time warp.",
        )
        group.add_argument(
            "--enable-musan",
            type=str2bool,
            default=True,
            help="When enabled, select noise from MUSAN and mix it "
            "with training dataset. ",
        )
    def train_dataloaders(
        self,
        cuts_train: CutSet,
        sampler_state_dict: Optional[Dict[str, Any]] = None,
    ) -> DataLoader:
        """Create the training dataloader.

        Args:
          cuts_train:
            CutSet for training.
          sampler_state_dict:
            The state dict for the training sampler. When given, it is
            loaded into the sampler of the returned dataloader.
        Returns:
          The training ``DataLoader``.
        """
        transforms = []
        if self.args.enable_musan:
            logging.info("Enable MUSAN")
            # The MUSAN manifest is loaded only when noise mixing is enabled,
            # so running with --enable-musan false does not need the file.
            logging.info("About to get Musan cuts")
            cuts_musan = load_manifest(
                self.args.manifest_dir / "musan_cuts.jsonl.gz"
            )
            transforms.append(
                CutMix(
                    cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True
                )
            )
        else:
            logging.info("Disable MUSAN")
        if self.args.concatenate_cuts:
            logging.info(
                f"Using cut concatenation with duration factor "
                f"{self.args.duration_factor} and gap {self.args.gap}."
            )
            # Cut concatenation should be the first transform in the list,
            # so that if we e.g. mix noise in, it will fill the gaps between
            # different utterances.
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms
        input_transforms = []
        if self.args.enable_spec_aug:
            logging.info("Enable SpecAugment")
            logging.info(
                f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
            )
            # Set the value of num_frame_masks according to Lhotse's version.
            # In different Lhotse's versions, the default of num_frame_masks is
            # different.
            num_frame_masks = 10
            num_frame_masks_parameter = inspect.signature(
                SpecAugment.__init__
            ).parameters["num_frame_masks"]
            if num_frame_masks_parameter.default == 1:
                num_frame_masks = 2
            logging.info(f"Num frame mask: {num_frame_masks}")
            input_transforms.append(
                SpecAugment(
                    time_warp_factor=self.args.spec_aug_time_warp_factor,
                    num_frame_masks=num_frame_masks,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100,
                )
            )
        else:
            logging.info("Disable SpecAugment")
        logging.info("About to create train dataset")
        train = K2SpeechRecognitionDataset(
            cut_transforms=transforms,
            input_transforms=input_transforms,
            return_cuts=self.args.return_cuts,
        )
        if self.args.on_the_fly_feats:
            # NOTE: the PerturbSpeed transform should be added only if we
            # remove it from data prep stage.
            # Add on-the-fly speed perturbation; since originally it would
            # have increased epoch size by 3, we will apply prob 2/3 and use
            # 3x more epochs.
            # Speed perturbation probably should come first before
            # concatenation, but in principle the transforms order doesn't have
            # to be strict (e.g. could be randomized)
            # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms   # noqa
            # Drop feats to be on the safe side.
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
                input_transforms=input_transforms,
                return_cuts=self.args.return_cuts,
            )
        if self.args.bucketing_sampler:
            logging.info("Using DynamicBucketingSampler.")
            train_sampler = DynamicBucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                drop_last=True,
            )
        else:
            logging.info("Using SingleCutSampler.")
            train_sampler = SingleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
            )
        logging.info("About to create train dataloader")
        # 'seed' is derived from the current random state, which will have
        # previously been set in the main process.
        seed = torch.randint(0, 100000, ()).item()
        worker_init_fn = _SeedWorkers(seed)
        # batch_size=None: the sampler already yields whole batches.
        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
            worker_init_fn=worker_init_fn,
        )
        if sampler_state_dict is not None:
            logging.info("Loading sampler state dict")
            train_dl.sampler.load_state_dict(sampler_state_dict)
        return train_dl
    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
        """Create the validation dataloader.

        Args:
          cuts_valid:
            CutSet for validation.
        Returns:
          A ``DataLoader`` over an unshuffled ``DynamicBucketingSampler``.
        """
        transforms = []
        if self.args.concatenate_cuts:
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms
        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(
                    Fbank(FbankConfig(num_mel_bins=80))
                ),
                return_cuts=self.args.return_cuts,
            )
        else:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                return_cuts=self.args.return_cuts,
            )
        valid_sampler = DynamicBucketingSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        logging.info("About to create dev dataloader")
        from lhotse.dataset.iterable_dataset import IterableDatasetWrapper
        # The wrapper pairs the dataset with its sampler, so the DataLoader
        # receives no separate sampler argument.
        dev_iter_dataset = IterableDatasetWrapper(
            dataset=validate,
            sampler=valid_sampler,
        )
        valid_dl = DataLoader(
            dev_iter_dataset,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
        )
        return valid_dl
    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        """Create a test dataloader.

        Args:
          cuts:
            CutSet to decode.
        Returns:
          A ``DataLoader`` over an unshuffled ``DynamicBucketingSampler``.
        """
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
            if self.args.on_the_fly_feats
            else PrecomputedFeatures(),
            return_cuts=self.args.return_cuts,
        )
        sampler = DynamicBucketingSampler(
            cuts,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        from lhotse.dataset.iterable_dataset import IterableDatasetWrapper
        test_iter_dataset = IterableDatasetWrapper(
            dataset=test,
            sampler=sampler,
        )
        test_dl = DataLoader(
            test_iter_dataset,
            batch_size=None,
            num_workers=self.args.num_workers,
        )
        return test_dl
    @lru_cache()
    def train_cuts(self) -> CutSet:
        """Lazily open the training cuts manifest under ``manifest_dir``."""
        logging.info("About to get train cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
        )
    @lru_cache()
    def valid_cuts(self) -> CutSet:
        """Lazily open the dev cuts manifest under ``manifest_dir``."""
        logging.info("About to get dev cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz"
        )
    @lru_cache()
    def test_cuts(self) -> List[CutSet]:
        """Lazily open the test cuts manifest under ``manifest_dir``.

        NOTE(review): the annotation says ``List[CutSet]``, but the body
        returns whatever a single ``load_manifest_lazy`` call returns.
        """
        logging.info("About to get test cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
        )
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/beam_search.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/beam_search.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless2/beam_search.py
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/conformer.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/conformer.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless2/conformer.py
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/decode.py
@ -0,0 +1,600 @@
 #!/usr/bin/env python3
 #
 # Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 When training with the L subset, usage:
 (1) greedy search
 ./pruned_transducer_stateless2/decode.py \
        --epoch 6 \
        --avg 3 \
        --exp-dir ./pruned_transducer_stateless2/exp \
        --lang-dir data/lang_char \
        --max-duration 100 \
        --decoding-method greedy_search
 (2) modified beam search
 ./pruned_transducer_stateless2/decode.py \
        --epoch 6 \
        --avg 3 \
        --exp-dir ./pruned_transducer_stateless2/exp \
        --lang-dir data/lang_char \
        --max-duration 100 \
        --decoding-method modified_beam_search \
        --beam-size 4
 (3) fast beam search
 ./pruned_transducer_stateless2/decode.py \
        --epoch 6 \
        --avg 3 \
        --exp-dir ./pruned_transducer_stateless2/exp \
        --lang-dir data/lang_char \
        --max-duration 1500 \
        --decoding-method fast_beam_search \
        --beam 4 \
        --max-contexts 4 \
        --max-states 8
 """
 import argparse
 import logging
 from collections import defaultdict
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 import k2
 import torch
 import torch.nn as nn
 from asr_datamodule import Aidatatang_200zhAsrDataModule
 from beam_search import (
    beam_search,
    fast_beam_search_one_best,
    greedy_search,
    greedy_search_batch,
    modified_beam_search,
 )
 from train import get_params, get_transducer_model
 from icefall.checkpoint import (
    average_checkpoints,
    find_checkpoints,
    load_checkpoint,
 )
 from icefall.lexicon import Lexicon
 from icefall.utils import (
    AttributeDict,
    setup_logger,
    store_transcripts,
    write_error_stats,
 )
 def get_parser():
    """Build the command-line parser for the decoding script.

    Returns:
      An ``argparse.ArgumentParser`` that prints default values in its help
      output and defines the checkpoint-selection options (``--epoch``,
      ``--batch``, ``--avg``, ``--avg-last-n``), the directories
      (``--exp-dir``, ``--lang-dir``) and the search options
      (``--decoding-method``, ``--beam-size``, ``--beam``, ``--max-contexts``,
      ``--max-states``, ``--context-size``, ``--max-sym-per-frame``).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--epoch",
        type=int,
        default=28,
        # Adjacent string literals are joined verbatim, so the separating
        # space has to be part of the first literal.
        help="It specifies the checkpoint to use for decoding. "
        "Note: Epoch counts from 0.",
    )
    parser.add_argument(
        "--batch",
        type=int,
        default=None,
        help="It specifies the batch checkpoint to use for decoding, "
        "i.e., exp_dir/checkpoint-xxx.pt where xxx is this value.",
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=15,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch'. ",
    )
    parser.add_argument(
        "--avg-last-n",
        type=int,
        default=0,
        help="""If positive, --epoch and --avg are ignored and it
        will use the last n checkpoints exp_dir/checkpoint-xxx.pt
        where xxx is the number of processed batches while
        saving that checkpoint.
        """,
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="pruned_transducer_stateless2/exp",
        help="The experiment dir",
    )
    parser.add_argument(
        "--lang-dir",
        type=str,
        default="data/lang_char",
        help="""The lang dir
        It contains language related input files such as
        "lexicon.txt"
        """,
    )
    parser.add_argument(
        "--decoding-method",
        type=str,
        default="greedy_search",
        help="""Possible values are:
          - greedy_search
          - beam_search
          - modified_beam_search
          - fast_beam_search
        """,
    )
    parser.add_argument(
        "--beam-size",
        type=int,
        default=4,
        help="""An integer indicating how many candidates we will keep for each
        frame. Used only when --decoding-method is beam_search or
        modified_beam_search.""",
    )
    parser.add_argument(
        "--beam",
        type=float,
        default=4,
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
        Used only when --decoding-method is fast_beam_search""",
    )
    parser.add_argument(
        "--max-contexts",
        type=int,
        default=4,
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
        "--max-states",
        type=int,
        default=8,
        help="""Used only when --decoding-method is
        fast_beam_search""",
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; "
        "2 means tri-gram",
    )
    parser.add_argument(
        "--max-sym-per-frame",
        type=int,
        default=1,
        help="""Maximum number of symbols per frame.
        Used only when --decoding-method is greedy_search""",
    )
    return parser
 def decode_one_batch(
    params: AttributeDict,
    model: nn.Module,
    lexicon: Lexicon,
    batch: dict,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[List[str]]]:
    """Run the configured search on one batch and map token ids to symbols.

    The encoder output for ``batch["inputs"]`` is handed to the search
    selected by ``params.decoding_method``; the resulting token ids are
    looked up in ``lexicon.token_table``.

    Args:
      params:
        Decoding configuration. The attributes read here are
        ``decoding_method``, ``max_sym_per_frame``, ``beam``, ``beam_size``,
        ``max_contexts`` and ``max_states``.
      model:
        The neural model. It must expose ``device`` and ``encoder``.
      lexicon:
        Provides ``token_table``, used to turn token ids into symbols.
      batch:
        A dict with a 3-D ``"inputs"`` tensor and a ``"supervisions"`` dict
        holding ``"num_frames"``.
      decoding_graph:
        Passed to ``fast_beam_search_one_best``; unused by the other methods.
    Returns:
      A dict with a single entry. The key names the setting:
      ``"greedy_search"``, ``"beam_<beam>_max_contexts_<n>_max_states_<n>"``
      for fast beam search, or ``"beam_size_<n>"`` otherwise. The value holds
      one list of symbols per utterance of the batch.
    Raises:
      ValueError: if an utterance has to be decoded individually and the
        decoding method is neither ``greedy_search`` nor ``beam_search``.
    """
    device = model.device

    feature = batch["inputs"]
    assert feature.ndim == 3
    # at entry, feature is (N, T, C)
    feature = feature.to(device)

    feature_lens = batch["supervisions"]["num_frames"].to(device)
    encoder_out, encoder_out_lens = model.encoder(
        x=feature,
        x_lens=feature_lens,
    )

    method = params.decoding_method

    def batch_to_symbols(token_ids_per_utt):
        # One symbol list per utterance, indexed by position in the batch.
        return [
            [lexicon.token_table[idx] for idx in token_ids_per_utt[i]]
            for i in range(encoder_out.size(0))
        ]

    if method == "fast_beam_search":
        hyps = batch_to_symbols(
            fast_beam_search_one_best(
                model=model,
                decoding_graph=decoding_graph,
                encoder_out=encoder_out,
                encoder_out_lens=encoder_out_lens,
                beam=params.beam,
                max_contexts=params.max_contexts,
                max_states=params.max_states,
            )
        )
    elif method == "greedy_search" and params.max_sym_per_frame == 1:
        hyps = batch_to_symbols(
            greedy_search_batch(
                model=model,
                encoder_out=encoder_out,
                encoder_out_lens=encoder_out_lens,
            )
        )
    elif method == "modified_beam_search":
        hyps = batch_to_symbols(
            modified_beam_search(
                model=model,
                encoder_out=encoder_out,
                encoder_out_lens=encoder_out_lens,
                beam=params.beam_size,
            )
        )
    else:
        # No batched implementation applies: decode utterance by utterance.
        hyps = []
        for i in range(encoder_out.size(0)):
            # fmt: off
            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
            # fmt: on
            if method == "greedy_search":
                hyp = greedy_search(
                    model=model,
                    encoder_out=encoder_out_i,
                    max_sym_per_frame=params.max_sym_per_frame,
                )
            elif method == "beam_search":
                hyp = beam_search(
                    model=model,
                    encoder_out=encoder_out_i,
                    beam=params.beam_size,
                )
            else:
                raise ValueError(
                    f"Unsupported decoding method: {params.decoding_method}"
                )
            hyps.append([lexicon.token_table[idx] for idx in hyp])

    if method == "greedy_search":
        return {"greedy_search": hyps}
    if method == "fast_beam_search":
        setting = (
            f"beam_{params.beam}_"
            f"max_contexts_{params.max_contexts}_"
            f"max_states_{params.max_states}"
        )
        return {setting: hyps}
    return {f"beam_size_{params.beam_size}": hyps}
 def decode_dataset(
    dl: torch.utils.data.DataLoader,
    params: AttributeDict,
    model: nn.Module,
    lexicon: Lexicon,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
    """Decode every batch of a dataloader and collect (ref, hyp) pairs.

    Args:
      dl:
        The dataloader to iterate over. It does not need to support ``len``.
      params:
        Decoding configuration; forwarded to :func:`decode_one_batch`.
      model:
        The neural model; forwarded to :func:`decode_one_batch`.
      lexicon:
        Forwarded to :func:`decode_one_batch`.
      decoding_graph:
        Forwarded to :func:`decode_one_batch`.
    Returns:
      A dict keyed by the setting names returned by :func:`decode_one_batch`.
      Each value is a list of ``(reference, hypothesis)`` tuples, where the
      reference is the supervision text with spaces removed and split into
      characters.
    """
    try:
        num_batches = len(dl)
    except TypeError:
        # The loader has no length, so progress is reported as "i/?".
        num_batches = "?"

    log_interval = 100 if params.decoding_method == "greedy_search" else 50

    results = defaultdict(list)
    num_cuts = 0
    for batch_idx, batch in enumerate(dl):
        texts = [
            list(str(text).replace(" ", ""))
            for text in batch["supervisions"]["text"]
        ]

        hyps_dict = decode_one_batch(
            params=params,
            model=model,
            lexicon=lexicon,
            decoding_graph=decoding_graph,
            batch=batch,
        )

        for name, hyps in hyps_dict.items():
            assert len(hyps) == len(texts)
            # Pair each reference with its hypothesis, reference first.
            results[name].extend(zip(texts, hyps))

        num_cuts += len(texts)

        if batch_idx % log_interval == 0:
            batch_str = f"{batch_idx}/{num_batches}"
            logging.info(
                f"batch {batch_str}, cuts processed until now is {num_cuts}"
            )
    return results
 def save_results(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[List[str], List[str]]]],
 ):
    """Write transcripts, error statistics and a WER summary for a test set.

    For every setting in ``results_dict`` a ``recogs-*.txt`` file is written
    with ``store_transcripts`` and an ``errs-*.txt`` file with
    ``write_error_stats``. A ``wer-summary-*.txt`` file then lists all
    settings sorted by the value returned from ``write_error_stats``, and
    the same table is logged.

    Args:
      params:
        Must provide ``res_dir`` (a path supporting ``/``) and ``suffix``.
      test_set_name:
        Name of the decoded set; it is embedded in every file name.
      results_dict:
        Maps a setting name to a list of ``(reference, hypothesis)`` pairs,
        as produced by :func:`decode_dataset`.
    """
    if not results_dict:
        # Without any setting there is nothing to write, and the summary
        # file name below could not be formed.
        logging.warning(
            f"No decoding results for {test_set_name}; nothing is saved"
        )
        return

    test_set_wers = dict()
    last_key = None
    for key, results in results_dict.items():
        last_key = key
        recog_path = (
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        store_transcripts(filename=recog_path, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
        # ref/hyp pairs.
        errs_filename = (
            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
                f, f"{test_set_name}-{key}", results, enable_log=True
            )
            test_set_wers[key] = wer

        logging.info("Wrote detailed error stats to {}".format(errs_filename))

    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
    # The summary file name keeps the existing scheme: it carries the last
    # setting iterated above, even when several settings are summarised.
    errs_info = (
        params.res_dir
        / f"wer-summary-{test_set_name}-{last_key}-{params.suffix}.txt"
    )
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
        for setting, val in test_set_wers:
            print("{}\t{}".format(setting, val), file=f)

    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
    # Only the first (lowest) entry is tagged as the best one.
    note = "\tbest for {}".format(test_set_name)
    for setting, val in test_set_wers:
        s += "{}\t{}{}\n".format(setting, val, note)
        note = ""
    logging.info(s)
@torch.no_grad()
def main():
    """Decode the aidatatang_200zh dev and test sets with a trained model.

    Steps:
      1. Parse the command line and merge it into ``get_params()``.
      2. Build the model and load weights. The source is chosen in this
         order: the last ``--avg-last-n`` checkpoints, the single epoch
         checkpoint when ``--avg`` is 1, the ``--batch`` checkpoint, or the
         average of the ``--avg`` epoch checkpoints ending at ``--epoch``.
      3. Export the dev/test cuts to webdataset shards under ``dev/`` and
         ``test/`` unless ``shared-0.tar`` already exists there.
      4. Decode both sets and save the results with :func:`save_results`.
    """
    parser = get_parser()
    Aidatatang_200zhAsrDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)

    params = get_params()
    params.update(vars(args))

    assert params.decoding_method in (
        "greedy_search",
        "beam_search",
        "fast_beam_search",
        "modified_beam_search",
    )
    params.res_dir = params.exp_dir / params.decoding_method

    params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
    if "fast_beam_search" in params.decoding_method:
        params.suffix += f"-beam-{params.beam}"
        params.suffix += f"-max-contexts-{params.max_contexts}"
        params.suffix += f"-max-states-{params.max_states}"
    elif "beam_search" in params.decoding_method:
        params.suffix += f"-beam-{params.beam_size}"
    else:
        params.suffix += f"-context-{params.context_size}"
        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"

    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    logging.info(f"Device: {device}")

    lexicon = Lexicon(params.lang_dir)
    params.blank_id = lexicon.token_table["<blk>"]
    params.vocab_size = max(lexicon.tokens) + 1

    logging.info(params)

    logging.info("About to create model")
    model = get_transducer_model(params)

    if params.avg_last_n > 0:
        filenames = find_checkpoints(params.exp_dir)[: params.avg_last_n]
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(average_checkpoints(filenames, device=device))
    elif params.avg == 1:
        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    elif params.batch is not None:
        filename = f"{params.exp_dir}/checkpoint-{params.batch}.pt"
        logging.info(f"averaging {filename}")
        model.to(device)
        model.load_state_dict(average_checkpoints([filename], device=device))
    else:
        start = params.epoch - params.avg + 1
        # Epochs count from 0, so negative indices are skipped one by one.
        # Testing `start` instead of `i` would drop every checkpoint as soon
        # as --avg exceeds --epoch + 1.
        filenames = [
            f"{params.exp_dir}/epoch-{i}.pt"
            for i in range(start, params.epoch + 1)
            if i >= 0
        ]
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(average_checkpoints(filenames, device=device))

    model.to(device)
    model.eval()
    model.device = device

    if params.decoding_method == "fast_beam_search":
        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
    else:
        decoding_graph = None

    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

    # Note: Please use "pip install webdataset==0.1.103"
    # for installing the webdataset.
    import glob
    import os

    from lhotse import CutSet
    from lhotse.dataset.webdataset import export_to_webdataset

    aidatatang_200zh = Aidatatang_200zhAsrDataModule(args)

    def export_shards_if_missing(shard_dir, get_cuts):
        # Export only when the first shard is absent. exist_ok covers a
        # directory left behind by an earlier, interrupted export.
        if not os.path.exists(f"{shard_dir}/shared-0.tar"):
            os.makedirs(shard_dir, exist_ok=True)
            export_to_webdataset(
                get_cuts(),
                output_path=f"{shard_dir}/shared-%d.tar",
                shard_size=300,
            )

    def cuts_from_shards(shard_dir):
        # Collect the shard files in sorted order and wrap them in a CutSet.
        shards = [
            str(path)
            for path in sorted(
                glob.glob(os.path.join(shard_dir, "shared-*.tar"))
            )
        ]
        return CutSet.from_webdataset(
            shards,
            split_by_worker=True,
            split_by_node=True,
            shuffle_shards=True,
        )

    dev = "dev"
    test = "test"

    export_shards_if_missing(dev, aidatatang_200zh.valid_cuts)
    export_shards_if_missing(test, aidatatang_200zh.test_cuts)

    cuts_dev_webdataset = cuts_from_shards(dev)
    cuts_test_webdataset = cuts_from_shards(test)

    dev_dl = aidatatang_200zh.valid_dataloaders(cuts_dev_webdataset)
    test_dl = aidatatang_200zh.test_dataloaders(cuts_test_webdataset)

    test_sets = ["dev", "test"]
    test_dls = [dev_dl, test_dl]

    for test_set, dl in zip(test_sets, test_dls):
        results_dict = decode_dataset(
            dl=dl,
            params=params,
            model=model,
            lexicon=lexicon,
            decoding_graph=decoding_graph,
        )
        save_results(
            params=params,
            test_set_name=test_set,
            results_dict=results_dict,
        )

    logging.info("Done!")
 # Script entry point: decoding runs only when this file is executed directly.
 if __name__ == "__main__":
    main()
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/decoder.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/decoder.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless2/decoder.py
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/encoder_interface.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/encoder_interface.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/transducer_stateless/encoder_interface.py
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/export.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/export.py
@ -0,0 +1,181 @@
 # Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # This script converts several saved checkpoints
 # to a single one using model averaging.
 """
 Usage:
 ./pruned_transducer_stateless2/export.py \
  --exp-dir ./pruned_transducer_stateless2/exp \
  --lang-dir data/lang_char \
  --epoch 29 \
  --avg 19
 It will generate a file exp_dir/pretrained.pt
 To use the generated file with `pruned_transducer_stateless2/decode.py`,
 you can do:
    cd /path/to/exp_dir
    ln -s pretrained.pt epoch-9999.pt
    cd /path/to/egs/aidatatang_200zh/ASR
    ./pruned_transducer_stateless2/decode.py \
        --exp-dir ./pruned_transducer_stateless2/exp \
        --epoch 9999 \
        --avg 1 \
        --max-duration 100 \
        --lang-dir data/lang_char
 """
 import argparse
 import logging
 from pathlib import Path
 import torch
 from train import get_params, get_transducer_model
 from icefall.checkpoint import average_checkpoints, load_checkpoint
 from icefall.lexicon import Lexicon
 from icefall.utils import str2bool
 def get_parser():
    """Build the command-line parser for the export script.

    Returns:
      An ``argparse.ArgumentParser`` that prints default values in its help
      output and defines ``--epoch``, ``--avg``, ``--exp-dir``,
      ``--lang-dir``, ``--jit`` and ``--context-size``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--epoch",
        type=int,
        default=28,
        # Adjacent string literals are joined verbatim, so the separating
        # space has to be part of the first literal.
        help="It specifies the checkpoint to use for decoding. "
        "Note: Epoch counts from 0.",
    )
    parser.add_argument(
        "--avg",
        type=int,
        default=15,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch'. ",
    )
    parser.add_argument(
        "--exp-dir",
        type=str,
        default="pruned_transducer_stateless2/exp",
        help="""It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )
    parser.add_argument(
        "--lang-dir",
        type=str,
        default="data/lang_char",
        help="The lang dir",
    )
    parser.add_argument(
        "--jit",
        type=str2bool,
        default=False,
        help="""True to save a model after applying torch.jit.script.
        """,
    )
    parser.add_argument(
        "--context-size",
        type=int,
        default=2,
        help="The context size in the decoder. 1 means bigram; "
        "2 means tri-gram",
    )
    return parser
 def main():
    """Average the selected checkpoints and save a single exported model.

    The model is built from ``get_params()`` merged with the command line.
    With ``--avg 1`` the checkpoint ``epoch-<epoch>.pt`` is loaded;
    otherwise the ``--avg`` epoch checkpoints ending at ``--epoch`` are
    averaged. The result is moved to the CPU and written to ``exp_dir`` as
    ``cpu_jit.pt`` (with ``--jit``) or as ``pretrained.pt``.
    """
    args = get_parser().parse_args()
    args.exp_dir = Path(args.exp_dir)

    params = get_params()
    params.update(vars(args))

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)

    logging.info(f"device: {device}")

    lexicon = Lexicon(params.lang_dir)

    params.blank_id = 0
    params.vocab_size = max(lexicon.tokens) + 1

    logging.info(params)

    logging.info("About to create model")
    model = get_transducer_model(params)

    model.to(device)

    if params.avg == 1:
        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    else:
        start = params.epoch - params.avg + 1
        # Epochs count from 0, so negative indices are skipped one by one.
        # Testing `start` instead of `i` would drop every checkpoint as soon
        # as --avg exceeds --epoch + 1.
        filenames = [
            f"{params.exp_dir}/epoch-{i}.pt"
            for i in range(start, params.epoch + 1)
            if i >= 0
        ]
        logging.info(f"averaging {filenames}")
        model.load_state_dict(average_checkpoints(filenames, device=device))

    # The exported model is always stored on the CPU and in eval mode.
    model.to("cpu")
    model.eval()

    if params.jit:
        # We won't use the forward() method of the model in C++, so just ignore
        # it here.
        # Otherwise, one of its arguments is a ragged tensor and is not
        # torch scriptable.
        model.__class__.forward = torch.jit.ignore(model.__class__.forward)
        logging.info("Using torch.jit.script")
        model = torch.jit.script(model)
        filename = params.exp_dir / "cpu_jit.pt"
        model.save(str(filename))
        logging.info(f"Saved to {filename}")
    else:
        logging.info("Not using torch.jit.script")
        # Save it using a format so that it can be loaded
        # by :func:`load_checkpoint`
        filename = params.exp_dir / "pretrained.pt"
        torch.save({"model": model.state_dict()}, str(filename))
        logging.info(f"Saved to {filename}")
 if __name__ == "__main__":
    # Configure INFO-level logging with timestamp and source location
    # before running the export.
    logging.basicConfig(
        format=(
            "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
        ),
        level=logging.INFO,
    )
    main()
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/joiner.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/joiner.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless2/joiner.py
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/model.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/model.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless2/model.py
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/optim.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/optim.py
@ -0,0 +1 @@
 ../../../librispeech/ASR/pruned_transducer_stateless2/optim.py
--- a/Show More
+++ b/Show More
		`@ -0,0 +1,4 @@`

							`# Introduction`

							`<https://shields.io/> is used to generate files in this directory.`
		`@ -0,0 +1 @@`
							<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="80" height="20" role="img" aria-label="k2: >= v1.9"><title>k2: >= v1.9</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="80" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="23" height="20" fill="#555"/><rect x="23" width="57" height="20" fill="blueviolet"/><rect width="80" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="125" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="130">k2</text><text x="125" y="140" transform="scale(.1)" fill="#fff" textLength="130">k2</text><text aria-hidden="true" x="505" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="470">>= v1.9</text><text x="505" y="140" transform="scale(.1)" fill="#fff" textLength="470">>= v1.9</text></g></svg>
		`@ -1 +0,0 @@`
			<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="58" height="20" role="img" aria-label="k2: v1.9"><title>k2: v1.9</title><linearGradient id="s" x2="0" y2="100%"><stop offset="0" stop-color="#bbb" stop-opacity=".1"/><stop offset="1" stop-opacity=".1"/></linearGradient><clipPath id="r"><rect width="58" height="20" rx="3" fill="#fff"/></clipPath><g clip-path="url(#r)"><rect width="23" height="20" fill="#555"/><rect x="23" width="35" height="20" fill="blueviolet"/><rect width="58" height="20" fill="url(#s)"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="110"><text aria-hidden="true" x="125" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="130">k2</text><text x="125" y="140" transform="scale(.1)" fill="#fff" textLength="130">k2</text><text aria-hidden="true" x="395" y="150" fill="#010101" fill-opacity=".3" transform="scale(.1)" textLength="250">v1.9</text><text x="395" y="140" transform="scale(.1)" fill="#fff" textLength="250">v1.9</text></g></svg>
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/local/compute_fbank_musan.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/pruned_transducer_stateless2/beam_search.py`
		`@ -0,0 +1 @@`
							`../../../librispeech/ASR/transducer_stateless/encoder_interface.py`