diff --git a/.flake8 b/.flake8
new file mode 100644
index 000000000..cf276d0ba
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,35 @@
+[flake8]
+show-source = true
+statistics = true
+max-line-length = 88
+per-file-ignores =
+ # line too long
+ icefall/diagnostics.py: E501,
+ egs/*/ASR/*/conformer.py: E501,
+ egs/*/ASR/pruned_transducer_stateless*/*.py: E501,
+ egs/*/ASR/*/optim.py: E501,
+ egs/*/ASR/*/scaling.py: E501,
+ egs/librispeech/ASR/lstm_transducer_stateless*/*.py: E501, E203
+ egs/librispeech/ASR/conv_emformer_transducer_stateless*/*.py: E501, E203
+ egs/librispeech/ASR/conformer_ctc*/*.py: E501,
+ egs/librispeech/ASR/zipformer_mmi/*.py: E501, E203
+ egs/librispeech/ASR/zipformer/*.py: E501, E203
+ egs/librispeech/ASR/RESULTS.md: E999,
+ egs/ljspeech/TTS/vits/*.py: E501, E203
+ # invalid escape sequence (caused by TeX formulas), W605
+ icefall/utils.py: E501, W605
+
+exclude =
+ .git,
+ **/data/**,
+ icefall/shared/make_kn_lm.py,
+ icefall/__init__.py,
+ icefall/ctc/__init__.py
+
+ignore =
+ # E203 whitespace before ":"
+ E203,
+ # W503 line break before binary operator
+ W503,
+ # E226 missing whitespace around arithmetic operator
+ E226,
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 000000000..5d65b98e9
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,5 @@
+# Migrate to 88 characters per line (see: https://github.com/lhotse-speech/lhotse/issues/890)
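+# To make "git blame" honor this file locally, run:
+#   git config blame.ignoreRevsFile .git-blame-ignore-revs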
+107df3b115a58f1b68a6458c3f94a130004be34c
+d31db010371a4128856480382876acdc0d1739ed
diff --git a/.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh b/.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
new file mode 100755
index 000000000..0bec8c0c4
--- /dev/null
+++ b/.github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+
+# This script computes fbank features for the test-clean and test-other datasets.
+# The computed features are saved to ~/tmp/fbank-libri, which is
+# cached for later runs.
+
+set -e
+
+export PYTHONPATH=$PWD:$PYTHONPATH
+echo $PYTHONPATH
+
+mkdir -p ~/tmp/fbank-libri
+cd egs/librispeech/ASR
+mkdir -p data
+cd data
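+# Reuse the cached fbank directory via a symlink if it is not present yet.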
+[ ! -e fbank ] && ln -s ~/tmp/fbank-libri fbank
+cd ..
+./local/compute_fbank_librispeech.py --dataset 'test-clean test-other'
+ls -lh data/fbank/
diff --git a/.github/scripts/download-gigaspeech-dev-test-dataset.sh b/.github/scripts/download-gigaspeech-dev-test-dataset.sh
new file mode 100755
index 000000000..f3564efc7
--- /dev/null
+++ b/.github/scripts/download-gigaspeech-dev-test-dataset.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# This script downloads the pre-computed fbank features for
+# the dev and test datasets of GigaSpeech.
+#
+# You will find the directory ~/tmp/giga-dev-dataset-fbank after running
+# this script.
+
+set -e
+
+mkdir -p ~/tmp
+cd ~/tmp
+
+git lfs install
+git clone https://huggingface.co/csukuangfj/giga-dev-dataset-fbank
+
+ls -lh giga-dev-dataset-fbank/data/fbank
diff --git a/.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh b/.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
new file mode 100755
index 000000000..11704526c
--- /dev/null
+++ b/.github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# This script downloads the test-clean and test-other datasets
+# of LibriSpeech and extracts them to the folder ~/tmp/download,
+# which is cached by GitHub Actions for later runs.
+#
+# You will find the directory ~/tmp/download/LibriSpeech after running
+# this script.
+
+set -e
+
+mkdir -p ~/tmp/download
+cd egs/librispeech/ASR
+ln -s ~/tmp/download .
+cd download
+wget -q --no-check-certificate https://www.openslr.org/resources/12/test-clean.tar.gz
+tar xf test-clean.tar.gz
+rm test-clean.tar.gz
+
+wget -q --no-check-certificate https://www.openslr.org/resources/12/test-other.tar.gz
+tar xf test-other.tar.gz
+rm test-other.tar.gz
+pwd
+ls -lh
+ls -lh LibriSpeech
diff --git a/.github/scripts/install-kaldifeat.sh b/.github/scripts/install-kaldifeat.sh
new file mode 100755
index 000000000..de30f7dfe
--- /dev/null
+++ b/.github/scripts/install-kaldifeat.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+# This script installs kaldifeat into the directory ~/tmp/kaldifeat
+# which is cached by GitHub actions for later runs.
+
+set -e
+
+mkdir -p ~/tmp
+cd ~/tmp
+git clone https://github.com/csukuangfj/kaldifeat
+cd kaldifeat
+mkdir build
+cd build
+cmake -DCMAKE_BUILD_TYPE=Release ..
+make -j2 _kaldifeat
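+
+# Note: nothing is installed system-wide. Jobs that import kaldifeat are
+# expected to add the build outputs under ~/tmp/kaldifeat to PYTHONPATH.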
diff --git a/.github/scripts/multi-zh-hans.sh b/.github/scripts/multi-zh-hans.sh
new file mode 100755
index 000000000..427d8887b
--- /dev/null
+++ b/.github/scripts/multi-zh-hans.sh
@@ -0,0 +1,164 @@
+#!/usr/bin/env bash
+
+set -ex
+
+git config --global user.name "k2-fsa"
+git config --global user.email "csukuangfj@gmail.com"
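+# allowincompletepush lets "git push" succeed even if some local LFS
+# objects are missing.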
+git config --global lfs.allowincompletepush true
+
+log() {
+ # This function is from espnet
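+ # It prints each message prefixed with a timestamp and the calling
+ # script's file name, line number, and function name.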
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+log "pwd: $PWD"
+
+cd egs/multi_zh-hans/ASR
+
+repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05
+log "Downloading pre-trained model from $repo_url"
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+cd exp/
+git lfs pull --include pretrained.pt
+rm -fv epoch-20.pt
+rm -fv *.onnx
+ln -s pretrained.pt epoch-20.pt
+cd ../data/lang_bpe_2000
+ls -lh
+git lfs pull --include "L.pt,L_disambig.pt,Linv.pt,bpe.model"
+git lfs pull --include "*.model"
+ls -lh
+popd
+
+log "----------------------------------------"
+log "Export streaming ONNX CTC models "
+log "----------------------------------------"
+./zipformer/export-onnx-streaming-ctc.py \
+ --exp-dir $repo/exp \
+ --tokens $repo/data/lang_bpe_2000/tokens.txt \
+ --causal 1 \
+ --avg 1 \
+ --epoch 20 \
+ --use-averaged-model 0 \
+ --chunk-size 16 \
+ --left-context-frames 128 \
+ --use-ctc 1
+
+ls -lh $repo/exp/
+
+log "------------------------------------------------------------"
+log "Test exported streaming ONNX CTC models (greedy search) "
+log "------------------------------------------------------------"
+
+test_wavs=(
+DEV_T0000000000.wav
+DEV_T0000000001.wav
+DEV_T0000000002.wav
+TEST_MEETING_T0000000113.wav
+TEST_MEETING_T0000000219.wav
+TEST_MEETING_T0000000351.wav
+)
+
+for w in "${test_wavs[@]}"; do
+ ./zipformer/onnx_pretrained-streaming-ctc.py \
+ --model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
+ --tokens $repo/data/lang_bpe_2000/tokens.txt \
+ $repo/test_wavs/$w
+done
+
+log "Upload onnx CTC models to huggingface"
+url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
+GIT_LFS_SKIP_SMUDGE=1 git clone $url
+dst=$(basename $url)
+cp -v $repo/exp/ctc*.onnx $dst
+cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
+cp -v $repo/data/lang_bpe_2000/bpe.model $dst
+mkdir -p $dst/test_wavs
+cp -v $repo/test_wavs/*.wav $dst/test_wavs
+cd $dst
+git lfs track "*.onnx" "bpe.model"
+ls -lh
+file bpe.model
+git status
+git add .
+git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
+
+log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+rm -rf .git
+rm -fv .gitattributes
+cd ..
+tar cjfv $dst.tar.bz2 $dst
+ls -lh *.tar.bz2
+mv -v $dst.tar.bz2 ../../../
+
+log "----------------------------------------"
+log "Export streaming ONNX transducer models "
+log "----------------------------------------"
+
+./zipformer/export-onnx-streaming.py \
+ --exp-dir $repo/exp \
+ --tokens $repo/data/lang_bpe_2000/tokens.txt \
+ --causal 1 \
+ --avg 1 \
+ --epoch 20 \
+ --use-averaged-model 0 \
+ --chunk-size 16 \
+ --left-context-frames 128 \
+ --use-ctc 0
+
+ls -lh $repo/exp
+
+log "------------------------------------------------------------"
+log "Test exported streaming ONNX transducer models (Python code)"
+log "------------------------------------------------------------"
+
+log "test fp32"
+./zipformer/onnx_pretrained-streaming.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx \
+ --tokens $repo/data/lang_bpe_2000/tokens.txt \
+ $repo/test_wavs/DEV_T0000000000.wav
+
+log "test int8"
+./zipformer/onnx_pretrained-streaming.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
+ --tokens $repo/data/lang_bpe_2000/tokens.txt \
+ $repo/test_wavs/DEV_T0000000000.wav
+
+log "Upload onnx transducer models to huggingface"
+
+url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12
+GIT_LFS_SKIP_SMUDGE=1 git clone $url
+dst=$(basename $url)
+cp -v $repo/exp/encoder*.onnx $dst
+cp -v $repo/exp/decoder*.onnx $dst
+cp -v $repo/exp/joiner*.onnx $dst
+cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
+cp -v $repo/data/lang_bpe_2000/bpe.model $dst
+mkdir -p $dst/test_wavs
+cp -v $repo/test_wavs/*.wav $dst/test_wavs
+cd $dst
+git lfs track "*.onnx" bpe.model
+git add .
+git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
+
+log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+rm -rf .git
+rm -fv .gitattributes
+cd ..
+tar cjfv $dst.tar.bz2 $dst
+ls -lh *.tar.bz2
+mv -v $dst.tar.bz2 ../../../
diff --git a/.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh b/.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
new file mode 100755
index 000000000..1b48aae27
--- /dev/null
+++ b/.github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+# This script assumes that test-clean and test-other are downloaded
+# to egs/librispeech/ASR/download/LibriSpeech and generates manifest
+# files in egs/librispeech/ASR/data/manifests
+
+set -e
+
+cd egs/librispeech/ASR
+[ ! -e download ] && ln -s ~/tmp/download .
+mkdir -p data/manifests
+lhotse prepare librispeech -j 2 -p test-clean -p test-other ./download/LibriSpeech data/manifests
+ls -lh data/manifests
diff --git a/.github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh b/.github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
new file mode 100755
index 000000000..c3640cfde
--- /dev/null
+++ b/.github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/aishell/ASR
+
+git lfs install
+
+fbank_url=https://huggingface.co/csukuangfj/aishell-test-dev-manifests
+log "Downloading pre-commputed fbank from $fbank_url"
+
+git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
+ln -s $PWD/aishell-test-dev-manifests/data .
+
+repo_url=https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
+log "Downloading pre-trained model from $repo_url"
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained-epoch-29-avg-5-torch-1.10.0.pt pretrained.pt
+popd
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless3/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --lang-dir $repo/data/lang_char \
+ $repo/test_wavs/BAC009S0764W0121.wav \
+ $repo/test_wavs/BAC009S0764W0122.wav \
+ $repo/test_wavs/BAC009S0764W0123.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless3/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --lang-dir $repo/data/lang_char \
+ $repo/test_wavs/BAC009S0764W0121.wav \
+ $repo/test_wavs/BAC009S0764W0122.wav \
+ $repo/test_wavs/BAC009S0764W0123.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless3/exp
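+ # decode.py loads checkpoints named epoch-<N>.pt, so symlinking the
+ # pretrained checkpoint as epoch-999.pt and passing --epoch 999 --avg 1
+ # makes decoding use exactly this checkpoint.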
+ ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless3/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_char data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless3/exp
+
+ log "Decoding test and dev"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless3/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless3/exp
+ done
+
+ rm pruned_transducer_stateless3/exp/*.pt
+fi
diff --git a/.github/scripts/run-aishell-zipformer-2023-10-24.sh b/.github/scripts/run-aishell-zipformer-2023-10-24.sh
new file mode 100755
index 000000000..865e29799
--- /dev/null
+++ b/.github/scripts/run-aishell-zipformer-2023-10-24.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/aishell/ASR
+
+git lfs install
+
+fbank_url=https://huggingface.co/csukuangfj/aishell-test-dev-manifests
+log "Downloading pre-commputed fbank from $fbank_url"
+
+git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
+ln -s $PWD/aishell-test-dev-manifests/data .
+
+log "======================="
+log "CI testing large model"
+repo_url=https://huggingface.co/zrjin/icefall-asr-aishell-zipformer-large-2023-10-24/
+log "Downloading pre-trained model from $repo_url"
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+for method in modified_beam_search greedy_search fast_beam_search; do
+ log "$method"
+
+ ./zipformer/pretrained.py \
+ --method $method \
+ --context-size 1 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_char/tokens.txt \
+ --num-encoder-layers 2,2,4,5,4,2 \
+ --feedforward-dim 512,768,1536,2048,1536,768 \
+ --encoder-dim 192,256,512,768,512,256 \
+ --encoder-unmasked-dim 192,192,256,320,256,192 \
+ $repo/test_wavs/BAC009S0764W0121.wav \
+ $repo/test_wavs/BAC009S0764W0122.wav \
+ $repo/test_wavs/BAC009S0764W0123.wav
+done
+
+log "======================="
+log "CI testing medium model"
+repo_url=https://huggingface.co/zrjin/icefall-asr-aishell-zipformer-2023-10-24/
+log "Downloading pre-trained model from $repo_url"
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+
+for method in modified_beam_search greedy_search fast_beam_search; do
+ log "$method"
+
+ ./zipformer/pretrained.py \
+ --method $method \
+ --context-size 1 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_char/tokens.txt \
+ $repo/test_wavs/BAC009S0764W0121.wav \
+ $repo/test_wavs/BAC009S0764W0122.wav \
+ $repo/test_wavs/BAC009S0764W0123.wav
+done
+
+
+log "======================="
+log "CI testing small model"
+repo_url=https://huggingface.co/zrjin/icefall-asr-aishell-zipformer-small-2023-10-24/
+log "Downloading pre-trained model from $repo_url"
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+
+for method in modified_beam_search greedy_search fast_beam_search; do
+ log "$method"
+
+ ./zipformer/pretrained.py \
+ --method $method \
+ --context-size 1 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_char/tokens.txt \
+ --num-encoder-layers 2,2,2,2,2,2 \
+ --feedforward-dim 512,768,768,768,768,768 \
+ --encoder-dim 192,256,256,256,256,256 \
+ --encoder-unmasked-dim 192,192,192,192,192,192 \
+ $repo/test_wavs/BAC009S0764W0121.wav \
+ $repo/test_wavs/BAC009S0764W0122.wav \
+ $repo/test_wavs/BAC009S0764W0123.wav
+done
+
diff --git a/.github/scripts/run-gigaspeech-pruned-transducer-stateless2-2022-05-12.sh b/.github/scripts/run-gigaspeech-pruned-transducer-stateless2-2022-05-12.sh
new file mode 100755
index 000000000..b61a9d7b6
--- /dev/null
+++ b/.github/scripts/run-gigaspeech-pruned-transducer-stateless2-2022-05-12.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/gigaspeech/ASR
+
+repo_url=https://huggingface.co/wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless2/exp
+ ln -s $PWD/$repo/exp/pretrained-iter-3488000-avg-20.pt pruned_transducer_stateless2/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh data/lang_bpe_500
+ ls -lh data/fbank
+ ls -lh pruned_transducer_stateless2/exp
+
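+ # Symlink targets are resolved relative to the link's directory,
+ # hence the bare file names used as targets below.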
+ ln -s cuts_DEV.jsonl.gz data/fbank/gigaspeech_cuts_DEV.jsonl.gz
+ ln -s cuts_TEST.jsonl.gz data/fbank/gigaspeech_cuts_TEST.jsonl.gz
+
+ log "Decoding dev and test"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ # Test only greedy_search to reduce CI running time
+ # for method in greedy_search fast_beam_search modified_beam_search; do
+ for method in greedy_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless2/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless2/exp
+ done
+
+ rm pruned_transducer_stateless2/exp/*.pt
+fi
diff --git a/.github/scripts/run-gigaspeech-zipformer-2023-10-17.sh b/.github/scripts/run-gigaspeech-zipformer-2023-10-17.sh
new file mode 100755
index 000000000..329896ef6
--- /dev/null
+++ b/.github/scripts/run-gigaspeech-zipformer-2023-10-17.sh
@@ -0,0 +1,158 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/gigaspeech/ASR
+
+repo_url=https://huggingface.co/yfyeung/icefall-asr-gigaspeech-zipformer-2023-10-17
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "data/lang_bpe_500/tokens.txt"
+git lfs pull --include "exp/jit_script.pt"
+git lfs pull --include "exp/pretrained.pt"
+rm epoch-30.pt
+ln -s pretrained.pt epoch-30.pt
+rm *.onnx
+ls -lh
+popd
+
+log "----------------------------------------"
+log "Export ONNX transducer models "
+log "----------------------------------------"
+
+./zipformer/export-onnx.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --use-averaged-model 0 \
+ --epoch 30 \
+ --avg 1 \
+ --exp-dir $repo/exp
+
+ls -lh $repo/exp
+
+log "------------------------------------------------------------"
+log "Test exported ONNX transducer models (Python code) "
+log "------------------------------------------------------------"
+
+log "test fp32"
+./zipformer/onnx_pretrained.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-30-avg-1.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-30-avg-1.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-30-avg-1.onnx \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+log "test int8"
+./zipformer/onnx_pretrained.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-30-avg-1.int8.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-30-avg-1.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-30-avg-1.int8.onnx \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+log "Upload models to huggingface"
+git config --global user.name "k2-fsa"
+git config --global user.email "xxx@gmail.com"
+
+url=https://huggingface.co/k2-fsa/sherpa-onnx-zipformer-gigaspeech-2023-12-12
+GIT_LFS_SKIP_SMUDGE=1 git clone $url
+dst=$(basename $url)
+cp -v $repo/exp/*.onnx $dst
+cp -v $repo/data/lang_bpe_500/tokens.txt $dst
+cp -v $repo/data/lang_bpe_500/bpe.model $dst
+mkdir -p $dst/test_wavs
+cp -v $repo/test_wavs/*.wav $dst/test_wavs
+cd $dst
+git lfs track "*.onnx"
+git add .
+git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
+
+log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+rm -rf .git
+rm -fv .gitattributes
+cd ..
+tar cjfv $dst.tar.bz2 $dst
+ls -lh
+mv -v $dst.tar.bz2 ../../../
+
+log "Export to torchscript model"
+./zipformer/export.py \
+ --exp-dir $repo/exp \
+ --use-averaged-model false \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 30 \
+ --avg 1 \
+ --jit 1
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.script()"
+
+./zipformer/jit_pretrained.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --nn-model-filename $repo/exp/jit_script.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+for method in greedy_search modified_beam_search fast_beam_search; do
+ log "$method"
+
+ ./zipformer/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p zipformer/exp
+ ln -s $PWD/$repo/exp/pretrained.pt zipformer/exp/epoch-30.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh zipformer/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./zipformer/decode.py \
+ --decoding-method $method \
+ --epoch 30 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --max-duration $max_duration \
+ --exp-dir zipformer/exp
+ done
+
+ rm zipformer/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-conformer-ctc3-2022-11-28.sh b/.github/scripts/run-librispeech-conformer-ctc3-2022-11-28.sh
new file mode 100755
index 000000000..f6fe8c9b2
--- /dev/null
+++ b/.github/scripts/run-librispeech-conformer-ctc3-2022-11-28.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conformer-ctc3-2022-11-27
+
+log "Downloading pre-trained model from $repo_url"
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+git lfs pull --include "data/lang_bpe_500/HLG.pt"
+git lfs pull --include "data/lang_bpe_500/L.pt"
+git lfs pull --include "data/lang_bpe_500/LG.pt"
+git lfs pull --include "data/lang_bpe_500/Linv.pt"
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "data/lm/G_4_gram.pt"
+git lfs pull --include "exp/jit_trace.pt"
+git lfs pull --include "exp/pretrained.pt"
+ln -s pretrained.pt epoch-99.pt
+ls -lh *.pt
+popd
+
+log "Decode with models exported by torch.jit.trace()"
+
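+# ctc-decoding decodes with BPE tokens only; 1best additionally uses the
+# HLG graph and words.txt to produce word-level output.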
+for m in ctc-decoding 1best; do
+ ./conformer_ctc3/jit_pretrained.py \
+ --model-filename $repo/exp/jit_trace.pt \
+ --words-file $repo/data/lang_bpe_500/words.txt \
+ --HLG $repo/data/lang_bpe_500/HLG.pt \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --G $repo/data/lm/G_4_gram.pt \
+ --method $m \
+ --sample-rate 16000 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+log "Export to torchscript model"
+
+./conformer_ctc3/export.py \
+ --exp-dir $repo/exp \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --jit-trace 1 \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model 0
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.trace()"
+
+for m in ctc-decoding 1best; do
+ ./conformer_ctc3/jit_pretrained.py \
+ --model-filename $repo/exp/jit_trace.pt \
+ --words-file $repo/data/lang_bpe_500/words.txt \
+ --HLG $repo/data/lang_bpe_500/HLG.pt \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --G $repo/data/lm/G_4_gram.pt \
+ --method $m \
+ --sample-rate 16000 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for m in ctc-decoding 1best; do
+ ./conformer_ctc3/pretrained.py \
+ --checkpoint $repo/exp/pretrained.pt \
+ --words-file $repo/data/lang_bpe_500/words.txt \
+ --HLG $repo/data/lang_bpe_500/HLG.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --G $repo/data/lm/G_4_gram.pt \
+ --method $m \
+ --sample-rate 16000 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p conformer_ctc3/exp
+ ln -s $PWD/$repo/exp/pretrained.pt conformer_ctc3/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh conformer_ctc3/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in ctc-decoding 1best; do
+ log "Decoding with $method"
+ ./conformer_ctc3/decode.py \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --exp-dir conformer_ctc3/exp/ \
+ --max-duration $max_duration \
+ --decoding-method $method \
+ --lm-dir data/lm
+ done
+
+ rm conformer_ctc3/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.sh b/.github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.sh
new file mode 100755
index 000000000..d547bdd45
--- /dev/null
+++ b/.github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.sh
@@ -0,0 +1,193 @@
+#!/usr/bin/env bash
+#
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+abs_repo=$(realpath $repo)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained-iter-468000-avg-16.pt pretrained.pt
+ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
+popd
+
+log "Test exporting with torch.jit.trace()"
+
+./lstm_transducer_stateless2/export.py \
+ --exp-dir $repo/exp \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --jit-trace 1
+
+log "Decode with models exported by torch.jit.trace()"
+
+./lstm_transducer_stateless2/jit_pretrained.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --encoder-model-filename $repo/exp/encoder_jit_trace.pt \
+ --decoder-model-filename $repo/exp/decoder_jit_trace.pt \
+ --joiner-model-filename $repo/exp/joiner_jit_trace.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./lstm_transducer_stateless2/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./lstm_transducer_stateless2/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+
+if [[ x"${GITHUB_EVENT_LABEL_NAME}" == x"shallow-fusion" ]]; then
+ lm_repo_url=https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm
+ log "Download pre-trained RNN-LM model from ${lm_repo_url}"
+ GIT_LFS_SKIP_SMUDGE=1 git clone $lm_repo_url
+ lm_repo=$(basename $lm_repo_url)
+ pushd $lm_repo
+ git lfs pull --include "exp/pretrained.pt"
+ mv exp/pretrained.pt exp/epoch-88.pt
+ popd
+
+ mkdir -p lstm_transducer_stateless2/exp
+ ln -sf $PWD/$repo/exp/pretrained.pt lstm_transducer_stateless2/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh lstm_transducer_stateless2/exp
+
+ log "Decoding test-clean and test-other with RNN LM"
+
+ ./lstm_transducer_stateless2/decode.py \
+ --use-averaged-model 0 \
+ --epoch 999 \
+ --avg 1 \
+ --exp-dir lstm_transducer_stateless2/exp \
+ --max-duration 600 \
+ --decoding-method modified_beam_search_lm_shallow_fusion \
+ --beam 4 \
+ --use-shallow-fusion 1 \
+ --lm-type rnn \
+ --lm-exp-dir $lm_repo/exp \
+ --lm-epoch 88 \
+ --lm-avg 1 \
+ --lm-scale 0.3 \
+ --rnn-lm-num-layers 3 \
+ --rnn-lm-tie-weights 1
+fi
+
+if [[ x"${GITHUB_EVENT_LABEL_NAME}" == x"LODR" ]]; then
+ bigram_repo_url=https://huggingface.co/marcoyang/librispeech_bigram
+ log "Download bi-gram LM from ${bigram_repo_url}"
+ GIT_LFS_SKIP_SMUDGE=1 git clone $bigram_repo_url
+ bigramlm_repo=$(basename $bigram_repo_url)
+ pushd $bigramlm_repo
+ git lfs pull --include "2gram.fst.txt"
+ cp 2gram.fst.txt $abs_repo/data/lang_bpe_500/.
+ popd
+
+ lm_repo_url=https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm
+ log "Download pre-trained RNN-LM model from ${lm_repo_url}"
+ GIT_LFS_SKIP_SMUDGE=1 git clone $lm_repo_url
+ lm_repo=$(basename $lm_repo_url)
+ pushd $lm_repo
+ git lfs pull --include "exp/pretrained.pt"
+ mv exp/pretrained.pt exp/epoch-88.pt
+ popd
+
+ mkdir -p lstm_transducer_stateless2/exp
+ ln -sf $PWD/$repo/exp/pretrained.pt lstm_transducer_stateless2/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh lstm_transducer_stateless2/exp
+
+ log "Decoding test-clean and test-other"
+
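+ # LODR subtracts a low-order (here: bigram) source-domain LM during
+ # shallow fusion, hence the negative --ngram-lm-scale below.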
+ ./lstm_transducer_stateless2/decode.py \
+ --use-averaged-model 0 \
+ --epoch 999 \
+ --avg 1 \
+ --exp-dir lstm_transducer_stateless2/exp \
+ --max-duration 600 \
+ --decoding-method modified_beam_search_LODR \
+ --beam 4 \
+ --use-shallow-fusion 1 \
+ --lm-type rnn \
+ --lm-exp-dir $lm_repo/exp \
+ --lm-scale 0.4 \
+ --lm-epoch 88 \
+ --rnn-lm-avg 1 \
+ --rnn-lm-num-layers 3 \
+ --rnn-lm-tie-weights 1 \
+ --tokens-ngram 2 \
+ --ngram-lm-scale -0.16
+fi
+
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" ]]; then
+ mkdir -p lstm_transducer_stateless2/exp
+ ln -s $PWD/$repo/exp/pretrained.pt lstm_transducer_stateless2/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh lstm_transducer_stateless2/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./lstm_transducer_stateless2/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --max-duration $max_duration \
+ --exp-dir lstm_transducer_stateless2/exp
+ done
+
+ rm lstm_transducer_stateless2/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh
new file mode 100755
index 000000000..412e3ad56
--- /dev/null
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in fast_beam_search modified_beam_search beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless/exp
+ ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless/exp
+ done
+
+ rm pruned_transducer_stateless/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
new file mode 100755
index 000000000..243b669ed
--- /dev/null
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29
+
+log "Downloading pre-trained model from $repo_url"
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/pretrained-epoch-38-avg-10.pt"
+popd
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained-epoch-38-avg-10.pt pretrained.pt
+popd
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless2/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless2/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless2/exp
+ ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless2/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless2/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless2/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless2/exp
+ done
+
+ rm pruned_transducer_stateless2/exp/*.pt
+ rm -r data/lang_bpe_500
+fi
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
new file mode 100755
index 000000000..2d0f80304
--- /dev/null
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-04-29
+
+log "Downloading pre-trained model from $repo_url"
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/pretrained-epoch-25-avg-6.pt"
+popd
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained-epoch-25-avg-6.pt pretrained.pt
+popd
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless3/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless3/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless3/exp
+ ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless3/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless3/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless3/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless3/exp
+ done
+
+ rm pruned_transducer_stateless3/exp/*.pt
+ rm -r data/lang_bpe_500
+fi
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-05-13.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-05-13.sh
new file mode 100755
index 000000000..3d5814c48
--- /dev/null
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless3-2022-05-13.sh
@@ -0,0 +1,123 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained-iter-1224000-avg-14.pt pretrained.pt
+ln -s pretrained-iter-1224000-avg-14.pt epoch-99.pt
+popd
+
+
+log "Export to torchscript model"
+./pruned_transducer_stateless3/export.py \
+ --exp-dir $repo/exp \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
+
+./pruned_transducer_stateless3/export.py \
+ --exp-dir $repo/exp \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --jit-trace 1
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.trace()"
+
+./pruned_transducer_stateless3/jit_pretrained.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --encoder-model-filename $repo/exp/encoder_jit_trace.pt \
+ --decoder-model-filename $repo/exp/decoder_jit_trace.pt \
+ --joiner-model-filename $repo/exp/joiner_jit_trace.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+log "Decode with models exported by torch.jit.script()"
+
+./pruned_transducer_stateless3/jit_pretrained.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --encoder-model-filename $repo/exp/encoder_jit_script.pt \
+ --decoder-model-filename $repo/exp/decoder_jit_script.pt \
+ --joiner-model-filename $repo/exp/joiner_jit_script.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless3/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless3/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless3/exp
+ ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless3/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless3/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless3/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless3/exp
+ done
+
+ rm pruned_transducer_stateless3/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless5-2022-05-13.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless5-2022-05-13.sh
new file mode 100755
index 000000000..3d2442d54
--- /dev/null
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless5-2022-05-13.sh
@@ -0,0 +1,100 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained-epoch-39-avg-7.pt pretrained.pt
+popd
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless5/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --num-encoder-layers 18 \
+ --dim-feedforward 2048 \
+ --nhead 8 \
+ --encoder-dim 512 \
+ --decoder-dim 512 \
+ --joiner-dim 512 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless5/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav \
+ --num-encoder-layers 18 \
+ --dim-feedforward 2048 \
+ --nhead 8 \
+ --encoder-dim 512 \
+ --decoder-dim 512 \
+ --joiner-dim 512
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless5/exp
+ ln -s $PWD/$repo/exp/pretrained-epoch-39-avg-7.pt pruned_transducer_stateless5/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless5/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless5/decode.py \
+ --decoding-method $method \
+ --use-averaged-model 0 \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless5/exp \
+ --num-encoder-layers 18 \
+ --dim-feedforward 2048 \
+ --nhead 8 \
+ --encoder-dim 512 \
+ --decoder-dim 512 \
+ --joiner-dim 512
+ done
+
+ rm pruned_transducer_stateless5/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless7-2022-11-11.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless7-2022-11-11.sh
new file mode 100755
index 000000000..961dde4f4
--- /dev/null
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-2022-11-11.sh
@@ -0,0 +1,106 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/cpu_jit.pt"
+git lfs pull --include "exp/pretrained.pt"
+ln -s pretrained.pt epoch-99.pt
+ls -lh *.pt
+popd
+
+log "Export to torchscript model"
+./pruned_transducer_stateless7/export.py \
+ --exp-dir $repo/exp \
+ --use-averaged-model false \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.script()"
+
+./pruned_transducer_stateless7/jit_pretrained.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --nn-model-filename $repo/exp/cpu_jit.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless7/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless7/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless7/exp
+ ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless7/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless7/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless7/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless7/exp
+ done
+
+ rm pruned_transducer_stateless7/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-2022-12-01.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-2022-12-01.sh
new file mode 100755
index 000000000..ba7139efb
--- /dev/null
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-2022-12-01.sh
@@ -0,0 +1,150 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-ctc-2022-12-01
+
+log "Downloading pre-trained model from $repo_url"
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+git lfs pull --include "data/lang_bpe_500/HLG.pt"
+git lfs pull --include "data/lang_bpe_500/L.pt"
+git lfs pull --include "data/lang_bpe_500/LG.pt"
+git lfs pull --include "data/lang_bpe_500/Linv.pt"
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "data/lm/G_4_gram.pt"
+git lfs pull --include "exp/cpu_jit.pt"
+git lfs pull --include "exp/pretrained.pt"
+ln -s pretrained.pt epoch-99.pt
+ls -lh *.pt
+popd
+
+log "Export to torchscript model"
+./pruned_transducer_stateless7_ctc/export.py \
+ --exp-dir $repo/exp \
+ --use-averaged-model false \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.script()"
+
+./pruned_transducer_stateless7_ctc/jit_pretrained.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --nn-model-filename $repo/exp/cpu_jit.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+for m in ctc-decoding 1best; do
+ ./pruned_transducer_stateless7_ctc/jit_pretrained_ctc.py \
+ --model-filename $repo/exp/cpu_jit.pt \
+ --words-file $repo/data/lang_bpe_500/words.txt \
+ --HLG $repo/data/lang_bpe_500/HLG.pt \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --G $repo/data/lm/G_4_gram.pt \
+ --method $m \
+ --sample-rate 16000 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless7_ctc/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless7_ctc/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for m in ctc-decoding 1best; do
+ ./pruned_transducer_stateless7_ctc/pretrained_ctc.py \
+ --checkpoint $repo/exp/pretrained.pt \
+ --words-file $repo/data/lang_bpe_500/words.txt \
+ --HLG $repo/data/lang_bpe_500/HLG.pt \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --G $repo/data/lm/G_4_gram.pt \
+ --method $m \
+ --sample-rate 16000 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless7_ctc/exp
+ ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless7_ctc/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless7_ctc/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless7_ctc/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless7_ctc/exp
+ done
+
+ for m in ctc-decoding 1best; do
+ ./pruned_transducer_stateless7_ctc/ctc_decode.py \
+ --epoch 999 \
+ --avg 1 \
+ --exp-dir ./pruned_transducer_stateless7_ctc/exp \
+ --max-duration $max_duration \
+ --use-averaged-model 0 \
+ --decoding-method $m \
+ --hlg-scale 0.6 \
+ --lm-dir data/lm
+ done
+
+ rm pruned_transducer_stateless7_ctc/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh
new file mode 100755
index 000000000..1ecbc4798
--- /dev/null
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh
@@ -0,0 +1,147 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
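+# Illustrative log output (file name and line number will differ):
+#   2023-01-29 01:02:03 (run-xxx.sh:42:main) Decoding test-clean and test-other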
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/yfyeung/icefall-asr-librispeech-pruned_transducer_stateless7_ctc_bs-2023-01-29
+
+log "Downloading pre-trained model from $repo_url"
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
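+# GIT_LFS_SKIP_SMUDGE=1 clones only the small LFS pointer files; the large
+# model files are then fetched selectively via `git lfs pull --include`,
+# which keeps the download size down.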
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+git lfs pull --include "data/lang_bpe_500/L.pt"
+git lfs pull --include "data/lang_bpe_500/LG.pt"
+git lfs pull --include "data/lang_bpe_500/HLG.pt"
+git lfs pull --include "data/lang_bpe_500/Linv.pt"
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/cpu_jit.pt"
+git lfs pull --include "exp/pretrained.pt"
+ln -s pretrained.pt epoch-99.pt
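+# export.py and decode.py locate checkpoints by epoch number, so exposing
+# pretrained.pt as epoch-99.pt lets us pass --epoch 99 --avg 1 below.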
+ls -lh *.pt
+popd
+
+log "Export to torchscript model"
+./pruned_transducer_stateless7_ctc_bs/export.py \
+ --exp-dir $repo/exp \
+ --use-averaged-model false \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
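+# With --jit 1, export.py is expected to write a torch.jit.script()-exported
+# model (cpu_jit.pt) into the experiment directory; the ls below confirms it.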
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.script()"
+
+./pruned_transducer_stateless7_ctc_bs/jit_pretrained.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --nn-model-filename $repo/exp/cpu_jit.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+for m in ctc-decoding 1best; do
+ ./pruned_transducer_stateless7_ctc_bs/jit_pretrained_ctc.py \
+ --model-filename $repo/exp/cpu_jit.pt \
+ --words-file $repo/data/lang_bpe_500/words.txt \
+ --HLG $repo/data/lang_bpe_500/HLG.pt \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --method $m \
+ --sample-rate 16000 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless7_ctc_bs/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless7_ctc_bs/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for m in ctc-decoding 1best; do
+ ./pruned_transducer_stateless7_ctc_bs/pretrained_ctc.py \
+ --checkpoint $repo/exp/pretrained.pt \
+ --words-file $repo/data/lang_bpe_500/words.txt \
+ --HLG $repo/data/lang_bpe_500/HLG.pt \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --method $m \
+ --sample-rate 16000 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless7_ctc_bs/exp
+ ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless7_ctc_bs/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless7_ctc_bs/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless7_ctc_bs/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless7_ctc_bs/exp
+ done
+
+ for m in ctc-decoding 1best; do
+ ./pruned_transducer_stateless7_ctc_bs/ctc_decode.py \
+ --epoch 999 \
+ --avg 1 \
+ --exp-dir ./pruned_transducer_stateless7_ctc_bs/exp \
+ --max-duration $max_duration \
+ --use-averaged-model 0 \
+ --decoding-method $m \
+ --hlg-scale 0.6
+ done
+
+ rm pruned_transducer_stateless7_ctc_bs/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
new file mode 100755
index 000000000..37b192a57
--- /dev/null
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
@@ -0,0 +1,148 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/cpu_jit.pt"
+git lfs pull --include "exp/pretrained.pt"
+git lfs pull --include "exp/encoder_jit_trace.pt"
+git lfs pull --include "exp/decoder_jit_trace.pt"
+git lfs pull --include "exp/joiner_jit_trace.pt"
+cd exp
+ln -s pretrained.pt epoch-99.pt
+ls -lh *.pt
+popd
+
+log "Export to torchscript model"
+./pruned_transducer_stateless7_streaming/export.py \
+ --exp-dir $repo/exp \
+ --use-averaged-model false \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --decode-chunk-len 32 \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
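+# --decode-chunk-len 32 bakes a fixed streaming chunk of 32 feature frames
+# (10 ms each, i.e. roughly 320 ms of audio) into the exported model.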
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.script()"
+
+./pruned_transducer_stateless7_streaming/jit_pretrained.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --nn-model-filename $repo/exp/cpu_jit.pt \
+ --decode-chunk-len 32 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+log "Export to torchscript model by torch.jit.trace()"
+./pruned_transducer_stateless7_streaming/jit_trace_export.py \
+ --exp-dir $repo/exp \
+ --use-averaged-model false \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --decode-chunk-len 32 \
+ --epoch 99 \
+ --avg 1
+
+log "Decode with models exported by torch.jit.trace()"
+
+./pruned_transducer_stateless7_streaming/jit_trace_pretrained.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --encoder-model-filename $repo/exp/encoder_jit_trace.pt \
+ --decoder-model-filename $repo/exp/decoder_jit_trace.pt \
+ --joiner-model-filename $repo/exp/joiner_jit_trace.pt \
+ --decode-chunk-len 32 \
+ $repo/test_wavs/1089-134686-0001.wav
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless7_streaming/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --decode-chunk-len 32 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless7_streaming/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --decode-chunk-len 32 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless7_streaming/exp
+ ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless7_streaming/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless7_streaming/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+ num_decode_stream=200
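+  # number of utterances decoded in parallel by streaming_decode.py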
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+    log "Decoding with $method"
+
+ ./pruned_transducer_stateless7_streaming/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --max-duration $max_duration \
+ --decode-chunk-len 32 \
+ --exp-dir pruned_transducer_stateless7_streaming/exp
+ done
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless7_streaming/streaming_decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --decode-chunk-len 32 \
+      --num-decode-streams $num_decode_stream \
+ --exp-dir pruned_transducer_stateless7_streaming/exp
+ done
+
+ rm pruned_transducer_stateless7_streaming/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless8-2022-11-14.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless8-2022-11-14.sh
new file mode 100755
index 000000000..4f2bfac24
--- /dev/null
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless8-2022-11-14.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/cpu_jit.pt"
+git lfs pull --include "exp/pretrained.pt"
+ln -s pretrained.pt epoch-99.pt
+ls -lh *.pt
+popd
+
+log "Decode with models exported by torch.jit.script()"
+
+./pruned_transducer_stateless8/jit_pretrained.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --nn-model-filename $repo/exp/cpu_jit.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+log "Export to torchscript model"
+./pruned_transducer_stateless8/export.py \
+ --exp-dir $repo/exp \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --use-averaged-model false \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.script()"
+
+./pruned_transducer_stateless8/jit_pretrained.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --nn-model-filename $repo/exp/cpu_jit.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless8/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless8/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless8/exp
+ ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless8/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless8/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./pruned_transducer_stateless8/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless8/exp
+ done
+
+ rm pruned_transducer_stateless8/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-streaming-pruned-transducer-stateless2-2022-06-26.sh b/.github/scripts/run-librispeech-streaming-pruned-transducer-stateless2-2022-06-26.sh
new file mode 100755
index 000000000..5cbdad16d
--- /dev/null
+++ b/.github/scripts/run-librispeech-streaming-pruned-transducer-stateless2-2022-06-26.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/pkufool/icefall_librispeech_streaming_pruned_transducer_stateless2_20220625
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained-epoch-24-avg-10.pt pretrained.pt
+popd
+
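+# --simulate-streaming 1 emulates streaming decoding by masking the encoder's
+# self-attention to limited chunks while still processing whole utterances in
+# one pass; it requires a model trained with causal convolutions, hence
+# --causal-convolution 1.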
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless2/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --simulate-streaming 1 \
+ --causal-convolution 1 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless2/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --simulate-streaming 1 \
+ --causal-convolution 1 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p pruned_transducer_stateless2/exp
+ ln -s $PWD/$repo/exp/pretrained-epoch-24-avg-10.pt pruned_transducer_stateless2/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh pruned_transducer_stateless2/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+    log "Simulated streaming decoding with $method"
+
+ ./pruned_transducer_stateless2/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir pruned_transducer_stateless2/exp \
+ --simulate-streaming 1 \
+ --causal-convolution 1
+ done
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Real streaming decoding with $method"
+
+ ./pruned_transducer_stateless2/streaming_decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --num-decode-streams 100 \
+ --exp-dir pruned_transducer_stateless2/exp \
+ --left-context 32 \
+ --decode-chunk-size 8 \
+ --right-context 0
+ done
+
+ rm pruned_transducer_stateless2/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-streaming-zipformer-2023-05-18.sh b/.github/scripts/run-librispeech-streaming-zipformer-2023-05-18.sh
new file mode 100755
index 000000000..f4e2124b1
--- /dev/null
+++ b/.github/scripts/run-librispeech-streaming-zipformer-2023-05-18.sh
@@ -0,0 +1,116 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "data/lang_bpe_500/tokens.txt"
+git lfs pull --include "exp/jit_script_chunk_16_left_128.pt"
+git lfs pull --include "exp/pretrained.pt"
+ln -s pretrained.pt epoch-99.pt
+ls -lh *.pt
+popd
+
+log "Export to torchscript model"
+./zipformer/export.py \
+ --exp-dir $repo/exp \
+ --use-averaged-model false \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --causal 1 \
+ --chunk-size 16 \
+ --left-context-frames 128 \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
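+# The flags above must match the exported filename
+# jit_script_chunk_16_left_128.pt: a causal model with a 16-frame chunk and
+# 128 frames of left context.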
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.script()"
+
+./zipformer/jit_pretrained_streaming.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --nn-model-filename $repo/exp/jit_script_chunk_16_left_128.pt \
+ $repo/test_wavs/1089-134686-0001.wav
+
+for method in greedy_search modified_beam_search fast_beam_search; do
+ log "$method"
+
+ ./zipformer/pretrained.py \
+ --causal 1 \
+ --chunk-size 16 \
+ --left-context-frames 128 \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p zipformer/exp
+ ln -s $PWD/$repo/exp/pretrained.pt zipformer/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh zipformer/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Simulated streaming decoding with $method"
+
+ ./zipformer/decode.py \
+ --causal 1 \
+ --chunk-size 16 \
+ --left-context-frames 128 \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --max-duration $max_duration \
+ --exp-dir zipformer/exp
+ done
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Chunk-wise streaming decoding with $method"
+
+ ./zipformer/streaming_decode.py \
+ --causal 1 \
+ --chunk-size 16 \
+ --left-context-frames 128 \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --max-duration $max_duration \
+ --exp-dir zipformer/exp
+ done
+
+ rm zipformer/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh b/.github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
new file mode 100755
index 000000000..ff77855a2
--- /dev/null
+++ b/.github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless2-torchaudio-2022-04-19
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./transducer_stateless2/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in fast_beam_search modified_beam_search beam_search; do
+ log "$method"
+
+ ./transducer_stateless2/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p transducer_stateless2/exp
+ ln -s $PWD/$repo/exp/pretrained.pt transducer_stateless2/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh transducer_stateless2/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./transducer_stateless2/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir transducer_stateless2/exp
+ done
+
+ rm transducer_stateless2/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-zipformer-2023-05-18.sh b/.github/scripts/run-librispeech-zipformer-2023-05-18.sh
new file mode 100755
index 000000000..fb1a0149d
--- /dev/null
+++ b/.github/scripts/run-librispeech-zipformer-2023-05-18.sh
@@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "data/lang_bpe_500/tokens.txt"
+git lfs pull --include "exp/jit_script.pt"
+git lfs pull --include "exp/pretrained.pt"
+ln -s pretrained.pt epoch-99.pt
+ls -lh *.pt
+popd
+
+log "Export to torchscript model"
+./zipformer/export.py \
+ --exp-dir $repo/exp \
+ --use-averaged-model false \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.script()"
+
+./zipformer/jit_pretrained.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --nn-model-filename $repo/exp/jit_script.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+for method in greedy_search modified_beam_search fast_beam_search; do
+ log "$method"
+
+ ./zipformer/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p zipformer/exp
+ ln -s $PWD/$repo/exp/pretrained.pt zipformer/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh zipformer/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./zipformer/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --max-duration $max_duration \
+ --exp-dir zipformer/exp
+ done
+
+ rm zipformer/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-zipformer-ctc-2023-06-14.sh b/.github/scripts/run-librispeech-zipformer-ctc-2023-06-14.sh
new file mode 100755
index 000000000..0026d2109
--- /dev/null
+++ b/.github/scripts/run-librispeech-zipformer-ctc-2023-06-14.sh
@@ -0,0 +1,117 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-transducer-ctc-2023-06-13
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "data/lang_bpe_500/tokens.txt"
+git lfs pull --include "data/lang_bpe_500/HLG.pt"
+git lfs pull --include "data/lang_bpe_500/L.pt"
+git lfs pull --include "data/lang_bpe_500/LG.pt"
+git lfs pull --include "data/lang_bpe_500/Linv.pt"
+git lfs pull --include "data/lm/G_4_gram.pt"
+git lfs pull --include "exp/jit_script.pt"
+git lfs pull --include "exp/pretrained.pt"
+ln -s pretrained.pt epoch-99.pt
+ls -lh *.pt
+popd
+
+log "Export to torchscript model"
+./zipformer/export.py \
+ --exp-dir $repo/exp \
+ --use-transducer 1 \
+ --use-ctc 1 \
+ --use-averaged-model false \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.script()"
+
+for method in ctc-decoding 1best; do
+ ./zipformer/jit_pretrained_ctc.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --model-filename $repo/exp/jit_script.pt \
+ --HLG $repo/data/lang_bpe_500/HLG.pt \
+ --words-file $repo/data/lang_bpe_500/words.txt \
+ --G $repo/data/lm/G_4_gram.pt \
+ --method $method \
+ --sample-rate 16000 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in ctc-decoding 1best; do
+ log "$method"
+
+ ./zipformer/pretrained_ctc.py \
+ --use-transducer 1 \
+ --use-ctc 1 \
+ --method $method \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --HLG $repo/data/lang_bpe_500/HLG.pt \
+ --G $repo/data/lm/G_4_gram.pt \
+ --words-file $repo/data/lang_bpe_500/words.txt \
+ --sample-rate 16000 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p zipformer/exp
+ ln -s $PWD/$repo/exp/pretrained.pt zipformer/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh zipformer/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in ctc-decoding 1best; do
+ log "Decoding with $method"
+
+ ./zipformer/ctc_decode.py \
+ --use-transducer 1 \
+ --use-ctc 1 \
+ --decoding-method $method \
+ --nbest-scale 1.0 \
+ --hlg-scale 0.6 \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --max-duration $max_duration \
+ --exp-dir zipformer/exp
+ done
+
+ rm zipformer/exp/*.pt
+fi
diff --git a/.github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh b/.github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
new file mode 100755
index 000000000..c59921055
--- /dev/null
+++ b/.github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-mmi-2022-12-08
+
+log "Downloading pre-trained model from $repo_url"
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+git lfs pull --include "data/lang_bpe_500/3gram.pt"
+git lfs pull --include "data/lang_bpe_500/4gram.pt"
+git lfs pull --include "data/lang_bpe_500/L.pt"
+git lfs pull --include "data/lang_bpe_500/LG.pt"
+git lfs pull --include "data/lang_bpe_500/Linv.pt"
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/cpu_jit.pt"
+git lfs pull --include "exp/pretrained.pt"
+ln -s pretrained.pt epoch-99.pt
+ls -lh *.pt
+popd
+
+log "Export to torchscript model"
+./zipformer_mmi/export.py \
+ --exp-dir $repo/exp \
+ --use-averaged-model false \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
+
+ls -lh $repo/exp/*.pt
+
+log "Decode with models exported by torch.jit.script()"
+
+./zipformer_mmi/jit_pretrained.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --nn-model-filename $repo/exp/cpu_jit.pt \
+ --lang-dir $repo/data/lang_bpe_500 \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+for method in 1best nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
+ log "$method"
+
+ ./zipformer_mmi/pretrained.py \
+ --method $method \
+ --checkpoint $repo/exp/pretrained.pt \
+ --lang-dir $repo/data/lang_bpe_500 \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p zipformer_mmi/exp
+ ln -s $PWD/$repo/exp/pretrained.pt zipformer_mmi/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh zipformer_mmi/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in 1best nbest nbest-rescoring-LG nbest-rescoring-3-gram nbest-rescoring-4-gram; do
+ log "Decoding with $method"
+
+ ./zipformer_mmi/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --nbest-scale 1.2 \
+ --hp-scale 1.0 \
+ --max-duration $max_duration \
+ --lang-dir $repo/data/lang_bpe_500 \
+ --exp-dir zipformer_mmi/exp
+ done
+
+ rm zipformer_mmi/exp/*.pt
+fi
diff --git a/.github/scripts/run-multi-corpora-zipformer.sh b/.github/scripts/run-multi-corpora-zipformer.sh
new file mode 100755
index 000000000..90f859f43
--- /dev/null
+++ b/.github/scripts/run-multi-corpora-zipformer.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/multi_zh-hans/ASR
+
+log "==== Test icefall-asr-multi-zh-hans-zipformer-2023-9-2 ===="
+repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-2023-9-2/
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s epoch-20.pt epoch-99.pt
+popd
+
+ls -lh $repo/exp/*.pt
+
+
+./zipformer/pretrained.py \
+ --checkpoint $repo/exp/epoch-99.pt \
+ --tokens $repo/data/lang_bpe_2000/tokens.txt \
+ --method greedy_search \
+  $repo/test_wavs/DEV_T0000000000.wav \
+  $repo/test_wavs/DEV_T0000000001.wav \
+  $repo/test_wavs/DEV_T0000000002.wav
+
+for method in modified_beam_search fast_beam_search; do
+ log "$method"
+
+ ./zipformer/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/epoch-99.pt \
+ --tokens $repo/data/lang_bpe_2000/tokens.txt \
+ $repo/test_wavs/DEV_T0000000000.wav \
+ $repo/test_wavs/DEV_T0000000001.wav \
+ $repo/test_wavs/DEV_T0000000002.wav
+done
+
+rm -rf $repo
+
+log "==== Test icefall-asr-multi-zh-hans-zipformer-ctc-2023-10-24 ===="
+repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-2023-10-24/
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s epoch-20.pt epoch-99.pt
+popd
+
+ls -lh $repo/exp/*.pt
+
+
+./zipformer/pretrained.py \
+ --checkpoint $repo/exp/epoch-99.pt \
+ --tokens $repo/data/lang_bpe_2000/tokens.txt \
+ --use-ctc 1 \
+ --method greedy_search \
+  $repo/test_wavs/DEV_T0000000000.wav \
+  $repo/test_wavs/DEV_T0000000001.wav \
+  $repo/test_wavs/DEV_T0000000002.wav
+
+for method in modified_beam_search fast_beam_search; do
+ log "$method"
+
+ ./zipformer/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --use-ctc 1 \
+ --checkpoint $repo/exp/epoch-99.pt \
+ --tokens $repo/data/lang_bpe_2000/tokens.txt \
+ $repo/test_wavs/DEV_T0000000000.wav \
+ $repo/test_wavs/DEV_T0000000001.wav \
+ $repo/test_wavs/DEV_T0000000002.wav
+done
+
+rm -rf $repo
+
+cd ../../../egs/multi_zh_en/ASR
+log "==== Test icefall-asr-zipformer-multi-zh-en-2023-11-22 ===="
+repo_url=https://huggingface.co/zrjin/icefall-asr-zipformer-multi-zh-en-2023-11-22/
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+./zipformer/pretrained.py \
+ --checkpoint $repo/exp/pretrained.pt \
+ --bpe-model $repo/data/lang_bbpe_2000/bbpe.model \
+ --method greedy_search \
+  $repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_29.wav \
+  $repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_55.wav \
+  $repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_75.wav
+
+for method in modified_beam_search fast_beam_search; do
+ log "$method"
+
+ ./zipformer/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --bpe-model $repo/data/lang_bbpe_2000/bbpe.model \
+ $repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_29.wav \
+ $repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_55.wav \
+ $repo/test_wavs/_1634_210_2577_1_1525157964032_3712259_75.wav
+done
+
+rm -rf $repo
diff --git a/.github/scripts/run-pre-trained-ctc.sh b/.github/scripts/run-pre-trained-ctc.sh
new file mode 100755
index 000000000..7d6449c9a
--- /dev/null
+++ b/.github/scripts/run-pre-trained-ctc.sh
@@ -0,0 +1,240 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+pushd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/sherpa-onnx-zipformer-ctc-en-2023-10-02
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+log "CTC greedy search"
+
+./zipformer/onnx_pretrained_ctc.py \
+ --nn-model $repo/model.onnx \
+ --tokens $repo/tokens.txt \
+ $repo/test_wavs/0.wav \
+ $repo/test_wavs/1.wav \
+ $repo/test_wavs/2.wav
+
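+# The next three runs decode with FSTs of increasing linguistic context:
+#   H   - CTC topology over tokens only
+#   HL  - H composed with the lexicon L (tokens -> words)
+#   HLG - HL further composed with an n-gram grammar G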
+log "CTC H decoding"
+
+./zipformer/onnx_pretrained_ctc_H.py \
+ --nn-model $repo/model.onnx \
+ --tokens $repo/tokens.txt \
+ --H $repo/H.fst \
+ $repo/test_wavs/0.wav \
+ $repo/test_wavs/1.wav \
+ $repo/test_wavs/2.wav
+
+log "CTC HL decoding"
+
+./zipformer/onnx_pretrained_ctc_HL.py \
+ --nn-model $repo/model.onnx \
+ --words $repo/words.txt \
+ --HL $repo/HL.fst \
+ $repo/test_wavs/0.wav \
+ $repo/test_wavs/1.wav \
+ $repo/test_wavs/2.wav
+
+log "CTC HLG decoding"
+
+./zipformer/onnx_pretrained_ctc_HLG.py \
+ --nn-model $repo/model.onnx \
+ --words $repo/words.txt \
+ --HLG $repo/HLG.fst \
+ $repo/test_wavs/0.wav \
+ $repo/test_wavs/1.wav \
+ $repo/test_wavs/2.wav
+
+rm -rf $repo
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
+log "Downloading pre-trained model from $repo_url"
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+pushd $repo
+
+git lfs pull --include "exp/pretrained.pt"
+git lfs pull --include "data/lang_bpe_500/HLG.pt"
+git lfs pull --include "data/lang_bpe_500/L.pt"
+git lfs pull --include "data/lang_bpe_500/L_disambig.pt"
+git lfs pull --include "data/lang_bpe_500/Linv.pt"
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "data/lang_bpe_500/lexicon.txt"
+git lfs pull --include "data/lang_bpe_500/lexicon_disambig.txt"
+git lfs pull --include "data/lang_bpe_500/tokens.txt"
+git lfs pull --include "data/lang_bpe_500/words.txt"
+git lfs pull --include "data/lm/G_3_gram.fst.txt"
+
+popd
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+log "CTC decoding"
+
+./conformer_ctc/pretrained.py \
+ --method ctc-decoding \
+ --num-classes 500 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+log "HLG decoding"
+
+./conformer_ctc/pretrained.py \
+ --method 1best \
+ --num-classes 500 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --words-file $repo/data/lang_bpe_500/words.txt \
+ --HLG $repo/data/lang_bpe_500/HLG.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+log "CTC decoding on CPU with kaldi decoders using OpenFst"
+
+log "Exporting model with torchscript"
+
+pushd $repo/exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+./conformer_ctc/export.py \
+ --epoch 99 \
+ --avg 1 \
+ --exp-dir $repo/exp \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --jit 1
+
+ls -lh $repo/exp
+
+
+log "Generating H.fst, HL.fst"
+
+./local/prepare_lang_fst.py --lang-dir $repo/data/lang_bpe_500 --ngram-G $repo/data/lm/G_3_gram.fst.txt
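+# prepare_lang_fst.py writes H.fst and HL.fst into --lang-dir; with --ngram-G
+# it should also build HLG.fst, which the HLG decoding step below relies on.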
+
+ls -lh $repo/data/lang_bpe_500
+
+log "Decoding with H on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_H.py \
+ --nn-model $repo/exp/cpu_jit.pt \
+ --H $repo/data/lang_bpe_500/H.fst \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+log "Decoding with HL on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_HL.py \
+ --nn-model $repo/exp/cpu_jit.pt \
+ --HL $repo/data/lang_bpe_500/HL.fst \
+ --words $repo/data/lang_bpe_500/words.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+log "Decoding with HLG on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_HLG.py \
+ --nn-model $repo/exp/cpu_jit.pt \
+ --HLG $repo/data/lang_bpe_500/HLG.fst \
+ --words $repo/data/lang_bpe_500/words.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+rm -rf $repo
+
+popd
+
+log "Test aishell"
+
+pushd egs/aishell/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall_asr_aishell_conformer_ctc
+log "Downloading pre-trained model from $repo_url"
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+pushd $repo
+
+git lfs pull --include "exp/pretrained.pt"
+git lfs pull --include "data/lang_char/H.fst"
+git lfs pull --include "data/lang_char/HL.fst"
+git lfs pull --include "data/lang_char/HLG.fst"
+
+popd
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+log "CTC decoding"
+
+log "Exporting model with torchscript"
+
+pushd $repo/exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+./conformer_ctc/export.py \
+ --epoch 99 \
+ --avg 1 \
+ --exp-dir $repo/exp \
+ --tokens $repo/data/lang_char/tokens.txt \
+ --jit 1
+
+ls -lh $repo/exp
+
+ls -lh $repo/data/lang_char
+
+log "Decoding with H on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_H.py \
+ --nn-model $repo/exp/cpu_jit.pt \
+ --H $repo/data/lang_char/H.fst \
+ --tokens $repo/data/lang_char/tokens.txt \
+ $repo/test_wavs/0.wav \
+ $repo/test_wavs/1.wav \
+ $repo/test_wavs/2.wav
+
+log "Decoding with HL on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_HL.py \
+ --nn-model $repo/exp/cpu_jit.pt \
+ --HL $repo/data/lang_char/HL.fst \
+ --words $repo/data/lang_char/words.txt \
+ $repo/test_wavs/0.wav \
+ $repo/test_wavs/1.wav \
+ $repo/test_wavs/2.wav
+
+log "Decoding with HLG on CPU with OpenFst"
+
+./conformer_ctc/jit_pretrained_decode_with_HLG.py \
+ --nn-model $repo/exp/cpu_jit.pt \
+ --HLG $repo/data/lang_char/HLG.fst \
+ --words $repo/data/lang_char/words.txt \
+ $repo/test_wavs/0.wav \
+ $repo/test_wavs/1.wav \
+ $repo/test_wavs/2.wav
+
+rm -rf $repo
diff --git a/.github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh
new file mode 100755
index 000000000..7b686328d
--- /dev/null
+++ b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-100h-transducer-stateless-multi-datasets-bpe-500-2022-02-21
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./transducer_stateless_multi_datasets/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./transducer_stateless_multi_datasets/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p transducer_stateless_multi_datasets/exp
+ ln -s $PWD/$repo/exp/pretrained.pt transducer_stateless_multi_datasets/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh transducer_stateless_multi_datasets/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./transducer_stateless_multi_datasets/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir transducer_stateless_multi_datasets/exp
+ done
+
+ rm transducer_stateless_multi_datasets/exp/*.pt
+fi
diff --git a/.github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh
new file mode 100755
index 000000000..a8eeeb514
--- /dev/null
+++ b/.github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-multi-datasets-bpe-500-2022-03-01
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./transducer_stateless_multi_datasets/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./transducer_stateless_multi_datasets/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p transducer_stateless_multi_datasets/exp
+ ln -s $PWD/$repo/exp/pretrained.pt transducer_stateless_multi_datasets/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh transducer_stateless_multi_datasets/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./transducer_stateless_multi_datasets/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir transducer_stateless_multi_datasets/exp
+ done
+
+ rm transducer_stateless_multi_datasets/exp/*.pt
+fi
diff --git a/.github/scripts/run-pre-trained-transducer-stateless-modified-2-aishell.sh b/.github/scripts/run-pre-trained-transducer-stateless-modified-2-aishell.sh
new file mode 100755
index 000000000..0644d9be0
--- /dev/null
+++ b/.github/scripts/run-pre-trained-transducer-stateless-modified-2-aishell.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/aishell/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-aishell-transducer-stateless-modified-2-2022-03-01
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./transducer_stateless_modified-2/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --lang-dir $repo/data/lang_char \
+ $repo/test_wavs/BAC009S0764W0121.wav \
+ $repo/test_wavs/BAC009S0764W0122.wav \
+ $repo/test_wavs/BAC009S0764W0123.wav
+done
+
+for method in modified_beam_search beam_search; do
+ log "$method"
+
+ ./transducer_stateless_modified-2/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --lang-dir $repo/data/lang_char \
+ $repo/test_wavs/BAC009S0764W0121.wav \
+ $repo/test_wavs/BAC009S0764W0122.wav \
+ $repo/test_wavs/BAC009S0764W0123.wav
+done
diff --git a/.github/scripts/run-pre-trained-transducer-stateless-modified-aishell.sh b/.github/scripts/run-pre-trained-transducer-stateless-modified-aishell.sh
new file mode 100755
index 000000000..79fb64311
--- /dev/null
+++ b/.github/scripts/run-pre-trained-transducer-stateless-modified-aishell.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/aishell/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-aishell-transducer-stateless-modified-2022-03-01
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./transducer_stateless_modified/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --lang-dir $repo/data/lang_char \
+ $repo/test_wavs/BAC009S0764W0121.wav \
+ $repo/test_wavs/BAC009S0764W0122.wav \
+ $repo/test_wavs/BAC009S0764W0123.wav
+done
+
+for method in modified_beam_search beam_search; do
+ log "$method"
+
+ ./transducer_stateless_modified/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --lang-dir $repo/data/lang_char \
+ $repo/test_wavs/BAC009S0764W0121.wav \
+ $repo/test_wavs/BAC009S0764W0122.wav \
+ $repo/test_wavs/BAC009S0764W0123.wav
+done
diff --git a/.github/scripts/run-pre-trained-transducer-stateless.sh b/.github/scripts/run-pre-trained-transducer-stateless.sh
new file mode 100755
index 000000000..2e2360435
--- /dev/null
+++ b/.github/scripts/run-pre-trained-transducer-stateless.sh
@@ -0,0 +1,77 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./transducer_stateless/pretrained.py \
+ --method greedy_search \
+ --max-sym-per-frame $sym \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+for method in fast_beam_search modified_beam_search beam_search; do
+ log "$method"
+
+ ./transducer_stateless/pretrained.py \
+ --method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
+
+echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
+ mkdir -p transducer_stateless/exp
+ ln -s $PWD/$repo/exp/pretrained.pt transducer_stateless/exp/epoch-999.pt
+ ln -s $PWD/$repo/data/lang_bpe_500 data/
+
+ ls -lh data
+ ls -lh transducer_stateless/exp
+
+ log "Decoding test-clean and test-other"
+
+ # use a small value for decoding with CPU
+ max_duration=100
+
+ for method in greedy_search fast_beam_search modified_beam_search; do
+ log "Decoding with $method"
+
+ ./transducer_stateless/decode.py \
+ --decoding-method $method \
+ --epoch 999 \
+ --avg 1 \
+ --max-duration $max_duration \
+ --exp-dir transducer_stateless/exp
+ done
+
+ rm transducer_stateless/exp/*.pt
+fi
diff --git a/.github/scripts/run-pre-trained-transducer.sh b/.github/scripts/run-pre-trained-transducer.sh
new file mode 100755
index 000000000..b865f8d13
--- /dev/null
+++ b/.github/scripts/run-pre-trained-transducer.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-bpe-500-2021-12-23
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+log "Beam search decoding"
+
+./transducer/pretrained.py \
+ --method beam_search \
+ --beam-size 4 \
+ --checkpoint $repo/exp/pretrained.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
diff --git a/.github/scripts/run-swbd-conformer-ctc-2023-08-26.sh b/.github/scripts/run-swbd-conformer-ctc-2023-08-26.sh
new file mode 100755
index 000000000..d8cc020e1
--- /dev/null
+++ b/.github/scripts/run-swbd-conformer-ctc-2023-08-26.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/swbd/ASR
+
+repo_url=https://huggingface.co/zrjin/icefall-asr-swbd-conformer-ctc-2023-8-26
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s epoch-98.pt epoch-99.pt
+popd
+
+ls -lh $repo/exp/*.pt
+
+for method in ctc-decoding 1best; do
+ log "$method"
+
+ ./conformer_ctc/pretrained.py \
+ --method $method \
+ --checkpoint $repo/exp/epoch-99.pt \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --words-file $repo/data/lang_bpe_500/words.txt \
+ --HLG $repo/data/lang_bpe_500/HLG.pt \
+ --G $repo/data/lm/G_4_gram.pt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+done
diff --git a/.github/scripts/run-wenetspeech-pruned-transducer-stateless2.sh b/.github/scripts/run-wenetspeech-pruned-transducer-stateless2.sh
new file mode 100755
index 000000000..a3a2d3080
--- /dev/null
+++ b/.github/scripts/run-wenetspeech-pruned-transducer-stateless2.sh
@@ -0,0 +1,119 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/wenetspeech/ASR
+
+repo_url=https://huggingface.co/luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2
+
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+git clone $repo_url
+repo=$(basename $repo_url)
+
+log "Display test files"
+tree $repo/
+ls -lh $repo/test_wavs/*.wav
+
+pushd $repo/exp
+ln -s pretrained_epoch_10_avg_2.pt pretrained.pt
+ln -s pretrained_epoch_10_avg_2.pt epoch-99.pt
+popd
+
+log "Test exporting to ONNX format"
+
+./pruned_transducer_stateless2/export-onnx.py \
+ --exp-dir $repo/exp \
+ --lang-dir $repo/data/lang_char \
+ --epoch 99 \
+ --avg 1
+
+log "Export to torchscript model"
+
+./pruned_transducer_stateless2/export.py \
+ --exp-dir $repo/exp \
+ --lang-dir $repo/data/lang_char \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
+
+./pruned_transducer_stateless2/export.py \
+ --exp-dir $repo/exp \
+ --lang-dir $repo/data/lang_char \
+ --epoch 99 \
+ --avg 1 \
+ --jit-trace 1
+
+ls -lh $repo/exp/*.onnx
+ls -lh $repo/exp/*.pt
+
+log "Decode with ONNX models"
+
+./pruned_transducer_stateless2/onnx_check.py \
+ --jit-filename $repo/exp/cpu_jit.pt \
+ --onnx-encoder-filename $repo/exp/encoder-epoch-10-avg-2.onnx \
+ --onnx-decoder-filename $repo/exp/decoder-epoch-10-avg-2.onnx \
+ --onnx-joiner-filename $repo/exp/joiner-epoch-10-avg-2.onnx \
+ --onnx-joiner-encoder-proj-filename $repo/exp/joiner_encoder_proj-epoch-10-avg-2.onnx \
+ --onnx-joiner-decoder-proj-filename $repo/exp/joiner_decoder_proj-epoch-10-avg-2.onnx
+
+./pruned_transducer_stateless2/onnx_pretrained.py \
+ --tokens $repo/data/lang_char/tokens.txt \
+ --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+ $repo/test_wavs/DEV_T0000000000.wav \
+ $repo/test_wavs/DEV_T0000000001.wav \
+ $repo/test_wavs/DEV_T0000000002.wav
+
+log "Decode with models exported by torch.jit.trace()"
+
+./pruned_transducer_stateless2/jit_pretrained.py \
+ --tokens $repo/data/lang_char/tokens.txt \
+ --encoder-model-filename $repo/exp/encoder_jit_trace.pt \
+ --decoder-model-filename $repo/exp/decoder_jit_trace.pt \
+ --joiner-model-filename $repo/exp/joiner_jit_trace.pt \
+ $repo/test_wavs/DEV_T0000000000.wav \
+ $repo/test_wavs/DEV_T0000000001.wav \
+ $repo/test_wavs/DEV_T0000000002.wav
+
+./pruned_transducer_stateless2/jit_pretrained.py \
+ --tokens $repo/data/lang_char/tokens.txt \
+ --encoder-model-filename $repo/exp/encoder_jit_script.pt \
+ --decoder-model-filename $repo/exp/decoder_jit_script.pt \
+ --joiner-model-filename $repo/exp/joiner_jit_script.pt \
+ $repo/test_wavs/DEV_T0000000000.wav \
+ $repo/test_wavs/DEV_T0000000001.wav \
+ $repo/test_wavs/DEV_T0000000002.wav
+
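+# greedy_search emits at most --max-sym-per-frame symbols per encoder
+# output frame; sweeping 1..3 checks that decoding is stable across this
+# limit.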
+for sym in 1 2 3; do
+ log "Greedy search with --max-sym-per-frame $sym"
+
+ ./pruned_transducer_stateless2/pretrained.py \
+ --checkpoint $repo/exp/epoch-99.pt \
+ --lang-dir $repo/data/lang_char \
+ --decoding-method greedy_search \
+ --max-sym-per-frame $sym \
+ $repo/test_wavs/DEV_T0000000000.wav \
+ $repo/test_wavs/DEV_T0000000001.wav \
+ $repo/test_wavs/DEV_T0000000002.wav
+done
+
+for method in modified_beam_search beam_search fast_beam_search; do
+ log "$method"
+
+ ./pruned_transducer_stateless2/pretrained.py \
+ --decoding-method $method \
+ --beam-size 4 \
+ --checkpoint $repo/exp/epoch-99.pt \
+ --lang-dir $repo/data/lang_char \
+ $repo/test_wavs/DEV_T0000000000.wav \
+ $repo/test_wavs/DEV_T0000000001.wav \
+ $repo/test_wavs/DEV_T0000000002.wav
+done
diff --git a/.github/scripts/test-ncnn-export.sh b/.github/scripts/test-ncnn-export.sh
new file mode 100755
index 000000000..4073c594a
--- /dev/null
+++ b/.github/scripts/test-ncnn-export.sh
@@ -0,0 +1,230 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+pushd egs/librispeech/ASR
+
+log "Install ncnn and pnnx"
+
+# We are using a modified version of ncnn here; we will try to merge it
+# into the official ncnn repo.
+git clone https://github.com/csukuangfj/ncnn
+pushd ncnn
+git submodule init
+git submodule update python/pybind11
+python3 setup.py bdist_wheel
+ls -lh dist/
+pip install dist/*.whl
+cd tools/pnnx
+mkdir build
+cd build
+
+echo "which python3"
+
+which python3
+#/opt/hostedtoolcache/Python/3.8.16/x64/bin/python3
+
+cmake -D Python3_EXECUTABLE=$(which python3) ..
+make -j4 pnnx
+
+./src/pnnx || echo "pass"
+
+popd
+
+export PATH=$PWD/ncnn/tools/pnnx/build/src:$PATH
+
+log "=========================================================================="
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
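+# GIT_LFS_SKIP_SMUDGE=1 clones only the LFS pointer files; "git lfs pull
+# --include ..." below then downloads just the checkpoint we need instead
+# of every large file in the repo.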
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
+
+cd exp
+ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-99.pt
+popd
+
+log "Export via torch.jit.trace()"
+
+./conv_emformer_transducer_stateless2/export-for-ncnn.py \
+ --exp-dir $repo/exp \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --num-encoder-layers 12 \
+ --chunk-length 32 \
+ --cnn-module-kernel 31 \
+ --left-context-length 32 \
+ --right-context-length 8 \
+ --memory-size 32
+
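+# pnnx converts each torch.jit.trace()-ed model into an ncnn pair:
+# foo-pnnx.pt -> foo-pnnx.ncnn.param (text graph) + foo-pnnx.ncnn.bin
+# (weights), which the ncnn decoding scripts below consume.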
+pnnx $repo/exp/encoder_jit_trace-pnnx.pt
+pnnx $repo/exp/decoder_jit_trace-pnnx.pt
+pnnx $repo/exp/joiner_jit_trace-pnnx.pt
+
+python3 ./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
+ --encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
+ --decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
+ --decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
+ --joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
+ --joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
+ $repo/test_wavs/1089-134686-0001.wav
+
+rm -rf $repo
+log "--------------------------------------------------------------------------"
+
+log "=========================================================================="
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
+
+cd exp
+ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
+popd
+
+log "Export via torch.jit.trace()"
+
+./lstm_transducer_stateless2/export-for-ncnn.py \
+ --exp-dir $repo/exp \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model 0
+
+pnnx $repo/exp/encoder_jit_trace-pnnx.pt
+pnnx $repo/exp/decoder_jit_trace-pnnx.pt
+pnnx $repo/exp/joiner_jit_trace-pnnx.pt
+
+python3 ./lstm_transducer_stateless2/streaming-ncnn-decode.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
+ --encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
+ --decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
+ --decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
+ --joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
+ --joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
+ $repo/test_wavs/1089-134686-0001.wav
+
+python3 ./lstm_transducer_stateless2/ncnn-decode.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
+ --encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
+ --decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
+ --decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
+ --joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
+ --joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
+ $repo/test_wavs/1089-134686-0001.wav
+
+rm -rf $repo
+log "--------------------------------------------------------------------------"
+
+log "=========================================================================="
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "exp/pretrained.pt"
+
+cd exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --exp-dir $repo/exp \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --decode-chunk-len 32 \
+ --num-encoder-layers "2,4,3,2,4" \
+ --feedforward-dims "1024,1024,2048,2048,1024" \
+ --nhead "8,8,8,8,8" \
+ --encoder-dims "384,384,384,384,384" \
+ --attention-dims "192,192,192,192,192" \
+ --encoder-unmasked-dims "256,256,256,256,256" \
+ --zipformer-downsampling-factors "1,2,4,8,2" \
+ --cnn-module-kernels "31,31,31,31,31" \
+ --decoder-dim 512 \
+ --joiner-dim 512
+
+pnnx $repo/exp/encoder_jit_trace-pnnx.pt
+pnnx $repo/exp/decoder_jit_trace-pnnx.pt
+pnnx $repo/exp/joiner_jit_trace-pnnx.pt
+
+python3 ./pruned_transducer_stateless7_streaming/streaming-ncnn-decode.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
+ --encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
+ --decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
+ --decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
+ --joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
+ --joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
+ $repo/test_wavs/1089-134686-0001.wav
+
+rm -rf $repo
+log "--------------------------------------------------------------------------"
+
+log "=========================================================================="
+repo_url=https://huggingface.co/pfluo/k2fsa-zipformer-chinese-english-mixed
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "data/lang_char_bpe/L.pt"
+git lfs pull --include "data/lang_char_bpe/L_disambig.pt"
+git lfs pull --include "data/lang_char_bpe/Linv.pt"
+git lfs pull --include "exp/pretrained.pt"
+
+cd exp
+ln -s pretrained.pt epoch-9999.pt
+popd
+
+./pruned_transducer_stateless7_streaming/export-for-ncnn-zh.py \
+ --tokens $repo/data/lang_char_bpe/tokens.txt \
+ --exp-dir $repo/exp \
+ --use-averaged-model 0 \
+ --epoch 9999 \
+ --avg 1 \
+ --decode-chunk-len 32 \
+ --num-encoder-layers "2,4,3,2,4" \
+ --feedforward-dims "1024,1024,1536,1536,1024" \
+ --nhead "8,8,8,8,8" \
+ --encoder-dims "384,384,384,384,384" \
+ --attention-dims "192,192,192,192,192" \
+ --encoder-unmasked-dims "256,256,256,256,256" \
+ --zipformer-downsampling-factors "1,2,4,8,2" \
+ --cnn-module-kernels "31,31,31,31,31" \
+ --decoder-dim 512 \
+ --joiner-dim 512
+
+pnnx $repo/exp/encoder_jit_trace-pnnx.pt
+pnnx $repo/exp/decoder_jit_trace-pnnx.pt
+pnnx $repo/exp/joiner_jit_trace-pnnx.pt
+
+python3 ./pruned_transducer_stateless7_streaming/streaming-ncnn-decode.py \
+ --tokens $repo/data/lang_char_bpe/tokens.txt \
+ --encoder-param-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.param \
+ --encoder-bin-filename $repo/exp/encoder_jit_trace-pnnx.ncnn.bin \
+ --decoder-param-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.param \
+ --decoder-bin-filename $repo/exp/decoder_jit_trace-pnnx.ncnn.bin \
+ --joiner-param-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.param \
+ --joiner-bin-filename $repo/exp/joiner_jit_trace-pnnx.ncnn.bin \
+ $repo/test_wavs/0.wav
+
+rm -rf $repo
+log "--------------------------------------------------------------------------"
diff --git a/.github/scripts/test-onnx-export.sh b/.github/scripts/test-onnx-export.sh
new file mode 100755
index 000000000..fcfc11fa6
--- /dev/null
+++ b/.github/scripts/test-onnx-export.sh
@@ -0,0 +1,465 @@
+#!/usr/bin/env bash
+
+set -e
+
+log() {
+ # This function is from espnet
+ local fname=${BASH_SOURCE[1]##*/}
+ echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/librispeech/ASR
+
+log "=========================================================================="
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "exp/pretrained.pt"
+cd exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+log "Export via torch.jit.script()"
+./zipformer/export.py \
+ --exp-dir $repo/exp \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --jit 1
+
+log "Test export to ONNX format"
+./zipformer/export-onnx.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --exp-dir $repo/exp \
+ --num-encoder-layers "2,2,3,4,3,2" \
+ --downsampling-factor "1,2,4,8,4,2" \
+ --feedforward-dim "512,768,1024,1536,1024,768" \
+ --num-heads "4,4,4,8,4,4" \
+ --encoder-dim "192,256,384,512,384,256" \
+ --query-head-dim 32 \
+ --value-head-dim 12 \
+ --pos-head-dim 4 \
+ --pos-dim 48 \
+ --encoder-unmasked-dim "192,192,256,256,256,192" \
+ --cnn-module-kernel "31,31,15,15,15,31" \
+ --decoder-dim 512 \
+ --joiner-dim 512 \
+ --causal False \
+ --chunk-size "16,32,64,-1" \
+ --left-context-frames "64,128,256,-1"
+
+ls -lh $repo/exp
+
+log "Run onnx_check.py"
+
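+# onnx_check.py feeds the same random inputs to the torchscript and ONNX
+# encoder/decoder/joiner and asserts that their outputs agree within a
+# small tolerance, so export bugs surface here rather than as worse WER.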
+./zipformer/onnx_check.py \
+ --jit-filename $repo/exp/jit_script.pt \
+ --onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
+
+log "Run onnx_pretrained.py"
+
+./zipformer/onnx_pretrained.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav
+
+rm -rf $repo
+
+log "=========================================================================="
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "exp/pretrained.pt"
+
+cd exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+log "Test export streaming model to ONNX format"
+./zipformer/export-onnx-streaming.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --exp-dir $repo/exp \
+ --num-encoder-layers "2,2,3,4,3,2" \
+ --downsampling-factor "1,2,4,8,4,2" \
+ --feedforward-dim "512,768,1024,1536,1024,768" \
+ --num-heads "4,4,4,8,4,4" \
+ --encoder-dim "192,256,384,512,384,256" \
+ --query-head-dim 32 \
+ --value-head-dim 12 \
+ --pos-head-dim 4 \
+ --pos-dim 48 \
+ --encoder-unmasked-dim "192,192,256,256,256,192" \
+ --cnn-module-kernel "31,31,15,15,15,31" \
+ --decoder-dim 512 \
+ --joiner-dim 512 \
+ --causal True \
+ --chunk-size 16 \
+ --left-context-frames 64
+
+ls -lh $repo/exp
+
+log "Run onnx_pretrained-streaming.py"
+
+./zipformer/onnx_pretrained-streaming.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1-chunk-16-left-64.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1-chunk-16-left-64.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1-chunk-16-left-64.onnx \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav
+
+rm -rf $repo
+
+log "--------------------------------------------------------------------------"
+
+log "=========================================================================="
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/pretrained.pt"
+cd exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+log "Export via torch.jit.trace()"
+
+./pruned_transducer_stateless7_streaming/jit_trace_export.py \
+ --bpe-model $repo/data/lang_bpe_500/bpe.model \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --decode-chunk-len 32 \
+ --exp-dir $repo/exp/
+
+log "Test exporting to ONNX format"
+
+./pruned_transducer_stateless7_streaming/export-onnx.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --decode-chunk-len 32 \
+ --exp-dir $repo/exp/
+
+ls -lh $repo/exp
+
+log "Run onnx_check.py"
+
+./pruned_transducer_stateless7_streaming/onnx_check.py \
+ --jit-encoder-filename $repo/exp/encoder_jit_trace.pt \
+ --jit-decoder-filename $repo/exp/decoder_jit_trace.pt \
+ --jit-joiner-filename $repo/exp/joiner_jit_trace.pt \
+ --onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
+
+log "Run onnx_pretrained.py"
+
+./pruned_transducer_stateless7_streaming/onnx_pretrained.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav
+
+rm -rf $repo
+log "--------------------------------------------------------------------------"
+
+log "=========================================================================="
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
+log "Downloading pre-trained model from $repo_url"
+git lfs install
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/pretrained-iter-1224000-avg-14.pt"
+
+cd exp
+ln -s pretrained-iter-1224000-avg-14.pt epoch-9999.pt
+popd
+
+log "Export via torch.jit.script()"
+
+./pruned_transducer_stateless3/export.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 9999 \
+ --avg 1 \
+ --exp-dir $repo/exp/ \
+ --jit 1
+
+log "Test exporting to ONNX format"
+
+./pruned_transducer_stateless3/export-onnx.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 9999 \
+ --avg 1 \
+ --exp-dir $repo/exp/
+
+ls -lh $repo/exp
+
+log "Run onnx_check.py"
+
+./pruned_transducer_stateless3/onnx_check.py \
+ --jit-filename $repo/exp/cpu_jit.pt \
+ --onnx-encoder-filename $repo/exp/encoder-epoch-9999-avg-1.onnx \
+ --onnx-decoder-filename $repo/exp/decoder-epoch-9999-avg-1.onnx \
+ --onnx-joiner-filename $repo/exp/joiner-epoch-9999-avg-1.onnx
+
+log "Run onnx_pretrained.py"
+
+./pruned_transducer_stateless3/onnx_pretrained.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-9999-avg-1.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-9999-avg-1.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-9999-avg-1.onnx \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+rm -rf $repo
+log "--------------------------------------------------------------------------"
+
+log "=========================================================================="
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/pretrained-epoch-39-avg-7.pt"
+
+cd exp
+ln -s pretrained-epoch-39-avg-7.pt epoch-99.pt
+popd
+
+log "Export via torch.jit.script()"
+
+./pruned_transducer_stateless5/export.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --exp-dir $repo/exp \
+ --num-encoder-layers 18 \
+ --dim-feedforward 2048 \
+ --nhead 8 \
+ --encoder-dim 512 \
+ --decoder-dim 512 \
+ --joiner-dim 512 \
+ --jit 1
+
+log "Test exporting to ONNX format"
+
+./pruned_transducer_stateless5/export-onnx.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --exp-dir $repo/exp \
+ --num-encoder-layers 18 \
+ --dim-feedforward 2048 \
+ --nhead 8 \
+ --encoder-dim 512 \
+ --decoder-dim 512 \
+ --joiner-dim 512
+
+ls -lh $repo/exp
+
+log "Run onnx_check.py"
+
+./pruned_transducer_stateless5/onnx_check.py \
+ --jit-filename $repo/exp/cpu_jit.pt \
+ --onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
+
+log "Run onnx_pretrained.py"
+
+./pruned_transducer_stateless5/onnx_pretrained.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+rm -rf $repo
+log "--------------------------------------------------------------------------"
+
+log "=========================================================================="
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "exp/pretrained.pt"
+
+cd exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+log "Export via torch.jit.script()"
+
+./pruned_transducer_stateless7/export.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --exp-dir $repo/exp \
+ --feedforward-dims "1024,1024,2048,2048,1024" \
+ --jit 1
+
+log "Test exporting to ONNX format"
+
+./pruned_transducer_stateless7/export-onnx.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --exp-dir $repo/exp \
+ --feedforward-dims "1024,1024,2048,2048,1024"
+
+ls -lh $repo/exp
+
+log "Run onnx_check.py"
+
+./pruned_transducer_stateless7/onnx_check.py \
+ --jit-filename $repo/exp/cpu_jit.pt \
+ --onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
+
+log "Run onnx_pretrained.py"
+
+./pruned_transducer_stateless7/onnx_pretrained.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav \
+ $repo/test_wavs/1221-135766-0001.wav \
+ $repo/test_wavs/1221-135766-0002.wav
+
+rm -rf $repo
+log "--------------------------------------------------------------------------"
+
+log "=========================================================================="
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
+
+cd exp
+ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-99.pt
+popd
+
+log "Test exporting to ONNX format"
+
+./conv_emformer_transducer_stateless2/export-onnx.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --exp-dir $repo/exp \
+ --num-encoder-layers 12 \
+ --chunk-length 32 \
+ --cnn-module-kernel 31 \
+ --left-context-length 32 \
+ --right-context-length 8 \
+ --memory-size 32
+
+log "Run onnx_pretrained.py"
+
+./conv_emformer_transducer_stateless2/onnx_pretrained.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1221-135766-0001.wav
+
+rm -rf $repo
+log "--------------------------------------------------------------------------"
+
+log "=========================================================================="
+repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
+
+cd exp
+ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
+popd
+
+log "Export via torch.jit.trace()"
+
+./lstm_transducer_stateless2/export.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --exp-dir $repo/exp/ \
+ --jit-trace 1
+
+log "Test exporting to ONNX format"
+
+./lstm_transducer_stateless2/export-onnx.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --exp-dir $repo/exp
+
+ls -lh $repo/exp
+
+log "Run onnx_check.py"
+
+./lstm_transducer_stateless2/onnx_check.py \
+ --jit-encoder-filename $repo/exp/encoder_jit_trace.pt \
+ --jit-decoder-filename $repo/exp/decoder_jit_trace.pt \
+ --jit-joiner-filename $repo/exp/joiner_jit_trace.pt \
+ --onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx
+
+log "Run onnx_pretrained.py"
+
+./lstm_transducer_stateless2/onnx_pretrained.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1221-135766-0001.wav
+
+rm -rf $repo
+log "--------------------------------------------------------------------------"
diff --git a/.github/workflows/build-doc.yml b/.github/workflows/build-doc.yml
new file mode 100644
index 000000000..d7fe2c964
--- /dev/null
+++ b/.github/workflows/build-doc.yml
@@ -0,0 +1,69 @@
+# Copyright 2022 Xiaomi Corp. (author: Fangjun Kuang)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# refer to https://github.com/actions/starter-workflows/pull/47/files
+
+# You can access it at https://k2-fsa.github.io/icefall/
+name: Generate doc
+on:
+ push:
+ branches:
+ - master
+ - doc
+ pull_request:
+ types: [labeled]
+
+concurrency:
+ group: build_doc-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ build-doc:
+ if: github.event.label.name == 'doc' || github.event_name == 'push'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-latest]
+ python-version: ["3.8"]
+ steps:
+ # refer to https://github.com/actions/checkout
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Display Python version
+ run: python -c "import sys; print(sys.version)"
+
+ - name: Build doc
+ shell: bash
+ run: |
+ cd docs
+ python3 -m pip install -r ./requirements.txt
+ make html
+ touch build/html/.nojekyll
+
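+      # peaceiris/actions-gh-pages pushes ./docs/build/html to the
+      # gh-pages branch; the .nojekyll file created above keeps GitHub
+      # Pages from running Jekyll over the generated HTML.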
+ - name: Deploy
+ uses: peaceiris/actions-gh-pages@v3
+ with:
+ github_token: ${{ secrets.GITHUB_TOKEN }}
+ publish_dir: ./docs/build/html
+ publish_branch: gh-pages
diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
new file mode 100644
index 000000000..e5d96dcdf
--- /dev/null
+++ b/.github/workflows/build-docker-image.yml
@@ -0,0 +1,52 @@
+# see also
+# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages
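+#
+# The images are published to Docker Hub as k2fsa/icefall:<tag>; for
+# example, one can run a published image locally with
+#   docker run --rm -it k2fsa/icefall:torch2.1.0-cuda12.1 /bin/bash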
+name: Build docker image
+on:
+ workflow_dispatch:
+
+concurrency:
+ group: build_docker-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ build-docker-image:
+ name: ${{ matrix.image }}
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-latest]
+ image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+
+ steps:
+ # refer to https://github.com/actions/checkout
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Rename
+ shell: bash
+ run: |
+ image=${{ matrix.image }}
+ mv -v ./docker/$image.dockerfile ./Dockerfile
+
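+      # GitHub-hosted runners have limited free disk space; dropping the
+      # preinstalled toolcache frees several GB for building the large
+      # CUDA images below.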
+ - name: Free space
+ shell: bash
+ run: |
+ df -h
+ rm -rf /opt/hostedtoolcache
+ df -h
+
+ - name: Log in to Docker Hub
+ uses: docker/login-action@v2
+ with:
+ username: ${{ secrets.DOCKER_USERNAME }}
+ password: ${{ secrets.DOCKER_PASSWORD }}
+
+ - name: Build and push
+ uses: docker/build-push-action@v4
+ with:
+ context: .
+ file: ./Dockerfile
+ push: true
+ tags: k2fsa/icefall:${{ matrix.image }}
diff --git a/.github/workflows/multi-zh-hans.yml b/.github/workflows/multi-zh-hans.yml
new file mode 100644
index 000000000..9081047de
--- /dev/null
+++ b/.github/workflows/multi-zh-hans.yml
@@ -0,0 +1,79 @@
+name: run-multi-zh-hans
+
+on:
+ push:
+ branches:
+ - master
+
+ workflow_dispatch:
+
+concurrency:
+ group: run-multi-zh-hans-${{ github.ref }}
+ cancel-in-progress: true
+
+permissions:
+ contents: write
+
+jobs:
+ multi-zh-hans:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
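+          # Install the pinned requirements one line at a time (xargs -L 1)
+          # so a failing package is easy to spot; protobuf is then rebuilt
+          # from source at 3.20.*, the usual workaround for descriptor
+          # errors caused by newer binary protobuf wheels.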
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: export-model
+ shell: bash
+ env:
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
+ run: |
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/multi-zh-hans.sh
+ ls -lh
+
+ - name: upload model to https://github.com/k2-fsa/sherpa-onnx
+ uses: svenstaro/upload-release-action@v2
+ with:
+ file_glob: true
+ file: ./*.tar.bz2
+ overwrite: true
+ repo_name: k2-fsa/sherpa-onnx
+ repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+ tag: asr-models
diff --git a/.github/workflows/run-aishell-2022-06-20.yml b/.github/workflows/run-aishell-2022-06-20.yml
new file mode 100644
index 000000000..53fcb2c03
--- /dev/null
+++ b/.github/workflows/run-aishell-2022-06-20.yml
@@ -0,0 +1,123 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-aishell-2022-06-20
+# pruned RNN-T + reworked model with random combiner
+# https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+    # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_aishell_2022_06_20-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_aishell_2022_06_20:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
+
+ - name: Display decoding results for aishell pruned_transducer_stateless3
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/aishell/ASR/
+ tree ./pruned_transducer_stateless3/exp
+
+ cd pruned_transducer_stateless3
+ echo "results for pruned_transducer_stateless3"
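+          # Each decode run records its best WER in a log-* file; grep the
+          # "best for <test set>" lines and sort them to summarize all runs.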
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
+
+ - name: Upload decoding results for aishell pruned_transducer_stateless3
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+        name: aishell-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless3-2022-06-20
+ path: egs/aishell/ASR/pruned_transducer_stateless3/exp/
diff --git a/.github/workflows/run-aishell-zipformer-2023-10-24.yml b/.github/workflows/run-aishell-zipformer-2023-10-24.yml
new file mode 100644
index 000000000..f2fb44a5f
--- /dev/null
+++ b/.github/workflows/run-aishell-zipformer-2023-10-24.yml
@@ -0,0 +1,95 @@
+# Copyright 2023 Zengrui Jin (Xiaomi Corp.)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-aishell-zipformer-2023-10-24
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+    # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_aishell_zipformer_2023_10_24-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_aishell_zipformer_2023_10_24:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'zipformer' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-aishell-zipformer-2023-10-24.sh
diff --git a/.github/workflows/run-docker-image.yml b/.github/workflows/run-docker-image.yml
new file mode 100644
index 000000000..d048923b6
--- /dev/null
+++ b/.github/workflows/run-docker-image.yml
@@ -0,0 +1,105 @@
+name: Run docker image
+on:
+ workflow_dispatch:
+
+concurrency:
+ group: run_docker_image-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run-docker-image:
+ name: ${{ matrix.image }}
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ os: [ubuntu-latest]
+ image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+ steps:
+ # refer to https://github.com/actions/checkout
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Run the build process with Docker
+ uses: addnab/docker-run-action@v3
+ with:
+ image: k2fsa/icefall:${{ matrix.image }}
+ shell: bash
+ run: |
+ uname -a
+ cat /etc/*release
+
+ find / -name libcuda* 2>/dev/null
+
+ ls -lh /usr/local/
+ ls -lh /usr/local/cuda*
+
+ nvcc --version
+
+ ls -lh /usr/local/cuda-*/compat/*
+
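+          # The runner has no GPU driver, so point the dynamic loader at
+          # each image's CUDA compat/stub libraries; only the directory
+          # that actually exists in the chosen image has any effect.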
+ # For torch1.9.0-cuda10.2
+ export LD_LIBRARY_PATH=/usr/local/cuda-10.2/compat:$LD_LIBRARY_PATH
+
+ # For torch1.12.1-cuda11.3
+ export LD_LIBRARY_PATH=/usr/local/cuda-11.3/compat:$LD_LIBRARY_PATH
+
+ # For torch2.0.0-cuda11.7
+ export LD_LIBRARY_PATH=/usr/local/cuda-11.7/compat:$LD_LIBRARY_PATH
+
+ # For torch2.1.0-cuda11.8
+ export LD_LIBRARY_PATH=/usr/local/cuda-11.8/compat:$LD_LIBRARY_PATH
+
+ # For torch2.1.0-cuda12.1
+ export LD_LIBRARY_PATH=/usr/local/cuda-12.1/compat:$LD_LIBRARY_PATH
+
+ which nvcc
+ cuda_dir=$(dirname $(which nvcc))
+ echo "cuda_dir: $cuda_dir"
+
+ find $cuda_dir -name libcuda.so*
+ echo "--------------------"
+
+ find / -name libcuda.so* 2>/dev/null
+
+ # for torch1.13.0-cuda11.6
+ if [ -e /opt/conda/lib/stubs/libcuda.so ]; then
+ cd /opt/conda/lib/stubs && ln -s libcuda.so libcuda.so.1 && cd -
+ export LD_LIBRARY_PATH=/opt/conda/lib/stubs:$LD_LIBRARY_PATH
+ fi
+
+ find / -name libcuda.so* 2>/dev/null
+ echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
+
+ python3 --version
+ which python3
+
+ python3 -m pip list
+
+ echo "----------torch----------"
+ python3 -m torch.utils.collect_env
+
+ echo "----------k2----------"
+ python3 -c "import k2; print(k2.__file__)"
+ python3 -c "import k2; print(k2.__dev_version__)"
+ python3 -m k2.version
+
+ echo "----------lhotse----------"
+ python3 -c "import lhotse; print(lhotse.__file__)"
+ python3 -c "import lhotse; print(lhotse.__version__)"
+
+ echo "----------kaldifeat----------"
+ python3 -c "import kaldifeat; print(kaldifeat.__file__)"
+ python3 -c "import kaldifeat; print(kaldifeat.__version__)"
+
+ echo "Test yesno recipe"
+
+ cd egs/yesno/ASR
+
+ ./prepare.sh
+
+ ./tdnn/train.py
+
+ ./tdnn/decode.py
diff --git a/.github/workflows/run-gigaspeech-2022-05-13.yml b/.github/workflows/run-gigaspeech-2022-05-13.yml
new file mode 100644
index 000000000..3121520c1
--- /dev/null
+++ b/.github/workflows/run-gigaspeech-2022-05-13.yml
@@ -0,0 +1,126 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-gigaspeech-2022-05-13
+# stateless transducer + k2 pruned rnnt-loss + reworked conformer
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+    # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_gigaspeech_2022_05_13-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_gigaspeech_2022_05_13:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Download GigaSpeech dev/test dataset
+ shell: bash
+ run: |
+ sudo apt-get install -y -q git-lfs
+
+ .github/scripts/download-gigaspeech-dev-test-dataset.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ ln -s ~/tmp/giga-dev-dataset-fbank/data egs/gigaspeech/ASR/
+
+ ls -lh egs/gigaspeech/ASR/data/fbank
+
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-gigaspeech-pruned-transducer-stateless2-2022-05-12.sh
+
+ - name: Display decoding results for gigaspeech pruned_transducer_stateless2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+          sudo apt-get -qq install tree
+
+          cd egs/gigaspeech/ASR/
+          tree ./pruned_transducer_stateless2/exp
+
+ cd pruned_transducer_stateless2
+ echo "results for pruned_transducer_stateless2"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
+
+ - name: Upload decoding results for gigaspeech pruned_transducer_stateless2
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+        name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-gigaspeech-pruned_transducer_stateless2-2022-05-12
+ path: egs/gigaspeech/ASR/pruned_transducer_stateless2/exp/
diff --git a/.github/workflows/run-gigaspeech-zipformer-2023-10-17.yml b/.github/workflows/run-gigaspeech-zipformer-2023-10-17.yml
new file mode 100644
index 000000000..87090e310
--- /dev/null
+++ b/.github/workflows/run-gigaspeech-zipformer-2023-10-17.yml
@@ -0,0 +1,140 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-gigaspeech-zipformer-2023-10-17
+# zipformer
+
+on:
+ push:
+ branches:
+ - master
+
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+    # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+ workflow_dispatch:
+
+concurrency:
+ group: run_gigaspeech_2023_10_17_zipformer-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_gigaspeech_2023_10_17_zipformer:
+    if: github.event.label.name == 'zipformer' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
+ run: |
+ mkdir -p egs/gigaspeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/gigaspeech/ASR/data/fbank
+ ls -lh egs/gigaspeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-gigaspeech-zipformer-2023-10-17.sh
+
+ - name: upload model to https://github.com/k2-fsa/sherpa-onnx
+ uses: svenstaro/upload-release-action@v2
+ with:
+ file_glob: true
+ file: ./*.tar.bz2
+ overwrite: true
+ repo_name: k2-fsa/sherpa-onnx
+ repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+ tag: asr-models
+
+ - name: Display decoding results for gigaspeech zipformer
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/gigaspeech/ASR/
+ tree ./zipformer/exp
+
+ cd zipformer
+ echo "results for zipformer"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for gigaspeech zipformer
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+        name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-gigaspeech-zipformer-2023-10-17
+ path: egs/gigaspeech/ASR/zipformer/exp/
diff --git a/.github/workflows/run-librispeech-2022-03-12.yml b/.github/workflows/run-librispeech-2022-03-12.yml
new file mode 100644
index 000000000..f092e3c80
--- /dev/null
+++ b/.github/workflows/run-librispeech-2022-03-12.yml
@@ -0,0 +1,159 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-2022-03-12
+# stateless transducer + k2 pruned rnnt-loss
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+    # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2022_03_12-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2022_03_12:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-pruned-transducer-stateless-2022-03-12.sh
+
+ - name: Display decoding results for pruned_transducer_stateless
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./pruned_transducer_stateless/exp
+
+ cd pruned_transducer_stateless
+ echo "results for pruned_transducer_stateless"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for pruned_transducer_stateless
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+        name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless-2022-03-12
+ path: egs/librispeech/ASR/pruned_transducer_stateless/exp/
diff --git a/.github/workflows/run-librispeech-2022-04-29.yml b/.github/workflows/run-librispeech-2022-04-29.yml
new file mode 100644
index 000000000..f8f4d9977
--- /dev/null
+++ b/.github/workflows/run-librispeech-2022-04-29.yml
@@ -0,0 +1,185 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-2022-04-29
+# stateless pruned transducer (reworked model) + GigaSpeech
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+    # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2022_04_29-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2022_04_29:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
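+ # Install the CI requirements one package at a time (skipping comment lines), then rebuild protobuf from source at a pinned version.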
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
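+ # The date suffix in the key acts as a manual cache-buster; bump it to force a rebuild.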
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
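+ # Manifest preparation is cheap, so it runs on every build.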
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-pruned-transducer-stateless2-2022-04-29.sh
+
+ .github/scripts/run-librispeech-pruned-transducer-stateless3-2022-04-29.sh
+
+ - name: Display decoding results for pruned_transducer_stateless2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR
+ tree pruned_transducer_stateless2/exp
+ cd pruned_transducer_stateless2/exp
+ echo "===greedy search==="
+ find greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Display decoding results for pruned_transducer_stateless3
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR
+ tree pruned_transducer_stateless3/exp
+ cd pruned_transducer_stateless3/exp
+ echo "===greedy search==="
+ find greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for pruned_transducer_stateless2
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless2-2022-04-29
+ path: egs/librispeech/ASR/pruned_transducer_stateless2/exp/
+
+ - name: Upload decoding results for pruned_transducer_stateless3
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless3-2022-04-29
+ path: egs/librispeech/ASR/pruned_transducer_stateless3/exp/
diff --git a/.github/workflows/run-librispeech-2022-05-13.yml b/.github/workflows/run-librispeech-2022-05-13.yml
new file mode 100644
index 000000000..dc20185da
--- /dev/null
+++ b/.github/workflows/run-librispeech-2022-05-13.yml
@@ -0,0 +1,159 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-2022-05-13
+# stateless transducer + k2 pruned RNN-T loss + deeper model
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2022_05_13-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2022_05_13:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-pruned-transducer-stateless5-2022-05-13.sh
+
+ - name: Display decoding results for librispeech pruned_transducer_stateless5
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./pruned_transducer_stateless5/exp
+
+ cd pruned_transducer_stateless5
+ echo "results for pruned_transducer_stateless5"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for librispeech pruned_transducer_stateless5
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless5-2022-05-13
+ path: egs/librispeech/ASR/pruned_transducer_stateless5/exp/
diff --git a/.github/workflows/run-librispeech-2022-11-11-stateless7.yml b/.github/workflows/run-librispeech-2022-11-11-stateless7.yml
new file mode 100644
index 000000000..7e378c9a1
--- /dev/null
+++ b/.github/workflows/run-librispeech-2022-11-11-stateless7.yml
@@ -0,0 +1,159 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-2022-11-11-stateless7
+# zipformer
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2022_11_11_zipformer-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2022_11_11_zipformer:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-pruned-transducer-stateless7-2022-11-11.sh
+
+ - name: Display decoding results for librispeech pruned_transducer_stateless7
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./pruned_transducer_stateless7/exp
+
+ cd pruned_transducer_stateless7
+ echo "results for pruned_transducer_stateless7"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for librispeech pruned_transducer_stateless7
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-2022-11-11
+ path: egs/librispeech/ASR/pruned_transducer_stateless7/exp/
diff --git a/.github/workflows/run-librispeech-2022-11-14-stateless8.yml b/.github/workflows/run-librispeech-2022-11-14-stateless8.yml
new file mode 100644
index 000000000..a2c1a0ad6
--- /dev/null
+++ b/.github/workflows/run-librispeech-2022-11-14-stateless8.yml
@@ -0,0 +1,159 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-2022-11-14-stateless8
+# zipformer
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2022_11_14_zipformer_stateless8-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2022_11_14_zipformer_stateless8:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-pruned-transducer-stateless8-2022-11-14.sh
+
+ - name: Display decoding results for librispeech pruned_transducer_stateless8
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./pruned_transducer_stateless8/exp
+
+ cd pruned_transducer_stateless8
+ echo "results for pruned_transducer_stateless8"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for librispeech pruned_transducer_stateless8
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless8-2022-11-14
+ path: egs/librispeech/ASR/pruned_transducer_stateless8/exp/
diff --git a/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml b/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml
new file mode 100644
index 000000000..500ab1736
--- /dev/null
+++ b/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml
@@ -0,0 +1,163 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-2022-12-01-stateless7-ctc
+# zipformer + CTC
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+jobs:
+ run_librispeech_2022_12_01_zipformer_ctc:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-2022-12-01.sh
+
+ - name: Display decoding results for librispeech pruned_transducer_stateless7_ctc
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./pruned_transducer_stateless7_ctc/exp
+
+ cd pruned_transducer_stateless7_ctc
+ echo "results for pruned_transducer_stateless7_ctc"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===ctc decoding==="
+ find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===1best==="
+ find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for librispeech pruned_transducer_stateless7_ctc
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-ctc-2022-12-01
+ path: egs/librispeech/ASR/pruned_transducer_stateless7_ctc/exp/
diff --git a/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml b/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml
new file mode 100644
index 000000000..1a7f9f594
--- /dev/null
+++ b/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml
@@ -0,0 +1,167 @@
+# Copyright 2022 Zengwei Yao
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-2022-12-08-zipformer-mmi
+# zipformer + MMI
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2022_12_08_zipformer-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2022_12_08_zipformer:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-zipformer-mmi-2022-12-08.sh
+
+ - name: Display decoding results for librispeech zipformer-mmi
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./zipformer_mmi/exp
+
+ cd zipformer_mmi
+ echo "results for zipformer_mmi"
+ echo "===1best==="
+ find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===nbest==="
+ find exp/nbest -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/nbest -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===nbest-rescoring-LG==="
+ find exp/nbest-rescoring-LG -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/nbest-rescoring-LG -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===nbest-rescoring-3-gram==="
+ find exp/nbest-rescoring-3-gram -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/nbest-rescoring-3-gram -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===nbest-rescoring-4-gram==="
+ find exp/nbest-rescoring-4-gram -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/nbest-rescoring-4-gram -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for librispeech zipformer-mmi
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer_mmi-2022-12-08
+ path: egs/librispeech/ASR/zipformer_mmi/exp/
diff --git a/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml b/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml
new file mode 100644
index 000000000..68014e20c
--- /dev/null
+++ b/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml
@@ -0,0 +1,172 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-2022-12-29-stateless7-streaming
+# streaming zipformer
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2022_12_29_zipformer_streaming-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2022_12_29_zipformer_streaming:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event.label.name == 'streaming-zipformer' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-pruned-transducer-stateless7-streaming-2022-12-29.sh
+
+ - name: Display decoding results for librispeech pruned_transducer_stateless7_streaming
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./pruned_transducer_stateless7_streaming/exp
+
+ cd pruned_transducer_stateless7_streaming
+ echo "results for pruned_transducer_stateless7_streaming"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===streaming greedy search==="
+ find exp/streaming/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/streaming/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===streaming fast_beam_search==="
+ find exp/streaming/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/streaming/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===streaming modified beam search==="
+ find exp/streaming/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/streaming/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for librispeech pruned_transducer_stateless7_streaming
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-streaming-2022-12-29
+ path: egs/librispeech/ASR/pruned_transducer_stateless7_streaming/exp/
diff --git a/.github/workflows/run-librispeech-2023-01-29-stateless7-ctc-bs.yml b/.github/workflows/run-librispeech-2023-01-29-stateless7-ctc-bs.yml
new file mode 100644
index 000000000..821abc25d
--- /dev/null
+++ b/.github/workflows/run-librispeech-2023-01-29-stateless7-ctc-bs.yml
@@ -0,0 +1,163 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-2023-01-29-stateless7-ctc-bs
+# zipformer + CTC with blank skip
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+jobs:
+ run_librispeech_2023_01_29_zipformer_ctc_bs:
+ if: github.event.label.name == 'run-decode' || github.event.label.name == 'blank-skip' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh
+
+ - name: Display decoding results for librispeech pruned_transducer_stateless7_ctc_bs
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./pruned_transducer_stateless7_ctc_bs/exp
+
+ cd pruned_transducer_stateless7_ctc_bs
+ echo "results for pruned_transducer_stateless7_ctc_bs"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===ctc decoding==="
+ find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===1best==="
+ find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for librispeech pruned_transducer_stateless7_ctc_bs
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-ctc-bs-2023-01-29
+ path: egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/exp/
diff --git a/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml b/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml
new file mode 100644
index 000000000..905515dc4
--- /dev/null
+++ b/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml
@@ -0,0 +1,155 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-conformer-ctc3-2022-11-28
+# conformer_ctc3
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2022_11_28_conformer_ctc3-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2022_11_28_conformer_ctc3:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-conformer-ctc3-2022-11-28.sh
+
+ - name: Display decoding results for librispeech conformer_ctc3
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./conformer_ctc3/exp
+
+ cd conformer_ctc3
+ echo "results for conformer_ctc3"
+ echo "===ctc-decoding==="
+ find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===1best==="
+ find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for librispeech conformer_ctc3
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-conformer_ctc3-2022-11-28
+ path: egs/librispeech/ASR/conformer_ctc3/exp/
diff --git a/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml b/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
new file mode 100644
index 000000000..501fae38c
--- /dev/null
+++ b/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
@@ -0,0 +1,163 @@
+name: run-librispeech-lstm-transducer2-2022-09-03
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_lstm_transducer_stateless2_2022_09_03-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_lstm_transducer_stateless2_2022_09_03:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'LODR' || github.event.label.name == 'shallow-fusion' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-lstm-transducer-stateless2-2022-09-03.sh
+
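+ # Which result directories exist depends on how the run was triggered (schedule vs. PR label).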
+ - name: Display decoding results for lstm_transducer_stateless2 (regular decoding)
+ if: github.event_name == 'schedule'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR
+ tree lstm_transducer_stateless2/exp
+ cd lstm_transducer_stateless2/exp
+ echo "===greedy search==="
+ find greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Display decoding results for lstm_transducer_stateless2 (shallow fusion)
+ if: github.event.label.name == 'shallow-fusion'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR
+ tree lstm_transducer_stateless2/exp
+ cd lstm_transducer_stateless2/exp
+ echo "===modified_beam_search_lm_shallow_fusion==="
+ echo "===Using RNNLM==="
+ find modified_beam_search_lm_shallow_fusion -name "log-*rnn*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find modified_beam_search_lm_shallow_fusion -name "log-*rnn*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Display LODR decoding results for lstm_transducer_stateless2
+ if: github.event.label.name == 'LODR'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR
+ tree lstm_transducer_stateless2/exp
+ cd lstm_transducer_stateless2/exp
+ echo "===modified_beam_search_rnnlm_LODR==="
+ find modified_beam_search_LODR -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find modified_beam_search_LODR -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for lstm_transducer_stateless2
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'shallow-fusion' || github.event.label.name == 'LODR'
+ with:
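+ # Note: matrix.torch is not defined in this workflow's matrix, so that
+ # segment of the artifact name expands to an empty string.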
+ name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-lstm_transducer_stateless2-2022-09-03
+ path: egs/librispeech/ASR/lstm_transducer_stateless2/exp/
diff --git a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
new file mode 100644
index 000000000..3fb0920bc
--- /dev/null
+++ b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
@@ -0,0 +1,157 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-pruned-transducer-stateless3-2022-05-13
+# stateless pruned transducer (reworked model) + GigaSpeech
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
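+# Only one run per ref: a new push to the same branch or PR cancels the run
+# already in progress.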
+concurrency:
+ group: run_librispeech_pruned_transducer_stateless3_2022_05_13-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_pruned_transducer_stateless3_2022_05_13:
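+ # Run only for labeled PRs (ready/run-decode), pushes to master, or the nightly schedule.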
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-pruned-transducer-stateless3-2022-05-13.sh
+
+ - name: Display decoding results for pruned_transducer_stateless3
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR
+ tree pruned_transducer_stateless3/exp
+ cd pruned_transducer_stateless3/exp
+ echo "===greedy search==="
+ find greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for pruned_transducer_stateless3
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless3-2022-04-29
+ path: egs/librispeech/ASR/pruned_transducer_stateless3/exp/
diff --git a/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml b/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml
new file mode 100644
index 000000000..67a6f6fc4
--- /dev/null
+++ b/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml
@@ -0,0 +1,159 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-streaming-2022-06-26
+# streaming conformer stateless transducer2
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_streaming_2022_06_26-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_streaming_2022_06_26:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-streaming-pruned-transducer-stateless2-2022-06-26.sh
+
+ - name: Display decoding results
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./pruned_transducer_stateless2/exp
+
+ cd pruned_transducer_stateless2
+ echo "results for pruned_transducer_stateless2"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified_beam_search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for pruned_transducer_stateless2
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless2-2022-06-26
+ path: egs/librispeech/ASR/pruned_transducer_stateless2/exp/
diff --git a/.github/workflows/run-librispeech-streaming-zipformer-2023-05-18.yml b/.github/workflows/run-librispeech-streaming-zipformer-2023-05-18.yml
new file mode 100644
index 000000000..5145fb43c
--- /dev/null
+++ b/.github/workflows/run-librispeech-streaming-zipformer-2023-05-18.yml
@@ -0,0 +1,174 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-streaming-zipformer-2023-05-18
+# zipformer
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2023_05_18_streaming_zipformer-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2023_05_18_streaming_zipformer:
+ if: github.event.label.name == 'zipformer' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-streaming-zipformer-2023-05-18.sh
+
+ - name: Display decoding results for librispeech zipformer
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./zipformer/exp
+
+ cd zipformer
+
+ echo "results for zipformer, simulated streaming decoding"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
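+ # Results under exp/streaming come from true chunk-wise streaming decoding;
+ # the ones above are from simulated streaming (full-utterance) decoding.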
+ echo "results for zipformer, chunk-wise streaming decoding"
+ echo "===greedy search==="
+ find exp/streaming/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/streaming/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/streaming/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/streaming/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/streaming/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/streaming/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for librispeech zipformer
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer-2022-11-11
+ path: egs/librispeech/ASR/zipformer/exp/
diff --git a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
new file mode 100644
index 000000000..35ca08a31
--- /dev/null
+++ b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
@@ -0,0 +1,159 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-2022-04-19
+# stateless transducer + torchaudio rnn-t loss
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2022_04_19-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2022_04_19:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-transducer-stateless2-2022-04-19.sh
+
+ - name: Display decoding results
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./transducer_stateless2/exp
+
+ cd transducer_stateless2
+ echo "results for transducer_stateless2"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified_beam_search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for transducer_stateless2
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless2-2022-04-19
+ path: egs/librispeech/ASR/transducer_stateless2/exp/
diff --git a/.github/workflows/run-librispeech-zipformer-2023-05-18.yml b/.github/workflows/run-librispeech-zipformer-2023-05-18.yml
new file mode 100644
index 000000000..e9d235ad1
--- /dev/null
+++ b/.github/workflows/run-librispeech-zipformer-2023-05-18.yml
@@ -0,0 +1,159 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-zipformer-2023-05-18
+# zipformer
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2023_05_18_zipformer-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2023_05_18_zipformer:
+ if: github.event.label.name == 'zipformer' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-zipformer-2023-05-18.sh
+
+ - name: Display decoding results for librispeech zipformer
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./zipformer/exp
+
+ cd zipformer
+ echo "results for zipformer"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for librispeech zipformer
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer-2022-11-11
+ path: egs/librispeech/ASR/zipformer/exp/
diff --git a/.github/workflows/run-librispeech-zipformer-ctc-2023-06-14.yml b/.github/workflows/run-librispeech-zipformer-ctc-2023-06-14.yml
new file mode 100644
index 000000000..48f0b1532
--- /dev/null
+++ b/.github/workflows/run-librispeech-zipformer-ctc-2023-06-14.yml
@@ -0,0 +1,155 @@
+# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-librispeech-zipformer-ctc-2023-06-14
+# zipformer
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_librispeech_2023_06_14_zipformer-ctc-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_librispeech_2023_06_14_zipformer_ctc:
+ if: github.event.label.name == 'zipformer' || github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-librispeech-zipformer-ctc-2023-06-14.sh
+
+ - name: Display decoding results for librispeech zipformer
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./zipformer/exp
+
+ cd zipformer
+ echo "results for zipformer"
+ echo "===ctc-decoding==="
+ find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/ctc-decoding -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===1best==="
+ find exp/1best -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/1best -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for librispeech zipformer
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer-2022-11-11
+ path: egs/librispeech/ASR/zipformer/exp/
diff --git a/.github/workflows/run-multi-corpora-zipformer.yml b/.github/workflows/run-multi-corpora-zipformer.yml
new file mode 100644
index 000000000..38f7eb908
--- /dev/null
+++ b/.github/workflows/run-multi-corpora-zipformer.yml
@@ -0,0 +1,84 @@
+# Copyright 2023 Xiaomi Corp. (author: Zengrui Jin)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-multi-corpora-zipformer
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+concurrency:
+ group: run_multi-corpora_zipformer-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_multi-corpora_zipformer:
+ if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'multi-zh_hans' || github.event.label.name == 'zipformer' || github.event.label.name == 'multi-corpora'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
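+ # This recipe caches no test data: the script below is expected to fetch the
+ # pre-trained models itself (hence the git-lfs install).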
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-multi-corpora-zipformer.sh
diff --git a/.github/workflows/run-pretrained-ctc.yml b/.github/workflows/run-pretrained-ctc.yml
new file mode 100644
index 000000000..074a63dfc
--- /dev/null
+++ b/.github/workflows/run-pretrained-ctc.yml
@@ -0,0 +1,87 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-pre-trained-ctc
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ workflow_dispatch:
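+ # Allows manual runs from the Actions tab; the test-run input defaults to 'y'
+ # so a manual run passes the job-level gate below.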
+ inputs:
+ test-run:
+ description: 'Test (y/n)?'
+ required: true
+ default: 'y'
+
+concurrency:
+ group: run_pre_trained_ctc-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_pre_trained_ctc:
+ if: github.event.label.name == 'ready' || github.event_name == 'push' || github.event.inputs.test-run == 'y' || github.event.label.name == 'ctc'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ run: |
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+ .github/scripts/run-pre-trained-ctc.sh
diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
new file mode 100644
index 000000000..f8caee8e5
--- /dev/null
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
@@ -0,0 +1,158 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-pre-trained-transducer-stateless-multi-datasets-librispeech-100h
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_pre_trained_transducer_stateless_multi_datasets_librispeech_100h-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_pre_trained_transducer_stateless_multi_datasets_librispeech_100h:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-pre-trained-transducer-stateless-librispeech-100h.sh
+
+ - name: Display decoding results for transducer_stateless_multi_datasets
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./transducer_stateless_multi_datasets/exp
+
+ cd transducer_stateless_multi_datasets
+ echo "results for transducer_stateless_multi_datasets"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for transducer_stateless_multi_datasets
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless_multi_datasets-100h-2022-02-21
+ path: egs/librispeech/ASR/transducer_stateless_multi_datasets/exp/
diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
new file mode 100644
index 000000000..7c3910eb8
--- /dev/null
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
@@ -0,0 +1,158 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-pre-trained-transducer-stateless-multi-datasets-librispeech-960h
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_pre_trained_transducer_stateless_multi_datasets_librispeech_960h-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_pre_trained_transducer_stateless_multi_datasets_librispeech_960h:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-pre-trained-transducer-stateless-librispeech-960h.sh
+
+ - name: Display decoding results for transducer_stateless_multi_datasets
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./transducer_stateless_multi_datasets/exp
+
+ cd transducer_stateless_multi_datasets
+ echo "results for transducer_stateless_multi_datasets"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for transducer_stateless_multi_datasets
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless_multi_datasets-960h-2022-03-01
+ path: egs/librispeech/ASR/transducer_stateless_multi_datasets/exp/
diff --git a/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml b/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
new file mode 100644
index 000000000..ce6d6f92d
--- /dev/null
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
@@ -0,0 +1,80 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-pre-trained-transducer-stateless-modified-2-aishell
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+concurrency:
+ group: run_pre_trained_transducer_stateless_modified_2_aishell-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_pre_trained_transducer_stateless_modified_2_aishell:
+ if: github.event.label.name == 'ready' || github.event_name == 'push'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ run: |
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+ .github/scripts/run-pre-trained-transducer-stateless-modified-2-aishell.sh
diff --git a/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml b/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
new file mode 100644
index 000000000..f0cebd94a
--- /dev/null
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
@@ -0,0 +1,80 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-pre-trained-transducer-stateless-modified-aishell
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+concurrency:
+ group: run_pre_trained_transducer_stateless_modified_aishell-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_pre_trained_transducer_stateless_modified_aishell:
+ if: github.event.label.name == 'ready' || github.event_name == 'push'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ run: |
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+ .github/scripts/run-pre-trained-transducer-stateless-modified-aishell.sh
diff --git a/.github/workflows/run-pretrained-transducer-stateless.yml b/.github/workflows/run-pretrained-transducer-stateless.yml
new file mode 100644
index 000000000..1b69b97bf
--- /dev/null
+++ b/.github/workflows/run-pretrained-transducer-stateless.yml
@@ -0,0 +1,158 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-pre-trained-transducer-stateless
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_pre_trained_transducer_stateless-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_pre_trained_transducer_stateless:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Cache LibriSpeech test-clean and test-other datasets
+ id: libri-test-clean-and-test-other-data
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/download
+ key: cache-libri-test-clean-and-test-other
+
+ - name: Download LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-data.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/download-librispeech-test-clean-and-test-other-dataset.sh
+
+ - name: Prepare manifests for LibriSpeech test-clean and test-other
+ shell: bash
+ run: |
+ .github/scripts/prepare-librispeech-test-clean-and-test-other-manifests.sh
+
+ - name: Cache LibriSpeech test-clean and test-other fbank features
+ id: libri-test-clean-and-test-other-fbank
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/fbank-libri
+ key: cache-libri-fbank-test-clean-and-test-other-v2
+
+ - name: Compute fbank for LibriSpeech test-clean and test-other
+ if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/compute-fbank-librispeech-test-clean-and-test-other.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ mkdir -p egs/librispeech/ASR/data
+ ln -sfv ~/tmp/fbank-libri egs/librispeech/ASR/data/fbank
+ ls -lh egs/librispeech/ASR/data/*
+
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-pre-trained-transducer-stateless.sh
+
+ - name: Display decoding results for transducer_stateless
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ shell: bash
+ run: |
+ cd egs/librispeech/ASR/
+ tree ./transducer_stateless/exp
+
+ cd transducer_stateless
+ echo "results for transducer_stateless"
+ echo "===greedy search==="
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/greedy_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===fast_beam_search==="
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ echo "===modified beam search==="
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-clean" {} + | sort -n -k2
+ find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test-other" {} + | sort -n -k2
+
+ - name: Upload decoding results for transducer_stateless
+ uses: actions/upload-artifact@v2
+ if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless-2022-02-07
+ path: egs/librispeech/ASR/transducer_stateless/exp/
diff --git a/.github/workflows/run-pretrained-transducer.yml b/.github/workflows/run-pretrained-transducer.yml
new file mode 100644
index 000000000..91d87f1c9
--- /dev/null
+++ b/.github/workflows/run-pretrained-transducer.yml
@@ -0,0 +1,80 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-pre-trained-transducer
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+concurrency:
+ group: run_pre_trained_transducer-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_pre_trained_transducer:
+ if: github.event.label.name == 'ready' || github.event_name == 'push'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ run: |
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+ .github/scripts/run-pre-trained-transducer.sh
diff --git a/.github/workflows/run-ptb-rnn-lm.yml b/.github/workflows/run-ptb-rnn-lm.yml
new file mode 100644
index 000000000..f8d9c02c5
--- /dev/null
+++ b/.github/workflows/run-ptb-rnn-lm.yml
@@ -0,0 +1,71 @@
+name: run-ptb-rnn-lm-training
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: run_ptb_rnn_lm_training-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_ptb_rnn_lm_training:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'rnnlm' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: ["3.8"]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | grep -v kaldifst | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Prepare data
+ shell: bash
+ run: |
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ cd egs/ptb/LM
+ ./prepare.sh
+
+ - name: Run training
+ shell: bash
+ run: |
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ cd egs/ptb/LM
+ ./train-rnn-lm.sh --world-size 1 --num-epochs 5 --use-epoch 4 --use-avg 2
+
+ - name: Upload pretrained models
+ uses: actions/upload-artifact@v2
+ if: github.event.label.name == 'ready' || github.event.label.name == 'rnnlm' || github.event_name == 'push' || github.event_name == 'schedule'
+ with:
+ name: python-${{ matrix.python-version }}-ubuntu-rnn-lm-ptb
+ path: egs/ptb/LM/my-rnnlm-exp/
diff --git a/.github/workflows/run-swbd-conformer-ctc.yml b/.github/workflows/run-swbd-conformer-ctc.yml
new file mode 100644
index 000000000..842691d38
--- /dev/null
+++ b/.github/workflows/run-swbd-conformer-ctc.yml
@@ -0,0 +1,84 @@
+# Copyright 2023 Xiaomi Corp. (author: Zengrui Jin)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-swbd-conformer_ctc
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+concurrency:
+ group: run-swbd-conformer_ctc-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run-swbd-conformer_ctc:
+ if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'swbd'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-swbd-conformer-ctc-2023-08-26.sh
diff --git a/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml b/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml
new file mode 100644
index 000000000..319a5558a
--- /dev/null
+++ b/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml
@@ -0,0 +1,84 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-wenetspeech-pruned-transducer-stateless2
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+concurrency:
+ group: run_wenetspeech_pruned_transducer_stateless2-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run_wenetspeech_pruned_transducer_stateless2:
+ if: github.event.label.name == 'onnx' || github.event.label.name == 'ready' || github.event_name == 'push' || github.event.label.name == 'wenetspeech'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Inference with pre-trained model
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ sudo apt-get -qq install git-lfs tree
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/run-wenetspeech-pruned-transducer-stateless2.sh
diff --git a/.github/workflows/run-yesno-recipe.yml b/.github/workflows/run-yesno-recipe.yml
new file mode 100644
index 000000000..9ac848535
--- /dev/null
+++ b/.github/workflows/run-yesno-recipe.yml
@@ -0,0 +1,185 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-yesno-recipe
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ branches:
+ - master
+
+concurrency:
+ group: run-yesno-recipe-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ run-yesno-recipe:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ # os: [ubuntu-latest, macos-10.15]
+ # TODO: enable macOS for CPU testing
+ os: [ubuntu-latest]
+ python-version: [3.8]
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install libsndfile and libsox
+ if: startsWith(matrix.os, 'ubuntu')
+ run: |
+ sudo apt update
+ sudo apt install -q -y libsndfile1-dev libsndfile1 ffmpeg
+ sudo apt install -q -y --fix-missing sox libsox-dev libsox-fmt-all
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ pip install --no-deps --force-reinstall k2==1.24.4.dev20231021+cpu.torch1.13.1 -f https://k2-fsa.github.io/k2/cpu.html
+ pip install kaldifeat==1.25.1.dev20231022+cpu.torch1.13.1 -f https://csukuangfj.github.io/kaldifeat/cpu.html
+
+ - name: Run yesno recipe
+ shell: bash
+ working-directory: ${{github.workspace}}
+ run: |
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ echo $PYTHONPATH
+
+ cd egs/yesno/ASR
+ ./prepare.sh
+ python3 ./tdnn/train.py
+ python3 ./tdnn/decode.py
+
+ - name: Test exporting to pretrained.pt
+ shell: bash
+ working-directory: ${{github.workspace}}
+ run: |
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ echo $PYTHONPATH
+
+ cd egs/yesno/ASR
+ python3 ./tdnn/export.py --epoch 14 --avg 2
+
+ python3 ./tdnn/pretrained.py \
+ --checkpoint ./tdnn/exp/pretrained.pt \
+ --HLG ./data/lang_phone/HLG.pt \
+ --words-file ./data/lang_phone/words.txt \
+ download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+ download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+ - name: Test exporting to torchscript
+ shell: bash
+ working-directory: ${{github.workspace}}
+ run: |
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ echo $PYTHONPATH
+
+ cd egs/yesno/ASR
+ python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1
+
+ python3 ./tdnn/jit_pretrained.py \
+ --nn-model ./tdnn/exp/cpu_jit.pt \
+ --HLG ./data/lang_phone/HLG.pt \
+ --words-file ./data/lang_phone/words.txt \
+ download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+ download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+ - name: Test exporting to onnx
+ shell: bash
+ working-directory: ${{github.workspace}}
+ run: |
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ echo $PYTHONPATH
+
+ cd egs/yesno/ASR
+ python3 ./tdnn/export_onnx.py --epoch 14 --avg 2
+
+ echo "Test float32 model"
+ python3 ./tdnn/onnx_pretrained.py \
+ --nn-model ./tdnn/exp/model-epoch-14-avg-2.onnx \
+ --HLG ./data/lang_phone/HLG.pt \
+ --words-file ./data/lang_phone/words.txt \
+ download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+ download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+
+ echo "Test int8 model"
+ python3 ./tdnn/onnx_pretrained.py \
+ --nn-model ./tdnn/exp/model-epoch-14-avg-2.int8.onnx \
+ --HLG ./data/lang_phone/HLG.pt \
+ --words-file ./data/lang_phone/words.txt \
+ download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+ download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+ - name: Test decoding with H
+ shell: bash
+ working-directory: ${{github.workspace}}
+ run: |
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ echo $PYTHONPATH
+
+ cd egs/yesno/ASR
+ python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1
+
+ python3 ./tdnn/jit_pretrained_decode_with_H.py \
+ --nn-model ./tdnn/exp/cpu_jit.pt \
+ --H ./data/lang_phone/H.fst \
+ --tokens ./data/lang_phone/tokens.txt \
+ ./download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+ ./download/waves_yesno/0_0_1_0_0_0_1_0.wav \
+ ./download/waves_yesno/0_0_1_0_0_1_1_1.wav
+
+ - name: Test decoding with HL
+ shell: bash
+ working-directory: ${{github.workspace}}
+ run: |
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ echo $PYTHONPATH
+
+ cd egs/yesno/ASR
+ python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1
+
+ python3 ./tdnn/jit_pretrained_decode_with_HL.py \
+ --nn-model ./tdnn/exp/cpu_jit.pt \
+ --HL ./data/lang_phone/HL.fst \
+ --words ./data/lang_phone/words.txt \
+ ./download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+ ./download/waves_yesno/0_0_1_0_0_0_1_0.wav \
+ ./download/waves_yesno/0_0_1_0_0_1_1_1.wav
+
+ - name: Show generated files
+ shell: bash
+ working-directory: ${{github.workspace}}
+ run: |
+ cd egs/yesno/ASR
+ ls -lh tdnn/exp
+ ls -lh data/lang_phone
diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml
new file mode 100644
index 000000000..fc1dcbfd4
--- /dev/null
+++ b/.github/workflows/style_check.yml
@@ -0,0 +1,69 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: style_check
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ branches:
+ - master
+
+concurrency:
+ group: style_check-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ style_check:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v1
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install Python dependencies
+ run: |
+ python3 -m pip install --upgrade pip black==22.3.0 flake8==5.0.4 click==8.1.0
+ # Click issue fixed in https://github.com/psf/black/pull/2966
+
+ - name: Run flake8
+ shell: bash
+ working-directory: ${{github.workspace}}
+ run: |
+ # stop the build if there are Python syntax errors or undefined names
+ flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+ # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+ flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 \
+ --statistics --extend-ignore=E203,E266,E501,F401,E402,F403,F841,W503
+
+ - name: Run black
+ shell: bash
+ working-directory: ${{github.workspace}}
+ run: |
+ black --check --diff .
diff --git a/.github/workflows/test-ncnn-export.yml b/.github/workflows/test-ncnn-export.yml
new file mode 100644
index 000000000..5709f8ebb
--- /dev/null
+++ b/.github/workflows/test-ncnn-export.yml
@@ -0,0 +1,75 @@
+name: test-ncnn-export
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: test_ncnn_export-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ test_ncnn_export:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'ncnn' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Test ncnn export
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/test-ncnn-export.sh
diff --git a/.github/workflows/test-onnx-export.yml b/.github/workflows/test-onnx-export.yml
new file mode 100644
index 000000000..c05cde3ba
--- /dev/null
+++ b/.github/workflows/test-onnx-export.yml
@@ -0,0 +1,75 @@
+name: test-onnx-export
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ types: [labeled]
+
+ schedule:
+ # minute (0-59)
+ # hour (0-23)
+ # day of the month (1-31)
+ # month (1-12)
+ # day of the week (0-6)
+ # nightly build at 15:50 UTC time every day
+ - cron: "50 15 * * *"
+
+concurrency:
+ group: test_onnx_export-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ test_onnx_export:
+ if: github.event.label.name == 'ready' || github.event.label.name == 'onnx' || github.event_name == 'push' || github.event_name == 'schedule'
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: [3.8]
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: 'pip'
+ cache-dependency-path: '**/requirements-ci.txt'
+
+ - name: Install Python dependencies
+ run: |
+ grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ - name: Cache kaldifeat
+ id: my-cache
+ uses: actions/cache@v2
+ with:
+ path: |
+ ~/tmp/kaldifeat
+ key: cache-tmp-${{ matrix.python-version }}-2023-05-22
+
+ - name: Install kaldifeat
+ if: steps.my-cache.outputs.cache-hit != 'true'
+ shell: bash
+ run: |
+ .github/scripts/install-kaldifeat.sh
+
+ - name: Test ONNX export
+ shell: bash
+ env:
+ GITHUB_EVENT_NAME: ${{ github.event_name }}
+ GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
+ run: |
+ export PYTHONPATH=$PWD:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
+ export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+
+ .github/scripts/test-onnx-export.sh
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 000000000..363556bb7
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,131 @@
+# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: test
+
+on:
+ push:
+ branches:
+ - master
+ pull_request:
+ branches:
+ - master
+
+concurrency:
+ group: test-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ test:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ os: [ubuntu-latest]
+ python-version: ["3.8"]
+ torch: ["1.13.0"]
+ torchaudio: ["0.13.0"]
+ k2-version: ["1.24.3.dev20230719"]
+
+ fail-fast: false
+
+ steps:
+ - uses: actions/checkout@v2
+ with:
+ fetch-depth: 0
+
+ - name: Setup Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v1
+ with:
+ python-version: ${{ matrix.python-version }}
+
+ - name: Install libsndfile and libsox
+ if: startsWith(matrix.os, 'ubuntu')
+ run: |
+ sudo apt update
+ sudo apt install -q -y libsndfile1-dev libsndfile1 ffmpeg
+ sudo apt install -q -y --fix-missing libsox-dev libsox-fmt-all
+
+ - name: Install Python dependencies
+ run: |
+ python3 -m pip install --upgrade pip pytest
+ # numpy 1.20.x does not support python 3.6
+ pip install numpy==1.19
+ pip install torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+ pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+
+ pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html
+ pip install git+https://github.com/lhotse-speech/lhotse
+ # icefall requirements
+ pip uninstall -y protobuf
+ pip install --no-binary protobuf protobuf==3.20.*
+
+ pip install kaldifst
+ pip install onnxruntime matplotlib
+ pip install -r requirements.txt
+
+ - name: Install graphviz
+ if: startsWith(matrix.os, 'ubuntu')
+ shell: bash
+ run: |
+ python3 -m pip install -qq graphviz
+ sudo apt-get -qq install graphviz
+
+ - name: Run tests
+ if: startsWith(matrix.os, 'ubuntu')
+ run: |
+ ls -lh
+ export PYTHONPATH=$PWD:$PWD/lhotse:$PYTHONPATH
+ echo $PYTHONPATH
+ pytest -v -s ./test
+ # run tests for conformer ctc
+ cd egs/librispeech/ASR/conformer_ctc
+ pytest -v -s
+
+ cd ../pruned_transducer_stateless
+ pytest -v -s
+
+ cd ../pruned_transducer_stateless2
+ pytest -v -s
+
+ cd ../pruned_transducer_stateless3
+ pytest -v -s
+
+ cd ../pruned_transducer_stateless4
+ pytest -v -s
+
+ echo $PYTHONPATH
+ cd ../pruned_transducer_stateless7
+ pytest -v -s
+
+ cd ../transducer_stateless
+ pytest -v -s
+
+ # cd ../transducer
+ # pytest -v -s
+
+ cd ../transducer_stateless2
+ pytest -v -s
+
+ cd ../transducer_lstm
+ pytest -v -s
+
+ cd ../zipformer
+ pytest -v -s
+
+ - uses: actions/upload-artifact@v2
+ with:
+ path: egs/librispeech/ASR/zipformer/swoosh.pdf
+ name: swoosh.pdf
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..fa18ca83c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,38 @@
+icefall.egg-info/
+data
+__pycache__
+path.sh
+exp
+exp*/
+*.pt
+download
+dask-worker-space
+log
+*.bak
+*-bak
+*bak.py
+
+# Ignore Mac system files
+.DS_store
+
+# Ignore node_modules folder
+node_modules
+
+# Ignore .nfs files
+.nfs*
+
+# Ignore all text files
+*.txt
+
+# Ignore files related to API keys
+.env
+
+# Ignore SASS config files
+.sass-cache
+
+*.param
+*.bin
+.DS_Store
+*.fst
+*.arpa
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..1bb38f6ba
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,40 @@
+repos:
+ - repo: https://github.com/psf/black
+ rev: 22.3.0
+ hooks:
+ - id: black
+ args: ["--line-length=88"]
+ additional_dependencies: ['click==8.1.0']
+ exclude: icefall\/__init__\.py
+
+ - repo: https://github.com/PyCQA/flake8
+ rev: 5.0.4
+ hooks:
+ - id: flake8
+ args: ["--max-line-length=88", "--extend-ignore=E203,E266,E501,F401,E402,F403,F841,W503"]
+
+ # What are we ignoring here?
+ # E203: whitespace before ':'
+ # E266: too many leading '#' for block comment
+ # E501: line too long
+ # F401: module imported but unused
+ # E402: module level import not at top of file
+ # F403: 'from module import *' used; unable to detect undefined names
+ # F841: local variable is assigned to but never used
+ # W503: line break before binary operator
+ # In addition, the default ignore list is:
+ # E121,E123,E126,E226,E24,E704,W503,W504
+
+ - repo: https://github.com/pycqa/isort
+ rev: 5.11.5
+ hooks:
+ - id: isort
+ args: ["--profile=black"]
+
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.2.0
+ hooks:
+ - id: check-executables-have-shebangs
+ - id: end-of-file-fixer
+ - id: mixed-line-ending
+ - id: trailing-whitespace
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.md b/README.md
new file mode 100644
index 000000000..15e9e17e6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,413 @@
+
+## Introduction
+
+icefall contains ASR recipes for various datasets
+using [lhotse](https://github.com/lhotse-speech/lhotse) and [k2][k2].
+
+You can use [sherpa](https://github.com/k2-fsa/sherpa) to deploy models
+trained with icefall.
+
+You can try pre-trained models from within your browser, without the need
+to download or install anything, by visiting
+<https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>.
+See the icefall documentation for more details.
+
+## Installation
+
+Please refer to the [installation documentation](https://icefall.readthedocs.io/en/latest/installation/index.html)
+for installation.
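+
+If you just want a quick CPU-only setup, the commands below mirror what our
+CI does (see `.github/workflows/test.yml`). The pinned versions there will
+age, so treat this as a sketch and prefer the documentation above:
+
+```bash
+# Install CPU-only PyTorch, k2, lhotse and the icefall requirements.
+pip install torch==1.13.0+cpu torchaudio==0.13.0+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+pip install k2==1.24.3.dev20230719+cpu.torch1.13.0 -f https://k2-fsa.github.io/k2/cpu.html
+pip install git+https://github.com/lhotse-speech/lhotse
+pip install -r requirements.txt
+
+# icefall is used as a source checkout, not a pip package:
+export PYTHONPATH=/path/to/icefall:$PYTHONPATH  # adjust the path to your clone
+```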
+
+## Recipes
+
+Please refer to the [recipes documentation](https://icefall.readthedocs.io/en/latest/recipes/index.html)
+for more information.
+
+We provide the following recipes:
+
+ - [yesno][yesno]
+ - [LibriSpeech][librispeech]
+ - [GigaSpeech][gigaspeech]
+ - [AMI][ami]
+ - [Aishell][aishell]
+ - [Aishell2][aishell2]
+ - [Aishell4][aishell4]
+ - [TIMIT][timit]
+ - [TED-LIUM3][tedlium3]
+ - [Aidatatang_200zh][aidatatang_200zh]
+ - [WenetSpeech][wenetspeech]
+ - [Alimeeting][alimeeting]
+ - [Switchboard][swbd]
+ - [TAL_CSASR][tal_csasr]
+
+### yesno
+
+This is the simplest ASR recipe in `icefall` and can be run on CPU.
+Training takes less than 30 seconds and gives you the following WER:
+
+```
+[test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
+```
+We provide a Colab notebook for this recipe: [Open In Colab](https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing)
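+
+To run it locally, the steps mirror what our CI does in
+`.github/workflows/run-yesno-recipe.yml`:
+
+```bash
+cd egs/yesno/ASR
+./prepare.sh             # download the data and compute features
+python3 ./tdnn/train.py  # train the TDNN model on CPU
+python3 ./tdnn/decode.py
+```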
+
+
+### LibriSpeech
+
+Please see [egs/librispeech/ASR/RESULTS.md](egs/librispeech/ASR/RESULTS.md)
+for the **latest** results.
+
+We provide five models for this recipe:
+
+- [conformer CTC model][LibriSpeech_conformer_ctc]
+- [TDNN LSTM CTC model][LibriSpeech_tdnn_lstm_ctc]
+- [Transducer: Conformer encoder + LSTM decoder][LibriSpeech_transducer]
+- [Transducer: Conformer encoder + Embedding decoder][LibriSpeech_transducer_stateless]
+- [Transducer: Zipformer encoder + Embedding decoder][LibriSpeech_zipformer]
+
+#### Conformer CTC Model
+
+The best WER we currently have is:
+
+| | test-clean | test-other |
+|-----|------------|------------|
+| WER | 2.42 | 5.73 |
+
+
+We provide a Colab notebook to run a pre-trained conformer CTC model: [Open In Colab](https://colab.research.google.com/drive/1huyupXAcHsUrKaWfI83iMEJ6J0Nh0213?usp=sharing)
+
+#### TDNN LSTM CTC Model
+
+The WER for this model is:
+
+| | test-clean | test-other |
+|-----|------------|------------|
+| WER | 6.59 | 17.69 |
+
+We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [Open In Colab](https://colab.research.google.com/drive/1-iSfQMp2So-We_Uu49N4AAcMInB72u9z?usp=sharing)
+
+
+#### Transducer: Conformer encoder + LSTM decoder
+
+This model uses a Conformer encoder and an LSTM decoder.
+
+The best WER with greedy search is:
+
+| | test-clean | test-other |
+|-----|------------|------------|
+| WER | 3.07 | 7.51 |
+
+We provide a Colab notebook to run a pre-trained RNN-T conformer model: [Open In Colab](https://colab.research.google.com/drive/1_u6yK9jDkPwG_NLrZMN2XK7Aeq4suMO2?usp=sharing)
+
+#### Transducer: Conformer encoder + Embedding decoder
+
+This model uses a Conformer encoder. The decoder consists of an embedding
+layer followed by a convolutional layer.
+
+The best WER using modified beam search with beam size 4 is:
+
+| | test-clean | test-other |
+|-----|------------|------------|
+| WER | 2.56 | 6.27 |
+
+Note: No auxiliary losses are used in the training and no LMs are used
+in the decoding.
+
+We provide a Colab notebook to run a pre-trained transducer conformer + stateless decoder model: [Open In Colab](https://colab.research.google.com/drive/1CO1bXJ-2khDckZIW8zjOPHGSKLHpTDlp?usp=sharing)
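+
+For reference, a decoding command for this recipe has roughly the shape
+sketched below; the exact flags are illustrative, so check
+`./transducer_stateless/decode.py --help` for the authoritative list:
+
+```bash
+cd egs/librispeech/ASR
+./transducer_stateless/decode.py \
+  --exp-dir transducer_stateless/exp \
+  --decoding-method modified_beam_search \
+  --beam-size 4
+```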
+
+
+#### k2 pruned RNN-T
+
+| Encoder | Params | test-clean | test-other | epochs | devices |
+|-----------------|--------|------------|------------|---------|------------|
+| zipformer | 65.5M | 2.21 | 4.79 | 50 | 4 32G-V100 |
+| zipformer-small | 23.2M | 2.42 | 5.73 | 50 | 2 32G-V100 |
+| zipformer-large | 148.4M | 2.06 | 4.63 | 50 | 4 32G-V100 |
+| zipformer-large | 148.4M | 2.00 | 4.38 | 174 | 8 80G-A100 |
+
+Note: No auxiliary losses are used in the training and no LMs are used
+in the decoding.
+
+#### k2 pruned RNN-T + GigaSpeech
+
+| | test-clean | test-other |
+|-----|------------|------------|
+| WER | 1.78 | 4.08 |
+
+Note: No auxiliary losses are used in the training and no LMs are used
+in the decoding.
+
+#### k2 pruned RNN-T + GigaSpeech + CommonVoice
+
+| | test-clean | test-other |
+|-----|------------|------------|
+| WER | 1.90 | 3.98 |
+
+Note: No auxiliary losses are used in the training and no LMs are used
+in the decoding.
+
+
+### GigaSpeech
+
+We provide three models for this recipe:
+
+- [Conformer CTC model][GigaSpeech_conformer_ctc]
+- [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][GigaSpeech_pruned_transducer_stateless2]
+- [Transducer: Zipformer encoder + Embedding decoder][GigaSpeech_zipformer]
+
+#### Conformer CTC
+
+| | Dev | Test |
+|-----|-------|-------|
+| WER | 10.47 | 10.58 |
+
+#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
+
+| | Dev | Test |
+|----------------------|-------|-------|
+| greedy search | 10.51 | 10.73 |
+| fast beam search | 10.50 | 10.69 |
+| modified beam search | 10.40 | 10.51 |
+
+#### Transducer: Zipformer encoder + Embedding decoder
+
+| | Dev | Test |
+|----------------------|-------|-------|
+| greedy search | 10.31 | 10.50 |
+| fast beam search | 10.26 | 10.48 |
+| modified beam search | 10.25 | 10.38 |
+
+
+### Aishell
+
+We provide three models for this recipe: [conformer CTC model][Aishell_conformer_ctc],
+[TDNN LSTM CTC model][Aishell_tdnn_lstm_ctc], and [Transducer Stateless Model][Aishell_pruned_transducer_stateless7].
+
+#### Conformer CTC Model
+
+The best CER we currently have is:
+
+| | test |
+|-----|------|
+| CER | 4.26 |
+
+#### TDNN LSTM CTC Model
+
+The CER for this model is:
+
+| | test |
+|-----|-------|
+| CER | 10.16 |
+
+We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [Open In Colab](https://colab.research.google.com/drive/1jbyzYq3ytm6j2nlEt-diQm-6QVWyDDEa?usp=sharing)
+
+#### Transducer Stateless Model
+
+The best CER we currently have is:
+
+| | test |
+|-----|------|
+| CER | 4.38 |
+
+We provide a Colab notebook to run a pre-trained Transducer Stateless model: [Open In Colab](https://colab.research.google.com/drive/14XaT2MhnBkK-3_RqqWq3K90Xlbin-GZC?usp=sharing)
+
+
+### Aishell2
+
+We provide one model for this recipe: [Transducer Stateless Model][Aishell2_pruned_transducer_stateless5].
+
+#### Transducer Stateless Model
+
+The best WER we currently have is:
+
+| | dev-ios | test-ios |
+|-----|------------|------------|
+| WER | 5.32 | 5.56 |
+
+
+### Aishell4
+
+We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Aishell4_pruned_transducer_stateless5].
+
+#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with all subsets)
+
+The best CER we currently have is:
+
+| | test |
+|-----|------------|
+| CER | 29.08 |
+
+
+We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [Open In Colab](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)
+
+
+### TIMIT
+
+We provide two models for this recipe: [TDNN LSTM CTC model][TIMIT_tdnn_lstm_ctc]
+and [TDNN LiGRU CTC model][TIMIT_tdnn_ligru_ctc].
+
+#### TDNN LSTM CTC Model
+
+The best PER we currently have is:
+
+| | TEST |
+|--|--|
+| PER | 19.71% |
+
+We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [Open In Colab](https://colab.research.google.com/drive/1Hs9DA4V96uapw_30uNp32OMJgkuR5VVd?usp=sharing)
+
+#### TDNN LiGRU CTC Model
+
+The PER for this model is:
+
+| | TEST |
+|--|--|
+| PER | 17.66% |
+
+We provide a Colab notebook to run a pre-trained TDNN LiGRU CTC model: [Open In Colab](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)
+
+
+### TED-LIUM3
+
+We provide two models for this recipe: [Transducer Stateless: Conformer encoder + Embedding decoder][TED-LIUM3_transducer_stateless] and [Pruned Transducer Stateless: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][TED-LIUM3_pruned_transducer_stateless].
+
+#### Transducer Stateless: Conformer encoder + Embedding decoder
+
+The best WER using modified beam search with beam size 4 is:
+
+| | dev | test |
+|-----|-------|--------|
+| WER | 6.91 | 6.33 |
+
+Note: No auxiliary losses are used in the training and no LMs are used in the decoding.
+
+We provide a Colab notebook to run a pre-trained Transducer Stateless model: [Open In Colab](https://colab.research.google.com/drive/1MmY5bBxwvKLNT4A2DJnwiqRXhdchUqPN?usp=sharing)
+
+#### Pruned Transducer Stateless: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
+
+The best WER using modified beam search with beam size 4 is:
+
+| | dev | test |
+|-----|-------|--------|
+| WER | 6.77 | 6.14 |
+
+We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [Open In Colab](https://colab.research.google.com/drive/1je_1zGrOkGVVd4WLzgkXRHxl-I27yWtz?usp=sharing)
+
+
+### Aidatatang_200zh
+
+We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Aidatatang_200zh_pruned_transducer_stateless2].
+
+#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
+
+| | Dev | Test |
+|----------------------|-------|-------|
+| greedy search | 5.53 | 6.59 |
+| fast beam search | 5.30 | 6.34 |
+| modified beam search | 5.27 | 6.33 |
+
+We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [Open In Colab](https://colab.research.google.com/drive/1wNSnSj3T5oOctbh5IGCa393gKOoQw2GH?usp=sharing)
+
+
+### WenetSpeech
+
+We provide two models for this recipe: [Pruned stateless RNN-T_2: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless2] and [Pruned stateless RNN-T_5: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless5].
+
+#### Pruned stateless RNN-T_2: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with L subset, offline ASR)
+
+| | Dev | Test-Net | Test-Meeting |
+|----------------------|-------|----------|--------------|
+| greedy search | 7.80 | 8.75 | 13.49 |
+| modified beam search | 7.76 | 8.71 | 13.41 |
+| fast beam search | 7.94 | 8.74 | 13.80 |
+
+#### Pruned stateless RNN-T_5: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with L subset)
+
+**Streaming**:
+
+| | Dev | Test-Net | Test-Meeting |
+|----------------------|-------|----------|--------------|
+| greedy_search | 8.78 | 10.12 | 16.16 |
+| modified_beam_search | 8.53 | 9.95 | 15.81 |
+| fast_beam_search | 9.01 | 10.47 | 16.28 |
+
+We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless2 model: [Open In Colab](https://colab.research.google.com/drive/1EV4e1CHa1GZgEF-bZgizqI9RyFFehIiN?usp=sharing)
+
+### Alimeeting
+
+We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Alimeeting_pruned_transducer_stateless2].
+
+#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with far subset)
+
+| | Eval | Test-Net |
+|----------------------|--------|----------|
+| greedy search | 31.77 | 34.66 |
+| fast beam search | 31.39 | 33.02 |
+| modified beam search | 30.38 | 34.25 |
+
+We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [Open In Colab](https://colab.research.google.com/drive/1tKr3f0mL17uO_ljdHGKtR7HOmthYHwJG?usp=sharing)
+
+
+### TAL_CSASR
+
+We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][TAL_CSASR_pruned_transducer_stateless5].
+
+#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
+
+The best results, reported as CER (%) for Chinese and WER (%) for English
+(zh: Chinese, en: English), are:
+
+| decoding-method | dev | dev_zh | dev_en | test | test_zh | test_en |
+|--|--|--|--|--|--|--|
+| greedy_search | 7.30 | 6.48 | 19.19 | 7.39 | 6.66 | 19.13 |
+| modified_beam_search | 7.15 | 6.35 | 18.95 | 7.22 | 6.50 | 18.70 |
+| fast_beam_search | 7.18 | 6.39 | 18.90 | 7.27 | 6.55 | 18.77 |
+
+We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [Open In Colab](https://colab.research.google.com/drive/1DmIx-NloI1CMU5GdZrlse7TRu4y3Dpf8?usp=sharing)
+
+## Deployment with C++
+
+Once you have trained a model in icefall, you may want to deploy it with C++,
+without Python dependencies.
+
+Please refer to the documentation for how to do this.
+
+We also provide a Colab notebook, showing you how to run a torch scripted model in [k2][k2] with C++.
+Please see: [Open In Colab](https://colab.research.google.com/drive/1BIGLWzS36isskMXHKcqC9ysN6pspYXs_?usp=sharing)
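+
+As a concrete example, many recipes ship an `export.py` that produces a
+torchscript model which can then be loaded from C++. This is the command our
+CI runs for the yesno recipe (other recipes follow the same pattern):
+
+```bash
+cd egs/yesno/ASR
+python3 ./tdnn/export.py --epoch 14 --avg 2 --jit 1
+# produces ./tdnn/exp/cpu_jit.pt, loadable via torch::jit::load() in C++
+```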
+
+
+[LibriSpeech_tdnn_lstm_ctc]: egs/librispeech/ASR/tdnn_lstm_ctc
+[LibriSpeech_conformer_ctc]: egs/librispeech/ASR/conformer_ctc
+[LibriSpeech_transducer]: egs/librispeech/ASR/transducer
+[LibriSpeech_transducer_stateless]: egs/librispeech/ASR/transducer_stateless
+[LibriSpeech_zipformer]: egs/librispeech/ASR/zipformer
+[Aishell_tdnn_lstm_ctc]: egs/aishell/ASR/tdnn_lstm_ctc
+[Aishell_conformer_ctc]: egs/aishell/ASR/conformer_ctc
+[Aishell_pruned_transducer_stateless7]: egs/aishell/ASR/pruned_transducer_stateless7_bbpe
+[Aishell2_pruned_transducer_stateless5]: egs/aishell2/ASR/pruned_transducer_stateless5
+[Aishell4_pruned_transducer_stateless5]: egs/aishell4/ASR/pruned_transducer_stateless5
+[TIMIT_tdnn_lstm_ctc]: egs/timit/ASR/tdnn_lstm_ctc
+[TIMIT_tdnn_ligru_ctc]: egs/timit/ASR/tdnn_ligru_ctc
+[TED-LIUM3_transducer_stateless]: egs/tedlium3/ASR/transducer_stateless
+[TED-LIUM3_pruned_transducer_stateless]: egs/tedlium3/ASR/pruned_transducer_stateless
+[GigaSpeech_conformer_ctc]: egs/gigaspeech/ASR/conformer_ctc
+[GigaSpeech_pruned_transducer_stateless2]: egs/gigaspeech/ASR/pruned_transducer_stateless2
+[GigaSpeech_zipformer]: egs/gigaspeech/ASR/zipformer
+[Aidatatang_200zh_pruned_transducer_stateless2]: egs/aidatatang_200zh/ASR/pruned_transducer_stateless2
+[WenetSpeech_pruned_transducer_stateless2]: egs/wenetspeech/ASR/pruned_transducer_stateless2
+[WenetSpeech_pruned_transducer_stateless5]: egs/wenetspeech/ASR/pruned_transducer_stateless5
+[Alimeeting_pruned_transducer_stateless2]: egs/alimeeting/ASR/pruned_transducer_stateless2
+[TAL_CSASR_pruned_transducer_stateless5]: egs/tal_csasr/ASR/pruned_transducer_stateless5
+[yesno]: egs/yesno/ASR
+[librispeech]: egs/librispeech/ASR
+[aishell]: egs/aishell/ASR
+[aishell2]: egs/aishell2/ASR
+[aishell4]: egs/aishell4/ASR
+[timit]: egs/timit/ASR
+[tedlium3]: egs/tedlium3/ASR
+[gigaspeech]: egs/gigaspeech/ASR
+[aidatatang_200zh]: egs/aidatatang_200zh/ASR
+[wenetspeech]: egs/wenetspeech/ASR
+[alimeeting]: egs/alimeeting/ASR
+[tal_csasr]: egs/tal_csasr/ASR
+[ami]: egs/ami
+[swbd]: egs/swbd/ASR
+[k2]: https://github.com/k2-fsa/k2
diff --git a/activate-icefall.sh b/activate-icefall.sh
new file mode 100644
index 000000000..6116ca47a
--- /dev/null
+++ b/activate-icefall.sh
@@ -0,0 +1 @@
+export PYTHONPATH=/var/data/share20/qc/k2/Github/icefall:$PYTHONPATH
diff --git a/contributing.md b/contributing.md
new file mode 100644
index 000000000..0a1f9936e
--- /dev/null
+++ b/contributing.md
@@ -0,0 +1,37 @@
+# Contributing to Our Project
+
+Thank you for your interest in contributing to our project! We use Git pre-commit hooks to ensure code quality and consistency. Before contributing, please follow these guidelines to enable and use the pre-commit hooks.
+
+## Pre-Commit Hooks
+
+We have set up pre-commit hooks to check that the files you're committing meet our coding and formatting standards. These checks include:
+
+- Ensuring there are no trailing spaces.
+- Formatting code with [black](https://github.com/psf/black).
+- Checking compliance with PEP8 using [flake8](https://flake8.pycqa.org/).
+- Verifying that files end with a newline character (and only a newline).
+- Sorting imports using [isort](https://pycqa.github.io/isort/).
+
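+If you want to run the same checks by hand before committing, you can invoke the
+tools directly (a minimal sketch; pin the versions your project requires):
+
+```bash
+pip install black flake8 isort
+black --check .   # report formatting issues without modifying files
+flake8 .          # report style violations
+isort --check .   # report unsorted imports
+```
+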
+Please note that these hooks are disabled by default. To enable them, follow these steps:
+
+### Installation (Run only once)
+
+1. Install the `pre-commit` package using pip:
+ ```bash
+ pip install pre-commit
+ ```
+2. Install the Git hooks using:
+ ```bash
+ pre-commit install
+ ```
+### Making a Commit
+
+Once you have enabled the pre-commit hooks, follow these steps when making a commit:
+
+1. Make your changes to the codebase.
+2. Stage your changes with `git add` for the files you modified.
+3. Commit your changes with `git commit`. The pre-commit hooks will run automatically at this point.
+4. If all hooks pass, write your commit message and your changes will be committed.
+5. If any hook fails, the commit is aborted. Read the error messages, make the necessary changes, and then re-run `git add` and `git commit`. A typical session is sketched below.
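+
+For example, a typical session might look like this (the file name is illustrative):
+
+```bash
+# stage your changes
+git add icefall/utils.py
+
+# commit; the pre-commit hooks run automatically at this point
+git commit -m "Fix a typo"
+
+# if a hook such as black reformats the file, the commit is aborted;
+# re-stage the auto-fixed file and commit again
+git add icefall/utils.py
+git commit -m "Fix a typo"
+```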
+
+### Your Contribution
+
+Your contributions are valuable to us. By following these guidelines, you help maintain code consistency and quality in the project. If you have questions or need assistance, feel free to reach out. Thank you for being part of our open-source community!
+
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 000000000..19959bfe6
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,129 @@
+# icefall dockerfile
+
+## Download from dockerhub
+
+You can find pre-built docker images for icefall on Docker Hub at
+<https://hub.docker.com/r/k2fsa/icefall/tags>.
+
+Example usage:
+
+```bash
+docker run --gpus all --rm -it k2fsa/icefall:torch1.13.0-cuda11.6 /bin/bash
+```
+
+
+## Build from dockerfile
+
+Two sets of configurations are provided: (a) Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8, and (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8.
+
+If your NVIDIA driver supports CUDA 11.3, go for case (a) Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8.
+
+Otherwise, since the older PyTorch images are not updated with the [apt-key rotation by NVIDIA](https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key), you have to go for case (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8. Ensure that your NVIDIA driver supports at least CUDA 11.0.
+
+You can check the highest CUDA version your NVIDIA driver supports with the `nvidia-smi` command, as shown below. In this example, the highest CUDA version is 11.0, i.e. case (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8.
+
+```bash
+$ nvidia-smi
+Tue Sep 20 00:26:13 2022
++-----------------------------------------------------------------------------+
+| NVIDIA-SMI 450.119.03 Driver Version: 450.119.03 CUDA Version: 11.0 |
+|-------------------------------+----------------------+----------------------+
+| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
+| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
+| | | MIG M. |
+|===============================+======================+======================|
+| 0 TITAN RTX On | 00000000:03:00.0 Off | N/A |
+| 41% 31C P8 4W / 280W | 16MiB / 24219MiB | 0% Default |
+| | | N/A |
++-------------------------------+----------------------+----------------------+
+| 1 TITAN RTX On | 00000000:04:00.0 Off | N/A |
+| 41% 30C P8 11W / 280W | 6MiB / 24220MiB | 0% Default |
+| | | N/A |
++-------------------------------+----------------------+----------------------+
+
++-----------------------------------------------------------------------------+
+| Processes: |
+| GPU GI CI PID Type Process name GPU Memory |
+| ID ID Usage |
+|=============================================================================|
+| 0 N/A N/A 2085 G /usr/lib/xorg/Xorg 9MiB |
+| 0 N/A N/A 2240 G /usr/bin/gnome-shell 4MiB |
+| 1 N/A N/A 2085 G /usr/lib/xorg/Xorg 4MiB |
++-----------------------------------------------------------------------------+
+
+```
+
+## Building images locally
+
+If your environment requires a proxy to access the Internet, remember to add that information directly to the Dockerfile.
+In most cases, you can uncomment these lines in the Dockerfile and fill in your proxy details.
+
+```dockerfile
+ENV http_proxy=http://aaa.bb.cc.net:8080 \
+ https_proxy=http://aaa.bb.cc.net:8080
+```
+
+Then, proceed with these commands.
+
+### If you are case (a), i.e. your NVIDIA driver supports CUDA version >= 11.3:
+
+```bash
+cd docker/Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8
+docker build -t icefall/pytorch1.12.1 .
+```
+
+### If you are case (b), i.e. your NVIDIA driver can only support CUDA versions 11.0 <= x < 11.3:
+```bash
+cd docker/Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8
+docker build -t icefall/pytorch1.7.1 .
+```
+
+## Running your built local image
+
+Sample usage of the GPU-based images. These commands are written with case (a) in mind, so adjust the image name if you are in case (b).
+Note: use [nvidia-docker](https://github.com/NVIDIA/nvidia-docker) to run the GPU images.
+
+```bash
+docker run -it --runtime=nvidia --shm-size=2gb --name=icefall --gpus all icefall/pytorch1.12.1
+```
+
+### Tips:
+1. Since your data and models most probably won't be inside the container, you must use the `-v` flag to access the host machine. Do this by specifying `-v {/path/in/host/machine}:{/path/in/docker}`.
+
+2. Also, if your environment requires a proxy, this would be a good time to add it in too: `-e http_proxy=http://aaa.bb.cc.net:8080 -e https_proxy=http://aaa.bb.cc.net:8080`.
+
+Overall, your `docker run` command should look like this:
+
+```bash
+docker run -it --runtime=nvidia --shm-size=2gb --name=icefall --gpus all -v {/path/in/host/machine}:{/path/in/docker} -e http_proxy=http://aaa.bb.cc.net:8080 -e https_proxy=http://aaa.bb.cc.net:8080 icefall/pytorch1.12.1
+```
+
+You can explore more docker run options [here](https://docs.docker.com/engine/reference/commandline/run/) to suit your environment.
+
+### Linking to icefall on your host machine
+
+If you already have icefall downloaded onto your host machine, you can use that repository instead, so that changes to your code are visible inside and outside of the container.
+
+Note: Remember to set the `-v` flag above during the first run of the container, as that is the only way for your container to access your host machine.
+Warning: Check that the icefall on your host machine is visible from within your container before proceeding to the commands below.
+
+Use these commands once you are inside the container.
+
+```bash
+rm -r /workspace/icefall
+ln -s {/path/in/docker/to/icefall} /workspace/icefall
+```
+
+## Starting another session in the same running container
+```bash
+docker exec -it icefall /bin/bash
+```
+
+## Restarting a killed container that has been run before
+```bash
+docker start -ai icefall
+```
+
+## Sample usage of the CPU-based images
+```bash
+docker run -it icefall /bin/bash
+```
diff --git a/docker/Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8/Dockerfile b/docker/Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8/Dockerfile
new file mode 100644
index 000000000..ff9e40604
--- /dev/null
+++ b/docker/Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8/Dockerfile
@@ -0,0 +1,74 @@
+FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel
+
+# ENV http_proxy=http://aaa.bbb.cc.net:8080 \
+# https_proxy=http://aaa.bbb.cc.net:8080
+
+# install normal source
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ g++ \
+ make \
+ automake \
+ autoconf \
+ bzip2 \
+ unzip \
+ wget \
+ sox \
+ libtool \
+ git \
+ subversion \
+ zlib1g-dev \
+ gfortran \
+ ca-certificates \
+ patch \
+ ffmpeg \
+ valgrind \
+ libssl-dev \
+ vim \
+ curl
+
+# cmake
+RUN wget -P /opt https://cmake.org/files/v3.18/cmake-3.18.0.tar.gz && \
+ cd /opt && \
+ tar -zxvf cmake-3.18.0.tar.gz && \
+ cd cmake-3.18.0 && \
+ ./bootstrap && \
+ make && \
+ make install && \
+ rm -rf cmake-3.18.0.tar.gz && \
+ find /opt/cmake-3.18.0 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
+ cd -
+
+# flac
+RUN wget -P /opt https://downloads.xiph.org/releases/flac/flac-1.3.2.tar.xz && \
+ cd /opt && \
+ xz -d flac-1.3.2.tar.xz && \
+ tar -xvf flac-1.3.2.tar && \
+ cd flac-1.3.2 && \
+ ./configure && \
+ make && make install && \
+ rm -rf flac-1.3.2.tar && \
+ find /opt/flac-1.3.2 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
+ cd -
+
+RUN conda install -y -c pytorch torchaudio=0.12 && \
+ pip install graphviz
+
+
+# install k2 from source
+RUN git clone https://github.com/k2-fsa/k2.git /opt/k2 && \
+ cd /opt/k2 && \
+ python3 setup.py install && \
+ cd -
+
+# install lhotse
+RUN pip install git+https://github.com/lhotse-speech/lhotse
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+ cd /workspace/icefall && \
+ pip install -r requirements.txt
+
+RUN pip install kaldifeat
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
diff --git a/docker/Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8/Dockerfile b/docker/Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8/Dockerfile
new file mode 100644
index 000000000..5c7423fa5
--- /dev/null
+++ b/docker/Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8/Dockerfile
@@ -0,0 +1,90 @@
+FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-devel
+
+# ENV http_proxy=http://aaa.bbb.cc.net:8080 \
+# https_proxy=http://aaa.bbb.cc.net:8080
+
+RUN rm /etc/apt/sources.list.d/cuda.list && \
+ rm /etc/apt/sources.list.d/nvidia-ml.list && \
+ apt-key del 7fa2af80
+
+# install normal source
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ g++ \
+ make \
+ automake \
+ autoconf \
+ bzip2 \
+ unzip \
+ wget \
+ sox \
+ libtool \
+ git \
+ subversion \
+ zlib1g-dev \
+ gfortran \
+ ca-certificates \
+ patch \
+ ffmpeg \
+ valgrind \
+ libssl-dev \
+ vim \
+ curl
+
+# Add new keys and reupdate
+RUN curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub | apt-key add - && \
+ curl -fsSL https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub | apt-key add - && \
+ echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
+ echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list && \
+ rm -rf /var/lib/apt/lists/* && \
+ mv /opt/conda/lib/libcufft.so.10 /opt/libcufft.so.10.bak && \
+ mv /opt/conda/lib/libcurand.so.10 /opt/libcurand.so.10.bak && \
+ mv /opt/conda/lib/libcublas.so.11 /opt/libcublas.so.11.bak && \
+ mv /opt/conda/lib/libnvrtc.so.11.0 /opt/libnvrtc.so.11.1.bak && \
+ # mv /opt/conda/lib/libnvToolsExt.so.1 /opt/libnvToolsExt.so.1.bak && \
+ mv /opt/conda/lib/libcudart.so.11.0 /opt/libcudart.so.11.0.bak && \
+ apt-get update && apt-get -y upgrade
+
+# cmake
+RUN wget -P /opt https://cmake.org/files/v3.18/cmake-3.18.0.tar.gz && \
+ cd /opt && \
+ tar -zxvf cmake-3.18.0.tar.gz && \
+ cd cmake-3.18.0 && \
+ ./bootstrap && \
+ make && \
+ make install && \
+ rm -rf cmake-3.18.0.tar.gz && \
+ find /opt/cmake-3.18.0 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
+ cd -
+
+# flac
+RUN wget -P /opt https://downloads.xiph.org/releases/flac/flac-1.3.2.tar.xz && \
+ cd /opt && \
+ xz -d flac-1.3.2.tar.xz && \
+ tar -xvf flac-1.3.2.tar && \
+ cd flac-1.3.2 && \
+ ./configure && \
+ make && make install && \
+ rm -rf flac-1.3.2.tar && \
+ find /opt/flac-1.3.2 -type f \( -name "*.o" -o -name "*.la" -o -name "*.a" \) -exec rm {} \; && \
+ cd -
+
+RUN conda install -y -c pytorch torchaudio=0.7.1 && \
+ pip install graphviz
+
+# install k2 from source
+RUN git clone https://github.com/k2-fsa/k2.git /opt/k2 && \
+ cd /opt/k2 && \
+ python3 setup.py install && \
+ cd -
+
+# install lhotse
+RUN pip install git+https://github.com/lhotse-speech/lhotse
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+ cd /workspace/icefall && \
+ pip install -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
diff --git a/docker/torch1.12.1-cuda11.3.dockerfile b/docker/torch1.12.1-cuda11.3.dockerfile
new file mode 100644
index 000000000..deb5715cc
--- /dev/null
+++ b/docker/torch1.12.1-cuda11.3.dockerfile
@@ -0,0 +1,70 @@
+FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.7
+ARG K2_VERSION="1.24.4.dev20230725+cuda11.3.torch1.12.1"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.3.torch1.12.1"
+ARG TORCHAUDIO_VERSION="0.12.1+cu113"
+
+LABEL authors="Fangjun Kuang "
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ curl \
+ vim \
+ libssl-dev \
+ autoconf \
+ automake \
+ bzip2 \
+ ca-certificates \
+ ffmpeg \
+ g++ \
+ gfortran \
+ git \
+ libtool \
+ make \
+ patch \
+ sox \
+ subversion \
+ unzip \
+ valgrind \
+ wget \
+ zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip install --no-cache-dir \
+ torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+ k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+ git+https://github.com/lhotse-speech/lhotse \
+ kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+ kaldi_native_io \
+ kaldialign \
+ kaldifst \
+ kaldilm \
+ "sentencepiece>=0.1.96" \
+ tensorboard \
+ typeguard \
+ dill \
+ onnx \
+ onnxruntime \
+ onnxmltools \
+ multi_quantization \
+ numpy \
+ pytest \
+ graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+ cd /workspace/icefall && \
+ pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
diff --git a/docker/torch1.13.0-cuda11.6.dockerfile b/docker/torch1.13.0-cuda11.6.dockerfile
new file mode 100644
index 000000000..afc6c1b84
--- /dev/null
+++ b/docker/torch1.13.0-cuda11.6.dockerfile
@@ -0,0 +1,72 @@
+FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.9
+ARG K2_VERSION="1.24.4.dev20231021+cuda11.6.torch1.13.0"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.6.torch1.13.0"
+ARG TORCHAUDIO_VERSION="0.13.0+cu116"
+
+LABEL authors="Fangjun Kuang "
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ curl \
+ vim \
+ libssl-dev \
+ autoconf \
+ automake \
+ bzip2 \
+ ca-certificates \
+ ffmpeg \
+ g++ \
+ gfortran \
+ git \
+ libtool \
+ make \
+ patch \
+ sox \
+ subversion \
+ unzip \
+ valgrind \
+ wget \
+ zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip install --no-cache-dir \
+ torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+ k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+ git+https://github.com/lhotse-speech/lhotse \
+ kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+ kaldi_native_io \
+ kaldialign \
+ kaldifst \
+ kaldilm \
+ "sentencepiece>=0.1.96" \
+ tensorboard \
+ typeguard \
+ dill \
+ onnx \
+ onnxruntime \
+ onnxmltools \
+ multi_quantization \
+ numpy \
+ pytest \
+ graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+ cd /workspace/icefall && \
+ pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+ENV LD_LIBRARY_PATH /opt/conda/lib/stubs:$LD_LIBRARY_PATH
+
+WORKDIR /workspace/icefall
diff --git a/docker/torch1.9.0-cuda10.2.dockerfile b/docker/torch1.9.0-cuda10.2.dockerfile
new file mode 100644
index 000000000..9ff225b54
--- /dev/null
+++ b/docker/torch1.9.0-cuda10.2.dockerfile
@@ -0,0 +1,86 @@
+FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.7
+ARG K2_VERSION="1.24.3.dev20230726+cuda10.2.torch1.9.0"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda10.2.torch1.9.0"
+ARG TORCHAUDIO_VERSION="0.9.0"
+
+LABEL authors="Fangjun Kuang "
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+# see https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/
+
+RUN rm /etc/apt/sources.list.d/cuda.list && \
+ rm /etc/apt/sources.list.d/nvidia-ml.list && \
+ apt-key del 7fa2af80
+
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ curl \
+ vim \
+ libssl-dev \
+ autoconf \
+ automake \
+ bzip2 \
+ ca-certificates \
+ ffmpeg \
+ g++ \
+ gfortran \
+ git \
+ libtool \
+ make \
+ patch \
+ sox \
+ subversion \
+ unzip \
+ valgrind \
+ wget \
+ zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
+ dpkg -i cuda-keyring_1.0-1_all.deb && \
+ rm -v cuda-keyring_1.0-1_all.deb && \
+ apt-get update && \
+ rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip uninstall -y tqdm && \
+ pip install -U --no-cache-dir \
+ torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+ k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+ kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+ git+https://github.com/lhotse-speech/lhotse \
+ kaldi_native_io \
+ kaldialign \
+ kaldifst \
+ kaldilm \
+ "sentencepiece>=0.1.96" \
+ tensorboard \
+ typeguard \
+ dill \
+ onnx \
+ onnxruntime \
+ onnxmltools \
+ multi_quantization \
+ numpy \
+ pytest \
+ graphviz \
+ "tqdm>=4.63.0"
+
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+ cd /workspace/icefall && \
+ pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
diff --git a/docker/torch2.0.0-cuda11.7.dockerfile b/docker/torch2.0.0-cuda11.7.dockerfile
new file mode 100644
index 000000000..db8076560
--- /dev/null
+++ b/docker/torch2.0.0-cuda11.7.dockerfile
@@ -0,0 +1,70 @@
+FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20231021+cuda11.7.torch2.0.0"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.7.torch2.0.0"
+ARG TORCHAUDIO_VERSION="2.0.0+cu117"
+
+LABEL authors="Fangjun Kuang "
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ curl \
+ vim \
+ libssl-dev \
+ autoconf \
+ automake \
+ bzip2 \
+ ca-certificates \
+ ffmpeg \
+ g++ \
+ gfortran \
+ git \
+ libtool \
+ make \
+ patch \
+ sox \
+ subversion \
+ unzip \
+ valgrind \
+ wget \
+ zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip install --no-cache-dir \
+ torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+ k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+ git+https://github.com/lhotse-speech/lhotse \
+ kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+ kaldi_native_io \
+ kaldialign \
+ kaldifst \
+ kaldilm \
+ "sentencepiece>=0.1.96" \
+ tensorboard \
+ typeguard \
+ dill \
+ onnx \
+ onnxruntime \
+ onnxmltools \
+ multi_quantization \
+ numpy \
+ pytest \
+ graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+ cd /workspace/icefall && \
+ pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
diff --git a/docker/torch2.1.0-cuda11.8.dockerfile b/docker/torch2.1.0-cuda11.8.dockerfile
new file mode 100644
index 000000000..b006b0d96
--- /dev/null
+++ b/docker/torch2.1.0-cuda11.8.dockerfile
@@ -0,0 +1,70 @@
+FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20231021+cuda11.8.torch2.1.0"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.8.torch2.1.0"
+ARG TORCHAUDIO_VERSION="2.1.0+cu118"
+
+LABEL authors="Fangjun Kuang "
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ curl \
+ vim \
+ libssl-dev \
+ autoconf \
+ automake \
+ bzip2 \
+ ca-certificates \
+ ffmpeg \
+ g++ \
+ gfortran \
+ git \
+ libtool \
+ make \
+ patch \
+ sox \
+ subversion \
+ unzip \
+ valgrind \
+ wget \
+ zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip install --no-cache-dir \
+ torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+ k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+ git+https://github.com/lhotse-speech/lhotse \
+ kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+ kaldi_native_io \
+ kaldialign \
+ kaldifst \
+ kaldilm \
+ "sentencepiece>=0.1.96" \
+ tensorboard \
+ typeguard \
+ dill \
+ onnx \
+ onnxruntime \
+ onnxmltools \
+ multi_quantization \
+ numpy \
+ pytest \
+ graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+ cd /workspace/icefall && \
+ pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
diff --git a/docker/torch2.1.0-cuda12.1.dockerfile b/docker/torch2.1.0-cuda12.1.dockerfile
new file mode 100644
index 000000000..1b078dc22
--- /dev/null
+++ b/docker/torch2.1.0-cuda12.1.dockerfile
@@ -0,0 +1,70 @@
+FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20231021+cuda12.1.torch2.1.0"
+ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda12.1.torch2.1.0"
+ARG TORCHAUDIO_VERSION="2.1.0+cu121"
+
+LABEL authors="Fangjun Kuang "
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ curl \
+ vim \
+ libssl-dev \
+ autoconf \
+ automake \
+ bzip2 \
+ ca-certificates \
+ ffmpeg \
+ g++ \
+ gfortran \
+ git \
+ libtool \
+ make \
+ patch \
+ sox \
+ subversion \
+ unzip \
+ valgrind \
+ wget \
+ zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip install --no-cache-dir \
+ torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+ k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+ git+https://github.com/lhotse-speech/lhotse \
+ kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+ kaldi_native_io \
+ kaldialign \
+ kaldifst \
+ kaldilm \
+ "sentencepiece>=0.1.96" \
+ tensorboard \
+ typeguard \
+ dill \
+ onnx \
+ onnxruntime \
+ onnxmltools \
+ multi_quantization \
+ numpy \
+ pytest \
+ graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+ cd /workspace/icefall && \
+ pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 000000000..567609b12
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 000000000..d0c3cbf10
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = source
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 000000000..3abb38f8b
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,24 @@
+
+## Usage
+
+```bash
+cd /path/to/icefall/docs
+pip install -r requirements.txt
+make clean
+make html
+cd build/html
+python3 -m http.server 8000
+```
+
+It prints:
+
+```
+Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ...
+```
+
+Open your browser and go to <http://localhost:8000> to view the generated
+documentation.
+
+Done!
+
+**Hint**: You can change the port number when starting the server.
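+
+For example, to serve the documentation on port 8888 instead:
+
+```bash
+python3 -m http.server 8888
+```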
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 000000000..6247f7e23
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/source/_static/logo.png b/docs/source/_static/logo.png
new file mode 100644
index 000000000..84d42568c
Binary files /dev/null and b/docs/source/_static/logo.png differ
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 000000000..5a534e126
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,101 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+import sphinx_rtd_theme
+
+# -- Project information -----------------------------------------------------
+
+project = "icefall"
+copyright = "2021, icefall development team"
+author = "icefall development team"
+
+# The full version, including alpha/beta/rc tags
+release = "0.1"
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ "sphinx.ext.todo",
+ "sphinx_rtd_theme",
+ "sphinxcontrib.youtube",
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+source_suffix = {
+ ".rst": "restructuredtext",
+}
+master_doc = "index"
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+html_show_sourcelink = True
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static", "installation/images"]
+
+pygments_style = "sphinx"
+
+numfig = True
+
+html_context = {
+ "display_github": True,
+ "github_user": "k2-fsa",
+ "github_repo": "icefall",
+ "github_version": "master",
+ "conf_py_path": "/docs/source/",
+}
+
+todo_include_todos = True
+
+rst_epilog = """
+.. _sherpa-ncnn: https://github.com/k2-fsa/sherpa-ncnn
+.. _sherpa-onnx: https://github.com/k2-fsa/sherpa-onnx
+.. _icefall: https://github.com/k2-fsa/icefall
+.. _git-lfs: https://git-lfs.com/
+.. _ncnn: https://github.com/tencent/ncnn
+.. _LibriSpeech: https://www.openslr.org/12
+.. _Gigaspeech: https://github.com/SpeechColab/GigaSpeech
+.. _musan: http://www.openslr.org/17/
+.. _ONNX: https://github.com/onnx/onnx
+.. _onnxruntime: https://github.com/microsoft/onnxruntime
+.. _torch: https://github.com/pytorch/pytorch
+.. _torchaudio: https://github.com/pytorch/audio
+.. _k2: https://github.com/k2-fsa/k2
+.. _lhotse: https://github.com/lhotse-speech/lhotse
+.. _yesno: https://www.openslr.org/1/
+.. _Next-gen Kaldi: https://github.com/k2-fsa
+.. _Kaldi: https://github.com/kaldi-asr/kaldi
+.. _lilcom: https://github.com/danpovey/lilcom
+"""
diff --git a/docs/source/contributing/code-style.rst b/docs/source/contributing/code-style.rst
new file mode 100644
index 000000000..cb08229c3
--- /dev/null
+++ b/docs/source/contributing/code-style.rst
@@ -0,0 +1,74 @@
+.. _follow the code style:
+
+Follow the code style
+=====================
+
+We use the following tools to keep the code style as consistent as possible:
+
+ - `black <https://github.com/psf/black>`_, to format the code
+ - `flake8 <https://flake8.pycqa.org/>`_, to check the style and quality of the code
+ - `isort <https://pycqa.github.io/isort/>`_, to sort ``imports``
+
+The following versions of the above tools are used:
+
+ - ``black == 22.3.0``
+ - ``flake8 == 5.0.4``
+ - ``isort == 5.10.1``
+
+After running the following commands:
+
+ .. code-block:: bash
+
+ $ git clone https://github.com/k2-fsa/icefall
+ $ cd icefall
+ $ pip install pre-commit
+ $ pre-commit install
+
+it will run the following checks whenever you run ``git commit``, **automatically**:
+
+ .. figure:: images/pre-commit-check.png
+ :width: 600
+ :align: center
+
+ pre-commit hooks invoked by ``git commit`` (Failed).
+
+If any of the above checks fails, your ``git commit`` will not succeed.
+Please fix any issues reported by the check tools.
+
+.. HINT::
+
+ Some of the check tools, i.e., ``black`` and ``isort``, will modify
+ the files to be committed **in-place**. So please run ``git status``
+ after a failure to see which files have been modified by the tools
+ before you make any further changes.
+
+After fixing all the failures, run ``git commit`` again and
+it should succeed this time:
+
+ .. figure:: images/pre-commit-check-success.png
+ :width: 600
+ :align: center
+
+ pre-commit hooks invoked by ``git commit`` (Succeeded).
+
+If you want to check the style of your code before ``git commit``, you
+can do the following:
+
+ .. code-block:: bash
+
+ $ pre-commit install
+ $ pre-commit run
+
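+By default, ``pre-commit run`` only checks the files that are staged for commit.
+To check every file in the repository, you can run:
+
+ .. code-block:: bash
+
+ $ pre-commit run --all-files
+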
+Or without installing the pre-commit hooks:
+
+ .. code-block:: bash
+
+ $ cd icefall
+ $ pip install black==22.3.0 flake8==5.0.4 isort==5.10.1
+ $ black --check your_changed_file.py
+ $ black your_changed_file.py # modify it in-place
+ $
+ $ flake8 your_changed_file.py
+ $
+ $ isort --check your_changed_file.py
+ $ isort your_changed_file.py  # modify it in-place
diff --git a/docs/source/contributing/doc.rst b/docs/source/contributing/doc.rst
new file mode 100644
index 000000000..893d8a15e
--- /dev/null
+++ b/docs/source/contributing/doc.rst
@@ -0,0 +1,45 @@
+Contributing to Documentation
+=============================
+
+We use `sphinx <https://www.sphinx-doc.org/>`_
+for documentation.
+
+Before writing documentation, you have to prepare the environment:
+
+ .. code-block:: bash
+
+ $ cd docs
+ $ pip install -r requirements.txt
+
+After setting up the environment, you are ready to write documentation.
+Please refer to the `reStructuredText Primer <https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html>`_
+if you are not familiar with ``reStructuredText``.
+
+After writing some documentation, you can build it **locally**
+to preview what it will look like once published:
+
+ .. code-block:: bash
+
+ $ cd docs
+ $ make html
+
+The generated documentation is in ``docs/build/html`` and can be viewed
+with the following commands:
+
+ .. code-block:: bash
+
+ $ cd docs/build/html
+ $ python3 -m http.server
+
+It will print::
+
+ Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ...
+
+Open your browser, go to `<http://localhost:8000/>`_, and you will see
+the following:
+
+ .. figure:: images/doc-contrib.png
+ :width: 600
+ :align: center
+
+ View generated documentation locally with ``python3 -m http.server``.
diff --git a/docs/source/contributing/how-to-create-a-recipe.rst b/docs/source/contributing/how-to-create-a-recipe.rst
new file mode 100644
index 000000000..168a856c3
--- /dev/null
+++ b/docs/source/contributing/how-to-create-a-recipe.rst
@@ -0,0 +1,156 @@
+How to create a recipe
+======================
+
+.. HINT::
+
+ Please read :ref:`follow the code style` to adjust your code style.
+
+.. CAUTION::
+
+ ``icefall`` is designed to be as Pythonic as possible. Please use
+ Python in your recipe if possible.
+
+Data Preparation
+----------------
+
+We recommend that you prepare your training/testing/validation datasets
+with `lhotse <https://github.com/lhotse-speech/lhotse>`_.
+
+Please refer to the `lhotse documentation <https://lhotse.readthedocs.io/>`_
+for how to create a recipe in ``lhotse``.
+
+.. HINT::
+
+ The ``yesno`` recipe in ``lhotse`` is a very good example.
+
+ Please refer to the pull request that added the ``yesno`` recipe to ``lhotse``,
+ which shows how to add a new recipe.
+
+Suppose you would like to add a recipe for a dataset named ``foo``.
+You can do the following:
+
+.. code-block:: bash
+
+ $ cd egs
+ $ mkdir -p foo/ASR
+ $ cd foo/ASR
+ $ touch prepare.sh
+ $ chmod +x prepare.sh
+
+If your dataset is very simple, please follow
+`egs/yesno/ASR/prepare.sh <https://github.com/k2-fsa/icefall/blob/master/egs/yesno/ASR/prepare.sh>`_
+to write your own ``prepare.sh``.
+Otherwise, please refer to
+`egs/librispeech/ASR/prepare.sh <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/prepare.sh>`_
+to prepare your data.
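+
+Most ``prepare.sh`` scripts in ``icefall`` are organized into numbered stages, so
+that data preparation can be resumed part-way. A hypothetical invocation for the
+``foo`` recipe (the stage numbers are illustrative):
+
+.. code-block:: bash
+
+ $ cd egs/foo/ASR
+ $ ./prepare.sh # run all stages
+ $ ./prepare.sh --stage 2 --stop-stage 4 # run only stages 2 to 4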
+
+
+Training
+--------
+
+Assuming you have a fancy model called ``bar`` for the ``foo`` recipe, you can
+organize your files in the following way:
+
+.. code-block:: bash
+
+ $ cd egs/foo/ASR
+ $ mkdir bar
+ $ cd bar
+ $ touch README.md model.py train.py decode.py asr_datamodule.py pretrained.py
+
+For instance, the ``yesno`` recipe has a ``tdnn`` model and its directory structure
+looks like the following:
+
+.. code-block:: bash
+
+ egs/yesno/ASR/tdnn/
+ |-- README.md
+ |-- asr_datamodule.py
+ |-- decode.py
+ |-- model.py
+ |-- pretrained.py
+ `-- train.py
+
+**File description**:
+
+ - ``README.md``
+
+ It contains information about this recipe, e.g., how to run it, what the WER is, etc.
+
+ - ``asr_datamodule.py``
+
+ It provides code to create PyTorch dataloaders for the train/test/validation datasets.
+
+ - ``decode.py``
+
+ It takes as inputs the checkpoints saved during the training stage to decode the test
+ dataset(s).
+
+ - ``model.py``
+
+ It contains the definition of your fancy neural network model.
+
+ - ``pretrained.py``
+
+ We can use this script to do inference with a pre-trained model.
+
+ - ``train.py``
+
+ It contains training code.
+
+
+.. HINT::
+
+ Please take a look at
+
+ - `egs/yesno/ASR/tdnn <https://github.com/k2-fsa/icefall/tree/master/egs/yesno/ASR/tdnn>`_
+ - `egs/librispeech/ASR/tdnn_lstm_ctc <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/tdnn_lstm_ctc>`_
+ - `egs/librispeech/ASR/conformer_ctc <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conformer_ctc>`_
+
+ to get a feel for what the resulting files look like.
+
+.. NOTE::
+
+ Every model in a recipe is kept as self-contained as possible.
+ We tolerate duplicate code among different recipes.
+
+
+The training stage should be invocable by:
+
+ .. code-block:: bash
+
+ $ cd egs/foo/ASR
+ $ ./bar/train.py
+ $ ./bar/train.py --help
+
+
+Decoding
+--------
+
+Please refer to
+
+ - `egs/librispeech/ASR/conformer_ctc/decode.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/conformer_ctc/decode.py>`_
+
+ If your model is transformer/conformer based.
+
+ - `egs/librispeech/ASR/tdnn_lstm_ctc/decode.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/tdnn_lstm_ctc/decode.py>`_
+
+ If your model is TDNN/LSTM based, i.e., there is no attention decoder.
+
+ - `egs/yesno/ASR/tdnn/decode.py <https://github.com/k2-fsa/icefall/blob/master/egs/yesno/ASR/tdnn/decode.py>`_
+
+ If there is no LM rescoring.
+
+The decoding stage should be invocable by:
+
+ .. code-block:: bash
+
+ $ cd egs/foo/ASR
+ $ ./bar/decode.py
+ $ ./bar/decode.py --help
+
+Pre-trained model
+-----------------
+
+Please demonstrate how to use your model for inference in ``egs/foo/ASR/bar/pretrained.py``.
+If possible, please consider creating a Colab notebook to show that.
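+
+A hypothetical invocation might look like the following (all file names and flags
+are illustrative; the exact arguments depend on your model):
+
+.. code-block:: bash
+
+ $ cd egs/foo/ASR
+ $ ./bar/pretrained.py \
+ --checkpoint ./bar/exp/pretrained.pt \
+ --words-file ./data/lang_phone/words.txt \
+ /path/to/test.wav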
diff --git a/docs/source/contributing/images/doc-contrib.png b/docs/source/contributing/images/doc-contrib.png
new file mode 100644
index 000000000..00906ab83
Binary files /dev/null and b/docs/source/contributing/images/doc-contrib.png differ
diff --git a/docs/source/contributing/images/pre-commit-check-success.png b/docs/source/contributing/images/pre-commit-check-success.png
new file mode 100644
index 000000000..3c6ee9b1c
Binary files /dev/null and b/docs/source/contributing/images/pre-commit-check-success.png differ
diff --git a/docs/source/contributing/images/pre-commit-check.png b/docs/source/contributing/images/pre-commit-check.png
new file mode 100644
index 000000000..80784eced
Binary files /dev/null and b/docs/source/contributing/images/pre-commit-check.png differ
diff --git a/docs/source/contributing/index.rst b/docs/source/contributing/index.rst
new file mode 100644
index 000000000..21c747d33
--- /dev/null
+++ b/docs/source/contributing/index.rst
@@ -0,0 +1,22 @@
+Contributing
+============
+
+Contributions to ``icefall`` are very welcome.
+There are many possible ways to make contributions and
+two of them are:
+
+ - To write documentation
+ - To write code
+
+ - (1) To follow the code style in the repository
+ - (2) To write a new recipe
+
+On this page, we describe how to contribute documentation
+and code to ``icefall``.
+
+.. toctree::
+ :maxdepth: 2
+
+ doc
+ code-style
+ how-to-create-a-recipe
diff --git a/docs/source/decoding-with-langugage-models/LODR.rst b/docs/source/decoding-with-langugage-models/LODR.rst
new file mode 100644
index 000000000..b6b6e8cbb
--- /dev/null
+++ b/docs/source/decoding-with-langugage-models/LODR.rst
@@ -0,0 +1,187 @@
+.. _LODR:
+
+LODR for RNN Transducer
+=======================
+
+
+As a type of E2E model, neural transducers are usually considered to have an internal
+language model, which learns language-level information from the training corpus.
+In real-life scenarios, there is often a mismatch between the training corpus and the target corpus.
+This mismatch can be a problem when decoding neural transducer models with an external language model, as the internal
+language model can act "against" the external LM. In this tutorial, we show how to use
+Low-order Density Ratio (LODR) to alleviate this effect and further improve the performance
+of language model integration.
+
+.. note::
+
+ This tutorial is based on the recipe
+ `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
+ which is a streaming transducer model trained on `LibriSpeech`_.
+ However, you can easily apply LODR to other recipes.
+ If you encounter any problems, please open an issue in `icefall <https://github.com/k2-fsa/icefall>`__.
+
+
+.. note::
+
+ For simplicity, the training and testing corpora in this tutorial are the same (`LibriSpeech`_). However,
+ you can change the testing set to any other domain (e.g. `GigaSpeech`_) and prepare the language models
+ using that corpus.
+
+First, let's have a look at some background information. As the predecessor of LODR, Density Ratio (DR) was first proposed
+to address the mismatch of language information between the training
+corpus (source domain) and the testing corpus (target domain). Assuming that the source domain and the test domain
+are acoustically similar, DR derives the following formula for decoding with Bayes' theorem:
+
+.. math::
+
+ \text{score}\left(y_u|\mathit{x},y\right) =
+ \log p\left(y_u|\mathit{x},y_{1:u-1}\right) +
+ \lambda_1 \log p_{\text{Target LM}}\left(y_u|\mathit{x},y_{1:u-1}\right) -
+ \lambda_2 \log p_{\text{Source LM}}\left(y_u|\mathit{x},y_{1:u-1}\right)
+
+
+where :math:`\lambda_1` and :math:`\lambda_2` are the weights of the LM scores for the target domain and the source domain, respectively.
+Here, the source domain LM is trained on the training corpus. The only difference in the above formula compared to
+shallow fusion is the subtraction of the source domain LM.
+
+Some works treat the predictor and the joiner of the neural transducer as its internal LM. However, this internal LM is
+considered to be weak and can only capture low-level language information. Therefore, LODR proposes to use
+a low-order n-gram LM as an approximation of the internal LM (ILM) of the neural transducer. This leads to the following formula
+during decoding for transducer models:
+
+.. math::
+
+ \text{score}\left(y_u|\mathit{x},y\right) =
+ \log p_{rnnt}\left(y_u|\mathit{x},y_{1:u-1}\right) +
+ \lambda_1 \log p_{\text{Target LM}}\left(y_u|\mathit{x},y_{1:u-1}\right) -
+ \lambda_2 \log p_{\text{bi-gram}}\left(y_u|\mathit{x},y_{1:u-1}\right)
+
+In LODR, an additional bi-gram LM estimated on the source domain (e.g. the training corpus) is required. Compared to DR,
+the only difference lies in the choice of the source domain LM. According to the original paper,
+LODR achieves performance similar to DR in both intra-domain and cross-domain settings.
+As a bi-gram is much cheaper to evaluate, LODR is usually much faster.
+
+Now, we will show you how to use LODR in ``icefall``.
+For illustration purposes, we will use a pre-trained ASR model from `this link <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`_.
+If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
+The testing scenario here is intra-domain (we decode the model trained on `LibriSpeech`_ on `LibriSpeech`_ testing sets).
+
+As the initial step, let's download the pre-trained model.
+
+.. code-block:: bash
+
+ $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+ $ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+ $ git lfs pull --include "pretrained.pt"
+ $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
+ $ cd ../data/lang_bpe_500
+ $ git lfs pull --include bpe.model
+ $ cd ../../..
+
+To test the model, let's have a look at the decoding results **without** using an LM. This can be done via the following command:
+
+.. code-block:: bash
+
+ $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
+ $ ./pruned_transducer_stateless7_streaming/decode.py \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model False \
+ --exp-dir $exp_dir \
+ --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+ --max-duration 600 \
+ --decode-chunk-len 32 \
+ --decoding-method modified_beam_search
+
+The following WERs are achieved on test-clean and test-other:
+
+.. code-block:: text
+
+ $ For test-clean, WER of different settings are:
+ $ beam_size_4 3.11 best for test-clean
+ $ For test-other, WER of different settings are:
+ $ beam_size_4 7.93 best for test-other
+
+Then, we download the external language model and the bi-gram LM that are necessary for LODR.
+Note that the bi-gram is estimated on the text of the 960-hour LibriSpeech training data.
+
+.. code-block:: bash
+
+ $ # download the external LM
+ $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm
+ $ # create a symbolic link so that the checkpoint can be loaded
+ $ pushd icefall-librispeech-rnn-lm/exp
+ $ git lfs pull --include "pretrained.pt"
+ $ ln -s pretrained.pt epoch-99.pt
+ $ popd
+ $
+ $ # download the bi-gram
+ $ git lfs install
+ $ git clone https://huggingface.co/marcoyang/librispeech_bigram
+ $ pushd data/lang_bpe_500
+ $ ln -s ../../librispeech_bigram/2gram.fst.txt .
+ $ popd
+
+Then, we perform LODR decoding by setting ``--decoding-method`` to ``modified_beam_search_LODR``:
+
+.. code-block:: bash
+
+ $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+ $ lm_dir=./icefall-librispeech-rnn-lm/exp
+ $ lm_scale=0.42
+ $ LODR_scale=-0.24
+ $ ./pruned_transducer_stateless7_streaming/decode.py \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model False \
+ --beam-size 4 \
+ --exp-dir $exp_dir \
+ --max-duration 600 \
+ --decode-chunk-len 32 \
+ --decoding-method modified_beam_search_LODR \
+ --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+ --use-shallow-fusion 1 \
+ --lm-type rnn \
+ --lm-exp-dir $lm_dir \
+ --lm-epoch 99 \
+ --lm-scale $lm_scale \
+ --lm-avg 1 \
+ --rnn-lm-embedding-dim 2048 \
+ --rnn-lm-hidden-dim 2048 \
+ --rnn-lm-num-layers 3 \
+ --lm-vocab-size 500 \
+ --tokens-ngram 2 \
+ --ngram-lm-scale $LODR_scale
+
+There are two extra arguments that need to be given when doing LODR. ``--tokens-ngram`` specifies the order of the n-gram LM. As we
+are using a bi-gram, we set it to 2. ``--ngram-lm-scale`` is the scale of the bi-gram; it should be a negative number,
+as we are subtracting the bi-gram's score during decoding.
+
+The decoding results obtained with the above command are shown below:
+
+.. code-block:: text
+
+ $ For test-clean, WER of different settings are:
+ $ beam_size_4 2.61 best for test-clean
+ $ For test-other, WER of different settings are:
+ $ beam_size_4 6.74 best for test-other
+
+Recall that the lowest WER we obtained in :ref:`shallow_fusion` with a beam size of 4 is ``2.77/7.08``; LODR
+indeed **further improves** the WER. We can do even better if we increase ``--beam-size``:
+
+.. list-table:: WER of LODR with different beam sizes
+ :widths: 25 25 50
+ :header-rows: 1
+
+ * - Beam size
+ - test-clean
+ - test-other
+ * - 4
+ - 2.61
+ - 6.74
+ * - 8
+ - 2.45
+ - 6.38
+ * - 12
+ - 2.4
+ - 6.23
diff --git a/docs/source/decoding-with-langugage-models/index.rst b/docs/source/decoding-with-langugage-models/index.rst
new file mode 100644
index 000000000..c49da9a4e
--- /dev/null
+++ b/docs/source/decoding-with-langugage-models/index.rst
@@ -0,0 +1,34 @@
+Decoding with language models
+=============================
+
+This section describes how to use external language models
+during decoding to improve the WER of transducer models. To train an external language model,
+please refer to this tutorial: :ref:`train_nnlm`.
+
+The following decoding methods with external language models are available:
+
+
+.. list-table::
+ :widths: 25 50
+ :header-rows: 1
+
+ * - Decoding method
+ - Description
+ * - ``modified_beam_search``
+ - Beam search (i.e. really n-best decoding; the "beam" is the value of n), similar to the original RNN-T paper. Note that this method does not use a language model.
+ * - ``modified_beam_search_lm_shallow_fusion``
+ - As ``modified_beam_search``, but interpolate RNN-T scores with language model scores, also known as shallow fusion
+ * - ``modified_beam_search_LODR``
+ - As ``modified_beam_search_lm_shallow_fusion``, but subtract score of a (BPE-symbol-level) bigram backoff language model used as an approximation to the internal language model of RNN-T.
+ * - ``modified_beam_search_lm_rescore``
+ - As ``modified_beam_search``, but rescore the n-best hypotheses with an external language model (e.g. RNN LM) and re-rank them.
+ * - ``modified_beam_search_lm_rescore_LODR``
+ - As ``modified_beam_search_lm_rescore``, but also subtract the score of a (BPE-symbol-level) bigram backoff language model during re-ranking.
+
+
+.. toctree::
+ :maxdepth: 2
+
+ shallow-fusion
+ LODR
+ rescoring
diff --git a/docs/source/decoding-with-langugage-models/rescoring.rst b/docs/source/decoding-with-langugage-models/rescoring.rst
new file mode 100644
index 000000000..4cabaa432
--- /dev/null
+++ b/docs/source/decoding-with-langugage-models/rescoring.rst
@@ -0,0 +1,255 @@
+.. _rescoring:
+
+LM rescoring for Transducer
+=================================
+
+LM rescoring is a commonly used approach to incorporate external LM information. Unlike shallow-fusion-based
+methods (see :ref:`shallow_fusion`, :ref:`LODR`), rescoring is usually performed to re-rank the n-best hypotheses after beam search.
+Rescoring is usually more efficient than shallow fusion since less computation is performed on the external LM.
+In this tutorial, we will show you how to use an external LM to rescore the n-best hypotheses decoded from neural transducer models in
+`icefall <https://github.com/k2-fsa/icefall>`__.
+
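+Conceptually, rescoring re-ranks each complete hypothesis :math:`y` with an
+interpolated score. A minimal sketch of this objective (the exact interpolation
+implemented in the scripts below may differ):
+
+.. math::
+
+ \text{score}\left(y\right) =
+ \log p_{\text{rnnt}}\left(y|\mathit{x}\right) +
+ \lambda \log p_{\text{LM}}\left(y\right)
+
+where :math:`\lambda` is the weight of the external LM (``--lm-scale`` in the
+commands below).
+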
+.. note::
+
+ This tutorial is based on the recipe
+ `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
+ which is a streaming transducer model trained on `LibriSpeech`_.
+ However, you can easily apply LM rescoring to other recipes.
+ If you encounter any problems, please open an issue `here <https://github.com/k2-fsa/icefall>`_.
+
+.. note::
+
+ For simplicity, the training and testing corpora in this tutorial are the same (`LibriSpeech`_). However, you can change the testing set
+ to any other domain (e.g. `GigaSpeech`_) and use an external LM trained on that domain.
+
+.. HINT::
+
+ We recommend using a GPU for decoding.
+
+For illustration purposes, we will use a pre-trained ASR model from `this link <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`__.
+If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
+
+As the initial step, let's download the pre-trained model.
+
+.. code-block:: bash
+
+ $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+ $ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+ $ git lfs pull --include "pretrained.pt"
+ $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
+ $ cd ../data/lang_bpe_500
+ $ git lfs pull --include bpe.model
+ $ cd ../../..
+
+As usual, we first test the model's performance without an external LM. This can be done via the following command:
+
+.. code-block:: bash
+
+ $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
+ $ ./pruned_transducer_stateless7_streaming/decode.py \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model False \
+ --exp-dir $exp_dir \
+ --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+ --max-duration 600 \
+ --decode-chunk-len 32 \
+ --decoding-method modified_beam_search
+
+The following WERs are achieved on test-clean and test-other:
+
+.. code-block:: text
+
+ $ For test-clean, WER of different settings are:
+ $ beam_size_4 3.11 best for test-clean
+ $ For test-other, WER of different settings are:
+ $ beam_size_4 7.93 best for test-other
+
+Now, we will try to improve the above WER numbers via external LM rescoring. We will download
+a pre-trained LM from `this link <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm>`__.
+
+.. note::
+
+ This is an RNN LM trained on the LibriSpeech text corpus, so it might not be ideal for other corpora.
+ You may also train an RNN LM from scratch. Please refer to `icefall/rnn_lm/train.py <https://github.com/k2-fsa/icefall/blob/master/icefall/rnn_lm/train.py>`__
+ for training an RNN LM and `icefall/transformer_lm/train.py <https://github.com/k2-fsa/icefall/blob/master/icefall/transformer_lm/train.py>`__ to train a transformer LM.
+
+.. code-block:: bash
+
+ $ # download the external LM
+ $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm
+ $ # create a symbolic link so that the checkpoint can be loaded
+ $ pushd icefall-librispeech-rnn-lm/exp
+ $ git lfs pull --include "pretrained.pt"
+ $ ln -s pretrained.pt epoch-99.pt
+ $ popd
+
+
+With the RNN LM available, we can rescore the n-best hypotheses generated by ``modified_beam_search``. Here,
+``n`` is the number of beams, i.e. ``--beam-size``. The command for LM rescoring is
+as follows. Note that ``--decoding-method`` is set to ``modified_beam_search_lm_rescore`` and ``--use-shallow-fusion``
+is set to ``False``.
+
+.. code-block:: bash
+
+ $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+ $ lm_dir=./icefall-librispeech-rnn-lm/exp
+ $ lm_scale=0.43
+ $ ./pruned_transducer_stateless7_streaming/decode.py \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model False \
+ --beam-size 4 \
+ --exp-dir $exp_dir \
+ --max-duration 600 \
+ --decode-chunk-len 32 \
+ --decoding-method modified_beam_search_lm_rescore \
+ --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+ --use-shallow-fusion 0 \
+ --lm-type rnn \
+ --lm-exp-dir $lm_dir \
+ --lm-epoch 99 \
+ --lm-scale $lm_scale \
+ --lm-avg 1 \
+ --rnn-lm-embedding-dim 2048 \
+ --rnn-lm-hidden-dim 2048 \
+ --rnn-lm-num-layers 3 \
+ --lm-vocab-size 500
+
+.. code-block:: text
+
+ $ For test-clean, WER of different settings are:
+ $ beam_size_4 2.93 best for test-clean
+ $ For test-other, WER of different settings are:
+ $ beam_size_4 7.6 best for test-other
+
+Great! We made some improvements! Increasing the size of the n-best hypothesis list will further boost the performance;
+see the following table:
+
+.. list-table:: WERs of LM rescoring with different beam sizes
+ :widths: 25 25 25
+ :header-rows: 1
+
+ * - Beam size
+ - test-clean
+ - test-other
+ * - 4
+ - 2.93
+ - 7.6
+ * - 8
+ - 2.67
+ - 7.11
+ * - 12
+ - 2.59
+ - 6.86
+
+In fact, we can also apply LODR (see :ref:`LODR`) when doing LM rescoring. To do so, we need to
+download the bi-gram required by LODR:
+
+.. code-block:: bash
+
+ $ # download the bi-gram
+ $ git lfs install
+ $ git clone https://huggingface.co/marcoyang/librispeech_bigram
+ $ pushd data/lang_bpe_500
+ $ ln -s ../../librispeech_bigram/2gram.arpa .
+ $ popd
+
+Then we can perform LM rescoring + LODR by changing the decoding method to ``modified_beam_search_lm_rescore_LODR``.
+
+.. note::
+
+ This decoding method requires `kenlm <https://github.com/kpu/kenlm>`_ as a dependency. You can install it
+ via this command: ``pip install https://github.com/kpu/kenlm/archive/master.zip``.
+
+.. code-block:: bash
+
+ $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+ $ lm_dir=./icefall-librispeech-rnn-lm/exp
+ $ lm_scale=0.43
+ $ ./pruned_transducer_stateless7_streaming/decode.py \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model False \
+ --beam-size 4 \
+ --exp-dir $exp_dir \
+ --max-duration 600 \
+ --decode-chunk-len 32 \
+ --decoding-method modified_beam_search_lm_rescore_LODR \
+ --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+ --use-shallow-fusion 0 \
+ --lm-type rnn \
+ --lm-exp-dir $lm_dir \
+ --lm-epoch 99 \
+ --lm-scale $lm_scale \
+ --lm-avg 1 \
+ --rnn-lm-embedding-dim 2048 \
+ --rnn-lm-hidden-dim 2048 \
+ --rnn-lm-num-layers 3 \
+ --lm-vocab-size 500
+
+You should see the following WERs after executing the commands above:
+
+.. code-block:: text
+
+ $ For test-clean, WER of different settings are:
+ $ beam_size_4 2.9 best for test-clean
+ $ For test-other, WER of different settings are:
+ $ beam_size_4 7.57 best for test-other
+
+It's slightly better than LM rescoring. If we further increase the beam size, we will see
+further improvements from LM rescoring + LODR:
+
+.. list-table:: WERs of LM rescoring + LODR with different beam sizes
+ :widths: 25 25 25
+ :header-rows: 1
+
+ * - Beam size
+ - test-clean
+ - test-other
+ * - 4
+ - 2.9
+ - 7.57
+ * - 8
+ - 2.63
+ - 7.04
+ * - 12
+ - 2.52
+ - 6.73
+
+As mentioned earlier, LM rescoring is usually faster than shallow-fusion-based methods.
+Here, we benchmark their WERs and decoding speed:
+
+.. list-table:: LM-rescoring-based methods vs shallow-fusion-based methods (each field shows WER on test-clean / WER on test-other; decoding time on test-clean)
+ :widths: 25 25 25 25
+ :header-rows: 1
+
+ * - Decoding method
+ - beam=4
+ - beam=8
+ - beam=12
+ * - ``modified_beam_search``
+ - 3.11/7.93; 132s
+ - 3.1/7.95; 177s
+ - 3.1/7.96; 210s
+ * - ``modified_beam_search_lm_shallow_fusion``
+ - 2.77/7.08; 262s
+ - 2.62/6.65; 352s
+ - 2.58/6.65; 488s
+ * - ``modified_beam_search_LODR``
+ - 2.61/6.74; 400s
+ - 2.45/6.38; 610s
+ - 2.4/6.23; 870s
+ * - ``modified_beam_search_lm_rescore``
+ - 2.93/7.6; 156s
+ - 2.67/7.11; 203s
+ - 2.59/6.86; 255s
+ * - ``modified_beam_search_lm_rescore_LODR``
+ - 2.9/7.57; 160s
+ - 2.63/7.04; 203s
+ - 2.52/6.73; 263s
+
+.. note::
+
+ Decoding is performed on a single 32G V100 with ``--max-duration`` set to 600.
+ The decoding times here are only for reference and may vary.
\ No newline at end of file
diff --git a/docs/source/decoding-with-langugage-models/shallow-fusion.rst b/docs/source/decoding-with-langugage-models/shallow-fusion.rst
new file mode 100644
index 000000000..684fefeb4
--- /dev/null
+++ b/docs/source/decoding-with-langugage-models/shallow-fusion.rst
@@ -0,0 +1,179 @@
+.. _shallow_fusion:
+
+Shallow fusion for Transducer
+=================================
+
+External language models (LMs) are commonly used to improve WERs for E2E ASR models.
+This tutorial shows you how to perform ``shallow fusion`` with an external LM
+to improve the word error rate (WER) of a transducer model.
+
+.. note::
+
+ This tutorial is based on the recipe
+ `pruned_transducer_stateless7_streaming `_,
+ which is a streaming transducer model trained on `LibriSpeech`_.
+ However, you can easily apply shallow fusion to other recipes.
+ If you encounter any problems, please open an issue in `icefall `_.
+
+.. note::
+
+ For simplicity, the training and test corpora in this tutorial are the same (`LibriSpeech`_). However, you can change the test set
+ to any other domain (e.g., `GigaSpeech`_) and use an external LM trained on that domain.
+
+.. HINT::
+
+ We recommend using a GPU for decoding.
+
+For illustration purposes, we will use a pre-trained ASR model from this `link `__.
+If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
+
+As the initial step, let's download the pre-trained model.
+
+.. code-block:: bash
+
+ $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+ $ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+ $ git lfs pull --include "pretrained.pt"
+ $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
+ $ cd ../data/lang_bpe_500
+ $ git lfs pull --include bpe.model
+ $ cd ../../..
+
+To test the model, let's have a look at the decoding results without using an LM. This can be done via the following command:
+
+.. code-block:: bash
+
+ $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
+ $ ./pruned_transducer_stateless7_streaming/decode.py \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model False \
+ --exp-dir $exp_dir \
+ --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+ --max-duration 600 \
+ --decode-chunk-len 32 \
+ --decoding-method modified_beam_search
+
+The following WERs are achieved on test-clean and test-other:
+
+.. code-block:: text
+
+ $ For test-clean, WER of different settings are:
+ $ beam_size_4 3.11 best for test-clean
+ $ For test-other, WER of different settings are:
+ $ beam_size_4 7.93 best for test-other
+
+These are already good numbers! But we can further improve them by using shallow fusion with an external LM.
+Since training a language model usually takes a long time, we will download a pre-trained LM from this `link `__.
+
+.. code-block:: bash
+
+ $ # download the external LM
+ $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm
+ $ # create a symbolic link so that the checkpoint can be loaded
+ $ pushd icefall-librispeech-rnn-lm/exp
+ $ git lfs pull --include "pretrained.pt"
+ $ ln -s pretrained.pt epoch-99.pt
+ $ popd
+
+.. note::
+
+ This is an RNN LM trained on the LibriSpeech text corpus, so it might not be ideal for other corpora.
+ You may also train an RNN LM from scratch. Please refer to this `script `__
+ for training an RNN LM and this `script `__ for training a transformer LM.
+
+To use shallow fusion for decoding, we can execute the following command:
+
+.. code-block:: bash
+
+ $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+ $ lm_dir=./icefall-librispeech-rnn-lm/exp
+ $ lm_scale=0.29
+ $ ./pruned_transducer_stateless7_streaming/decode.py \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model False \
+ --beam-size 4 \
+ --exp-dir $exp_dir \
+ --max-duration 600 \
+ --decode-chunk-len 32 \
+ --decoding-method modified_beam_search_lm_shallow_fusion \
+ --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+ --use-shallow-fusion 1 \
+ --lm-type rnn \
+ --lm-exp-dir $lm_dir \
+ --lm-epoch 99 \
+ --lm-scale $lm_scale \
+ --lm-avg 1 \
+ --rnn-lm-embedding-dim 2048 \
+ --rnn-lm-hidden-dim 2048 \
+ --rnn-lm-num-layers 3 \
+ --lm-vocab-size 500
+
+Note that we set ``--decoding-method modified_beam_search_lm_shallow_fusion`` and ``--use-shallow-fusion True``
+to use shallow fusion. ``--lm-type`` specifies the type of neural LM to use; you can choose
+between ``rnn`` and ``transformer``. The following three arguments are specific to the RNN LM:
+
+- ``--rnn-lm-embedding-dim``
+ The embedding dimension of the RNN LM.
+
+- ``--rnn-lm-hidden-dim``
+ The hidden dimension of the RNN LM.
+
+- ``--rnn-lm-num-layers``
+ The number of RNN layers in the RNN LM.
+
+
+The decoding results obtained with the above command are shown below.
+
+.. code-block:: text
+
+ $ For test-clean, WER of different settings are:
+ $ beam_size_4 2.77 best for test-clean
+ $ For test-other, WER of different settings are:
+ $ beam_size_4 7.08 best for test-other
+
+The improvement from shallow fusion is very obvious! The relative WER reduction on test-other is around 10.7%.
+A few parameters can be tuned to further boost the performance of shallow fusion:
+
+- ``--lm-scale``
+
+ Controls the scale of the LM. If it is too small, the external language model may not be fully utilized; if it is too large,
+ the LM score may dominate during decoding, leading to a bad WER. A typical value is around 0.3 (see the sketch after this list).
+
+- ``--beam-size``
+
+ The number of active paths in the search beam. It controls the trade-off between decoding efficiency and accuracy.
+
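+To make the role of ``--lm-scale`` concrete, here is a minimal sketch of the score
+combination performed at every step of shallow fusion. The tensor names are illustrative
+and do not mirror icefall's internals.
+
+.. code-block:: python
+
+ # Illustrative sketch: at each beam-search step, the transducer's
+ # log-probs over the vocabulary are fused with the LM's log-probs,
+ # and the fused scores decide which tokens extend each hypothesis.
+ import torch
+
+ def fuse(asr_log_probs: torch.Tensor,
+          lm_log_probs: torch.Tensor,
+          lm_scale: float = 0.29) -> torch.Tensor:
+     # Both tensors have shape (num_active_paths, vocab_size).
+     return asr_log_probs + lm_scale * lm_log_probs
+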
+Here, we also show how ``--beam-size`` affects the WER and decoding time:
+
+.. list-table:: WERs and decoding time (on test-clean) of shallow fusion with different beam sizes
+ :widths: 25 25 25 25
+ :header-rows: 1
+
+ * - Beam size
+ - test-clean
+ - test-other
+ - Decoding time on test-clean (s)
+ * - 4
+ - 2.77
+ - 7.08
+ - 262
+ * - 8
+ - 2.62
+ - 6.65
+ - 352
+ * - 12
+ - 2.58
+ - 6.65
+ - 488
+
+As we can see, a larger beam size during shallow fusion improves the WER, but also slows down decoding.
+
diff --git a/docs/source/docker/img/docker-hub.png b/docs/source/docker/img/docker-hub.png
new file mode 100644
index 000000000..a9e7715b0
Binary files /dev/null and b/docs/source/docker/img/docker-hub.png differ
diff --git a/docs/source/docker/index.rst b/docs/source/docker/index.rst
new file mode 100644
index 000000000..2c92a4cbc
--- /dev/null
+++ b/docs/source/docker/index.rst
@@ -0,0 +1,17 @@
+.. _icefall_docker:
+
+Docker
+======
+
+This section describes how to use pre-built docker images to run `icefall`_.
+
+.. hint::
+
+ If you only have CPUs available, you can still use the pre-built docker
+ images.
+
+.. toctree::
+ :maxdepth: 2
+
+ ./intro.rst
+
diff --git a/docs/source/docker/intro.rst b/docs/source/docker/intro.rst
new file mode 100644
index 000000000..9ead0df00
--- /dev/null
+++ b/docs/source/docker/intro.rst
@@ -0,0 +1,173 @@
+Introduction
+=============
+
+We have pre-built docker images hosted at the following address:
+
+ ``_
+
+.. figure:: img/docker-hub.png
+ :width: 600
+ :align: center
+
+You can find the ``Dockerfile`` at ``_.
+
+We describe the following items in this section:
+
+ - How to view available tags
+ - How to download pre-built docker images
+ - How to run the `yesno`_ recipe within a docker container on ``CPU``
+
+View available tags
+===================
+
+You can use the following command to view available tags:
+
+.. code-block:: bash
+
+ curl -s 'https://registry.hub.docker.com/v2/repositories/k2fsa/icefall/tags/'|jq '."results"[]["name"]'
+
+which will give you something like the following:
+
+.. code-block:: bash
+
+ "torch2.1.0-cuda12.1"
+ "torch2.1.0-cuda11.8"
+ "torch2.0.0-cuda11.7"
+ "torch1.12.1-cuda11.3"
+ "torch1.9.0-cuda10.2"
+ "torch1.13.0-cuda11.6"
+
+.. hint::
+
+ Available tags will be updated when there are new releases of `torch`_.
+
+Please select an appropriate combination of `torch`_ and CUDA.
+
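+If you prefer Python to ``curl`` and ``jq``, the following sketch queries the same
+endpoint; it assumes the ``requests`` package is installed.
+
+.. code-block:: python
+
+ # List the available tags of the k2fsa/icefall docker image.
+ import requests
+
+ url = "https://registry.hub.docker.com/v2/repositories/k2fsa/icefall/tags/"
+ for result in requests.get(url, timeout=10).json()["results"]:
+     print(result["name"])
+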
+Download a docker image
+=======================
+
+Suppose you select the tag ``torch1.13.0-cuda11.6``. You can then use
+the following command to download it:
+
+.. code-block:: bash
+
+ sudo docker image pull k2fsa/icefall:torch1.13.0-cuda11.6
+
+Run a docker image with GPU
+===========================
+
+.. code-block:: bash
+
+ sudo docker run --gpus all --rm -it k2fsa/icefall:torch1.13.0-cuda11.6 /bin/bash
+
+Run a docker image with CPU
+===========================
+
+.. code-block:: bash
+
+ sudo docker run --rm -it k2fsa/icefall:torch1.13.0-cuda11.6 /bin/bash
+
+Run yesno within a docker container
+===================================
+
+After starting the container, you are presented with the following prompt:
+
+.. code-block:: bash
+
+ root@60c947eac59c:/workspace/icefall#
+
+It shows the current user is ``root`` and the current working directory
+is ``/workspace/icefall``.
+
+Update the code
+---------------
+
+Please first run:
+
+.. code-block:: bash
+
+ root@60c947eac59c:/workspace/icefall# git pull
+
+so that your local copy contains the latest code.
+
+Data preparation
+----------------
+
+Now we can use
+
+.. code-block:: bash
+
+ root@60c947eac59c:/workspace/icefall# cd egs/yesno/ASR/
+
+to switch to the ``yesno`` recipe and run
+
+.. code-block:: bash
+
+ root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./prepare.sh
+
+.. hint::
+
+ If you are running without a GPU, it may report the following error:
+
+ .. code-block:: bash
+
+ File "/opt/conda/lib/python3.9/site-packages/k2/__init__.py", line 23, in
+ from _k2 import DeterminizeWeightPushingType
+ ImportError: libcuda.so.1: cannot open shared object file: No such file or directory
+
+ We can use the following command to fix it:
+
+ .. code-block:: bash
+
+ root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ln -s /opt/conda/lib/stubs/libcuda.so /opt/conda/lib/stubs/libcuda.so.1
+
+The logs of running ``./prepare.sh`` are listed below:
+
+.. literalinclude:: ./log/log-preparation.txt
+
+Training
+--------
+
+After preparing the data, we can start training with the following command
+
+.. code-block:: bash
+
+ root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./tdnn/train.py
+
+All of the training logs are given below:
+
+.. hint::
+
+ It is running on CPU and it takes only 16 seconds for this run.
+
+.. literalinclude:: ./log/log-train-2023-08-01-01-55-27
+
+
+Decoding
+--------
+
+After training, we can decode the trained model with
+
+.. code-block:: bash
+
+ root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./tdnn/decode.py
+
+The decoding logs are given below:
+
+.. code-block:: bash
+
+ 2023-08-01 02:06:22,400 INFO [decode.py:263] Decoding started
+ 2023-08-01 02:06:22,400 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d663.clean', 'torch-version': '1.13.0', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.9', 'icefall-git-branch': 'master', 'icefall-git-sha1': '375520d-clean', 'icefall-git-date': 'Fri Jul 28 07:43:08 2023', 'icefall-path': '/workspace/icefall', 'k2-path': '/opt/conda/lib/python3.9/site-packages/k2/__init__.py', 'lhotse-path': '/opt/conda/lib/python3.9/site-packages/lhotse/__init__.py', 'hostname': '60c947eac59c', 'IP address': '172.17.0.2'}}
+ 2023-08-01 02:06:22,401 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+ 2023-08-01 02:06:22,403 INFO [decode.py:273] device: cpu
+ 2023-08-01 02:06:22,406 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+ 2023-08-01 02:06:22,424 INFO [asr_datamodule.py:218] About to get test cuts
+ 2023-08-01 02:06:22,425 INFO [asr_datamodule.py:252] About to get test cuts
+ 2023-08-01 02:06:22,504 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
+ [W NNPACK.cpp:53] Could not initialize NNPACK! Reason: Unsupported hardware.
+ 2023-08-01 02:06:22,687 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
+ 2023-08-01 02:06:22,688 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
+ 2023-08-01 02:06:22,690 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
+ 2023-08-01 02:06:22,690 INFO [decode.py:316] Done!
+
+Congratulations! You have finished successfully running `icefall`_ within a docker container.
diff --git a/docs/source/faqs.rst b/docs/source/faqs.rst
new file mode 100644
index 000000000..72b0302d7
--- /dev/null
+++ b/docs/source/faqs.rst
@@ -0,0 +1,107 @@
+Frequently Asked Questions (FAQs)
+=================================
+
+In this section, we collect issues reported by users and post the corresponding
+solutions.
+
+
+OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
+-----------------------------------------------------------------------------------
+
+One user used the following command to install ``torch`` and ``torchaudio``:
+
+.. code-block:: bash
+
+ pip install \
+ torch==1.10.0+cu111 \
+ torchvision==0.11.0+cu111 \
+ torchaudio==0.10.0 \
+ -f https://download.pytorch.org/whl/torch_stable.html
+
+and it throws the following error when running ``tdnn/train.py``:
+
+.. code-block::
+
+ OSError: libtorch_hip.so: cannot open shared object file: no such file or directory
+
+The fix is to specify the CUDA version while installing ``torchaudio``. That
+is, change ``torchaudio==0.10.0`` to ``torchaudio==0.10.0+cu111``. Therefore,
+the correct command is:
+
+.. code-block:: bash
+
+ pip install \
+ torch==1.10.0+cu111 \
+ torchvision==0.11.0+cu111 \
+ torchaudio==0.10.0+cu111 \
+ -f https://download.pytorch.org/whl/torch_stable.html
+
+AttributeError: module 'distutils' has no attribute 'version'
+-------------------------------------------------------------
+
+The error log is:
+
+.. code-block::
+
+ Traceback (most recent call last):
+ File "./tdnn/train.py", line 14, in
+ from asr_datamodule import YesNoAsrDataModule
+ File "/home/xxx/code/next-gen-kaldi/icefall/egs/yesno/ASR/tdnn/asr_datamodule.py", line 34, in
+ from icefall.dataset.datamodule import DataModule
+ File "/home/xxx/code/next-gen-kaldi/icefall/icefall/__init__.py", line 3, in
+ from . import (
+ File "/home/xxx/code/next-gen-kaldi/icefall/icefall/decode.py", line 23, in
+ from icefall.utils import add_eos, add_sos, get_texts
+ File "/home/xxx/code/next-gen-kaldi/icefall/icefall/utils.py", line 39, in
+ from torch.utils.tensorboard import SummaryWriter
+ File "/home/xxx/tool/miniconda3/envs/yyy/lib/python3.8/site-packages/torch/utils/tensorboard/__init__.py", line 4, in
+ LooseVersion = distutils.version.LooseVersion
+ AttributeError: module 'distutils' has no attribute 'version'
+
+The fix is:
+
+.. code-block:: bash
+
+ pip uninstall setuptools
+
+ pip install setuptools==58.0.4
+
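+After reinstalling, you can verify that the pinned version is the one being picked up:
+
+.. code-block:: python
+
+ # Check the installed setuptools version.
+ import setuptools
+
+ print(setuptools.__version__)  # expect 58.0.4
+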
+ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
+--------------------------------------------------------------------------------------------
+
+If you are using ``conda`` and encounter the following issue:
+
+.. code-block::
+
+ Traceback (most recent call last):
+ File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 24, in
+ from _k2 import DeterminizeWeightPushingType
+ ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
+
+ During handling of the above exception, another exception occurred:
+
+ Traceback (most recent call last):
+ File "/k2-dev/yangyifan/icefall/egs/librispeech/ASR/./pruned_transducer_stateless7_ctc_bs/decode.py", line 104, in
+ import k2
+ File "/k2-dev/yangyifan/anaconda3/envs/icefall/lib/python3.10/site-packages/k2-1.23.3.dev20230112+cuda11.6.torch1.13.1-py3.10-linux-x86_64.egg/k2/__init__.py", line 30, in
+ raise ImportError(
+ ImportError: libpython3.10.so.1.0: cannot open shared object file: No such file or directory
+ Note: If you're using anaconda and importing k2 on MacOS,
+ you can probably fix this by setting the environment variable:
+ export DYLD_LIBRARY_PATH=$CONDA_PREFIX/lib/python3.10/site-packages:$DYLD_LIBRARY_PATH
+
+Please first try to find where ``libpython3.10.so.1.0`` is located.
+
+For instance,
+
+.. code-block:: bash
+
+ cd $CONDA_PREFIX/lib
+ find . -name "libpython*"
+
+If you are able to find it inside ``$CONDA_PREFIX/lib``, please set the
+following environment variable:
+
+.. code-block:: bash
+
+ export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH
diff --git a/docs/source/for-dummies/data-preparation.rst b/docs/source/for-dummies/data-preparation.rst
new file mode 100644
index 000000000..f03d44e79
--- /dev/null
+++ b/docs/source/for-dummies/data-preparation.rst
@@ -0,0 +1,180 @@
+.. _dummies_tutorial_data_preparation:
+
+Data Preparation
+================
+
+After :ref:`dummies_tutorial_environment_setup`, we can start preparing the
+data for training and decoding.
+
+The first step is to prepare the data. We have already provided
+`prepare.sh `_,
+which prepares everything required for training.
+
+.. code-block:: bash
+
+ cd /tmp/icefall
+ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+ cd egs/yesno/ASR
+
+ ./prepare.sh
+
+Note that in each recipe from `icefall`_, there exists a file ``prepare.sh``,
+which you should run before you run anything else.
+
+That is all you need for data preparation.
+
+For the more curious
+--------------------
+
+If you are wondering how to prepare your own dataset, please refer to the following
+URLs for more details:
+
+ - ``_
+
+ It contains recipes for a variety of datasets. If you want to add your own
+ dataset, please read the recipes in this folder first.
+
+ - ``_
+
+ The `yesno`_ recipe in `lhotse`_.
+
+If you already have a `Kaldi`_ data directory containing files like
+``wav.scp`` and ``feats.scp``, you can refer to ``_.
+
+A quick look to the generated files
+-----------------------------------
+
+``./prepare.sh`` puts generated files into two directories:
+
+ - ``download``
+ - ``data``
+
+download
+^^^^^^^^
+
+The ``download`` directory contains downloaded dataset files:
+
+.. code-block:: bash
+
+ tree -L 1 ./download/
+
+ ./download/
+ |-- waves_yesno
+ `-- waves_yesno.tar.gz
+
+.. hint::
+
+ Please refer to ``_
+ for how the data is downloaded and extracted.
+
+data
+^^^^
+
+.. code-block:: bash
+
+ tree ./data/
+
+ ./data/
+ |-- fbank
+ | |-- yesno_cuts_test.jsonl.gz
+ | |-- yesno_cuts_train.jsonl.gz
+ | |-- yesno_feats_test.lca
+ | `-- yesno_feats_train.lca
+ |-- lang_phone
+ | |-- HLG.pt
+ | |-- L.pt
+ | |-- L_disambig.pt
+ | |-- Linv.pt
+ | |-- lexicon.txt
+ | |-- lexicon_disambig.txt
+ | |-- tokens.txt
+ | `-- words.txt
+ |-- lm
+ | |-- G.arpa
+ | `-- G.fst.txt
+ `-- manifests
+ |-- yesno_recordings_test.jsonl.gz
+ |-- yesno_recordings_train.jsonl.gz
+ |-- yesno_supervisions_test.jsonl.gz
+ `-- yesno_supervisions_train.jsonl.gz
+
+ 4 directories, 18 files
+
+**data/manifests**:
+
+ This directory contains manifests. They are used to generate files in
+ ``data/fbank``.
+
+ To give you an idea of what it contains, we examine the first few lines of
+ the manifests related to the ``train`` dataset.
+
+ .. code-block:: bash
+
+ cd data/manifests
+ gunzip -c yesno_recordings_train.jsonl.gz | head -n 3
+
+ The output is given below:
+
+ .. code-block:: bash
+
+ {"id": "0_0_0_0_1_1_1_1", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_0_1_1_1_1.wav"}], "sampling_rate": 8000, "num_samples": 50800, "duration": 6.35, "channel_ids": [0]}
+ {"id": "0_0_0_1_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_1_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48880, "duration": 6.11, "channel_ids": [0]}
+ {"id": "0_0_1_0_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_1_0_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48160, "duration": 6.02, "channel_ids": [0]}
+
+ Please refer to ``_
+ for the meaning of each field per line.
+
+ .. code-block:: bash
+
+ gunzip -c yesno_supervisions_train.jsonl.gz | head -n 3
+
+ The output is given below:
+
+ .. code-block:: bash
+
+ {"id": "0_0_0_0_1_1_1_1", "recording_id": "0_0_0_0_1_1_1_1", "start": 0.0, "duration": 6.35, "channel": 0, "text": "NO NO NO NO YES YES YES YES", "language": "Hebrew"}
+ {"id": "0_0_0_1_0_1_1_0", "recording_id": "0_0_0_1_0_1_1_0", "start": 0.0, "duration": 6.11, "channel": 0, "text": "NO NO NO YES NO YES YES NO", "language": "Hebrew"}
+ {"id": "0_0_1_0_0_1_1_0", "recording_id": "0_0_1_0_0_1_1_0", "start": 0.0, "duration": 6.02, "channel": 0, "text": "NO NO YES NO NO YES YES NO", "language": "Hebrew"}
+
+ Please refer to ``_
+ for the meaning of each field per line.
+
+**data/fbank**:
+
+ This directory contains everything from ``data/manifests``. It also contains the
+ features used for training.
+
+ ``data/fbank/yesno_feats_train.lca`` contains the features for the train dataset.
+ Features are compressed using `lilcom`_.
+
+ ``data/fbank/yesno_cuts_train.jsonl.gz`` stores the `CutSet `_,
+ which stores `RecordingSet `_,
+ `SupervisionSet `_,
+ and `FeatureSet `_.
+
+ To give you an idea of what it looks like, we can run the following command:
+
+ .. code-block:: bash
+
+ cd data/fbank
+
+ gunzip -c yesno_cuts_train.jsonl.gz | head -n 3
+
+ The output is given below:
+
+ .. code-block:: bash
+
+ {"id": "0_0_0_0_1_1_1_1-0", "start": 0, "duration": 6.35, "channel": 0, "supervisions": [{"id": "0_0_0_0_1_1_1_1", "recording_id": "0_0_0_0_1_1_1_1", "start": 0.0, "duration": 6.35, "channel": 0, "text": "NO NO NO NO YES YES YES YES", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 635, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.35, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "0,13000,3570", "channels": 0}, "recording": {"id": "0_0_0_0_1_1_1_1", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_0_1_1_1_1.wav"}], "sampling_rate": 8000, "num_samples": 50800, "duration": 6.35, "channel_ids": [0]}, "type": "MonoCut"}
+ {"id": "0_0_0_1_0_1_1_0-1", "start": 0, "duration": 6.11, "channel": 0, "supervisions": [{"id": "0_0_0_1_0_1_1_0", "recording_id": "0_0_0_1_0_1_1_0", "start": 0.0, "duration": 6.11, "channel": 0, "text": "NO NO NO YES NO YES YES NO", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 611, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.11, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "16570,12964,2929", "channels": 0}, "recording": {"id": "0_0_0_1_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_0_1_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48880, "duration": 6.11, "channel_ids": [0]}, "type": "MonoCut"}
+ {"id": "0_0_1_0_0_1_1_0-2", "start": 0, "duration": 6.02, "channel": 0, "supervisions": [{"id": "0_0_1_0_0_1_1_0", "recording_id": "0_0_1_0_0_1_1_0", "start": 0.0, "duration": 6.02, "channel": 0, "text": "NO NO YES NO NO YES YES NO", "language": "Hebrew"}], "features": {"type": "kaldi-fbank", "num_frames": 602, "num_features": 23, "frame_shift": 0.01, "sampling_rate": 8000, "start": 0, "duration": 6.02, "storage_type": "lilcom_chunky", "storage_path": "data/fbank/yesno_feats_train.lca", "storage_key": "32463,12936,2696", "channels": 0}, "recording": {"id": "0_0_1_0_0_1_1_0", "sources": [{"type": "file", "channels": [0], "source": "/tmp/icefall/egs/yesno/ASR/download/waves_yesno/0_0_1_0_0_1_1_0.wav"}], "sampling_rate": 8000, "num_samples": 48160, "duration": 6.02, "channel_ids": [0]}, "type": "MonoCut"}
+
+ Note that ``yesno_cuts_train.jsonl.gz`` only stores the information about how to read the features.
+ The actual features are stored separately in ``data/fbank/yesno_feats_train.lca``.
+
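+ If you prefer to inspect the cuts programmatically instead of with ``gunzip``,
+ the following `lhotse`_ sketch works when run from ``egs/yesno/ASR``:
+
+ .. code-block:: python
+
+  # Load the training cuts and inspect the first one.
+  from lhotse import CutSet
+
+  cuts = CutSet.from_file("data/fbank/yesno_cuts_train.jsonl.gz")
+  for cut in cuts:
+      print(cut.id, cut.duration, cut.supervisions[0].text)
+      feats = cut.load_features()  # decompressed from yesno_feats_train.lca
+      print(feats.shape)           # (num_frames, 23)
+      break
+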
+**data/lang_phone**:
+
+ This directory contains the lexicon.
+
+**data/lm**:
+
+ This directory contains language models.
diff --git a/docs/source/for-dummies/decoding.rst b/docs/source/for-dummies/decoding.rst
new file mode 100644
index 000000000..3e48e8bfd
--- /dev/null
+++ b/docs/source/for-dummies/decoding.rst
@@ -0,0 +1,39 @@
+.. _dummies_tutorial_decoding:
+
+Decoding
+========
+
+After :ref:`dummies_tutorial_training`, we can start decoding.
+
+The command to start the decoding is quite simple:
+
+.. code-block:: bash
+
+ cd /tmp/icefall
+ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+ cd egs/yesno/ASR
+
+ # We use CPU for decoding by setting the following environment variable
+ export CUDA_VISIBLE_DEVICES=""
+
+ ./tdnn/decode.py
+
+The output logs are given below:
+
+.. literalinclude:: ./code/decoding-yesno.txt
+
+For the more curious
+--------------------
+
+.. code-block:: bash
+
+ ./tdnn/decode.py --help
+
+will print the usage information of ``./tdnn/decode.py``. For instance, you
+can specify:
+
+ - ``--epoch`` to select which checkpoint to use for decoding
+ - ``--avg`` to select how many checkpoints to use for model averaging
+
+You usually try different combinations of ``--epoch`` and ``--avg`` and select
+one that leads to the lowest WER (`Word Error Rate `_).
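+
+The sketch below automates this search; the epoch and avg values are hypothetical,
+and ``./tdnn/decode.py`` prints the WER of each combination so that you can pick the best one.
+
+.. code-block:: python
+
+ # Hypothetical sweep over (--epoch, --avg) combinations.
+ import subprocess
+
+ for epoch in (12, 13, 14):
+     for avg in (1, 2):
+         if avg > epoch:
+             continue
+         subprocess.run(
+             ["./tdnn/decode.py", "--epoch", str(epoch), "--avg", str(avg)],
+             check=True,
+         )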
diff --git a/docs/source/for-dummies/environment-setup.rst b/docs/source/for-dummies/environment-setup.rst
new file mode 100644
index 000000000..0cb8ecc1d
--- /dev/null
+++ b/docs/source/for-dummies/environment-setup.rst
@@ -0,0 +1,121 @@
+.. _dummies_tutorial_environment_setup:
+
+Environment setup
+=================
+
+We will create an environment for `Next-gen Kaldi`_ that runs on ``CPU``
+in this tutorial.
+
+.. note::
+
+ Since the `yesno`_ dataset used in this tutorial is very tiny, training on
+ ``CPU`` works very well for it.
+
+ If your dataset is very large, e.g., hundreds or thousands of hours of
+ training data, please follow :ref:`install icefall` to install `icefall`_
+ that works with ``GPU``.
+
+
+Create a virtual environment
+----------------------------
+
+.. code-block:: bash
+
+ virtualenv -p python3 /tmp/icefall_env
+
+The above command creates a virtual environment in the directory ``/tmp/icefall_env``.
+You can select any directory you want.
+
+The output of the above command is given below:
+
+.. code-block:: bash
+
+ Already using interpreter /usr/bin/python3
+ Using base prefix '/usr'
+ New python executable in /tmp/icefall_env/bin/python3
+ Also creating executable in /tmp/icefall_env/bin/python
+ Installing setuptools, pkg_resources, pip, wheel...done.
+
+Now we can activate the environment using:
+
+.. code-block:: bash
+
+ source /tmp/icefall_env/bin/activate
+
+Install dependencies
+--------------------
+
+.. warning::
+
+ Remember to activate your virtual environment before you continue!
+
+After activating the virtual environment, we can use the following command
+to install dependencies of `icefall`_:
+
+.. hint::
+
+ Remember that we will run this tutorial on ``CPU``, so we only install the
+ dependencies required for running on ``CPU``.
+
+.. code-block:: bash
+
+ # Caution: Installation order matters!
+
+ # We use torch 2.0.0 and torchaudio 2.0.0 in this tutorial.
+ # Other versions should also work.
+
+ pip install torch==2.0.0+cpu torchaudio==2.0.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+
+ # If you are using macOS or Windows, please use the following command to install torch and torchaudio
+ # pip install torch==2.0.0 torchaudio==2.0.0 -f https://download.pytorch.org/whl/torch_stable.html
+
+ # Now install k2
+ # Please refer to https://k2-fsa.github.io/k2/installation/from_wheels.html#linux-cpu-example
+
+ pip install k2==1.24.3.dev20230726+cpu.torch2.0.0 -f https://k2-fsa.github.io/k2/cpu.html
+
+ # Install the latest version of lhotse
+
+ pip install git+https://github.com/lhotse-speech/lhotse
+
+
+Install icefall
+---------------
+
+We will put the source code of `icefall`_ into the directory ``/tmp``.
+You can select any directory you want.
+
+.. code-block:: bash
+
+ cd /tmp
+ git clone https://github.com/k2-fsa/icefall
+ cd icefall
+ pip install -r ./requirements.txt
+
+.. code-block:: bash
+
+ # Anytime we want to use icefall, we have to set the following
+ # environment variable
+
+ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+
+.. hint::
+
+ If you get the following error during this tutorial:
+
+ .. code-block:: bash
+
+ ModuleNotFoundError: No module named 'icefall'
+
+ please set the above environment variable to fix it.
+
+
+Congratulations! You have installed `icefall`_ successfully.
+
+For the more curious
+--------------------
+
+`icefall`_ contains a collection of Python scripts and you don't need to
+use ``python3 setup.py install`` or ``pip install icefall`` to install it.
+All you need to do is to download the code and set the environment variable
+``PYTHONPATH``.
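+
+To double-check that everything is wired up, you can run the following snippet
+after setting ``PYTHONPATH``; every import should succeed:
+
+.. code-block:: python
+
+ # Sanity check: all four imports should succeed.
+ import torch
+ import k2
+ import lhotse
+ import icefall
+
+ print("torch:", torch.__version__)
+ print("lhotse:", lhotse.__version__)
+ print("k2 from:", k2.__file__)
+ print("icefall from:", icefall.__file__)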
diff --git a/docs/source/for-dummies/index.rst b/docs/source/for-dummies/index.rst
new file mode 100644
index 000000000..7c0a3d8ee
--- /dev/null
+++ b/docs/source/for-dummies/index.rst
@@ -0,0 +1,34 @@
+Icefall for dummies tutorial
+============================
+
+This tutorial walks you step by step through creating a simple
+ASR (`Automatic Speech Recognition `_)
+system with `Next-gen Kaldi`_.
+
+We use the `yesno`_ dataset for demonstration. We select it for two reasons:
+
+ - It is quite tiny, containing only about 12 minutes of data.
+ - Training can be finished within 20 seconds on ``CPU``.
+
+That also means you don't need a ``GPU`` to run this tutorial.
+
+Let's get started!
+
+Please follow the items below **sequentially**.
+
+.. note::
+
+ The :ref:`dummies_tutorial_data_preparation` runs only on Linux and on macOS.
+ All other parts run on Linux, macOS, and Windows.
+
+ Help from the community is appreciated to port the :ref:`dummies_tutorial_data_preparation`
+ to Windows.
+
+.. toctree::
+ :maxdepth: 2
+
+ ./environment-setup.rst
+ ./data-preparation.rst
+ ./training.rst
+ ./decoding.rst
+ ./model-export.rst
diff --git a/docs/source/for-dummies/model-export.rst b/docs/source/for-dummies/model-export.rst
new file mode 100644
index 000000000..079ebc712
--- /dev/null
+++ b/docs/source/for-dummies/model-export.rst
@@ -0,0 +1,310 @@
+Model Export
+============
+
+There are three ways to export a pre-trained model.
+
+ - Export the model parameters via `model.state_dict() `_
+ - Export via `torchscript `_: either `torch.jit.script() `_ or `torch.jit.trace() `_
+ - Export to `ONNX`_ via `torch.onnx.export() `_
+
+Each method is explained below in detail.
+
+Export the model parameters via model.state_dict()
+---------------------------------------------------
+
+The command for this kind of export is
+
+.. code-block:: bash
+
+ cd /tmp/icefall
+ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+ cd egs/yesno/ASR
+
+ # assume that "--epoch 14 --avg 2" produces the lowest WER.
+
+ ./tdnn/export.py --epoch 14 --avg 2
+
+The output logs are given below:
+
+.. code-block:: bash
+
+ 2023-08-16 20:42:03,912 INFO [export.py:76] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'jit': False}
+ 2023-08-16 20:42:03,913 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+ 2023-08-16 20:42:03,950 INFO [export.py:93] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+ 2023-08-16 20:42:03,971 INFO [export.py:106] Not using torch.jit.script
+ 2023-08-16 20:42:03,974 INFO [export.py:111] Saved to tdnn/exp/pretrained.pt
+
+We can see from the logs that the exported model is saved to the file ``tdnn/exp/pretrained.pt``.
+
+To give you an idea of what ``tdnn/exp/pretrained.pt`` contains, we can use the following command:
+
+.. code-block:: python3
+
+ >>> import torch
+ >>> m = torch.load("tdnn/exp/pretrained.pt")
+ >>> list(m.keys())
+ ['model']
+ >>> list(m["model"].keys())
+ ['tdnn.0.weight', 'tdnn.0.bias', 'tdnn.2.running_mean', 'tdnn.2.running_var', 'tdnn.2.num_batches_tracked', 'tdnn.3.weight', 'tdnn.3.bias', 'tdnn.5.running_mean', 'tdnn.5.running_var', 'tdnn.5.num_batches_tracked', 'tdnn.6.weight', 'tdnn.6.bias', 'tdnn.8.running_mean', 'tdnn.8.running_var', 'tdnn.8.num_batches_tracked', 'output_linear.weight', 'output_linear.bias']
+
+We can use ``tdnn/exp/pretrained.pt`` in the following way with ``./tdnn/decode.py``:
+
+.. code-block:: bash
+
+ cd tdnn/exp
+ ln -s pretrained.pt epoch-99.pt
+ cd ../..
+
+ ./tdnn/decode.py --epoch 99 --avg 1
+
+The output logs of the above command are given below:
+
+.. code-block:: bash
+
+ 2023-08-16 20:45:48,089 INFO [decode.py:262] Decoding started
+ 2023-08-16 20:45:48,090 INFO [decode.py:263] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 99, 'avg': 1, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': False, 'k2-git-sha1': 'ad79f1c699c684de9785ed6ca5edb805a41f78c3', 'k2-git-date': 'Wed Jul 26 09:30:42 2023', 'lhotse-version': '1.16.0.dev+git.aa073f6.clean', 'torch-version': '2.0.0', 'torch-cuda-available': False, 'torch-cuda-version': None, 'python-version': '3.1', 'icefall-git-branch': 'master', 'icefall-git-sha1': '9a47c08-clean', 'icefall-git-date': 'Mon Aug 14 22:10:50 2023', 'icefall-path': '/private/tmp/icefall', 'k2-path': '/private/tmp/icefall_env/lib/python3.11/site-packages/k2/__init__.py', 'lhotse-path': '/private/tmp/icefall_env/lib/python3.11/site-packages/lhotse/__init__.py', 'hostname': 'fangjuns-MacBook-Pro.local', 'IP address': '127.0.0.1'}}
+ 2023-08-16 20:45:48,092 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+ 2023-08-16 20:45:48,103 INFO [decode.py:272] device: cpu
+ 2023-08-16 20:45:48,109 INFO [checkpoint.py:112] Loading checkpoint from tdnn/exp/epoch-99.pt
+ 2023-08-16 20:45:48,115 INFO [asr_datamodule.py:218] About to get test cuts
+ 2023-08-16 20:45:48,115 INFO [asr_datamodule.py:253] About to get test cuts
+ 2023-08-16 20:45:50,386 INFO [decode.py:203] batch 0/?, cuts processed until now is 4
+ 2023-08-16 20:45:50,556 INFO [decode.py:240] The transcripts are stored in tdnn/exp/recogs-test_set.txt
+ 2023-08-16 20:45:50,557 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
+ 2023-08-16 20:45:50,558 INFO [decode.py:248] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
+ 2023-08-16 20:45:50,559 INFO [decode.py:315] Done!
+
+We can see that it produces the same WER as before.
+
+We can also use it to decode files with the following command:
+
+.. code-block:: bash
+
+ # ./tdnn/pretrained.py requires kaldifeat
+ #
+ # Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
+ # for how to install kaldifeat
+
+ pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
+
+ ./tdnn/pretrained.py \
+ --checkpoint ./tdnn/exp/pretrained.pt \
+ --HLG ./data/lang_phone/HLG.pt \
+ --words-file ./data/lang_phone/words.txt \
+ download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+ download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+The output is given below:
+
+.. code-block:: bash
+
+ 2023-08-16 20:53:19,208 INFO [pretrained.py:136] {'feature_dim': 23, 'num_classes': 4, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './tdnn/exp/pretrained.pt', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
+ 2023-08-16 20:53:19,208 INFO [pretrained.py:142] device: cpu
+ 2023-08-16 20:53:19,208 INFO [pretrained.py:144] Creating model
+ 2023-08-16 20:53:19,212 INFO [pretrained.py:156] Loading HLG from ./data/lang_phone/HLG.pt
+ 2023-08-16 20:53:19,213 INFO [pretrained.py:160] Constructing Fbank computer
+ 2023-08-16 20:53:19,213 INFO [pretrained.py:170] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
+ 2023-08-16 20:53:19,224 INFO [pretrained.py:176] Decoding started
+ 2023-08-16 20:53:19,304 INFO [pretrained.py:212]
+ download/waves_yesno/0_0_0_1_0_0_0_1.wav:
+ NO NO NO YES NO NO NO YES
+
+ download/waves_yesno/0_0_1_0_0_0_1_0.wav:
+ NO NO YES NO NO NO YES NO
+
+
+ 2023-08-16 20:53:19,304 INFO [pretrained.py:214] Decoding Done
+
+
+Export via torch.jit.script()
+-----------------------------
+
+The command for this kind of export is
+
+.. code-block:: bash
+
+ cd /tmp/icefall
+ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+ cd egs/yesno/ASR
+
+ # assume that "--epoch 14 --avg 2" produces the lowest WER.
+
+ ./tdnn/export.py --epoch 14 --avg 2 --jit true
+
+The output logs are given below:
+
+.. code-block:: bash
+
+ 2023-08-16 20:47:44,666 INFO [export.py:76] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'jit': True}
+ 2023-08-16 20:47:44,667 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+ 2023-08-16 20:47:44,670 INFO [export.py:93] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+ 2023-08-16 20:47:44,677 INFO [export.py:100] Using torch.jit.script
+ 2023-08-16 20:47:44,843 INFO [export.py:104] Saved to tdnn/exp/cpu_jit.pt
+
+From the output logs we can see that the generated file is saved to ``tdnn/exp/cpu_jit.pt``.
+
+Don't be confused by the name ``cpu_jit.pt``. The ``cpu`` part means the model was moved to
+CPU before it was exported. That means, when you load it with:
+
+.. code-block:: python
+
+ torch.jit.load()
+
+you don't need to specify the argument `map_location `_,
+and the model resides on CPU by default.
+
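+Here is a minimal sketch of loading the exported file directly. It assumes the
+model's ``forward`` takes a single ``(N, T, 23)`` fbank feature tensor, matching the
+``feature_dim`` of 23 shown in the logs above; treat it as an illustration, not
+icefall's exact calling convention.
+
+.. code-block:: python
+
+ # Load the torchscript model and run a dummy input through it.
+ import torch
+
+ model = torch.jit.load("tdnn/exp/cpu_jit.pt")  # no map_location needed
+ model.eval()
+ dummy = torch.randn(1, 100, 23)  # 100 frames of 23-dim fbank features
+ with torch.no_grad():
+     out = model(dummy)
+ print(out.shape)
+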
+To use ``tdnn/exp/cpu_jit.pt`` with `icefall`_ to decode files, we can use:
+
+.. code-block:: bash
+
+ # ./tdnn/jit_pretrained.py requires kaldifeat
+ #
+ # Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
+ # for how to install kaldifeat
+
+ pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
+
+
+ ./tdnn/jit_pretrained.py \
+ --nn-model ./tdnn/exp/cpu_jit.pt \
+ --HLG ./data/lang_phone/HLG.pt \
+ --words-file ./data/lang_phone/words.txt \
+ download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+ download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+The output is given below:
+
+.. code-block:: bash
+
+ 2023-08-16 20:56:00,603 INFO [jit_pretrained.py:121] {'feature_dim': 23, 'num_classes': 4, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'nn_model': './tdnn/exp/cpu_jit.pt', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
+ 2023-08-16 20:56:00,603 INFO [jit_pretrained.py:127] device: cpu
+ 2023-08-16 20:56:00,603 INFO [jit_pretrained.py:129] Loading torchscript model
+ 2023-08-16 20:56:00,640 INFO [jit_pretrained.py:134] Loading HLG from ./data/lang_phone/HLG.pt
+ 2023-08-16 20:56:00,641 INFO [jit_pretrained.py:138] Constructing Fbank computer
+ 2023-08-16 20:56:00,641 INFO [jit_pretrained.py:148] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
+ 2023-08-16 20:56:00,642 INFO [jit_pretrained.py:154] Decoding started
+ 2023-08-16 20:56:00,727 INFO [jit_pretrained.py:190]
+ download/waves_yesno/0_0_0_1_0_0_0_1.wav:
+ NO NO NO YES NO NO NO YES
+
+ download/waves_yesno/0_0_1_0_0_0_1_0.wav:
+ NO NO YES NO NO NO YES NO
+
+
+ 2023-08-16 20:56:00,727 INFO [jit_pretrained.py:192] Decoding Done
+
+.. hint::
+
+ We provide only code for ``torch.jit.script()``. You can try ``torch.jit.trace()``
+ if you want.
+
+Export via torch.onnx.export()
+------------------------------
+
+The command for this kind of export is
+
+.. code-block:: bash
+
+ cd /tmp/icefall
+ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+ cd egs/yesno/ASR
+
+ # tdnn/export_onnx.py requires onnx and onnxruntime
+ pip install onnx onnxruntime
+
+ # assume that "--epoch 14 --avg 2" produces the lowest WER.
+
+ ./tdnn/export_onnx.py \
+ --epoch 14 \
+ --avg 2
+
+The output logs are given below:
+
+.. code-block:: bash
+
+ 2023-08-16 20:59:20,888 INFO [export_onnx.py:83] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'epoch': 14, 'avg': 2}
+ 2023-08-16 20:59:20,888 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+ 2023-08-16 20:59:20,892 INFO [export_onnx.py:100] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+ ================ Diagnostic Run torch.onnx.export version 2.0.0 ================
+ verbose: False, log level: Level.ERROR
+ ======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================
+
+ 2023-08-16 20:59:21,047 INFO [export_onnx.py:127] Saved to tdnn/exp/model-epoch-14-avg-2.onnx
+ 2023-08-16 20:59:21,047 INFO [export_onnx.py:136] meta_data: {'model_type': 'tdnn', 'version': '1', 'model_author': 'k2-fsa', 'comment': 'non-streaming tdnn for the yesno recipe', 'vocab_size': 4}
+ 2023-08-16 20:59:21,049 INFO [export_onnx.py:140] Generate int8 quantization models
+ 2023-08-16 20:59:21,075 INFO [onnx_quantizer.py:538] Quantization parameters for tensor:"/Transpose_1_output_0" not specified
+ 2023-08-16 20:59:21,081 INFO [export_onnx.py:151] Saved to tdnn/exp/model-epoch-14-avg-2.int8.onnx
+
+We can see from the logs that it generates two files:
+
+ - ``tdnn/exp/model-epoch-14-avg-2.onnx`` (ONNX model with ``float32`` weights)
+ - ``tdnn/exp/model-epoch-14-avg-2.int8.onnx`` (ONNX model with ``int8`` weights)
+
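+Before decoding, you can load the exported model with `onnxruntime`_ and check the
+metadata written by ``export_onnx.py``, as shown in the logs above:
+
+.. code-block:: python
+
+ # Load the float32 ONNX model and print its metadata and inputs.
+ import onnxruntime as ort
+
+ sess = ort.InferenceSession(
+     "tdnn/exp/model-epoch-14-avg-2.onnx",
+     providers=["CPUExecutionProvider"],
+ )
+ print(sess.get_modelmeta().custom_metadata_map)
+ for inp in sess.get_inputs():
+     print(inp.name, inp.shape)
+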
+To use the generated ONNX model files for decoding with `onnxruntime`_, we can use
+
+.. code-block:: bash
+
+ # ./tdnn/onnx_pretrained.py requires kaldifeat
+ #
+ # Please refer to https://csukuangfj.github.io/kaldifeat/installation/from_wheels.html
+ # for how to install kaldifeat
+
+ pip install kaldifeat==1.25.0.dev20230726+cpu.torch2.0.0 -f https://csukuangfj.github.io/kaldifeat/cpu.html
+
+ ./tdnn/onnx_pretrained.py \
+ --nn-model ./tdnn/exp/model-epoch-14-avg-2.onnx \
+ --HLG ./data/lang_phone/HLG.pt \
+ --words-file ./data/lang_phone/words.txt \
+ download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+ download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+The output is given below:
+
+.. code-block:: bash
+
+ 2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:166] {'feature_dim': 23, 'sample_rate': 8000, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'nn_model': './tdnn/exp/model-epoch-14-avg-2.onnx', 'words_file': './data/lang_phone/words.txt', 'HLG': './data/lang_phone/HLG.pt', 'sound_files': ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']}
+ 2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:171] device: cpu
+ 2023-08-16 21:03:24,260 INFO [onnx_pretrained.py:173] Loading onnx model ./tdnn/exp/model-epoch-14-avg-2.onnx
+ 2023-08-16 21:03:24,267 INFO [onnx_pretrained.py:176] Loading HLG from ./data/lang_phone/HLG.pt
+ 2023-08-16 21:03:24,270 INFO [onnx_pretrained.py:180] Constructing Fbank computer
+ 2023-08-16 21:03:24,273 INFO [onnx_pretrained.py:190] Reading sound files: ['download/waves_yesno/0_0_0_1_0_0_0_1.wav', 'download/waves_yesno/0_0_1_0_0_0_1_0.wav']
+ 2023-08-16 21:03:24,279 INFO [onnx_pretrained.py:196] Decoding started
+ 2023-08-16 21:03:24,318 INFO [onnx_pretrained.py:232]
+ download/waves_yesno/0_0_0_1_0_0_0_1.wav:
+ NO NO NO YES NO NO NO YES
+
+ download/waves_yesno/0_0_1_0_0_0_1_0.wav:
+ NO NO YES NO NO NO YES NO
+
+
+ 2023-08-16 21:03:24,318 INFO [onnx_pretrained.py:234] Decoding Done
+
+.. note::
+
+ To use the ``int8`` ONNX model for decoding, please use:
+
+ .. code-block:: bash
+
+ ./tdnn/onnx_pretrained.py \
+ --nn-model ./tdnn/exp/model-epoch-14-avg-2.int8.onnx \
+ --HLG ./data/lang_phone/HLG.pt \
+ --words-file ./data/lang_phone/words.txt \
+ download/waves_yesno/0_0_0_1_0_0_0_1.wav \
+ download/waves_yesno/0_0_1_0_0_0_1_0.wav
+
+For the more curious
+--------------------
+
+If you are wondering how to deploy the model without ``torch``, please
+continue reading. We will show how to use `sherpa-onnx`_ to run the
+exported ONNX models, which depends only on `onnxruntime`_ and does not
+depend on ``torch``.
+
+In this tutorial, we will only demonstrate the usage of `sherpa-onnx`_ with the
+pre-trained model of the `yesno`_ recipe. There are also two other frameworks
+available:
+
+ - `sherpa`_. It works with torchscript models.
+ - `sherpa-ncnn`_. It works with models exported using :ref:`icefall_export_to_ncnn` with `ncnn`_.
+
+Please see ``_ for further details.
diff --git a/docs/source/for-dummies/training.rst b/docs/source/for-dummies/training.rst
new file mode 100644
index 000000000..816ef2d3b
--- /dev/null
+++ b/docs/source/for-dummies/training.rst
@@ -0,0 +1,39 @@
+.. _dummies_tutorial_training:
+
+Training
+========
+
+After :ref:`dummies_tutorial_data_preparation`, we can start training.
+
+The command to start the training is quite simple:
+
+.. code-block:: bash
+
+ cd /tmp/icefall
+ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+ cd egs/yesno/ASR
+
+ # We use CPU for training by setting the following environment variable
+ export CUDA_VISIBLE_DEVICES=""
+
+ ./tdnn/train.py
+
+That's it!
+
+You can find the training logs below:
+
+.. literalinclude:: ./code/train-yesno.txt
+
+For the more curious
+--------------------
+
+.. code-block:: bash
+
+ ./tdnn/train.py --help
+
+will print the usage information of ``./tdnn/train.py``. For instance, you
+can specify the number of epochs to train and the location to save the training
+results.
+
+The training text logs are saved in ``tdnn/exp/log`` while the tensorboard
+logs are in ``tdnn/exp/tensorboard``.
diff --git a/docs/source/huggingface/index.rst b/docs/source/huggingface/index.rst
new file mode 100644
index 000000000..bd731793b
--- /dev/null
+++ b/docs/source/huggingface/index.rst
@@ -0,0 +1,13 @@
+Huggingface
+===========
+
+This section describes how to find pre-trained models.
+It also demonstrates how to try them from within your browser
+without installing anything by using
+`Huggingface spaces `_.
+
+.. toctree::
+ :maxdepth: 2
+
+ pretrained-models
+ spaces
diff --git a/docs/source/huggingface/pic/hugging-face-sherpa-2.png b/docs/source/huggingface/pic/hugging-face-sherpa-2.png
new file mode 100644
index 000000000..3b47bd51b
Binary files /dev/null and b/docs/source/huggingface/pic/hugging-face-sherpa-2.png differ
diff --git a/docs/source/huggingface/pic/hugging-face-sherpa-3.png b/docs/source/huggingface/pic/hugging-face-sherpa-3.png
new file mode 100644
index 000000000..1d7a2d316
Binary files /dev/null and b/docs/source/huggingface/pic/hugging-face-sherpa-3.png differ
diff --git a/docs/source/huggingface/pic/hugging-face-sherpa.png b/docs/source/huggingface/pic/hugging-face-sherpa.png
new file mode 100644
index 000000000..dea0b1d46
Binary files /dev/null and b/docs/source/huggingface/pic/hugging-face-sherpa.png differ
diff --git a/docs/source/huggingface/pretrained-models.rst b/docs/source/huggingface/pretrained-models.rst
new file mode 100644
index 000000000..8ae22f76f
--- /dev/null
+++ b/docs/source/huggingface/pretrained-models.rst
@@ -0,0 +1,17 @@
+Pre-trained models
+==================
+
+We have uploaded pre-trained models for all recipes in ``icefall``
+to ``_.
+
+You can find them by visiting the following link:
+
+``_.
+
+You can also find links of pre-trained models for a specific recipe
+by looking at the corresponding ``RESULTS.md``. For instance:
+
+ - ``_
+ - ``_
+ - ``_
+ - ``_
diff --git a/docs/source/huggingface/spaces.rst b/docs/source/huggingface/spaces.rst
new file mode 100644
index 000000000..e718c3731
--- /dev/null
+++ b/docs/source/huggingface/spaces.rst
@@ -0,0 +1,65 @@
+Huggingface spaces
+==================
+
+We have integrated the server framework
+`sherpa `_
+with `Huggingface spaces `_
+so that you can try pre-trained models from within your browser
+without the need to download or install anything.
+
+All you need is a browser, running on Windows, macOS, Linux, or even on your
+iPad or phone.
+
+Start your browser and visit the following address:
+
+``_
+
+and you will see a page like the following screenshot:
+
+.. image:: ./pic/hugging-face-sherpa.png
+ :alt: screenshot of ``_
+ :target: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition
+
+You can:
+
+ 1. Select a language for recognition. Currently, we provide pre-trained models
+ from ``icefall`` for the following languages: ``Chinese``, ``English``, and
+ ``Chinese+English``.
+ 2. After selecting the target language, you can select a pre-trained model
+ corresponding to the language.
+ 3. Select the decoding method. Currently, it provides ``greedy search``
+ and ``modified_beam_search``.
+ 4. If you selected ``modified_beam_search``, you can choose the number of
+ active paths during the search.
+ 5. Either upload a file or record your speech for recognition.
+ 6. Click the button ``Submit for recognition``.
+ 7. Wait for a moment and you will get the recognition results.
+
+The following screenshot shows an example when selecting ``Chinese+English``:
+
+.. image:: ./pic/hugging-face-sherpa-3.png
+ :alt: screenshot of ``_
+ :target: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition
+
+
+In the bottom part of the page, you can find a table of examples. You can click
+one of them and then click ``Submit for recognition``.
+
+.. image:: ./pic/hugging-face-sherpa-2.png
+ :alt: screenshot of ``_
+ :target: https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition
+
+YouTube Video
+-------------
+
+We provide the following YouTube video demonstrating how to use
+``_.
+
+.. note::
+
+ To get the latest news about `next-gen Kaldi `_, please subscribe to
+ the following YouTube channel by `Nadira Povey `_:
+
+ ``_
+
+.. youtube:: ElN3r9dkKE4
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 000000000..fb539d3f2
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,44 @@
+.. icefall documentation master file, created by
+ sphinx-quickstart on Mon Aug 23 16:07:39 2021.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Icefall
+=======
+
+.. image:: _static/logo.png
+ :alt: icefall logo
+ :width: 168px
+ :align: center
+ :target: https://github.com/k2-fsa/icefall
+
+
+Documentation for `icefall `_, containing
+speech recognition recipes using `k2 `_.
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+ for-dummies/index.rst
+ installation/index
+ docker/index
+ faqs
+ model-export/index
+
+
+.. toctree::
+ :maxdepth: 3
+
+ recipes/index
+
+.. toctree::
+ :maxdepth: 2
+
+ contributing/index
+ huggingface/index
+
+.. toctree::
+ :maxdepth: 2
+
+ decoding-with-langugage-models/index
diff --git a/docs/source/installation/images/README.md b/docs/source/installation/images/README.md
new file mode 100644
index 000000000..97c1e993c
--- /dev/null
+++ b/docs/source/installation/images/README.md
@@ -0,0 +1,4 @@
+
+# Introduction
+
+ is used to generate files in this directory.
diff --git a/docs/source/installation/images/device-CPU_CUDA-orange.svg b/docs/source/installation/images/device-CPU_CUDA-orange.svg
new file mode 100644
index 000000000..a023a1283
--- /dev/null
+++ b/docs/source/installation/images/device-CPU_CUDA-orange.svg
@@ -0,0 +1 @@
+
diff --git a/docs/source/installation/images/k2-gt-v1.9-blueviolet.svg b/docs/source/installation/images/k2-gt-v1.9-blueviolet.svg
new file mode 100644
index 000000000..3019ff03d
--- /dev/null
+++ b/docs/source/installation/images/k2-gt-v1.9-blueviolet.svg
@@ -0,0 +1 @@
+
diff --git a/docs/source/installation/images/os-Linux_macOS-ff69b4.svg b/docs/source/installation/images/os-Linux_macOS-ff69b4.svg
new file mode 100644
index 000000000..178813ed4
--- /dev/null
+++ b/docs/source/installation/images/os-Linux_macOS-ff69b4.svg
@@ -0,0 +1 @@
+
diff --git a/docs/source/installation/images/python-gt-v3.6-blue.svg b/docs/source/installation/images/python-gt-v3.6-blue.svg
new file mode 100644
index 000000000..df677ad09
--- /dev/null
+++ b/docs/source/installation/images/python-gt-v3.6-blue.svg
@@ -0,0 +1 @@
+
diff --git a/docs/source/installation/images/torch-gt-v1.6.0-green.svg b/docs/source/installation/images/torch-gt-v1.6.0-green.svg
new file mode 100644
index 000000000..d7007d742
--- /dev/null
+++ b/docs/source/installation/images/torch-gt-v1.6.0-green.svg
@@ -0,0 +1 @@
+
diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst
new file mode 100644
index 000000000..5a034ef5b
--- /dev/null
+++ b/docs/source/installation/index.rst
@@ -0,0 +1,550 @@
+.. _install icefall:
+
+Installation
+============
+
+.. hint::
+
+ We also provide :ref:`icefall_docker` support, which has already set up
+ the environment for you.
+
+.. hint::
+
+ We have a Colab notebook that guides you step by step through setting up
+ the environment.
+
+ |yesno colab notebook|
+
+ .. |yesno colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+ :target: https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing
+
+`icefall`_ depends on `k2`_ and `lhotse`_.
+
+We recommend that you use the following steps to install the dependencies.
+
+- (0) Install CUDA toolkit and cuDNN
+- (1) Install `torch`_ and `torchaudio`_
+- (2) Install `k2`_
+- (3) Install `lhotse`_
+
+.. caution::
+
+ Installation order matters.
+
+(0) Install CUDA toolkit and cuDNN
+----------------------------------
+
+Please refer to
+`<https://k2-fsa.github.io/k2/installation/cuda-cudnn.html>`_
+to install CUDA and cuDNN.
+
+
+(1) Install torch and torchaudio
+--------------------------------
+
+Please refer to `<https://pytorch.org/>`_ to install `torch`_ and `torchaudio`_.
+
+.. caution::
+
+ Please install torch and torchaudio at the same time.
+
+(2) Install k2
+--------------
+
+Please refer to `<https://k2-fsa.github.io/k2/installation/index.html>`_
+to install `k2`_.
+
+.. caution::
+
+ Please don't change your installed PyTorch after you have installed k2.
+
+.. note::
+
+ We suggest that you install k2 from pre-compiled wheels by following
+ `<https://k2-fsa.github.io/k2/installation/from_wheels.html>`_
+
+.. hint::
+
+ Please always install the latest version of `k2`_.
+
+(3) Install lhotse
+------------------
+
+Please refer to `<https://lhotse.readthedocs.io/>`_
+to install `lhotse`_.
+
+.. hint::
+
+ We strongly recommend that you use::
+
+ pip install git+https://github.com/lhotse-speech/lhotse
+
+ to install the latest version of `lhotse`_.
+
+(4) Download icefall
+--------------------
+
+`icefall`_ is a collection of Python scripts; all you need to do is download it
+and set the environment variable ``PYTHONPATH`` to point to it.
+
+Assume you want to place `icefall`_ in the folder ``/tmp``. The
+following commands show you how to set up `icefall`_:
+
+.. code-block:: bash
+
+ cd /tmp
+ git clone https://github.com/k2-fsa/icefall
+ cd icefall
+ pip install -r requirements.txt
+ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+
+.. HINT::
+
+ You can put several versions of `icefall`_ in the same virtual environment.
+ To switch among different versions of `icefall`_, just set ``PYTHONPATH``
+ to point to the version you want.
+
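+To double-check which copy of `icefall`_ is currently active, you can run
+the following sketch (any Python interpreter with ``PYTHONPATH`` set will do):
+
+.. code-block:: python
+
+ # Print the location of the icefall package that PYTHONPATH selects.
+ import icefall
+
+ print(icefall.__file__)
+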
+Installation example
+--------------------
+
+The following shows an example of setting up the environment.
+
+(1) Create a virtual environment
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ kuangfangjun:~$ virtualenv -p python3.8 test-icefall
+ created virtual environment CPython3.8.0.final.0-64 in 9422ms
+ creator CPython3Posix(dest=/star-fj/fangjun/test-icefall, clear=False, no_vcs_ignore=False, global=False)
+ seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/star-fj/fangjun/.local/share/virtualenv)
+ added seed packages: pip==22.3.1, setuptools==65.6.3, wheel==0.38.4
+ activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator
+
+ kuangfangjun:~$ source test-icefall/bin/activate
+
+ (test-icefall) kuangfangjun:~$
+
+(2) Install CUDA toolkit and cuDNN
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You need to determine the version of CUDA toolkit to install.
+
+.. code-block:: bash
+
+ (test-icefall) kuangfangjun:~$ nvidia-smi | head -n 4
+
+ Wed Jul 26 21:57:49 2023
+ +-----------------------------------------------------------------------------+
+ | NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6 |
+ |-------------------------------+----------------------+----------------------+
+
+You can choose any CUDA version that is ``not`` greater than the version printed by ``nvidia-smi``.
+In our case, we can choose any version ``<= 11.6``.
+
+We will use ``CUDA 11.6`` in this example. Please follow
+`<https://k2-fsa.github.io/k2/installation/cuda-cudnn.html>`_
+to install CUDA toolkit and cuDNN if you have not done that before.
+
+After installing CUDA toolkit, you can use the following command to verify it:
+
+.. code-block:: bash
+
+ (test-icefall) kuangfangjun:~$ nvcc --version
+
+ nvcc: NVIDIA (R) Cuda compiler driver
+ Copyright (c) 2005-2019 NVIDIA Corporation
+ Built on Wed_Oct_23_19:24:38_PDT_2019
+ Cuda compilation tools, release 10.2, V10.2.89
+
+(3) Install torch and torchaudio
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since we have selected CUDA toolkit ``11.6``, we have to install a version of `torch`_
+that is compiled against CUDA ``11.6``. We select ``torch 1.13.0+cu116`` in this
+example.
+
+After selecting the version of `torch`_ to install, we need to also install
+a compatible version of `torchaudio`_, which is ``0.13.0+cu116`` in our case.
+
+Please refer to ``_
+to select an appropriate version of `torchaudio`_ to install if you use a different
+version of `torch`_.
+
+.. code-block:: bash
+
+ (test-icefall) kuangfangjun:~$ pip install torch==1.13.0+cu116 torchaudio==0.13.0+cu116 -f https://download.pytorch.org/whl/torch_stable.html
+
+ Looking in links: https://download.pytorch.org/whl/torch_stable.html
+ Collecting torch==1.13.0+cu116
+ Downloading https://download.pytorch.org/whl/cu116/torch-1.13.0%2Bcu116-cp38-cp38-linux_x86_64.whl (1983.0 MB)
+ ________________________________________ 2.0/2.0 GB 764.4 kB/s eta 0:00:00
+ Collecting torchaudio==0.13.0+cu116
+ Downloading https://download.pytorch.org/whl/cu116/torchaudio-0.13.0%2Bcu116-cp38-cp38-linux_x86_64.whl (4.2 MB)
+ ________________________________________ 4.2/4.2 MB 1.3 MB/s eta 0:00:00
+ Requirement already satisfied: typing-extensions in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from torch==1.13.0+cu116) (4.7.1)
+ Installing collected packages: torch, torchaudio
+ Successfully installed torch-1.13.0+cu116 torchaudio-0.13.0+cu116
+
+Verify that `torch`_ and `torchaudio`_ are successfully installed:
+
+.. code-block:: bash
+
+ (test-icefall) kuangfangjun:~$ python3 -c "import torch; print(torch.__version__)"
+
+ 1.13.0+cu116
+
+ (test-icefall) kuangfangjun:~$ python3 -c "import torchaudio; print(torchaudio.__version__)"
+
+ 0.13.0+cu116
+
+(4) Install k2
+~~~~~~~~~~~~~~
+
+We will install `k2`_ from pre-compiled wheels by following
+`<https://k2-fsa.github.io/k2/installation/from_wheels.html>`_.
+
+.. code-block:: bash
+
+ (test-icefall) kuangfangjun:~$ pip install k2==1.24.3.dev20230725+cuda11.6.torch1.13.0 -f https://k2-fsa.github.io/k2/cuda.html
+
+ Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
+ Looking in links: https://k2-fsa.github.io/k2/cuda.html
+ Collecting k2==1.24.3.dev20230725+cuda11.6.torch1.13.0
+ Downloading https://huggingface.co/csukuangfj/k2/resolve/main/ubuntu-cuda/k2-1.24.3.dev20230725%2Bcuda11.6.torch1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (104.3 MB)
+ ________________________________________ 104.3/104.3 MB 5.1 MB/s eta 0:00:00
+ Requirement already satisfied: torch==1.13.0 in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from k2==1.24.3.dev20230725+cuda11.6.torch1.13.0) (1.13.0+cu116)
+ Collecting graphviz
+ Using cached https://pypi.tuna.tsinghua.edu.cn/packages/de/5e/fcbb22c68208d39edff467809d06c9d81d7d27426460ebc598e55130c1aa/graphviz-0.20.1-py3-none-any.whl (47 kB)
+ Requirement already satisfied: typing-extensions in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from torch==1.13.0->k2==1.24.3.dev20230725+cuda11.6.torch1.13.0) (4.7.1)
+ Installing collected packages: graphviz, k2
+ Successfully installed graphviz-0.20.1 k2-1.24.3.dev20230725+cuda11.6.torch1.13.0
+
+.. hint::
+
+ Please refer to `<https://k2-fsa.github.io/k2/cuda.html>`_ for the available
+ pre-compiled wheels of `k2`_.
+
+Verify that `k2`_ has been installed successfully:
+
+.. code-block:: bash
+
+ (test-icefall) kuangfangjun:~$ python3 -m k2.version
+
+ Collecting environment information...
+
+ k2 version: 1.24.3
+ Build type: Release
+ Git SHA1: 4c05309499a08454997adf500b56dcc629e35ae5
+ Git date: Tue Jul 25 16:23:36 2023
+ Cuda used to build k2: 11.6
+ cuDNN used to build k2: 8.3.2
+ Python version used to build k2: 3.8
+ OS used to build k2: CentOS Linux release 7.9.2009 (Core)
+ CMake version: 3.27.0
+ GCC version: 9.3.1
+ CMAKE_CUDA_FLAGS: -Wno-deprecated-gpu-targets -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_35,code=sm_35 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_50,code=sm_50 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_60,code=sm_60 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_61,code=sm_61 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_70,code=sm_70 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_75,code=sm_75 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_80,code=sm_80 -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w --expt-extended-lambda -gencode arch=compute_86,code=sm_86 -DONNX_NAMESPACE=onnx_c2 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=integer_sign_change,--diag_suppress=useless_using_declaration,--diag_suppress=set_but_not_used,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=implicit_return_from_non_void_function,--diag_suppress=unsigned_compare_with_zero,--diag_suppress=declared_but_not_referenced,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda -D_GLIBCXX_USE_CXX11_ABI=0 --compiler-options -Wall --compiler-options -Wno-strict-overflow --compiler-options -Wno-unknown-pragmas
+ CMAKE_CXX_FLAGS: -D_GLIBCXX_USE_CXX11_ABI=0 -Wno-unused-variable -Wno-strict-overflow
+ PyTorch version used to build k2: 1.13.0+cu116
+ PyTorch is using Cuda: 11.6
+ NVTX enabled: True
+ With CUDA: True
+ Disable debug: True
+ Sync kernels : False
+ Disable checks: False
+ Max cpu memory allocate: 214748364800 bytes (or 200.0 GB)
+ k2 abort: False
+ __file__: /star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/version/version.py
+ _k2.__file__: /star-fj/fangjun/test-icefall/lib/python3.8/site-packages/_k2.cpython-38-x86_64-linux-gnu.so
+
+(5) Install lhotse
+~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ (test-icefall) kuangfangjun:~$ pip install git+https://github.com/lhotse-speech/lhotse
+
+ Collecting git+https://github.com/lhotse-speech/lhotse
+ Cloning https://github.com/lhotse-speech/lhotse to /tmp/pip-req-build-vq12fd5i
+ Running command git clone --filter=blob:none --quiet https://github.com/lhotse-speech/lhotse /tmp/pip-req-build-vq12fd5i
+ Resolved https://github.com/lhotse-speech/lhotse to commit 7640d663469b22cd0b36f3246ee9b849cd25e3b7
+ Installing build dependencies ... done
+ Getting requirements to build wheel ... done
+ Preparing metadata (pyproject.toml) ... done
+ Collecting cytoolz>=0.10.1
+ Downloading https://pypi.tuna.tsinghua.edu.cn/packages/1e/3b/a7828d575aa17fb7acaf1ced49a3655aa36dad7e16eb7e6a2e4df0dda76f/cytoolz-0.12.2-cp38-cp38-
+ manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
+ ________________________________________ 2.0/2.0 MB 33.2 MB/s eta 0:00:00
+ Collecting pyyaml>=5.3.1
+ Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c8/6b/6600ac24725c7388255b2f5add93f91e58a5d7efaf4af244fdbcc11a541b/PyYAML-6.0.1-cp38-cp38-ma
+ nylinux_2_17_x86_64.manylinux2014_x86_64.whl (736 kB)
+ ________________________________________ 736.6/736.6 kB 38.6 MB/s eta 0:00:00
+ Collecting dataclasses
+ Downloading https://pypi.tuna.tsinghua.edu.cn/packages/26/2f/1095cdc2868052dd1e64520f7c0d5c8c550ad297e944e641dbf1ffbb9a5d/dataclasses-0.6-py3-none-
+ any.whl (14 kB)
+ Requirement already satisfied: torchaudio in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (0.13.0+cu116)
+ Collecting lilcom>=1.1.0
+ Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a8/65/df0a69c52bd085ca1ad4e5c4c1a5c680e25f9477d8e49316c4ff1e5084a4/lilcom-1.7-cp38-cp38-many
+ linux_2_17_x86_64.manylinux2014_x86_64.whl (87 kB)
+ ________________________________________ 87.1/87.1 kB 8.7 MB/s eta 0:00:00
+ Collecting tqdm
+ Using cached https://pypi.tuna.tsinghua.edu.cn/packages/e6/02/a2cff6306177ae6bc73bc0665065de51dfb3b9db7373e122e2735faf0d97/tqdm-4.65.0-py3-none-any
+ .whl (77 kB)
+ Requirement already satisfied: numpy>=1.18.1 in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (1.24.4)
+ Collecting audioread>=2.1.9
+ Using cached https://pypi.tuna.tsinghua.edu.cn/packages/5d/cb/82a002441902dccbe427406785db07af10182245ee639ea9f4d92907c923/audioread-3.0.0.tar.gz (
+ 377 kB)
+ Preparing metadata (setup.py) ... done
+ Collecting tabulate>=0.8.1
+ Using cached https://pypi.tuna.tsinghua.edu.cn/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-
+ any.whl (35 kB)
+ Collecting click>=7.1.1
+ Downloading https://pypi.tuna.tsinghua.edu.cn/packages/1a/70/e63223f8116931d365993d4a6b7ef653a4d920b41d03de7c59499962821f/click-8.1.6-py3-none-any.
+ whl (97 kB)
+ ________________________________________ 97.9/97.9 kB 8.4 MB/s eta 0:00:00
+ Collecting packaging
+ Using cached https://pypi.tuna.tsinghua.edu.cn/packages/ab/c3/57f0601a2d4fe15de7a553c00adbc901425661bf048f2a22dfc500caf121/packaging-23.1-py3-none-
+ any.whl (48 kB)
+ Collecting intervaltree>=3.1.0
+ Downloading https://pypi.tuna.tsinghua.edu.cn/packages/50/fb/396d568039d21344639db96d940d40eb62befe704ef849b27949ded5c3bb/intervaltree-3.1.0.tar.gz
+ (32 kB)
+ Preparing metadata (setup.py) ... done
+ Requirement already satisfied: torch in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (1.13.0+cu116)
+ Collecting SoundFile>=0.10
+ Downloading https://pypi.tuna.tsinghua.edu.cn/packages/ad/bd/0602167a213d9184fc688b1086dc6d374b7ae8c33eccf169f9b50ce6568c/soundfile-0.12.1-py2.py3-
+ none-manylinux_2_17_x86_64.whl (1.3 MB)
+ ________________________________________ 1.3/1.3 MB 46.5 MB/s eta 0:00:00
+ Collecting toolz>=0.8.0
+ Using cached https://pypi.tuna.tsinghua.edu.cn/packages/7f/5c/922a3508f5bda2892be3df86c74f9cf1e01217c2b1f8a0ac4841d903e3e9/toolz-0.12.0-py3-none-any.whl (55 kB)
+ Collecting sortedcontainers<3.0,>=2.0
+ Using cached https://pypi.tuna.tsinghua.edu.cn/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
+ Collecting cffi>=1.0
+ Using cached https://pypi.tuna.tsinghua.edu.cn/packages/b7/8b/06f30caa03b5b3ac006de4f93478dbd0239e2a16566d81a106c322dc4f79/cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (442 kB)
+ Requirement already satisfied: typing-extensions in ./test-icefall/lib/python3.8/site-packages (from torch->lhotse==1.16.0.dev0+git.7640d66.clean) (4.7.1)
+ Collecting pycparser
+ Using cached https://pypi.tuna.tsinghua.edu.cn/packages/62/d5/5f610ebe421e85889f2e55e33b7f9a6795bd982198517d912eb1c76e1a53/pycparser-2.21-py2.py3-none-any.whl (118 kB)
+ Building wheels for collected packages: lhotse, audioread, intervaltree
+ Building wheel for lhotse (pyproject.toml) ... done
+ Created wheel for lhotse: filename=lhotse-1.16.0.dev0+git.7640d66.clean-py3-none-any.whl size=687627 sha256=cbf0a4d2d0b639b33b91637a4175bc251d6a021a069644ecb1a9f2b3a83d072a
+ Stored in directory: /tmp/pip-ephem-wheel-cache-wwtk90_m/wheels/7f/7a/8e/a0bf241336e2e3cb573e1e21e5600952d49f5162454f2e612f
+ Building wheel for audioread (setup.py) ... done
+ Created wheel for audioread: filename=audioread-3.0.0-py3-none-any.whl size=23704 sha256=5e2d3537c96ce9cf0f645a654c671163707bf8cb8d9e358d0e2b0939a85ff4c2
+ Stored in directory: /star-fj/fangjun/.cache/pip/wheels/e2/c3/9c/f19ae5a03f8862d9f0776b0c0570f1fdd60a119d90954e3f39
+ Building wheel for intervaltree (setup.py) ... done
+ Created wheel for intervaltree: filename=intervaltree-3.1.0-py2.py3-none-any.whl size=26098 sha256=2604170976cfffe0d2f678cb1a6e5b525f561cd50babe53d631a186734fec9f9
+ Stored in directory: /star-fj/fangjun/.cache/pip/wheels/f3/ed/2b/c179ebfad4e15452d6baef59737f27beb9bfb442e0620f7271
+ Successfully built lhotse audioread intervaltree
+ Installing collected packages: sortedcontainers, dataclasses, tqdm, toolz, tabulate, pyyaml, pycparser, packaging, lilcom, intervaltree, click, audioread, cytoolz, cffi, SoundFile, lhotse
+ Successfully installed SoundFile-0.12.1 audioread-3.0.0 cffi-1.15.1 click-8.1.6 cytoolz-0.12.2 dataclasses-0.6 intervaltree-3.1.0 lhotse-1.16.0.dev0+git.7640d66.clean lilcom-1.7 packaging-23.1 pycparser-2.21 pyyaml-6.0.1 sortedcontainers-2.4.0 tabulate-0.9.0 toolz-0.12.0 tqdm-4.65.0
+
+
+Verify that `lhotse`_ has been installed successfully:
+
+.. code-block:: bash
+
+ (test-icefall) kuangfangjun:~$ python3 -c "import lhotse; print(lhotse.__version__)"
+
+ 1.16.0.dev+git.7640d66.clean
+
+(6) Download icefall
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ (test-icefall) kuangfangjun:~$ cd /tmp/
+
+ (test-icefall) kuangfangjun:tmp$ git clone https://github.com/k2-fsa/icefall
+
+ Cloning into 'icefall'...
+ remote: Enumerating objects: 12942, done.
+ remote: Counting objects: 100% (67/67), done.
+ remote: Compressing objects: 100% (56/56), done.
+ remote: Total 12942 (delta 17), reused 35 (delta 6), pack-reused 12875
+ Receiving objects: 100% (12942/12942), 14.77 MiB | 9.29 MiB/s, done.
+ Resolving deltas: 100% (8835/8835), done.
+
+ (test-icefall) kuangfangjun:tmp$ cd icefall/
+
+ (test-icefall) kuangfangjun:icefall$ pip install -r ./requirements.txt
+
+Test Your Installation
+----------------------
+
+To test that your installation is successful, let us run
+the `yesno recipe <https://github.com/k2-fsa/icefall/tree/master/egs/yesno/ASR>`_
+on ``CPU``.
+
+Data preparation
+~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ (test-icefall) kuangfangjun:icefall$ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+
+ (test-icefall) kuangfangjun:icefall$ cd /tmp/icefall
+
+ (test-icefall) kuangfangjun:icefall$ cd egs/yesno/ASR
+
+ (test-icefall) kuangfangjun:ASR$ ./prepare.sh
+
+
+The log of running ``./prepare.sh`` is:
+
+.. code-block::
+
+ 2023-07-27 12:41:39 (prepare.sh:27:main) dl_dir: /tmp/icefall/egs/yesno/ASR/download
+ 2023-07-27 12:41:39 (prepare.sh:30:main) Stage 0: Download data
+ /tmp/icefall/egs/yesno/ASR/download/waves_yesno.tar.gz: 100%|___________________________________________________| 4.70M/4.70M [00:00<00:00, 11.1MB/s]
+ 2023-07-27 12:41:46 (prepare.sh:39:main) Stage 1: Prepare yesno manifest
+ 2023-07-27 12:41:50 (prepare.sh:45:main) Stage 2: Compute fbank for yesno
+ 2023-07-27 12:41:55,718 INFO [compute_fbank_yesno.py:65] Processing train
+ Extracting and storing features: 100%|_______________________________________________________________________________| 90/90 [00:01<00:00, 87.82it/s]
+ 2023-07-27 12:41:56,778 INFO [compute_fbank_yesno.py:65] Processing test
+ Extracting and storing features: 100%|______________________________________________________________________________| 30/30 [00:00<00:00, 256.92it/s]
+ 2023-07-27 12:41:57 (prepare.sh:51:main) Stage 3: Prepare lang
+ 2023-07-27 12:42:02 (prepare.sh:66:main) Stage 4: Prepare G
+ /project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):79
+ [I] Reading \data\ section.
+ /project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):140
+ [I] Reading \1-grams: section.
+ 2023-07-27 12:42:02 (prepare.sh:92:main) Stage 5: Compile HLG
+ 2023-07-27 12:42:07,275 INFO [compile_hlg.py:124] Processing data/lang_phone
+ 2023-07-27 12:42:07,276 INFO [lexicon.py:171] Converting L.pt to Linv.pt
+ 2023-07-27 12:42:07,309 INFO [compile_hlg.py:48] Building ctc_topo. max_token_id: 3
+ 2023-07-27 12:42:07,310 INFO [compile_hlg.py:52] Loading G.fst.txt
+ 2023-07-27 12:42:07,314 INFO [compile_hlg.py:62] Intersecting L and G
+ 2023-07-27 12:42:07,323 INFO [compile_hlg.py:64] LG shape: (4, None)
+ 2023-07-27 12:42:07,323 INFO [compile_hlg.py:66] Connecting LG
+ 2023-07-27 12:42:07,323 INFO [compile_hlg.py:68] LG shape after k2.connect: (4, None)
+ 2023-07-27 12:42:07,323 INFO [compile_hlg.py:70]
+ 2023-07-27 12:42:07,323 INFO [compile_hlg.py:71] Determinizing LG
+ 2023-07-27 12:42:07,341 INFO [compile_hlg.py:74]
+ 2023-07-27 12:42:07,341 INFO [compile_hlg.py:76] Connecting LG after k2.determinize
+ 2023-07-27 12:42:07,341 INFO [compile_hlg.py:79] Removing disambiguation symbols on LG
+ 2023-07-27 12:42:07,354 INFO [compile_hlg.py:91] LG shape after k2.remove_epsilon: (6, None)
+ 2023-07-27 12:42:07,445 INFO [compile_hlg.py:96] Arc sorting LG
+ 2023-07-27 12:42:07,445 INFO [compile_hlg.py:99] Composing H and LG
+ 2023-07-27 12:42:07,446 INFO [compile_hlg.py:106] Connecting LG
+ 2023-07-27 12:42:07,446 INFO [compile_hlg.py:109] Arc sorting LG
+ 2023-07-27 12:42:07,447 INFO [compile_hlg.py:111] HLG.shape: (8, None)
+ 2023-07-27 12:42:07,447 INFO [compile_hlg.py:127] Saving HLG.pt to data/lang_phone
+
+Training
+~~~~~~~~
+
+Now let us run the training part:
+
+.. code-block::
+
+ (test-icefall) kuangfangjun:ASR$ export CUDA_VISIBLE_DEVICES=""
+
+ (test-icefall) kuangfangjun:ASR$ ./tdnn/train.py
+
+.. CAUTION::
+
+ We use ``export CUDA_VISIBLE_DEVICES=""`` so that `icefall`_ uses CPU
+ even if there are GPUs available.
+
+.. hint::
+
+ In case you get a ``Segmentation fault (core dumped)`` error, please use:
+
+ .. code-block:: bash
+
+ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+ See more at `` if you are
+ interested.
+
+The training log is given below:
+
+.. code-block::
+
+ 2023-07-27 12:50:51,936 INFO [train.py:481] Training started
+ 2023-07-27 12:50:51,936 INFO [train.py:482] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'world_size': 1, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'seed': 42, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d66.clean', 'torch-version': '1.13.0+cu116', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '3fb0a43-clean', 'icefall-git-date': 'Thu Jul 27 12:36:05 2023', 'icefall-path': '/tmp/icefall', 'k2-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-1-1220091118-57c4d55446-sph26', 'IP address': '10.177.77.20'}}
+ 2023-07-27 12:50:51,941 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+ 2023-07-27 12:50:51,949 INFO [train.py:495] device: cpu
+ 2023-07-27 12:50:51,965 INFO [asr_datamodule.py:146] About to get train cuts
+ 2023-07-27 12:50:51,965 INFO [asr_datamodule.py:244] About to get train cuts
+ 2023-07-27 12:50:51,967 INFO [asr_datamodule.py:149] About to create train dataset
+ 2023-07-27 12:50:51,967 INFO [asr_datamodule.py:199] Using SingleCutSampler.
+ 2023-07-27 12:50:51,967 INFO [asr_datamodule.py:205] About to create train dataloader
+ 2023-07-27 12:50:51,968 INFO [asr_datamodule.py:218] About to get test cuts
+ 2023-07-27 12:50:51,968 INFO [asr_datamodule.py:252] About to get test cuts
+ 2023-07-27 12:50:52,565 INFO [train.py:422] Epoch 0, batch 0, loss[loss=1.065, over 2436.00 frames. ], tot_loss[loss=1.065, over 2436.00 frames. ], batch size: 4
+ 2023-07-27 12:50:53,681 INFO [train.py:422] Epoch 0, batch 10, loss[loss=0.4561, over 2828.00 frames. ], tot_loss[loss=0.7076, over 22192.90 frames.], batch size: 4
+ 2023-07-27 12:50:54,167 INFO [train.py:444] Epoch 0, validation loss=0.9002, over 18067.00 frames.
+ 2023-07-27 12:50:55,011 INFO [train.py:422] Epoch 0, batch 20, loss[loss=0.2555, over 2695.00 frames. ], tot_loss[loss=0.484, over 34971.47 frames. ], batch size: 5
+ 2023-07-27 12:50:55,331 INFO [train.py:444] Epoch 0, validation loss=0.4688, over 18067.00 frames.
+ 2023-07-27 12:50:55,368 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-0.pt
+ 2023-07-27 12:50:55,633 INFO [train.py:422] Epoch 1, batch 0, loss[loss=0.2532, over 2436.00 frames. ], tot_loss[loss=0.2532, over 2436.00 frames. ],
+ batch size: 4
+ 2023-07-27 12:50:56,242 INFO [train.py:422] Epoch 1, batch 10, loss[loss=0.1139, over 2828.00 frames. ], tot_loss[loss=0.1592, over 22192.90 frames.], batch size: 4
+ 2023-07-27 12:50:56,522 INFO [train.py:444] Epoch 1, validation loss=0.1627, over 18067.00 frames.
+ 2023-07-27 12:50:57,209 INFO [train.py:422] Epoch 1, batch 20, loss[loss=0.07055, over 2695.00 frames. ], tot_loss[loss=0.1175, over 34971.47 frames.], batch size: 5
+ 2023-07-27 12:50:57,600 INFO [train.py:444] Epoch 1, validation loss=0.07091, over 18067.00 frames.
+ 2023-07-27 12:50:57,640 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-1.pt
+ 2023-07-27 12:50:57,847 INFO [train.py:422] Epoch 2, batch 0, loss[loss=0.07731, over 2436.00 frames. ], tot_loss[loss=0.07731, over 2436.00 frames.], batch size: 4
+ 2023-07-27 12:50:58,427 INFO [train.py:422] Epoch 2, batch 10, loss[loss=0.04391, over 2828.00 frames. ], tot_loss[loss=0.05341, over 22192.90 frames. ], batch size: 4
+ 2023-07-27 12:50:58,884 INFO [train.py:444] Epoch 2, validation loss=0.04384, over 18067.00 frames.
+ 2023-07-27 12:50:59,387 INFO [train.py:422] Epoch 2, batch 20, loss[loss=0.03458, over 2695.00 frames. ], tot_loss[loss=0.04616, over 34971.47 frames. ], batch size: 5
+ 2023-07-27 12:50:59,707 INFO [train.py:444] Epoch 2, validation loss=0.03379, over 18067.00 frames.
+ 2023-07-27 12:50:59,758 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-2.pt
+
+ ... ...
+
+ 2023-07-27 12:51:23,433 INFO [train.py:422] Epoch 13, batch 0, loss[loss=0.01054, over 2436.00 frames. ], tot_loss[loss=0.01054, over 2436.00 frames. ], batch size: 4
+ 2023-07-27 12:51:23,980 INFO [train.py:422] Epoch 13, batch 10, loss[loss=0.009014, over 2828.00 frames. ], tot_loss[loss=0.009974, over 22192.90 frames. ], batch size: 4
+ 2023-07-27 12:51:24,489 INFO [train.py:444] Epoch 13, validation loss=0.01085, over 18067.00 frames.
+ 2023-07-27 12:51:25,258 INFO [train.py:422] Epoch 13, batch 20, loss[loss=0.01172, over 2695.00 frames. ], tot_loss[loss=0.01055, over 34971.47 frames. ], batch size: 5
+ 2023-07-27 12:51:25,621 INFO [train.py:444] Epoch 13, validation loss=0.01074, over 18067.00 frames.
+ 2023-07-27 12:51:25,699 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-13.pt
+ 2023-07-27 12:51:25,866 INFO [train.py:422] Epoch 14, batch 0, loss[loss=0.01044, over 2436.00 frames. ], tot_loss[loss=0.01044, over 2436.00 frames. ], batch size: 4
+ 2023-07-27 12:51:26,844 INFO [train.py:422] Epoch 14, batch 10, loss[loss=0.008942, over 2828.00 frames. ], tot_loss[loss=0.01, over 22192.90 frames. ], batch size: 4
+ 2023-07-27 12:51:27,221 INFO [train.py:444] Epoch 14, validation loss=0.01082, over 18067.00 frames.
+ 2023-07-27 12:51:27,970 INFO [train.py:422] Epoch 14, batch 20, loss[loss=0.01169, over 2695.00 frames. ], tot_loss[loss=0.01054, over 34971.47 frames. ], batch size: 5
+ 2023-07-27 12:51:28,247 INFO [train.py:444] Epoch 14, validation loss=0.01073, over 18067.00 frames.
+ 2023-07-27 12:51:28,323 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-14.pt
+ 2023-07-27 12:51:28,326 INFO [train.py:555] Done!
+
+Decoding
+~~~~~~~~
+
+Let us use the trained model to decode the test set:
+
+.. code-block::
+
+ (test-icefall) kuangfangjun:ASR$ ./tdnn/decode.py
+
+ 2023-07-27 12:55:12,840 INFO [decode.py:263] Decoding started
+ 2023-07-27 12:55:12,840 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d66.clean', 'torch-version': '1.13.0+cu116', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '3fb0a43-clean', 'icefall-git-date': 'Thu Jul 27 12:36:05 2023', 'icefall-path': '/tmp/icefall', 'k2-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-1-1220091118-57c4d55446-sph26', 'IP address': '10.177.77.20'}}
+ 2023-07-27 12:55:12,841 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+ 2023-07-27 12:55:12,855 INFO [decode.py:273] device: cpu
+ 2023-07-27 12:55:12,868 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+ 2023-07-27 12:55:12,882 INFO [asr_datamodule.py:218] About to get test cuts
+ 2023-07-27 12:55:12,883 INFO [asr_datamodule.py:252] About to get test cuts
+ 2023-07-27 12:55:13,157 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
+ 2023-07-27 12:55:13,701 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
+ 2023-07-27 12:55:13,702 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
+ 2023-07-27 12:55:13,704 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
+ 2023-07-27 12:55:13,704 INFO [decode.py:316] Done!
+
+
+**Congratulations!** You have successfully set up the environment and have run the first recipe in `icefall`_.
+
+Have fun with ``icefall``!
+
+YouTube Video
+-------------
+
+We provide the following YouTube video showing how to install `icefall`_.
+It also shows how to debug various problems that you may encounter while
+using `icefall`_.
+
+.. note::
+
+ To get the latest news of `next-gen Kaldi `_, please subscribe to
+ the following YouTube channel by `Nadira Povey `_:
+
+ ``_
+
+.. youtube:: LVmrBD0tLfE
diff --git a/docs/source/model-export/export-model-state-dict.rst b/docs/source/model-export/export-model-state-dict.rst
new file mode 100644
index 000000000..5596bb7a6
--- /dev/null
+++ b/docs/source/model-export/export-model-state-dict.rst
@@ -0,0 +1,135 @@
+Export model.state_dict()
+=========================
+
+When to use it
+--------------
+
+During model training, we save checkpoints periodically to disk.
+
+A checkpoint contains the following information:
+
+ - ``model.state_dict()``
+ - ``optimizer.state_dict()``
+ - and some other information related to training
+
+When we need to resume the training process from some point, we need a checkpoint.
+However, if we want to publish the model for inference, then only
+``model.state_dict()`` is needed. In this case, we need to strip all other information
+except ``model.state_dict()`` to reduce the file size of the published model.
+
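+The following is a minimal sketch of what this stripping amounts to. It is
+not the actual ``export.py``; it assumes the checkpoint stores the weights
+under the key ``model``, and ``epoch-20.pt`` is a hypothetical file name:
+
+.. code-block:: python
+
+ # Sketch: keep only the model weights from a full training checkpoint.
+ import torch
+
+ ckpt = torch.load("epoch-20.pt", map_location="cpu")  # full checkpoint
+ torch.save({"model": ckpt["model"]}, "pretrained.pt")  # weights only
+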
+How to export
+-------------
+
+Every recipe contains a file ``export.py`` that you can use to
+export ``model.state_dict()`` by taking some checkpoints as inputs.
+
+.. hint::
+
+ Each ``export.py`` contains well-documented usage information.
+
+In the following, we use
+`<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless3/export.py>`_
+as an example.
+
+.. note::
+
+ The steps for other recipes are almost the same.
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+
+ ./pruned_transducer_stateless3/export.py \
+ --exp-dir ./pruned_transducer_stateless3/exp \
+ --tokens data/lang_bpe_500/tokens.txt \
+ --epoch 20 \
+ --avg 10
+
+will generate a file ``pruned_transducer_stateless3/exp/pretrained.pt``, which
+is a dict containing ``{"model": model.state_dict()}`` saved by ``torch.save()``.
+
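+As a sketch, a consumer script loads the exported file like this (building
+``model`` with the matching hyperparameters is omitted here):
+
+.. code-block:: python
+
+ # Sketch: load the exported weights back into a freshly built model.
+ import torch
+
+ ckpt = torch.load("pruned_transducer_stateless3/exp/pretrained.pt", map_location="cpu")
+ model.load_state_dict(ckpt["model"])  # `model` is built beforehand
+ model.eval()
+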
+How to use the exported model
+-----------------------------
+
+For each recipe, we provide pretrained models hosted on Hugging Face.
+You can find links to pretrained models in ``RESULTS.md`` of each dataset.
+
+In the following, we demonstrate how to use the pretrained model from
+`<https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13>`_.
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+
+ git lfs install
+ git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
+
+After cloning the repo with ``git lfs``, you will find several files in the folder
+``icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp``
+that have a prefix ``pretrained-``. Those files contain ``model.state_dict()``
+exported by the above ``export.py``.
+
+In each recipe, there is also a file ``pretrained.py``, which can use
+``pretrained-xxx.pt`` to decode waves. The following is an example:
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+
+ ./pruned_transducer_stateless3/pretrained.py \
+ --checkpoint ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/pretrained-iter-1224000-avg-14.pt \
+ --tokens ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500/tokens.txt \
+ --method greedy_search \
+ ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1089-134686-0001.wav \
+ ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0001.wav \
+ ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/test_wavs/1221-135766-0002.wav
+
+The above commands show how to use the exported model with ``pretrained.py`` to
+decode multiple sound files. Its output is given as follows for reference:
+
+.. literalinclude:: ./code/export-model-state-dict-pretrained-out.txt
+
+Use the exported model to run decode.py
+---------------------------------------
+
+When we publish the model, we always note down its WERs on some test
+dataset in ``RESULTS.md``. This section describes how to use the
+pretrained model to reproduce the WER.
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+ git lfs install
+ git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13
+
+ cd icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp
+ ln -s pretrained-iter-1224000-avg-14.pt epoch-9999.pt
+ cd ../..
+
+We create a symlink with name ``epoch-9999.pt`` to ``pretrained-iter-1224000-avg-14.pt``,
+so that we can pass ``--epoch 9999 --avg 1`` to ``decode.py`` in the following
+command:
+
+.. code-block:: bash
+
+ ./pruned_transducer_stateless3/decode.py \
+ --epoch 9999 \
+ --avg 1 \
+ --exp-dir ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp \
+ --lang-dir ./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/data/lang_bpe_500 \
+ --max-duration 600 \
+ --decoding-method greedy_search
+
+You will find the decoding results in
+``./icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13/exp/greedy_search``.
+
+.. caution::
+
+ For some recipes, you also need to pass ``--use-averaged-model False``
+ to ``decode.py``. The reason is that the exported pretrained model is already
+ the averaged one.
+
+.. hint::
+
+ Before running ``decode.py``, we assume that you have already run
+ ``prepare.sh`` to prepare the test dataset.
diff --git a/docs/source/model-export/export-ncnn-conv-emformer.rst b/docs/source/model-export/export-ncnn-conv-emformer.rst
new file mode 100644
index 000000000..93392aee7
--- /dev/null
+++ b/docs/source/model-export/export-ncnn-conv-emformer.rst
@@ -0,0 +1,752 @@
+.. _export_conv_emformer_transducer_models_to_ncnn:
+
+Export ConvEmformer transducer models to ncnn
+=============================================
+
+We use the pre-trained model from the following repository as an example:
+
+ - `<https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05>`_
+
+We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
+
+.. hint::
+
+ We use ``Ubuntu 18.04``, ``torch 1.13``, and ``Python 3.8`` for testing.
+
+.. caution::
+
+ Please use a more recent version of PyTorch. For instance, ``torch 1.8``
+ may ``not`` work.
+
+1. Download the pre-trained model
+---------------------------------
+
+.. hint::
+
+ You can also refer to ``_ to download the pre-trained model.
+
+ You have to install `git-lfs`_ before you continue.
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+
+ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
+ cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05
+
+ git lfs pull --include "exp/pretrained-epoch-30-avg-10-averaged.pt"
+ git lfs pull --include "data/lang_bpe_500/bpe.model"
+
+ cd ..
+
+.. note::
+
+ We downloaded ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
+
+
+In the above code, we downloaded the pre-trained model into the directory
+``egs/librispeech/ASR/icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05``.
+
+.. _export_for_ncnn_install_ncnn_and_pnnx:
+
+2. Install ncnn and pnnx
+------------------------
+
+.. code-block:: bash
+
+ # We put ncnn into $HOME/open-source/ncnn
+ # You can change it to anywhere you like
+
+ cd $HOME
+ mkdir -p open-source
+ cd open-source
+
+ git clone https://github.com/csukuangfj/ncnn
+ cd ncnn
+ git submodule update --recursive --init
+
+ # Note: We don't use "python setup.py install" or "pip install ." here
+
+ mkdir -p build-wheel
+ cd build-wheel
+
+ cmake \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DNCNN_PYTHON=ON \
+ -DNCNN_BUILD_BENCHMARK=OFF \
+ -DNCNN_BUILD_EXAMPLES=OFF \
+ -DNCNN_BUILD_TOOLS=ON \
+ ..
+
+ make -j4
+
+ cd ..
+
+ # Note: $PWD here is $HOME/open-source/ncnn
+
+ export PYTHONPATH=$PWD/python:$PYTHONPATH
+ export PATH=$PWD/tools/pnnx/build/src:$PATH
+ export PATH=$PWD/build-wheel/tools/quantize:$PATH
+
+ # Now build pnnx
+ cd tools/pnnx
+ mkdir build
+ cd build
+ cmake ..
+ make -j4
+
+ ./src/pnnx
+
+Congratulations! You have successfully installed the following components:
+
+ - ``pnnx``, which is an executable located in
+ ``$HOME/open-source/ncnn/tools/pnnx/build/src``. We will use
+ it to convert models exported by ``torch.jit.trace()``.
+ - ``ncnn2int8``, which is an executable located in
+ ``$HOME/open-source/ncnn/build-wheel/tools/quantize``. We will use
+ it to quantize our models to ``int8``.
+ - ``ncnn.cpython-38-x86_64-linux-gnu.so``, which is a Python module located
+ in ``$HOME/open-source/ncnn/python/ncnn``.
+
+ .. note::
+
+ I am using ``Python 3.8``, so it
+ is ``ncnn.cpython-38-x86_64-linux-gnu.so``. If you use a different
+ version, say, ``Python 3.9``, the name would be
+ ``ncnn.cpython-39-x86_64-linux-gnu.so``.
+
+ Also, if you are not using Linux, the file name will be different, but
+ that does not matter: as long as you can compile it, it should work.
+
+We have set up ``PYTHONPATH`` so that you can use ``import ncnn`` in your
+Python code. We have also set up ``PATH`` so that you can use
+``pnnx`` and ``ncnn2int8`` later in your terminal.
+
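+A quick sanity check for the Python module (a sketch; the printed path should
+point into ``$HOME/open-source/ncnn/python``):
+
+.. code-block:: python
+
+ # Verify that the self-built ncnn Python module is the one being imported.
+ import ncnn
+
+ print(ncnn.__file__)
+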
+.. caution::
+
+ Please don't use the official `ncnn`_ repository at
+ `<https://github.com/Tencent/ncnn>`_.
+ We have made some modifications to the official `ncnn`_.
+
+ We will synchronize `<https://github.com/csukuangfj/ncnn>`_ periodically
+ with the official one.
+
+3. Export the model via torch.jit.trace()
+-----------------------------------------
+
+First, let us rename our pre-trained model:
+
+.. code-block::
+
+ cd egs/librispeech/ASR
+
+ cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp
+
+ ln -s pretrained-epoch-30-avg-10-averaged.pt epoch-30.pt
+
+ cd ../..
+
+Next, we use the following code to export our model:
+
+.. code-block:: bash
+
+ dir=./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/
+
+ ./conv_emformer_transducer_stateless2/export-for-ncnn.py \
+ --exp-dir $dir/exp \
+ --tokens $dir/data/lang_bpe_500/tokens.txt \
+ --epoch 30 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --num-encoder-layers 12 \
+ --chunk-length 32 \
+ --cnn-module-kernel 31 \
+ --left-context-length 32 \
+ --right-context-length 8 \
+ --memory-size 32 \
+ --encoder-dim 512
+
+.. caution::
+
+ If your model has different configuration parameters, please change them accordingly.
+
+.. hint::
+
+ We have renamed our model to ``epoch-30.pt`` so that we can use ``--epoch 30``.
+ There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
+
+ If you have trained a model by yourself and if you have all checkpoints
+ available, please first use ``decode.py`` to tune ``--epoch --avg``
+ and select the best combination with ``--use-averaged-model 1``.
+
+.. note::
+
+ You will see the following log output:
+
+ .. literalinclude:: ./code/export-conv-emformer-transducer-for-ncnn-output.txt
+
+ The log shows the model has ``75490012`` parameters, i.e., ``~75 M``.
+
+ .. code-block::
+
+ ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
+
+ -rw-r--r-- 1 kuangfangjun root 289M Jan 11 12:05 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/pretrained-epoch-30-avg-10-averaged.pt
+
+ You can see that the file size of the pre-trained model is ``289 MB``, which
+ is roughly equal to ``75490012*4/1024/1024 = 287.97 MB``.
+
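+The parameter count in the log is simply the sum over all model tensors.
+A sketch, assuming ``model`` is the PyTorch model built by the export script:
+
+.. code-block:: python
+
+ # Sketch: reproduce the parameter count and the size estimate above.
+ num_params = sum(p.numel() for p in model.parameters())
+ print(num_params)  # 75490012 for this model
+ print(num_params * 4 / 1024 / 1024)  # float32 uses 4 bytes/param, ~287.97 MB
+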
+After running ``conv_emformer_transducer_stateless2/export-for-ncnn.py``,
+we will get the following files:
+
+.. code-block:: bash
+
+ ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*pnnx*
+
+ -rw-r--r-- 1 kuangfangjun root 1010K Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.pt
+ -rw-r--r-- 1 kuangfangjun root 283M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.pt
+ -rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 12:15 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.pt
+
+
+.. _conv-emformer-step-4-export-torchscript-model-via-pnnx:
+
+4. Export torchscript model via pnnx
+------------------------------------
+
+.. hint::
+
+ Make sure you have set up the ``PATH`` environment variable. Otherwise,
+ it will throw an error saying that ``pnnx`` could not be found.
+
+Now, it's time to export our models to `ncnn`_ via ``pnnx``.
+
+.. code-block::
+
+ cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+ pnnx ./encoder_jit_trace-pnnx.pt
+ pnnx ./decoder_jit_trace-pnnx.pt
+ pnnx ./joiner_jit_trace-pnnx.pt
+
+It will generate the following files:
+
+.. code-block:: bash
+
+ ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*ncnn*{bin,param}
+
+ -rw-r--r-- 1 kuangfangjun root 503K Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 437 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
+ -rw-r--r-- 1 kuangfangjun root 142M Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 79K Jan 11 12:36 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
+ -rw-r--r-- 1 kuangfangjun root 1.5M Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 488 Jan 11 12:38 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
+
+There are two types of files:
+
+- ``param``: It is a text file containing the model architectures. You can
+ use a text editor to view its content.
+- ``bin``: It is a binary file containing the model parameters.
+
+Below we compare the file sizes of the models before and after conversion via ``pnnx``:
+
+.. see https://tableconvert.com/restructuredtext-generator
+
++----------------------------------+------------+
+| File name | File size |
++==================================+============+
+| encoder_jit_trace-pnnx.pt | 283 MB |
++----------------------------------+------------+
+| decoder_jit_trace-pnnx.pt | 1010 KB |
++----------------------------------+------------+
+| joiner_jit_trace-pnnx.pt | 3.0 MB |
++----------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin | 142 MB |
++----------------------------------+------------+
+| decoder_jit_trace-pnnx.ncnn.bin | 503 KB |
++----------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin | 1.5 MB |
++----------------------------------+------------+
+
+You can see that after conversion the models are about half the size of the
+originals:
+
+ - encoder: 283 MB vs 142 MB
+ - decoder: 1010 KB vs 503 KB
+ - joiner: 3.0 MB vs 1.5 MB
+
+The reason is that by default ``pnnx`` converts ``float32`` parameters
+to ``float16``. A ``float32`` parameter occupies 4 bytes, while a ``float16``
+parameter occupies only 2 bytes. Thus, the converted models are roughly half
+the size.
+
+.. hint::
+
+ If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
+ won't convert ``float32`` to ``float16``.
+
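+If you want to check the shrink ratio yourself, here is a small sketch; it
+assumes you run it inside the ``exp`` directory that contains both the
+``.pt`` files and the ``.ncnn.bin`` files:
+
+.. code-block:: python
+
+ # Sketch: compare on-disk sizes before and after pnnx conversion.
+ import os
+
+ for name in ["encoder", "decoder", "joiner"]:
+     pt = os.path.getsize(f"{name}_jit_trace-pnnx.pt")
+     bin_size = os.path.getsize(f"{name}_jit_trace-pnnx.ncnn.bin")
+     print(name, round(pt / bin_size, 2))  # roughly 2.0: float32 -> float16
+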
+5. Test the exported models in icefall
+--------------------------------------
+
+.. note::
+
+ We assume you have set up the environment variable ``PYTHONPATH`` when
+ building `ncnn`_.
+
+Now we have successfully converted our pre-trained model to `ncnn`_ format.
+The 6 generated files are all we need. You can use the following command to
+test the converted models:
+
+.. code-block:: bash
+
+ ./conv_emformer_transducer_stateless2/streaming-ncnn-decode.py \
+ --tokens ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/data/lang_bpe_500/tokens.txt \
+ --encoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param \
+ --encoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin \
+ --decoder-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param \
+ --decoder-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin \
+ --joiner-param-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param \
+ --joiner-bin-filename ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin \
+ ./icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/test_wavs/1089-134686-0001.wav
+
+.. hint::
+
+ `ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
+ only 1 wave file as input.
+
+The output is given below:
+
+.. literalinclude:: ./code/test-streaming-ncnn-decode-conv-emformer-transducer-libri.txt
+
+Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
+
+
+.. _conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn:
+
+6. Modify the exported encoder for sherpa-ncnn
+----------------------------------------------
+
+In order to use the exported models in `sherpa-ncnn`_, we have to modify
+``encoder_jit_trace-pnnx.ncnn.param``.
+
+Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
+
+.. code-block::
+
+ 7767517
+ 1060 1342
+ Input in0 0 1 in0
+
+**Explanation** of the above three lines:
+
+ 1. ``7767517`` is a magic number and must not be changed.
+ 2. ``1060 1342``: the first number, ``1060``, specifies the number of layers
+ in this file, while ``1342`` specifies the number of intermediate outputs
+ of this file.
+ 3. ``Input in0 0 1 in0``: ``Input`` is the layer type; ``in0``
+ is the layer name; ``0`` means this layer has no input;
+ ``1`` means this layer has one output; the final ``in0`` is the output name of
+ this layer.
+
+We need to add 1 extra line and increment the number of layers.
+The result looks like the following:
+
+.. code-block:: bash
+
+ 7767517
+ 1061 1342
+ SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
+ Input in0 0 1 in0
+
+**Explanation**
+
+ 1. ``7767517`` stays the same.
+ 2. ``1061 1342``, we have added an extra layer, so we need to update ``1060`` to ``1061``.
+ We don't need to change ``1342`` since the newly added layer has no inputs or outputs.
+ 3. ``SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512``
+ This line is newly added. Its explanation is given below:
+
+ - ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
+ - ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
+ - ``0 0`` means this layer has no inputs or outputs. Must be ``0 0``.
+ - ``0=1``: 0 is the key and 1 is the value. MUST be ``0=1``.
+ - ``1=12``, 1 is the key and 12 is the value of the
+ parameter ``--num-encoder-layers`` that you provided when running
+ ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+ - ``2=32``, 2 is the key and 32 is the value of the
+ parameter ``--memory-size`` that you provided when running
+ ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+ - ``3=31``, 3 is the key and 31 is the value of the
+ parameter ``--cnn-module-kernel`` that you provided when running
+ ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+ - ``4=8``, 4 is the key and 8 is the value of the
+ parameter ``--left-context-length`` that you provided when running
+ ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+ - ``5=32``, 5 is the key and 32 is the value of the
+ parameter ``--chunk-length`` that you provided when running
+ ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+ - ``6=8``, 6 is the key and 8 is the value of the
+ parameter ``--right-context-length`` that you provided when running
+ ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+ - ``7=512``, 7 is the key and 512 is the value of the
+ parameter ``--encoder-dim`` that you provided when running
+ ``conv_emformer_transducer_stateless2/export-for-ncnn.py``.
+
+ For ease of reference, we list the key-value pairs that you need to add
+ in the following table. If your model has a different setting, please
+ change the values for ``SherpaMetaData`` accordingly. Otherwise, you
+ will be ``SAD``.
+
+ +------+-----------------------------+
+ | key | value |
+ +======+=============================+
+ | 0 | 1 (fixed) |
+ +------+-----------------------------+
+ | 1 | ``--num-encoder-layers`` |
+ +------+-----------------------------+
+ | 2 | ``--memory-size`` |
+ +------+-----------------------------+
+ | 3 | ``--cnn-module-kernel`` |
+ +------+-----------------------------+
+ | 4 | ``--left-context-length`` |
+ +------+-----------------------------+
+ | 5 | ``--chunk-length`` |
+ +------+-----------------------------+
+ | 6 | ``--right-context-length`` |
+ +------+-----------------------------+
+ | 7 | ``--encoder-dim`` |
+ +------+-----------------------------+
+
+ 4. ``Input in0 0 1 in0``. No need to change it.
+
+.. caution::
+
+ When you add a new layer ``SherpaMetaData``, please remember to update the
+ number of layers. In our case, update ``1060`` to ``1061``. Otherwise,
+ you will be SAD later.
+
+.. hint::
+
+ After adding the new layer ``SherpaMetaData``, you cannot use this model
+ with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
+ supported only in `sherpa-ncnn`_.
+
+.. hint::
+
+ `ncnn`_ is very flexible. You can add new layers to it just by text-editing
+ the ``param`` file! You don't need to change the ``bin`` file.
+
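+If you prefer not to edit the file by hand, the following hypothetical helper
+performs the same edit programmatically; the ``SherpaMetaData`` values below
+must match the flags you passed to ``export-for-ncnn.py``:
+
+.. code-block:: python
+
+ # Sketch: insert the SherpaMetaData line and bump the layer count.
+ def add_sherpa_meta(param_file: str, meta: str) -> None:
+     with open(param_file) as f:
+         lines = f.read().splitlines()
+     magic, counts = lines[0], lines[1].split()
+     counts[0] = str(int(counts[0]) + 1)  # one extra layer: 1060 -> 1061
+     out = [magic, " ".join(counts), meta] + lines[2:]
+     with open(param_file, "w") as f:
+         f.write("\n".join(out) + "\n")
+
+ add_sherpa_meta(
+     "encoder_jit_trace-pnnx.ncnn.param",
+     "SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512",
+ )
+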
+Now you can use this model in `sherpa-ncnn`_.
+Please refer to the following documentation:
+
+ - Linux/macOS/Windows/arm/aarch64: ``_
+ - ``Android``: ``_
+ - ``iOS``: ``_
+ - Python: ``_
+
+We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
+
+ - ``_
+
+ You can find more usages there.
+
+7. (Optional) int8 quantization with sherpa-ncnn
+------------------------------------------------
+
+This step is optional.
+
+In this step, we describe how to quantize our model with ``int8``.
+
+Change :ref:`conv-emformer-step-4-export-torchscript-model-via-pnnx` to
+disable ``fp16`` when using ``pnnx``:
+
+.. code-block::
+
+ cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+ pnnx ./encoder_jit_trace-pnnx.pt fp16=0
+ pnnx ./decoder_jit_trace-pnnx.pt
+ pnnx ./joiner_jit_trace-pnnx.pt fp16=0
+
+.. note::
+
+ We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not
+ support quantizing the decoder model yet. We will update this documentation
+ once `ncnn`_ supports it (maybe later in 2023).
+
+It will generate the following files:
+
+.. code-block:: bash
+
+ ls -lh icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/*_jit_trace-pnnx.ncnn.{param,bin}
+
+ -rw-r--r-- 1 kuangfangjun root 503K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 437 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/decoder_jit_trace-pnnx.ncnn.param
+ -rw-r--r-- 1 kuangfangjun root 283M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 79K Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/encoder_jit_trace-pnnx.ncnn.param
+ -rw-r--r-- 1 kuangfangjun root 3.0M Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 488 Jan 11 15:56 icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/joiner_jit_trace-pnnx.ncnn.param
+
+Let us compare the file sizes again:
+
++----------------------------------------+------------+
+| File name | File size |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.pt | 283 MB |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.pt | 1010 KB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.pt | 3.0 MB |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
++----------------------------------------+------------+
+
+You can see that the file sizes are doubled when we disable ``fp16``.
+
+.. note::
+
+ You can again use ``streaming-ncnn-decode.py`` to test the exported models.
+
+Next, follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
+to modify ``encoder_jit_trace-pnnx.ncnn.param``.
+
+Change
+
+.. code-block:: bash
+
+ 7767517
+ 1060 1342
+ Input in0 0 1 in0
+
+to
+
+.. code-block:: bash
+
+ 7767517
+ 1061 1342
+ SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512
+ Input in0 0 1 in0
+
+.. caution::
+
+ Please follow :ref:`conv-emformer-modify-the-exported-encoder-for-sherpa-ncnn`
+ to change the values for ``SherpaMetaData`` if your model uses a different setting.
+
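+If you prefer to script this one-line edit, below is a minimal sketch using
+GNU ``sed``. It assumes the default settings shown above; please back up the
+``param`` file first and adjust the values if your model differs.
+
+.. code-block:: bash
+
+ cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+ # Bump the layer count on line 2 and append the SherpaMetaData
+ # line right after it (values assume the default settings above).
+ sed -i -e '2s/^1060 /1061 /' \
+ -e '2a SherpaMetaData sherpa_meta_data1 0 0 0=1 1=12 2=32 3=31 4=8 5=32 6=8 7=512' \
+ encoder_jit_trace-pnnx.ncnn.param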
+
+Next, let us compile `sherpa-ncnn`_ since we will quantize our models within
+`sherpa-ncnn`_.
+
+.. code-block:: bash
+
+ # We will download sherpa-ncnn to $HOME/open-source/
+ # You can change it to anywhere you like.
+ cd $HOME
+ mkdir -p open-source
+
+ cd open-source
+ git clone https://github.com/k2-fsa/sherpa-ncnn
+ cd sherpa-ncnn
+ mkdir build
+ cd build
+ cmake ..
+ make -j 4
+
+ ./bin/generate-int8-scale-table
+
+ export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH
+
+The output of the above commands is:
+
+.. code-block:: bash
+
+ (py38) kuangfangjun:build$ generate-int8-scale-table
+ Please provide 10 arg. Currently given: 1
+ Usage:
+ generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt
+
+ Each line in wave_filenames.txt is a path to some 16k Hz mono wave file.
+
+We need to create a file ``wave_filenames.txt``, in which we put
+some calibration wave files. For testing purposes, we use the ``test_wavs``
+from the pre-trained model repository ``_
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+ cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+ cat <<EOF > wave_filenames.txt
+ ../test_wavs/1089-134686-0001.wav
+ ../test_wavs/1221-135766-0001.wav
+ ../test_wavs/1221-135766-0002.wav
+ EOF
+
+Now we can calculate the scales needed for quantization with the calibration data:
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+ cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+ generate-int8-scale-table \
+ ./encoder_jit_trace-pnnx.ncnn.param \
+ ./encoder_jit_trace-pnnx.ncnn.bin \
+ ./decoder_jit_trace-pnnx.ncnn.param \
+ ./decoder_jit_trace-pnnx.ncnn.bin \
+ ./joiner_jit_trace-pnnx.ncnn.param \
+ ./joiner_jit_trace-pnnx.ncnn.bin \
+ ./encoder-scale-table.txt \
+ ./joiner-scale-table.txt \
+ ./wave_filenames.txt
+
+The output logs are as follows:
+
+.. literalinclude:: ./code/generate-int-8-scale-table-for-conv-emformer.txt
+
+It generates the following two files:
+
+.. code-block:: bash
+
+ $ ls -lh encoder-scale-table.txt joiner-scale-table.txt
+ -rw-r--r-- 1 kuangfangjun root 955K Jan 11 17:28 encoder-scale-table.txt
+ -rw-r--r-- 1 kuangfangjun root 18K Jan 11 17:28 joiner-scale-table.txt
+
+.. caution::
+
+ In practice, you need more calibration data to compute an accurate scale table.
+
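+For example, here is a minimal sketch for building a larger calibration list,
+assuming you have a directory of 16 kHz mono wave files (we reuse
+``../test_wavs`` below; point it at a real calibration set in practice):
+
+.. code-block:: bash
+
+ # Collect all wave files into wave_filenames.txt
+ find ../test_wavs -name '*.wav' > wave_filenames.txt
+ wc -l wave_filenames.txt
+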
+Finally, let us use the scale table to quantize our models into ``int8``.
+
+.. code-block:: bash
+
+ ncnn2int8
+
+ usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table]
+
+First, we quantize the encoder model:
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+ cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+ ncnn2int8 \
+ ./encoder_jit_trace-pnnx.ncnn.param \
+ ./encoder_jit_trace-pnnx.ncnn.bin \
+ ./encoder_jit_trace-pnnx.ncnn.int8.param \
+ ./encoder_jit_trace-pnnx.ncnn.int8.bin \
+ ./encoder-scale-table.txt
+
+Next, we quantize the joiner model:
+
+.. code-block:: bash
+
+ ncnn2int8 \
+ ./joiner_jit_trace-pnnx.ncnn.param \
+ ./joiner_jit_trace-pnnx.ncnn.bin \
+ ./joiner_jit_trace-pnnx.ncnn.int8.param \
+ ./joiner_jit_trace-pnnx.ncnn.int8.bin \
+ ./joiner-scale-table.txt
+
+The above two commands generate the following 4 files:
+
+.. code-block:: bash
+
+ -rw-r--r-- 1 kuangfangjun root 99M Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.bin
+ -rw-r--r-- 1 kuangfangjun root 78K Jan 11 17:34 encoder_jit_trace-pnnx.ncnn.int8.param
+ -rw-r--r-- 1 kuangfangjun root 774K Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.bin
+ -rw-r--r-- 1 kuangfangjun root 496 Jan 11 17:35 joiner_jit_trace-pnnx.ncnn.int8.param
+
+Congratulations! You have successfully quantized your model from ``float32`` to ``int8``.
+
+.. caution::
+
+ ``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs.
+
+ You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param``
+ and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like.
+
+ For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can
+ replace the following invocation:
+
+ .. code-block:: bash
+
+ cd egs/librispeech/ASR
+ cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+ sherpa-ncnn \
+ ../data/lang_bpe_500/tokens.txt \
+ ./encoder_jit_trace-pnnx.ncnn.param \
+ ./encoder_jit_trace-pnnx.ncnn.bin \
+ ./decoder_jit_trace-pnnx.ncnn.param \
+ ./decoder_jit_trace-pnnx.ncnn.bin \
+ ./joiner_jit_trace-pnnx.ncnn.param \
+ ./joiner_jit_trace-pnnx.ncnn.bin \
+ ../test_wavs/1089-134686-0001.wav
+
+ with
+
+ .. code-block::
+
+ cd egs/librispeech/ASR
+ cd icefall-asr-librispeech-conv-emformer-transducer-stateless2-2022-07-05/exp/
+
+ sherpa-ncnn \
+ ../data/lang_bpe_500/tokens.txt \
+ ./encoder_jit_trace-pnnx.ncnn.int8.param \
+ ./encoder_jit_trace-pnnx.ncnn.int8.bin \
+ ./decoder_jit_trace-pnnx.ncnn.param \
+ ./decoder_jit_trace-pnnx.ncnn.bin \
+ ./joiner_jit_trace-pnnx.ncnn.param \
+ ./joiner_jit_trace-pnnx.ncnn.bin \
+ ../test_wavs/1089-134686-0001.wav
+
+
+The following table compares the file sizes again:
+
+
++----------------------------------------+------------+
+| File name | File size |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.pt | 283 MB |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.pt | 1010 KB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.pt | 3.0 MB |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 142 MB |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 283 MB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.int8.bin | 99 MB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.int8.bin | 774 KB |
++----------------------------------------+------------+
+
+You can see that the file sizes of the models after ``int8`` quantization
+are much smaller.
+
+.. hint::
+
+ Currently, only linear layers and convolutional layers are quantized
+ with ``int8``, so you don't see an exact ``4x`` reduction in file sizes.
+
+.. note::
+
+ You need to test the recognition accuracy after ``int8`` quantization.
+
+You can find the speed comparison at ``_.
+
+
+That's it! Have fun with `sherpa-ncnn`_!
diff --git a/docs/source/model-export/export-ncnn-lstm.rst b/docs/source/model-export/export-ncnn-lstm.rst
new file mode 100644
index 000000000..310c3d8e4
--- /dev/null
+++ b/docs/source/model-export/export-ncnn-lstm.rst
@@ -0,0 +1,644 @@
+.. _export_lstm_transducer_models_to_ncnn:
+
+Export LSTM transducer models to ncnn
+-------------------------------------
+
+We use the pre-trained model from the following repository as an example:
+
+``_
+
+We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
+
+.. hint::
+
+ We use ``Ubuntu 18.04``, ``torch 1.13``, and ``Python 3.8`` for testing.
+
+.. caution::
+
+ Please use a more recent version of PyTorch. For instance, ``torch 1.8``
+ may ``not`` work.
+
+1. Download the pre-trained model
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. hint::
+
+ You have to install `git-lfs`_ before you continue.
+
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
+ cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
+
+ git lfs pull --include "exp/pretrained-iter-468000-avg-16.pt"
+ git lfs pull --include "data/lang_bpe_500/bpe.model"
+
+ cd ..
+
+.. note::
+
+ We downloaded ``exp/pretrained-xxx.pt``, not ``exp/cpu-jit_xxx.pt``.
+
+In the above code, we downloaded the pre-trained model into the directory
+``egs/librispeech/ASR/icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03``.
+
+2. Install ncnn and pnnx
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Please refer to :ref:`export_for_ncnn_install_ncnn_and_pnnx` .
+
+
+3. Export the model via torch.jit.trace()
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+First, let us rename our pre-trained model:
+
+.. code-block::
+
+ cd egs/librispeech/ASR
+
+ cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp
+
+ ln -s pretrained-iter-468000-avg-16.pt epoch-99.pt
+
+ cd ../..
+
+Next, we use the following code to export our model:
+
+.. code-block:: bash
+
+ dir=./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03
+
+ ./lstm_transducer_stateless2/export-for-ncnn.py \
+ --exp-dir $dir/exp \
+ --tokens $dir/data/lang_bpe_500/tokens.txt \
+ --epoch 99 \
+ --avg 1 \
+ --use-averaged-model 0 \
+ --num-encoder-layers 12 \
+ --encoder-dim 512 \
+ --rnn-hidden-size 1024
+
+.. hint::
+
+ We have renamed our model to ``epoch-99.pt`` so that we can use ``--epoch 99``.
+ There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
+
+ If you have trained a model by yourself and if you have all checkpoints
+ available, please first use ``decode.py`` to tune ``--epoch --avg``
+ and select the best combination with ``--use-averaged-model 1``.
+
+.. note::
+
+ You will see the following log output:
+
+ .. literalinclude:: ./code/export-lstm-transducer-for-ncnn-output.txt
+
+ The log shows the model has ``84176356`` parameters, i.e., ``~84 M``.
+
+ .. code-block::
+
+ ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/pretrained-iter-468000-avg-16.pt
+
+ -rw-r--r-- 1 kuangfangjun root 324M Feb 17 10:34 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/pretrained-iter-468000-avg-16.pt
+
+ You can see that the file size of the pre-trained model is ``324 MB``, which
+ is roughly equal to ``84176356*4/1024/1024 = 321.107 MB``.
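+
+ You can verify this arithmetic quickly, e.g.:
+
+ .. code-block:: bash
+
+ python3 -c 'print(84176356 * 4 / 1024 / 1024)'
+ # ~321.107 MB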
+
+After running ``lstm_transducer_stateless2/export-for-ncnn.py``,
+we will get the following files:
+
+.. code-block:: bash
+
+ ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/*pnnx.pt
+
+ -rw-r--r-- 1 kuangfangjun root 1010K Feb 17 11:22 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.pt
+ -rw-r--r-- 1 kuangfangjun root 318M Feb 17 11:22 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.pt
+ -rw-r--r-- 1 kuangfangjun root 3.0M Feb 17 11:22 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.pt
+
+
+.. _lstm-transducer-step-4-export-torchscript-model-via-pnnx:
+
+4. Export torchscript model via pnnx
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. hint::
+
+ Make sure you have set up the ``PATH`` environment variable
+ in :ref:`export_for_ncnn_install_ncnn_and_pnnx`. Otherwise,
+ it will throw an error saying that ``pnnx`` could not be found.
+
+Now, it's time to export our models to `ncnn`_ via ``pnnx``.
+
+.. code-block::
+
+ cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
+
+ pnnx ./encoder_jit_trace-pnnx.pt
+ pnnx ./decoder_jit_trace-pnnx.pt
+ pnnx ./joiner_jit_trace-pnnx.pt
+
+It will generate the following files:
+
+.. code-block:: bash
+
+ ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/*ncnn*{bin,param}
+
+ -rw-r--r-- 1 kuangfangjun root 503K Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 437 Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.param
+ -rw-r--r-- 1 kuangfangjun root 159M Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 21K Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.param
+ -rw-r--r-- 1 kuangfangjun root 1.5M Feb 17 11:33 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 488 Feb 17 11:33 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.param
+
+
+There are two types of files:
+
+- ``param``: It is a text file containing the model architectures. You can
+ use a text editor to view its content.
+- ``bin``: It is a binary file containing the model parameters.
+
+Below we compare the file sizes of the models before and after conversion via ``pnnx``:
+
+.. see https://tableconvert.com/restructuredtext-generator
+
++----------------------------------+------------+
+| File name | File size |
++==================================+============+
+| encoder_jit_trace-pnnx.pt | 318 MB |
++----------------------------------+------------+
+| decoder_jit_trace-pnnx.pt | 1010 KB |
++----------------------------------+------------+
+| joiner_jit_trace-pnnx.pt | 3.0 MB |
++----------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin | 159 MB |
++----------------------------------+------------+
+| decoder_jit_trace-pnnx.ncnn.bin | 503 KB |
++----------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin | 1.5 MB |
++----------------------------------+------------+
+
+You can see that the file sizes of the models after conversion are about one half
+of those before conversion:
+
+ - encoder: 318 MB vs 159 MB
+ - decoder: 1010 KB vs 503 KB
+ - joiner: 3.0 MB vs 1.5 MB
+
+The reason is that by default ``pnnx`` converts ``float32`` parameters
+to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
+for ``float16``. Thus, the file size is roughly halved after conversion.
+
+.. hint::
+
+ If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
+ won't convert ``float32`` to ``float16``.
+
+5. Test the exported models in icefall
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. note::
+
+ We assume you have set up the environment variable ``PYTHONPATH`` when
+ building `ncnn`_.
+
+Now we have successfully converted our pre-trained model to `ncnn`_ format.
+The six generated files are what we need. You can use the following code to
+test the converted models:
+
+.. code-block:: bash
+
+ python3 ./lstm_transducer_stateless2/streaming-ncnn-decode.py \
+ --tokens ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/data/lang_bpe_500/tokens.txt \
+ --encoder-param-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.param \
+ --encoder-bin-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.bin \
+ --decoder-param-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.param \
+ --decoder-bin-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.bin \
+ --joiner-param-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.param \
+ --joiner-bin-filename ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.bin \
+ ./icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/test_wavs/1089-134686-0001.wav
+
+.. hint::
+
+ `ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
+ only 1 wave file as input.
+
+The output is given below:
+
+.. literalinclude:: ./code/test-streaming-ncnn-decode-lstm-transducer-libri.txt
+
+Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
+
+.. _lstm-modify-the-exported-encoder-for-sherpa-ncnn:
+
+6. Modify the exported encoder for sherpa-ncnn
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In order to use the exported models in `sherpa-ncnn`_, we have to modify
+``encoder_jit_trace-pnnx.ncnn.param``.
+
+Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
+
+.. code-block::
+
+ 7767517
+ 267 379
+ Input in0 0 1 in0
+
+**Explanation** of the above three lines:
+
+ 1. ``7767517``, it is a magic number and should not be changed.
+ 2. ``267 379``, the first number ``267`` specifies the number of layers
+ in this file, while ``379`` specifies the number of intermediate outputs
+ of this file.
+ 3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
+ is the layer name of this layer; ``0`` means this layer has no input;
+ ``1`` means this layer has one output; ``in0`` is the output name of
+ this layer.
+
+We need to add 1 extra line and also increment the number of layers.
+The result looks like the following:
+
+.. code-block:: bash
+
+ 7767517
+ 268 379
+ SherpaMetaData sherpa_meta_data1 0 0 0=3 1=12 2=512 3=1024
+ Input in0 0 1 in0
+
+**Explanation**
+
+ 1. ``7767517``, it is still the same
+ 2. ``268 379``, we have added an extra layer, so we need to update ``267`` to ``268``.
+ We don't need to change ``379`` since the newly added layer has no inputs or outputs.
+ 3. ``SherpaMetaData sherpa_meta_data1 0 0 0=3 1=12 2=512 3=1024``
+ This line is newly added. Its explanation is given below:
+
+ - ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
+ - ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
+ - ``0 0`` means this layer has no inputs or outputs. Must be ``0 0``
+ - ``0=3``, 0 is the key and 3 is the value. MUST be ``0=3``
+ - ``1=12``, 1 is the key and 12 is the value of the
+ parameter ``--num-encoder-layers`` that you provided when running
+ ``./lstm_transducer_stateless2/export-for-ncnn.py``.
+ - ``2=512``, 2 is the key and 512 is the value of the
+ parameter ``--encoder-dim`` that you provided when running
+ ``./lstm_transducer_stateless2/export-for-ncnn.py``.
+ - ``3=1024``, 3 is the key and 1024 is the value of the
+ parameter ``--rnn-hidden-size`` that you provided when running
+ ``./lstm_transducer_stateless2/export-for-ncnn.py``.
+
+ For ease of reference, we list the key-value pairs that you need to add
+ in the following table. If your model has a different setting, please
+ change the values for ``SherpaMetaData`` accordingly. Otherwise, you
+ will be ``SAD``.
+
+ +------+-----------------------------+
+ | key | value |
+ +======+=============================+
+ | 0 | 3 (fixed) |
+ +------+-----------------------------+
+ | 1 | ``--num-encoder-layers`` |
+ +------+-----------------------------+
+ | 2 | ``--encoder-dim`` |
+ +------+-----------------------------+
+ | 3 | ``--rnn-hidden-size`` |
+ +------+-----------------------------+
+
+ 4. ``Input in0 0 1 in0``. No need to change it.
+
+.. caution::
+
+ When you add a new layer ``SherpaMetaData``, please remember to update the
+ number of layers. In our case, update ``267`` to ``268``. Otherwise,
+ you will be SAD later.
+
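+If you prefer to script this edit, here is a minimal sketch using GNU
+``sed``, assuming the default settings shown above (back up the ``param``
+file first):
+
+.. code-block:: bash
+
+ cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
+
+ # Bump the layer count on line 2 and append the SherpaMetaData
+ # line right after it.
+ sed -i -e '2s/^267 /268 /' \
+ -e '2a SherpaMetaData sherpa_meta_data1 0 0 0=3 1=12 2=512 3=1024' \
+ encoder_jit_trace-pnnx.ncnn.param
+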
+.. hint::
+
+ After adding the new layer ``SherpaMetaData``, you cannot use this model
+ with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
+ supported only in `sherpa-ncnn`_.
+
+.. hint::
+
+ `ncnn`_ is very flexible. You can add new layers to it just by text-editing
+ the ``param`` file! You don't need to change the ``bin`` file.
+
+Now you can use this model in `sherpa-ncnn`_.
+Please refer to the following documentation:
+
+ - Linux/macOS/Windows/arm/aarch64: ``_
+ - ``Android``: ``_
+ - ``iOS``: ``_
+ - Python: ``_
+
+We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
+
+ - ``_
+
+ You can find more usage examples there.
+
+7. (Optional) int8 quantization with sherpa-ncnn
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This step is optional.
+
+In this step, we describe how to quantize our model with ``int8``.
+
+Change :ref:`lstm-transducer-step-4-export-torchscript-model-via-pnnx` to
+disable ``fp16`` when using ``pnnx``:
+
+.. code-block::
+
+ cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
+
+ pnnx ./encoder_jit_trace-pnnx.pt fp16=0
+ pnnx ./decoder_jit_trace-pnnx.pt
+ pnnx ./joiner_jit_trace-pnnx.pt fp16=0
+
+.. note::
+
+ We add ``fp16=0`` when exporting the encoder and joiner. `ncnn`_ does not
+ support quantizing the decoder model yet. We will update this documentation
+ once `ncnn`_ supports it (possibly later in 2023).
+
+It will generate the following files:
+
+.. code-block:: bash
+
+ ls -lh icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/*_jit_trace-pnnx.ncnn.{param,bin}
+
+ -rw-r--r-- 1 kuangfangjun root 503K Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 437 Feb 17 11:32 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/decoder_jit_trace-pnnx.ncnn.param
+ -rw-r--r-- 1 kuangfangjun root 317M Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 21K Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/encoder_jit_trace-pnnx.ncnn.param
+ -rw-r--r-- 1 kuangfangjun root 3.0M Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 488 Feb 17 11:54 icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/joiner_jit_trace-pnnx.ncnn.param
+
+
+Let us compare the file sizes again:
+
++----------------------------------------+------------+
+| File name | File size |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.pt | 318 MB |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.pt | 1010 KB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.pt | 3.0 MB |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 159 MB |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 317 MB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
++----------------------------------------+------------+
+
+You can see that the file sizes are doubled when we disable ``fp16``.
+
+.. note::
+
+ You can again use ``streaming-ncnn-decode.py`` to test the exported models.
+
+Next, follow :ref:`lstm-modify-the-exported-encoder-for-sherpa-ncnn`
+to modify ``encoder_jit_trace-pnnx.ncnn.param``.
+
+Change
+
+.. code-block:: bash
+
+ 7767517
+ 267 379
+ Input in0 0 1 in0
+
+to
+
+.. code-block:: bash
+
+ 7767517
+ 268 379
+ SherpaMetaData sherpa_meta_data1 0 0 0=3 1=12 2=512 3=1024
+ Input in0 0 1 in0
+
+.. caution::
+
+ Please follow :ref:`lstm-modify-the-exported-encoder-for-sherpa-ncnn`
+ to change the values for ``SherpaMetaData`` if your model uses a different setting.
+
+Next, let us compile `sherpa-ncnn`_ since we will quantize our models within
+`sherpa-ncnn`_.
+
+.. code-block:: bash
+
+ # We will download sherpa-ncnn to $HOME/open-source/
+ # You can change it to anywhere you like.
+ cd $HOME
+ mkdir -p open-source
+
+ cd open-source
+ git clone https://github.com/k2-fsa/sherpa-ncnn
+ cd sherpa-ncnn
+ mkdir build
+ cd build
+ cmake ..
+ make -j 4
+
+ ./bin/generate-int8-scale-table
+
+ export PATH=$HOME/open-source/sherpa-ncnn/build/bin:$PATH
+
+The output of the above commands is:
+
+.. code-block:: bash
+
+ (py38) kuangfangjun:build$ generate-int8-scale-table
+ Please provide 10 arg. Currently given: 1
+ Usage:
+ generate-int8-scale-table encoder.param encoder.bin decoder.param decoder.bin joiner.param joiner.bin encoder-scale-table.txt joiner-scale-table.txt wave_filenames.txt
+
+ Each line in wave_filenames.txt is a path to some 16k Hz mono wave file.
+
+We need to create a file ``wave_filenames.txt``, in which we put
+some calibration wave files. For testing purposes, we use the ``test_wavs``
+from the pre-trained model repository
+``_
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+ cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
+
+ cat <<EOF > wave_filenames.txt
+ ../test_wavs/1089-134686-0001.wav
+ ../test_wavs/1221-135766-0001.wav
+ ../test_wavs/1221-135766-0002.wav
+ EOF
+
+Now we can calculate the scales needed for quantization with the calibration data:
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+ cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
+
+ generate-int8-scale-table \
+ ./encoder_jit_trace-pnnx.ncnn.param \
+ ./encoder_jit_trace-pnnx.ncnn.bin \
+ ./decoder_jit_trace-pnnx.ncnn.param \
+ ./decoder_jit_trace-pnnx.ncnn.bin \
+ ./joiner_jit_trace-pnnx.ncnn.param \
+ ./joiner_jit_trace-pnnx.ncnn.bin \
+ ./encoder-scale-table.txt \
+ ./joiner-scale-table.txt \
+ ./wave_filenames.txt
+
+The output logs are as follows:
+
+.. literalinclude:: ./code/generate-int-8-scale-table-for-lstm.txt
+
+It generates the following two files:
+
+.. code-block:: bash
+
+ ls -lh encoder-scale-table.txt joiner-scale-table.txt
+
+ -rw-r--r-- 1 kuangfangjun root 345K Feb 17 12:13 encoder-scale-table.txt
+ -rw-r--r-- 1 kuangfangjun root 17K Feb 17 12:13 joiner-scale-table.txt
+
+.. caution::
+
+ In practice, you need more calibration data to compute an accurate scale table.
+
+Finally, let us use the scale table to quantize our models into ``int8``.
+
+.. code-block:: bash
+
+ ncnn2int8
+
+ usage: ncnn2int8 [inparam] [inbin] [outparam] [outbin] [calibration table]
+
+First, we quantize the encoder model:
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+ cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
+
+ ncnn2int8 \
+ ./encoder_jit_trace-pnnx.ncnn.param \
+ ./encoder_jit_trace-pnnx.ncnn.bin \
+ ./encoder_jit_trace-pnnx.ncnn.int8.param \
+ ./encoder_jit_trace-pnnx.ncnn.int8.bin \
+ ./encoder-scale-table.txt
+
+Next, we quantize the joiner model:
+
+.. code-block:: bash
+
+ ncnn2int8 \
+ ./joiner_jit_trace-pnnx.ncnn.param \
+ ./joiner_jit_trace-pnnx.ncnn.bin \
+ ./joiner_jit_trace-pnnx.ncnn.int8.param \
+ ./joiner_jit_trace-pnnx.ncnn.int8.bin \
+ ./joiner-scale-table.txt
+
+The above two commands generate the following 4 files:
+
+.. code-block::
+
+ -rw-r--r-- 1 kuangfangjun root 218M Feb 17 12:19 encoder_jit_trace-pnnx.ncnn.int8.bin
+ -rw-r--r-- 1 kuangfangjun root 21K Feb 17 12:19 encoder_jit_trace-pnnx.ncnn.int8.param
+ -rw-r--r-- 1 kuangfangjun root 774K Feb 17 12:19 joiner_jit_trace-pnnx.ncnn.int8.bin
+ -rw-r--r-- 1 kuangfangjun root 496 Feb 17 12:19 joiner_jit_trace-pnnx.ncnn.int8.param
+
+Congratulations! You have successfully quantized your model from ``float32`` to ``int8``.
+
+.. caution::
+
+ ``ncnn.int8.param`` and ``ncnn.int8.bin`` must be used in pairs.
+
+ You can replace ``ncnn.param`` and ``ncnn.bin`` with ``ncnn.int8.param``
+ and ``ncnn.int8.bin`` in `sherpa-ncnn`_ if you like.
+
+ For instance, to use only the ``int8`` encoder in ``sherpa-ncnn``, you can
+ replace the following invocation:
+
+ .. code-block::
+
+ cd egs/librispeech/ASR
+ cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
+
+ sherpa-ncnn \
+ ../data/lang_bpe_500/tokens.txt \
+ ./encoder_jit_trace-pnnx.ncnn.param \
+ ./encoder_jit_trace-pnnx.ncnn.bin \
+ ./decoder_jit_trace-pnnx.ncnn.param \
+ ./decoder_jit_trace-pnnx.ncnn.bin \
+ ./joiner_jit_trace-pnnx.ncnn.param \
+ ./joiner_jit_trace-pnnx.ncnn.bin \
+ ../test_wavs/1089-134686-0001.wav
+
+ with
+
+ .. code-block:: bash
+
+ cd egs/librispeech/ASR
+ cd icefall-asr-librispeech-lstm-transducer-stateless2-2022-09-03/exp/
+
+ sherpa-ncnn \
+ ../data/lang_bpe_500/tokens.txt \
+ ./encoder_jit_trace-pnnx.ncnn.int8.param \
+ ./encoder_jit_trace-pnnx.ncnn.int8.bin \
+ ./decoder_jit_trace-pnnx.ncnn.param \
+ ./decoder_jit_trace-pnnx.ncnn.bin \
+ ./joiner_jit_trace-pnnx.ncnn.param \
+ ./joiner_jit_trace-pnnx.ncnn.bin \
+ ../test_wavs/1089-134686-0001.wav
+
+The following table compares the file sizes again:
+
++----------------------------------------+------------+
+| File name | File size |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.pt | 318 MB |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.pt | 1010 KB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.pt | 3.0 MB |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp16) | 159 MB |
++----------------------------------------+------------+
+| decoder_jit_trace-pnnx.ncnn.bin (fp16) | 503 KB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin (fp16) | 1.5 MB |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin (fp32) | 317 MB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin (fp32) | 3.0 MB |
++----------------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.int8.bin | 218 MB |
++----------------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.int8.bin | 774 KB |
++----------------------------------------+------------+
+
+You can see that the file size of the joiner model after ``int8`` quantization
+is much smaller. However, the size of the encoder model is even larger than
+its ``fp16`` counterpart. The reason is that `ncnn`_ currently does not support
+quantizing ``LSTM`` layers into ``8-bit``. Please see
+``_
+
+.. hint::
+
+ Currently, only linear layers and convolutional layers are quantized
+ with ``int8``, so you don't see an exact ``4x`` reduction in file sizes.
+
+.. note::
+
+ You need to test the recognition accuracy after ``int8`` quantization.
+
+
+That's it! Have fun with `sherpa-ncnn`_!
diff --git a/docs/source/model-export/export-ncnn-zipformer.rst b/docs/source/model-export/export-ncnn-zipformer.rst
new file mode 100644
index 000000000..a5845b0e4
--- /dev/null
+++ b/docs/source/model-export/export-ncnn-zipformer.rst
@@ -0,0 +1,387 @@
+.. _export_streaming_zipformer_transducer_models_to_ncnn:
+
+Export streaming Zipformer transducer models to ncnn
+----------------------------------------------------
+
+We use the pre-trained model from the following repository as an example:
+
+``_
+
+We will show you step by step how to export it to `ncnn`_ and run it with `sherpa-ncnn`_.
+
+.. hint::
+
+ We use ``Ubuntu 18.04``, ``torch 1.13``, and ``Python 3.8`` for testing.
+
+.. caution::
+
+ Please use a more recent version of PyTorch. For instance, ``torch 1.8``
+ may ``not`` work.
+
+1. Download the pre-trained model
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. hint::
+
+ You have to install `git-lfs`_ before you continue.
+
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+
+ git lfs pull --include "exp/pretrained.pt"
+ git lfs pull --include "data/lang_bpe_500/bpe.model"
+
+ cd ..
+
+.. note::
+
+ We downloaded ``exp/pretrained.pt``, not ``exp/cpu_jit.pt``.
+
+In the above code, we downloaded the pre-trained model into the directory
+``egs/librispeech/ASR/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29``.
+
+2. Install ncnn and pnnx
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Please refer to :ref:`export_for_ncnn_install_ncnn_and_pnnx` .
+
+
+3. Export the model via torch.jit.trace()
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+First, let us rename our pre-trained model:
+
+.. code-block::
+
+ cd egs/librispeech/ASR
+
+ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+
+ ln -s pretrained.pt epoch-99.pt
+
+ cd ../..
+
+Next, we use the following code to export our model:
+
+.. code-block:: bash
+
+ dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+
+ ./pruned_transducer_stateless7_streaming/export-for-ncnn.py \
+ --tokens $dir/data/lang_bpe_500/tokens.txt \
+ --exp-dir $dir/exp \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --decode-chunk-len 32 \
+ --num-left-chunks 4 \
+ --num-encoder-layers "2,4,3,2,4" \
+ --feedforward-dims "1024,1024,2048,2048,1024" \
+ --nhead "8,8,8,8,8" \
+ --encoder-dims "384,384,384,384,384" \
+ --attention-dims "192,192,192,192,192" \
+ --encoder-unmasked-dims "256,256,256,256,256" \
+ --zipformer-downsampling-factors "1,2,4,8,2" \
+ --cnn-module-kernels "31,31,31,31,31" \
+ --decoder-dim 512 \
+ --joiner-dim 512
+
+.. caution::
+
+ If your model has different configuration parameters, please change them accordingly.
+
+.. hint::
+
+ We have renamed our model to ``epoch-99.pt`` so that we can use ``--epoch 99``.
+ There is only one pre-trained model, so we use ``--avg 1 --use-averaged-model 0``.
+
+ If you have trained a model by yourself and if you have all checkpoints
+ available, please first use ``decode.py`` to tune ``--epoch --avg``
+ and select the best combination with ``--use-averaged-model 1``.
+
+.. note::
+
+ You will see the following log output:
+
+ .. literalinclude:: ./code/export-zipformer-transducer-for-ncnn-output.txt
+
+ The log shows the model has ``69920376`` parameters, i.e., ``~69.9 M``.
+
+ .. code-block:: bash
+
+ ls -lh icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/pretrained.pt
+ -rw-r--r-- 1 kuangfangjun root 269M Jan 12 12:53 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/pretrained.pt
+
+ You can see that the file size of the pre-trained model is ``269 MB``, which
+ is roughly equal to ``69920376*4/1024/1024 = 266.725 MB``.
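+
+ You can verify this arithmetic quickly, e.g.:
+
+ .. code-block:: bash
+
+ python3 -c 'print(69920376 * 4 / 1024 / 1024)'
+ # ~266.725 MB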
+
+After running ``pruned_transducer_stateless7_streaming/export-for-ncnn.py``,
+we will get the following files:
+
+.. code-block:: bash
+
+ ls -lh icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/*pnnx.pt
+
+ -rw-r--r-- 1 kuangfangjun root 1022K Feb 27 20:23 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.pt
+ -rw-r--r-- 1 kuangfangjun root 266M Feb 27 20:23 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.pt
+ -rw-r--r-- 1 kuangfangjun root 2.8M Feb 27 20:23 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.pt
+
+.. _zipformer-transducer-step-4-export-torchscript-model-via-pnnx:
+
+4. Export torchscript model via pnnx
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. hint::
+
+ Make sure you have set up the ``PATH`` environment variable
+ in :ref:`export_for_ncnn_install_ncnn_and_pnnx`. Otherwise,
+ it will throw an error saying that ``pnnx`` could not be found.
+
+Now, it's time to export our models to `ncnn`_ via ``pnnx``.
+
+.. code-block::
+
+ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
+
+ pnnx ./encoder_jit_trace-pnnx.pt
+ pnnx ./decoder_jit_trace-pnnx.pt
+ pnnx ./joiner_jit_trace-pnnx.pt
+
+It will generate the following files:
+
+.. code-block:: bash
+
+ ls -lh icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/*ncnn*{bin,param}
+
+ -rw-r--r-- 1 kuangfangjun root 509K Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 437 Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.param
+ -rw-r--r-- 1 kuangfangjun root 133M Feb 27 20:30 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 152K Feb 27 20:30 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.param
+ -rw-r--r-- 1 kuangfangjun root 1.4M Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.bin
+ -rw-r--r-- 1 kuangfangjun root 488 Feb 27 20:31 icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.param
+
+There are two types of files:
+
+- ``param``: It is a text file containing the model architectures. You can
+ use a text editor to view its content.
+- ``bin``: It is a binary file containing the model parameters.
+
+Below we compare the file sizes of the models before and after conversion via ``pnnx``:
+
+.. see https://tableconvert.com/restructuredtext-generator
+
++----------------------------------+------------+
+| File name | File size |
++==================================+============+
+| encoder_jit_trace-pnnx.pt | 266 MB |
++----------------------------------+------------+
+| decoder_jit_trace-pnnx.pt | 1022 KB |
++----------------------------------+------------+
+| joiner_jit_trace-pnnx.pt | 2.8 MB |
++----------------------------------+------------+
+| encoder_jit_trace-pnnx.ncnn.bin | 133 MB |
++----------------------------------+------------+
+| decoder_jit_trace-pnnx.ncnn.bin | 509 KB |
++----------------------------------+------------+
+| joiner_jit_trace-pnnx.ncnn.bin | 1.4 MB |
++----------------------------------+------------+
+
+You can see that the file sizes of the models after conversion are about one half
+of those before conversion:
+
+ - encoder: 266 MB vs 133 MB
+ - decoder: 1022 KB vs 509 KB
+ - joiner: 2.8 MB vs 1.4 MB
+
+The reason is that by default ``pnnx`` converts ``float32`` parameters
+to ``float16``. A ``float32`` parameter occupies 4 bytes, while it is 2 bytes
+for ``float16``. Thus, the file size is roughly halved after conversion.
+
+.. hint::
+
+ If you use ``pnnx ./encoder_jit_trace-pnnx.pt fp16=0``, then ``pnnx``
+ won't convert ``float32`` to ``float16``.
+
+5. Test the exported models in icefall
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. note::
+
+ We assume you have set up the environment variable ``PYTHONPATH`` when
+ building `ncnn`_.
+
+Now we have successfully converted our pre-trained model to `ncnn`_ format.
+The six generated files are what we need. You can use the following code to
+test the converted models:
+
+.. code-block:: bash
+
+ python3 ./pruned_transducer_stateless7_streaming/streaming-ncnn-decode.py \
+ --tokens ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/tokens.txt \
+ --encoder-param-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.param \
+ --encoder-bin-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/encoder_jit_trace-pnnx.ncnn.bin \
+ --decoder-param-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.param \
+ --decoder-bin-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/decoder_jit_trace-pnnx.ncnn.bin \
+ --joiner-param-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.param \
+ --joiner-bin-filename ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/joiner_jit_trace-pnnx.ncnn.bin \
+ ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/test_wavs/1089-134686-0001.wav
+
+.. hint::
+
+ `ncnn`_ supports only ``batch size == 1``, so ``streaming-ncnn-decode.py`` accepts
+ only 1 wave file as input.
+
+The output is given below:
+
+.. literalinclude:: ./code/test-streaming-ncnn-decode-zipformer-transducer-libri.txt
+
+Congratulations! You have successfully exported a model from PyTorch to `ncnn`_!
+
+.. _zipformer-modify-the-exported-encoder-for-sherpa-ncnn:
+
+6. Modify the exported encoder for sherpa-ncnn
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In order to use the exported models in `sherpa-ncnn`_, we have to modify
+``encoder_jit_trace-pnnx.ncnn.param``.
+
+Let us have a look at the first few lines of ``encoder_jit_trace-pnnx.ncnn.param``:
+
+.. code-block::
+
+ 7767517
+ 2028 2547
+ Input in0 0 1 in0
+
+**Explanation** of the above three lines:
+
+ 1. ``7767517``, it is a magic number and should not be changed.
+ 2. ``2028 2547``, the first number ``2028`` specifies the number of layers
+ in this file, while ``2547`` specifies the number of intermediate outputs
+ of this file.
+ 3. ``Input in0 0 1 in0``, ``Input`` is the layer type of this layer; ``in0``
+ is the layer name of this layer; ``0`` means this layer has no input;
+ ``1`` means this layer has one output; ``in0`` is the output name of
+ this layer.
+
+We need to add 1 extra line and also increment the number of layers.
+The result looks like the following:
+
+.. code-block:: bash
+
+ 7767517
+ 2029 2547
+ SherpaMetaData sherpa_meta_data1 0 0 0=2 1=32 2=4 3=7 15=1 -23316=5,2,4,3,2,4 -23317=5,384,384,384,384,384 -23318=5,192,192,192,192,192 -23319=5,1,2,4,8,2 -23320=5,31,31,31,31,31
+ Input in0 0 1 in0
+
+**Explanation**
+
+ 1. ``7767517``, it is still the same
+ 2. ``2029 2547``, we have added an extra layer, so we need to update ``2028`` to ``2029``.
+ We don't need to change ``2547`` since the newly added layer has no inputs or outputs.
+ 3. ``SherpaMetaData sherpa_meta_data1 0 0 0=2 1=32 2=4 3=7 15=1 -23316=5,2,4,3,2,4 -23317=5,384,384,384,384,384 -23318=5,192,192,192,192,192 -23319=5,1,2,4,8,2 -23320=5,31,31,31,31,31``
+ This line is newly added. Its explanation is given below:
+
+ - ``SherpaMetaData`` is the type of this layer. Must be ``SherpaMetaData``.
+ - ``sherpa_meta_data1`` is the name of this layer. Must be ``sherpa_meta_data1``.
+ - ``0 0`` means this layer has no inputs or outputs. Must be ``0 0``
+ - ``0=2``, 0 is the key and 2 is the value. MUST be ``0=2``
+ - ``1=32``, 1 is the key and 32 is the value of the
+ parameter ``--decode-chunk-len`` that you provided when running
+ ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
+ - ``2=4``, 2 is the key and 4 is the value of the
+ parameter ``--num-left-chunks`` that you provided when running
+ ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
+ - ``3=7``, 3 is the key and 7 is the value for the amount of padding
+ used in the ``Conv2DSubsampling`` layer. It should be 7 for zipformer
+ if you don't change ``zipformer.py``.
+ - ``15=1``, attribute 15, this is the model version. Starting from
+ `sherpa-ncnn`_ v2.0, the model version has to be >= 1.
+ - ``-23316=5,2,4,3,2,4``, attribute 16, this is an array attribute.
+ It is attribute 16 since -23300 - (-23316) = 16.
+ The first element of the array is the length of the array, which is 5 in our case.
+ ``2,4,3,2,4`` is the value of ``--num-encoder-layers`` that you provided
+ when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
+ - ``-23317=5,384,384,384,384,384``, attribute 17.
+ The first element of the array is the length of the array, which is 5 in our case.
+ ``384,384,384,384,384`` is the value of ``--encoder-dims`` that you provided
+ when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
+ - ``-23318=5,192,192,192,192,192``, attribute 18.
+ The first element of the array is the length of the array, which is 5 in our case.
+ ``192,192,192,192,192`` is the value of ``--attention-dims`` that you provided
+ when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
+ - ``-23319=5,1,2,4,8,2``, attribute 19.
+ The first element of the array is the length of the array, which is 5 in our case.
+ ``1,2,4,8,2`` is the value of ``--zipformer-downsampling-factors`` that you provided
+ when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
+ - ``-23320=5,31,31,31,31,31``, attribute 20.
+ The first element of the array is the length of the array, which is 5 in our case.
+ ``31,31,31,31,31`` is the value of ``--cnn-module-kernels`` that you provided
+ when running ``./pruned_transducer_stateless7_streaming/export-for-ncnn.py``.
+
+ For ease of reference, we list the key-value pairs that you need to add
+ in the following table. If your model has a different setting, please
+ change the values for ``SherpaMetaData`` accordingly. Otherwise, you
+ will be ``SAD``.
+
+ +----------+--------------------------------------------+
+ | key | value |
+ +==========+============================================+
+ | 0 | 2 (fixed) |
+ +----------+--------------------------------------------+
+ | 1 | ``--decode-chunk-len`` |
+ +----------+--------------------------------------------+
+ | 2 | ``--num-left-chunks`` |
+ +----------+--------------------------------------------+
+ | 3 | 7 (if you don't change code) |
+ +----------+--------------------------------------------+
+ | 15 | 1 (The model version) |
+ +----------+--------------------------------------------+
+ |-23316 | ``--num-encoder-layers`` |
+ +----------+--------------------------------------------+
+ |-23317 | ``--encoder-dims`` |
+ +----------+--------------------------------------------+
+ |-23318 | ``--attention-dims`` |
+ +----------+--------------------------------------------+
+ |-23319 | ``--zipformer-downsampling-factors`` |
+ +----------+--------------------------------------------+
+ |-23320 | ``--cnn-module-kernels`` |
+ +----------+--------------------------------------------+
+
+ 4. ``Input in0 0 1 in0``. No need to change it.
+
+.. caution::
+
+ When you add a new layer ``SherpaMetaData``, please remember to update the
+ number of layers. In our case, update ``2028`` to ``2029``. Otherwise,
+ you will be SAD later.
+
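+If you prefer to script this edit, the following is a minimal sketch using
+GNU ``sed``, assuming the default settings shown above (back up the
+``param`` file first and adjust the values for your model):
+
+.. code-block:: bash
+
+ cd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
+
+ # Bump the layer count on line 2 and append the SherpaMetaData
+ # line right after it.
+ sed -i -e '2s/^2028 /2029 /' \
+ -e '2a SherpaMetaData sherpa_meta_data1 0 0 0=2 1=32 2=4 3=7 15=1 -23316=5,2,4,3,2,4 -23317=5,384,384,384,384,384 -23318=5,192,192,192,192,192 -23319=5,1,2,4,8,2 -23320=5,31,31,31,31,31' \
+ encoder_jit_trace-pnnx.ncnn.param
+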
+.. hint::
+
+ After adding the new layer ``SherpaMetaData``, you cannot use this model
+ with ``streaming-ncnn-decode.py`` anymore since ``SherpaMetaData`` is
+ supported only in `sherpa-ncnn`_.
+
+.. hint::
+
+ `ncnn`_ is very flexible. You can add new layers to it just by text-editing
+ the ``param`` file! You don't need to change the ``bin`` file.
+
+Now you can use this model in `sherpa-ncnn`_.
+Please refer to the following documentation:
+
+ - Linux/macOS/Windows/arm/aarch64: ``_
+ - ``Android``: ``_
+ - ``iOS``: ``_
+ - Python: ``_
+
+We have a list of pre-trained models that have been exported for `sherpa-ncnn`_:
+
+ - ``_
+
+ You can find more usage examples there.
diff --git a/docs/source/model-export/export-ncnn.rst b/docs/source/model-export/export-ncnn.rst
new file mode 100644
index 000000000..634fb1e59
--- /dev/null
+++ b/docs/source/model-export/export-ncnn.rst
@@ -0,0 +1,39 @@
+.. _icefall_export_to_ncnn:
+
+Export to ncnn
+==============
+
+We support exporting the following models
+to `ncnn `_:
+
+ - `Zipformer transducer models `_
+
+ - `LSTM transducer models `_
+
+ - `ConvEmformer transducer models `_
+
+We also provide `sherpa-ncnn`_
+for performing speech recognition using `ncnn`_ with exported models.
+It has been tested on the following platforms:
+
+ - Linux
+ - macOS
+ - Windows
+ - ``Android``
+ - ``iOS``
+ - ``Raspberry Pi``
+ - `爱芯派 `_ (`MAIX-III AXera-Pi `_).
+ - `RV1126 `_
+
+`sherpa-ncnn`_ is self-contained and can be statically linked to produce
+a binary containing everything needed. Please refer
+to its documentation for details:
+
+ - ``_
+
+
+.. toctree::
+
+ export-ncnn-zipformer
+ export-ncnn-conv-emformer
+ export-ncnn-lstm
diff --git a/docs/source/model-export/export-onnx.rst b/docs/source/model-export/export-onnx.rst
new file mode 100644
index 000000000..d95f2acfe
--- /dev/null
+++ b/docs/source/model-export/export-onnx.rst
@@ -0,0 +1,104 @@
+Export to ONNX
+==============
+
+In this section, we describe how to export models to `ONNX`_.
+
+.. hint::
+
+ Before you continue, please run:
+
+ .. code-block:: bash
+
+ pip install onnx
+
+
+In each recipe, there is a file called ``export-onnx.py``, which is used
+to export trained models to `ONNX`_.
+
+There is also a file named ``onnx_pretrained.py``, which shows how to use
+the exported `ONNX`_ model in Python with `onnxruntime`_ to decode sound files.
+
+sherpa-onnx
+-----------
+
+We have a separate repository `sherpa-onnx`_ for deploying your exported models
+on various platforms such as:
+
+ - iOS
+ - Android
+ - Raspberry Pi
+ - Linux/macOS/Windows
+
+
+Please see the documentation of `sherpa-onnx`_ for details:
+
+ ``_
+
+Example
+-------
+
+In the following, we demonstrate how to export a streaming Zipformer pre-trained
+model from
+``_
+to `ONNX`_.
+
+Download the pre-trained model
+------------------------------
+
+.. hint::
+
+ We assume you have installed `git-lfs`_.
+
+.. code-block:: bash
+
+
+ cd egs/librispeech/ASR
+
+ repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+ repo=$(basename $repo_url)
+
+ pushd $repo
+ git lfs pull --include "data/lang_bpe_500/bpe.model"
+ git lfs pull --include "exp/pretrained.pt"
+ cd exp
+ ln -s pretrained.pt epoch-99.pt
+ popd
+
+Export the model to ONNX
+------------------------
+
+.. code-block:: bash
+
+ ./pruned_transducer_stateless7_streaming/export-onnx.py \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ --use-averaged-model 0 \
+ --epoch 99 \
+ --avg 1 \
+ --decode-chunk-len 32 \
+ --exp-dir $repo/exp/
+
+.. warning::
+
+ ``export-onnx.py`` from different recipes has different options.
+
+ In the above example, ``--decode-chunk-len`` is specific for the
+ streaming Zipformer. Other models won't have such an option.
+
+It will generate the following 3 files in ``$repo/exp``:
+
+ - ``encoder-epoch-99-avg-1.onnx``
+ - ``decoder-epoch-99-avg-1.onnx``
+ - ``joiner-epoch-99-avg-1.onnx``
+
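+As a quick sanity check (a sketch, assuming the ``onnx`` package installed
+above), you can verify that the exported files are well-formed:
+
+.. code-block:: bash
+
+ # Load and structurally validate the exported encoder model
+ python3 -c "import onnx; onnx.checker.check_model(onnx.load('$repo/exp/encoder-epoch-99-avg-1.onnx')); print('ok')"
+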
+Decode sound files with exported ONNX models
+--------------------------------------------
+
+.. code-block:: bash
+
+ ./pruned_transducer_stateless7_streaming/onnx_pretrained.py \
+ --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+ --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+ --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+ --tokens $repo/data/lang_bpe_500/tokens.txt \
+ $repo/test_wavs/1089-134686-0001.wav
diff --git a/docs/source/model-export/export-with-torch-jit-script.rst b/docs/source/model-export/export-with-torch-jit-script.rst
new file mode 100644
index 000000000..31c8f0bf5
--- /dev/null
+++ b/docs/source/model-export/export-with-torch-jit-script.rst
@@ -0,0 +1,58 @@
+.. _export-model-with-torch-jit-script:
+
+Export model with torch.jit.script()
+====================================
+
+In this section, we describe how to export a model via
+``torch.jit.script()``.
+
+When to use it
+--------------
+
+If we want to use our trained model with torchscript,
+we can use ``torch.jit.script()``.
+
+.. hint::
+
+ See :ref:`export-model-with-torch-jit-trace`
+ if you want to use ``torch.jit.trace()``.
+
+How to export
+-------------
+
+We use
+``_
+as an example in the following.
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+ epoch=14
+ avg=1
+
+ ./pruned_transducer_stateless3/export.py \
+ --exp-dir ./pruned_transducer_stateless3/exp \
+ --tokens data/lang_bpe_500/tokens.txt \
+ --epoch $epoch \
+ --avg $avg \
+ --jit 1
+
+It will generate a file ``cpu_jit.pt`` in ``pruned_transducer_stateless3/exp``.
+
+.. caution::
+
+ Don't be confused by ``cpu`` in ``cpu_jit.pt``. We move all parameters
+ to CPU before saving it into a ``pt`` file; that's why we use ``cpu``
+ in the filename.
+
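+As a quick check that the export succeeded, the generated file can be loaded
+with plain ``torch``, outside of ``icefall`` (a minimal sketch):
+
+.. code-block:: bash
+
+ # Load the torchscript model on CPU and print its type
+ python3 -c "import torch; m = torch.jit.load('pruned_transducer_stateless3/exp/cpu_jit.pt'); print(type(m))"
+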
+How to use the exported model
+-----------------------------
+
+Please refer to the following pages for usage:
+
+- ``_
+- ``_
+- ``_
+- ``_
+- ``_
+- ``_
diff --git a/docs/source/model-export/export-with-torch-jit-trace.rst b/docs/source/model-export/export-with-torch-jit-trace.rst
new file mode 100644
index 000000000..be7876ab5
--- /dev/null
+++ b/docs/source/model-export/export-with-torch-jit-trace.rst
@@ -0,0 +1,69 @@
+.. _export-model-with-torch-jit-trace:
+
+Export model with torch.jit.trace()
+===================================
+
+In this section, we describe how to export a model via
+``torch.jit.trace()``.
+
+When to use it
+--------------
+
+If we want to use our trained model with torchscript,
+we can use ``torch.jit.trace()``.
+
+.. hint::
+
+ See :ref:`export-model-with-torch-jit-script`
+ if you want to use ``torch.jit.script()``.
+
+How to export
+-------------
+
+We use
+``_
+as an example in the following.
+
+.. code-block:: bash
+
+ iter=468000
+ avg=16
+
+ cd egs/librispeech/ASR
+
+ ./lstm_transducer_stateless2/export.py \
+ --exp-dir ./lstm_transducer_stateless2/exp \
+ --tokens data/lang_bpe_500/tokens.txt \
+ --iter $iter \
+ --avg $avg \
+ --jit-trace 1
+
+It will generate three files inside ``lstm_transducer_stateless2/exp``:
+
+ - ``encoder_jit_trace.pt``
+ - ``decoder_jit_trace.pt``
+ - ``joiner_jit_trace.pt``
+
+You can use
+``_
+to decode sound files with the following commands:
+
+.. code-block:: bash
+
+ cd egs/librispeech/ASR
+ ./lstm_transducer_stateless2/jit_pretrained.py \
+ --bpe-model ./data/lang_bpe_500/bpe.model \
+ --encoder-model-filename ./lstm_transducer_stateless2/exp/encoder_jit_trace.pt \
+ --decoder-model-filename ./lstm_transducer_stateless2/exp/decoder_jit_trace.pt \
+ --joiner-model-filename ./lstm_transducer_stateless2/exp/joiner_jit_trace.pt \
+ /path/to/foo.wav \
+ /path/to/bar.wav \
+ /path/to/baz.wav
+
+How to use the exported models
+------------------------------
+
+Please refer to
+``_
+for its usage in `sherpa `_.
+You can also find pretrained models there.
diff --git a/docs/source/model-export/index.rst b/docs/source/model-export/index.rst
new file mode 100644
index 000000000..9b7a2ee2d
--- /dev/null
+++ b/docs/source/model-export/index.rst
@@ -0,0 +1,14 @@
+Model export
+============
+
+In this section, we describe various ways to export models.
+
+
+
+.. toctree::
+
+ export-model-state-dict
+ export-with-torch-jit-trace
+ export-with-torch-jit-script
+ export-onnx
+ export-ncnn
diff --git a/docs/source/recipes/Non-streaming-ASR/aishell/conformer_ctc.rst b/docs/source/recipes/Non-streaming-ASR/aishell/conformer_ctc.rst
new file mode 100644
index 000000000..aad90f9d0
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/conformer_ctc.rst
@@ -0,0 +1,747 @@
+Conformer CTC
+=============
+
+This tutorial shows you how to run a Conformer CTC model
+with the `Aishell `_ dataset.
+
+
+.. HINT::
+
+ We assume you have read the page :ref:`install icefall` and have setup
+ the environment for ``icefall``.
+
+.. HINT::
+
+ We recommend using a GPU or several GPUs to run this recipe.
+
+In this tutorial, you will learn:
+
+ - (1) How to prepare data for training and decoding
+ - (2) How to start the training, either with a single GPU or multiple GPUs
+ - (3) How to do decoding after training, with ctc-decoding, 1best and attention decoder rescoring
+ - (4) How to use a pre-trained model, provided by us
+
+Data preparation
+----------------
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./prepare.sh
+
+The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
+All you need to do is to run it.
+
+The data preparation contains several stages. You can use the following two
+options:
+
+ - ``--stage``
+ - ``--stop-stage``
+
+to control which stage(s) should be run. By default, all stages are executed.
+
+
+For example,
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./prepare.sh --stage 0 --stop-stage 0
+
+means to run only stage 0.
+
+To run stage 2 to stage 5, use:
+
+.. code-block:: bash
+
+ $ ./prepare.sh --stage 2 --stop-stage 5
+
+.. HINT::
+
+ If you have pre-downloaded the `Aishell `_
+ dataset and the `musan `_ dataset, say,
+ they are saved in ``/tmp/aishell`` and ``/tmp/musan``, you can modify
+ the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
+ ``./prepare.sh`` won't re-download them.
+
+.. HINT::
+
+ A 3-gram language model will be downloaded from HuggingFace. We assume you have
+ installed and initialized ``git-lfs``. If not, you can install ``git-lfs`` by
+
+ .. code-block:: bash
+
+ $ sudo apt-get install git-lfs
+ $ git-lfs install
+
+ If you don't have the ``sudo`` permission, you could download the
+ `git-lfs binary `_ here, then add it to you ``PATH``.
+
+.. NOTE::
+
+   All files generated by ``./prepare.sh``, e.g., features, lexicon, etc.,
+   are saved in the ``./data`` directory.
+
+
+Training
+--------
+
+Configurable options
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./conformer_ctc/train.py --help
+
+shows you the training options that can be passed from the commandline.
+The following options are used quite often:
+
+ - ``--exp-dir``
+
+    The experiment folder to save logs and model checkpoints; it defaults
+    to ``./conformer_ctc/exp``.
+
+ - ``--num-epochs``
+
+ It is the number of epochs to train. For instance,
+ ``./conformer_ctc/train.py --num-epochs 30`` trains for 30 epochs
+ and generates ``epoch-0.pt``, ``epoch-1.pt``, ..., ``epoch-29.pt``
+ in the folder set by ``--exp-dir``.
+
+ - ``--start-epoch``
+
+ It's used to resume training.
+ ``./conformer_ctc/train.py --start-epoch 10`` loads the
+ checkpoint ``./conformer_ctc/exp/epoch-9.pt`` and starts
+ training from epoch 10, based on the state from epoch 9.
+
+ - ``--world-size``
+
+ It is used for multi-GPU single-machine DDP training.
+
+ - (a) If it is 1, then no DDP training is used.
+
+ - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
+
+ The following shows some use cases with it.
+
+ **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
+ GPU 2 for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ export CUDA_VISIBLE_DEVICES="0,2"
+ $ ./conformer_ctc/train.py --world-size 2
+
+ **Use case 2**: You have 4 GPUs and you want to use all of them
+ for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./conformer_ctc/train.py --world-size 4
+
+ **Use case 3**: You have 4 GPUs but you only want to use GPU 3
+ for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ export CUDA_VISIBLE_DEVICES="3"
+ $ ./conformer_ctc/train.py --world-size 1
+
+ .. CAUTION::
+
+ Only multi-GPU single-machine DDP training is implemented at present.
+ Multi-GPU multi-machine DDP training will be added later.
+
+ - ``--max-duration``
+
+    It specifies the total number of seconds over all utterances in a
+    batch, before **padding**.
+    If you encounter CUDA OOM, please reduce it. For instance, if
+    you are using a V100 NVIDIA GPU, we recommend setting it to ``200``.
+
+ .. HINT::
+
+ Due to padding, the number of seconds of all utterances in a
+ batch will usually be larger than ``--max-duration``.
+
+ A larger value for ``--max-duration`` may cause OOM during training,
+ while a smaller value may increase the training time. You have to
+ tune it.
+
+
+Pre-configured options
+~~~~~~~~~~~~~~~~~~~~~~
+
+There are some training options, e.g., weight decay,
+number of warmup steps, etc,
+that are not passed from the commandline.
+They are pre-configured by the function ``get_params()`` in
+`conformer_ctc/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/aishell/ASR/conformer_ctc/train.py>`_
+
+You don't need to change these pre-configured parameters. If you really need to change
+them, please modify ``./conformer_ctc/train.py`` directly.
+
+
+.. CAUTION::
+
+   The training set is speed perturbed with factors 0.9 and 1.1. Together
+   with the original data, each epoch actually processes ``3x150 == 450``
+   hours of data.
+
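+The 3x comes from concatenating the original cuts with two speed-perturbed
+copies. A minimal sketch with ``lhotse``, assuming a hypothetical manifest
+path (the real logic lives in ``./prepare.sh`` and the local scripts):
+
+.. code-block:: python
+
+   from lhotse import CutSet
+
+   # Hypothetical path; see ./prepare.sh for the actual manifests.
+   cuts = CutSet.from_file("data/fbank/aishell_cuts_train.jsonl.gz")
+
+   # Original + 0.9x + 1.1x speed => 3x the original amount of audio.
+   cuts_3x = cuts + cuts.perturb_speed(0.9) + cuts.perturb_speed(1.1)
+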
+
+Training logs
+~~~~~~~~~~~~~
+
+Training logs and checkpoints are saved in the folder set by ``--exp-dir``
+(default ``conformer_ctc/exp``). You will find the following files in that directory:
+
+ - ``epoch-0.pt``, ``epoch-1.pt``, ...
+
+ These are checkpoint files, containing model ``state_dict`` and optimizer ``state_dict``.
+ To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
+
+ .. code-block:: bash
+
+ $ ./conformer_ctc/train.py --start-epoch 11
+
+ - ``tensorboard/``
+
+ This folder contains TensorBoard logs. Training loss, validation loss, learning
+ rate, etc, are recorded in these logs. You can visualize them by:
+
+ .. code-block:: bash
+
+ $ cd conformer_ctc/exp/tensorboard
+ $ tensorboard dev upload --logdir . --name "Aishell conformer ctc training with icefall" --description "Training with new LabelSmoothing loss, see https://github.com/k2-fsa/icefall/pull/109"
+
+ It will print something like below:
+
+ .. code-block::
+
+ TensorFlow installation not found - running with reduced feature set.
+ Upload started and will continue reading any new data as it's added to the logdir.
+
+ To stop uploading, press Ctrl-C.
+
+ New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/engw8KSkTZqS24zBV5dgCg/
+
+ [2021-11-22T11:09:27] Started scanning logdir.
+ [2021-11-22T11:10:14] Total uploaded: 116068 scalars, 0 tensors, 0 binary objects
+ Listening for new data in logdir...
+
+   Note there is a URL in the above output. Click it and you will see
+   the following screenshot:
+
+ .. figure:: images/aishell-conformer-ctc-tensorboard-log.jpg
+ :width: 600
+ :alt: TensorBoard screenshot
+ :align: center
+ :target: https://tensorboard.dev/experiment/WE1DocDqRRCOSAgmGyClhg/
+
+ TensorBoard screenshot.
+
+ - ``log/log-train-xxxx``
+
+ It is the detailed training log in text format, same as the one
+ you saw printed to the console during training.
+
+Usage examples
+~~~~~~~~~~~~~~
+
+The following shows typical use cases:
+
+**Case 1**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./conformer_ctc/train.py --max-duration 200
+
+It uses ``--max-duration`` of 200 to avoid OOM.
+
+
+**Case 2**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ export CUDA_VISIBLE_DEVICES="0,3"
+ $ ./conformer_ctc/train.py --world-size 2
+
+It uses GPU 0 and GPU 3 for DDP training.
+
+**Case 3**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./conformer_ctc/train.py --num-epochs 10 --start-epoch 3
+
+It loads checkpoint ``./conformer_ctc/exp/epoch-2.pt`` and starts
+training from epoch 3. Also, it trains for 10 epochs.
+
+Decoding
+--------
+
+The decoding part uses checkpoints saved by the training part, so you have
+to run the training part first.
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./conformer_ctc/decode.py --help
+
+shows the options for decoding.
+
+The commonly used options are:
+
+ - ``--method``
+
+ This specifies the decoding method.
+
+ The following command uses attention decoder for rescoring:
+
+ .. code-block::
+
+ $ cd egs/aishell/ASR
+ $ ./conformer_ctc/decode.py --method attention-decoder --max-duration 30 --nbest-scale 0.5
+
+ - ``--nbest-scale``
+
+    It is used to scale down lattice scores so that there are more unique
+    paths for rescoring (a sketch is given after this list).
+
+ - ``--max-duration``
+
+ It has the same meaning as the one during training. A larger
+ value may cause OOM.
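+
+Conceptually, ``--nbest-scale`` flattens the lattice score distribution so
+that sampled n-best paths are more diverse. A sketch, assuming ``lattice``
+is a ``k2.Fsa`` produced during decoding (illustrative, not the exact
+icefall code):
+
+.. code-block:: python
+
+   # `lattice` is assumed to be a k2.Fsa produced during decoding.
+   nbest_scale = 0.5
+   saved_scores = lattice.scores.clone()
+   lattice.scores *= nbest_scale   # flatten scores before sampling paths
+   # ... sample n paths, e.g. with k2.random_paths(lattice, ...) ...
+   lattice.scores = saved_scores   # restore the original scores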
+
+Pre-trained Model
+-----------------
+
+We have uploaded a pre-trained model to
+`<https://huggingface.co/pkufool/icefall_asr_aishell_conformer_ctc>`_.
+
+We describe how to use the pre-trained model to transcribe a sound file or
+multiple sound files in the following.
+
+Install kaldifeat
+~~~~~~~~~~~~~~~~~
+
+`kaldifeat <https://github.com/csukuangfj/kaldifeat>`_ is used to
+extract features for a single sound file or multiple sound files
+at the same time.
+
+Please refer to `<https://github.com/csukuangfj/kaldifeat>`_ for installation.
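+
+The sketch below shows typical ``kaldifeat`` usage for this recipe; the
+option values are the ones commonly used here (16 kHz audio, 80-dim fbank),
+and the wave path is a placeholder:
+
+.. code-block:: python
+
+   import torchaudio
+   import kaldifeat
+
+   opts = kaldifeat.FbankOptions()
+   opts.frame_opts.dither = 0          # disable dithering for reproducibility
+   opts.frame_opts.samp_freq = 16000   # must match the wave's sample rate
+   opts.mel_opts.num_bins = 80         # feature dimension used by the model
+   fbank = kaldifeat.Fbank(opts)
+
+   wave, sr = torchaudio.load("/path/to/foo.wav")  # wave: (1, num_samples)
+   assert sr == 16000
+   features = fbank([wave.squeeze(0)])  # a list of (num_frames, 80) tensors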
+
+Download the pre-trained model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following commands describe how to download the pre-trained model:
+
+.. code-block::
+
+ $ cd egs/aishell/ASR
+ $ mkdir tmp
+ $ cd tmp
+ $ git lfs install
+ $ git clone https://huggingface.co/pkufool/icefall_asr_aishell_conformer_ctc
+
+.. CAUTION::
+
+ You have to use ``git lfs`` to download the pre-trained model.
+
+.. CAUTION::
+
+ In order to use this pre-trained model, your k2 version has to be v1.7 or later.
+
+After downloading, you will have the following files:
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ tree tmp
+
+.. code-block:: bash
+
+ tmp/
+ `-- icefall_asr_aishell_conformer_ctc
+ |-- README.md
+ |-- data
+ | `-- lang_char
+ | |-- HLG.pt
+ | |-- tokens.txt
+ | `-- words.txt
+ |-- exp
+ | `-- pretrained.pt
+ `-- test_waves
+ |-- BAC009S0764W0121.wav
+ |-- BAC009S0764W0122.wav
+ |-- BAC009S0764W0123.wav
+ `-- trans.txt
+
+ 5 directories, 9 files
+
+**File descriptions**:
+
+ - ``data/lang_char/HLG.pt``
+
+ It is the decoding graph.
+
+ - ``data/lang_char/tokens.txt``
+
+ It contains tokens and their IDs.
+ Provided only for convenience so that you can look up the SOS/EOS ID easily.
+
+ - ``data/lang_char/words.txt``
+
+ It contains words and their IDs.
+
+ - ``exp/pretrained.pt``
+
+ It contains pre-trained model parameters, obtained by averaging
+ checkpoints from ``epoch-25.pt`` to ``epoch-84.pt``.
+ Note: We have removed optimizer ``state_dict`` to reduce file size.
+
+ - ``test_waves/*.wav``
+
+ It contains some test sound files from Aishell ``test`` dataset.
+
+ - ``test_waves/trans.txt``
+
+    It contains the reference transcripts for the sound files in ``test_waves/``.
+
+Information about the test sound files is listed below:
+
+.. code-block:: bash
+
+ $ soxi tmp/icefall_asr_aishell_conformer_ctc/test_waves/*.wav
+
+ Input File : 'tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:04.20 = 67263 samples ~ 315.295 CDDA sectors
+ File Size : 135k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+
+ Input File : 'tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:04.12 = 65840 samples ~ 308.625 CDDA sectors
+ File Size : 132k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+
+ Input File : 'tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:04.00 = 64000 samples ~ 300 CDDA sectors
+ File Size : 128k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+ Total Duration of 3 files: 00:00:12.32
+
+Usage
+~~~~~
+
+.. code-block::
+
+ $ cd egs/aishell/ASR
+ $ ./conformer_ctc/pretrained.py --help
+
+displays the help information.
+
+It supports three decoding methods:
+
+ - CTC decoding
+ - HLG decoding
+ - HLG + attention decoder rescoring
+
+CTC decoding
+^^^^^^^^^^^^
+
+CTC decoding uses only the CTC topology during decoding; no lexicon or
+language model is involved.
+The command to run CTC decoding is:
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./conformer_ctc/pretrained.py \
+ --checkpoint ./tmp/icefall_asr_aishell_conformer_ctc/exp/pretrained.pt \
+ --tokens-file ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/tokens.txt \
+ --method ctc-decoding \
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav \
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav \
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav
+
+The output is given below:
+
+.. code-block::
+
+ 2021-11-18 07:53:41,707 INFO [pretrained.py:229] {'sample_rate': 16000, 'subsampling_factor': 4, 'feature_dim': 80, 'nhead': 4, 'attention_dim': 512, 'num_decoder_layers': 6, 'vgg_frontend': False, 'use_feat_batchnorm': True, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'env_info': {'k2-version': '1.9', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f2fd997f752ed11bbef4c306652c433e83f9cf12', 'k2-git-date': 'Sun Sep 19 09:41:46 2021', 'lhotse-version': '0.11.0.dev+git.33cfe45.clean', 'torch-cuda-available': True, 'torch-cuda-version': '10.1', 'python-version': '3.8', 'icefall-git-branch': 'aishell', 'icefall-git-sha1': 'd57a873-dirty', 'icefall-git-date': 'Wed Nov 17 19:53:25 2021', 'icefall-path': '/ceph-hw/kangwei/code/icefall_aishell3', 'k2-path': '/ceph-hw/kangwei/code/k2_release/k2/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-hw/kangwei/code/lhotse/lhotse/__init__.py'}, 'checkpoint': './tmp/icefall_asr_aishell_conformer_ctc/exp/pretrained.pt', 'tokens_file': './tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/tokens.txt', 'words_file': None, 'HLG': None, 'method': 'ctc-decoding', 'num_paths': 100, 'ngram_lm_scale': 0.3, 'attention_decoder_scale': 0.9, 'nbest_scale': 0.5, 'sos_id': 1, 'eos_id': 1, 'num_classes': 4336, 'sound_files': ['./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav']}
+ 2021-11-18 07:53:41,708 INFO [pretrained.py:240] device: cuda:0
+ 2021-11-18 07:53:41,708 INFO [pretrained.py:242] Creating model
+ 2021-11-18 07:53:51,131 INFO [pretrained.py:259] Constructing Fbank computer
+ 2021-11-18 07:53:51,134 INFO [pretrained.py:269] Reading sound files: ['./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav']
+ 2021-11-18 07:53:51,138 INFO [pretrained.py:275] Decoding started
+ 2021-11-18 07:53:51,241 INFO [pretrained.py:293] Use CTC decoding
+ 2021-11-18 07:53:51,704 INFO [pretrained.py:369]
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav:
+ 甚 至 出 现 交 易 几 乎 停 止 的 情 况
+
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav:
+ 一 二 线 城 市 虽 然 也 处 于 调 整 中
+
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav:
+ 但 因 为 聚 集 了 过 多 公 共 资 源
+
+
+ 2021-11-18 07:53:51,704 INFO [pretrained.py:371] Decoding Done
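+
+Conceptually, CTC greedy decoding picks the most likely token per frame,
+collapses repeats, and removes blanks. A self-contained sketch (the recipe
+itself decodes with a CTC topology FSA in ``k2`` rather than this loop):
+
+.. code-block:: python
+
+   import torch
+
+   def ctc_greedy_decode(log_probs: torch.Tensor, blank_id: int = 0):
+       """log_probs: (T, vocab_size) per-frame log-probabilities."""
+       ids = log_probs.argmax(dim=-1).tolist()
+       out, prev = [], blank_id
+       for i in ids:
+           if i != prev and i != blank_id:
+               out.append(i)  # keep a token only when it changes
+           prev = i
+       return out
+
+   print(ctc_greedy_decode(torch.randn(10, 4336).log_softmax(-1)))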
+
+
+HLG decoding
+^^^^^^^^^^^^
+
+HLG decoding uses the best path of the decoding lattice as the decoding result.
+
+The command to run HLG decoding is:
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./conformer_ctc/pretrained.py \
+ --checkpoint ./tmp/icefall_asr_aishell_conformer_ctc/exp/pretrained.pt \
+ --words-file ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/words.txt \
+ --HLG ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt \
+ --method 1best \
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav \
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav \
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav
+
+The output is given below:
+
+.. code-block::
+
+ 2021-11-18 07:37:38,683 INFO [pretrained.py:229] {'sample_rate': 16000, 'subsampling_factor': 4, 'feature_dim': 80, 'nhead': 4, 'attention_dim': 512, 'num_decoder_layers': 6, 'vgg_frontend': False, 'use_feat_batchnorm': True, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'env_info': {'k2-version': '1.9', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f2fd997f752ed11bbef4c306652c433e83f9cf12', 'k2-git-date': 'Sun Sep 19 09:41:46 2021', 'lhotse-version': '0.11.0.dev+git.33cfe45.clean', 'torch-cuda-available': True, 'torch-cuda-version': '10.1', 'python-version': '3.8', 'icefall-git-branch': 'aishell', 'icefall-git-sha1': 'd57a873-clean', 'icefall-git-date': 'Wed Nov 17 19:53:25 2021', 'icefall-path': '/ceph-hw/kangwei/code/icefall_aishell3', 'k2-path': '/ceph-hw/kangwei/code/k2_release/k2/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-hw/kangwei/code/lhotse/lhotse/__init__.py'}, 'checkpoint': './tmp/icefall_asr_aishell_conformer_ctc/exp/pretrained.pt', 'tokens_file': None, 'words_file': './tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/words.txt', 'HLG': './tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt', 'method': '1best', 'num_paths': 100, 'ngram_lm_scale': 0.3, 'attention_decoder_scale': 0.9, 'nbest_scale': 0.5, 'sos_id': 1, 'eos_id': 1, 'num_classes': 4336, 'sound_files': ['./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav']}
+ 2021-11-18 07:37:38,684 INFO [pretrained.py:240] device: cuda:0
+ 2021-11-18 07:37:38,684 INFO [pretrained.py:242] Creating model
+ 2021-11-18 07:37:47,651 INFO [pretrained.py:259] Constructing Fbank computer
+ 2021-11-18 07:37:47,654 INFO [pretrained.py:269] Reading sound files: ['./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav']
+ 2021-11-18 07:37:47,659 INFO [pretrained.py:275] Decoding started
+ 2021-11-18 07:37:47,752 INFO [pretrained.py:321] Loading HLG from ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt
+ 2021-11-18 07:37:51,887 INFO [pretrained.py:340] Use HLG decoding
+ 2021-11-18 07:37:52,102 INFO [pretrained.py:370]
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav:
+ 甚至 出现 交易 几乎 停止 的 情况
+
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav:
+ 一二 线 城市 虽然 也 处于 调整 中
+
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav:
+ 但 因为 聚集 了 过多 公共 资源
+
+
+ 2021-11-18 07:37:52,102 INFO [pretrained.py:372] Decoding Done
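+
+Internally, 1best decoding amounts to taking the best-scoring path of the
+decoding lattice. A sketch with ``k2``, assuming ``nnet_output`` (a
+``(N, T, C)`` tensor of log-probs), ``supervision_segments``, and the loaded
+``HLG`` already exist (see the recipe's ``decode.py`` for the real code):
+
+.. code-block:: python
+
+   import k2
+
+   # nnet_output, supervision_segments and HLG are assumed to exist;
+   # they come from the model and the data preparation scripts.
+   dense_fsa_vec = k2.DenseFsaVec(nnet_output, supervision_segments)
+   lattice = k2.intersect_dense_pruned(
+       HLG,
+       dense_fsa_vec,
+       search_beam=20,
+       output_beam=8,
+       min_active_states=30,
+       max_active_states=10000,
+   )
+   best_path = k2.shortest_path(lattice, use_double_scores=True)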
+
+
+HLG decoding + attention decoder rescoring
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+It extracts ``n`` paths from the lattice, rescores the extracted paths with
+an attention decoder, and uses the path with the highest score as the decoding result.
+
+The command to run HLG decoding + attention decoder rescoring is:
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./conformer_ctc/pretrained.py \
+ --checkpoint ./tmp/icefall_asr_aishell_conformer_ctc/exp/pretrained.pt \
+ --words-file ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/words.txt \
+ --HLG ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt \
+ --method attention-decoder \
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav \
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav \
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav
+
+The output is below:
+
+.. code-block::
+
+ 2021-11-18 07:42:05,965 INFO [pretrained.py:229] {'sample_rate': 16000, 'subsampling_factor': 4, 'feature_dim': 80, 'nhead': 4, 'attention_dim': 512, 'num_decoder_layers': 6, 'vgg_frontend': False, 'use_feat_batchnorm': True, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'env_info': {'k2-version': '1.9', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f2fd997f752ed11bbef4c306652c433e83f9cf12', 'k2-git-date': 'Sun Sep 19 09:41:46 2021', 'lhotse-version': '0.11.0.dev+git.33cfe45.clean', 'torch-cuda-available': True, 'torch-cuda-version': '10.1', 'python-version': '3.8', 'icefall-git-branch': 'aishell', 'icefall-git-sha1': 'd57a873-dirty', 'icefall-git-date': 'Wed Nov 17 19:53:25 2021', 'icefall-path': '/ceph-hw/kangwei/code/icefall_aishell3', 'k2-path': '/ceph-hw/kangwei/code/k2_release/k2/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-hw/kangwei/code/lhotse/lhotse/__init__.py'}, 'checkpoint': './tmp/icefall_asr_aishell_conformer_ctc/exp/pretrained.pt', 'tokens_file': None, 'words_file': './tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/words.txt', 'HLG': './tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt', 'method': 'attention-decoder', 'num_paths': 100, 'ngram_lm_scale': 0.3, 'attention_decoder_scale': 0.9, 'nbest_scale': 0.5, 'sos_id': 1, 'eos_id': 1, 'num_classes': 4336, 'sound_files': ['./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav']}
+ 2021-11-18 07:42:05,966 INFO [pretrained.py:240] device: cuda:0
+ 2021-11-18 07:42:05,966 INFO [pretrained.py:242] Creating model
+ 2021-11-18 07:42:16,821 INFO [pretrained.py:259] Constructing Fbank computer
+ 2021-11-18 07:42:16,822 INFO [pretrained.py:269] Reading sound files: ['./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav', './tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav']
+ 2021-11-18 07:42:16,826 INFO [pretrained.py:275] Decoding started
+ 2021-11-18 07:42:16,916 INFO [pretrained.py:321] Loading HLG from ./tmp/icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt
+ 2021-11-18 07:42:21,115 INFO [pretrained.py:345] Use HLG + attention decoder rescoring
+ 2021-11-18 07:42:21,888 INFO [pretrained.py:370]
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav:
+ 甚至 出现 交易 几乎 停止 的 情况
+
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav:
+ 一二 线 城市 虽然 也 处于 调整 中
+
+ ./tmp/icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav:
+ 但 因为 聚集 了 过多 公共 资源
+
+
+ 2021-11-18 07:42:21,889 INFO [pretrained.py:372] Decoding Done
+
+
+Colab notebook
+--------------
+
+We provide a colab notebook for this recipe, showing how to use a pre-trained model.
+
+|aishell asr conformer ctc colab notebook|
+
+.. |aishell asr conformer ctc colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+ :target: https://colab.research.google.com/drive/1WnG17io5HEZ0Gn_cnh_VzK5QYOoiiklC
+
+.. HINT::
+
+ Due to limited memory provided by Colab, you have to upgrade to Colab Pro to
+ run ``HLG decoding + attention decoder rescoring``.
+ Otherwise, you can only run ``HLG decoding`` with Colab.
+
+**Congratulations!** You have finished the aishell ASR recipe with
+conformer CTC models in ``icefall``.
+
+
+If you want to deploy your trained model in C++, please read the following section.
+
+Deployment with C++
+-------------------
+
+This section describes how to deploy the pre-trained model in C++, without
+Python dependencies.
+
+.. HINT::
+
+ At present, it does NOT support streaming decoding.
+
+First, let us compile k2 from source:
+
+.. code-block:: bash
+
+ $ cd $HOME
+ $ git clone https://github.com/k2-fsa/k2
+ $ cd k2
+ $ git checkout v2.0-pre
+
+.. CAUTION::
+
+ You have to switch to the branch ``v2.0-pre``!
+
+.. code-block:: bash
+
+ $ mkdir build-release
+ $ cd build-release
+ $ cmake -DCMAKE_BUILD_TYPE=Release ..
+ $ make -j hlg_decode
+
+   # You will find the binary ./bin/hlg_decode
+
+Now you are ready to go!
+
+Assume you have run:
+
+.. code-block:: bash
+
+   $ cd k2/build-release
+   $ ln -s /path/to/icefall_asr_aishell_conformer_ctc ./
+
+To view the usage of ``./bin/hlg_decode``, run:
+
+.. code-block::
+
+ $ ./bin/hlg_decode
+
+It will show you the following message:
+
+.. code-block:: bash
+
+ Please provide --nn_model
+
+ This file implements decoding with an HLG decoding graph.
+
+ Usage:
+     ./bin/hlg_decode \
+       --use_gpu true \
+       --nn_model <path to torch scripted pt file> \
+       --hlg <path to HLG.pt> \
+       --word_table <path to words.txt> \
+       <path to foo.wav> \
+       <path to bar.wav> \
+       <more waves if any>
+
+ To see all possible options, use
+ ./bin/hlg_decode --help
+
+ Caution:
+ - Only sound files (*.wav) with single channel are supported.
+ - It assumes the model is conformer_ctc/transformer.py from icefall.
+ If you use a different model, you have to change the code
+ related to `model.forward` in this file.
+
+
+HLG decoding
+~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ ./bin/hlg_decode \
+ --use_gpu true \
+ --nn_model icefall_asr_aishell_conformer_ctc/exp/cpu_jit.pt \
+ --hlg icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt \
+ --word_table icefall_asr_aishell_conformer_ctc/data/lang_char/words.txt \
+ icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav \
+ icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav \
+ icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav
+
+The output is:
+
+.. code-block::
+
+ 2021-11-18 14:48:20.89 [I] k2/torch/bin/hlg_decode.cu:115:int main(int, char**) Device: cpu
+ 2021-11-18 14:48:20.89 [I] k2/torch/bin/hlg_decode.cu:124:int main(int, char**) Load wave files
+ 2021-11-18 14:48:20.97 [I] k2/torch/bin/hlg_decode.cu:131:int main(int, char**) Build Fbank computer
+ 2021-11-18 14:48:20.98 [I] k2/torch/bin/hlg_decode.cu:142:int main(int, char**) Compute features
+ 2021-11-18 14:48:20.115 [I] k2/torch/bin/hlg_decode.cu:150:int main(int, char**) Load neural network model
+ 2021-11-18 14:48:20.693 [I] k2/torch/bin/hlg_decode.cu:165:int main(int, char**) Compute nnet_output
+ 2021-11-18 14:48:23.182 [I] k2/torch/bin/hlg_decode.cu:180:int main(int, char**) Load icefall_asr_aishell_conformer_ctc/data/lang_char/HLG.pt
+ 2021-11-18 14:48:33.489 [I] k2/torch/bin/hlg_decode.cu:185:int main(int, char**) Decoding
+ 2021-11-18 14:48:45.217 [I] k2/torch/bin/hlg_decode.cu:216:int main(int, char**)
+ Decoding result:
+
+ icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0121.wav
+ 甚至 出现 交易 几乎 停止 的 情况
+
+ icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0122.wav
+ 一二 线 城市 虽然 也 处于 调整 中
+
+ icefall_asr_aishell_conformer_ctc/test_waves/BAC009S0764W0123.wav
+ 但 因为 聚集 了 过多 公共 资源
+
+There is a Colab notebook showing you how to run a torch scripted model in C++.
+Please see |aishell asr conformer ctc torch script colab notebook|
+
+.. |aishell asr conformer ctc torch script colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+ :target: https://colab.research.google.com/drive/1Vh7RER7saTW01DtNbvr7CY7ovNZgmfWz?usp=sharing
diff --git a/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-conformer-ctc-tensorboard-log.jpg b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-conformer-ctc-tensorboard-log.jpg
new file mode 100644
index 000000000..c8b26f741
Binary files /dev/null and b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-conformer-ctc-tensorboard-log.jpg differ
diff --git a/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-tdnn-lstm-ctc-tensorboard-log.jpg b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-tdnn-lstm-ctc-tensorboard-log.jpg
new file mode 100644
index 000000000..b31db3ab5
Binary files /dev/null and b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-tdnn-lstm-ctc-tensorboard-log.jpg differ
diff --git a/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png
new file mode 100644
index 000000000..6c84b28f2
Binary files /dev/null and b/docs/source/recipes/Non-streaming-ASR/aishell/images/aishell-transducer_stateless_modified-tensorboard-log.png differ
diff --git a/docs/source/recipes/Non-streaming-ASR/aishell/index.rst b/docs/source/recipes/Non-streaming-ASR/aishell/index.rst
new file mode 100644
index 000000000..b77d59bca
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/index.rst
@@ -0,0 +1,21 @@
+aishell
+=======
+
+Aishell is an open-source Chinese Mandarin speech corpus published by Beijing
+Shell Shell Technology Co., Ltd.
+
+400 people from different accent areas in China were invited to participate in
+the recording, which was conducted in a quiet indoor environment using a high
+fidelity microphone, and the audio was downsampled to 16kHz. Through
+professional speech annotation and strict quality inspection, the manual
+transcription accuracy is above 95%. The data is free for academic use. We
+hope to provide a moderate amount of data for new researchers in the field of
+speech recognition.
+
+It can be downloaded from `<https://www.openslr.org/33>`_
+
+.. toctree::
+ :maxdepth: 1
+
+ tdnn_lstm_ctc
+ conformer_ctc
+ stateless_transducer
diff --git a/docs/source/recipes/Non-streaming-ASR/aishell/stateless_transducer.rst b/docs/source/recipes/Non-streaming-ASR/aishell/stateless_transducer.rst
new file mode 100644
index 000000000..e8137b8c1
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/stateless_transducer.rst
@@ -0,0 +1,714 @@
+Stateless Transducer
+====================
+
+This tutorial shows you how to do transducer training in ``icefall``.
+
+.. HINT::
+
+   We use the term transducer instead of RNN-T or RNN transducer because,
+   as you will see, there are no RNNs in the model.
+
+.. HINT::
+
+   We assume you have read the page :ref:`install icefall` and have set up
+   the environment for ``icefall``.
+
+.. HINT::
+
+   We recommend using a GPU or several GPUs to run this recipe.
+
+In this tutorial, you will learn:
+
+  - (1) What the transducer model looks like
+  - (2) How to prepare data for training and decoding
+  - (3) How to start the training, either with a single GPU or with multiple GPUs
+  - (4) How to do decoding after training, with greedy search, beam search, and **modified beam search**
+  - (5) How to use a pre-trained model provided by us to transcribe sound files
+
+
+The Model
+---------
+
+The transducer model consists of 3 parts:
+
+- **Encoder**: It is a conformer encoder with the following parameters
+
+ - Number of heads: 8
+ - Attention dim: 512
+ - Number of layers: 12
+ - Feedforward dim: 2048
+
+- **Decoder**: We use a stateless model consisting of:
+
+ - An embedding layer with embedding dim 512
+ - A Conv1d layer with a default kernel size 2 (i.e. it sees 2
+ symbols of left-context by default)
+
+- **Joiner**: It consists of an ``nn.Tanh()`` activation followed by an ``nn.Linear()`` layer.
+
+.. Caution::
+
+   The decoder is stateless and very simple. It is borrowed from the paper
+   `Rnn-Transducer with Stateless Prediction Network
+   <https://ieeexplore.ieee.org/document/9054419>`_.
+
+ We make one modification to it: Place a Conv1d layer right after
+ the embedding layer.
+
+When Chinese characters are used as the modelling unit (the vocabulary size
+is 4336 for this dataset), the model has ``87939824`` parameters, i.e.,
+about ``88 M``.
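+
+The following is a minimal sketch of the stateless decoder and the joiner
+described above. The class names and dimensions are illustrative assumptions,
+not the exact icefall implementation:
+
+.. code-block:: python
+
+   import torch
+   import torch.nn as nn
+
+   class StatelessDecoder(nn.Module):
+       def __init__(self, vocab_size=4336, embedding_dim=512, context_size=2):
+           super().__init__()
+           self.embedding = nn.Embedding(vocab_size, embedding_dim)
+           # The Conv1d sees `context_size` previously emitted symbols;
+           # with the default kernel size 2 the decoder behaves like a
+           # tri-gram LM over symbols (see --context-size below).
+           self.conv = nn.Conv1d(embedding_dim, embedding_dim,
+                                 kernel_size=context_size)
+
+       def forward(self, y: torch.Tensor) -> torch.Tensor:
+           # y: (N, context_size), the previously emitted symbols
+           emb = self.embedding(y).permute(0, 2, 1)  # (N, C, context_size)
+           return self.conv(emb).permute(0, 2, 1)    # (N, 1, C)
+
+   class Joiner(nn.Module):
+       def __init__(self, input_dim=512, vocab_size=4336):
+           super().__init__()
+           self.output_linear = nn.Linear(input_dim, vocab_size)
+
+       def forward(self, encoder_out, decoder_out):
+           # Combine the two branches, apply tanh, then project to logits.
+           return self.output_linear(torch.tanh(encoder_out + decoder_out))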
+
+The Loss
+--------
+
+We are using `optimized_transducer <https://github.com/csukuangfj/optimized_transducer>`_
+to compute the transducer loss, which removes extra paddings
+in the loss computation to save memory.
+
+.. Hint::
+
+   ``optimized_transducer`` implements the techniques proposed
+   in `Improving RNN Transducer Modeling for End-to-End Speech Recognition <https://arxiv.org/abs/1909.12415>`_ to save memory.
+
+ Furthermore, it supports ``modified transducer``, limiting the maximum
+ number of symbols that can be emitted per frame to 1, which simplifies
+ the decoding process significantly. Also, the experiment results
+ show that it does not degrade the performance.
+
+   See the `optimized_transducer <https://github.com/csukuangfj/optimized_transducer>`_
+   documentation for what exactly modified transducer is.
+
+   Benchmarks in that repository show that, in the unpruned case,
+   ``optimized_transducer`` has the advantage of minimizing
+   memory usage.
+
+.. todo::
+
+ Add tutorial about ``pruned_transducer_stateless`` that uses k2
+ pruned transducer loss.
+
+.. hint::
+
+ You can use::
+
+ pip install optimized_transducer
+
+   to install ``optimized_transducer``. Refer to
+   `its repository <https://github.com/csukuangfj/optimized_transducer>`_ for other
+   installation alternatives.
+
+Data Preparation
+----------------
+
+To prepare the data for training, please use the following commands:
+
+.. code-block:: bash
+
+ cd egs/aishell/ASR
+ ./prepare.sh --stop-stage 4
+ ./prepare.sh --stage 6 --stop-stage 6
+
+.. note::
+
+ You can use ``./prepare.sh``, though it will generate FSTs that
+ are not used in transducer training.
+
+When you finish running the script, you will get the following two folders:
+
+ - ``data/fbank``: It saves the pre-computed features
+ - ``data/lang_char``: It contains tokens that will be used in the training
+
+Training
+--------
+
+.. code-block:: bash
+
+ cd egs/aishell/ASR
+ ./transducer_stateless_modified/train.py --help
+
+shows you the training options that can be passed from the commandline.
+The following options are used quite often:
+
+ - ``--exp-dir``
+
+ The experiment folder to save logs and model checkpoints,
+ defaults to ``./transducer_stateless_modified/exp``.
+
+ - ``--num-epochs``
+
+ It is the number of epochs to train. For instance,
+ ``./transducer_stateless_modified/train.py --num-epochs 30`` trains for 30
+ epochs and generates ``epoch-0.pt``, ``epoch-1.pt``, ..., ``epoch-29.pt``
+ in the folder set by ``--exp-dir``.
+
+ - ``--start-epoch``
+
+ It's used to resume training.
+ ``./transducer_stateless_modified/train.py --start-epoch 10`` loads the
+ checkpoint from ``exp_dir/epoch-9.pt`` and starts
+ training from epoch 10, based on the state from epoch 9.
+
+ - ``--world-size``
+
+ It is used for single-machine multi-GPU DDP training.
+
+ - (a) If it is 1, then no DDP training is used.
+
+ - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
+
+ The following shows some use cases with it.
+
+ **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
+ GPU 2 for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ export CUDA_VISIBLE_DEVICES="0,2"
+ $ ./transducer_stateless_modified/train.py --world-size 2
+
+ **Use case 2**: You have 4 GPUs and you want to use all of them
+ for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./transducer_stateless_modified/train.py --world-size 4
+
+ **Use case 3**: You have 4 GPUs but you only want to use GPU 3
+ for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ export CUDA_VISIBLE_DEVICES="3"
+ $ ./transducer_stateless_modified/train.py --world-size 1
+
+ .. CAUTION::
+
+ Only single-machine multi-GPU DDP training is implemented at present.
+     There is an ongoing pull request that adds support for
+     multi-machine multi-GPU DDP training.
+
+ - ``--max-duration``
+
+ It specifies the number of seconds over all utterances in a
+ batch **before padding**.
+    If you encounter CUDA OOM, please reduce it. For instance, if
+    you are using a V100 NVIDIA GPU with 32 GB RAM, we recommend
+    setting it to ``300`` when the vocabulary size is 500.
+
+ .. HINT::
+
+ Due to padding, the number of seconds of all utterances in a
+ batch will usually be larger than ``--max-duration``.
+
+ A larger value for ``--max-duration`` may cause OOM during training,
+ while a smaller value may increase the training time. You have to
+ tune it.
+
+ - ``--lr-factor``
+
+ It controls the learning rate. If you use a single GPU for training, you
+ may want to use a small value for it. If you use multiple GPUs for training,
+ you may increase it.
+
+ - ``--context-size``
+
+ It specifies the kernel size in the decoder. The default value 2 means it
+ functions as a tri-gram LM.
+
+ - ``--modified-transducer-prob``
+
+ It specifies the probability to use modified transducer loss.
+ If it is 0, then no modified transducer is used; if it is 1,
+ then it uses modified transducer loss for all batches. If it is
+ ``p``, it applies modified transducer with probability ``p``.
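+
+The per-batch decision controlled by ``--modified-transducer-prob`` can be
+sketched as below; the helper is illustrative, and the real training code
+calls the corresponding loss function instead of returning a string:
+
+.. code-block:: python
+
+   import random
+
+   def choose_loss(modified_transducer_prob: float) -> str:
+       if random.random() < modified_transducer_prob:
+           return "modified"  # at most one symbol emitted per frame
+       return "standard"
+
+   counts = {"modified": 0, "standard": 0}
+   for _ in range(1000):
+       counts[choose_loss(0.25)] += 1
+   print(counts)  # roughly {'modified': 250, 'standard': 750}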
+
+There are some training options, e.g.,
+number of warmup steps,
+that are not passed from the commandline.
+They are pre-configured by the function ``get_params()`` in
+`transducer_stateless_modified/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/aishell/ASR/transducer_stateless_modified/train.py>`_
+
+If you need to change them, please modify ``./transducer_stateless_modified/train.py`` directly.
+
+.. CAUTION::
+
+   The training set is speed perturbed with factors 0.9 and 1.1. Together
+   with the original data, each epoch actually processes ``3x150 == 450``
+   hours of data.
+
+Training logs
+~~~~~~~~~~~~~
+
+Training logs and checkpoints are saved in the folder set by ``--exp-dir``
+(defaults to ``transducer_stateless_modified/exp``). You will find the following files in that directory:
+
+ - ``epoch-0.pt``, ``epoch-1.pt``, ...
+
+ These are checkpoint files, containing model ``state_dict`` and optimizer ``state_dict``.
+ To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
+
+ .. code-block:: bash
+
+ $ ./transducer_stateless_modified/train.py --start-epoch 11
+
+ - ``tensorboard/``
+
+ This folder contains TensorBoard logs. Training loss, validation loss, learning
+ rate, etc, are recorded in these logs. You can visualize them by:
+
+ .. code-block:: bash
+
+ $ cd transducer_stateless_modified/exp/tensorboard
+ $ tensorboard dev upload --logdir . --name "Aishell transducer training with icefall" --description "Training modified transducer, see https://github.com/k2-fsa/icefall/pull/219"
+
+ It will print something like below:
+
+ .. code-block::
+
+ TensorFlow installation not found - running with reduced feature set.
+ Upload started and will continue reading any new data as it's added to the logdir.
+
+ To stop uploading, press Ctrl-C.
+
+ New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/laGZ6HrcQxOigbFD5E0Y3Q/
+
+ [2022-03-03T14:29:45] Started scanning logdir.
+ [2022-03-03T14:29:48] Total uploaded: 8477 scalars, 0 tensors, 0 binary objects
+ Listening for new data in logdir...
+
+   Note there is a URL in the above output. Click it and you will see
+   the following screenshot:
+
+ .. figure:: images/aishell-transducer_stateless_modified-tensorboard-log.png
+ :width: 600
+ :alt: TensorBoard screenshot
+ :align: center
+ :target: https://tensorboard.dev/experiment/laGZ6HrcQxOigbFD5E0Y3Q
+
+ TensorBoard screenshot.
+
+ - ``log/log-train-xxxx``
+
+ It is the detailed training log in text format, same as the one
+ you saw printed to the console during training.
+
+Usage examples
+~~~~~~~~~~~~~~
+
+The following shows typical use cases:
+
+**Case 1**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./transducer_stateless_modified/train.py --max-duration 250
+
+It uses ``--max-duration`` of 250 to avoid OOM.
+
+
+**Case 2**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ export CUDA_VISIBLE_DEVICES="0,3"
+ $ ./transducer_stateless_modified/train.py --world-size 2
+
+It uses GPU 0 and GPU 3 for DDP training.
+
+**Case 3**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./transducer_stateless_modified/train.py --num-epochs 10 --start-epoch 3
+
+It loads checkpoint ``./transducer_stateless_modified/exp/epoch-2.pt`` and starts
+training from epoch 3. Also, it trains for 10 epochs.
+
+Decoding
+--------
+
+The decoding part uses checkpoints saved by the training part, so you have
+to run the training part first.
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./transducer_stateless_modified/decode.py --help
+
+shows the options for decoding.
+
+The commonly used options are:
+
+ - ``--method``
+
+ This specifies the decoding method. Currently, it supports:
+
+ - **greedy_search**. You can provide the commandline option ``--max-sym-per-frame``
+ to limit the maximum number of symbols that can be emitted per frame.
+
+ - **beam_search**. You can provide the commandline option ``--beam-size``.
+
+ - **modified_beam_search**. You can also provide the commandline option ``--beam-size``.
+ To use this method, we assume that you have trained your model with modified transducer,
+ i.e., used the option ``--modified-transducer-prob`` in the training.
+
+ The following command uses greedy search for decoding
+
+ .. code-block::
+
+ $ cd egs/aishell/ASR
+ $ ./transducer_stateless_modified/decode.py \
+ --epoch 64 \
+ --avg 33 \
+ --exp-dir ./transducer_stateless_modified/exp \
+ --max-duration 100 \
+ --decoding-method greedy_search \
+ --max-sym-per-frame 1
+
+ The following command uses beam search for decoding
+
+ .. code-block::
+
+ $ cd egs/aishell/ASR
+ $ ./transducer_stateless_modified/decode.py \
+ --epoch 64 \
+ --avg 33 \
+ --exp-dir ./transducer_stateless_modified/exp \
+ --max-duration 100 \
+ --decoding-method beam_search \
+ --beam-size 4
+
+ The following command uses ``modified`` beam search for decoding
+
+ .. code-block::
+
+ $ cd egs/aishell/ASR
+ $ ./transducer_stateless_modified/decode.py \
+ --epoch 64 \
+ --avg 33 \
+ --exp-dir ./transducer_stateless_modified/exp \
+ --max-duration 100 \
+ --decoding-method modified_beam_search \
+ --beam-size 4
+
+ - ``--max-duration``
+
+ It has the same meaning as the one used in training. A larger
+ value may cause OOM.
+
+ - ``--epoch``
+
+    It specifies the epoch of the checkpoint to be used for decoding.
+
+ - ``--avg``
+
+    It specifies the number of models to average. For instance, if it is 3 and if
+    ``--epoch=10``, then it averages the checkpoints ``epoch-8.pt``, ``epoch-9.pt``,
+    and ``epoch-10.pt`` and uses the averaged checkpoint for decoding, as sketched below.
+
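+Checkpoint averaging can be sketched as follows, under the assumption that
+each ``epoch-*.pt`` file stores the model weights under a ``"model"`` key
+(see ``icefall/checkpoint.py`` for the real implementation):
+
+.. code-block:: python
+
+   import torch
+
+   def average_checkpoints(filenames):
+       avg = torch.load(filenames[0], map_location="cpu")["model"]
+       for f in filenames[1:]:
+           state = torch.load(f, map_location="cpu")["model"]
+           for k in avg:
+               avg[k] += state[k]
+       for k in avg:
+           if avg[k].is_floating_point():
+               avg[k] /= len(filenames)
+       return avg
+
+   avg = average_checkpoints([f"exp/epoch-{i}.pt" for i in (8, 9, 10)])
+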
+After decoding, you can find the decoding logs and results in ``exp_dir/log``, e.g.,
+``exp_dir/log/greedy_search``.
+
+Pre-trained Model
+-----------------
+
+We have uploaded a pre-trained model to
+`<https://huggingface.co/csukuangfj/icefall-aishell-transducer-stateless-modified-2022-03-01>`_
+
+We describe how to use the pre-trained model to transcribe a sound file or
+multiple sound files in the following.
+
+Install kaldifeat
+~~~~~~~~~~~~~~~~~
+
+`kaldifeat <https://github.com/csukuangfj/kaldifeat>`_ is used to
+extract features for a single sound file or multiple sound files
+at the same time.
+
+Please refer to `<https://github.com/csukuangfj/kaldifeat>`_ for installation.
+
+Download the pre-trained model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following commands describe how to download the pre-trained model:
+
+.. code-block::
+
+ $ cd egs/aishell/ASR
+ $ mkdir tmp
+ $ cd tmp
+ $ git lfs install
+ $ git clone https://huggingface.co/csukuangfj/icefall-aishell-transducer-stateless-modified-2022-03-01
+
+
+.. CAUTION::
+
+ You have to use ``git lfs`` to download the pre-trained model.
+
+After downloading, you will have the following files:
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ tree tmp/icefall-aishell-transducer-stateless-modified-2022-03-01
+
+
+.. code-block:: bash
+
+ tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/
+ |-- README.md
+ |-- data
+ | `-- lang_char
+ | |-- L.pt
+ | |-- lexicon.txt
+ | |-- tokens.txt
+ | `-- words.txt
+ |-- exp
+ | `-- pretrained.pt
+ |-- log
+ | |-- errs-test-beam_4-epoch-64-avg-33-beam-4.txt
+ | |-- errs-test-greedy_search-epoch-64-avg-33-context-2-max-sym-per-frame-1.txt
+ | |-- log-decode-epoch-64-avg-33-beam-4-2022-03-02-12-05-03
+ | |-- log-decode-epoch-64-avg-33-context-2-max-sym-per-frame-1-2022-02-28-18-13-07
+ | |-- recogs-test-beam_4-epoch-64-avg-33-beam-4.txt
+ | `-- recogs-test-greedy_search-epoch-64-avg-33-context-2-max-sym-per-frame-1.txt
+ `-- test_wavs
+ |-- BAC009S0764W0121.wav
+ |-- BAC009S0764W0122.wav
+ |-- BAC009S0764W0123.wav
+ `-- transcript.txt
+
+ 5 directories, 16 files
+
+
+**File descriptions**:
+
+ - ``data/lang_char``
+
+ It contains language related files. You can find the vocabulary size in ``tokens.txt``.
+
+ - ``exp/pretrained.pt``
+
+ It contains pre-trained model parameters, obtained by averaging
+ checkpoints from ``epoch-32.pt`` to ``epoch-64.pt``.
+ Note: We have removed optimizer ``state_dict`` to reduce file size.
+
+ - ``log``
+
+ It contains decoding logs and decoded results.
+
+ - ``test_wavs``
+
+ It contains some test sound files from Aishell ``test`` dataset.
+
+Information about the test sound files is listed below:
+
+.. code-block:: bash
+
+ $ soxi tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/*.wav
+
+ Input File : 'tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:04.20 = 67263 samples ~ 315.295 CDDA sectors
+ File Size : 135k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+
+ Input File : 'tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:04.12 = 65840 samples ~ 308.625 CDDA sectors
+ File Size : 132k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+
+ Input File : 'tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:04.00 = 64000 samples ~ 300 CDDA sectors
+ File Size : 128k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+ Total Duration of 3 files: 00:00:12.32
+
+Usage
+~~~~~
+
+.. code-block::
+
+ $ cd egs/aishell/ASR
+ $ ./transducer_stateless_modified/pretrained.py --help
+
+displays the help information.
+
+It supports three decoding methods:
+
+ - greedy search
+ - beam search
+ - modified beam search
+
+.. note::
+
+   Modified beam search limits the maximum number of symbols that can be
+   emitted per frame to 1. To use this method, you have to ensure that your model
+   has been trained with the option ``--modified-transducer-prob``. Otherwise,
+   it may give you poor results.
+
+Greedy search
+^^^^^^^^^^^^^
+
+The command to run greedy search is given below:
+
+.. code-block:: bash
+
+
+ $ cd egs/aishell/ASR
+ $ ./transducer_stateless_modified/pretrained.py \
+ --checkpoint ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt \
+ --lang-dir ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char \
+ --method greedy_search \
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav \
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav \
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav
+
+The output is as follows:
+
+.. code-block::
+
+ 2022-03-03 15:35:26,531 INFO [pretrained.py:239] device: cuda:0
+ 2022-03-03 15:35:26,994 INFO [lexicon.py:176] Loading pre-compiled tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char/Linv.pt
+ 2022-03-03 15:35:27,027 INFO [pretrained.py:246] {'feature_dim': 80, 'encoder_out_dim': 512, 'subsampling_factor': 4, 'attention_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'vgg_frontend': False, 'env_info': {'k2-version': '1.13', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f4fefe4882bc0ae59af951da3f47335d5495ef71', 'k2-git-date': 'Thu Feb 10 15:16:02 2022', 'lhotse-version': '1.0.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '50d2281-clean', 'icefall-git-date': 'Wed Mar 2 16:02:38 2022', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-aishell', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-multi-datasets/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-aishell/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-2-0815224919-75d558775b-mmnv8', 'IP address': '10.177.72.138'}, 'sample_rate': 16000, 'checkpoint': './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt', 'lang_dir': PosixPath('tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char'), 'method': 'greedy_search', 'sound_files': ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'], 'beam_size': 4, 'context_size': 2, 'max_sym_per_frame': 3, 'blank_id': 0, 'vocab_size': 4336}
+ 2022-03-03 15:35:27,027 INFO [pretrained.py:248] About to create model
+ 2022-03-03 15:35:36,878 INFO [pretrained.py:257] Constructing Fbank computer
+ 2022-03-03 15:35:36,880 INFO [pretrained.py:267] Reading sound files: ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav']
+ 2022-03-03 15:35:36,891 INFO [pretrained.py:273] Decoding started
+ /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/conformer.py:113: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
+ lengths = ((x_lens - 1) // 2 - 1) // 2
+ 2022-03-03 15:35:37,163 INFO [pretrained.py:320]
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav:
+ 甚 至 出 现 交 易 几 乎 停 滞 的 情 况
+
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav:
+ 一 二 线 城 市 虽 然 也 处 于 调 整 中
+
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav:
+ 但 因 为 聚 集 了 过 多 公 共 资 源
+
+ 2022-03-03 15:35:37,163 INFO [pretrained.py:322] Decoding Done
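+
+For reference, greedy search for this stateless transducer can be sketched
+as follows. It assumes the decoder/joiner interfaces from the sketch in
+*The Model* section and is illustrative, not the exact icefall code:
+
+.. code-block:: python
+
+   import torch
+
+   @torch.no_grad()
+   def greedy_search(model, encoder_out, max_sym_per_frame=1):
+       """encoder_out: (T, C) encoder output for one utterance."""
+       blank_id = 0
+       context_size = 2
+       hyp = [blank_id] * context_size  # prime the decoder with blanks
+       for t in range(encoder_out.size(0)):
+           for _ in range(max_sym_per_frame):
+               decoder_in = torch.tensor([hyp[-context_size:]])
+               decoder_out = model.decoder(decoder_in).squeeze()   # (C,)
+               logits = model.joiner(encoder_out[t], decoder_out)  # (vocab,)
+               y = logits.argmax().item()
+               if y == blank_id:
+                   break  # move on to the next frame
+               hyp.append(y)
+       return hyp[context_size:]  # strip the priming blanks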
+
+Beam search
+^^^^^^^^^^^
+
+The command to run beam search is given below:
+
+.. code-block:: bash
+
+
+ $ cd egs/aishell/ASR
+
+ $ ./transducer_stateless_modified/pretrained.py \
+ --checkpoint ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt \
+ --lang-dir ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char \
+ --method beam_search \
+ --beam-size 4 \
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav \
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav \
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav
+
+The output is as follows:
+
+.. code-block::
+
+ 2022-03-03 15:39:09,285 INFO [pretrained.py:239] device: cuda:0
+ 2022-03-03 15:39:09,708 INFO [lexicon.py:176] Loading pre-compiled tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char/Linv.pt
+ 2022-03-03 15:39:09,759 INFO [pretrained.py:246] {'feature_dim': 80, 'encoder_out_dim': 512, 'subsampling_factor': 4, 'attention_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'vgg_frontend': False, 'env_info': {'k2-version': '1.13', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f4fefe4882bc0ae59af951da3f47335d5495ef71', 'k2-git-date': 'Thu Feb 10 15:16:02 2022', 'lhotse-version': '1.0.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '50d2281-clean', 'icefall-git-date': 'Wed Mar 2 16:02:38 2022', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-aishell', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-multi-datasets/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-aishell/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-2-0815224919-75d558775b-mmnv8', 'IP address': '10.177.72.138'}, 'sample_rate': 16000, 'checkpoint': './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt', 'lang_dir': PosixPath('tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char'), 'method': 'beam_search', 'sound_files': ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'], 'beam_size': 4, 'context_size': 2, 'max_sym_per_frame': 3, 'blank_id': 0, 'vocab_size': 4336}
+ 2022-03-03 15:39:09,760 INFO [pretrained.py:248] About to create model
+ 2022-03-03 15:39:18,919 INFO [pretrained.py:257] Constructing Fbank computer
+ 2022-03-03 15:39:18,922 INFO [pretrained.py:267] Reading sound files: ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav']
+ 2022-03-03 15:39:18,929 INFO [pretrained.py:273] Decoding started
+ /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/conformer.py:113: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
+ lengths = ((x_lens - 1) // 2 - 1) // 2
+ 2022-03-03 15:39:21,046 INFO [pretrained.py:320]
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav:
+ 甚 至 出 现 交 易 几 乎 停 滞 的 情 况
+
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav:
+ 一 二 线 城 市 虽 然 也 处 于 调 整 中
+
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav:
+ 但 因 为 聚 集 了 过 多 公 共 资 源
+
+ 2022-03-03 15:39:21,047 INFO [pretrained.py:322] Decoding Done
+
+Modified Beam search
+^^^^^^^^^^^^^^^^^^^^
+
+The command to run modified beam search is given below:
+
+.. code-block:: bash
+
+
+ $ cd egs/aishell/ASR
+
+ $ ./transducer_stateless_modified/pretrained.py \
+ --checkpoint ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt \
+ --lang-dir ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char \
+ --method modified_beam_search \
+ --beam-size 4 \
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav \
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav \
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav
+
+The output is as follows:
+
+.. code-block::
+
+ 2022-03-03 15:41:23,319 INFO [pretrained.py:239] device: cuda:0
+ 2022-03-03 15:41:23,798 INFO [lexicon.py:176] Loading pre-compiled tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char/Linv.pt
+ 2022-03-03 15:41:23,831 INFO [pretrained.py:246] {'feature_dim': 80, 'encoder_out_dim': 512, 'subsampling_factor': 4, 'attention_dim': 512, 'nhead': 8, 'dim_feedforward': 2048, 'num_encoder_layers': 12, 'vgg_frontend': False, 'env_info': {'k2-version': '1.13', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': 'f4fefe4882bc0ae59af951da3f47335d5495ef71', 'k2-git-date': 'Thu Feb 10 15:16:02 2022', 'lhotse-version': '1.0.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '50d2281-clean', 'icefall-git-date': 'Wed Mar 2 16:02:38 2022', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-aishell', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-multi-datasets/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-aishell/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-2-0815224919-75d558775b-mmnv8', 'IP address': '10.177.72.138'}, 'sample_rate': 16000, 'checkpoint': './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/exp/pretrained.pt', 'lang_dir': PosixPath('tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/data/lang_char'), 'method': 'modified_beam_search', 'sound_files': ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav'], 'beam_size': 4, 'context_size': 2, 'max_sym_per_frame': 3, 'blank_id': 0, 'vocab_size': 4336}
+ 2022-03-03 15:41:23,831 INFO [pretrained.py:248] About to create model
+ 2022-03-03 15:41:32,214 INFO [pretrained.py:257] Constructing Fbank computer
+ 2022-03-03 15:41:32,215 INFO [pretrained.py:267] Reading sound files: ['./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav', './tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav']
+ 2022-03-03 15:41:32,220 INFO [pretrained.py:273] Decoding started
+ /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/conformer.py:113: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
+ lengths = ((x_lens - 1) // 2 - 1) // 2
+ /ceph-fj/fangjun/open-source-2/icefall-aishell/egs/aishell/ASR/transducer_stateless_modified/beam_search.py:402: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
+ topk_hyp_indexes = topk_indexes // logits.size(-1)
+ 2022-03-03 15:41:32,583 INFO [pretrained.py:320]
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0121.wav:
+ 甚 至 出 现 交 易 几 乎 停 滞 的 情 况
+
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0122.wav:
+ 一 二 线 城 市 虽 然 也 处 于 调 整 中
+
+ ./tmp/icefall-aishell-transducer-stateless-modified-2022-03-01/test_wavs/BAC009S0764W0123.wav:
+ 但 因 为 聚 集 了 过 多 公 共 资 源
+
+ 2022-03-03 15:41:32,583 INFO [pretrained.py:322] Decoding Done
+
+Colab notebook
+--------------
+
+We provide a colab notebook for this recipe showing how to use a pre-trained model to
+transcribe sound files.
+
+|aishell asr stateless modified transducer colab notebook|
+
+.. |aishell asr stateless modified transducer colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+ :target: https://colab.research.google.com/drive/12jpTxJB44vzwtcmJl2DTdznW0OawPb9H?usp=sharing
diff --git a/docs/source/recipes/Non-streaming-ASR/aishell/tdnn_lstm_ctc.rst b/docs/source/recipes/Non-streaming-ASR/aishell/tdnn_lstm_ctc.rst
new file mode 100644
index 000000000..8e56deb6a
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/aishell/tdnn_lstm_ctc.rst
@@ -0,0 +1,504 @@
+TDNN-LSTM CTC
+=============
+
+This tutorial shows you how to run a TDNN-LSTM CTC model
+with the `Aishell <https://www.openslr.org/33>`_ dataset.
+
+
+.. HINT::
+
+ We assume you have read the page :ref:`install icefall` and have set up
+ the environment for ``icefall``.
+
+.. HINT::
+
+ We recommend using one or more GPUs to run this recipe.
+
+In this tutorial, you will learn:
+
+ - (1) How to prepare data for training and decoding
+ - (2) How to start the training, either with a single GPU or multiple GPUs
+ - (3) How to do decoding after training.
+ - (4) How to use a pre-trained model, provided by us
+
+Data preparation
+----------------
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./prepare.sh
+
+The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
+All you need to do is run it.
+
+The data preparation contains several stages. You can use the following two
+options:
+
+ - ``--stage``
+ - ``--stop-stage``
+
+to control which stage(s) should be run. By default, all stages are executed.
+
+
+For example,
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./prepare.sh --stage 0 --stop-stage 0
+
+means to run only stage 0.
+
+To run stage 2 to stage 5, use:
+
+.. code-block:: bash
+
+ $ ./prepare.sh --stage 2 --stop-stage 5
+
+.. HINT::
+
+ If you have pre-downloaded the `Aishell <https://www.openslr.org/33>`_
+ dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
+ to ``/tmp/aishell`` and ``/tmp/musan``, you can modify
+ the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
+ ``./prepare.sh`` won't re-download them.
+
+.. HINT::
+
+ A 3-gram language model will be downloaded from HuggingFace. We assume you have
+ installed and initialized ``git-lfs``. If not, you can install ``git-lfs`` by:
+
+ .. code-block:: bash
+
+ $ sudo apt-get install git-lfs
+ $ git-lfs install
+
+ If you don't have ``sudo`` permission, you can download the
+ `git-lfs binary <https://github.com/git-lfs/git-lfs/releases>`_ and add it to your ``PATH``.
+
+.. NOTE::
+
+ All files generated by ``./prepare.sh``, e.g., features, lexicon, etc.,
+ are saved in the ``./data`` directory.
+
+
+Training
+--------
+
+Configurable options
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./tdnn_lstm_ctc/train.py --help
+
+shows you the training options that can be passed from the command line.
+The following options are used quite often:
+
+
+ - ``--num-epochs``
+
+ It is the number of epochs to train. For instance,
+ ``./tdnn_lstm_ctc/train.py --num-epochs 30`` trains for 30 epochs
+ and generates ``epoch-0.pt``, ``epoch-1.pt``, ..., ``epoch-29.pt``
+ in the folder ``./tdnn_lstm_ctc/exp``.
+
+ - ``--start-epoch``
+
+ It's used to resume training.
+ ``./tdnn_lstm_ctc/train.py --start-epoch 10`` loads the
+ checkpoint ``./tdnn_lstm_ctc/exp/epoch-9.pt`` and starts
+ training from epoch 10, based on the state from epoch 9.
+
+ - ``--world-size``
+
+ It is used for multi-GPU single-machine DDP training.
+
+ - (a) If it is 1, then no DDP training is used.
+
+ - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
+
+ The following shows some use cases with it.
+
+ **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
+ GPU 2 for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ export CUDA_VISIBLE_DEVICES="0,2"
+ $ ./tdnn_lstm_ctc/train.py --world-size 2
+
+ **Use case 2**: You have 4 GPUs and you want to use all of them
+ for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./tdnn_lstm_ctc/train.py --world-size 4
+
+ **Use case 3**: You have 4 GPUs but you only want to use GPU 3
+ for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ export CUDA_VISIBLE_DEVICES="3"
+ $ ./tdnn_lstm_ctc/train.py --world-size 1
+
+ .. CAUTION::
+
+ Only multi-GPU single-machine DDP training is implemented at present.
+ Multi-GPU multi-machine DDP training will be added later.
+
+ - ``--max-duration``
+
+ It specifies the total duration (in seconds) of all utterances in a
+ batch, before **padding**.
+ If you encounter CUDA OOM, reduce it. For instance, if you
+ are using a V100 NVIDIA GPU, we recommend setting it to ``2000``.
+
+ .. HINT::
+
+ Due to padding, the number of seconds of all utterances in a
+ batch will usually be larger than ``--max-duration``.
+
+ A larger value for ``--max-duration`` may cause OOM during training,
+ while a smaller value may increase the training time. You have to
+ tune it. A conceptual sketch of duration-based batching is given below.
+
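+The idea behind duration-based batching is simple: keep adding utterances to a
+batch until their total duration would exceed ``--max-duration``. The following
+is a minimal sketch of that idea; it is **not** lhotse's actual sampler, and
+the durations are made up for illustration:
+
+.. code-block:: python
+
+   # a toy duration-based batcher; the real batching is done by lhotse samplers
+   durations = [3.2, 4.1, 2.7, 5.0, 3.3]  # per-utterance seconds (hypothetical)
+   max_duration = 10.0
+
+   batches, cur, cur_dur = [], [], 0.0
+   for d in durations:
+       if cur and cur_dur + d > max_duration:
+           batches.append(cur)
+           cur, cur_dur = [], 0.0
+       cur.append(d)
+       cur_dur += d
+   if cur:
+       batches.append(cur)
+   print(batches)  # [[3.2, 4.1, 2.7], [5.0, 3.3]]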
+
+Pre-configured options
+~~~~~~~~~~~~~~~~~~~~~~
+
+There are some training options, e.g., weight decay,
+number of warmup steps, results dir, etc.,
+that are not passed from the command line.
+They are pre-configured by the function ``get_params()`` in
+`tdnn_lstm_ctc/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/aishell/ASR/tdnn_lstm_ctc/train.py>`_.
+
+You don't need to change these pre-configured parameters. If you really need to change
+them, please modify ``./tdnn_lstm_ctc/train.py`` directly.
+
+
+.. CAUTION::
+
+ The training set is speed-perturbed with two factors: 0.9 and 1.1, so
+ each epoch actually processes ``3 x 150 == 450`` hours of data.
+
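+The speed-perturbed copies are created at the manifest level before feature
+extraction. A minimal sketch with lhotse, assuming a manifest produced by
+``./prepare.sh`` (the path below is hypothetical):
+
+.. code-block:: python
+
+   from lhotse import CutSet
+
+   # hypothetical manifest path; ./prepare.sh writes the real one under ./data
+   cuts = CutSet.from_file("data/fbank/cuts_train.jsonl.gz")
+   # original + 0.9x + 1.1x speed, i.e., 3x the original amount of data
+   cuts_sp = cuts + cuts.perturb_speed(0.9) + cuts.perturb_speed(1.1)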
+
+Training logs
+~~~~~~~~~~~~~
+
+Training logs and checkpoints are saved in ``tdnn_lstm_ctc/exp``.
+You will find the following files in that directory:
+
+ - ``epoch-0.pt``, ``epoch-1.pt``, ...
+
+ These are checkpoint files, containing model ``state_dict`` and optimizer ``state_dict``.
+ To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
+
+ .. code-block:: bash
+
+ $ ./tdnn_lstm_ctc/train.py --start-epoch 11
+
+ - ``tensorboard/``
+
+ This folder contains TensorBoard logs. Training loss, validation loss, learning
+ rate, etc, are recorded in these logs. You can visualize them by:
+
+ .. code-block:: bash
+
+ $ cd tdnn_lstm_ctc/exp/tensorboard
+ $ tensorboard dev upload --logdir . --description "TDNN-LSTM CTC training for Aishell with icefall"
+
+ It will print something like the following:
+
+ .. code-block::
+
+ TensorFlow installation not found - running with reduced feature set.
+ Upload started and will continue reading any new data as it's added to the logdir.
+
+ To stop uploading, press Ctrl-C.
+
+ New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/LJI9MWUORLOw3jkdhxwk8A/
+
+ [2021-09-13T11:59:23] Started scanning logdir.
+ [2021-09-13T11:59:24] Total uploaded: 4454 scalars, 0 tensors, 0 binary objects
+ Listening for new data in logdir...
+
+ Note there is a URL in the above output, click it and you will see
+ the following screenshot:
+
+ .. figure:: images/aishell-tdnn-lstm-ctc-tensorboard-log.jpg
+ :width: 600
+ :alt: TensorBoard screenshot
+ :align: center
+ :target: https://tensorboard.dev/experiment/LJI9MWUORLOw3jkdhxwk8A/
+
+ TensorBoard screenshot.
+
+ - ``log/log-train-xxxx``
+
+ It is the detailed training log in text format, the same as the one
+ printed to the console during training.
+
+Usage examples
+~~~~~~~~~~~~~~
+
+The following shows typical use cases:
+
+**Case 1**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ export CUDA_VISIBLE_DEVICES="0,3"
+ $ ./tdnn_lstm_ctc/train.py --world-size 2
+
+It uses GPU 0 and GPU 3 for DDP training.
+
+**Case 2**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./tdnn_lstm_ctc/train.py --num-epochs 10 --start-epoch 3
+
+It loads checkpoint ``./tdnn_lstm_ctc/exp/epoch-2.pt`` and starts
+training from epoch 3. Also, it trains for 10 epochs.
+
+Decoding
+--------
+
+The decoding part uses checkpoints saved by the training part, so you have
+to run the training part first.
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./tdnn_lstm_ctc/decode.py --help
+
+shows the options for decoding.
+
+The commonly used options are:
+
+ - ``--method``
+
+ This specifies the decoding method.
+
+ For example, the following command decodes with the ``1best`` method:
+
+ .. code-block::
+
+ $ cd egs/aishell/ASR
+ $ ./tdnn_lstm_ctc/decode.py --method 1best --max-duration 100
+
+ - ``--max-duration``
+
+ It has the same meaning as the one during training. A larger
+ value may cause OOM.
+
+Pre-trained Model
+-----------------
+
+We have uploaded a pre-trained model to
+`<https://huggingface.co/pkufool/icefall_asr_aishell_tdnn_lstm_ctc>`_.
+
+We describe how to use the pre-trained model to transcribe a sound file or
+multiple sound files in the following.
+
+Install kaldifeat
+~~~~~~~~~~~~~~~~~
+
+`kaldifeat <https://github.com/csukuangfj/kaldifeat>`_ is used to
+extract features for a single sound file or multiple sound files
+at the same time.
+
+Please refer to `<https://github.com/csukuangfj/kaldifeat>`_ for installation.
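+
+Once installed, feature extraction follows the pattern below. This is a sketch
+of kaldifeat's Python API with the fbank settings commonly used in icefall
+recipes (80-dim fbank at 16 kHz); the waveform is a random placeholder:
+
+.. code-block:: python
+
+   import torch
+   import kaldifeat
+
+   opts = kaldifeat.FbankOptions()
+   opts.frame_opts.samp_freq = 16000
+   opts.mel_opts.num_bins = 80
+   fbank = kaldifeat.Fbank(opts)
+
+   wave = torch.rand(16000) - 0.5  # one second of fake audio
+   features = fbank(wave)          # shape: (num_frames, 80)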
+
+Download the pre-trained model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following commands describe how to download the pre-trained model:
+
+.. code-block::
+
+ $ cd egs/aishell/ASR
+ $ mkdir tmp
+ $ cd tmp
+ $ git lfs install
+ $ git clone https://huggingface.co/pkufool/icefall_asr_aishell_tdnn_lstm_ctc
+
+.. CAUTION::
+
+ You have to use ``git lfs`` to download the pre-trained model.
+
+.. CAUTION::
+
+ In order to use this pre-trained model, your k2 version has to be v1.7 or later.
+
+After downloading, you will have the following files:
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ tree tmp
+
+.. code-block:: bash
+
+ tmp/
+ `-- icefall_asr_aishell_tdnn_lstm_ctc
+ |-- README.md
+ |-- data
+ | `-- lang_phone
+ | |-- HLG.pt
+ | |-- tokens.txt
+ | `-- words.txt
+ |-- exp
+ | `-- pretrained.pt
+ `-- test_waves
+ |-- BAC009S0764W0121.wav
+ |-- BAC009S0764W0122.wav
+ |-- BAC009S0764W0123.wav
+ `-- trans.txt
+
+ 5 directories, 9 files
+
+**File descriptions**:
+
+ - ``data/lang_phone/HLG.pt``
+
+ It is the decoding graph (see the loading sketch after this list).
+
+ - ``data/lang_phone/tokens.txt``
+
+ It contains tokens and their IDs.
+ Provided only for convenience so that you can map token IDs to symbols easily.
+
+ - ``data/lang_phone/words.txt``
+
+ It contains words and their IDs.
+
+ - ``exp/pretrained.pt``
+
+ It contains pre-trained model parameters, obtained by averaging
+ checkpoints from ``epoch-18.pt`` to ``epoch-40.pt``.
+ Note: We have removed optimizer ``state_dict`` to reduce file size.
+
+ - ``test_waves/*.wav``
+
+ It contains some test sound files from the Aishell ``test`` dataset.
+
+ - ``test_waves/trans.txt``
+
+ It contains the reference transcripts for the sound files in ``test_waves/``.
+
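+If you want to inspect ``HLG.pt`` yourself, it can be loaded as follows. This
+is a minimal sketch; icefall's decoding scripts restore the graph the same way:
+
+.. code-block:: python
+
+   import torch
+   import k2
+
+   d = torch.load(
+       "tmp/icefall_asr_aishell_tdnn_lstm_ctc/data/lang_phone/HLG.pt",
+       map_location="cpu",
+   )
+   HLG = k2.Fsa.from_dict(d)
+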
+Information about the test sound files is listed below:
+
+.. code-block:: bash
+
+ $ soxi tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/*.wav
+
+ Input File : 'tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0121.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:04.20 = 67263 samples ~ 315.295 CDDA sectors
+ File Size : 135k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+
+ Input File : 'tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0122.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:04.12 = 65840 samples ~ 308.625 CDDA sectors
+ File Size : 132k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+
+ Input File : 'tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0123.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:04.00 = 64000 samples ~ 300 CDDA sectors
+ File Size : 128k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+ Total Duration of 3 files: 00:00:12.32
+
+Usage
+~~~~~
+
+.. code-block::
+
+ $ cd egs/aishell/ASR
+ $ ./tdnn_lstm_ctc/pretrained.py --help
+
+displays the help information.
+
+
+HLG decoding
+^^^^^^^^^^^^
+
+HLG decoding uses the best path of the decoding lattice as the decoding result.
+
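+Under the hood, ``1best`` means taking the best-scoring path of the decoding
+lattice with ``k2``. The following is a minimal, self-contained sketch of that
+operation on a toy lattice (arc scores are made up); the real lattice comes
+from intersecting ``HLG`` with the network output:
+
+.. code-block:: python
+
+   import k2
+
+   # a toy lattice: two competing arcs from state 0 to state 1
+   s = "\n".join(["0 1 1 0.5", "0 1 2 0.1", "1 2 -1 0.0", "2"])
+   lattice = k2.create_fsa_vec([k2.Fsa.from_str(s)])
+   best_path = k2.shortest_path(lattice, use_double_scores=True)
+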
+The command to run HLG decoding is:
+
+.. code-block:: bash
+
+ $ cd egs/aishell/ASR
+ $ ./tdnn_lstm_ctc/pretrained.py \
+ --checkpoint ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/exp/pretrained.pt \
+ --words-file ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/data/lang_phone/words.txt \
+ --HLG ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/data/lang_phone/HLG.pt \
+ --method 1best \
+ ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0121.wav \
+ ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0122.wav \
+ ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0123.wav
+
+The output is given below:
+
+.. code-block::
+
+ 2021-09-13 15:00:55,858 INFO [pretrained.py:140] device: cuda:0
+ 2021-09-13 15:00:55,858 INFO [pretrained.py:142] Creating model
+ 2021-09-13 15:01:05,389 INFO [pretrained.py:154] Loading HLG from ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/data/lang_phone/HLG.pt
+ 2021-09-13 15:01:06,531 INFO [pretrained.py:161] Constructing Fbank computer
+ 2021-09-13 15:01:06,536 INFO [pretrained.py:171] Reading sound files: ['./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0121.wav', './tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0122.wav', './tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0123.wav']
+ 2021-09-13 15:01:06,539 INFO [pretrained.py:177] Decoding started
+ 2021-09-13 15:01:06,917 INFO [pretrained.py:207] Use HLG decoding
+ 2021-09-13 15:01:07,129 INFO [pretrained.py:220]
+ ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0121.wav:
+ 甚至 出现 交易 几乎 停滞 的 情况
+
+ ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0122.wav:
+ 一二 线 城市 虽然 也 处于 调整 中
+
+ ./tmp/icefall_asr_aishell_tdnn_lstm_ctc/test_waves/BAC009S0764W0123.wav:
+ 但 因为 聚集 了 过多 公共 资源
+
+
+ 2021-09-13 15:01:07,129 INFO [pretrained.py:222] Decoding Done
+
+
+Colab notebook
+--------------
+
+We provide a colab notebook for this recipe showing how to use a pre-trained model.
+
+|aishell asr tdnn lstm ctc colab notebook|
+
+.. |aishell asr tdnn lstm ctc colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+ :target: https://colab.research.google.com/drive/1jbyzYq3ytm6j2nlEt-diQm-6QVWyDDEa?usp=sharing
+
+**Congratulations!** You have finished the Aishell ASR recipe with
+TDNN-LSTM CTC models in ``icefall``.
diff --git a/docs/source/recipes/Non-streaming-ASR/index.rst b/docs/source/recipes/Non-streaming-ASR/index.rst
new file mode 100644
index 000000000..67123a648
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/index.rst
@@ -0,0 +1,10 @@
+Non Streaming ASR
+=================
+
+.. toctree::
+ :maxdepth: 2
+
+ aishell/index
+ librispeech/index
+ timit/index
+ yesno/index
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/conformer_ctc.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/conformer_ctc.rst
new file mode 100644
index 000000000..b7f89c89f
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/conformer_ctc.rst
@@ -0,0 +1,1070 @@
+Conformer CTC
+=============
+
+This tutorial shows you how to run a Conformer CTC model
+with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
+
+
+.. HINT::
+
+ We assume you have read the page :ref:`install icefall` and have set up
+ the environment for ``icefall``.
+
+.. HINT::
+
+ We recommend using one or more GPUs to run this recipe.
+
+In this tutorial, you will learn:
+
+ - (1) How to prepare data for training and decoding
+ - (2) How to start the training, either with a single GPU or multiple GPUs
+ - (3) How to do decoding after training, with n-gram LM rescoring and attention decoder rescoring
+ - (4) How to use a pre-trained model, provided by us
+ - (5) How to deploy your trained model in C++, without Python dependencies
+
+Data preparation
+----------------
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./prepare.sh
+
+The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
+All you need to do is run it.
+
+The data preparation contains several stages. You can use the following two
+options:
+
+ - ``--stage``
+ - ``--stop-stage``
+
+to control which stage(s) should be run. By default, all stages are executed.
+
+
+For example,
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./prepare.sh --stage 0 --stop-stage 0
+
+means to run only stage 0.
+
+To run stage 2 to stage 5, use:
+
+.. code-block:: bash
+
+ $ ./prepare.sh --stage 2 --stop-stage 5
+
+.. HINT::
+
+ If you have pre-downloaded the `LibriSpeech <https://www.openslr.org/12>`_
+ dataset and the `musan <http://www.openslr.org/17/>`_ dataset, say,
+ to ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
+ the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
+ ``./prepare.sh`` won't re-download them.
+
+.. NOTE::
+
+ All files generated by ``./prepare.sh``, e.g., features, lexicon, etc.,
+ are saved in the ``./data`` directory.
+
+We provide the following YouTube video showing how to run ``./prepare.sh``.
+
+.. note::
+
+ To get the latest news of `next-gen Kaldi <https://github.com/k2-fsa>`_, please subscribe to
+ the following YouTube channel by `Nadira Povey `_:
+
+ `<https://www.youtube.com/channel/UC_VaumpkmINz1pNkFXAN9mw>`_
+
+.. youtube:: ofEIoJL-mGM
+
+
+Training
+--------
+
+Configurable options
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/train.py --help
+
+shows you the training options that can be passed from the command line.
+The following options are used quite often:
+
+ - ``--full-libri``
+
+ If it's True, the training part uses all the training data, i.e.,
+ 960 hours. Otherwise, the training part uses only the subset
+ ``train-clean-100``, which has 100 hours of training data.
+
+ .. CAUTION::
+
+ The training set is speed-perturbed with two factors: 0.9 and 1.1.
+ If ``--full-libri`` is True, each epoch actually processes
+ ``3 x 960 == 2880`` hours of data.
+
+ - ``--num-epochs``
+
+ It is the number of epochs to train. For instance,
+ ``./conformer_ctc/train.py --num-epochs 30`` trains for 30 epochs
+ and generates ``epoch-0.pt``, ``epoch-1.pt``, ..., ``epoch-29.pt``
+ in the folder ``./conformer_ctc/exp``.
+
+ - ``--start-epoch``
+
+ It's used to resume training.
+ ``./conformer_ctc/train.py --start-epoch 10`` loads the
+ checkpoint ``./conformer_ctc/exp/epoch-9.pt`` and starts
+ training from epoch 10, based on the state from epoch 9.
+
+ - ``--world-size``
+
+ It is used for multi-GPU single-machine DDP training.
+
+ - (a) If it is 1, then no DDP training is used.
+
+ - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
+
+ The following shows some use cases with it.
+
+ **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
+ GPU 2 for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ export CUDA_VISIBLE_DEVICES="0,2"
+ $ ./conformer_ctc/train.py --world-size 2
+
+ **Use case 2**: You have 4 GPUs and you want to use all of them
+ for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/train.py --world-size 4
+
+ **Use case 3**: You have 4 GPUs but you only want to use GPU 3
+ for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ export CUDA_VISIBLE_DEVICES="3"
+ $ ./conformer_ctc/train.py --world-size 1
+
+ .. CAUTION::
+
+ Only multi-GPU single-machine DDP training is implemented at present.
+ Multi-GPU multi-machine DDP training will be added later.
+
+ - ``--max-duration``
+
+ It specifies the total duration (in seconds) of all utterances in a
+ batch, before **padding**.
+ If you encounter CUDA OOM, reduce it. For instance, if you
+ are using a V100 NVIDIA GPU, we recommend setting it to ``200``.
+
+ .. HINT::
+
+ Due to padding, the number of seconds of all utterances in a
+ batch will usually be larger than ``--max-duration``.
+
+ A larger value for ``--max-duration`` may cause OOM during training,
+ while a smaller value may increase the training time. You have to
+ tune it.
+
+
+Pre-configured options
+~~~~~~~~~~~~~~~~~~~~~~
+
+There are some training options, e.g., weight decay,
+number of warmup steps, results dir, etc.,
+that are not passed from the command line.
+They are pre-configured by the function ``get_params()`` in
+`conformer_ctc/train.py <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/conformer_ctc/train.py>`_.
+
+You don't need to change these pre-configured parameters. If you really need to change
+them, please modify ``./conformer_ctc/train.py`` directly.
+
+
+Training logs
+~~~~~~~~~~~~~
+
+Training logs and checkpoints are saved in ``conformer_ctc/exp``.
+You will find the following files in that directory:
+
+ - ``epoch-0.pt``, ``epoch-1.pt``, ...
+
+ These are checkpoint files, containing model ``state_dict`` and optimizer ``state_dict``.
+ To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
+
+ .. code-block:: bash
+
+ $ ./conformer_ctc/train.py --start-epoch 11
+
+ - ``tensorboard/``
+
+ This folder contains TensorBoard logs. Training loss, validation loss, learning
+ rate, etc, are recorded in these logs. You can visualize them by:
+
+ .. code-block:: bash
+
+ $ cd conformer_ctc/exp/tensorboard
+ $ tensorboard dev upload --logdir . --description "Conformer CTC training for LibriSpeech with icefall"
+
+ It will print something like the following:
+
+ .. code-block::
+
+ TensorFlow installation not found - running with reduced feature set.
+ Upload started and will continue reading any new data as it's added to the logdir.
+
+ To stop uploading, press Ctrl-C.
+
+ New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/lzGnETjwRxC3yghNMd4kPw/
+
+ [2021-08-24T16:42:43] Started scanning logdir.
+ Uploading 4540 scalars...
+
+ Note there is a URL in the above output, click it and you will see
+ the following screenshot:
+
+ .. figure:: images/librispeech-conformer-ctc-tensorboard-log.png
+ :width: 600
+ :alt: TensorBoard screenshot
+ :align: center
+ :target: https://tensorboard.dev/experiment/lzGnETjwRxC3yghNMd4kPw/
+
+ TensorBoard screenshot.
+
+ - ``log/log-train-xxxx``
+
+ It is the detailed training log in text format, the same as the one
+ printed to the console during training.
+
+Usage examples
+~~~~~~~~~~~~~~
+
+The following shows typical use cases:
+
+**Case 1**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/train.py --max-duration 200 --full-libri 0
+
+It uses a ``--max-duration`` of 200 to avoid OOM. Also, it uses only
+a subset of the LibriSpeech data for training.
+
+
+**Case 2**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ export CUDA_VISIBLE_DEVICES="0,3"
+ $ ./conformer_ctc/train.py --world-size 2
+
+It uses GPU 0 and GPU 3 for DDP training.
+
+**Case 3**
+^^^^^^^^^^
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/train.py --num-epochs 10 --start-epoch 3
+
+It loads checkpoint ``./conformer_ctc/exp/epoch-2.pt`` and starts
+training from epoch 3. Also, it trains for 10 epochs.
+
+Decoding
+--------
+
+The decoding part uses checkpoints saved by the training part, so you have
+to run the training part first.
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/decode.py --help
+
+shows the options for decoding.
+
+The commonly used options are:
+
+ - ``--method``
+
+ This specifies the decoding method. The script supports 7 decoding methods.
+ For instance, ``ctc-decoding`` uses a sentencepiece model to convert word pieces
+ to words; it needs neither a lexicon nor an n-gram LM (see the sketch after this list).
+
+ For example, the following command uses CTC topology for decoding:
+
+ .. code-block::
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/decode.py --method ctc-decoding --max-duration 300
+ # Caution: The above command is tested with a model with vocab size 500.
+
+ And the following command uses attention decoder for rescoring:
+
+ .. code-block::
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/decode.py --method attention-decoder --max-duration 30 --nbest-scale 0.5
+
+ - ``--nbest-scale``
+
+ It is used to scale down lattice scores so that there are more unique
+ paths for rescoring.
+
+ - ``--max-duration``
+
+ It has the same meaning as the one during training. A larger
+ value may cause OOM.
+
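+For ``ctc-decoding``, the conversion from word pieces back to words is done
+with sentencepiece. A minimal sketch, assuming the ``bpe.model`` from this
+recipe (the token IDs are made up for illustration):
+
+.. code-block:: python
+
+   import sentencepiece as spm
+
+   sp = spm.SentencePieceProcessor()
+   sp.load("data/lang_bpe_500/bpe.model")
+   ids = [25, 301, 77]  # hypothetical IDs produced by CTC decoding
+   print(sp.decode(ids))
+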
+Here are some results for CTC decoding with a vocab size of 500:
+
+Usage:
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ # NOTE: Tested with a model with vocab size 500.
+ # It won't work for a model with vocab size 5000.
+ $ ./conformer_ctc/decode.py \
+ --epoch 25 \
+ --avg 1 \
+ --max-duration 300 \
+ --exp-dir conformer_ctc/exp \
+ --lang-dir data/lang_bpe_500 \
+ --method ctc-decoding
+
+The output is given below:
+
+.. code-block:: bash
+
+ 2021-09-26 12:44:31,033 INFO [decode.py:537] Decoding started
+ 2021-09-26 12:44:31,033 INFO [decode.py:538]
+ {'lm_dir': PosixPath('data/lm'), 'subsampling_factor': 4, 'vgg_frontend': False, 'use_feat_batchnorm': True,
+ 'feature_dim': 80, 'nhead': 8, 'attention_dim': 512, 'num_decoder_layers': 6, 'search_beam': 20, 'output_beam': 8,
+ 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True,
+ 'epoch': 25, 'avg': 1, 'method': 'ctc-decoding', 'num_paths': 100, 'nbest_scale': 0.5,
+ 'export': False, 'exp_dir': PosixPath('conformer_ctc/exp'), 'lang_dir': PosixPath('data/lang_bpe_500'), 'full_libri': False,
+ 'feature_dir': PosixPath('data/fbank'), 'max_duration': 100, 'bucketing_sampler': False, 'num_buckets': 30,
+ 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False,
+ 'shuffle': True, 'return_cuts': True, 'num_workers': 2}
+ 2021-09-26 12:44:31,406 INFO [lexicon.py:113] Loading pre-compiled data/lang_bpe_500/Linv.pt
+ 2021-09-26 12:44:31,464 INFO [decode.py:548] device: cuda:0
+ 2021-09-26 12:44:36,171 INFO [checkpoint.py:92] Loading checkpoint from conformer_ctc/exp/epoch-25.pt
+ 2021-09-26 12:44:36,776 INFO [decode.py:652] Number of model parameters: 109226120
+ 2021-09-26 12:44:37,714 INFO [decode.py:473] batch 0/206, cuts processed until now is 12
+ 2021-09-26 12:45:15,944 INFO [decode.py:473] batch 100/206, cuts processed until now is 1328
+ 2021-09-26 12:45:54,443 INFO [decode.py:473] batch 200/206, cuts processed until now is 2563
+ 2021-09-26 12:45:56,411 INFO [decode.py:494] The transcripts are stored in conformer_ctc/exp/recogs-test-clean-ctc-decoding.txt
+ 2021-09-26 12:45:56,592 INFO [utils.py:331] [test-clean-ctc-decoding] %WER 3.26% [1715 / 52576, 163 ins, 128 del, 1424 sub ]
+ 2021-09-26 12:45:56,807 INFO [decode.py:506] Wrote detailed error stats to conformer_ctc/exp/errs-test-clean-ctc-decoding.txt
+ 2021-09-26 12:45:56,808 INFO [decode.py:522]
+ For test-clean, WER of different settings are:
+ ctc-decoding 3.26 best for test-clean
+
+ 2021-09-26 12:45:57,362 INFO [decode.py:473] batch 0/203, cuts processed until now is 15
+ 2021-09-26 12:46:35,565 INFO [decode.py:473] batch 100/203, cuts processed until now is 1477
+ 2021-09-26 12:47:15,106 INFO [decode.py:473] batch 200/203, cuts processed until now is 2922
+ 2021-09-26 12:47:16,131 INFO [decode.py:494] The transcripts are stored in conformer_ctc/exp/recogs-test-other-ctc-decoding.txt
+ 2021-09-26 12:47:16,208 INFO [utils.py:331] [test-other-ctc-decoding] %WER 8.21% [4295 / 52343, 396 ins, 315 del, 3584 sub ]
+ 2021-09-26 12:47:16,432 INFO [decode.py:506] Wrote detailed error stats to conformer_ctc/exp/errs-test-other-ctc-decoding.txt
+ 2021-09-26 12:47:16,432 INFO [decode.py:522]
+ For test-other, WER of different settings are:
+ ctc-decoding 8.21 best for test-other
+
+ 2021-09-26 12:47:16,433 INFO [decode.py:680] Done!
+
+Pre-trained Model
+-----------------
+
+We have uploaded a pre-trained model to
+`<https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09>`_.
+
+We describe how to use the pre-trained model to transcribe a sound file or
+multiple sound files in the following.
+
+Install kaldifeat
+~~~~~~~~~~~~~~~~~
+
+`kaldifeat <https://github.com/csukuangfj/kaldifeat>`_ is used to
+extract features for a single sound file or multiple sound files
+at the same time.
+
+Please refer to `<https://github.com/csukuangfj/kaldifeat>`_ for installation.
+
+Download the pre-trained model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following commands describe how to download the pre-trained model:
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
+ $ cd icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
+ $ git lfs pull
+
+.. CAUTION::
+
+ You have to use ``git lfs pull`` to download the pre-trained model.
+ Otherwise, you will have the following issue when running ``decode.py``:
+
+ .. code-block::
+
+ _pickle.UnpicklingError: invalid load key, 'v'
+
+ To fix that issue, please use:
+
+ .. code-block:: bash
+
+ cd icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
+ git lfs pull
+
+.. CAUTION::
+
+ In order to use this pre-trained model, your k2 version has to be v1.9 or later.
+
+After downloading, you will have the following files:
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ tree icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
+
+.. code-block:: bash
+
+ icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09
+ |-- README.md
+ |-- data
+ | |-- lang_bpe_500
+ | | |-- HLG.pt
+ | | |-- HLG_modified.pt
+ | | |-- bpe.model
+ | | |-- tokens.txt
+ | | `-- words.txt
+ | `-- lm
+ | `-- G_4_gram.pt
+ |-- exp
+ | |-- cpu_jit.pt
+ | `-- pretrained.pt
+ |-- log
+ | `-- log-decode-2021-11-09-17-38-28
+ `-- test_wavs
+ |-- 1089-134686-0001.wav
+ |-- 1221-135766-0001.wav
+ |-- 1221-135766-0002.wav
+ `-- trans.txt
+
+
+**File descriptions**:
+ - ``data/lang_bpe_500/HLG.pt``
+
+ It is the decoding graph.
+
+ - ``data/lang_bpe_500/HLG_modified.pt``
+
+ It uses a modified CTC topology while building HLG.
+
+ - ``data/lang_bpe_500/bpe.model``
+
+ It is a sentencepiece model. You can use it to reproduce our results.
+
+ - ``data/lang_bpe_500/tokens.txt``
+
+ It contains tokens and their IDs, generated from ``bpe.model``.
+ Provided only for convenience so that you can look up the SOS/EOS ID easily.
+
+ - ``data/lang_bpe_500/words.txt``
+
+ It contains words and their IDs.
+
+ - ``data/lm/G_4_gram.pt``
+
+ It is a 4-gram LM, used for n-gram LM rescoring.
+
+ - ``exp/pretrained.pt``
+
+ It contains pre-trained model parameters, obtained by averaging
+ checkpoints from ``epoch-23.pt`` to ``epoch-77.pt``.
+ Note: We have removed optimizer ``state_dict`` to reduce file size.
+
+ - ``exp/cpu_jit.pt``
+
+ It contains a torch scripted model that can be deployed in C++ (see the loading sketch after this list).
+
+ - ``test_wavs/*.wav``
+
+ It contains some test sound files from the LibriSpeech ``test-clean`` dataset.
+
+ - ``test_wavs/trans.txt``
+
+ It contains the reference transcripts for the sound files in ``test_wavs/``.
+
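+A torch scripted model can be loaded without any icefall code, which is what
+makes the C++ deployment described later possible. A minimal sketch:
+
+.. code-block:: python
+
+   import torch
+
+   model = torch.jit.load(
+       "icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt"
+   )
+   model.eval()
+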
+Information about the test sound files is listed below:
+
+.. code-block:: bash
+
+ $ soxi icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/*.wav
+
+ Input File : 'icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:06.62 = 106000 samples ~ 496.875 CDDA sectors
+ File Size : 212k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+
+ Input File : 'icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:16.71 = 267440 samples ~ 1253.62 CDDA sectors
+ File Size : 535k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+
+ Input File : 'icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:04.83 = 77200 samples ~ 361.875 CDDA sectors
+ File Size : 154k
+ Bit Rate : 256k
+ Sample Encoding: 16-bit Signed Integer PCM
+
+ Total Duration of 3 files: 00:00:28.16
+
+Usage
+~~~~~
+
+.. code-block::
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/pretrained.py --help
+
+displays the help information.
+
+It supports 4 decoding methods:
+
+ - CTC decoding
+ - HLG decoding
+ - HLG + n-gram LM rescoring
+ - HLG + n-gram LM rescoring + attention decoder rescoring
+
+CTC decoding
+^^^^^^^^^^^^
+
+CTC decoding uses the best path of the decoding lattice as the decoding result
+without any LM or lexicon.
+
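+Internally, CTC decoding builds a CTC topology ``H`` and uses it as the
+decoding graph. A minimal sketch with ``k2``, assuming the vocab size of 500
+used by this model (``max_token`` is the largest token ID; 0 is the blank):
+
+.. code-block:: python
+
+   import k2
+
+   # CTC topology over tokens 0..499; the actual script intersects it
+   # with the network output to form the decoding lattice
+   H = k2.ctc_topo(max_token=499, modified=False)
+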
+The command to run CTC decoding is:
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/pretrained.py \
+ --checkpoint ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt \
+ --bpe-model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/bpe.model \
+ --method ctc-decoding \
+ --num-classes 500 \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+
+The output is given below:
+
+.. code-block::
+
+ 2021-11-10 12:12:29,554 INFO [pretrained.py:260] {'sample_rate': 16000, 'subsampling_factor': 4, 'vgg_frontend': False, 'use_feat_batchnorm': True, 'feature_dim': 80, 'nhead': 8, 'attention_dim': 512, 'num_decoder_layers': 0, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt', 'words_file': None, 'HLG': None, 'bpe_model': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/bpe.model', 'method': 'ctc-decoding', 'G': None, 'num_paths': 100, 'ngram_lm_scale': 1.3, 'attention_decoder_scale': 1.2, 'nbest_scale': 0.5, 'sos_id': 1, 'num_classes': 500, 'eos_id': 1, 'sound_files': ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'], 'env_info': {'k2-version': '1.9', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '7178d67e594bc7fa89c2b331ad7bd1c62a6a9eb4', 'k2-git-date': 'Tue Oct 26 22:12:54 2021', 'lhotse-version': '0.11.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.1', 'python-version': '3.8', 'icefall-git-branch': 'bpe-500', 'icefall-git-sha1': '8d93169-dirty', 'icefall-git-date': 'Wed Nov 10 11:52:44 2021', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-fix', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-bpe-500/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-bpe-500/lhotse/__init__.py'}}
+ 2021-11-10 12:12:29,554 INFO [pretrained.py:266] device: cuda:0
+ 2021-11-10 12:12:29,554 INFO [pretrained.py:268] Creating model
+ 2021-11-10 12:12:35,600 INFO [pretrained.py:285] Constructing Fbank computer
+ 2021-11-10 12:12:35,601 INFO [pretrained.py:295] Reading sound files: ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav']
+ 2021-11-10 12:12:35,758 INFO [pretrained.py:301] Decoding started
+ 2021-11-10 12:12:36,025 INFO [pretrained.py:319] Use CTC decoding
+ 2021-11-10 12:12:36,204 INFO [pretrained.py:425]
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav:
+ AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROFFELS
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav:
+ GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED B
+ OSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav:
+ YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
+
+ 2021-11-10 12:12:36,204 INFO [pretrained.py:427] Decoding Done
+
+HLG decoding
+^^^^^^^^^^^^
+
+HLG decoding uses the best path of the decoding lattice as the decoding result.
+
+The command to run HLG decoding is:
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/pretrained.py \
+ --checkpoint ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt \
+ --words-file ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \
+ --method 1best \
+ --num-classes 500 \
+ --HLG ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+
+The output is given below:
+
+.. code-block::
+
+ 2021-11-10 13:33:03,723 INFO [pretrained.py:260] {'sample_rate': 16000, 'subsampling_factor': 4, 'vgg_frontend': False, 'use_feat_batchnorm': True, 'feature_dim': 80, 'nhead': 8, 'attention_dim': 512, 'num_decoder_layers': 0, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt', 'words_file': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt', 'HLG': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt', 'bpe_model': None, 'method': '1best', 'G': None, 'num_paths': 100, 'ngram_lm_scale': 1.3, 'attention_decoder_scale': 1.2, 'nbest_scale': 0.5, 'sos_id': 1, 'num_classes': 500, 'eos_id': 1, 'sound_files': ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'], 'env_info': {'k2-version': '1.9', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '7178d67e594bc7fa89c2b331ad7bd1c62a6a9eb4', 'k2-git-date': 'Tue Oct 26 22:12:54 2021', 'lhotse-version': '0.11.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.1', 'python-version': '3.8', 'icefall-git-branch': 'bpe-500', 'icefall-git-sha1': '8d93169-dirty', 'icefall-git-date': 'Wed Nov 10 11:52:44 2021', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-fix', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-bpe-500/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-bpe-500/lhotse/__init__.py'}}
+ 2021-11-10 13:33:03,723 INFO [pretrained.py:266] device: cuda:0
+ 2021-11-10 13:33:03,723 INFO [pretrained.py:268] Creating model
+ 2021-11-10 13:33:09,775 INFO [pretrained.py:285] Constructing Fbank computer
+ 2021-11-10 13:33:09,776 INFO [pretrained.py:295] Reading sound files: ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav']
+ 2021-11-10 13:33:09,881 INFO [pretrained.py:301] Decoding started
+ 2021-11-10 13:33:09,951 INFO [pretrained.py:352] Loading HLG from ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt
+ 2021-11-10 13:33:13,234 INFO [pretrained.py:384] Use HLG decoding
+ 2021-11-10 13:33:13,571 INFO [pretrained.py:425]
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav:
+ AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav:
+ GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav:
+ YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
+
+ 2021-11-10 13:33:13,571 INFO [pretrained.py:427] Decoding Done
+
+
+HLG decoding + LM rescoring
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+It uses an n-gram LM to rescore the decoding lattice, and the best
+path of the rescored lattice is the decoding result.
+
+The command to run HLG decoding + LM rescoring is:
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/pretrained.py \
+ --checkpoint ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt \
+ --words-file ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \
+ --method whole-lattice-rescoring \
+ --num-classes 500 \
+ --HLG ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \
+ --G ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt \
+ --ngram-lm-scale 1.0 \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+
+Its output is:
+
+.. code-block::
+
+ 2021-11-10 13:39:55,857 INFO [pretrained.py:260] {'sample_rate': 16000, 'subsampling_factor': 4, 'vgg_frontend': False, 'use_feat_batchnorm': True, 'feature_dim': 80, 'nhead': 8, 'attention_dim': 512, 'num_decoder_layers': 0, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt', 'words_file': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt', 'HLG': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt', 'bpe_model': None, 'method': 'whole-lattice-rescoring', 'G': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt', 'num_paths': 100, 'ngram_lm_scale': 1.0, 'attention_decoder_scale': 1.2, 'nbest_scale': 0.5, 'sos_id': 1, 'num_classes': 500, 'eos_id': 1, 'sound_files': ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'], 'env_info': {'k2-version': '1.9', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '7178d67e594bc7fa89c2b331ad7bd1c62a6a9eb4', 'k2-git-date': 'Tue Oct 26 22:12:54 2021', 'lhotse-version': '0.11.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.1', 'python-version': '3.8', 'icefall-git-branch': 'bpe-500', 'icefall-git-sha1': '8d93169-dirty', 'icefall-git-date': 'Wed Nov 10 11:52:44 2021', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-fix', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-bpe-500/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-bpe-500/lhotse/__init__.py'}}
+ 2021-11-10 13:39:55,858 INFO [pretrained.py:266] device: cuda:0
+ 2021-11-10 13:39:55,858 INFO [pretrained.py:268] Creating model
+ 2021-11-10 13:40:01,979 INFO [pretrained.py:285] Constructing Fbank computer
+ 2021-11-10 13:40:01,980 INFO [pretrained.py:295] Reading sound files: ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav']
+ 2021-11-10 13:40:02,055 INFO [pretrained.py:301] Decoding started
+ 2021-11-10 13:40:02,117 INFO [pretrained.py:352] Loading HLG from ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt
+ 2021-11-10 13:40:05,051 INFO [pretrained.py:363] Loading G from ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt
+ 2021-11-10 13:40:18,959 INFO [pretrained.py:389] Use HLG decoding + LM rescoring
+ 2021-11-10 13:40:19,546 INFO [pretrained.py:425]
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav:
+ AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav:
+ GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav:
+ YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
+
+ 2021-11-10 13:40:19,546 INFO [pretrained.py:427] Decoding Done
+
+
+HLG decoding + LM rescoring + attention decoder rescoring
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+It uses an n-gram LM to rescore the decoding lattice, extracts
+n paths from the rescored lattice, and rescores the extracted paths with
+an attention decoder. The path with the highest score is the decoding result.
+
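+The final ranking combines the acoustic, n-gram LM, and attention decoder
+scores of each extracted path. A minimal sketch of the reranking step; the
+hypotheses and scores below are made up, and the real implementation operates
+on lattices in ``k2``:
+
+.. code-block:: python
+
+   # hypothetical n-best entries: (text, am_score, ngram_lm_score, attn_score)
+   nbest = [
+       ("YET THESE THOUGHTS", -35.2, -12.7, -9.8),
+       ("YET THESE THOUGHT", -34.9, -14.1, -11.0),
+   ]
+   ngram_lm_scale, attention_decoder_scale = 2.0, 2.0
+   best = max(
+       nbest,
+       key=lambda h: h[1] + ngram_lm_scale * h[2] + attention_decoder_scale * h[3],
+   )
+   print(best[0])  # -> "YET THESE THOUGHTS"
+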
+The command to run HLG decoding + LM rescoring + attention decoder rescoring is:
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./conformer_ctc/pretrained.py \
+ --checkpoint ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt \
+ --words-file ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \
+ --method attention-decoder \
+ --num-classes 500 \
+ --HLG ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \
+ --G ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt \
+ --ngram-lm-scale 2.0 \
+ --attention-decoder-scale 2.0 \
+ --nbest-scale 0.5 \
+ --num-paths 100 \
+ --sos-id 1 \
+ --eos-id 1 \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+
+The output is below:
+
+.. code-block::
+
+ 2021-11-10 13:43:45,598 INFO [pretrained.py:260] {'sample_rate': 16000, 'subsampling_factor': 4, 'vgg_frontend': False, 'use_feat_batchnorm': True, 'feature_dim': 80, 'nhead': 8, 'attention_dim': 512, 'num_decoder_layers': 6, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'checkpoint': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/pretrained.pt', 'words_file': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt', 'HLG': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt', 'bpe_model': None, 'method': 'attention-decoder', 'G': './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt', 'num_paths': 100, 'ngram_lm_scale': 2.0, 'attention_decoder_scale': 2.0, 'nbest_scale': 0.5, 'sos_id': 1, 'num_classes': 500, 'eos_id': 1, 'sound_files': ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav'], 'env_info': {'k2-version': '1.9', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '7178d67e594bc7fa89c2b331ad7bd1c62a6a9eb4', 'k2-git-date': 'Tue Oct 26 22:12:54 2021', 'lhotse-version': '0.11.0.dev+missing.version.file', 'torch-cuda-available': True, 'torch-cuda-version': '10.1', 'python-version': '3.8', 'icefall-git-branch': 'bpe-500', 'icefall-git-sha1': '8d93169-dirty', 'icefall-git-date': 'Wed Nov 10 11:52:44 2021', 'icefall-path': '/ceph-fj/fangjun/open-source-2/icefall-fix', 'k2-path': '/ceph-fj/fangjun/open-source-2/k2-bpe-500/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-fj/fangjun/open-source-2/lhotse-bpe-500/lhotse/__init__.py'}}
+ 2021-11-10 13:43:45,599 INFO [pretrained.py:266] device: cuda:0
+ 2021-11-10 13:43:45,599 INFO [pretrained.py:268] Creating model
+ 2021-11-10 13:43:51,833 INFO [pretrained.py:285] Constructing Fbank computer
+ 2021-11-10 13:43:51,834 INFO [pretrained.py:295] Reading sound files: ['./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav', './icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav']
+ 2021-11-10 13:43:51,915 INFO [pretrained.py:301] Decoding started
+ 2021-11-10 13:43:52,076 INFO [pretrained.py:352] Loading HLG from ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt
+ 2021-11-10 13:43:55,110 INFO [pretrained.py:363] Loading G from ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt
+ 2021-11-10 13:44:09,329 INFO [pretrained.py:397] Use HLG + LM rescoring + attention decoder rescoring
+ 2021-11-10 13:44:10,192 INFO [pretrained.py:425]
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav:
+ AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav:
+ GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav:
+ YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
+
+ 2021-11-10 13:44:10,192 INFO [pretrained.py:427] Decoding Done
+
+
+Compute WER with the pre-trained model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To check the WER of the pre-trained model on the test datasets, run:
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ cd icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/
+ $ ln -s pretrained.pt epoch-999.pt
+ $ cd ../..
+ $ ./conformer_ctc/decode.py \
+ --exp-dir ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp \
+ --lang-dir ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500 \
+ --lm-dir ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm \
+ --epoch 999 \
+ --avg 1 \
+ --concatenate-cuts 0 \
+ --bucketing-sampler 1 \
+ --max-duration 30 \
+ --num-paths 1000 \
+ --method attention-decoder \
+ --nbest-scale 0.5
+
+
+Colab notebook
+--------------
+
+We provide a colab notebook for this recipe showing how to use a pre-trained model.
+
+|librispeech asr conformer ctc colab notebook|
+
+.. |librispeech asr conformer ctc colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+ :target: https://colab.research.google.com/drive/1huyupXAcHsUrKaWfI83iMEJ6J0Nh0213?usp=sharing
+
+.. HINT::
+
+ Due to limited memory provided by Colab, you have to upgrade to Colab Pro to
+ run ``HLG decoding + LM rescoring`` and
+ ``HLG decoding + LM rescoring + attention decoder rescoring``.
+ Otherwise, you can only run ``HLG decoding`` with Colab.
+
+**Congratulations!** You have finished the LibriSpeech ASR recipe with
+conformer CTC models in ``icefall``.
+
+If you want to deploy your trained model in C++, please read the following section.
+
+Deployment with C++
+-------------------
+
+This section describes how to deploy the pre-trained model in C++, without
+Python dependencies.
+
+.. HINT::
+
+ At present, it does NOT support streaming decoding.
+
+First, let us compile k2 from source:
+
+.. code-block:: bash
+
+ $ cd $HOME
+ $ git clone https://github.com/k2-fsa/k2
+ $ cd k2
+ $ git checkout v2.0-pre
+
+.. CAUTION::
+
+ You have to switch to the branch ``v2.0-pre``!
+
+.. code-block:: bash
+
+ $ mkdir build-release
+ $ cd build-release
+ $ cmake -DCMAKE_BUILD_TYPE=Release ..
+ $ make -j ctc_decode hlg_decode ngram_lm_rescore attention_rescore
+
+ # You will find four binaries in `./bin`, i.e.,
+ # ./bin/ctc_decode, ./bin/hlg_decode,
+ # ./bin/ngram_lm_rescore, and ./bin/attention_rescore
+
+Now you are ready to go!
+
+Assume you have run:
+
+ .. code-block:: bash
+
+ $ cd k2/build-release
+ $ ln -s /path/to/icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09 ./
+
+To view the usage of ``./bin/ctc_decode``, run:
+
+.. code-block::
+
+ $ ./bin/ctc_decode
+
+It will show you the following message:
+
+.. code-block:: bash
+
+ Please provide --nn_model
+
+ This file implements decoding with a CTC topology, without any
+ kinds of LM or lexicons.
+
+ Usage:
+ ./bin/ctc_decode \
+ --use_gpu true \
+ --nn_model <path to torch scripted pt file> \
+ --bpe_model <path to pre-trained BPE model> \
+ <path to foo.wav> \
+ <path to bar.wav> \
+ <more waves if any>
+
+ To see all possible options, use
+ ./bin/ctc_decode --help
+
+ Caution:
+ - Only sound files (*.wav) with single channel are supported.
+ - It assumes the model is conformer_ctc/transformer.py from icefall.
+ If you use a different model, you have to change the code
+ related to `model.forward` in this file.
+
+
+CTC decoding
+~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ ./bin/ctc_decode \
+ --use_gpu true \
+ --nn_model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt \
+ --bpe_model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/bpe.model \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+
+Its output is:
+
+.. code-block::
+
+ 2021-11-10 13:57:55.316 [I] k2/torch/bin/ctc_decode.cu:105:int main(int, char**) Use GPU
+ 2021-11-10 13:57:55.316 [I] k2/torch/bin/ctc_decode.cu:109:int main(int, char**) Device: cuda:0
+ 2021-11-10 13:57:55.316 [I] k2/torch/bin/ctc_decode.cu:118:int main(int, char**) Load wave files
+ 2021-11-10 13:58:01.221 [I] k2/torch/bin/ctc_decode.cu:125:int main(int, char**) Build Fbank computer
+ 2021-11-10 13:58:01.222 [I] k2/torch/bin/ctc_decode.cu:136:int main(int, char**) Compute features
+ 2021-11-10 13:58:01.228 [I] k2/torch/bin/ctc_decode.cu:144:int main(int, char**) Load neural network model
+ 2021-11-10 13:58:02.19 [I] k2/torch/bin/ctc_decode.cu:159:int main(int, char**) Compute nnet_output
+ 2021-11-10 13:58:02.543 [I] k2/torch/bin/ctc_decode.cu:174:int main(int, char**) Build CTC topo
+ 2021-11-10 13:58:02.547 [I] k2/torch/bin/ctc_decode.cu:177:int main(int, char**) Decoding
+ 2021-11-10 13:58:02.708 [I] k2/torch/bin/ctc_decode.cu:207:int main(int, char**)
+ Decoding result:
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav
+ AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROFFELS
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav
+ GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+ YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
+
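+If you prefer to prototype in Python before reaching for the C++ binaries,
+below is a minimal, hedged sketch of greedy (best-path) CTC decoding.
+Note that ``./bin/ctc_decode`` performs FSA-based decoding with a CTC topology;
+this simpler approximation only collapses repeats and removes blanks, and it
+assumes you already have per-frame CTC log-probabilities and that the blank ID is 0.
+
+.. code-block:: python
+
+   import sentencepiece as spm
+   import torch
+
+   sp = spm.SentencePieceProcessor()
+   sp.load("icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/"
+           "data/lang_bpe_500/bpe.model")
+
+   def greedy_ctc(log_probs: torch.Tensor, blank: int = 0) -> str:
+       # log_probs: (T, vocab_size) per-frame CTC log-probabilities.
+       ids, prev, out = log_probs.argmax(dim=-1).tolist(), blank, []
+       for i in ids:
+           # Emit a token only when it differs from blank and from the
+           # previous frame's token (the standard CTC collapse rule).
+           if i != blank and i != prev:
+               out.append(i)
+           prev = i
+       return sp.decode(out)
+
+   # Example call with random log-probs, just to show the interface:
+   print(greedy_ctc(torch.randn(100, 500).log_softmax(dim=-1)))
+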
+HLG decoding
+~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ ./bin/hlg_decode \
+ --use_gpu true \
+ --nn_model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt \
+ --hlg ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \
+ --word_table ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+
+The output is:
+
+.. code-block::
+
+ 2021-11-10 13:59:04.729 [I] k2/torch/bin/hlg_decode.cu:111:int main(int, char**) Use GPU
+ 2021-11-10 13:59:04.729 [I] k2/torch/bin/hlg_decode.cu:115:int main(int, char**) Device: cuda:0
+ 2021-11-10 13:59:04.729 [I] k2/torch/bin/hlg_decode.cu:124:int main(int, char**) Load wave files
+ 2021-11-10 13:59:10.702 [I] k2/torch/bin/hlg_decode.cu:131:int main(int, char**) Build Fbank computer
+ 2021-11-10 13:59:10.703 [I] k2/torch/bin/hlg_decode.cu:142:int main(int, char**) Compute features
+ 2021-11-10 13:59:10.707 [I] k2/torch/bin/hlg_decode.cu:150:int main(int, char**) Load neural network model
+ 2021-11-10 13:59:11.545 [I] k2/torch/bin/hlg_decode.cu:165:int main(int, char**) Compute nnet_output
+ 2021-11-10 13:59:12.72 [I] k2/torch/bin/hlg_decode.cu:180:int main(int, char**) Load ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt
+ 2021-11-10 13:59:12.994 [I] k2/torch/bin/hlg_decode.cu:185:int main(int, char**) Decoding
+ 2021-11-10 13:59:13.268 [I] k2/torch/bin/hlg_decode.cu:216:int main(int, char**)
+ Decoding result:
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav
+ AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav
+ GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+ YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
+
+
+HLG decoding + n-gram LM rescoring
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ ./bin/ngram_lm_rescore \
+ --use_gpu true \
+ --nn_model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt \
+ --hlg ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \
+ --g ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt \
+ --ngram_lm_scale 1.0 \
+ --word_table ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+
+The output is:
+
+.. code-block::
+
+ 2021-11-10 14:00:55.279 [I] k2/torch/bin/ngram_lm_rescore.cu:122:int main(int, char**) Use GPU
+ 2021-11-10 14:00:55.280 [I] k2/torch/bin/ngram_lm_rescore.cu:126:int main(int, char**) Device: cuda:0
+ 2021-11-10 14:00:55.280 [I] k2/torch/bin/ngram_lm_rescore.cu:135:int main(int, char**) Load wave files
+ 2021-11-10 14:01:01.214 [I] k2/torch/bin/ngram_lm_rescore.cu:142:int main(int, char**) Build Fbank computer
+ 2021-11-10 14:01:01.215 [I] k2/torch/bin/ngram_lm_rescore.cu:153:int main(int, char**) Compute features
+ 2021-11-10 14:01:01.219 [I] k2/torch/bin/ngram_lm_rescore.cu:161:int main(int, char**) Load neural network model
+ 2021-11-10 14:01:01.945 [I] k2/torch/bin/ngram_lm_rescore.cu:176:int main(int, char**) Compute nnet_output
+ 2021-11-10 14:01:02.475 [I] k2/torch/bin/ngram_lm_rescore.cu:191:int main(int, char**) Load ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt
+ 2021-11-10 14:01:03.398 [I] k2/torch/bin/ngram_lm_rescore.cu:199:int main(int, char**) Decoding
+ 2021-11-10 14:01:03.515 [I] k2/torch/bin/ngram_lm_rescore.cu:205:int main(int, char**) Load n-gram LM: ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt
+ 2021-11-10 14:01:07.432 [W] k2/torch/csrc/deserialization.cu:441:k2::FsaClass k2::LoadFsa(const string&, c10::optional)
+ Ignore non tensor attribute: 'dummy' of type: Int
+ 2021-11-10 14:01:07.589 [I] k2/torch/bin/ngram_lm_rescore.cu:214:int main(int, char**) Rescore with an n-gram LM
+ 2021-11-10 14:01:08.68 [I] k2/torch/bin/ngram_lm_rescore.cu:242:int main(int, char**)
+ Decoding result:
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav
+ AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav
+ GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+ YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
+
+
+HLG decoding + n-gram LM rescoring + attention decoder rescoring
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ ./bin/attention_rescore \
+ --use_gpu true \
+ --nn_model ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/exp/cpu_jit.pt \
+ --hlg ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt \
+ --g ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt \
+ --ngram_lm_scale 2.0 \
+ --attention_scale 2.0 \
+ --num_paths 100 \
+ --nbest_scale 0.5 \
+ --word_table ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/words.txt \
+ --sos_id 1 \
+ --eos_id 1 \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav \
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+
+The output is:
+
+.. code-block::
+
+ 2021-11-10 14:02:43.656 [I] k2/torch/bin/attention_rescore.cu:149:int main(int, char**) Use GPU
+ 2021-11-10 14:02:43.656 [I] k2/torch/bin/attention_rescore.cu:153:int main(int, char**) Device: cuda:0
+ 2021-11-10 14:02:43.656 [I] k2/torch/bin/attention_rescore.cu:162:int main(int, char**) Load wave files
+ 2021-11-10 14:02:49.216 [I] k2/torch/bin/attention_rescore.cu:169:int main(int, char**) Build Fbank computer
+ 2021-11-10 14:02:49.217 [I] k2/torch/bin/attention_rescore.cu:180:int main(int, char**) Compute features
+ 2021-11-10 14:02:49.222 [I] k2/torch/bin/attention_rescore.cu:188:int main(int, char**) Load neural network model
+ 2021-11-10 14:02:49.984 [I] k2/torch/bin/attention_rescore.cu:203:int main(int, char**) Compute nnet_output
+ 2021-11-10 14:02:50.624 [I] k2/torch/bin/attention_rescore.cu:220:int main(int, char**) Load ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lang_bpe_500/HLG.pt
+ 2021-11-10 14:02:51.519 [I] k2/torch/bin/attention_rescore.cu:228:int main(int, char**) Decoding
+ 2021-11-10 14:02:51.632 [I] k2/torch/bin/attention_rescore.cu:234:int main(int, char**) Load n-gram LM: ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/data/lm/G_4_gram.pt
+ 2021-11-10 14:02:55.537 [W] k2/torch/csrc/deserialization.cu:441:k2::FsaClass k2::LoadFsa(const string&, c10::optional) Ignore non tensor attribute: 'dummy' of type: Int
+ 2021-11-10 14:02:55.645 [I] k2/torch/bin/attention_rescore.cu:243:int main(int, char**) Rescore with an n-gram LM
+ 2021-11-10 14:02:55.970 [I] k2/torch/bin/attention_rescore.cu:246:int main(int, char**) Sample 100 paths
+ 2021-11-10 14:02:56.215 [I] k2/torch/bin/attention_rescore.cu:293:int main(int, char**) Run attention decoder
+ 2021-11-10 14:02:57.35 [I] k2/torch/bin/attention_rescore.cu:303:int main(int, char**) Rescoring
+ 2021-11-10 14:02:57.179 [I] k2/torch/bin/attention_rescore.cu:369:int main(int, char**)
+ Decoding result:
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1089-134686-0001.wav
+ AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0001.wav
+ GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
+
+ ./icefall-asr-librispeech-conformer-ctc-jit-bpe-500-2021-11-09/test_wavs/1221-135766-0002.wav
+ YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
+
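+Conceptually, each of the sampled paths is re-ranked by a weighted sum of its
+HLG score, its n-gram LM score, and its attention-decoder score, using
+``--ngram_lm_scale`` and ``--attention_scale`` as the weights. The following
+is a hedged illustration with made-up numbers (not output of the binary):
+
+.. code-block:: python
+
+   ngram_lm_scale = 2.0
+   attention_scale = 2.0
+
+   paths = [
+       # (transcript, HLG score, 4-gram LM score, attention score)
+       ("... THE SQUALID QUARTER OF THE BROTHELS", -10.2, -35.1, -20.3),
+       ("... THE SQUALID QUARTER OF THE BROFFELS", -9.8, -48.7, -27.9),
+   ]
+
+   def total_score(path):
+       _, hlg, ngram, attn = path
+       return hlg + ngram_lm_scale * ngram + attention_scale * attn
+
+   # The path with the largest combined score is the final result.
+   print(max(paths, key=total_score)[0])
+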
+There is a Colab notebook showing you how to run a torch scripted model in C++.
+Please see |librispeech asr conformer ctc torch script colab notebook|
+
+.. |librispeech asr conformer ctc torch script colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+ :target: https://colab.research.google.com/drive/1BIGLWzS36isskMXHKcqC9ysN6pspYXs_?usp=sharing
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
new file mode 100644
index 000000000..37edf7de9
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
@@ -0,0 +1,223 @@
+Distillation with HuBERT
+========================
+
+This tutorial shows you how to perform knowledge distillation in `icefall `_
+with the `LibriSpeech`_ dataset. The distillation method
+used here is called "Multi Vector Quantization Knowledge Distillation" (MVQ-KD).
+Please have a look at our paper `Predicting Multi-Codebook Vector Quantization Indexes for Knowledge Distillation `_
+for more details about MVQ-KD.
+
+.. note::
+
+ This tutorial is based on recipe
+ `pruned_transducer_stateless4 `_.
+ Currently, we only implement MVQ-KD in this recipe. However, MVQ-KD is theoretically applicable to all recipes
+ with only minor changes needed. Feel free to try out MVQ-KD in different recipes. If you
+ encounter any problems, please open an issue in `icefall `__.
+
+.. note::
+
+ We assume you have read the page :ref:`install icefall` and have setup
+ the environment for `icefall`_.
+
+.. HINT::
+
+ We recommend you to use a GPU or several GPUs to run this recipe.
+
+Data preparation
+----------------
+
+We first prepare necessary training data for `LibriSpeech`_.
+This is the same as in :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
+
+.. hint::
+
+ The data preparation is the same as for other LibriSpeech recipes.
+ If you have finished this step, you can skip to :ref:`codebook_index_preparation` directly.
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./prepare.sh
+
+The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
+All you need to do is to run it.
+
+The data preparation contains several stages. You can use the following two
+options:
+
+ - ``--stage``
+ - ``--stop_stage``
+
+to control which stage(s) should be run. By default, all stages are executed.
+
+For example,
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./prepare.sh --stage 0 --stop_stage 0 # run only stage 0
+ $ ./prepare.sh --stage 2 --stop_stage 5 # run from stage 2 to stage 5
+
+.. HINT::
+
+ If you have pre-downloaded the `LibriSpeech`_
+ dataset and the `musan`_ dataset, say,
+ they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
+ the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
+ ``./prepare.sh`` won't re-download them.
+
+.. NOTE::
+
+ All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
+ are saved in ``./data`` directory.
+
+We provide the following YouTube video showing how to run ``./prepare.sh``.
+
+.. note::
+
+ To get the latest news of `next-gen Kaldi `_, please subscribe to
+ the following YouTube channel by `Nadira Povey `_:
+
+ ``_
+
+.. youtube:: ofEIoJL-mGM
+
+
+.. _codebook_index_preparation:
+
+Codebook index preparation
+--------------------------
+
+Here, we prepare necessary data for MVQ-KD. This requires the generation
+of codebook indexes (please read our `paper `_
+if you are interested in the details). In this tutorial, we use the pre-computed
+codebook indexes for convenience. The only thing you need to do is to
+run `./distillation_with_hubert.sh `_.
+
+.. note::
+
+ There are 5 stages in total. The first and second stages will be automatically skipped
+ when you choose to download the codebook indexes prepared by `icefall`_.
+ Of course, you can extract and compute the codebook indexes by yourself. This
+ requires you to download a HuBERT-XL model, and the extraction of codebook
+ indexes can take a while.
+
+
+As usual, you can control the stages you want to run by specifying the following
+two options:
+
+ - ``--stage``
+ - ``--stop_stage``
+
+For example,
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./distillation_with_hubert.sh --stage 0 --stop_stage 0 # run only stage 0
+ $ ./distillation_with_hubert.sh --stage 2 --stop_stage 4 # run from stage 2 to stage 4
+
+Here are a few options in `./distillation_with_hubert.sh `_
+you need to know before you proceed.
+
+- ``--full_libri`` If True, use the full 960h of data. Otherwise, only ``train-clean-100`` will be used.
+- ``--use_extracted_codebook`` If True, the first two stages will be skipped and the codebook
+ indexes uploaded by us will be downloaded.
+
+Since we are using the pre-computed codebook indexes, we set
+``use_extracted_codebook=True``. If you want to do full `LibriSpeech`_
+experiments, please set ``full_libri=True``.
+
+The following command downloads the pre-computed codebook indexes
+and prepares MVQ-augmented training manifests.
+
+.. code-block:: bash
+
+ $ ./distillation_with_hubert.sh --stage 2 --stop_stage 2 # run only stage 2
+
+Please see the
+following screenshot for the output of an example execution.
+
+.. figure:: ./images/distillation_codebook.png
+ :width: 800
+ :alt: Downloading codebook indexes and preparing training manifest.
+ :align: center
+
+ Downloading codebook indexes and preparing training manifest.
+
+.. hint::
+
+ The codebook indexes we prepared for you in this tutorial
+ are extracted from the 36th layer of a fine-tuned HuBERT-XL model
+ with 8 codebooks. If you want to try other configurations, please
+ set ``use_extracted_codebook=False`` and set ``embedding_layer`` and
+ ``num_codebooks`` by yourself.
+
+Now, you should see the following files under the directory ``./data/vq_fbank_layer36_cb8``.
+
+.. figure:: ./images/distillation_directory.png
+ :width: 800
+ :alt: MVQ-augmented training manifests
+ :align: center
+
+ MVQ-augmented training manifests.
+
+Voilà! You are now ready to perform knowledge distillation training!
+
+Training
+--------
+
+To perform training, please run stage 3 by executing the following command.
+
+.. code-block:: bash
+
+ $ ./distillation_with_hubert.sh --stage 3 --stop_stage 3 # run MVQ training
+
+Here is the code snippet for training:
+
+.. code-block:: bash
+
+ WORLD_SIZE=$(echo ${CUDA_VISIBLE_DEVICES} | awk '{n=split($1, _, ","); print n}')
+
+ ./pruned_transducer_stateless6/train.py \
+ --manifest-dir ./data/vq_fbank_layer36_cb8 \
+ --master-port 12359 \
+ --full-libri $full_libri \
+ --spec-aug-time-warp-factor -1 \
+ --max-duration 300 \
+ --world-size ${WORLD_SIZE} \
+ --num-epochs 30 \
+ --exp-dir $exp_dir \
+ --enable-distillation True \
+ --codebook-loss-scale 0.01
+
+Pay attention to the following arguments in the training command above
+(a hedged sketch of the codebook loss follows this list):
+
+ - ``--enable-distillation`` If True, knowledge distillation training is enabled.
+ - ``--codebook-loss-scale`` The scale of the knowledge distillation loss.
+ - ``--manifest-dir`` The path to the MVQ-augmented manifest.
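+
+To make the role of ``--codebook-loss-scale`` concrete, here is a hedged
+sketch of how a multi-codebook prediction loss can be computed. The shapes,
+the ``predictor`` head, and the hyperparameters (8 codebooks of size 256,
+encoder dimension 384) are illustrative assumptions, not icefall's exact code.
+
+.. code-block:: python
+
+   import torch
+   import torch.nn as nn
+   import torch.nn.functional as F
+
+   num_codebooks, codebook_size, encoder_dim = 8, 256, 384
+   # One classifier per codebook, packed into a single projection.
+   predictor = nn.Linear(encoder_dim, num_codebooks * codebook_size)
+
+   def codebook_loss(encoder_out, codebook_indexes):
+       # encoder_out: (N, T, encoder_dim)
+       # codebook_indexes: (N, T, num_codebooks), teacher-derived targets
+       logits = predictor(encoder_out)              # (N, T, num_codebooks * V)
+       logits = logits.reshape(-1, codebook_size)   # (N*T*num_codebooks, V)
+       return F.cross_entropy(logits, codebook_indexes.reshape(-1))
+
+   enc = torch.randn(2, 10, encoder_dim)
+   idx = torch.randint(0, codebook_size, (2, 10, num_codebooks))
+   # In training, this term is weighted by --codebook-loss-scale (0.01 above)
+   # and added to the transducer loss.
+   loss = codebook_loss(enc, idx)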
+
+
+Decoding
+--------
+
+After training finishes, you can test the performance using
+the following command.
+
+.. code-block:: bash
+
+ export CUDA_VISIBLE_DEVICES=0
+ ./pruned_transducer_stateless6/decode.py \
+ --decoding-method "modified_beam_search" \
+ --epoch 30 \
+ --avg 10 \
+ --max-duration 200 \
+ --exp-dir $exp_dir \
+ --enable-distillation True
+
+You should get results similar to those listed `here `__.
+
+That's all! Feel free to experiment with your own setups and report your results.
+If you encounter any problems during training, please open up an issue `here `__.
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png
new file mode 100644
index 000000000..1a40d6c6e
Binary files /dev/null and b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_codebook.png differ
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png
new file mode 100644
index 000000000..30763046f
Binary files /dev/null and b/docs/source/recipes/Non-streaming-ASR/librispeech/images/distillation_directory.png differ
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-conformer-ctc-tensorboard-log.png b/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-conformer-ctc-tensorboard-log.png
new file mode 100644
index 000000000..4e8c2ea7c
Binary files /dev/null and b/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-conformer-ctc-tensorboard-log.png differ
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-pruned-transducer-tensorboard-log.jpg b/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-pruned-transducer-tensorboard-log.jpg
new file mode 100644
index 000000000..800835749
Binary files /dev/null and b/docs/source/recipes/Non-streaming-ASR/librispeech/images/librispeech-pruned-transducer-tensorboard-log.jpg differ
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst
new file mode 100644
index 000000000..bf439861a
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/index.rst
@@ -0,0 +1,12 @@
+LibriSpeech
+===========
+
+.. toctree::
+ :maxdepth: 1
+
+ tdnn_lstm_ctc
+ conformer_ctc
+ pruned_transducer_stateless
+ zipformer_mmi
+ zipformer_ctc_blankskip
+ distillation
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
new file mode 100644
index 000000000..f356e97e7
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
@@ -0,0 +1,548 @@
+.. _non_streaming_librispeech_pruned_transducer_stateless:
+
+Pruned transducer statelessX
+============================
+
+This tutorial shows you how to run a conformer transducer model
+with the `LibriSpeech `_ dataset.
+
+.. Note::
+
+ The tutorial is suitable for `pruned_transducer_stateless `__,
+ `pruned_transducer_stateless2 `__,
+ `pruned_transducer_stateless4 `__,
+ `pruned_transducer_stateless5 `__.
+ We will take ``pruned_transducer_stateless4`` as an example in this tutorial.
+
+.. HINT::
+
+ We assume you have read the page :ref:`install icefall` and have setup
+ the environment for ``icefall``.
+
+.. HINT::
+
+ We recommend you to use a GPU or several GPUs to run this recipe.
+
+.. hint::
+
+ Please scroll down to the bottom of this page to find download links
+ for pretrained models if you don't want to train a model from scratch.
+
+
+We use pruned RNN-T to compute the loss.
+
+.. note::
+
+ You can find the paper about pruned RNN-T at the following address:
+
+ ``_
+
+The transducer model consists of 3 parts:
+
+ - Encoder, a.k.a. the transcription network. We use a Conformer model (the reworked version by Daniel Povey)
+ - Decoder, a.k.a. the prediction network. We use a stateless model consisting of
+ ``nn.Embedding`` and ``nn.Conv1d`` (see the sketch below)
+ - Joiner, a.k.a. the joint network.
+
+.. caution::
+
+ Contrary to the conventional RNN-T models, we use a stateless decoder.
+ That is, it has no recurrent connections.
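+
+To make "stateless" concrete, here is a hedged toy sketch of such a decoder:
+an embedding followed by a depthwise 1-D convolution over a short, fixed
+symbol context. The dimensions and the ``groups`` setting are illustrative
+assumptions; see ``decoder.py`` in the recipe for the real definition.
+
+.. code-block:: python
+
+   import torch
+   import torch.nn as nn
+
+   class StatelessDecoder(nn.Module):
+       def __init__(self, vocab_size=500, embed_dim=512, context_size=2):
+           super().__init__()
+           self.embedding = nn.Embedding(vocab_size, embed_dim)
+           # Kernel size == context size, so the output depends only on
+           # the last few emitted symbols -- no recurrent state is carried.
+           self.conv = nn.Conv1d(embed_dim, embed_dim,
+                                 kernel_size=context_size, groups=embed_dim)
+
+       def forward(self, y):
+           # y: (N, context_size) previously emitted symbols
+           emb = self.embedding(y).permute(0, 2, 1)   # (N, C, context)
+           return self.conv(emb).permute(0, 2, 1)     # (N, 1, C)
+
+   dec = StatelessDecoder()
+   out = dec(torch.tensor([[3, 7]]))   # (1, 1, 512)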
+
+
+Data preparation
+----------------
+
+.. hint::
+
+ The data preparation is the same as for other LibriSpeech recipes.
+ If you have finished this step, you can skip to ``Training`` directly.
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./prepare.sh
+
+The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
+All you need to do is to run it.
+
+The data preparation contains several stages. You can use the following two
+options:
+
+ - ``--stage``
+ - ``--stop-stage``
+
+to control which stage(s) should be run. By default, all stages are executed.
+
+
+For example,
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./prepare.sh --stage 0 --stop-stage 0
+
+means to run only stage 0.
+
+To run stage 2 to stage 5, use:
+
+.. code-block:: bash
+
+ $ ./prepare.sh --stage 2 --stop-stage 5
+
+.. HINT::
+
+ If you have pre-downloaded the `LibriSpeech `_
+ dataset and the `musan `_ dataset, say,
+ they are saved in ``/tmp/LibriSpeech`` and ``/tmp/musan``, you can modify
+ the ``dl_dir`` variable in ``./prepare.sh`` to point to ``/tmp`` so that
+ ``./prepare.sh`` won't re-download them.
+
+.. NOTE::
+
+ All generated files by ``./prepare.sh``, e.g., features, lexicon, etc,
+ are saved in ``./data`` directory.
+
+We provide the following YouTube video showing how to run ``./prepare.sh``.
+
+.. note::
+
+ To get the latest news of `next-gen Kaldi `_, please subscribe to
+ the following YouTube channel by `Nadira Povey `_:
+
+ ``_
+
+.. youtube:: ofEIoJL-mGM
+
+
+Training
+--------
+
+Configurable options
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./pruned_transducer_stateless4/train.py --help
+
+
+shows you the training options that can be passed from the commandline.
+The following options are used quite often:
+
+ - ``--exp-dir``
+
+ The directory to save checkpoints, training logs and tensorboard.
+
+ - ``--full-libri``
+
+ If it's True, the training part uses all the training data, i.e.,
+ 960 hours. Otherwise, the training part uses only the subset
+ ``train-clean-100``, which has 100 hours of training data.
+
+ .. CAUTION::
+ The training set is perturbed by speed with two factors: 0.9 and 1.1.
+ If ``--full-libri`` is True, each epoch actually processes
+ ``3x960 == 2880`` hours of data.
+
+ - ``--num-epochs``
+
+ It is the number of epochs to train. For instance,
+ ``./pruned_transducer_stateless4/train.py --num-epochs 30`` trains for 30 epochs
+ and generates ``epoch-1.pt``, ``epoch-2.pt``, ..., ``epoch-30.pt``
+ in the folder ``./pruned_transducer_stateless4/exp``.
+
+ - ``--start-epoch``
+
+ It's used to resume training.
+ ``./pruned_transducer_stateless4/train.py --start-epoch 10`` loads the
+ checkpoint ``./pruned_transducer_stateless4/exp/epoch-9.pt`` and starts
+ training from epoch 10, based on the state from epoch 9.
+
+ - ``--world-size``
+
+ It is used for multi-GPU single-machine DDP training.
+
+ - (a) If it is 1, then no DDP training is used.
+
+ - (b) If it is 2, then GPU 0 and GPU 1 are used for DDP training.
+
+ The following shows some use cases with it.
+
+ **Use case 1**: You have 4 GPUs, but you only want to use GPU 0 and
+ GPU 2 for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ export CUDA_VISIBLE_DEVICES="0,2"
+ $ ./pruned_transducer_stateless4/train.py --world-size 2
+
+ **Use case 2**: You have 4 GPUs and you want to use all of them
+ for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./pruned_transducer_stateless4/train.py --world-size 4
+
+ **Use case 3**: You have 4 GPUs but you only want to use GPU 3
+ for training. You can do the following:
+
+ .. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ export CUDA_VISIBLE_DEVICES="3"
+ $ ./pruned_transducer_stateless4/train.py --world-size 1
+
+ .. caution::
+
+ Only multi-GPU single-machine DDP training is implemented at present.
+ Multi-GPU multi-machine DDP training will be added later.
+
+ - ``--max-duration``
+
+ It specifies the number of seconds over all utterances in a
+ batch, before **padding**.
+ If you encounter CUDA OOM, please reduce it.
+
+ .. HINT::
+
+ Due to padding, the number of seconds of all utterances in a
+ batch will usually be larger than ``--max-duration``.
+
+ A larger value for ``--max-duration`` may cause OOM during training,
+ while a smaller value may increase the training time. You have to
+ tune it. A small illustration of duration-based batching follows this list.
+
+ - ``--use-fp16``
+
+ If it is True, the model is trained with half precision. From our
+ experiments, half precision allows a roughly two times larger ``--max-duration``,
+ which gives an almost 2x speedup. A sketch of a mixed-precision training
+ step also follows this list.
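+
+The following is a hedged illustration of duration-based batching, as
+mentioned under ``--max-duration`` above. The actual samplers (from lhotse)
+add bucketing and shuffling; this shows only the core idea, not their code.
+
+.. code-block:: python
+
+   def batches_by_duration(cuts, max_duration: float = 300.0):
+       # cuts: iterable of (cut_id, duration_in_seconds)
+       batch, total = [], 0.0
+       for cut_id, dur in cuts:
+           if batch and total + dur > max_duration:
+               yield batch
+               batch, total = [], 0.0
+           batch.append(cut_id)
+           total += dur
+       if batch:
+           yield batch
+
+   # Each batch sums to at most ~300 seconds of audio *before* padding.
+   print(list(batches_by_duration([("a", 120.0), ("b", 150.0), ("c", 100.0)])))
+   # -> [['a', 'b'], ['c']]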
+
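+And here is a hedged sketch of the kind of mixed-precision step that
+``--use-fp16`` corresponds to conceptually. icefall's actual training loop
+has more machinery (gradient clipping, scaler state in checkpoints, etc.).
+
+.. code-block:: python
+
+   import torch
+
+   scaler = torch.cuda.amp.GradScaler()
+
+   def train_step(model, optimizer, batch):
+       optimizer.zero_grad()
+       with torch.cuda.amp.autocast():      # forward in fp16 where safe
+           loss = model(batch)
+       scaler.scale(loss).backward()        # scale to avoid fp16 underflow
+       scaler.step(optimizer)               # unscale, then optimizer step
+       scaler.update()
+       return loss.detach()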
+
+Pre-configured options
+~~~~~~~~~~~~~~~~~~~~~~
+
+There are some training options, e.g., number of encoder layers,
+encoder dimension, decoder dimension, number of warmup steps etc,
+that are not passed from the commandline.
+They are pre-configured by the function ``get_params()`` in
+`pruned_transducer_stateless4/train.py `_
+
+You don't need to change these pre-configured parameters. If you really need to change
+them, please modify ``./pruned_transducer_stateless4/train.py`` directly.
+
+
+.. NOTE::
+
+ The options for `pruned_transducer_stateless5 `__ are a little different from
+ other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from the command line, so that you can train models of different sizes with pruned_transducer_stateless5.
+
+
+Training logs
+~~~~~~~~~~~~~
+
+Training logs and checkpoints are saved in ``--exp-dir`` (e.g., ``pruned_transducer_stateless4/exp``).
+You will find the following files in that directory:
+
+ - ``epoch-1.pt``, ``epoch-2.pt``, ...
+
+ These are checkpoint files saved at the end of each epoch, containing model
+ ``state_dict`` and optimizer ``state_dict``.
+ To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
+
+ .. code-block:: bash
+
+ $ ./pruned_transducer_stateless4/train.py --start-epoch 11
+
+ - ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ...
+
+ These are checkpoint files saved every ``--save-every-n`` batches,
+ containing model ``state_dict`` and optimizer ``state_dict``.
+ To resume training from some checkpoint, say ``checkpoint-436000``, you can use:
+
+ .. code-block:: bash
+
+ $ ./pruned_transducer_stateless4/train.py --start-batch 436000
+
+ - ``tensorboard/``
+
+ This folder contains TensorBoard logs. Training loss, validation loss, learning
+ rate, etc, are recorded in these logs. You can visualize them by:
+
+ .. code-block:: bash
+
+ $ cd pruned_transducer_stateless4/exp/tensorboard
+ $ tensorboard dev upload --logdir . --description "pruned transducer training for LibriSpeech with icefall"
+
+ It will print something like below:
+
+ .. code-block::
+
+ TensorFlow installation not found - running with reduced feature set.
+ Upload started and will continue reading any new data as it's added to the logdir.
+
+ To stop uploading, press Ctrl-C.
+
+ New experiment created. View your TensorBoard at: https://tensorboard.dev/experiment/QOGSPBgsR8KzcRMmie9JGw/
+
+ [2022-11-20T15:50:50] Started scanning logdir.
+ Uploading 4468 scalars...
+ [2022-11-20T15:53:02] Total uploaded: 210171 scalars, 0 tensors, 0 binary objects
+ Listening for new data in logdir...
+
+ Note there is a URL in the above output. Click it and you will see
+ the following screenshot:
+
+ .. figure:: images/librispeech-pruned-transducer-tensorboard-log.jpg
+ :width: 600
+ :alt: TensorBoard screenshot
+ :align: center
+ :target: https://tensorboard.dev/experiment/QOGSPBgsR8KzcRMmie9JGw/
+
+ TensorBoard screenshot.
+
+ .. hint::
+
+ If you don't have access to Google, you can use the following command
+ to view the tensorboard log locally:
+
+ .. code-block:: bash
+
+ cd pruned_transducer_stateless4/exp/tensorboard
+ tensorboard --logdir . --port 6008
+
+ It will print the following message:
+
+ .. code-block::
+
+ Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
+ TensorBoard 2.8.0 at http://localhost:6008/ (Press CTRL+C to quit)
+
+ Now start your browser and go to ``_ to view the tensorboard
+ logs.
+
+
+ - ``log/log-train-xxxx``
+
+ It is the detailed training log in text format, same as the one
+ you saw printed to the console during training.
+
+Usage example
+~~~~~~~~~~~~~
+
+You can use the following command to start the training using 6 GPUs:
+
+.. code-block:: bash
+
+ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5"
+ ./pruned_transducer_stateless4/train.py \
+ --world-size 6 \
+ --num-epochs 30 \
+ --start-epoch 1 \
+ --exp-dir pruned_transducer_stateless4/exp \
+ --full-libri 1 \
+ --max-duration 300
+
+
+Decoding
+--------
+
+The decoding part uses checkpoints saved by the training part, so you have
+to run the training part first.
+
+.. hint::
+
+ There are two kinds of checkpoints:
+
+ - (1) ``epoch-1.pt``, ``epoch-2.pt``, ..., which are saved at the end
+ of each epoch. You can pass ``--epoch`` to
+ ``pruned_transducer_stateless4/decode.py`` to use them.
+
+ - (2) ``checkpoint-436000.pt``, ``checkpoint-438000.pt``, ..., which are saved
+ every ``--save-every-n`` batches. You can pass ``--iter`` to
+ ``pruned_transducer_stateless4/decode.py`` to use them.
+
+ We suggest that you try both types of checkpoints and choose the one
+ that produces the lowest WERs.
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./pruned_transducer_stateless4/decode.py --help
+
+shows the options for decoding.
+
+The following shows two examples (for two types of checkpoints):
+
+.. code-block:: bash
+
+ for m in greedy_search fast_beam_search modified_beam_search; do
+ for epoch in 25 20; do
+ for avg in 7 5 3 1; do
+ ./pruned_transducer_stateless4/decode.py \
+ --epoch $epoch \
+ --avg $avg \
+ --exp-dir pruned_transducer_stateless4/exp \
+ --max-duration 600 \
+ --decoding-method $m
+ done
+ done
+ done
+
+
+.. code-block:: bash
+
+ for m in greedy_search fast_beam_search modified_beam_search; do
+ for iter in 474000; do
+ for avg in 8 10 12 14 16 18; do
+ ./pruned_transducer_stateless4/decode.py \
+ --iter $iter \
+ --avg $avg \
+ --exp-dir pruned_transducer_stateless4/exp \
+ --max-duration 600 \
+ --decoding-method $m
+ done
+ done
+ done
+
+
+.. Note::
+
+ Supported decoding methods are as follows:
+
+ - ``greedy_search`` : It takes the symbol with the largest posterior probability
+ at each frame as the decoding result (see the sketch after this list).
+
+ - ``beam_search`` : It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf and
+ `espnet/nets/beam_search_transducer.py `_
+ is used as a reference. Basically, it keeps the top-k states for each frame and expands the kept states with their own contexts to
+ the next frame.
+
+ - ``modified_beam_search`` : It implements the same algorithm as ``beam_search`` above, but it
+ runs in batch mode with ``--max-sym-per-frame=1`` being hardcoded.
+
+ - ``fast_beam_search`` : It implements graph composition between the output ``log_probs`` and
+ given ``FSAs``. The details are hard to describe in a few lines of text; you can read
+ our paper at https://arxiv.org/pdf/2211.00484.pdf or our `rnnt decode code in k2 `_. ``fast_beam_search`` can decode with ``FSAs`` on GPU efficiently.
+
+ - ``fast_beam_search_LG`` : The same as ``fast_beam_search`` above, except that ``fast_beam_search`` uses
+ a trivial graph that has only one state, while ``fast_beam_search_LG`` uses an LG graph
+ (with an N-gram LM).
+
+ - ``fast_beam_search_nbest`` : It produces the decoding results as follows:
+
+ - (1) Use ``fast_beam_search`` to get a lattice
+ - (2) Select ``num_paths`` paths from the lattice using ``k2.random_paths()``
+ - (3) De-duplicate the selected paths
+ - (4) Intersect the selected paths with the lattice and compute the
+ shortest path from the intersection result
+ - (5) The path with the largest score is used as the decoding output.
+
+ - ``fast_beam_search_nbest_LG`` : It implements the same logic as ``fast_beam_search_nbest``; the
+ only difference is that it uses ``fast_beam_search_LG`` to generate the lattice.
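+
+The following is a hedged sketch of ``greedy_search`` with at most one symbol
+per frame, as referenced above. The ``decoder`` and ``joiner`` call signatures
+here are assumptions for illustration; see ``beam_search.py`` in the recipe
+for the real, batched implementation.
+
+.. code-block:: python
+
+   import torch
+
+   def greedy_search(encoder_out, decoder, joiner,
+                     blank_id: int = 0, context_size: int = 2):
+       # encoder_out: (T, C). The stateless decoder consumes the last
+       # ``context_size`` emitted symbols instead of a recurrent state.
+       hyp = [blank_id] * context_size
+       for t in range(encoder_out.size(0)):
+           context = torch.tensor([hyp[-context_size:]])
+           dec_out = decoder(context)                       # (1, 1, C)
+           logits = joiner(encoder_out[t].view(1, 1, -1), dec_out)
+           y = int(logits.argmax(dim=-1))
+           if y != blank_id:   # emit at most one symbol, then advance
+               hyp.append(y)
+       return hyp[context_size:]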
+
+
+Export Model
+------------
+
+`pruned_transducer_stateless4/export.py `_ supports exporting checkpoints from ``pruned_transducer_stateless4/exp`` in the following ways.
+
+Export ``model.state_dict()``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Checkpoints saved by ``pruned_transducer_stateless4/train.py`` also include
+``optimizer.state_dict()``. It is useful for resuming training. But after training,
+we are interested only in ``model.state_dict()``. You can use the following
+command to extract ``model.state_dict()``.
+
+.. code-block:: bash
+
+ # Assume that --epoch 25 --avg 3 produces the smallest WER
+ # (You can get such information after running ./pruned_transducer_stateless4/decode.py)
+
+ epoch=25
+ avg=3
+
+ ./pruned_transducer_stateless4/export.py \
+ --exp-dir ./pruned_transducer_stateless4/exp \
+ --bpe-model data/lang_bpe_500/bpe.model \
+ --epoch $epoch \
+ --avg $avg
+
+It will generate a file ``./pruned_transducer_stateless4/exp/pretrained.pt``.
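+
+Conceptually, the export step just keeps the (averaged) model weights and
+drops the optimizer state. A hedged sketch (the ``"model"`` key is an
+assumption about the checkpoint layout, and averaging is omitted):
+
+.. code-block:: python
+
+   import torch
+
+   ckpt = torch.load("pruned_transducer_stateless4/exp/epoch-25.pt",
+                     map_location="cpu")
+   # Keep only the model weights; optimizer state is no longer needed.
+   torch.save({"model": ckpt["model"]},
+              "pruned_transducer_stateless4/exp/pretrained.pt")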
+
+.. hint::
+
+ To use the generated ``pretrained.pt`` for ``pruned_transducer_stateless4/decode.py``,
+ you can run:
+
+ .. code-block:: bash
+
+ cd pruned_transducer_stateless4/exp
+ ln -s pretrained.pt epoch-999.pt
+
+ And then pass ``--epoch 999 --avg 1 --use-averaged-model 0`` to
+ ``./pruned_transducer_stateless4/decode.py``.
+
+To use the exported model with ``./pruned_transducer_stateless4/pretrained.py``, you
+can run:
+
+.. code-block:: bash
+
+ ./pruned_transducer_stateless4/pretrained.py \
+ --checkpoint ./pruned_transducer_stateless4/exp/pretrained.pt \
+ --bpe-model ./data/lang_bpe_500/bpe.model \
+ --method greedy_search \
+ /path/to/foo.wav \
+ /path/to/bar.wav
+
+
+Export model using ``torch.jit.script()``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ ./pruned_transducer_stateless4/export.py \
+ --exp-dir ./pruned_transducer_stateless4/exp \
+ --bpe-model data/lang_bpe_500/bpe.model \
+ --epoch 25 \
+ --avg 3 \
+ --jit 1
+
+It will generate a file ``cpu_jit.pt`` in the given ``exp_dir``. You can later
+load it by ``torch.jit.load("cpu_jit.pt")``.
+
+Note that ``cpu`` in the name ``cpu_jit.pt`` means the parameters are on CPU
+when loaded into Python. You can use ``to("cuda")`` to move them to a CUDA device.
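+
+A hedged sketch of loading the exported file from Python (the forward
+signature of the loaded module depends on the recipe's model code):
+
+.. code-block:: python
+
+   import torch
+
+   model = torch.jit.load("pruned_transducer_stateless4/exp/cpu_jit.pt")
+   model.eval()
+   model.to("cuda")   # parameters are stored on CPU; move them if desired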
+
+.. NOTE::
+
+ You will need this ``cpu_jit.pt`` when deploying with Sherpa framework.
+
+
+Download pretrained models
+--------------------------
+
+If you don't want to train from scratch, you can download the pretrained models
+by visiting the following links:
+
+ - `pruned_transducer_stateless `__
+
+ - `pruned_transducer_stateless2 `__
+
+ - `pruned_transducer_stateless4 `__
+
+ - `pruned_transducer_stateless5 `__
+
+ See ``_
+ for the details of the above pretrained models.
+
+
+Deploy with Sherpa
+------------------
+
+Please see ``_
+for how to deploy the models in ``sherpa``.
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/tdnn_lstm_ctc.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/tdnn_lstm_ctc.rst
new file mode 100644
index 000000000..aa380396a
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/tdnn_lstm_ctc.rst
@@ -0,0 +1,404 @@
+TDNN-LSTM-CTC
+=============
+
+This tutorial shows you how to run a TDNN-LSTM-CTC model with the `LibriSpeech `_ dataset.
+
+
+.. HINT::
+
+ We assume you have read the page :ref:`install icefall` and have setup
+ the environment for ``icefall``.
+
+
+Data preparation
+----------------
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./prepare.sh
+
+The script ``./prepare.sh`` handles the data preparation for you, **automagically**.
+All you need to do is to run it.
+
+The data preparation contains several stages. You can use the following two
+options:
+
+ - ``--stage``
+ - ``--stop-stage``
+
+to control which stage(s) should be run. By default, all stages are executed.
+
+
+For example,
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./prepare.sh --stage 0 --stop-stage 0
+
+means to run only stage 0.
+
+To run stage 2 to stage 5, use:
+
+.. code-block:: bash
+
+ $ ./prepare.sh --stage 2 --stop-stage 5
+
+We provide the following YouTube video showing how to run ``./prepare.sh``.
+
+.. note::
+
+ To get the latest news of `next-gen Kaldi `_, please subscribe to
+ the following YouTube channel by `Nadira Povey `_:
+
+ ``_
+
+.. youtube:: ofEIoJL-mGM
+
+Training
+--------
+
+This section describes how to train the TDNN-LSTM-CTC model, contained in
+the `tdnn_lstm_ctc `_
+folder.
+
+The command to run the training part is:
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ export CUDA_VISIBLE_DEVICES="0,1,2,3"
+ $ ./tdnn_lstm_ctc/train.py --world-size 4
+
+By default, it will run ``20`` epochs. Training logs and checkpoints are saved
+in ``tdnn_lstm_ctc/exp``.
+
+In ``tdnn_lstm_ctc/exp``, you will find the following files:
+
+ - ``epoch-0.pt``, ``epoch-1.pt``, ..., ``epoch-19.pt``
+
+ These are checkpoint files, containing model ``state_dict`` and optimizer ``state_dict``.
+ To resume training from some checkpoint, say ``epoch-10.pt``, you can use:
+
+ .. code-block:: bash
+
+ $ ./tdnn_lstm_ctc/train.py --start-epoch 11
+
+ - ``tensorboard/``
+
+ This folder contains TensorBoard logs. Training loss, validation loss, learning
+ rate, etc, are recorded in these logs. You can visualize them by:
+
+ .. code-block:: bash
+
+ $ cd tdnn_lstm_ctc/exp/tensorboard
+ $ tensorboard dev upload --logdir . --description "TDNN LSTM training for librispeech with icefall"
+
+ - ``log/log-train-xxxx``
+
+ It is the detailed training log in text format, same as the one
+ you saw printed to the console during training.
+
+
+To see available training options, you can use:
+
+.. code-block:: bash
+
+ $ ./tdnn_lstm_ctc/train.py --help
+
+Other training options, e.g., learning rate, results dir, etc., are
+pre-configured in the function ``get_params()``
+in `tdnn_lstm_ctc/train.py `_.
+Normally, you don't need to change them. You can change them by modifying the code, if
+you want.
+
+Decoding
+--------
+
+The decoding part uses checkpoints saved by the training part, so you have
+to run the training part first.
+
+The command for decoding is:
+
+.. code-block:: bash
+
+ $ export CUDA_VISIBLE_DEVICES="0"
+ $ ./tdnn_lstm_ctc/decode.py
+
+You will see the WER in the output log.
+
+Decoded results are saved in ``tdnn_lstm_ctc/exp``.
+
+.. code-block:: bash
+
+ $ ./tdnn_lstm_ctc/decode.py --help
+
+shows you the available decoding options.
+
+Some commonly used options are:
+
+ - ``--epoch``
+
+ You can select which checkpoint to be used for decoding.
+ For instance, ``./tdnn_lstm_ctc/decode.py --epoch 10`` means to use
+ ``./tdnn_lstm_ctc/exp/epoch-10.pt`` for decoding.
+
+ - ``--avg``
+
+ It's related to model averaging. It specifies the number of checkpoints
+ to be averaged; the averaged model is used for decoding (a minimal sketch
+ of the averaging appears after this list).
+ For example, the following command:
+
+ .. code-block:: bash
+
+ $ ./tdnn_lstm_ctc/decode.py --epoch 10 --avg 3
+
+ uses the average of ``epoch-8.pt``, ``epoch-9.pt`` and ``epoch-10.pt``
+ for decoding.
+
+ - ``--export``
+
+ If it is ``True``, i.e., ``./tdnn_lstm_ctc/decode.py --export 1``, the code
+ will save the averaged model to ``tdnn_lstm_ctc/exp/pretrained.pt``.
+ See :ref:`tdnn_lstm_ctc use a pre-trained model` for how to use it.
+
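+The following is a hedged sketch of the averaging mentioned under ``--avg``;
+``decode.py`` has its own implementation, and the ``"model"`` key is an
+assumption about the checkpoint layout.
+
+.. code-block:: python
+
+   import torch
+
+   def average_checkpoints(paths):
+       avg = None
+       for p in paths:
+           state = torch.load(p, map_location="cpu")["model"]
+           if avg is None:
+               avg = {k: v.clone().float() for k, v in state.items()}
+           else:
+               for k in avg:
+                   avg[k] += state[k].float()
+       # Element-wise mean over all loaded checkpoints.
+       return {k: v / len(paths) for k, v in avg.items()}
+
+   # --epoch 10 --avg 3 corresponds roughly to:
+   # average_checkpoints(["epoch-8.pt", "epoch-9.pt", "epoch-10.pt"])
+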
+
+.. _tdnn_lstm_ctc use a pre-trained model:
+
+Pre-trained Model
+-----------------
+
+We have uploaded the pre-trained model to
+``_.
+
+The following shows you how to use the pre-trained model.
+
+
+Install kaldifeat
+~~~~~~~~~~~~~~~~~
+
+`kaldifeat `_ is used to
+extract features for a single sound file or multiple sound files
+at the same time.
+
+Please refer to ``_ for installation.
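+
+A hedged sketch of extracting 80-dim fbank features with kaldifeat (the
+option values mirror what the recipe's ``pretrained.py`` typically uses,
+but treat them as assumptions):
+
+.. code-block:: python
+
+   import kaldifeat
+   import torch
+   import torchaudio
+
+   opts = kaldifeat.FbankOptions()
+   opts.device = torch.device("cpu")
+   opts.frame_opts.samp_freq = 16000
+   opts.frame_opts.dither = 0
+   opts.mel_opts.num_bins = 80
+
+   fbank = kaldifeat.Fbank(opts)
+
+   wave, sample_rate = torchaudio.load("1089-134686-0001.flac")
+   assert sample_rate == 16000
+   features = fbank([wave[0]])[0]   # (num_frames, 80)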
+
+Download the pre-trained model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ mkdir tmp
+ $ cd tmp
+ $ git lfs install
+ $ git clone https://huggingface.co/pkufool/icefall_asr_librispeech_tdnn-lstm_ctc
+
+.. CAUTION::
+
+ You have to use ``git lfs`` to download the pre-trained model.
+
+.. CAUTION::
+
+ In order to use this pre-trained model, your k2 version has to be v1.7 or later.
+
+After downloading, you will have the following files:
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ tree tmp
+
+.. code-block:: bash
+
+ tmp/
+ `-- icefall_asr_librispeech_tdnn-lstm_ctc
+ |-- README.md
+ |-- data
+ | |-- lang_phone
+ | | |-- HLG.pt
+ | | |-- tokens.txt
+ | | `-- words.txt
+ | `-- lm
+ | `-- G_4_gram.pt
+ |-- exp
+ | `-- pretrained.pt
+ `-- test_wavs
+ |-- 1089-134686-0001.flac
+ |-- 1221-135766-0001.flac
+ |-- 1221-135766-0002.flac
+ `-- trans.txt
+
+ 6 directories, 10 files
+
+**File descriptions**:
+
+ - ``data/lang_phone/HLG.pt``
+
+ It is the decoding graph.
+
+ - ``data/lang_phone/tokens.txt``
+
+ It contains tokens and their IDs.
+
+ - ``data/lang_phone/words.txt``
+
+ It contains words and their IDs.
+
+ - ``data/lm/G_4_gram.pt``
+
+ It is a 4-gram LM, useful for LM rescoring.
+
+ - ``exp/pretrained.pt``
+
+ It contains pre-trained model parameters, obtained by averaging
+ checkpoints from ``epoch-14.pt`` to ``epoch-19.pt``.
+ Note: We have removed optimizer ``state_dict`` to reduce file size.
+
+ - ``test_wavs/*.flac``
+
+ It contains some test sound files from the LibriSpeech ``test-clean`` dataset.
+
+ - ``test_wavs/trans.txt``
+
+ It contains the reference transcripts for the sound files in ``test_wavs/``.
+
+The information of the test sound files is listed below:
+
+.. code-block:: bash
+
+ $ soxi tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/*.flac
+
+ Input File : 'tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1089-134686-0001.flac'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:06.62 = 106000 samples ~ 496.875 CDDA sectors
+ File Size : 116k
+ Bit Rate : 140k
+ Sample Encoding: 16-bit FLAC
+
+
+ Input File : 'tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0001.flac'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:16.71 = 267440 samples ~ 1253.62 CDDA sectors
+ File Size : 343k
+ Bit Rate : 164k
+ Sample Encoding: 16-bit FLAC
+
+
+ Input File : 'tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0002.flac'
+ Channels : 1
+ Sample Rate : 16000
+ Precision : 16-bit
+ Duration : 00:00:04.83 = 77200 samples ~ 361.875 CDDA sectors
+ File Size : 105k
+ Bit Rate : 174k
+ Sample Encoding: 16-bit FLAC
+
+ Total Duration of 3 files: 00:00:28.16
+
+
+Inference with a pre-trained model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: bash
+
+ $ cd egs/librispeech/ASR
+ $ ./tdnn_lstm_ctc/pretrained.py --help
+
+shows the usage information of ``./tdnn_lstm_ctc/pretrained.py``.
+
+To decode with ``1best`` method, we can use:
+
+.. code-block:: bash
+
+ ./tdnn_lstm_ctc/pretrained.py \
+ --checkpoint ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/exp/pretrained.pt \
+ --words-file ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/data/lang_phone/words.txt \
+ --HLG ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/data/lang_phone/HLG.pt \
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1089-134686-0001.flac \
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0001.flac \
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0002.flac
+
+The output is:
+
+.. code-block::
+
+ 2021-08-24 16:57:13,315 INFO [pretrained.py:168] device: cuda:0
+ 2021-08-24 16:57:13,315 INFO [pretrained.py:170] Creating model
+ 2021-08-24 16:57:18,331 INFO [pretrained.py:182] Loading HLG from ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/data/lang_phone/HLG.pt
+ 2021-08-24 16:57:27,581 INFO [pretrained.py:199] Constructing Fbank computer
+ 2021-08-24 16:57:27,584 INFO [pretrained.py:209] Reading sound files: ['./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0002.flac']
+ 2021-08-24 16:57:27,599 INFO [pretrained.py:215] Decoding started
+ 2021-08-24 16:57:27,791 INFO [pretrained.py:245] Use HLG decoding
+ 2021-08-24 16:57:28,098 INFO [pretrained.py:266]
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1089-134686-0001.flac:
+ AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
+
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0001.flac:
+ GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
+
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0002.flac:
+ YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
+
+
+ 2021-08-24 16:57:28,099 INFO [pretrained.py:268] Decoding Done
+
+
+To decode with the ``whole-lattice-rescoring`` method, you can use:
+
+.. code-block:: bash
+
+ ./tdnn_lstm_ctc/pretrained.py \
+ --checkpoint ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/exp/pretrained.pt \
+ --words-file ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/data/lang_phone/words.txt \
+ --HLG ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/data/lang_phone/HLG.pt \
+ --method whole-lattice-rescoring \
+ --G ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/data/lm/G_4_gram.pt \
+ --ngram-lm-scale 0.8 \
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1089-134686-0001.flac \
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0001.flac \
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0002.flac
+
+The decoding output is:
+
+.. code-block::
+
+ 2021-08-24 16:39:24,725 INFO [pretrained.py:168] device: cuda:0
+ 2021-08-24 16:39:24,725 INFO [pretrained.py:170] Creating model
+ 2021-08-24 16:39:29,403 INFO [pretrained.py:182] Loading HLG from ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/data/lang_phone/HLG.pt
+ 2021-08-24 16:39:40,631 INFO [pretrained.py:190] Loading G from ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/data/lm/G_4_gram.pt
+ 2021-08-24 16:39:53,098 INFO [pretrained.py:199] Constructing Fbank computer
+ 2021-08-24 16:39:53,107 INFO [pretrained.py:209] Reading sound files: ['./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0002.flac']
+ 2021-08-24 16:39:53,121 INFO [pretrained.py:215] Decoding started
+ 2021-08-24 16:39:53,443 INFO [pretrained.py:250] Use HLG decoding + LM rescoring
+ 2021-08-24 16:39:54,010 INFO [pretrained.py:266]
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1089-134686-0001.flac:
+ AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS
+
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0001.flac:
+ GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONORED BOSOM TO CONNECT HER PARENT FOREVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN
+
+ ./tmp/icefall_asr_librispeech_tdnn-lstm_ctc/test_wavs/1221-135766-0002.flac:
+ YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION
+
+
+ 2021-08-24 16:39:54,010 INFO [pretrained.py:268] Decoding Done
+
+
+Colab notebook
+--------------
+
+We provide a Colab notebook for decoding with a pre-trained model.
+
+|librispeech tdnn_lstm_ctc colab notebook|
+
+.. |librispeech tdnn_lstm_ctc colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+ :target: https://colab.research.google.com/drive/1-iSfQMp2So-We_Uu49N4AAcMInB72u9z?usp=sharing
+
+
+**Congratulations!** You have finished the TDNN-LSTM-CTC recipe on LibriSpeech in ``icefall``.
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/zipformer_ctc_blankskip.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/zipformer_ctc_blankskip.rst
new file mode 100644
index 000000000..aa73bfe33
--- /dev/null
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/zipformer_ctc_blankskip.rst
@@ -0,0 +1,454 @@
+Zipformer CTC Blank Skip
+========================
+
+.. hint::
+
+ Please scroll down to the bottom of this page to find download links
+ for pretrained models if you don't want to train a model from scratch.
+
+
+This tutorial shows you how to train a Zipformer model based on the guidance from
+a co-trained CTC model using `blank skip method