Merge remote-tracking branch 'upstream/master' into reazonspeech-recipe

root 2024-05-01 23:21:38 +09:00
commit 3505a8ec45
644 changed files with 96131 additions and 1880 deletions

.github/scripts/.gitignore vendored Normal file

@@ -0,0 +1 @@
piper_phonemize.html

.github/scripts/audioset/AT/run.sh vendored Executable file

@@ -0,0 +1,94 @@
#!/usr/bin/env bash
set -ex
python3 -m pip install onnxoptimizer onnxsim
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/audioset/AT
function test_pretrained() {
repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12
repo=$(basename $repo_url)
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
pushd $repo/exp
git lfs pull --include pretrained.pt
ln -s pretrained.pt epoch-99.pt
ls -lh
popd
log "test pretrained.pt"
python3 zipformer/pretrained.py \
--checkpoint $repo/exp/pretrained.pt \
--label-dict $repo/data/class_labels_indices.csv \
$repo/test_wavs/1.wav \
$repo/test_wavs/2.wav \
$repo/test_wavs/3.wav \
$repo/test_wavs/4.wav
log "test jit export"
ls -lh $repo/exp/
python3 zipformer/export.py \
--exp-dir $repo/exp \
--epoch 99 \
--avg 1 \
--use-averaged-model 0 \
--jit 1
ls -lh $repo/exp/
log "test jit models"
python3 zipformer/jit_pretrained.py \
--nn-model-filename $repo/exp/jit_script.pt \
--label-dict $repo/data/class_labels_indices.csv \
$repo/test_wavs/1.wav \
$repo/test_wavs/2.wav \
$repo/test_wavs/3.wav \
$repo/test_wavs/4.wav
log "test onnx export"
ls -lh $repo/exp/
python3 zipformer/export-onnx.py \
--exp-dir $repo/exp \
--epoch 99 \
--avg 1 \
--use-averaged-model 0
ls -lh $repo/exp/
pushd $repo/exp/
mv model-epoch-99-avg-1.onnx model.onnx
mv model-epoch-99-avg-1.int8.onnx model.int8.onnx
popd
ls -lh $repo/exp/
log "test onnx models"
for m in model.onnx model.int8.onnx; do
log "$m"
python3 zipformer/onnx_pretrained.py \
--model-filename $repo/exp/$m \
--label-dict $repo/data/class_labels_indices.csv \
$repo/test_wavs/1.wav \
$repo/test_wavs/2.wav \
$repo/test_wavs/3.wav \
$repo/test_wavs/4.wav
done
log "prepare data for uploading to huggingface"
dst=/icefall/model-onnx
mkdir -p $dst
cp -v $repo/exp/*.onnx $dst/
cp -v $repo/data/* $dst/
cp -av $repo/test_wavs $dst
ls -lh $dst
ls -lh $dst/test_wavs
}
test_pretrained


@@ -11,6 +11,7 @@ ARG _KALDIFEAT_VERSION="${KALDIFEAT_VERSION}+cpu.torch${TORCH_VERSION}"
 RUN apt-get update -y && \
     apt-get install -qq -y \
+      cmake \
       ffmpeg \
       git \
       git-lfs \
@@ -35,7 +36,9 @@ RUN pip install --no-cache-dir \
       \
       git+https://github.com/lhotse-speech/lhotse \
       kaldifeat==${_KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cpu.html \
+      cython \
       dill \
+      espnet_tts_frontend \
       graphviz \
       kaldi-decoder \
       kaldi_native_io \
@@ -44,10 +47,15 @@ RUN pip install --no-cache-dir \
       kaldilm \
       matplotlib \
       multi_quantization \
+      numba \
       numpy \
+      onnxoptimizer \
+      onnxsim \
       onnx \
       onnxmltools \
       onnxruntime \
+      piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html \
+      pypinyin==0.50.0 \
       pytest \
       sentencepiece>=0.1.96 \
       six \


@@ -6,8 +6,8 @@ import json

 def version_gt(a, b):
-    a_major, a_minor = a.split(".")[:2]
-    b_major, b_minor = b.split(".")[:2]
+    a_major, a_minor = list(map(int, a.split(".")))[:2]
+    b_major, b_minor = list(map(int, b.split(".")))[:2]
     if a_major > b_major:
         return True

@@ -18,8 +18,8 @@ def version_gt(a, b):

 def version_ge(a, b):
-    a_major, a_minor = a.split(".")[:2]
-    b_major, b_minor = b.split(".")[:2]
+    a_major, a_minor = list(map(int, a.split(".")))[:2]
+    b_major, b_minor = list(map(int, b.split(".")))[:2]
     if a_major > b_major:
         return True

@@ -43,11 +43,15 @@ def get_torchaudio_version(torch_version):

 def get_matrix():
-    k2_version = "1.24.4.dev20231220"
-    kaldifeat_version = "1.25.3.dev20231221"
-    version = "1.2"
-    python_version = ["3.8", "3.9", "3.10", "3.11"]
-    torch_version = ["1.13.0", "1.13.1", "2.0.0", "2.0.1", "2.1.0", "2.1.1", "2.1.2"]
+    k2_version = "1.24.4.dev20240223"
+    kaldifeat_version = "1.25.4.dev20240223"
+    version = "20240401"
+    python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
+    torch_version = []
+    torch_version += ["1.13.0", "1.13.1"]
+    torch_version += ["2.0.0", "2.0.1"]
+    torch_version += ["2.1.0", "2.1.1", "2.1.2"]
+    torch_version += ["2.2.0", "2.2.1", "2.2.2"]

     matrix = []
     for p in python_version:
@@ -57,10 +61,21 @@ def get_matrix():
         if version_gt(p, "3.10") and not version_gt(t, "2.0"):
             continue

+        # only torch>=2.2.0 supports python 3.12
+        if version_gt(p, "3.11") and not version_gt(t, "2.1"):
+            continue
+
+        k2_version_2 = k2_version
+        kaldifeat_version_2 = kaldifeat_version
+
+        if t == "2.2.2":
+            k2_version_2 = "1.24.4.dev20240328"
+            kaldifeat_version_2 = "1.25.4.dev20240329"
+
         matrix.append(
             {
-                "k2-version": k2_version,
-                "kaldifeat-version": kaldifeat_version,
+                "k2-version": k2_version_2,
+                "kaldifeat-version": kaldifeat_version_2,
                 "version": version,
                 "python-version": p,
                 "torch-version": t,


@@ -0,0 +1,29 @@
#!/usr/bin/env python3
def main():
prefix = (
"https://github.com/csukuangfj/piper-phonemize/releases/download/2023.12.5/"
)
files = [
"piper_phonemize-1.2.0-cp310-cp310-macosx_10_14_x86_64.whl",
"piper_phonemize-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
"piper_phonemize-1.2.0-cp311-cp311-macosx_10_14_x86_64.whl",
"piper_phonemize-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
"piper_phonemize-1.2.0-cp312-cp312-macosx_10_14_x86_64.whl",
"piper_phonemize-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
"piper_phonemize-1.2.0-cp37-cp37m-macosx_10_14_x86_64.whl",
"piper_phonemize-1.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
"piper_phonemize-1.2.0-cp38-cp38-macosx_10_14_x86_64.whl",
"piper_phonemize-1.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
"piper_phonemize-1.2.0-cp39-cp39-macosx_10_14_x86_64.whl",
"piper_phonemize-1.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
]
with open("piper_phonemize.html", "w") as f:
for file in files:
url = prefix + file
f.write(f'<a href="{url}">{file}</a><br/>\n')
if __name__ == "__main__":
main()
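pip accepts a plain HTML page of `<a>` links via `-f`/`--find-links`; the docs workflow change further below copies the generated `piper_phonemize.html` into the built documentation, which is how `pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html` (used in the LJSpeech CI script below) resolves these wheels. As a quick sanity check, the links on the generated page can be listed with the standard library (illustrative snippet, not part of the commit):

```python
from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    """Collects href targets of all <a> tags, i.e. the wheel URLs pip would see."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self.links.extend(v for k, v in attrs if k == "href")

parser = LinkCollector()
with open("piper_phonemize.html") as f:
    parser.feed(f.read())
print("\n".join(parser.links))  # 12 wheel URLs, one per line
```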


@@ -15,9 +15,9 @@ function prepare_data() {
   # cause OOM error for CI later.
   mkdir -p download/lm
   pushd download/lm
-  wget -q http://www.openslr.org/resources/11/librispeech-vocab.txt
-  wget -q http://www.openslr.org/resources/11/librispeech-lexicon.txt
-  wget -q http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-lm-norm.txt.gz
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-lexicon.txt
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-vocab.txt

   ls -lh
   gunzip librispeech-lm-norm.txt.gz
@@ -64,6 +64,46 @@ function run_diagnostics() {
     --print-diagnostics 1
 }

+function test_streaming_zipformer_ctc_hlg() {
+  repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-streaming-zipformer-small-2024-03-18
+
+  log "Downloading pre-trained model from $repo_url"
+  git lfs install
+  git clone $repo_url
+  repo=$(basename $repo_url)
+
+  rm $repo/exp-ctc-rnnt-small/*.onnx
+  ls -lh $repo/exp-ctc-rnnt-small
+
+  # export models to onnx
+  ./zipformer/export-onnx-streaming-ctc.py \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
+    --epoch 30 \
+    --avg 3 \
+    --exp-dir $repo/exp-ctc-rnnt-small \
+    --causal 1 \
+    --use-ctc 1 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    \
+    --num-encoder-layers 2,2,2,2,2,2 \
+    --feedforward-dim 512,768,768,768,768,768 \
+    --encoder-dim 192,256,256,256,256,256 \
+    --encoder-unmasked-dim 192,192,192,192,192,192
+
+  ls -lh $repo/exp-ctc-rnnt-small
+
+  for wav in 0.wav 1.wav 8k.wav; do
+    python3 ./zipformer/onnx_pretrained_ctc_HLG_streaming.py \
+      --nn-model $repo/exp-ctc-rnnt-small/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx \
+      --words $repo/data/lang_bpe_500/words.txt \
+      --HLG $repo/data/lang_bpe_500/HLG.fst \
+      $repo/test_wavs/$wav
+  done
+
+  rm -rf $repo
+}
+
 function test_pruned_transducer_stateless_2022_03_12() {
   repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
@@ -1577,6 +1617,7 @@ function test_transducer_bpe_500_2021_12_23() {
 prepare_data
 run_diagnostics
+test_streaming_zipformer_ctc_hlg
 test_pruned_transducer_stateless_2022_03_12
 test_pruned_transducer_stateless2_2022_04_29
 test_pruned_transducer_stateless3_2022_04_29

.github/scripts/ljspeech/TTS/run.sh vendored Executable file

@@ -0,0 +1,157 @@
#!/usr/bin/env bash
set -ex
python3 -m pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
python3 -m pip install espnet_tts_frontend
python3 -m pip install numba
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
cd egs/ljspeech/TTS
sed -i.bak s/600/8/g ./prepare.sh
sed -i.bak s/"first 100"/"first 3"/g ./prepare.sh
sed -i.bak s/500/5/g ./prepare.sh
git diff
function prepare_data() {
# We have created a subset of the data for testing
#
mkdir download
pushd download
wget -q https://huggingface.co/csukuangfj/ljspeech-subset-for-ci-test/resolve/main/LJSpeech-1.1.tar.bz2
tar xvf LJSpeech-1.1.tar.bz2
popd
./prepare.sh
tree .
}
function train() {
pushd ./vits
sed -i.bak s/200/3/g ./train.py
git diff .
popd
for t in low medium high; do
./vits/train.py \
--exp-dir vits/exp-$t \
--model-type $t \
--num-epochs 1 \
--save-every-n 1 \
--num-buckets 2 \
--tokens data/tokens.txt \
--max-duration 20
ls -lh vits/exp-$t
done
}
function infer() {
for t in low medium high; do
./vits/infer.py \
--num-buckets 2 \
--model-type $t \
--epoch 1 \
--exp-dir ./vits/exp-$t \
--tokens data/tokens.txt \
--max-duration 20
done
}
function export_onnx() {
for t in low medium high; do
./vits/export-onnx.py \
--model-type $t \
--epoch 1 \
--exp-dir ./vits/exp-$t \
--tokens data/tokens.txt
ls -lh vits/exp-$t/
done
}
function test_medium() {
git clone https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-medium-2024-03-12
./vits/export-onnx.py \
--model-type medium \
--epoch 820 \
--exp-dir ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp \
--tokens ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt
ls -lh ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp
./vits/test_onnx.py \
--model-filename ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp/vits-epoch-820.onnx \
--tokens ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt \
--output-filename /icefall/test-medium.wav
ls -lh /icefall/test-medium.wav
d=/icefall/vits-icefall-en_US-ljspeech-medium
mkdir $d
cp -v ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt $d/
cp -v ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp/vits-epoch-820.onnx $d/model.onnx
rm -rf icefall-tts-ljspeech-vits-medium-2024-03-12
pushd $d
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
tar xf espeak-ng-data.tar.bz2
rm espeak-ng-data.tar.bz2
cd ..
tar cjf vits-icefall-en_US-ljspeech-medium.tar.bz2 vits-icefall-en_US-ljspeech-medium
rm -rf vits-icefall-en_US-ljspeech-medium
ls -lh *.tar.bz2
popd
}
function test_low() {
git clone https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-low-2024-03-12
./vits/export-onnx.py \
--model-type low \
--epoch 1600 \
--exp-dir ./icefall-tts-ljspeech-vits-low-2024-03-12/exp \
--tokens ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt
ls -lh ./icefall-tts-ljspeech-vits-low-2024-03-12/exp
./vits/test_onnx.py \
--model-filename ./icefall-tts-ljspeech-vits-low-2024-03-12/exp/vits-epoch-1600.onnx \
--tokens ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt \
--output-filename /icefall/test-low.wav
ls -lh /icefall/test-low.wav
d=/icefall/vits-icefall-en_US-ljspeech-low
mkdir $d
cp -v ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt $d/
cp -v ./icefall-tts-ljspeech-vits-low-2024-03-12/exp/vits-epoch-1600.onnx $d/model.onnx
rm -rf icefall-tts-ljspeech-vits-low-2024-03-12
pushd $d
wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
tar xf espeak-ng-data.tar.bz2
rm espeak-ng-data.tar.bz2
cd ..
tar cjf vits-icefall-en_US-ljspeech-low.tar.bz2 vits-icefall-en_US-ljspeech-low
rm -rf vits-icefall-en_US-ljspeech-low
ls -lh *.tar.bz2
popd
}
prepare_data
train
infer
export_onnx
rm -rf vits/exp-{low,medium,high}
test_medium
test_low


@@ -30,7 +30,7 @@ log "Test exporting to ONNX format"

 ./pruned_transducer_stateless2/export-onnx.py \
   --exp-dir $repo/exp \
-  --lang-dir $repo/data/lang_char \
+  --tokens $repo/data/lang_char/tokens.txt \
   --epoch 99 \
   --avg 1

@@ -38,14 +38,14 @@ log "Export to torchscript model"

 ./pruned_transducer_stateless2/export.py \
   --exp-dir $repo/exp \
-  --lang-dir $repo/data/lang_char \
+  --tokens $repo/data/lang_char/tokens.txt \
   --epoch 99 \
   --avg 1 \
   --jit 1

 ./pruned_transducer_stateless2/export.py \
   --exp-dir $repo/exp \
-  --lang-dir $repo/data/lang_char \
+  --tokens $repo/data/lang_char/tokens.txt \
   --epoch 99 \
   --avg 1 \
   --jit-trace 1

.github/workflows/audioset.yml vendored Normal file

@@ -0,0 +1,137 @@
name: audioset
on:
push:
branches:
- master
pull_request:
branches:
- master
workflow_dispatch:
concurrency:
group: audioset-${{ github.ref }}
cancel-in-progress: true
jobs:
generate_build_matrix:
if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
# see https://github.com/pytorch/pytorch/pull/50633
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Generating build matrix
id: set-matrix
run: |
# outputting for debugging purposes
python ./.github/scripts/docker/generate_build_matrix.py
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
echo "::set-output name=matrix::${MATRIX}"
audioset:
needs: generate_build_matrix
name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Free space
shell: bash
run: |
ls -lh
df -h
rm -rf /opt/hostedtoolcache
df -h
echo "pwd: $PWD"
echo "github.workspace ${{ github.workspace }}"
- name: Run tests
uses: addnab/docker-run-action@v3
with:
image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
options: |
--volume ${{ github.workspace }}/:/icefall
shell: bash
run: |
export PYTHONPATH=/icefall:$PYTHONPATH
cd /icefall
git config --global --add safe.directory /icefall
.github/scripts/audioset/AT/run.sh
- name: Show model files
shell: bash
run: |
sudo chown -R runner ./model-onnx
ls -lh ./model-onnx
chmod -x ./model-onnx/class_labels_indices.csv
echo "----------"
ls -lh ./model-onnx/*
- name: Upload model to huggingface
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
uses: nick-fields/retry@v3
with:
max_attempts: 20
timeout_seconds: 200
shell: bash
command: |
git config --global user.email "csukuangfj@gmail.com"
git config --global user.name "Fangjun Kuang"
rm -rf huggingface
export GIT_LFS_SKIP_SMUDGE=1
git clone https://huggingface.co/k2-fsa/sherpa-onnx-zipformer-audio-tagging-2024-04-09 huggingface
cd huggingface
git fetch
git pull
git merge -m "merge remote" --ff origin main
cp ../model-onnx/*.onnx ./
cp ../model-onnx/*.csv ./
cp -a ../model-onnx/test_wavs ./
ls -lh
git add .
git status
git commit -m "update models"
git status
git push https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-onnx-zipformer-audio-tagging-2024-04-09 main || true
rm -rf huggingface
- name: Prepare for release
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
shell: bash
run: |
d=sherpa-onnx-zipformer-audio-tagging-2024-04-09
mv ./model-onnx $d
tar cjvf ${d}.tar.bz2 $d
ls -lh
- name: Release exported onnx models
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
uses: svenstaro/upload-release-action@v2
with:
file_glob: true
overwrite: true
file: sherpa-onnx-*.tar.bz2
repo_name: k2-fsa/sherpa-onnx
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
tag: audio-tagging-models


@@ -56,11 +56,14 @@ jobs:
       - name: Build doc
         shell: bash
         run: |
+          .github/scripts/generate-piper-phonemize-page.py
           cd docs
           python3 -m pip install -r ./requirements.txt
           make html
           touch build/html/.nojekyll
+          cp -v ../piper_phonemize.html ./build/html/

       - name: Deploy
         uses: peaceiris/actions-gh-pages@v3
         with:


@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+        image: ["torch2.2.2-cuda12.1", "torch2.2.2-cuda11.8", "torch2.2.1-cuda12.1", "torch2.2.1-cuda11.8", "torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
     steps:
       # refer to https://github.com/actions/checkout

.github/workflows/ljspeech.yml vendored Normal file

@@ -0,0 +1,102 @@
name: ljspeech
on:
push:
branches:
- master
pull_request:
branches:
- master
workflow_dispatch:
concurrency:
group: ljspeech-${{ github.ref }}
cancel-in-progress: true
jobs:
generate_build_matrix:
if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
# see https://github.com/pytorch/pytorch/pull/50633
runs-on: ubuntu-latest
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Generating build matrix
id: set-matrix
run: |
# outputting for debugging purposes
python ./.github/scripts/docker/generate_build_matrix.py
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
echo "::set-output name=matrix::${MATRIX}"
ljspeech:
needs: generate_build_matrix
name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Free space
shell: bash
run: |
ls -lh
df -h
rm -rf /opt/hostedtoolcache
df -h
echo "pwd: $PWD"
echo "github.workspace ${{ github.workspace }}"
- name: Run tests
uses: addnab/docker-run-action@v3
with:
image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
options: |
--volume ${{ github.workspace }}/:/icefall
shell: bash
run: |
export PYTHONPATH=/icefall:$PYTHONPATH
cd /icefall
git config --global --add safe.directory /icefall
.github/scripts/ljspeech/TTS/run.sh
- name: display files
shell: bash
run: |
ls -lh
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
with:
name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
path: ./*.wav
- uses: actions/upload-artifact@v4
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
with:
name: generated-models-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}
path: ./*.wav
- name: Release exported onnx models
if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
uses: svenstaro/upload-release-action@v2
with:
file_glob: true
overwrite: true
file: vits-icefall-*.tar.bz2
repo_name: k2-fsa/sherpa-onnx
repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
tag: tts-models


@@ -14,13 +14,20 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+        image: ["torch2.2.2-cuda12.1", "torch2.2.2-cuda11.8", "torch2.2.1-cuda12.1", "torch2.2.1-cuda11.8", "torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
     steps:
       # refer to https://github.com/actions/checkout
       - uses: actions/checkout@v2
         with:
           fetch-depth: 0

+      - name: Free space
+        shell: bash
+        run: |
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+
       - name: Run the build process with Docker
         uses: addnab/docker-run-action@v3
         with:


@@ -49,7 +49,7 @@ jobs:
       - name: Install Python dependencies
         run: |
-          python3 -m pip install --upgrade pip black==22.3.0 flake8==5.0.4 click==8.1.0
+          python3 -m pip install --upgrade pip black==22.3.0 flake8==5.0.4 click==8.1.0 isort==5.10.1
           # Click issue fixed in https://github.com/psf/black/pull/2966

       - name: Run flake8
@@ -67,3 +67,9 @@ jobs:
         working-directory: ${{github.workspace}}
         run: |
           black --check --diff .
+
+      - name: Run isort
+        shell: bash
+        working-directory: ${{github.workspace}}
+        run: |
+          isort --check --diff .


@@ -59,4 +59,7 @@ jobs:
           cd /icefall
           git config --global --add safe.directory /icefall

+          python3 -m torch.utils.collect_env
+          python3 -m k2.version
+
           .github/scripts/yesno/ASR/run.sh


@@ -26,7 +26,7 @@ repos:
       # E121,E123,E126,E226,E24,E704,W503,W504

   - repo: https://github.com/pycqa/isort
-    rev: 5.11.5
+    rev: 5.12.0
     hooks:
       - id: isort
         args: ["--profile=black"]

README.md

@@ -2,46 +2,86 @@
   <img src="https://raw.githubusercontent.com/k2-fsa/icefall/master/docs/source/_static/logo.png" width=168>
 </div>

-## Introduction
+# Introduction

-icefall contains ASR recipes for various datasets
-using <https://github.com/k2-fsa/k2>.
+The icefall project contains speech-related recipes for various datasets
+using [k2-fsa](https://github.com/k2-fsa/k2) and [lhotse](https://github.com/lhotse-speech/lhotse).

-You can use <https://github.com/k2-fsa/sherpa> to deploy models
-trained with icefall.
+You can use [sherpa](https://github.com/k2-fsa/sherpa), [sherpa-ncnn](https://github.com/k2-fsa/sherpa-ncnn) or [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx) to deploy models
+trained in icefall. These frameworks also support models that are not from icefall; please refer to their respective documentation for more details.

 You can try pre-trained models from within your browser without the need
-to download or install anything by visiting <https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>
-See <https://k2-fsa.github.io/icefall/huggingface/spaces.html> for more details.
+to download or install anything by visiting this [huggingface space](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition).
+Please refer to [the documentation](https://k2-fsa.github.io/icefall/huggingface/spaces.html) for more details.

-## Installation
+# Installation

-Please refer to <https://icefall.readthedocs.io/en/latest/installation/index.html>
+Please refer to [the documentation](https://icefall.readthedocs.io/en/latest/installation/index.html)
 for installation.

-## Recipes
+# Recipes

-Please refer to <https://icefall.readthedocs.io/en/latest/recipes/index.html>
-for more information.
+Please refer to [the documentation](https://icefall.readthedocs.io/en/latest/recipes/index.html)
+for more details.

-We provide the following recipes:
+## ASR: Automatic Speech Recognition
+
+### Supported Datasets

 - [yesno][yesno]
-- [LibriSpeech][librispeech]
-- [GigaSpeech][gigaspeech]
-- [AMI][ami]
+- [Aidatatang_200zh][aidatatang_200zh]
 - [Aishell][aishell]
 - [Aishell2][aishell2]
 - [Aishell4][aishell4]
+- [Alimeeting][alimeeting]
+- [AMI][ami]
+- [CommonVoice][commonvoice]
+- [Corpus of Spontaneous Japanese][csj]
+- [GigaSpeech][gigaspeech]
+- [LibriCSS][libricss]
+- [LibriSpeech][librispeech]
+- [Libriheavy][libriheavy]
+- [Multi-Dialect Broadcast News Arabic Speech Recognition][mgb2]
+- [PeopleSpeech][peoplespeech]
+- [SPGISpeech][spgispeech]
+- [Switchboard][swbd]
 - [TIMIT][timit]
 - [TED-LIUM3][tedlium3]
-- [Aidatatang_200zh][aidatatang_200zh]
-- [WenetSpeech][wenetspeech]
-- [Alimeeting][alimeeting]
-- [Switchboard][swbd]
 - [TAL_CSASR][tal_csasr]
+- [Voxpopuli][voxpopuli]
+- [XBMU-AMDO31][xbmu-amdo31]
+- [WenetSpeech][wenetspeech]

-### yesno
+More datasets will be added in the future.
+
+### Supported Models
+
+The [LibriSpeech][librispeech] recipe supports the most comprehensive set of models; you are welcome to try them out.
+
+#### CTC
+
+- TDNN LSTM CTC
+- Conformer CTC
+- Zipformer CTC
+
+#### MMI
+
+- Conformer MMI
+- Zipformer MMI
+
+#### Transducer
+
+- Conformer-based Encoder
+- LSTM-based Encoder
+- Zipformer-based Encoder
+- LSTM-based Predictor
+- [Stateless Predictor](https://research.google/pubs/rnn-transducer-with-stateless-prediction-network/)
+
+#### Whisper
+
+- [OpenAI Whisper](https://arxiv.org/abs/2212.04356) (we support fine-tuning on AiShell-1)
+
+If you would like to contribute to icefall, please refer to [contributing](https://icefall.readthedocs.io/en/latest/contributing/index.html) for more details.
+
+We would like to highlight the performance of some of the recipes here.
+
+### [yesno][yesno]

 This is the simplest ASR recipe in `icefall` and can be run on CPU.
 Training takes less than 30 seconds and gives you the following WER:
@@ -52,350 +92,264 @@ Training takes less than 30 seconds and gives you the following WER:

 We provide a Colab notebook for this recipe: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing)

-### LibriSpeech
+### [LibriSpeech][librispeech]

-Please see <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>
+Please see [RESULTS.md](https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md)
 for the **latest** results.

-We provide 5 models for this recipe:
-
-- [conformer CTC model][LibriSpeech_conformer_ctc]
-- [TDNN LSTM CTC model][LibriSpeech_tdnn_lstm_ctc]
-- [Transducer: Conformer encoder + LSTM decoder][LibriSpeech_transducer]
-- [Transducer: Conformer encoder + Embedding decoder][LibriSpeech_transducer_stateless]
-- [Transducer: Zipformer encoder + Embedding decoder][LibriSpeech_zipformer]
-
-#### Conformer CTC Model
-
-The best WER we currently have is:
+#### [Conformer CTC](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conformer_ctc)

 |     | test-clean | test-other |
 |-----|------------|------------|
 | WER | 2.42       | 5.73       |

-We provide a Colab notebook to run a pre-trained conformer CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1huyupXAcHsUrKaWfI83iMEJ6J0Nh0213?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1huyupXAcHsUrKaWfI83iMEJ6J0Nh0213?usp=sharing)

-#### TDNN LSTM CTC Model
-
-The WER for this model is:
+#### [TDNN LSTM CTC](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/tdnn_lstm_ctc)

 |     | test-clean | test-other |
 |-----|------------|------------|
 | WER | 6.59       | 17.69      |

-We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-iSfQMp2So-We_Uu49N4AAcMInB72u9z?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-iSfQMp2So-We_Uu49N4AAcMInB72u9z?usp=sharing)

-#### Transducer: Conformer encoder + LSTM decoder
-
-Using Conformer as encoder and LSTM as decoder.
-
-The best WER with greedy search is:
+#### [Transducer (Conformer Encoder + LSTM Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/transducer)

 |               | test-clean | test-other |
-|-----|------------|------------|
-| WER | 3.07 | 7.51 |
+|---------------|------------|------------|
+| greedy_search | 3.07       | 7.51       |

-We provide a Colab notebook to run a pre-trained RNN-T conformer model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1_u6yK9jDkPwG_NLrZMN2XK7Aeq4suMO2?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1_u6yK9jDkPwG_NLrZMN2XK7Aeq4suMO2?usp=sharing)

-#### Transducer: Conformer encoder + Embedding decoder
-
-Using Conformer as encoder. The decoder consists of 1 embedding layer
-and 1 convolutional layer.
-
-The best WER using modified beam search with beam size 4 is:
+#### [Transducer (Conformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/transducer_stateless)

 |                                       | test-clean | test-other |
-|-----|------------|------------|
-| WER | 2.56 | 6.27 |
+|---------------------------------------|------------|------------|
+| modified_beam_search (`beam_size=4`)  | 2.56       | 6.27       |

-Note: No auxiliary losses are used in the training and no LMs are used
-in the decoding.
-
-We provide a Colab notebook to run a pre-trained transducer conformer + stateless decoder model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CO1bXJ-2khDckZIW8zjOPHGSKLHpTDlp?usp=sharing)
-
-#### k2 pruned RNN-T
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CO1bXJ-2khDckZIW8zjOPHGSKLHpTDlp?usp=sharing)

+#### [Transducer (Zipformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/zipformer)
+
+WER (modified_beam_search with `beam_size=4` unless otherwise stated)
+
+1. LibriSpeech-960hr
+
 | Encoder         | Params | test-clean | test-other | epochs | devices    |
 |-----------------|--------|------------|------------|---------|------------|
-| zipformer | 65.5M | 2.21 | 4.79 | 50 | 4 32G-V100 |
-| zipformer-small | 23.2M | 2.42 | 5.73 | 50 | 2 32G-V100 |
-| zipformer-large | 148.4M | 2.06 | 4.63 | 50 | 4 32G-V100 |
-| zipformer-large | 148.4M | 2.00 | 4.38 | 174 | 8 80G-A100 |
+| Zipformer | 65.5M | 2.21 | 4.79 | 50 | 4 32G-V100 |
+| Zipformer-small | 23.2M | 2.42 | 5.73 | 50 | 2 32G-V100 |
+| Zipformer-large | 148.4M | 2.06 | 4.63 | 50 | 4 32G-V100 |
+| Zipformer-large | 148.4M | 2.00 | 4.38 | 174 | 8 80G-A100 |

-Note: No auxiliary losses are used in the training and no LMs are used
-in the decoding.
-
-#### k2 pruned RNN-T + GigaSpeech
-
-|     | test-clean | test-other |
-|-----|------------|------------|
-| WER | 1.78 | 4.08 |
-
-Note: No auxiliary losses are used in the training and no LMs are used
-in the decoding.
-
-#### k2 pruned RNN-T + GigaSpeech + CommonVoice
-
-|     | test-clean | test-other |
-|-----|------------|------------|
-| WER | 1.90 | 3.98 |
-
-Note: No auxiliary losses are used in the training and no LMs are used
-in the decoding.
-
-### GigaSpeech
+2. LibriSpeech-960hr + GigaSpeech

-We provide three models for this recipe:
+| Encoder   | Params | test-clean | test-other |
+|-----------|--------|------------|------------|
+| Zipformer | 65.5M  | 1.78       | 4.08       |

-- [Conformer CTC model][GigaSpeech_conformer_ctc]
-- [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][GigaSpeech_pruned_transducer_stateless2]
-- [Transducer: Zipformer encoder + Embedding decoder][GigaSpeech_zipformer]
+3. LibriSpeech-960hr + GigaSpeech + CommonVoice

-#### Conformer CTC
+| Encoder   | Params | test-clean | test-other |
+|-----------|--------|------------|------------|
+| Zipformer | 65.5M  | 1.90       | 3.98       |
+
+### [GigaSpeech][gigaspeech]
+
+#### [Conformer CTC](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR/conformer_ctc)

 |     | Dev   | Test  |
 |-----|-------|-------|
 | WER | 10.47 | 10.58 |

-#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
+#### [Transducer (pruned_transducer_stateless2)](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR/pruned_transducer_stateless2)
+
+Conformer Encoder + Stateless Predictor + k2 Pruned RNN-T Loss

 |                      | Dev   | Test  |
 |----------------------|-------|-------|
-| greedy search | 10.51 | 10.73 |
-| fast beam search | 10.50 | 10.69 |
-| modified beam search | 10.40 | 10.51 |
+| greedy_search | 10.51 | 10.73 |
+| fast_beam_search | 10.50 | 10.69 |
+| modified_beam_search | 10.40 | 10.51 |

-#### Transducer: Zipformer encoder + Embedding decoder
+#### [Transducer (Zipformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR/zipformer)

 |                      | Dev   | Test  |
 |----------------------|-------|-------|
-| greedy search | 10.31 | 10.50 |
-| fast beam search | 10.26 | 10.48 |
-| modified beam search | 10.25 | 10.38 |
+| greedy_search | 10.31 | 10.50 |
+| fast_beam_search | 10.26 | 10.48 |
+| modified_beam_search | 10.25 | 10.38 |

-### Aishell
+### [Aishell][aishell]

-We provide three models for this recipe: [conformer CTC model][Aishell_conformer_ctc],
-[TDNN LSTM CTC model][Aishell_tdnn_lstm_ctc], and [Transducer Stateless Model][Aishell_pruned_transducer_stateless7].
-
-#### Conformer CTC Model
-
-The best CER we currently have is:
-
-|     | test |
-|-----|------|
-| CER | 4.26 |
-
-#### TDNN LSTM CTC Model
-
-The CER for this model is:
+#### [TDNN LSTM CTC](https://github.com/k2-fsa/icefall/tree/master/egs/aishell/ASR/tdnn_lstm_ctc)

 |     | test  |
 |-----|-------|
 | CER | 10.16 |

-We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1jbyzYq3ytm6j2nlEt-diQm-6QVWyDDEa?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1jbyzYq3ytm6j2nlEt-diQm-6QVWyDDEa?usp=sharing)

-#### Transducer Stateless Model
-
-The best CER we currently have is:
+#### [Transducer (Conformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/aishell/ASR/transducer_stateless)

 |     | test |
 |-----|------|
 | CER | 4.38 |

-We provide a Colab notebook to run a pre-trained TransducerStateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14XaT2MhnBkK-3_RqqWq3K90Xlbin-GZC?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14XaT2MhnBkK-3_RqqWq3K90Xlbin-GZC?usp=sharing)
+
+#### [Transducer (Zipformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/aishell/ASR/zipformer)
+
+WER (modified_beam_search with `beam_size=4`)
+
+| Encoder         | Params | dev  | test | epochs |
+|-----------------|--------|------|------|--------|
+| Zipformer | 73.4M | 4.13| 4.40 | 55 |
+| Zipformer-small | 30.2M | 4.40| 4.67 | 55 |
+| Zipformer-large | 157.3M | 4.03| 4.28 | 56 |

-### Aishell2
+### [Aishell4][aishell4]

-We provide one model for this recipe: [Transducer Stateless Model][Aishell2_pruned_transducer_stateless5].
-
-#### Transducer Stateless Model
-
-The best WER we currently have is:
-
-|     | dev-ios | test-ios |
-|-----|---------|----------|
-| WER | 5.32    | 5.56     |
-
-### Aishell4
-
-We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Aishell4_pruned_transducer_stateless5].
-
-#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with all subsets)
-
-The best CER we currently have is:
+#### [Transducer (pruned_transducer_stateless5)](https://github.com/k2-fsa/icefall/tree/master/egs/aishell4/ASR/pruned_transducer_stateless5)
+
+1. Trained with all subsets:

 |     | test  |
 |-----|-------|
 | CER | 29.08 |

-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)

-### TIMIT
+### [TIMIT][timit]

-We provide two models for this recipe: [TDNN LSTM CTC model][TIMIT_tdnn_lstm_ctc]
-and [TDNN LiGRU CTC model][TIMIT_tdnn_ligru_ctc].
-
-#### TDNN LSTM CTC Model
-
-The best PER we currently have is:
+#### [TDNN LSTM CTC](https://github.com/k2-fsa/icefall/tree/master/egs/timit/ASR/tdnn_lstm_ctc)

-||TEST|
-|--|--|
+| |TEST|
+|---|----|
 |PER| 19.71% |

-We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Hs9DA4V96uapw_30uNp32OMJgkuR5VVd?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Hs9DA4V96uapw_30uNp32OMJgkuR5VVd?usp=sharing)

-#### TDNN LiGRU CTC Model
-
-The PER for this model is:
+#### [TDNN LiGRU CTC](https://github.com/k2-fsa/icefall/tree/master/egs/timit/ASR/tdnn_ligru_ctc)

-||TEST|
-|--|--|
+| |TEST|
+|---|----|
 |PER| 17.66% |

-We provide a Colab notebook to run a pre-trained TDNN LiGRU CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)

-### TED-LIUM3
+### [TED-LIUM3][tedlium3]

-We provide two models for this recipe: [Transducer Stateless: Conformer encoder + Embedding decoder][TED-LIUM3_transducer_stateless] and [Pruned Transducer Stateless: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][TED-LIUM3_pruned_transducer_stateless].
-
-#### Transducer Stateless: Conformer encoder + Embedding decoder
-
-The best WER using modified beam search with beam size 4 is:
+#### [Transducer (Conformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/tedlium3/ASR/transducer_stateless)

 |                                      | dev   | test   |
-|-----|-------|--------|
-| WER | 6.91 | 6.33 |
+|--------------------------------------|-------|--------|
+| modified_beam_search (`beam_size=4`) | 6.91  | 6.33   |

-Note: No auxiliary losses are used in the training and no LMs are used in the decoding.
-
-We provide a Colab notebook to run a pre-trained Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1MmY5bBxwvKLNT4A2DJnwiqRXhdchUqPN?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1MmY5bBxwvKLNT4A2DJnwiqRXhdchUqPN?usp=sharing)

-#### Pruned Transducer Stateless: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
-
-The best WER using modified beam search with beam size 4 is:
+#### [Transducer (pruned_transducer_stateless)](https://github.com/k2-fsa/icefall/tree/master/egs/tedlium3/ASR/pruned_transducer_stateless)

 |                                      | dev   | test   |
-|-----|-------|--------|
-| WER | 6.77 | 6.14 |
+|--------------------------------------|-------|--------|
+| modified_beam_search (`beam_size=4`) | 6.77  | 6.14   |

-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1je_1zGrOkGVVd4WLzgkXRHxl-I27yWtz?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1je_1zGrOkGVVd4WLzgkXRHxl-I27yWtz?usp=sharing)

-### Aidatatang_200zh
+### [Aidatatang_200zh][aidatatang_200zh]

-We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Aidatatang_200zh_pruned_transducer_stateless2].
-
-#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
+#### [Transducer (pruned_transducer_stateless2)](https://github.com/k2-fsa/icefall/tree/master/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2)

 |                      | Dev  | Test |
 |----------------------|------|------|
-| greedy search | 5.53 | 6.59 |
-| fast beam search | 5.30 | 6.34 |
-| modified beam search | 5.27 | 6.33 |
+| greedy_search | 5.53 | 6.59 |
+| fast_beam_search | 5.30 | 6.34 |
+| modified_beam_search | 5.27 | 6.33 |

-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wNSnSj3T5oOctbh5IGCa393gKOoQw2GH?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wNSnSj3T5oOctbh5IGCa393gKOoQw2GH?usp=sharing)

-### WenetSpeech
+### [WenetSpeech][wenetspeech]

-We provide some models for this recipe: [Pruned stateless RNN-T_2: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless2] and [Pruned stateless RNN-T_5: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless5].
-
-#### Pruned stateless RNN-T_2: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with L subset, offline ASR)
+#### [Transducer (pruned_transducer_stateless2)](https://github.com/k2-fsa/icefall/tree/master/egs/wenetspeech/ASR/pruned_transducer_stateless2)

 |                      | Dev  | Test-Net | Test-Meeting |
 |----------------------|------|----------|--------------|
-| greedy search | 7.80 | 8.75 | 13.49 |
-| modified beam search| 7.76 | 8.71 | 13.41 |
-| fast beam search | 7.94 | 8.74 | 13.80 |
+| greedy_search | 7.80 | 8.75 | 13.49 |
+| fast_beam_search | 7.94 | 8.74 | 13.80 |
+| modified_beam_search | 7.76 | 8.71 | 13.41 |

+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1EV4e1CHa1GZgEF-bZgizqI9RyFFehIiN?usp=sharing)
+
-#### Pruned stateless RNN-T_5: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with L subset)
-
-**Streaming**:
+#### [Transducer **Streaming** (pruned_transducer_stateless5)](https://github.com/k2-fsa/icefall/tree/master/egs/wenetspeech/ASR/pruned_transducer_stateless5)

 |                      | Dev  | Test-Net | Test-Meeting |
 |----------------------|------|----------|--------------|
 | greedy_search | 8.78 | 10.12 | 16.16 |
-| modified_beam_search | 8.53| 9.95 | 15.81 |
 | fast_beam_search| 9.01 | 10.47 | 16.28 |
+| modified_beam_search | 8.53| 9.95 | 15.81 |

-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless2 model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1EV4e1CHa1GZgEF-bZgizqI9RyFFehIiN?usp=sharing)
-
-### Alimeeting
+### [Alimeeting][alimeeting]

-We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Alimeeting_pruned_transducer_stateless2].
-
-#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with far subset)
+#### [Transducer (pruned_transducer_stateless2)](https://github.com/k2-fsa/icefall/tree/master/egs/alimeeting/ASR/pruned_transducer_stateless2)

 |                      | Eval  | Test-Net |
 |----------------------|-------|----------|
-| greedy search | 31.77 | 34.66 |
-| fast beam search | 31.39 | 33.02 |
-| modified beam search | 30.38 | 34.25 |
+| greedy_search | 31.77 | 34.66 |
+| fast_beam_search | 31.39 | 33.02 |
+| modified_beam_search | 30.38 | 34.25 |

-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tKr3f0mL17uO_ljdHGKtR7HOmthYHwJG?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tKr3f0mL17uO_ljdHGKtR7HOmthYHwJG?usp=sharing)

-### TAL_CSASR
+### [TAL_CSASR][tal_csasr]

-We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][TAL_CSASR_pruned_transducer_stateless5].
-
-#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
+#### [Transducer (pruned_transducer_stateless5)](https://github.com/k2-fsa/icefall/tree/master/egs/tal_csasr/ASR/pruned_transducer_stateless5)

 The best results for Chinese CER(%) and English WER(%) respectively (zh: Chinese, en: English):

 |decoding-method | dev | dev_zh | dev_en | test | test_zh | test_en |
 |--|--|--|--|--|--|--|
 |greedy_search| 7.30 | 6.48 | 19.19 |7.39| 6.66 | 19.13|
-|modified_beam_search| 7.15 | 6.35 | 18.95 | 7.22| 6.50 | 18.70 |
 |fast_beam_search| 7.18 | 6.39| 18.90 | 7.27| 6.55 | 18.77|
+|modified_beam_search| 7.15 | 6.35 | 18.95 | 7.22| 6.50 | 18.70 |

-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1DmIx-NloI1CMU5GdZrlse7TRu4y3Dpf8?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1DmIx-NloI1CMU5GdZrlse7TRu4y3Dpf8?usp=sharing)

-## Deployment with C++
+## TTS: Text-to-Speech

-Once you have trained a model in icefall, you may want to deploy it with C++,
-without Python dependencies.
+### Supported Datasets

-Please refer to the documentation
-<https://icefall.readthedocs.io/en/latest/recipes/Non-streaming-ASR/librispeech/conformer_ctc.html#deployment-with-c>
+- [LJSpeech][ljspeech]
+- [VCTK][vctk]
+
+### Supported Models
+
+- [VITS](https://arxiv.org/abs/2106.06103)
+
+# Deployment with C++
+
+Once you have trained a model in icefall, you may want to deploy it with C++ without Python dependencies.
+
+Please refer to [the documentation](https://icefall.readthedocs.io/en/latest/recipes/Non-streaming-ASR/librispeech/conformer_ctc.html#deployment-with-c)
 for how to do this.

 We also provide a Colab notebook, showing you how to run a torch scripted model in [k2][k2] with C++.
 Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1BIGLWzS36isskMXHKcqC9ysN6pspYXs_?usp=sharing)

-[LibriSpeech_tdnn_lstm_ctc]: egs/librispeech/ASR/tdnn_lstm_ctc
-[LibriSpeech_conformer_ctc]: egs/librispeech/ASR/conformer_ctc
-[LibriSpeech_transducer]: egs/librispeech/ASR/transducer
-[LibriSpeech_transducer_stateless]: egs/librispeech/ASR/transducer_stateless
-[LibriSpeech_zipformer]: egs/librispeech/ASR/zipformer
-[Aishell_tdnn_lstm_ctc]: egs/aishell/ASR/tdnn_lstm_ctc
-[Aishell_conformer_ctc]: egs/aishell/ASR/conformer_ctc
-[Aishell_pruned_transducer_stateless7]: egs/aishell/ASR/pruned_transducer_stateless7_bbpe
-[Aishell2_pruned_transducer_stateless5]: egs/aishell2/ASR/pruned_transducer_stateless5
-[Aishell4_pruned_transducer_stateless5]: egs/aishell4/ASR/pruned_transducer_stateless5
-[TIMIT_tdnn_lstm_ctc]: egs/timit/ASR/tdnn_lstm_ctc
-[TIMIT_tdnn_ligru_ctc]: egs/timit/ASR/tdnn_ligru_ctc
-[TED-LIUM3_transducer_stateless]: egs/tedlium3/ASR/transducer_stateless
-[TED-LIUM3_pruned_transducer_stateless]: egs/tedlium3/ASR/pruned_transducer_stateless
-[GigaSpeech_conformer_ctc]: egs/gigaspeech/ASR/conformer_ctc
-[GigaSpeech_pruned_transducer_stateless2]: egs/gigaspeech/ASR/pruned_transducer_stateless2
-[GigaSpeech_zipformer]: egs/gigaspeech/ASR/zipformer
-[Aidatatang_200zh_pruned_transducer_stateless2]: egs/aidatatang_200zh/ASR/pruned_transducer_stateless2
-[WenetSpeech_pruned_transducer_stateless2]: egs/wenetspeech/ASR/pruned_transducer_stateless2
-[WenetSpeech_pruned_transducer_stateless5]: egs/wenetspeech/ASR/pruned_transducer_stateless5
-[Alimeeting_pruned_transducer_stateless2]: egs/alimeeting/ASR/pruned_transducer_stateless2
-[TAL_CSASR_pruned_transducer_stateless5]: egs/tal_csasr/ASR/pruned_transducer_stateless5
 [yesno]: egs/yesno/ASR
 [librispeech]: egs/librispeech/ASR
 [aishell]: egs/aishell/ASR
@@ -411,3 +365,15 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
 [ami]: egs/ami
 [swbd]: egs/swbd/ASR
 [k2]: https://github.com/k2-fsa/k2
+[commonvoice]: egs/commonvoice/ASR
+[csj]: egs/csj/ASR
+[libricss]: egs/libricss/SURT
+[libriheavy]: egs/libriheavy/ASR
+[mgb2]: egs/mgb2/ASR
+[peoplespeech]: egs/peoplespeech/ASR
+[spgispeech]: egs/spgispeech/ASR
+[voxpopuli]: egs/voxpopuli/ASR
+[xbmu-amdo31]: egs/xbmu-amdo31/ASR
+[vctk]: egs/vctk/TTS
+[ljspeech]: egs/ljspeech/TTS
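The "Stateless Predictor" highlighted in the README changes above is the transducer decoder that the removed text described as "1 embedding layer and 1 convolutional layer": instead of an LSTM carrying unbounded history, it embeds only the last few emitted symbols and mixes them with a single convolution. A minimal sketch of the idea, assuming illustrative names and sizes (this is not icefall's actual decoder code):

```python
import torch
import torch.nn as nn

class StatelessPredictor(nn.Module):
    """Embedding + one Conv1d over the last `context_size` output symbols."""

    def __init__(self, vocab_size: int, embed_dim: int, context_size: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # kernel_size == context_size collapses the short symbol history
        # into a single prediction vector per step.
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=context_size)

    def forward(self, y: torch.Tensor) -> torch.Tensor:
        # y: (batch, context_size) ids of the most recently emitted symbols
        emb = self.embedding(y).permute(0, 2, 1)  # (batch, embed_dim, context_size)
        return self.conv(emb).squeeze(-1)         # (batch, embed_dim)

pred = StatelessPredictor(vocab_size=500, embed_dim=512)
print(pred(torch.tensor([[3, 7]])).shape)  # torch.Size([1, 512])
```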


@@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.7
-ARG K2_VERSION="1.24.4.dev20230725+cuda11.3.torch1.12.1"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.3.torch1.12.1"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.3.torch1.12.1"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.3.torch1.12.1"
 ARG TORCHAUDIO_VERSION="0.12.1+cu113"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@@ -55,6 +55,8 @@ RUN pip install --no-cache-dir \
       onnx \
       onnxruntime \
       onnxmltools \
+      onnxoptimizer \
+      onnxsim \
       multi_quantization \
       typeguard \
       numpy \


@@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.9
-ARG K2_VERSION="1.24.4.dev20231021+cuda11.6.torch1.13.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.6.torch1.13.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.6.torch1.13.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.6.torch1.13.0"
 ARG TORCHAUDIO_VERSION="0.13.0+cu116"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@@ -55,6 +55,8 @@ RUN pip install --no-cache-dir \
       onnx \
       onnxruntime \
       onnxmltools \
+      onnxoptimizer \
+      onnxsim \
       multi_quantization \
       typeguard \
       numpy \


@@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.7
-ARG K2_VERSION="1.24.3.dev20230726+cuda10.2.torch1.9.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda10.2.torch1.9.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda10.2.torch1.9.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda10.2.torch1.9.0"
 ARG TORCHAUDIO_VERSION="0.9.0"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@@ -69,6 +69,8 @@ RUN pip uninstall -y tqdm && \
       onnx \
       onnxruntime \
       onnxmltools \
+      onnxoptimizer \
+      onnxsim \
       multi_quantization \
       typeguard \
       numpy \


@@ -1,12 +1,13 @@
 FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
+# python 3.10

 ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20231021+cuda11.7.torch2.0.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.7.torch2.0.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.7.torch2.0.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.7.torch2.0.0"
 ARG TORCHAUDIO_VERSION="2.0.0+cu117"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@@ -55,6 +56,8 @@ RUN pip install --no-cache-dir \
       onnx \
       onnxruntime \
       onnxmltools \
+      onnxoptimizer \
+      onnxsim \
       multi_quantization \
       typeguard \
       numpy \

View File

@ -1,12 +1,13 @@
FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel
# python 3.10
ENV LC_ALL C.UTF-8 ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
# python 3.10 # python 3.10
ARG K2_VERSION="1.24.4.dev20231021+cuda11.8.torch2.1.0" ARG K2_VERSION="1.24.4.dev20240223+cuda11.8.torch2.1.0"
ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.8.torch2.1.0" ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.8.torch2.1.0"
ARG TORCHAUDIO_VERSION="2.1.0+cu118" ARG TORCHAUDIO_VERSION="2.1.0+cu118"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>" LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@ -55,6 +56,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -1,12 +1,13 @@
FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel
# python 3.10
ENV LC_ALL C.UTF-8 ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
# python 3.10 # python 3.10
ARG K2_VERSION="1.24.4.dev20231021+cuda12.1.torch2.1.0" ARG K2_VERSION="1.24.4.dev20240223+cuda12.1.torch2.1.0"
ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda12.1.torch2.1.0" ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda12.1.torch2.1.0"
ARG TORCHAUDIO_VERSION="2.1.0+cu121" ARG TORCHAUDIO_VERSION="2.1.0+cu121"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>" LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@ -55,6 +56,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -0,0 +1,73 @@
FROM pytorch/pytorch:2.2.0-cuda11.8-cudnn8-devel
# python 3.10
ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive
# python 3.10
ARG K2_VERSION="1.24.4.dev20240223+cuda11.8.torch2.2.0"
ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.8.torch2.2.0"
ARG TORCHAUDIO_VERSION="2.2.0+cu118"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
LABEL k2_version=${K2_VERSION}
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
LABEL github_repo="https://github.com/k2-fsa/icefall"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
curl \
vim \
libssl-dev \
autoconf \
automake \
bzip2 \
ca-certificates \
ffmpeg \
g++ \
gfortran \
git \
libtool \
make \
patch \
sox \
subversion \
unzip \
valgrind \
wget \
zlib1g-dev \
&& rm -rf /var/lib/apt/lists/*
# Install dependencies
RUN pip install --no-cache-dir \
torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
git+https://github.com/lhotse-speech/lhotse \
kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
kaldi_native_io \
kaldialign \
kaldifst \
kaldilm \
sentencepiece>=0.1.96 \
tensorboard \
typeguard \
dill \
onnx \
onnxruntime \
onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \
typeguard \
numpy \
pytest \
graphviz
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
cd /workspace/icefall && \
pip install --no-cache-dir -r requirements.txt
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
WORKDIR /workspace/icefall
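(For context: a hedged example of building and entering an image from a Dockerfile like the one above; the tag name is an assumption for illustration.)

```bash
# Build the image from the directory containing the Dockerfile
docker build -t icefall:torch2.2.0-cuda11.8 .

# Start an interactive container with all GPUs visible
docker run --gpus all --rm -it icefall:torch2.2.0-cuda11.8 /bin/bash
```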

View File

@ -0,0 +1,73 @@
FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel
# python 3.10
ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive
# python 3.10
ARG K2_VERSION="1.24.4.dev20240223+cuda12.1.torch2.2.0"
ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda12.1.torch2.2.0"
ARG TORCHAUDIO_VERSION="2.2.0+cu121"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
LABEL k2_version=${K2_VERSION}
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
LABEL github_repo="https://github.com/k2-fsa/icefall"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
curl \
vim \
libssl-dev \
autoconf \
automake \
bzip2 \
ca-certificates \
ffmpeg \
g++ \
gfortran \
git \
libtool \
make \
patch \
sox \
subversion \
unzip \
valgrind \
wget \
zlib1g-dev \
&& rm -rf /var/lib/apt/lists/*
# Install dependencies
RUN pip install --no-cache-dir \
torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
git+https://github.com/lhotse-speech/lhotse \
kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
kaldi_native_io \
kaldialign \
kaldifst \
kaldilm \
sentencepiece>=0.1.96 \
tensorboard \
typeguard \
dill \
onnx \
onnxruntime \
onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \
typeguard \
numpy \
pytest \
graphviz
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
cd /workspace/icefall && \
pip install --no-cache-dir -r requirements.txt
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
WORKDIR /workspace/icefall

View File

@ -0,0 +1,73 @@
FROM pytorch/pytorch:2.2.1-cuda11.8-cudnn8-devel
# python 3.10
ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive
# python 3.10
ARG K2_VERSION="1.24.4.dev20240223+cuda11.8.torch2.2.1"
ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.8.torch2.2.1"
ARG TORCHAUDIO_VERSION="2.2.1+cu118"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
LABEL k2_version=${K2_VERSION}
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
LABEL github_repo="https://github.com/k2-fsa/icefall"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
curl \
vim \
libssl-dev \
autoconf \
automake \
bzip2 \
ca-certificates \
ffmpeg \
g++ \
gfortran \
git \
libtool \
make \
patch \
sox \
subversion \
unzip \
valgrind \
wget \
zlib1g-dev \
&& rm -rf /var/lib/apt/lists/*
# Install dependencies
RUN pip install --no-cache-dir \
torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
git+https://github.com/lhotse-speech/lhotse \
kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
kaldi_native_io \
kaldialign \
kaldifst \
kaldilm \
sentencepiece>=0.1.96 \
tensorboard \
typeguard \
dill \
onnx \
onnxruntime \
onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \
typeguard \
numpy \
pytest \
graphviz
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
cd /workspace/icefall && \
pip install --no-cache-dir -r requirements.txt
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
WORKDIR /workspace/icefall

View File

@ -0,0 +1,73 @@
FROM pytorch/pytorch:2.2.1-cuda12.1-cudnn8-devel
# python 3.10
ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive
# python 3.10
ARG K2_VERSION="1.24.4.dev20240223+cuda12.1.torch2.2.1"
ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda12.1.torch2.2.1"
ARG TORCHAUDIO_VERSION="2.2.1+cu121"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
LABEL k2_version=${K2_VERSION}
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
LABEL github_repo="https://github.com/k2-fsa/icefall"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
curl \
vim \
libssl-dev \
autoconf \
automake \
bzip2 \
ca-certificates \
ffmpeg \
g++ \
gfortran \
git \
libtool \
make \
patch \
sox \
subversion \
unzip \
valgrind \
wget \
zlib1g-dev \
&& rm -rf /var/lib/apt/lists/*
# Install dependencies
RUN pip install --no-cache-dir \
torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
git+https://github.com/lhotse-speech/lhotse \
kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
kaldi_native_io \
kaldialign \
kaldifst \
kaldilm \
sentencepiece>=0.1.96 \
tensorboard \
typeguard \
dill \
onnx \
onnxruntime \
onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \
typeguard \
numpy \
pytest \
graphviz
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
cd /workspace/icefall && \
pip install --no-cache-dir -r requirements.txt
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
WORKDIR /workspace/icefall

View File

@ -0,0 +1,73 @@
FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-devel
# python 3.10
ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive
# python 3.10
ARG K2_VERSION="1.24.4.dev20240328+cuda11.8.torch2.2.2"
ARG KALDIFEAT_VERSION="1.25.4.dev20240329+cuda11.8.torch2.2.2"
ARG TORCHAUDIO_VERSION="2.2.2+cu118"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
LABEL k2_version=${K2_VERSION}
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
LABEL github_repo="https://github.com/k2-fsa/icefall"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
curl \
vim \
libssl-dev \
autoconf \
automake \
bzip2 \
ca-certificates \
ffmpeg \
g++ \
gfortran \
git \
libtool \
make \
patch \
sox \
subversion \
unzip \
valgrind \
wget \
zlib1g-dev \
&& rm -rf /var/lib/apt/lists/*
# Install dependencies
RUN pip install --no-cache-dir \
torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
git+https://github.com/lhotse-speech/lhotse \
kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
kaldi_native_io \
kaldialign \
kaldifst \
kaldilm \
sentencepiece>=0.1.96 \
tensorboard \
typeguard \
dill \
onnx \
onnxruntime \
onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \
typeguard \
numpy \
pytest \
graphviz
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
cd /workspace/icefall && \
pip install --no-cache-dir -r requirements.txt
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
WORKDIR /workspace/icefall

View File

@ -0,0 +1,73 @@
FROM pytorch/pytorch:2.2.2-cuda12.1-cudnn8-devel
# python 3.10
ENV LC_ALL C.UTF-8
ARG DEBIAN_FRONTEND=noninteractive
# python 3.10
ARG K2_VERSION="1.24.4.dev20240328+cuda12.1.torch2.2.2"
ARG KALDIFEAT_VERSION="1.25.4.dev20240329+cuda12.1.torch2.2.2"
ARG TORCHAUDIO_VERSION="2.2.2+cu121"
LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
LABEL k2_version=${K2_VERSION}
LABEL kaldifeat_version=${KALDIFEAT_VERSION}
LABEL github_repo="https://github.com/k2-fsa/icefall"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
curl \
vim \
libssl-dev \
autoconf \
automake \
bzip2 \
ca-certificates \
ffmpeg \
g++ \
gfortran \
git \
libtool \
make \
patch \
sox \
subversion \
unzip \
valgrind \
wget \
zlib1g-dev \
&& rm -rf /var/lib/apt/lists/*
# Install dependencies
RUN pip install --no-cache-dir \
torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
git+https://github.com/lhotse-speech/lhotse \
kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
kaldi_native_io \
kaldialign \
kaldifst \
kaldilm \
sentencepiece>=0.1.96 \
tensorboard \
typeguard \
dill \
onnx \
onnxruntime \
onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \
typeguard \
numpy \
pytest \
graphviz
RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
cd /workspace/icefall && \
pip install --no-cache-dir -r requirements.txt
ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
WORKDIR /workspace/icefall

View File

@ -30,7 +30,7 @@ of langugae model integration.
First, let's have a look at some background information. As the predecessor of LODR, Density Ratio (DR) is first proposed `here <https://arxiv.org/abs/2002.11268>`_ First, let's have a look at some background information. As the predecessor of LODR, Density Ratio (DR) is first proposed `here <https://arxiv.org/abs/2002.11268>`_
to address the language information mismatch between the training to address the language information mismatch between the training
corpus (source domain) and the testing corpus (target domain). Assuming that the source domain and the test domain corpus (source domain) and the testing corpus (target domain). Assuming that the source domain and the test domain
are acoustically similar, DR derives the following formular for decoding with Bayes' theorem: are acoustically similar, DR derives the following formula for decoding with Bayes' theorem:
.. math:: .. math::
@ -41,7 +41,7 @@ are acoustically similar, DR derives the following formular for decoding with Ba
where :math:`\lambda_1` and :math:`\lambda_2` are the weights of LM scores for target domain and source domain respectively. where :math:`\lambda_1` and :math:`\lambda_2` are the weights of LM scores for target domain and source domain respectively.
Here, the source domain LM is trained on the training corpus. The only difference in the above formular compared to Here, the source domain LM is trained on the training corpus. The only difference in the above formula compared to
shallow fusion is the subtraction of the source domain LM. shallow fusion is the subtraction of the source domain LM.
Some works treat the predictor and the joiner of the neural transducer as its internal LM. However, the LM is Some works treat the predictor and the joiner of the neural transducer as its internal LM. However, the LM is
@ -58,7 +58,7 @@ during decoding for transducer model:
In LODR, an additional bi-gram LM estimated on the source domain (e.g training corpus) is required. Compared to DR, In LODR, an additional bi-gram LM estimated on the source domain (e.g training corpus) is required. Compared to DR,
the only difference lies in the choice of source domain LM. According to the original `paper <https://arxiv.org/abs/2203.16776>`_, the only difference lies in the choice of source domain LM. According to the original `paper <https://arxiv.org/abs/2203.16776>`_,
LODR achieves similar performance compared DR in both intra-domain and cross-domain settings. LODR achieves similar performance compared to DR in both intra-domain and cross-domain settings.
As a bi-gram is much faster to evaluate, LODR is usually much faster. As a bi-gram is much faster to evaluate, LODR is usually much faster.
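To make this concrete, the decoding objective discussed above can be summarized as below (a hedged restatement in this section's notation, not a verbatim copy of the formulas elided from this excerpt):

.. math::

   \text{score}(y) = \log p(y|x) + \lambda_1 \log p_{\text{Target LM}}(y) - \lambda_2 \log p_{\text{Source LM}}(y)

where the source-domain LM is a full LM in DR and the bi-gram LM in LODR.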
Now, we will show you how to use LODR in ``icefall``. Now, we will show you how to use LODR in ``icefall``.

View File

@ -139,7 +139,7 @@ A few parameters can be tuned to further boost the performance of shallow fusion
- ``--lm-scale`` - ``--lm-scale``
Controls the scale of the LM. If too small, the external language model may not be fully utilized; if too large, Controls the scale of the LM. If too small, the external language model may not be fully utilized; if too large,
the LM score may dominant during decoding, leading to bad WER. A typical value of this is around 0.3. the LM score might be dominant during decoding, leading to bad WER. A typical value of this is around 0.3.
- ``--beam-size`` - ``--beam-size``

View File

@ -34,6 +34,12 @@ which will give you something like below:
.. code-block:: bash .. code-block:: bash
"torch2.2.2-cuda12.1"
"torch2.2.2-cuda11.8"
"torch2.2.1-cuda12.1"
"torch2.2.1-cuda11.8"
"torch2.2.0-cuda12.1"
"torch2.2.0-cuda11.8"
"torch2.1.0-cuda12.1" "torch2.1.0-cuda12.1"
"torch2.1.0-cuda11.8" "torch2.1.0-cuda11.8"
"torch2.0.0-cuda11.7" "torch2.0.0-cuda11.7"

View File

@ -74,6 +74,10 @@ to install dependencies of `icefall`_:
pip install k2==1.24.4.dev20231220+cpu.torch2.0.0 -f https://k2-fsa.github.io/k2/cpu.html pip install k2==1.24.4.dev20231220+cpu.torch2.0.0 -f https://k2-fsa.github.io/k2/cpu.html
# For users from China
# 中国国内用户,如果访问不了 huggingface, 请使用
# pip install k2==1.24.4.dev20231220+cpu.torch2.0.0 -f https://k2-fsa.github.io/k2/cpu-cn.html
# Install the latest version of lhotse # Install the latest version of lhotse
pip install git+https://github.com/lhotse-speech/lhotse pip install git+https://github.com/lhotse-speech/lhotse

View File

@ -206,6 +206,9 @@ We will install `k2`_ from pre-compiled wheels by following
.. code-block:: bash .. code-block:: bash
(test-icefall) kuangfangjun:~$ pip install k2==1.24.3.dev20230725+cuda11.6.torch1.13.0 -f https://k2-fsa.github.io/k2/cuda.html (test-icefall) kuangfangjun:~$ pip install k2==1.24.3.dev20230725+cuda11.6.torch1.13.0 -f https://k2-fsa.github.io/k2/cuda.html
# For users from China
# 中国国内用户,如果访问不了 huggingface, 请使用
# pip install k2==1.24.3.dev20230725+cuda11.6.torch1.13.0 -f https://k2-fsa.github.io/k2/cuda-cn.html
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Looking in links: https://k2-fsa.github.io/k2/cuda.html Looking in links: https://k2-fsa.github.io/k2/cuda.html

View File

@ -0,0 +1,225 @@
Finetune from a pre-trained Zipformer model with adapters
=========================================================
This tutorial shows you how to fine-tune a pre-trained **Zipformer**
transducer model on a new dataset with adapters.
Adapters are compact and efficient modules that can be integrated into a pre-trained model
to improve the model's performance on a new domain. Adapters are injected
between different modules of the well-trained neural network. During training, only the parameters
in the adapters are updated. This achieves competitive performance
while requiring much less GPU memory than full fine-tuning. For more details about adapters,
please refer to the original `paper <https://arxiv.org/pdf/1902.00751.pdf#/>`_.
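Purely for intuition, here is a minimal, hedged sketch of a bottleneck adapter with a residual connection (the module name, activation, and initialization are assumptions for illustration, not ``icefall``'s exact implementation):

.. code-block:: python

   import torch
   import torch.nn as nn

   class Adapter(nn.Module):
       """A bottleneck adapter: down-project to a small dimension,
       apply a non-linearity, up-project, and add the input back
       through a residual connection."""

       def __init__(self, embed_dim: int, adapter_dim: int = 8):
           super().__init__()
           self.down = nn.Linear(embed_dim, adapter_dim)  # down-projection
           self.up = nn.Linear(adapter_dim, embed_dim)    # up-projection
           # Zero-init the up-projection so the adapter starts as identity.
           nn.init.zeros_(self.up.weight)
           nn.init.zeros_(self.up.bias)

       def forward(self, x: torch.Tensor) -> torch.Tensor:
           return x + self.up(torch.relu(self.down(x)))  # residual add

During fine-tuning, only the parameters of such modules would be marked trainable while the rest of the network stays frozen.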
.. HINT::
We assume you have read the page :ref:`install icefall` and have setup
the environment for ``icefall``.
.. HINT::
We recommend using one or more GPUs to run this recipe.
For illustration purposes, we fine-tune the Zipformer transducer model
pre-trained on `LibriSpeech`_ on the small subset of `GigaSpeech`_. You can use your
own data for fine-tuning if you create a manifest for your new dataset.
Data preparation
----------------
Please follow the instructions in the `GigaSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR>`_
to prepare the fine-tuning data used in this tutorial. Only the small subset of GigaSpeech is required.
Model preparation
-----------------
We are using the Zipformer model trained on full LibriSpeech (960 hours) as the initialization. The
checkpoint of the model can be downloaded via the following command:
.. code-block:: bash
$ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
$ cd icefall-asr-librispeech-zipformer-2023-05-15/exp
$ git lfs pull --include "pretrained.pt"
$ ln -s pretrained.pt epoch-99.pt
$ cd ../data/lang_bpe_500
$ git lfs pull --include bpe.model
$ cd ../../..
Before fine-tuning, let's test the model's WER on the new domain. The following command performs
decoding on the GigaSpeech test sets:
.. code-block:: bash
./zipformer/decode_gigaspeech.py \
--epoch 99 \
--avg 1 \
--exp-dir icefall-asr-librispeech-zipformer-2023-05-15/exp \
--use-averaged-model 0 \
--max-duration 1000 \
--decoding-method greedy_search
You should see the following numbers:
.. code-block::
For dev, WER of different settings are:
greedy_search 20.06 best for dev
For test, WER of different settings are:
greedy_search 19.27 best for test
Fine-tune with adapter
----------------------
We insert 4 adapters with residual connections in each ``Zipformer2EncoderLayer``.
The original model parameters remain untouched during training and only the parameters of
the adapters are updated. The following command starts a fine-tuning experiment with adapters:
.. code-block:: bash
$ do_finetune=1
$ use_adapters=1
$ adapter_dim=8
$ ./zipformer_adapter/train.py \
--world-size 2 \
--num-epochs 20 \
--start-epoch 1 \
--exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
--use-fp16 1 \
--base-lr 0.045 \
--use-adapters $use_adapters --adapter-dim $adapter_dim \
--bpe-model data/lang_bpe_500/bpe.model \
--do-finetune $do_finetune \
--master-port 13022 \
--finetune-ckpt icefall-asr-librispeech-zipformer-2023-05-15/exp/pretrained.pt \
--max-duration 1000
The following arguments are related to fine-tuning:
- ``--do-finetune``
If True, do fine-tuning by initializing the model from a pre-trained checkpoint.
**Note that if you want to resume your fine-tuning experiment from certain epochs, you
need to set this to False.**
- ``--use-adapters``
Whether adapters are used during fine-tuning.
- ``--adapter-dim``
The bottleneck dimension of the adapter module. Typically a small number.
Note that the training log shows the total number of trainable parameters:
.. code-block::
2024-02-22 21:22:03,808 INFO [train.py:1277] A total of 761344 trainable parameters (1.148% of the whole model)
The trainable parameters make up only about 1.15% of the entire model, so training is much faster
and requires less memory than full fine-tuning.
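As a sanity check, this percentage is consistent with the 65,549,011-parameter base model linked later in this tutorial, assuming the adapter parameters are counted in the whole-model total:

.. math::

   \frac{761344}{65549011 + 761344} \approx 1.148\%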
Decoding
--------
After training, let's test the WERs. To test the WERs on the GigaSpeech set,
you can execute the following command:
.. code-block:: bash
$ epoch=20
$ avg=10
$ use_adapters=1
$ adapter_dim=8
$ ./zipformer_adapter/decode.py \
--epoch $epoch \
--avg $avg \
--use-averaged-model 1 \
--exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
--max-duration 600 \
--use-adapters $use_adapters \
--adapter-dim $adapter_dim \
--decoding-method greedy_search
You should see the following numbers:
.. code-block::
For dev, WER of different settings are:
greedy_search 15.44 best for dev
For test, WER of different settings are:
greedy_search 15.42 best for test
The WER on the test set improves from 19.27 to 15.42, demonstrating the effectiveness of adapters.
The same model can be used to perform decoding on the LibriSpeech test sets. You can deactivate the adapters
to retain the performance of the original model:
.. code-block:: bash
$ epoch=20
$ avg=1
$ use_adapters=0
$ adapter_dim=8
$ ./zipformer_adapter/decode.py \
--epoch $epoch \
--avg $avg \
--use-averaged-model 1 \
--exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
--max-duration 600 \
--use-adapters $use_adapters \
--adapter-dim $adapter_dim \
--decoding-method greedy_search
.. code-block::
For dev, WER of different settings are:
greedy_search 2.23 best for test-clean
For test, WER of different settings are:
greedy_search 4.96 best for test-other
The numbers are the same as reported in `icefall <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md#normal-scaled-model-number-of-model-parameters-65549011-ie-6555-m>`_. So adapter-based
fine-tuning is also very flexible, as the same model can be used for decoding on both the original and the target domains.
Export the model
----------------
After training, the model can be exported to ``onnx`` format easily using the following command:
.. code-block:: bash
$ use_adapters=1
$ adapter_dim=8
$ ./zipformer_adapter/export-onnx.py \
--tokens icefall-asr-librispeech-zipformer-2023-05-15/data/lang_bpe_500/tokens.txt \
--use-averaged-model 1 \
--epoch 20 \
--avg 10 \
--exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
--use-adapters $use_adapters \
--adapter-dim $adapter_dim \
--num-encoder-layers "2,2,3,4,3,2" \
--downsampling-factor "1,2,4,8,4,2" \
--feedforward-dim "512,768,1024,1536,1024,768" \
--num-heads "4,4,4,8,4,4" \
--encoder-dim "192,256,384,512,384,256" \
--query-head-dim 32 \
--value-head-dim 12 \
--pos-head-dim 4 \
--pos-dim 48 \
--encoder-unmasked-dim "192,192,256,256,256,192" \
--cnn-module-kernel "31,31,15,15,15,31" \
--decoder-dim 512 \
--joiner-dim 512 \
--causal False \
--chunk-size "16,32,64,-1" \
--left-context-frames "64,128,256,-1"

View File

@ -0,0 +1,140 @@
Finetune from a supervised pre-trained Zipformer model
======================================================
This tutorial shows you how to fine-tune a supervised pre-trained **Zipformer**
transducer model on a new dataset.
.. HINT::
We assume you have read the page :ref:`install icefall` and have setup
the environment for ``icefall``.
.. HINT::
We recommend using one or more GPUs to run this recipe.
For illustration purposes, we fine-tune the Zipformer transducer model
pre-trained on `LibriSpeech`_ on the small subset of `GigaSpeech`_. You can use your
own data for fine-tuning if you create a manifest for your new dataset.
Data preparation
----------------
Please follow the instructions in the `GigaSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR>`_
to prepare the fine-tuning data used in this tutorial. Only the small subset of GigaSpeech is required.
Model preparation
-----------------
We are using the Zipformer model trained on full LibriSpeech (960 hours) as the initialization. The
checkpoint of the model can be downloaded via the following command:
.. code-block:: bash
$ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
$ cd icefall-asr-librispeech-zipformer-2023-05-15/exp
$ git lfs pull --include "pretrained.pt"
$ ln -s pretrained.pt epoch-99.pt
$ cd ../data/lang_bpe_500
$ git lfs pull --include bpe.model
$ cd ../../..
Before fine-tuning, let's test the model's WER on the new domain. The following command performs
decoding on the GigaSpeech test sets:
.. code-block:: bash
./zipformer/decode_gigaspeech.py \
--epoch 99 \
--avg 1 \
--exp-dir icefall-asr-librispeech-zipformer-2023-05-15/exp \
--use-averaged-model 0 \
--max-duration 1000 \
--decoding-method greedy_search
You should see the following numbers:
.. code-block::
For dev, WER of different settings are:
greedy_search 20.06 best for dev
For test, WER of different settings are:
greedy_search 19.27 best for test
Fine-tune
---------
Since LibriSpeech and GigaSpeech are both English datasets, we can initialize the whole
Zipformer model with the checkpoint downloaded in the previous step (otherwise we should consider
initializing the stateless decoder and joiner from scratch due to the mismatch of the output
vocabulary). The following command starts a fine-tuning experiment:
.. code-block:: bash
$ use_mux=0
$ do_finetune=1
$ ./zipformer/finetune.py \
--world-size 2 \
--num-epochs 20 \
--start-epoch 1 \
--exp-dir zipformer/exp_giga_finetune${do_finetune}_mux${use_mux} \
--use-fp16 1 \
--base-lr 0.0045 \
--bpe-model data/lang_bpe_500/bpe.model \
--do-finetune $do_finetune \
--use-mux $use_mux \
--master-port 13024 \
--finetune-ckpt icefall-asr-librispeech-zipformer-2023-05-15/exp/pretrained.pt \
--max-duration 1000
The following arguments are related to fine-tuning:
- ``--base-lr``
The learning rate used for fine-tuning. We suggest setting a **small** learning rate for fine-tuning;
otherwise the model may forget the initialization very quickly. A reasonable value is around
1/10 of the original learning rate, i.e., 0.0045.
- ``--do-finetune``
If True, do fine-tuning by initializing the model from a pre-trained checkpoint.
**Note that if you want to resume your fine-tuning experiment from certain epochs, you
need to set this to False.**
- ``--finetune-ckpt``
The path to the pre-trained checkpoint (used for initialization).
- ``--use-mux``
If True, mix the fine-tune data with the original training data by using `CutSet.mux <https://lhotse.readthedocs.io/en/latest/api.html#lhotse.supervision.SupervisionSet.mux>`_ (see the sketch below).
This helps maintain the model's performance on the original domain if the original training
data is available. **If you don't have the original training data, please set it to False.**
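For intuition, muxing two cut sets in ``lhotse`` looks roughly like the sketch below (the manifest paths and the 50/50 weights are assumptions for illustration; the recipe's actual mixing proportions may differ):

.. code-block:: python

   from lhotse import CutSet

   # Original training data and fine-tuning data (paths are hypothetical).
   librispeech_cuts = CutSet.from_file("data/fbank/librispeech_cuts_train.jsonl.gz")
   gigaspeech_cuts = CutSet.from_file("data/fbank/gigaspeech_cuts_S.jsonl.gz")

   # Interleave the two sources; ``weights`` controls the sampling proportions.
   mixed_cuts = CutSet.mux(librispeech_cuts, gigaspeech_cuts, weights=[0.5, 0.5])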
After fine-tuning, let's test the WERs. You can do this via the following command:
.. code-block:: bash
$ use_mux=0
$ do_finetune=1
$ ./zipformer/decode_gigaspeech.py \
--epoch 20 \
--avg 10 \
--exp-dir zipformer/exp_giga_finetune${do_finetune}_mux${use_mux} \
--use-averaged-model 1 \
--max-duration 1000 \
--decoding-method greedy_search
You should see numbers similar to the ones below:
.. code-block:: text
For dev, WER of different settings are:
greedy_search 13.47 best for dev
For test, WER of different settings are:
greedy_search 13.66 best for test
Compared to the original checkpoint, the fine-tuned model achieves much lower WERs
on the GigaSpeech test sets.

View File

@ -0,0 +1,16 @@
Fine-tune a pre-trained model
=============================
After pre-training on publicly available datasets, the ASR model is already capable of
performing general speech recognition with relatively high accuracy. However, the accuracy
may still be low on domains that differ substantially from the original training
set. In this case, we can fine-tune the model with a small amount of additional labelled
data to improve performance on the new domain.
.. toctree::
:maxdepth: 2
:caption: Table of Contents
from_supervised/finetune_zipformer
adapter/finetune_adapter

View File

@ -1,4 +1,4 @@
VITS VITS-LJSpeech
=============== ===============
This tutorial shows you how to train a VITS model This tutorial shows you how to train a VITS model
@ -13,6 +13,14 @@ with the `LJSpeech <https://keithito.com/LJ-Speech-Dataset/>`_ dataset.
The VITS paper: `Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech <https://arxiv.org/pdf/2106.06103.pdf>`_ The VITS paper: `Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech <https://arxiv.org/pdf/2106.06103.pdf>`_
Install extra dependencies
--------------------------
.. code-block:: bash
pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
pip install numba espnet_tts_frontend
Data preparation Data preparation
---------------- ----------------
@ -56,7 +64,8 @@ Training
--start-epoch 1 \ --start-epoch 1 \
--use-fp16 1 \ --use-fp16 1 \
--exp-dir vits/exp \ --exp-dir vits/exp \
--tokens data/tokens.txt --tokens data/tokens.txt \
--model-type high \
--max-duration 500 --max-duration 500
.. note:: .. note::
@ -64,6 +73,11 @@ Training
You can adjust the hyper-parameters to control the size of the VITS model and You can adjust the hyper-parameters to control the size of the VITS model and
the training configurations. For more details, please run ``./vits/train.py --help``. the training configurations. For more details, please run ``./vits/train.py --help``.
.. warning::
If you want a model that runs faster on CPU, please use ``--model-type low``
or ``--model-type medium``.
.. note:: .. note::
The training can take a long time (usually a couple of days). The training can take a long time (usually a couple of days).
@ -95,8 +109,8 @@ training part first. It will save the ground-truth and generated wavs to the dir
Export models Export models
------------- -------------
Currently we only support ONNX model exporting. It will generate two files in the given ``exp-dir``: Currently we only support ONNX model exporting. It will generate one file in the given ``exp-dir``:
``vits-epoch-*.onnx`` and ``vits-epoch-*.int8.onnx``. ``vits-epoch-*.onnx``.
.. code-block:: bash .. code-block:: bash
@ -120,4 +134,68 @@ Download pretrained models
If you don't want to train from scratch, you can download the pretrained models If you don't want to train from scratch, you can download the pretrained models
by visiting the following link: by visiting the following link:
- `<https://huggingface.co/Zengwei/icefall-tts-ljspeech-vits-2023-11-29>`_ - ``--model-type=high``: `<https://huggingface.co/Zengwei/icefall-tts-ljspeech-vits-2024-02-28>`_
- ``--model-type=medium``: `<https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-medium-2024-03-12>`_
- ``--model-type=low``: `<https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-low-2024-03-12>`_
Usage in sherpa-onnx
--------------------
The following describes how to test the exported ONNX model in `sherpa-onnx`_.
.. hint::
`sherpa-onnx`_ supports different programming languages, e.g., C++, C, Python,
Kotlin, Java, Swift, Go, C#, etc. It also supports Android and iOS.
We only describe how to use pre-built binaries from `sherpa-onnx`_ below.
Please refer to `<https://k2-fsa.github.io/sherpa/onnx/>`_
for more documentation.
Install sherpa-onnx
^^^^^^^^^^^^^^^^^^^
.. code-block:: bash
pip install sherpa-onnx
To check that you have installed `sherpa-onnx`_ successfully, please run:
.. code-block:: bash
which sherpa-onnx-offline-tts
sherpa-onnx-offline-tts --help
Download lexicon files
^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: bash
cd /tmp
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
tar xf espeak-ng-data.tar.bz2
Run sherpa-onnx
^^^^^^^^^^^^^^^
.. code-block:: bash
cd egs/ljspeech/TTS
sherpa-onnx-offline-tts \
--vits-model=vits/exp/vits-epoch-1000.onnx \
--vits-tokens=data/tokens.txt \
--vits-data-dir=/tmp/espeak-ng-data \
--num-threads=1 \
--output-filename=./high.wav \
"Ask not what your country can do for you; ask what you can do for your country."
.. hint::
You can also use ``sherpa-onnx-offline-tts-play`` to play the audio
as it is generating.
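For example (assuming ``sherpa-onnx-offline-tts-play`` accepts the same flags as ``sherpa-onnx-offline-tts``):

.. code-block:: bash

   sherpa-onnx-offline-tts-play \
     --vits-model=vits/exp/vits-epoch-1000.onnx \
     --vits-tokens=data/tokens.txt \
     --vits-data-dir=/tmp/espeak-ng-data \
     --num-threads=1 \
     --output-filename=./high.wav \
     "Ask not what your country can do for you; ask what you can do for your country."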
You should get a file ``high.wav`` after running the above command.
Congratulations! You have successfully trained and exported a text-to-speech
model and run it with `sherpa-onnx`_.

View File

@ -1,4 +1,4 @@
VITS VITS-VCTK
=============== ===============
This tutorial shows you how to train a VITS model This tutorial shows you how to train a VITS model

View File

@ -17,3 +17,4 @@ We may add recipes for other tasks as well in the future.
Streaming-ASR/index Streaming-ASR/index
RNN-LM/index RNN-LM/index
TTS/index TTS/index
Finetune/index

View File

@ -16,8 +16,8 @@ perturb_speed=true
# #
# - $dl_dir/aidatatang_200zh # - $dl_dir/aidatatang_200zh
# You can find "corpus" and "transcript" inside it. # You can find "corpus" and "transcript" inside it.
# You can download it at # You can download it at https://openslr.org/62/
# https://openslr.org/62/ # If you download the data by yourself, DON'T FORGET to extract the *.tar.gz files under corpus.
dl_dir=$PWD/download dl_dir=$PWD/download

View File

@ -288,8 +288,9 @@ class Aidatatang_200zhAsrDataModule:
max_duration=self.args.max_duration, max_duration=self.args.max_duration,
shuffle=self.args.shuffle, shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets, num_buckets=self.args.num_buckets,
buffer_size=self.args.num_buckets * 2000,
shuffle_buffer_size=self.args.num_buckets * 5000,
drop_last=True, drop_last=True,
buffer_size=50000,
) )
else: else:
logging.info("Using SimpleCutSampler.") logging.info("Using SimpleCutSampler.")

View File

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang) # Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
# #
# See ../../../../LICENSE for clarification regarding multiple authors # See ../../../../LICENSE for clarification regarding multiple authors
@ -20,7 +21,7 @@
Usage: Usage:
./pruned_transducer_stateless2/export.py \ ./pruned_transducer_stateless2/export.py \
--exp-dir ./pruned_transducer_stateless2/exp \ --exp-dir ./pruned_transducer_stateless2/exp \
--lang-dir data/lang_char \ --tokens data/lang_char/tokens.txt \
--epoch 29 \ --epoch 29 \
--avg 19 --avg 19
@ -45,12 +46,13 @@ import argparse
import logging import logging
from pathlib import Path from pathlib import Path
import k2
import torch import torch
from scaling_converter import convert_scaled_to_non_scaled
from train import get_params, get_transducer_model from train import get_params, get_transducer_model
from icefall.checkpoint import average_checkpoints, load_checkpoint from icefall.checkpoint import average_checkpoints, load_checkpoint
from icefall.lexicon import Lexicon from icefall.utils import num_tokens, str2bool
from icefall.utils import str2bool
def get_parser(): def get_parser():
@ -85,10 +87,10 @@ def get_parser():
) )
parser.add_argument( parser.add_argument(
"--lang-dir", "--tokens",
type=str, type=str,
default="data/lang_char", default="data/lang_char/tokens.txt",
help="The lang dir", help="Path to the tokens.txt.",
) )
parser.add_argument( parser.add_argument(
@ -122,10 +124,14 @@ def main():
logging.info(f"device: {device}") logging.info(f"device: {device}")
lexicon = Lexicon(params.lang_dir) # Load tokens.txt here
token_table = k2.SymbolTable.from_file(params.tokens)
params.blank_id = 0 # Load id of the <blk> token and the vocab size
params.vocab_size = max(lexicon.tokens) + 1 # <blk> is defined in local/train_bpe_model.py
params.blank_id = token_table["<blk>"]
params.unk_id = token_table["<unk>"]
params.vocab_size = num_tokens(token_table) + 1 # +1 for <blk>
logging.info(params) logging.info(params)
@ -152,6 +158,7 @@ def main():
model.eval() model.eval()
if params.jit: if params.jit:
convert_scaled_to_non_scaled(model, inplace=True)
# We won't use the forward() method of the model in C++, so just ignore # We won't use the forward() method of the model in C++, so just ignore
# it here. # it here.
# Otherwise, one of its arguments is a ragged tensor and is not # Otherwise, one of its arguments is a ragged tensor and is not
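(For context: the token-table setup above can be reproduced standalone roughly as below; the ``tokens.txt`` path is the script's default, and the ``+1`` mirrors the comment in the diff.)

```python
import k2
from icefall.utils import num_tokens

token_table = k2.SymbolTable.from_file("data/lang_char/tokens.txt")
blank_id = token_table["<blk>"]            # id of the blank token
vocab_size = num_tokens(token_table) + 1   # +1 for <blk>
```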

View File

@ -0,0 +1 @@
../../../librispeech/ASR/lstm_transducer_stateless2/lstmp.py

View File

@ -0,0 +1 @@
../../../librispeech/ASR/pruned_transducer_stateless3/scaling_converter.py

View File

@ -19,8 +19,17 @@ The following table lists the differences among them.
| `transducer_stateless_modified` | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` | | `transducer_stateless_modified` | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` |
| `transducer_stateless_modified-2` | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` + extra data | | `transducer_stateless_modified-2` | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` + extra data |
| `pruned_transducer_stateless3` | Conformer (reworked) | Embedding + Conv1d | pruned RNN-T + reworked model with random combiner + using aidatatang_20zh as extra data| | `pruned_transducer_stateless3` | Conformer (reworked) | Embedding + Conv1d | pruned RNN-T + reworked model with random combiner + using aidatatang_20zh as extra data|
| `pruned_transducer_stateless7` | Zipformer | Embedding | pruned RNN-T + zipformer encoder + stateless decoder with context-size 1 | | `pruned_transducer_stateless7` | Zipformer | Embedding | pruned RNN-T + zipformer encoder + stateless decoder with context-size set to 1 |
| `zipformer` | Upgraded Zipformer | Embedding + Conv1d | The latest recipe with context-size set to 1 |
The decoder in `transducer_stateless` is modified from the paper The decoder in `transducer_stateless` is modified from the paper
[Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/). [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
We place an additional Conv1d layer right after the input embedding layer. We place an additional Conv1d layer right after the input embedding layer.
# Whisper
Recipe to fine-tune large pre-trained models
| | Encoder | Decoder | Comment |
|------------------------------------|-----------|--------------------|-----------------------------------------------------------------------------------|
| `whisper` | Transformer | Transformer | supports fine-tuning using DeepSpeed |

View File

@ -1,10 +1,120 @@
## Results ## Results
### Aishell training results (Fine-tuning Pretrained Models)
#### Whisper
[./whisper](./whisper)
##### fine-tuning results on Aishell test set on whisper medium, large-v2, large-v3
| | test (before fine-tuning) | test (after fine-tuning) | comment |
|------------------------|------|------|-----------------------------------------|
| medium | 7.23 | 3.27 | --epoch 10 --avg 4, ddp |
| large-v2 | 6.56 | 2.47 | --epoch 10 --avg 6, deepspeed zero stage1 |
| large-v3 | 6.06 | 2.84 | --epoch 5 --avg 3, deepspeed zero stage1 |
Command for training is:
```bash
pip install -r whisper/requirements.txt
./prepare.sh --stage 30 --stop_stage 30
#fine-tuning with deepspeed zero stage 1
torchrun --nproc-per-node 8 ./whisper/train.py \
--max-duration 200 \
--exp-dir whisper/exp_large_v2 \
--model-name large-v2 \
--deepspeed \
--deepspeed_config ./whisper/ds_config_zero1.json
# fine-tuning with ddp
torchrun --nproc-per-node 8 ./whisper/train.py \
--max-duration 200 \
--exp-dir whisper/exp_medium \
--base-lr 1e-5 \
--model-name medium
```
Command for decoding using fine-tuned models:
```bash
git lfs install
git clone https://huggingface.co/yuekai/icefall_asr_aishell_whisper
ln -s icefall_asr_aishell_whisper/exp_large_v2/epoch-10-avg6.pt whisper/exp_large_v2/epoch-999.pt
python3 ./whisper/decode.py \
--exp-dir whisper/exp_large_v2 \
--model-name large-v2 \
--epoch 999 --avg 1 \
--beam-size 10 --max-duration 50
```
Command for decoding using pretrained models (before fine-tuning):
```bash
python3 ./whisper/decode.py \
--exp-dir whisper/exp_large_v2 \
--model-name large-v2 \
--epoch -1 --avg 1 \
--remove-whisper-encoder-input-length-restriction False \
--beam-size 10 --max-duration 50
```
Fine-tuned models, training logs, decoding logs, tensorboard and decoding results
are available at
<https://huggingface.co/yuekai/icefall_asr_aishell_whisper>
### Aishell training result (Stateless Transducer) ### Aishell training result (Stateless Transducer)
#### Zipformer (Byte-level BPE)
[./zipformer](./zipformer/)
It's reworked Zipformer with Pruned RNNT loss, trained with Byte-level BPE, `vocab_size` set to 500.
##### normal-scaled model, number of model parameters: 65549011, i.e., 65.55 M
| | test | dev | comment |
|------------------------|------|------|-----------------------------------------|
| greedy search | 4.54 | 4.31 | --epoch 40 --avg 10 |
| modified beam search | 4.37 | 4.11 | --epoch 40 --avg 10 |
| fast beam search | 4.43 | 4.17 | --epoch 40 --avg 10 |
```bash
./prepare.sh
export CUDA_VISIBLE_DEVICES="0,1"
./zipformer/train_bbpe.py \
--world-size 2 \
--num-epochs 40 \
--start-epoch 1 \
--use-fp16 1 \
--context-size 2 \
--enable-musan 0 \
--exp-dir zipformer/exp_bbpe \
--max-duration 1000 \
--base-lr 0.045 \
--lr-batches 7500 \
--lr-epochs 10 \
--spec-aug-time-warp-factor 20
```
Command for decoding is:
```bash
for m in greedy_search modified_beam_search fast_beam_search ; do
./zipformer/decode_bbpe.py \
--epoch 40 \
--avg 10 \
  --exp-dir ./zipformer/exp_bbpe \
--bpe-model data/lang_bbpe_500/bbpe.model \
--context-size 2 \
--decoding-method $m
done
```
Pretrained models, training logs, decoding logs, tensorboard and decoding results
are available at
<https://huggingface.co/zrjin/icefall-asr-aishell-zipformer-bbpe-2024-01-16>
#### Zipformer (Non-streaming) #### Zipformer (Non-streaming)
[./zipformer](./zipformer) [./zipformer](./zipformer/)
It's reworked Zipformer with Pruned RNNT loss. It's reworked Zipformer with Pruned RNNT loss.
**Caution**: It uses `--context-size=1`. **Caution**: It uses `--context-size=1`.
@ -260,7 +370,7 @@ done
Pretrained models, training logs, decoding logs, and decoding results Pretrained models, training logs, decoding logs, and decoding results
are available at are available at
<https://huggingface.co/marcoyang/icefall-asr-aishell-zipformer-pruned-transducer-stateless7-2023-03-21> <https://huggingface.co/marcoyang/icefall-asr-aishell-zipformer-pruned-transducer-stateless7-2023-03-21>
#### Pruned transducer stateless 7 (zipformer) #### Pruned transducer stateless 7 (Byte-level BPE)
See <https://github.com/k2-fsa/icefall/pull/986> See <https://github.com/k2-fsa/icefall/pull/986>
@ -703,7 +813,6 @@ python3 ./transducer_stateless/decode.py \
--max-sym-per-frame 3 --max-sym-per-frame 3
``` ```
### Aishell training results (Transducer-stateless)
#### 2022-02-18 #### 2022-02-18
(Pingfeng Luo) : The tensorboard log for training is available at <https://tensorboard.dev/experiment/k3QL6QMhRbCwCKYKM9po9w/> (Pingfeng Luo) : The tensorboard log for training is available at <https://tensorboard.dev/experiment/k3QL6QMhRbCwCKYKM9po9w/>
And pretrained model is available at <https://huggingface.co/pfluo/icefall-aishell-transducer-stateless-char-2021-12-29> And pretrained model is available at <https://huggingface.co/pfluo/icefall-aishell-transducer-stateless-char-2021-12-29>

View File

@ -1,4 +1,4 @@
Please visit Please visit
<https://icefall.readthedocs.io/en/latest/recipes/aishell/conformer_ctc.html> <https://k2-fsa.github.io/icefall/recipes/Non-streaming-ASR/aishell/conformer_ctc.html>
for how to run this recipe. for how to run this recipe.

View File

@ -419,7 +419,7 @@ def save_results(
for key, results in results_dict.items(): for key, results in results_dict.items():
recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt" recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
results = sorted(results) results = sorted(results)
store_transcripts(filename=recog_path, texts=results) store_transcripts(filename=recog_path, texts=results, char_level=True)
if enable_log: if enable_log:
logging.info(f"The transcripts are stored in {recog_path}") logging.info(f"The transcripts are stored in {recog_path}")
@ -432,7 +432,11 @@ def save_results(
results_char.append((res[0], list("".join(res[1])), list("".join(res[2])))) results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
with open(errs_filename, "w") as f: with open(errs_filename, "w") as f:
wer = write_error_stats( wer = write_error_stats(
f, f"{test_set_name}-{key}", results_char, enable_log=enable_log f,
f"{test_set_name}-{key}",
results_char,
enable_log=enable_log,
compute_CER=True,
) )
test_set_wers[key] = wer test_set_wers[key] = wer

View File

@ -431,7 +431,7 @@ def save_results(
for key, results in results_dict.items(): for key, results in results_dict.items():
recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt" recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
results = sorted(results) results = sorted(results)
store_transcripts(filename=recog_path, texts=results) store_transcripts(filename=recog_path, texts=results, char_level=True)
if enable_log: if enable_log:
logging.info(f"The transcripts are stored in {recog_path}") logging.info(f"The transcripts are stored in {recog_path}")
@ -444,7 +444,11 @@ def save_results(
results_char.append((res[0], list("".join(res[1])), list("".join(res[2])))) results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
with open(errs_filename, "w") as f: with open(errs_filename, "w") as f:
wer = write_error_stats( wer = write_error_stats(
f, f"{test_set_name}-{key}", results_char, enable_log=enable_log f,
f"{test_set_name}-{key}",
results_char,
enable_log=enable_log,
compute_CER=True,
) )
test_set_wers[key] = wer test_set_wers[key] = wer

View File

@ -29,7 +29,14 @@ import os
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse import (
CutSet,
Fbank,
FbankConfig,
LilcomChunkyWriter,
WhisperFbank,
WhisperFbankConfig,
)
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor, str2bool from icefall.utils import get_executor, str2bool
@ -42,9 +49,14 @@ torch.set_num_threads(1)
torch.set_num_interop_threads(1) torch.set_num_interop_threads(1)
def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False): def compute_fbank_aishell(
num_mel_bins: int = 80,
perturb_speed: bool = False,
whisper_fbank: bool = False,
output_dir: str = "data/fbank",
):
src_dir = Path("data/manifests") src_dir = Path("data/manifests")
output_dir = Path("data/fbank") output_dir = Path(output_dir)
num_jobs = min(15, os.cpu_count()) num_jobs = min(15, os.cpu_count())
dataset_parts = ( dataset_parts = (
@ -68,7 +80,11 @@ def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False):
list(manifests.keys()), list(manifests.keys()),
dataset_parts, dataset_parts,
) )
if whisper_fbank:
extractor = WhisperFbank(
WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
)
else:
extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins)) extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
with get_executor() as ex: # Initialize the executor only once. with get_executor() as ex: # Initialize the executor only once.
@ -82,7 +98,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False):
supervisions=m["supervisions"], supervisions=m["supervisions"],
) )
if "train" in partition and perturb_speed: if "train" in partition and perturb_speed:
logging.info(f"Doing speed perturb") logging.info("Doing speed perturb")
cut_set = ( cut_set = (
cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1) cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
) )
@ -111,6 +127,18 @@ def get_args():
default=False, default=False,
help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.", help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
) )
parser.add_argument(
"--whisper-fbank",
type=str2bool,
default=False,
help="Use WhisperFbank instead of Fbank. Default: False.",
)
parser.add_argument(
"--output-dir",
type=str,
default="data/fbank",
help="Output directory. Default: data/fbank.",
)
return parser.parse_args() return parser.parse_args()
@ -121,5 +149,8 @@ if __name__ == "__main__":
args = get_args() args = get_args()
compute_fbank_aishell( compute_fbank_aishell(
num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed num_mel_bins=args.num_mel_bins,
perturb_speed=args.perturb_speed,
whisper_fbank=args.whisper_fbank,
output_dir=args.output_dir,
) )

View File

@ -360,7 +360,7 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
fi fi
if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
log "Stage 11: Train RNN LM model" log "Stage 12: Train RNN LM model"
python ../../../icefall/rnn_lm/train.py \ python ../../../icefall/rnn_lm/train.py \
--start-epoch 0 \ --start-epoch 0 \
--world-size 1 \ --world-size 1 \
@ -376,3 +376,16 @@ if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
--vocab-size 4336 \ --vocab-size 4336 \
--master-port 12345 --master-port 12345
fi fi
# whisper large-v3 using 128 mel bins, others using 80 mel bins
whisper_mel_bins=80
output_dir=data/fbank_whisper
if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
log "Stage 30: Compute ${whisper_mel_bins} dim fbank for whisper model fine-tuning"
if [ ! -f $output_dir/.aishell.whisper.done ]; then
mkdir -p $output_dir
./local/compute_fbank_aishell.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true --output-dir $output_dir
./local/compute_fbank_musan.py --num-mel-bins ${whisper_mel_bins} --whisper-fbank true --output-dir $output_dir
touch $output_dir/.aishell.whisper.done
fi
fi

View File

@ -390,7 +390,7 @@ def save_results(
for key, results in results_dict.items(): for key, results in results_dict.items():
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt" recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
results = sorted(results) results = sorted(results)
store_transcripts(filename=recog_path, texts=results) store_transcripts(filename=recog_path, texts=results, char_level=True)
logging.info(f"The transcripts are stored in {recog_path}") logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out WERs, per-word error statistics and aligned # The following prints out WERs, per-word error statistics and aligned
@ -402,7 +402,11 @@ def save_results(
results_char.append((res[0], list("".join(res[1])), list("".join(res[2])))) results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
with open(errs_filename, "w") as f: with open(errs_filename, "w") as f:
wer = write_error_stats( wer = write_error_stats(
f, f"{test_set_name}-{key}", results_char, enable_log=True f,
f"{test_set_name}-{key}",
results_char,
enable_log=True,
compute_CER=True,
) )
test_set_wers[key] = wer test_set_wers[key] = wer

View File

@ -47,12 +47,12 @@ import argparse
import logging import logging
from pathlib import Path from pathlib import Path
import k2
import torch import torch
from train import add_model_arguments, get_params, get_transducer_model from train import add_model_arguments, get_params, get_transducer_model
from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint
from icefall.lexicon import Lexicon from icefall.utils import num_tokens, str2bool
from icefall.utils import str2bool
def get_parser(): def get_parser():
@ -106,10 +106,10 @@ def get_parser():
) )
parser.add_argument( parser.add_argument(
"--lang-dir", "--tokens",
type=Path, type=str,
default=Path("data/lang_char"), default="data/lang_char/tokens.txt",
help="The lang dir", help="Path to the tokens.txt",
) )
parser.add_argument( parser.add_argument(
@ -136,10 +136,9 @@ def main():
logging.info(f"device: {device}") logging.info(f"device: {device}")
lexicon = Lexicon(params.lang_dir) token_table = k2.SymbolTable.from_file(params.tokens)
params.blank_id = token_table["<blk>"]
params.blank_id = 0 params.vocab_size = num_tokens(token_table) + 1
params.vocab_size = max(lexicon.tokens) + 1
logging.info(params) logging.info(params)

View File

@ -526,7 +526,7 @@ def save_results(
for key, results in results_dict.items(): for key, results in results_dict.items():
recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt" recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
results = sorted(results) results = sorted(results)
store_transcripts(filename=recog_path, texts=results) store_transcripts(filename=recog_path, texts=results, char_level=True)
logging.info(f"The transcripts are stored in {recog_path}") logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out WERs, per-word error statistics and aligned # The following prints out WERs, per-word error statistics and aligned
@ -538,7 +538,11 @@ def save_results(
results_char.append((res[0], list("".join(res[1])), list("".join(res[2])))) results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
with open(errs_filename, "w") as f: with open(errs_filename, "w") as f:
wer = write_error_stats( wer = write_error_stats(
f, f"{test_set_name}-{key}", results_char, enable_log=True f,
f"{test_set_name}-{key}",
results_char,
enable_log=True,
compute_CER=True,
) )
test_set_wers[key] = wer test_set_wers[key] = wer

View File

@ -47,6 +47,7 @@ import argparse
import logging import logging
from pathlib import Path from pathlib import Path
import k2
import torch import torch
from scaling_converter import convert_scaled_to_non_scaled from scaling_converter import convert_scaled_to_non_scaled
from train import add_model_arguments, get_params, get_transducer_model from train import add_model_arguments, get_params, get_transducer_model
@ -57,8 +58,7 @@ from icefall.checkpoint import (
find_checkpoints, find_checkpoints,
load_checkpoint, load_checkpoint,
) )
from icefall.lexicon import Lexicon from icefall.utils import num_tokens, str2bool
from icefall.utils import str2bool
def get_parser(): def get_parser():
@ -123,10 +123,10 @@ def get_parser():
) )
parser.add_argument( parser.add_argument(
"--lang-dir", "--tokens",
type=Path, type=str,
default=Path("data/lang_char"), default="data/lang_char/tokens.txt",
help="The lang dir", help="Path to the tokens.txt",
) )
parser.add_argument( parser.add_argument(
@ -153,10 +153,9 @@ def main():
logging.info(f"device: {device}") logging.info(f"device: {device}")
lexicon = Lexicon(params.lang_dir) token_table = k2.SymbolTable.from_file(params.tokens)
params.blank_id = token_table["<blk>"]
params.blank_id = 0 params.vocab_size = num_tokens(token_table) + 1
params.vocab_size = max(lexicon.tokens) + 1
params.datatang_prob = 0 params.datatang_prob = 0
logging.info(params) logging.info(params)

View File

@@ -444,7 +444,7 @@ def save_results(
         for res in results:
             results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))

-        store_transcripts(filename=recog_path, texts=results_char)
+        store_transcripts(filename=recog_path, texts=results_char, char_level=True)
         logging.info(f"The transcripts are stored in {recog_path}")

         # The following prints out WERs, per-word error statistics and aligned
@@ -452,7 +452,11 @@ def save_results(
         errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
         with open(errs_filename, "w") as f:
             wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
             )

         test_set_wers[key] = wer

View File

@@ -89,6 +89,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.lexicon import Lexicon
 from icefall.utils import (
     AttributeDict,
@@ -881,9 +882,7 @@ def train_one_epoch(
                 if cur_grad_scale < 0.01:
                     logging.warning(f"Grad scale is small: {cur_grad_scale}")
                 if cur_grad_scale < 1.0e-05:
-                    raise RuntimeError(
-                        f"grad_scale is too small, exiting: {cur_grad_scale}"
-                    )
+                    raise_grad_scale_is_too_small_error()

         if batch_idx % params.log_interval == 0:
             cur_lr = scheduler.get_last_lr()[0]
             cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
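The new helper centralizes the error that every recipe used to raise inline. A plausible minimal sketch of icefall/err.py (the real wording may differ; a default argument is assumed here because this call site passes no value, while later ones pass cur_grad_scale):

def raise_grad_scale_is_too_small_error(cur_grad_scale: float = 0.0) -> None:
    # Consolidates the RuntimeError previously duplicated across recipes.
    raise RuntimeError(f"grad_scale is too small, exiting: {cur_grad_scale}")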

View File

@@ -49,14 +49,14 @@ import logging
 from pathlib import Path
 from typing import Dict, Tuple

+import k2
 import onnx
-import sentencepiece as spm
 import torch
 import torch.nn as nn
 from decoder2 import Decoder
+from do_not_use_it_directly import add_model_arguments, get_params, get_transducer_model
 from onnxruntime.quantization import QuantType, quantize_dynamic
 from scaling_converter import convert_scaled_to_non_scaled
-from do_not_use_it_directly import add_model_arguments, get_params, get_transducer_model
 from zipformer import Zipformer

 from icefall.checkpoint import (
@@ -65,8 +65,7 @@ from icefall.checkpoint import (
     find_checkpoints,
     load_checkpoint,
 )
-from icefall.lexicon import Lexicon
-from icefall.utils import setup_logger, str2bool
+from icefall.utils import num_tokens, setup_logger, str2bool


 def get_parser():
@@ -123,12 +122,10 @@ def get_parser():
     )

     parser.add_argument(
-        "--lang-dir",
+        "--tokens",
         type=str,
-        help="""The lang dir
-        It contains language related input files such as
-        "lexicon.txt"
-        """,
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
     )

     parser.add_argument(
@@ -404,9 +401,9 @@ def main():
     logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1

     logging.info(params)
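For context, the model.int8.onnx such exporters emit typically comes from onnxruntime dynamic quantization; a hedged sketch with assumed filenames:

from onnxruntime.quantization import QuantType, quantize_dynamic

quantize_dynamic(
    model_input="model-epoch-99-avg-1.onnx",
    model_output="model-epoch-99-avg-1.int8.onnx",
    weight_type=QuantType.QInt8,  # weights stored as int8; activations stay float
)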

View File

@@ -85,6 +85,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import (
@@ -878,9 +879,7 @@ def train_one_epoch(
                 if cur_grad_scale < 0.01:
                     logging.warning(f"Grad scale is small: {cur_grad_scale}")
                 if cur_grad_scale < 1.0e-05:
-                    raise RuntimeError(
-                        f"grad_scale is too small, exiting: {cur_grad_scale}"
-                    )
+                    raise_grad_scale_is_too_small_error(cur_grad_scale)

         if batch_idx % params.log_interval == 0:
             cur_lr = scheduler.get_last_lr()[0]
             cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0

View File

@@ -581,7 +581,7 @@ def save_results(
     for key, results in results_dict.items():
         recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
         results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
         logging.info(f"The transcripts are stored in {recog_path}")

         # The following prints out WERs, per-word error statistics and aligned
@@ -594,7 +594,11 @@ def save_results(
         with open(errs_filename, "w") as f:
             wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
             )

         test_set_wers[key] = wer

View File

@@ -78,6 +78,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
     AttributeDict,
@@ -871,9 +872,7 @@ def train_one_epoch(
                 if cur_grad_scale < 0.01:
                     logging.warning(f"Grad scale is small: {cur_grad_scale}")
                 if cur_grad_scale < 1.0e-05:
-                    raise RuntimeError(
-                        f"grad_scale is too small, exiting: {cur_grad_scale}"
-                    )
+                    raise_grad_scale_is_too_small_error(cur_grad_scale)

         if batch_idx % params.log_interval == 0:
             cur_lr = scheduler.get_last_lr()[0]

View File

@@ -250,7 +250,7 @@ def get_parser():
     parser.add_argument(
         "--context-size",
         type=int,
-        default=1,
+        default=2,
         help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
     )

     parser.add_argument(
@@ -492,7 +492,7 @@ def save_results(
     for key, results in results_dict.items():
         recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
         results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
         logging.info(f"The transcripts are stored in {recog_path}")

         # The following prints out WERs, per-word error statistics and aligned
@@ -500,7 +500,11 @@ def save_results(
         errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
         with open(errs_filename, "w") as f:
             wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results,
+                enable_log=True,
+                compute_CER=True,
             )

         test_set_wers[key] = wer
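What --context-size controls: the stateless transducer predictor conditions only on the previous context_size tokens, so 1 behaves like a bigram LM and 2 like a trigram. A toy sketch of such a predictor (dimensions and names assumed; not the recipe's exact Decoder class):

import torch
import torch.nn as nn

class TinyStatelessDecoder(nn.Module):
    """Predictor whose only "history" is the last `context_size` tokens."""

    def __init__(self, vocab_size: int, embed_dim: int, context_size: int):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=context_size)

    def forward(self, y: torch.Tensor) -> torch.Tensor:
        # y: (N, context_size) previous token ids
        emb = self.embedding(y).permute(0, 2, 1)  # (N, embed_dim, context_size)
        return self.conv(emb).permute(0, 2, 1)    # (N, 1, embed_dim)

out = TinyStatelessDecoder(vocab_size=500, embed_dim=512, context_size=2)(
    torch.tensor([[0, 42]])
)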

View File

@@ -78,6 +78,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@@ -882,9 +883,7 @@ def train_one_epoch(
                 if cur_grad_scale < 0.01:
                     logging.warning(f"Grad scale is small: {cur_grad_scale}")
                 if cur_grad_scale < 1.0e-05:
-                    raise RuntimeError(
-                        f"grad_scale is too small, exiting: {cur_grad_scale}"
-                    )
+                    raise_grad_scale_is_too_small_error(cur_grad_scale)

         if batch_idx % params.log_interval == 0:
             cur_lr = scheduler.get_last_lr()[0]

View File

@@ -78,6 +78,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@@ -881,9 +882,7 @@ def train_one_epoch(
                 if cur_grad_scale < 0.01:
                     logging.warning(f"Grad scale is small: {cur_grad_scale}")
                 if cur_grad_scale < 1.0e-05:
-                    raise RuntimeError(
-                        f"grad_scale is too small, exiting: {cur_grad_scale}"
-                    )
+                    raise_grad_scale_is_too_small_error(cur_grad_scale)

         if batch_idx % params.log_interval == 0:
             cur_lr = scheduler.get_last_lr()[0]

View File

@@ -275,6 +275,8 @@ class AishellAsrDataModule:
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                buffer_size=self.args.num_buckets * 2000,
+                shuffle_buffer_size=self.args.num_buckets * 5000,
                 drop_last=self.args.drop_last,
             )
         else:
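The two new arguments belong to lhotse's DynamicBucketingSampler: buffer_size is roughly how many cuts are held in memory to estimate duration buckets, and shuffle_buffer_size how many are pooled for streaming shuffling, so larger values randomize better at the cost of RAM. A hedged standalone sketch (manifest path assumed):

from lhotse import CutSet
from lhotse.dataset import DynamicBucketingSampler

cuts = CutSet.from_file("data/fbank/aishell_cuts_train.jsonl.gz")
num_buckets = 30
sampler = DynamicBucketingSampler(
    cuts,
    max_duration=200.0,
    shuffle=True,
    num_buckets=num_buckets,
    buffer_size=num_buckets * 2000,
    shuffle_buffer_size=num_buckets * 5000,
    drop_last=True,
)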

View File

@@ -278,7 +278,7 @@ def save_results(
     for key, results in results_dict.items():
         recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
         results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
         logging.info(f"The transcripts are stored in {recog_path}")

         # The following prints out WERs, per-word error statistics and aligned
@@ -289,7 +289,13 @@ def save_results(
         for res in results:
             results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
         with open(errs_filename, "w") as f:
-            wer = write_error_stats(f, f"{test_set_name}-{key}", results_char)
+            wer = write_error_stats(
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
+            )

         test_set_wers[key] = wer
         logging.info("Wrote detailed error stats to {}".format(errs_filename))

View File

@@ -327,7 +327,7 @@ def save_results(
     for key, results in results_dict.items():
         recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
         results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)

         # The following prints out WERs, per-word error statistics and aligned
         # ref/hyp pairs.
@@ -338,7 +338,11 @@ def save_results(
             results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
         with open(errs_filename, "w") as f:
             wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
             )

         test_set_wers[key] = wer

View File

@@ -23,7 +23,7 @@
 Usage:
 ./transducer_stateless/export.py \
         --exp-dir ./transducer_stateless/exp \
-        --lang-dir data/lang_char \
+        --tokens data/lang_char/tokens.txt \
         --epoch 20 \
         --avg 10

@@ -47,6 +47,7 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
 import torch.nn as nn
 from conformer import Conformer
@@ -56,8 +57,7 @@ from model import Transducer

 from icefall.checkpoint import average_checkpoints, load_checkpoint
 from icefall.env import get_env_info
-from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, str2bool
+from icefall.utils import AttributeDict, num_tokens, str2bool


 def get_parser():
@@ -92,10 +92,10 @@ def get_parser():
     )

     parser.add_argument(
-        "--lang-dir",
+        "--tokens",
         type=str,
-        default="data/lang_char",
-        help="The lang dir",
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
     )

     parser.add_argument(
@@ -192,10 +192,9 @@ def main():
     logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1

     logging.info(params)

View File

@@ -226,6 +226,8 @@ class AsrDataModule:
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                buffer_size=self.args.num_buckets * 2000,
+                shuffle_buffer_size=self.args.num_buckets * 5000,
                 drop_last=True,
             )

View File

@@ -372,7 +372,7 @@ def save_results(
     for key, results in results_dict.items():
         recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
         results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
         logging.info(f"The transcripts are stored in {recog_path}")

         # The following prints out WERs, per-word error statistics and aligned
@@ -384,7 +384,11 @@ def save_results(
             results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
         with open(errs_filename, "w") as f:
             wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
             )

         test_set_wers[key] = wer

View File

@@ -46,6 +46,7 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
 import torch.nn as nn
 from conformer import Conformer
@@ -56,7 +57,7 @@ from model import Transducer

 from icefall.checkpoint import average_checkpoints, load_checkpoint
 from icefall.env import get_env_info
 from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, str2bool
+from icefall.utils import AttributeDict, num_tokens, str2bool


 def get_parser():
@@ -99,10 +100,10 @@ def get_parser():
     )

     parser.add_argument(
-        "--lang-dir",
-        type=Path,
-        default=Path("data/lang_char"),
-        help="The lang dir",
+        "--tokens",
+        type=str,
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
     )

     parser.add_argument(
@@ -190,10 +191,9 @@ def main():
     logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1

     logging.info(params)

View File

@@ -376,7 +376,7 @@ def save_results(
     for key, results in results_dict.items():
         recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
         results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
         logging.info(f"The transcripts are stored in {recog_path}")

         # The following prints out WERs, per-word error statistics and aligned
@@ -388,7 +388,11 @@ def save_results(
             results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
         with open(errs_filename, "w") as f:
             wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
             )

         test_set_wers[key] = wer

View File

@@ -46,6 +46,7 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
 import torch.nn as nn
 from conformer import Conformer
@@ -55,8 +56,7 @@ from model import Transducer

 from icefall.checkpoint import average_checkpoints, load_checkpoint
 from icefall.env import get_env_info
-from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, str2bool
+from icefall.utils import AttributeDict, num_tokens, str2bool


 def get_parser():
@@ -99,10 +99,10 @@ def get_parser():
     )

     parser.add_argument(
-        "--lang-dir",
-        type=Path,
-        default=Path("data/lang_char"),
-        help="The lang dir",
+        "--tokens",
+        type=str,
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
     )

     parser.add_argument(
@@ -190,10 +190,9 @@ def main():
     logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1

     logging.info(params)

View File

@@ -0,0 +1 @@
../tdnn_lstm_ctc/asr_datamodule.py

egs/aishell/ASR/whisper/decode.py Executable file
View File

@@ -0,0 +1,507 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corporation (Author: Liyong Guo,
# Fangjun Kuang,
# Wei Kang)
# 2024 Yuekai Zhang
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
# Command for decoding using fine-tuned models:
git lfs install
git clone https://huggingface.co/yuekai/icefall_asr_aishell_whisper
ln -s icefall_asr_aishell_whisper/exp_large_v2/epoch-10-avg6.pt whisper/exp_large_v2/epoch-999.pt
python3 ./whisper/decode.py \
--exp-dir whisper/exp_large_v2 \
--model-name large-v2 \
--epoch 999 --avg 1 \
--manifest-dir data/fbank_whisper \
--beam-size 10 --max-duration 50
# Command for decoding using pretrained models (before fine-tuning):
python3 ./whisper/decode.py \
--exp-dir whisper/exp_large_v2 \
--model-name large-v2 \
--epoch -1 --avg 1 \
--manifest-dir data/fbank_whisper \
--remove-whisper-encoder-input-length-restriction False \
--beam-size 10 --max-duration 50
"""
import argparse
import logging
import re
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
import k2
import torch
import torch.nn as nn
import whisper
from asr_datamodule import AishellAsrDataModule
from tn.chinese.normalizer import Normalizer
from whisper.normalizers import BasicTextNormalizer
from whisper_encoder_forward_monkey_patch import replace_whisper_encoder_forward
from zhconv import convert
from icefall.checkpoint import average_checkpoints_with_averaged_model, load_checkpoint
from icefall.env import get_env_info
from icefall.utils import (
AttributeDict,
setup_logger,
store_transcripts,
str2bool,
write_error_stats,
)
def average_checkpoints(
filenames: List[Path], device: torch.device = torch.device("cpu")
) -> dict:
"""Average a list of checkpoints.
The function is mainly used to average deepspeed-converted checkpoints, which only include the model state_dict.
Args:
filenames:
Filenames of the checkpoints to be averaged. We assume all
checkpoints are saved by :func:`save_checkpoint`.
device:
Move checkpoints to this device before averaging.
Returns:
Return a dict (i.e., state_dict) which is the average of all
model state dicts contained in the checkpoints.
"""
n = len(filenames)
if "model" in torch.load(filenames[0], map_location=device):
avg = torch.load(filenames[0], map_location=device)["model"]
else:
avg = torch.load(filenames[0], map_location=device)
# Identify shared parameters. Two parameters are said to be shared
# if they have the same data_ptr
uniqued: Dict[int, str] = dict()
for k, v in avg.items():
v_data_ptr = v.data_ptr()
if v_data_ptr in uniqued:
continue
uniqued[v_data_ptr] = k
uniqued_names = list(uniqued.values())
for i in range(1, n):
if "model" in torch.load(filenames[i], map_location=device):
state_dict = torch.load(filenames[i], map_location=device)["model"]
else:
state_dict = torch.load(filenames[i], map_location=device)
for k in uniqued_names:
avg[k] += state_dict[k]
for k in uniqued_names:
if avg[k].is_floating_point():
avg[k] /= n
else:
avg[k] //= n
return avg
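# Hedged usage of the helper above:
#   state_dict = average_checkpoints([Path("exp/epoch-9.pt"), Path("exp/epoch-10.pt")])
#   model.load_state_dict(state_dict, strict=True)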
def remove_punctuation(text: Union[str, List[str]]):
"""Modified from https://github.com/yeyupiaoling/Whisper-Finetune/blob/master/utils/data_utils.py
Args:
text: It can be a string or a list of strings.
Returns:
Return a string or a list of strings without any punctuation.
"""
punctuation = "!,.;:?、!,。;:?《》 "
if isinstance(text, str):
text = re.sub(r"[{}]+".format(punctuation), "", text).strip()
return text
elif isinstance(text, list):
result_text = []
for t in text:
t = re.sub(r"[{}]+".format(punctuation), "", t).strip()
result_text.append(t)
return result_text
else:
raise Exception(f"Not support type {type(text)}")
def to_simple(text: Union[str, List[str]]):
"""Convert traditional Chinese to simplified Chinese.
Args:
text: It can be a string or a list of strings.
Returns:
Return a string or a list of strings converted to simplified Chinese.
"""
if isinstance(text, str):
text = convert(text, "zh-cn")
return text
elif isinstance(text, list):
result_text = []
for t in text:
t = convert(t, "zh-cn")
result_text.append(t)
return result_text
else:
raise Exception(f"Not support type{type(text)}")
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=-1,
help="It specifies the checkpoint to use for decoding."
"Note: Epoch counts from 0.",
)
parser.add_argument(
"--avg",
type=int,
default=1,
help="Number of checkpoints to average. Automatically select "
"consecutive checkpoints before the checkpoint specified by "
"'--epoch'. ",
)
parser.add_argument(
"--method",
type=str,
default="beam-search",
help="""Decoding method.
Supported values are:
- beam-search
""",
)
parser.add_argument(
"--beam-size",
type=int,
default=1,
help="beam size for beam search decoding",
)
parser.add_argument(
"--exp-dir",
type=str,
default="whisper/exp",
help="The experiment dir",
)
parser.add_argument(
"--model-name",
type=str,
default="large-v2",
choices=["large-v2", "large-v3", "medium", "small", "base", "tiny"],
help="""The model name to use.
""",
)
parser.add_argument(
"--remove-whisper-encoder-input-length-restriction",
type=str2bool,
default=True,
help="replace whisper encoder forward method to remove input length restriction",
)
return parser
def get_params() -> AttributeDict:
params = AttributeDict(
{
"env_info": get_env_info(),
}
)
return params
def decode_one_batch(
params: AttributeDict,
model: nn.Module,
batch: dict,
) -> Dict[str, List[List[int]]]:
"""Decode one batch and return the result in a dict. The dict has the
following format:
- key: "beam-search"
- value: A list of lists. Each sublist is a list of token IDs.
Args:
params:
It is returned by :func:`get_params`.
model:
The neural model.
batch:
It is returned by :meth:`torch.utils.data.DataLoader.__iter__`.
Returns:
Return a dict, whose key may be "beam-search".
"""
dtype = torch.float16
device = torch.device("cuda")
feature = batch["inputs"]
assert feature.ndim == 3
feature = feature.to(device, dtype=dtype).transpose(1, 2)
if not params.remove_whisper_encoder_input_length_restriction:
T = 3000
if feature.shape[2] < T:
feature = torch.cat(
[
feature,
torch.zeros(
feature.shape[0], feature.shape[1], T - feature.shape[2]
).to(device, dtype=dtype),
],
2,
)
supervisions = batch["supervisions"]
feature_len = supervisions["num_frames"]
feature_len = feature_len.to(device, dtype=dtype)
results = model.decode(feature, params.decoding_options)
hyps = [result.text for result in results]
hyps = remove_punctuation(hyps)
hyps = to_simple(hyps)
hyps = [params.normalizer.normalize(hyp) for hyp in hyps]
return {"beam-search": hyps}
def decode_dataset(
dl: torch.utils.data.DataLoader,
params: AttributeDict,
model: nn.Module,
) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
"""Decode dataset.
Args:
dl:
The dataloader.
params:
It is returned by :func:`get_params`.
model:
The neural model.
Returns:
Return a dict, whose key may be "beam-search".
"""
num_cuts = 0
try:
num_batches = len(dl)
except TypeError:
num_batches = "?"
results = defaultdict(list)
for batch_idx, batch in enumerate(dl):
texts = batch["supervisions"]["text"]
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
hyps_dict = decode_one_batch(
params=params,
model=model,
batch=batch,
)
for lm_scale, hyps in hyps_dict.items():
this_batch = []
assert len(hyps) == len(texts)
for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
ref_words = ref_text.split()
this_batch.append((cut_id, ref_words, hyp_words))
results[lm_scale].extend(this_batch)
num_cuts += len(batch["supervisions"]["text"])
if batch_idx % 100 == 0:
batch_str = f"{batch_idx}/{num_batches}"
logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
return results
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
):
enable_log = True
test_set_wers = dict()
for key, results in results_dict.items():
recog_path = (
params.exp_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
)
results = sorted(results)
store_transcripts(filename=recog_path, texts=results, char_level=True)
if enable_log:
logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out WERs, per-word error statistics and aligned
# ref/hyp pairs.
errs_filename = (
params.exp_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
)
# we compute CER for aishell dataset.
results_char = []
for res in results:
results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
with open(errs_filename, "w") as f:
wer = write_error_stats(
f,
f"{test_set_name}-{key}",
results_char,
enable_log=enable_log,
compute_CER=True,
)
test_set_wers[key] = wer
if enable_log:
logging.info("Wrote detailed error stats to {}".format(errs_filename))
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
errs_info = params.exp_dir / f"cer-summary-{test_set_name}-{params.suffix}.txt"
with open(errs_info, "w") as f:
print("settings\tCER", file=f)
for key, val in test_set_wers:
print("{}\t{}".format(key, val), file=f)
s = "\nFor {}, CER of different settings are:\n".format(test_set_name)
note = "\tbest for {}".format(test_set_name)
for key, val in test_set_wers:
s += "{}\t{}{}\n".format(key, val, note)
note = ""
logging.info(s)
@torch.no_grad()
def main():
parser = get_parser()
AishellAsrDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
params = get_params()
params.update(vars(args))
params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
setup_logger(
f"{params.exp_dir}/log-{params.method}-beam{params.beam_size}/log-decode-{params.suffix}"
)
options = whisper.DecodingOptions(
task="transcribe",
language="zh",
without_timestamps=True,
beam_size=params.beam_size,
)
params.decoding_options = options
params.cleaner = BasicTextNormalizer()
params.normalizer = Normalizer()
logging.info("Decoding started")
logging.info(params)
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda")
logging.info(f"device: {device}")
if params.remove_whisper_encoder_input_length_restriction:
replace_whisper_encoder_forward()
model = whisper.load_model(params.model_name, "cpu")
if params.epoch > 0:
if params.avg > 1:
start = params.epoch - params.avg
assert start >= 1, start
checkpoint = torch.load(
f"{params.exp_dir}/epoch-{params.epoch}.pt", map_location="cpu"
)
if "model" not in checkpoint:
# deepspeed converted checkpoint only contains model state_dict
filenames = [
f"{params.exp_dir}/epoch-{epoch}.pt"
for epoch in range(start, params.epoch + 1)
]
model.load_state_dict(average_checkpoints(filenames))
else:
filename_start = f"{params.exp_dir}/epoch-{start}.pt"
filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
logging.info(
f"Calculating the averaged model over epoch range from "
f"{start} (excluded) to {params.epoch}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
# save checkpoints
filename = f"{params.exp_dir}/epoch-{params.epoch}-avg-{params.avg}.pt"
torch.save(model.state_dict(), filename)
else:
checkpoint = torch.load(
f"{params.exp_dir}/epoch-{params.epoch}.pt", map_location="cpu"
)
if "model" not in checkpoint:
model.load_state_dict(checkpoint, strict=True)
else:
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
model.to(device)
model.eval()
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
# we need cut ids to display recognition results.
args.return_cuts = True
aishell = AishellAsrDataModule(args)
valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
test_dl = aishell.test_dataloaders(aishell.test_cuts())
test_sets = ["valid", "test"]
test_dls = [valid_dl, test_dl]
for test_set, test_dl in zip(test_sets, test_dls):
results_dict = decode_dataset(
dl=test_dl,
params=params,
model=model,
)
save_results(params=params, test_set_name=test_set, results_dict=results_dict)
logging.info("Done!")
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
if __name__ == "__main__":
main()
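For comparison, the same beam-search decoding of a single file with stock openai-whisper looks roughly like this (a hedged sketch; the wav path is assumed, and pad_or_trim reinstates the fixed 30 s window that the monkey patch used above removes):

import whisper

model = whisper.load_model("large-v2")
audio = whisper.pad_or_trim(whisper.load_audio("test.wav"))
mel = whisper.log_mel_spectrogram(audio).to(model.device)
options = whisper.DecodingOptions(
    task="transcribe", language="zh", without_timestamps=True, beam_size=10
)
print(whisper.decode(model, mel, options).text)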

View File

@@ -0,0 +1,38 @@
{
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 100,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 0.01
},
"zero_optimization": {
"stage": 1,
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 1e-5
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 1e-5,
"warmup_num_steps": 100
}
},
"gradient_accumulation_steps": 1,
"gradient_clipping": 5,
"steps_per_print": 50,
"train_micro_batch_size_per_gpu": 1,
"wall_clock_breakdown": false
}

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/conformer_ctc/label_smoothing.py

View File

@@ -0,0 +1 @@
../../../librispeech/ASR/zipformer/optim.py

View File

@@ -0,0 +1,10 @@
k2
kaldialign
git+https://github.com/lhotse-speech/lhotse
sentencepiece
tensorboard
librosa
git+https://github.com/yuekaizhang/whisper.git
zhconv
WeTextProcessing
deepspeed

egs/aishell/ASR/whisper/train.py Executable file
View File

@@ -0,0 +1,927 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (authors: Xiaoyu Yang)
# 2024 Yuekai Zhang
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
# fine-tuning with deepspeed zero stage 1
torchrun --nproc_per_node 8 ./whisper/train.py \
--max-duration 200 \
--exp-dir whisper/exp_large_v2 \
--model-name large-v2 \
--manifest-dir data/fbank_whisper \
--deepspeed \
--deepspeed_config ./whisper/ds_config_zero1.json
# fine-tuning with ddp
torchrun --nproc_per_node 8 ./whisper/train.py \
--max-duration 200 \
--exp-dir whisper/exp_medium \
--manifest-dir data/fbank_whisper \
--base-lr 1e-5 \
--model-name medium
"""
import argparse
import copy
import logging
import random
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union
import deepspeed
import k2
import optim
import torch
import torch.multiprocessing as mp
import torch.nn as nn
import whisper
from asr_datamodule import AishellAsrDataModule
from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
from label_smoothing import LabelSmoothingLoss
from lhotse import CutSet, load_manifest
from lhotse.cut import Cut
from lhotse.dataset.sampling.base import CutSampler
from lhotse.utils import fix_random_seed
from optim import Eden, ScaledAdam
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.functional import pad as pad_tensor
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter
from whisper_encoder_forward_monkey_patch import replace_whisper_encoder_forward
from icefall import diagnostics
from icefall.checkpoint import load_checkpoint, remove_checkpoints
from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
from icefall.checkpoint import update_averaged_model
from icefall.dist import cleanup_dist, get_rank, get_world_size, setup_dist
from icefall.env import get_env_info
from icefall.hooks import register_inf_check_hooks
from icefall.utils import (
AttributeDict,
MetricsTracker,
filter_uneven_sized_batch,
setup_logger,
str2bool,
)
LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
if isinstance(model, DDP):
# get underlying nn.Module
model = model.module
for module in model.modules():
if hasattr(module, "batch_count"):
module.batch_count = batch_count
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--tensorboard",
type=str2bool,
default=True,
help="Should various information be logged in tensorboard.",
)
parser.add_argument(
"--num-epochs",
type=int,
default=10,
help="Number of epochs to train.",
)
parser.add_argument(
"--start-epoch",
type=int,
default=1,
help="""Resume training from this epoch. It should be positive.
If larger than 1, it will load checkpoint from
exp-dir/epoch-{start_epoch-1}.pt
""",
)
parser.add_argument(
"--start-batch",
type=int,
default=0,
help="""If positive, --start-epoch is ignored and
it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
""",
)
parser.add_argument(
"--exp-dir",
type=str,
default="whisper/exp",
help="""The experiment dir.
It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved
""",
)
parser.add_argument(
"--model-name",
type=str,
default="large-v2",
choices=["large-v2", "large-v3", "medium", "small", "base", "tiny"],
help="""The model name to use.
""",
)
parser.add_argument(
"--base-lr", type=float, default=1e-5, help="The base learning rate."
)
parser.add_argument(
"--lr-batches",
type=float,
default=5000,
help="""Number of steps that affects how rapidly the learning rate
decreases. We suggest not to change this.""",
)
parser.add_argument(
"--lr-epochs",
type=float,
default=6,
help="""Number of epochs that affects how rapidly the learning rate decreases.
""",
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="The seed for random generators intended for reproducibility",
)
parser.add_argument(
"--print-diagnostics",
type=str2bool,
default=False,
help="Accumulate stats on activations, print them and exit.",
)
parser.add_argument(
"--inf-check",
type=str2bool,
default=False,
help="Add hooks to check for infinite module outputs and gradients.",
)
parser.add_argument(
"--keep-last-k",
type=int,
default=30,
help="""Only keep this number of checkpoints on disk.
For instance, if it is 3, there are only 3 checkpoints
in the exp-dir with filenames `checkpoint-xxx.pt`.
It does not affect checkpoints with name `epoch-xxx.pt`.
""",
)
parser.add_argument(
"--average-period",
type=int,
default=200,
help="""Update the averaged model, namely `model_avg`, after processing
this number of batches. `model_avg` is a separate version of model,
in which each floating-point parameter is the average of all the
parameters from the start of training. Each time we take the average,
we do: `model_avg = model * (average_period / batch_idx_train) +
model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
""",
)
parser.add_argument(
"--use-fp16",
type=str2bool,
default=True,
help="Whether to use half precision training.",
)
parser = deepspeed.add_config_arguments(parser)
return parser
def get_params() -> AttributeDict:
"""Return a dict containing training parameters.
All training related parameters that are not passed from the commandline
are saved in the variable `params`.
Commandline options are merged into `params` after they are parsed, so
you can also access them via `params`.
Explanation of options saved in `params`:
- frame_shift_ms: The frame shift in milliseconds.
- allowed_excess_duration_ratio: The allowed excess duration ratio.
- best_train_loss: The best training loss so far.
- best_valid_loss: The best validation loss so far.
- best_train_epoch: The epoch where the best training loss is achieved.
- best_valid_epoch: The epoch where the best validation loss is achieved.
- batch_idx_train: The batch index of the current batch.
- log_interval: Log training stats every `log_interval` batches.
- reset_interval: Reset the stats every `reset_interval` batches.
- valid_interval: Run validation every `valid_interval` batches.
- env_info: The environment information.
"""
params = AttributeDict(
{
"frame_shift_ms": 10.0,
"subsampling_factor": 2,
"allowed_excess_duration_ratio": 0.1,
"best_train_loss": float("inf"),
"best_valid_loss": float("inf"),
"best_train_epoch": -1,
"best_valid_epoch": -1,
"batch_idx_train": 0,
"log_interval": 50,
"reset_interval": 200,
"valid_interval": 5000,
"env_info": get_env_info(),
}
)
return params
def load_checkpoint_if_available(
params: AttributeDict,
model: nn.Module,
model_avg: nn.Module = None,
optimizer: Optional[torch.optim.Optimizer] = None,
scheduler: Optional[LRSchedulerType] = None,
) -> Optional[Dict[str, Any]]:
"""Load checkpoint from file.
If params.start_batch is positive, it will load the checkpoint from
`params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
params.start_epoch is larger than 1, it will load the checkpoint from
`params.start_epoch - 1`.
Apart from loading state dict for `model` and `optimizer` it also updates
`best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
and `best_valid_loss` in `params`.
Args:
params:
The return value of :func:`get_params`.
model:
The training model.
model_avg:
The stored model averaged from the start of training.
optimizer:
The optimizer that we are using.
scheduler:
The scheduler that we are using.
Returns:
Return a dict containing previously saved training info.
"""
if params.start_batch > 0:
filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
elif params.start_epoch > 1:
filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
else:
return None
assert filename.is_file(), f"{filename} does not exist!"
saved_params = load_checkpoint(
filename,
model=model,
model_avg=model_avg,
optimizer=optimizer,
scheduler=scheduler,
)
keys = [
"best_train_epoch",
"best_valid_epoch",
"batch_idx_train",
"best_train_loss",
"best_valid_loss",
]
for k in keys:
params[k] = saved_params[k]
if params.start_batch > 0:
if "cur_epoch" in saved_params:
params["start_epoch"] = saved_params["cur_epoch"]
return saved_params
def save_checkpoint(
params: AttributeDict,
model: Union[nn.Module, DDP],
model_avg: Optional[nn.Module] = None,
optimizer: Optional[torch.optim.Optimizer] = None,
scheduler: Optional[LRSchedulerType] = None,
sampler: Optional[CutSampler] = None,
scaler: Optional[GradScaler] = None,
rank: int = 0,
) -> None:
"""Save model, optimizer, scheduler and training stats to file.
Args:
params:
It is returned by :func:`get_params`.
model:
The training model.
model_avg:
The stored model averaged from the start of training.
optimizer:
The optimizer used in the training.
sampler:
The sampler for the training dataset.
scaler:
The scaler used for mix precision training.
"""
if rank != 0:
return
filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
save_checkpoint_impl(
filename=filename,
model=model,
model_avg=model_avg,
params=params,
optimizer=optimizer,
scheduler=scheduler,
sampler=sampler,
scaler=scaler,
rank=rank,
)
if params.best_train_epoch == params.cur_epoch:
best_train_filename = params.exp_dir / "best-train-loss.pt"
copyfile(src=filename, dst=best_train_filename)
if params.best_valid_epoch == params.cur_epoch:
best_valid_filename = params.exp_dir / "best-valid-loss.pt"
copyfile(src=filename, dst=best_valid_filename)
def compute_loss(
params: AttributeDict,
tokenizer: whisper.tokenizer.Tokenizer,
model: Union[nn.Module, DDP],
batch: dict,
is_training: bool,
) -> Tuple[Tensor, MetricsTracker]:
"""
Compute the loss for the given batch.
Args:
params:
It is returned by :func:`get_params`.
tokenizer:
The tokenizer used to encode the text.
model:
The model for training.
batch:
A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
for the content in it.
is_training:
Whether it is training.
Returns:
Return a tuple of two elements. The first element is the loss tensor.
"""
# For the uneven-sized batch, the total duration after padding would possibly
# cause OOM. Hence, for each batch, which is sorted in descending order by length,
# we simply drop the last few shortest samples, so that the retained total frames
# (after padding) would not exceed `allowed_max_frames`:
# `allowed_max_frames = int(max_frames * (1.0 + allowed_excess_duration_ratio))`,
# where `max_frames = max_duration * 1000 // frame_shift_ms`.
# We set allowed_excess_duration_ratio=0.1.
if isinstance(model, DDP):
# get underlying nn.Module
model = model.module
def _batch_tensors(tensors: List[Tensor], pad_value: Any) -> Tensor:
padding_size = max(tensor.shape[0] for tensor in tensors)
dims = len(tensors[0].shape)
padded_tensors = []
for tensor in tensors:
padding = [0] * 2 * dims
padding[-1] = padding_size - tensor.shape[0]
padded_tensors.append(pad_tensor(tensor, padding, "constant", pad_value))
return torch.stack([tensor for tensor in padded_tensors], dim=0)
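# Hedged example: token tensors of lengths 3 and 5 with pad_value 50256 are
# right-padded to length 5 and stacked into a single (2, 5) LongTensor.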
max_frames = params.max_duration * 1000 // params.frame_shift_ms
allowed_max_frames = int(max_frames * (1.0 + params.allowed_excess_duration_ratio))
batch = filter_uneven_sized_batch(batch, allowed_max_frames)
device = model.device if isinstance(model, DDP) else next(model.parameters()).device
feature = batch["inputs"]
assert feature.ndim == 3
feature = feature.to(device)
feature = feature.transpose(1, 2) # (N, C, T)
supervisions = batch["supervisions"]
feature_lens = supervisions["num_frames"].to(device)
batch_idx_train = params.batch_idx_train
texts = batch["supervisions"]["text"]
# remove spaces in texts
texts = [text.replace(" ", "") for text in texts]
text_tokens_list = [
list(tokenizer.sot_sequence_including_notimestamps)
+ tokenizer.encode(text)
+ [tokenizer.eot]
for text in texts
]
# convert it to torch tensor
text_tokens_list = [
torch.LongTensor(text_tokens) for text_tokens in text_tokens_list
]
# 50256 is the index of <pad> for all whisper models
prev_outputs_tokens = _batch_tensors(
[tokens[:-1] for tokens in text_tokens_list], pad_value=50256
)
target_tokens = _batch_tensors(
[tokens[1:] for tokens in text_tokens_list], pad_value=50256
)
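# Hedged example: for tokens [sot, zh, transcribe, notimestamps, t1, t2, eot],
# prev_outputs_tokens = tokens[:-1] and target_tokens = tokens[1:], i.e.
# standard teacher forcing with a one-token shift.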
target_lengths = torch.LongTensor(
[tokens.shape[0] - 1 for tokens in text_tokens_list]
)
decoder_criterion = LabelSmoothingLoss(
ignore_index=50256, label_smoothing=0.1, reduction="sum"
)
# ignore the first 3 tokens, which are always <|lang_id|>, <|transcribe|>, <|notimestamps|>
ignore_prefix_size = 3
with torch.set_grad_enabled(is_training):
encoder_out = model.encoder(feature)
text_logits = model.decoder(prev_outputs_tokens.to(device), encoder_out)
text_logits = text_logits[:, ignore_prefix_size:, :]
target_tokens = target_tokens[:, ignore_prefix_size:]
loss = decoder_criterion(text_logits, target_tokens.to(device))
assert loss.requires_grad == is_training
info = MetricsTracker()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
# Note: We use reduction=sum while computing the loss.
info["loss"] = loss.detach().cpu().item()
return loss, info
def compute_validation_loss(
params: AttributeDict,
tokenizer: whisper.tokenizer.Tokenizer,
model: Union[nn.Module, DDP],
valid_dl: torch.utils.data.DataLoader,
world_size: int = 1,
) -> MetricsTracker:
"""Run the validation process."""
model.eval()
tot_loss = MetricsTracker()
for batch_idx, batch in enumerate(valid_dl):
with torch.cuda.amp.autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
tokenizer=tokenizer,
model=model,
batch=batch,
is_training=False,
)
assert loss.requires_grad is False
tot_loss = tot_loss + loss_info
if world_size > 1:
tot_loss.reduce(loss.device)
loss_value = tot_loss["loss"] / tot_loss["frames"]
if loss_value < params.best_valid_loss:
params.best_valid_epoch = params.cur_epoch
params.best_valid_loss = loss_value
return tot_loss
def train_one_epoch(
params: AttributeDict,
tokenizer: whisper.tokenizer.Tokenizer,
model: Union[nn.Module, DDP],
optimizer: torch.optim.Optimizer,
scheduler: LRSchedulerType,
train_dl: torch.utils.data.DataLoader,
valid_dl: torch.utils.data.DataLoader,
scaler: GradScaler,
model_avg: Optional[nn.Module] = None,
tb_writer: Optional[SummaryWriter] = None,
world_size: int = 1,
rank: int = 0,
) -> None:
"""Train the model for one epoch.
The training loss from the mean of all frames is saved in
`params.train_loss`. It runs the validation process every
`params.valid_interval` batches.
Args:
params:
It is returned by :func:`get_params`.
model:
The model for training.
optimizer:
The optimizer we are using.
scheduler:
The learning rate scheduler, we call step() every step.
train_dl:
Dataloader for the training dataset.
valid_dl:
Dataloader for the validation dataset.
scaler:
The scaler used for mix precision training.
model_avg:
The stored model averaged from the start of training.
tb_writer:
Writer to write log messages to tensorboard.
world_size:
Number of nodes in DDP training. If it is 1, DDP is disabled.
rank:
The rank of the node in DDP training. If no DDP is used, it should
be set to 0.
"""
model.train()
tot_loss = MetricsTracker()
for batch_idx, batch in enumerate(train_dl):
params.batch_idx_train += 1
batch_size = len(batch["supervisions"]["text"])
if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
logging.info("Computing validation loss")
valid_info = compute_validation_loss(
params=params,
tokenizer=tokenizer,
model=model,
valid_dl=valid_dl,
world_size=world_size,
)
model.train()
logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
logging.info(
f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
)
if tb_writer is not None:
valid_info.write_summary(
tb_writer, "train/valid_", params.batch_idx_train
)
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
tokenizer=tokenizer,
model=model,
batch=batch,
is_training=True,
)
# summary stats
tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
# NOTE: We use reduction==sum and loss is computed over utterances
# in the batch and there is no normalization to it so far.
if params.deepspeed:
# deepspeed's backward() is different from torch's backward()
# in that it does not accept a loss tensor as input.
# It computes the loss internally.
model.backward(loss)
model.step()
else:
scaler.scale(loss).backward()
set_batch_count(model, params.batch_idx_train)
scheduler.step_batch(params.batch_idx_train)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
except: # noqa
display_and_save_batch(batch, params=params)
raise
if params.print_diagnostics and batch_idx == 5:
return
if (
rank == 0
and params.batch_idx_train > 0
and params.batch_idx_train % params.average_period == 0
and not params.deepspeed
):
update_averaged_model(
params=params,
model_cur=model,
model_avg=model_avg,
)
if batch_idx % 100 == 0 and params.use_fp16 and not params.deepspeed:
# If the grad scale was less than 1, try increasing it. The _growth_interval
# of the grad scaler is configurable, but we can't configure it to have different
# behavior depending on the current grad scale.
cur_grad_scale = scaler._scale.item()
if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
scaler.update(cur_grad_scale * 2.0)
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
if batch_idx % params.log_interval == 0:
try:
cur_lr = scheduler.get_last_lr()[0]
except: # noqa
cur_lr = 0.0
cur_grad_scale = (
scaler._scale.item()
if (params.use_fp16 and not params.deepspeed)
else 1.0
)
logging.info(
f"Epoch {params.cur_epoch}, "
f"batch {batch_idx}, loss[{loss_info}], "
f"tot_loss[{tot_loss}], batch size: {batch_size}, "
f"lr: {cur_lr:.2e}, "
+ (
f"grad_scale: {scaler._scale.item()}"
if (params.use_fp16 and not params.deepspeed)
else ""
)
)
if tb_writer is not None:
tb_writer.add_scalar(
"train/learning_rate", cur_lr, params.batch_idx_train
)
loss_info.write_summary(
tb_writer, "train/current_", params.batch_idx_train
)
tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
if params.use_fp16:
tb_writer.add_scalar(
"train/grad_scale",
cur_grad_scale,
params.batch_idx_train,
)
loss_value = tot_loss["loss"] / tot_loss["frames"]
params.train_loss = loss_value
if params.train_loss < params.best_train_loss:
params.best_train_epoch = params.cur_epoch
params.best_train_loss = params.train_loss
def run(rank, world_size, args):
"""
Args:
rank:
It is a value between 0 and `world_size-1`, which is
passed automatically by `mp.spawn()` in :func:`main`.
The node with rank 0 is responsible for saving checkpoint.
world_size:
Number of GPUs for DDP training.
args:
The return value of get_parser().parse_args()
"""
params = get_params()
params.update(vars(args))
fix_random_seed(params.seed)
setup_logger(f"{params.exp_dir}/log/log-train")
logging.info(params)
logging.info("About to create model")
replace_whisper_encoder_forward()
model = whisper.load_model(params.model_name, "cpu")
del model.alignment_heads
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
tokenizer = whisper.tokenizer.get_tokenizer(
model.is_multilingual,
num_languages=model.num_languages,
language="zh",
task="transcribe",
)
model_avg: Optional[nn.Module] = None
if rank == 0:
# model_avg is only used with rank 0
model_avg = copy.deepcopy(model).to(torch.float64)
assert params.start_epoch > 0, params.start_epoch
checkpoints = load_checkpoint_if_available(
params=params, model=model, model_avg=model_avg
)
if torch.cuda.is_available():
device = torch.device("cuda", rank)
else:
device = torch.device("cpu")
logging.info(f"Device: {device}")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=params.base_lr)
scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
if checkpoints and "optimizer" in checkpoints:
logging.info("Loading optimizer state dict")
optimizer.load_state_dict(checkpoints["optimizer"])
if (
checkpoints
and "scheduler" in checkpoints
and checkpoints["scheduler"] is not None
):
logging.info("Loading scheduler state dict")
scheduler.load_state_dict(checkpoints["scheduler"])
if world_size > 1:
if params.deepspeed:
logging.info("Using DeepSpeed")
model, optimizer, _, scheduler = deepspeed.initialize(
args=params, model=model, model_parameters=model.parameters()
)
else:
logging.info("Using DDP")
setup_dist(use_ddp_launch=True)
model = DDP(model, device_ids=[rank], find_unused_parameters=True)
if params.print_diagnostics:
opts = diagnostics.TensorDiagnosticOptions(
512
) # allow 4 megabytes per sub-module
diagnostic = diagnostics.attach_diagnostics(model, opts)
if params.inf_check:
register_inf_check_hooks(model)
aishell = AishellAsrDataModule(args)
if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
# We only load the sampler's state dict when it loads a checkpoint
# saved in the middle of an epoch
sampler_state_dict = checkpoints["sampler"]
else:
sampler_state_dict = None
train_dl = aishell.train_dataloaders(aishell.train_cuts())
valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
if checkpoints and "grad_scaler" in checkpoints:
logging.info("Loading grad scaler state dict")
scaler.load_state_dict(checkpoints["grad_scaler"])
if args.tensorboard and rank == 0:
tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
else:
tb_writer = None
logging.info(f"start training from epoch {params.start_epoch}")
for epoch in range(params.start_epoch, params.num_epochs + 1):
if not params.deepspeed:
scheduler.step_epoch(epoch - 1)
fix_random_seed(params.seed + epoch - 1)
train_dl.sampler.set_epoch(epoch - 1)
if tb_writer is not None:
tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
params.cur_epoch = epoch
train_one_epoch(
params=params,
tokenizer=tokenizer,
model=model,
model_avg=model_avg,
optimizer=optimizer,
scheduler=scheduler,
train_dl=train_dl,
valid_dl=valid_dl,
scaler=scaler,
tb_writer=tb_writer,
world_size=world_size,
rank=rank,
)
if params.print_diagnostics:
diagnostic.print_diagnostics()
break
if params.deepspeed:
model.save_checkpoint(
save_dir=params.exp_dir,
tag=f"epoch-{params.cur_epoch}",
client_state={},
)
if rank == 0:
convert_zero_checkpoint_to_fp32_state_dict(
params.exp_dir,
f"{params.exp_dir}/epoch-{params.cur_epoch}.pt",
tag=f"epoch-{params.cur_epoch}",
)
else:
save_checkpoint(
params=params,
model=model,
model_avg=model_avg,
optimizer=optimizer,
scheduler=scheduler,
sampler=train_dl.sampler,
scaler=scaler,
rank=rank,
)
logging.info("Done!")
if world_size > 1 and not params.deepspeed:
torch.distributed.barrier()
cleanup_dist()
def display_and_save_batch(
batch: dict,
params: AttributeDict,
) -> None:
"""Display the batch statistics and save the batch into disk.
Args:
batch:
A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
for the content in it.
params:
Parameters for training. See :func:`get_params`.
"""
from lhotse.utils import uuid4
filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
logging.info(f"Saving batch to {filename}")
torch.save(batch, filename)
supervisions = batch["supervisions"]
features = batch["inputs"]
logging.info(f"features shape: {features.shape}")
def main():
parser = get_parser()
AishellAsrDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
world_size = get_world_size()
rank = get_rank()
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
run(rank=rank, world_size=world_size, args=args)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,29 @@
import torch
import torch.nn.functional as F
import whisper
def forward(self, x: torch.Tensor):
"""
x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
the mel spectrogram of the audio
"""
x = F.gelu(self.conv1(x))
x = F.gelu(self.conv2(x))
x = x.permute(0, 2, 1)
x = (x + self.positional_embedding[: x.shape[1], :]).to(x.dtype)
for block in self.blocks:
x = block(x)
x = self.ln_post(x)
return x
def replace_whisper_encoder_forward():
"""
This function monkey patches the forward method of the whisper encoder.
It must be called before the model is loaded; it changes whisper so that it
can process audio of any length shorter than 30 s.
"""
whisper.model.AudioEncoder.forward = forward
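# A minimal usage sketch (illustrative; the model name and wave path are
# placeholders, not part of this recipe):
#
#     replace_whisper_encoder_forward()
#     model = whisper.load_model("tiny", "cpu")
#     audio = whisper.load_audio("foo.wav")  # may be shorter than 30 s
#     mel = whisper.log_mel_spectrogram(audio).unsqueeze(0)  # (1, n_mels, n_ctx)
#     encoder_out = model.encoder(mel)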

View File

@ -560,7 +560,7 @@ def save_results(
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt" params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
) )
results = sorted(results) results = sorted(results)
store_transcripts(filename=recog_path, texts=results) store_transcripts(filename=recog_path, texts=results, char_level=True)
logging.info(f"The transcripts are stored in {recog_path}") logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out WERs, per-word error statistics and aligned # The following prints out WERs, per-word error statistics and aligned
@ -570,7 +570,11 @@ def save_results(
) )
with open(errs_filename, "w") as f: with open(errs_filename, "w") as f:
wer = write_error_stats( wer = write_error_stats(
f, f"{test_set_name}-{key}", results, enable_log=True f,
f"{test_set_name}-{key}",
results,
enable_log=True,
compute_CER=True,
) )
test_set_wers[key] = wer test_set_wers[key] = wer

View File

@ -0,0 +1,840 @@
#!/usr/bin/env python3
#
# Copyright 2021-2024 Xiaomi Corporation (Author: Fangjun Kuang,
# Zengwei Yao,
# Mingshuang Luo,
# Zengrui Jin,)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
(1) greedy search
./zipformer/decode_bbpe.py \
--epoch 35 \
--avg 15 \
--exp-dir ./zipformer/exp_bbpe \
--lang-dir data/lang_bbpe_500 \
--bpe-model data/lang_bbpe_500/bbpe.model \
--max-duration 600 \
--decoding-method greedy_search
(2) modified beam search
./zipformer/decode_bbpe.py \
--epoch 35 \
--avg 15 \
--exp-dir ./zipformer/exp_bbpe \
--lang-dir data/lang_bbpe_500 \
--bpe-model data/lang_bbpe_500/bbpe.model \
--max-duration 600 \
--decoding-method modified_beam_search \
--beam-size 4
(3) fast beam search (trivial_graph)
./zipformer/decode_bbpe.py \
--epoch 35 \
--avg 15 \
--exp-dir ./zipformer/exp_bbpe \
--lang-dir data/lang_bbpe_500 \
--bpe-model data/lang_bbpe_500/bbpe.model \
--max-duration 600 \
--decoding-method fast_beam_search \
--beam 20.0 \
--max-contexts 8 \
--max-states 64
(4) fast beam search (LG)
./zipformer/decode_bbpe.py \
--epoch 30 \
--avg 15 \
--exp-dir ./zipformer/exp_bbpe \
--lang-dir data/lang_bbpe_500 \
--bpe-model data/lang_bbpe_500/bbpe.model \
--max-duration 600 \
--decoding-method fast_beam_search_LG \
--beam 20.0 \
--max-contexts 8 \
--max-states 64
(5) fast beam search (nbest oracle WER)
./zipformer/decode_bbpe.py \
--epoch 35 \
--avg 15 \
--exp-dir ./zipformer/exp_bbpe \
--lang-dir data/lang_bbpe_500 \
--bpe-model data/lang_bbpe_500/bbpe.model \
--max-duration 600 \
--decoding-method fast_beam_search_nbest_oracle \
--beam 20.0 \
--max-contexts 8 \
--max-states 64 \
--num-paths 200 \
--nbest-scale 0.5
"""
import argparse
import logging
import math
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import k2
import sentencepiece as spm
import torch
import torch.nn as nn
from asr_datamodule import AishellAsrDataModule
from beam_search import (
beam_search,
fast_beam_search_nbest_oracle,
fast_beam_search_one_best,
greedy_search,
greedy_search_batch,
modified_beam_search,
)
from lhotse.cut import Cut
from train import add_model_arguments, get_model, get_params
from icefall import byte_encode, smart_byte_decode, tokenize_by_CJK_char
from icefall.checkpoint import (
average_checkpoints,
average_checkpoints_with_averaged_model,
find_checkpoints,
load_checkpoint,
)
from icefall.lexicon import Lexicon
from icefall.utils import (
AttributeDict,
make_pad_mask,
setup_logger,
store_transcripts,
str2bool,
write_error_stats,
)
LOG_EPS = math.log(1e-10)
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--epoch",
type=int,
default=30,
help="""It specifies the checkpoint to use for decoding.
Note: Epoch counts from 1.
You can specify --avg to use more checkpoints for model averaging.""",
)
parser.add_argument(
"--iter",
type=int,
default=0,
help="""If positive, --epoch is ignored and it
will use the checkpoint exp_dir/checkpoint-iter.pt.
You can specify --avg to use more checkpoints for model averaging.
""",
)
parser.add_argument(
"--avg",
type=int,
default=15,
help="Number of checkpoints to average. Automatically select "
"consecutive checkpoints before the checkpoint specified by "
"'--epoch' and '--iter'",
)
parser.add_argument(
"--use-averaged-model",
type=str2bool,
default=True,
help="Whether to load averaged model. Currently it only supports "
"using --epoch. If True, it would decode with the averaged model "
"over the epoch range from `epoch-avg` (excluded) to `epoch`."
"Actually only the models with epoch number of `epoch-avg` and "
"`epoch` are loaded for averaging. ",
)
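# For example (illustrative): with --epoch 30 --avg 10, the average over
# epochs 21..30 is recovered from the running parameter averages stored in
# epoch-20.pt and epoch-30.pt, rather than by loading ten checkpoints.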
parser.add_argument(
"--exp-dir",
type=str,
default="zipformer_bbpe/exp",
help="The experiment dir",
)
parser.add_argument(
"--bpe-model",
type=str,
default="data/lang_bbpe_500/bbpe.model",
help="Path to the byte BPE model",
)
parser.add_argument(
"--lang-dir",
type=Path,
default="data/lang_bbpe_500/",
help="The lang dir containing word table and LG graph",
)
parser.add_argument(
"--decoding-method",
type=str,
default="greedy_search",
help="""Possible values are:
- greedy_search
- modified_beam_search
- fast_beam_search
- fast_beam_search_LG
- fast_beam_search_nbest_oracle
If you use fast_beam_search_LG, you have to specify
`--lang-dir`, which should contain `LG.pt`.
""",
)
parser.add_argument(
"--beam-size",
type=int,
default=4,
help="""An integer indicating how many candidates we will keep for each
frame. Used only when --decoding-method is beam_search or
modified_beam_search.""",
)
parser.add_argument(
"--beam",
type=float,
default=20.0,
help="""A floating point value to calculate the cutoff score during beam
search (i.e., `cutoff = max-score - beam`), which is the same as the
`beam` in Kaldi.
Used only when --decoding-method is fast_beam_search,
fast_beam_search_LG,
and fast_beam_search_nbest_oracle
""",
)
parser.add_argument(
"--ngram-lm-scale",
type=float,
default=0.01,
help="""
Used only when --decoding_method is fast_beam_search_LG.
It specifies the scale for n-gram LM scores.
""",
)
parser.add_argument(
"--ilme-scale",
type=float,
default=0.2,
help="""
Used only when --decoding_method is fast_beam_search_LG.
It specifies the scale for the internal language model estimation.
""",
)
parser.add_argument(
"--max-contexts",
type=int,
default=8,
help="""Used only when --decoding-method is
fast_beam_search, fast_beam_search_LG,
and fast_beam_search_nbest_oracle""",
)
parser.add_argument(
"--max-states",
type=int,
default=64,
help="""Used only when --decoding-method is
fast_beam_search, fast_beam_search_LG,
and fast_beam_search_nbest_oracle""",
)
parser.add_argument(
"--context-size",
type=int,
default=2,
help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
)
parser.add_argument(
"--max-sym-per-frame",
type=int,
default=1,
help="""Maximum number of symbols per frame.
Used only when --decoding_method is greedy_search""",
)
parser.add_argument(
"--num-paths",
type=int,
default=200,
help="""Number of paths for nbest decoding.
Used only when the decoding method is fast_beam_search_nbest_oracle""",
)
parser.add_argument(
"--nbest-scale",
type=float,
default=0.5,
help="""Scale applied to lattice scores when computing nbest paths.
Used only when the decoding method is fast_beam_search_nbest_oracle.
)
parser.add_argument(
"--blank-penalty",
type=float,
default=0.0,
help="""
The penalty applied on blank symbol during decoding.
Note: It is a positive value that would be applied to logits like
this `logits[:, 0] -= blank_penalty` (suppose logits.shape is
[batch_size, vocab] and blank id is 0).
""",
)
add_model_arguments(parser)
return parser
def decode_one_batch(
params: AttributeDict,
model: nn.Module,
sp: spm.SentencePieceProcessor,
lexicon: Lexicon,
batch: dict,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[List[str]]]:
"""Decode one batch and return the result in a dict. The dict has the
following format:
- key: It indicates the setting used for decoding. For example,
if greedy_search is used, it would be "greedy_search"
If beam search with a beam size of 7 is used, it would be
"beam_7"
- value: It contains the decoding result. `len(value)` equals the
batch size. `value[i]` is the decoding result for the i-th
utterance in the given batch.
Args:
params:
It's the return value of :func:`get_params`.
model:
The neural model.
batch:
It is the return value from iterating
`lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
for the format of the `batch`.
decoding_graph:
The decoding graph. Can be either a `k2.trivial_graph` or LG. Used
only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
Returns:
Return the decoding result. See above description for the format of
the returned dict.
"""
device = next(model.parameters()).device
feature = batch["inputs"]
assert feature.ndim == 3
feature = feature.to(device)
# at entry, feature is (N, T, C)
supervisions = batch["supervisions"]
feature_lens = supervisions["num_frames"].to(device)
if params.causal:
# this seems to cause insertions at the end of the utterance if used with zipformer.
pad_len = 30
feature_lens += pad_len
feature = torch.nn.functional.pad(
feature,
pad=(0, 0, 0, pad_len),
value=LOG_EPS,
)
x, x_lens = model.encoder_embed(feature, feature_lens)
src_key_padding_mask = make_pad_mask(x_lens)
x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C)
encoder_out, encoder_out_lens = model.encoder(x, x_lens, src_key_padding_mask)
encoder_out = encoder_out.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)
hyps = []
if params.decoding_method == "fast_beam_search":
hyp_tokens = fast_beam_search_one_best(
model=model,
decoding_graph=decoding_graph,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
beam=params.beam,
max_contexts=params.max_contexts,
max_states=params.max_states,
blank_penalty=params.blank_penalty,
)
for hyp in sp.decode(hyp_tokens):
hyps.append(smart_byte_decode(hyp).split())
elif params.decoding_method == "fast_beam_search_LG":
hyp_tokens = fast_beam_search_one_best(
model=model,
decoding_graph=decoding_graph,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
beam=params.beam,
max_contexts=params.max_contexts,
max_states=params.max_states,
blank_penalty=params.blank_penalty,
ilme_scale=params.ilme_scale,
)
for hyp in hyp_tokens:
hyps.append([lexicon.word_table[i] for i in hyp])
elif params.decoding_method == "fast_beam_search_nbest_oracle":
ref_texts = []
for tx in supervisions["text"]:
ref_texts.append(byte_encode(tokenize_by_CJK_char(tx)))
hyp_tokens = fast_beam_search_nbest_oracle(
model=model,
decoding_graph=decoding_graph,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
beam=params.beam,
max_contexts=params.max_contexts,
max_states=params.max_states,
num_paths=params.num_paths,
ref_texts=sp.encode(ref_texts),
nbest_scale=params.nbest_scale,
blank_penalty=params.blank_penalty,
)
for hyp in sp.decode(hyp_tokens):
hyps.append(smart_byte_decode(hyp).split())
elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
hyp_tokens = greedy_search_batch(
model=model,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
blank_penalty=params.blank_penalty,
)
for hyp in sp.decode(hyp_tokens):
hyps.append(smart_byte_decode(hyp).split())
elif params.decoding_method == "modified_beam_search":
hyp_tokens = modified_beam_search(
model=model,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
blank_penalty=params.blank_penalty,
beam=params.beam_size,
)
for hyp in sp.decode(hyp_tokens):
hyps.append(smart_byte_decode(hyp).split())
else:
batch_size = encoder_out.size(0)
for i in range(batch_size):
# fmt: off
encoder_out_i = encoder_out[i:i + 1, :encoder_out_lens[i]]
# fmt: on
if params.decoding_method == "greedy_search":
hyp = greedy_search(
model=model,
encoder_out=encoder_out_i,
max_sym_per_frame=params.max_sym_per_frame,
blank_penalty=params.blank_penalty,
)
elif params.decoding_method == "beam_search":
hyp = beam_search(
model=model,
encoder_out=encoder_out_i,
beam=params.beam_size,
blank_penalty=params.blank_penalty,
)
else:
raise ValueError(
f"Unsupported decoding method: {params.decoding_method}"
)
hyps.append(smart_byte_decode(sp.decode(hyp)).split())
key = f"blank_penalty_{params.blank_penalty}"
if params.decoding_method == "greedy_search":
return {"greedy_search_" + key: hyps}
elif "fast_beam_search" in params.decoding_method:
key += f"_beam_{params.beam}_"
key += f"max_contexts_{params.max_contexts}_"
key += f"max_states_{params.max_states}"
if "nbest" in params.decoding_method:
key += f"_num_paths_{params.num_paths}_"
key += f"nbest_scale_{params.nbest_scale}"
if "LG" in params.decoding_method:
key += f"_ilme_scale_{params.ilme_scale}"
key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
return {key: hyps}
else:
return {f"beam_size_{params.beam_size}_" + key: hyps}
def decode_dataset(
dl: torch.utils.data.DataLoader,
params: AttributeDict,
model: nn.Module,
lexicon: Lexicon,
sp: spm.SentencePieceProcessor,
decoding_graph: Optional[k2.Fsa] = None,
) -> Dict[str, List[Tuple[List[str], List[str]]]]:
"""Decode dataset.
Args:
dl:
PyTorch's dataloader containing the dataset to decode.
params:
It is returned by :func:`get_params`.
model:
The neural model.
lexicon:
The lexicon; its word table maps word IDs to words for fast_beam_search_LG.
sp:
SentencePiece model.
decoding_graph:
The decoding graph. Can be either a `k2.trivial_graph` or LG. Used
only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
Returns:
Return a dict, whose key may be "greedy_search" if greedy search
is used, or it may be "beam_7" if beam size of 7 is used.
Its value is a list of tuples. Each tuple contains three elements:
the cut ID, the reference transcript, and the predicted result.
"""
num_cuts = 0
try:
num_batches = len(dl)
except TypeError:
num_batches = "?"
if params.decoding_method == "greedy_search":
log_interval = 50
else:
log_interval = 20
results = defaultdict(list)
for batch_idx, batch in enumerate(dl):
texts = batch["supervisions"]["text"]
cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
hyps_dict = decode_one_batch(
params=params,
model=model,
sp=sp,
lexicon=lexicon,
decoding_graph=decoding_graph,
batch=batch,
)
for name, hyps in hyps_dict.items():
this_batch = []
assert len(hyps) == len(texts)
for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
ref_words = "".join(ref_text.split())
this_batch.append((cut_id, ref_words, hyp_words))
results[name].extend(this_batch)
num_cuts += len(texts)
if batch_idx % log_interval == 0:
batch_str = f"{batch_idx}/{num_batches}"
logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
return results
def save_results(
params: AttributeDict,
test_set_name: str,
results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
):
test_set_wers = dict()
for key, results in results_dict.items():
recog_path = (
params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
)
results = sorted(results)
store_transcripts(filename=recog_path, texts=results)
logging.info(f"The transcripts are stored in {recog_path}")
# The following prints out WERs, per-word error statistics and aligned
# ref/hyp pairs.
errs_filename = (
params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
)
results_char = []
for res in results:
results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
with open(errs_filename, "w") as f:
wer = write_error_stats(
f, f"{test_set_name}-{key}", results_char, enable_log=True
)
test_set_wers[key] = wer
logging.info("Wrote detailed error stats to {}".format(errs_filename))
test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
errs_info = (
params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
)
with open(errs_info, "w") as f:
print("settings\tWER", file=f)
for key, val in test_set_wers:
print("{}\t{}".format(key, val), file=f)
s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
note = "\tbest for {}".format(test_set_name)
for key, val in test_set_wers:
s += "{}\t{}{}\n".format(key, val, note)
note = ""
logging.info(s)
@torch.no_grad()
def main():
parser = get_parser()
AishellAsrDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
params = get_params()
params.update(vars(args))
assert params.decoding_method in (
"greedy_search",
"beam_search",
"modified_beam_search",
"fast_beam_search",
"fast_beam_search_LG",
"fast_beam_search_nbest_oracle",
)
params.res_dir = params.exp_dir / params.decoding_method
if params.iter > 0:
params.suffix = f"iter-{params.iter}-avg-{params.avg}"
else:
params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
if params.causal:
assert (
"," not in params.chunk_size
), "chunk_size should be one value in decoding."
assert (
"," not in params.left_context_frames
), "left_context_frames should be one value in decoding."
params.suffix += f"-chunk-{params.chunk_size}"
params.suffix += f"-left-context-{params.left_context_frames}"
if "fast_beam_search" in params.decoding_method:
params.suffix += f"-beam-{params.beam}"
params.suffix += f"-max-contexts-{params.max_contexts}"
params.suffix += f"-max-states-{params.max_states}"
if "nbest" in params.decoding_method:
params.suffix += f"-nbest-scale-{params.nbest_scale}"
params.suffix += f"-num-paths-{params.num_paths}"
if "LG" in params.decoding_method:
params.suffix += f"_ilme_scale_{params.ilme_scale}"
params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
elif "beam_search" in params.decoding_method:
params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
else:
params.suffix += f"-context-{params.context_size}"
params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
params.suffix += f"-blank-penalty-{params.blank_penalty}"
if params.use_averaged_model:
params.suffix += "-use-averaged-model"
setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
logging.info("Decoding started")
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
logging.info(f"Device: {device}")
sp = spm.SentencePieceProcessor()
sp.load(params.bpe_model)
# <blk> and <unk> are defined in local/train_bbpe_model.py
params.blank_id = sp.piece_to_id("<blk>")
params.unk_id = sp.piece_to_id("<unk>")
params.vocab_size = sp.get_piece_size()
lexicon = Lexicon(params.lang_dir)
logging.info(params)
logging.info("About to create model")
model = get_model(params)
if not params.use_averaged_model:
if params.iter > 0:
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
: params.avg
]
if len(filenames) == 0:
raise ValueError(
f"No checkpoints found for"
f" --iter {params.iter}, --avg {params.avg}"
)
elif len(filenames) < params.avg:
raise ValueError(
f"Not enough checkpoints ({len(filenames)}) found for"
f" --iter {params.iter}, --avg {params.avg}"
)
logging.info(f"averaging {filenames}")
model.to(device)
model.load_state_dict(average_checkpoints(filenames, device=device))
elif params.avg == 1:
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
else:
start = params.epoch - params.avg + 1
filenames = []
for i in range(start, params.epoch + 1):
if i >= 1:
filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
logging.info(f"averaging {filenames}")
model.to(device)
model.load_state_dict(average_checkpoints(filenames, device=device))
else:
if params.iter > 0:
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
: params.avg + 1
]
if len(filenames) == 0:
raise ValueError(
f"No checkpoints found for"
f" --iter {params.iter}, --avg {params.avg}"
)
elif len(filenames) < params.avg + 1:
raise ValueError(
f"Not enough checkpoints ({len(filenames)}) found for"
f" --iter {params.iter}, --avg {params.avg}"
)
filename_start = filenames[-1]
filename_end = filenames[0]
logging.info(
"Calculating the averaged model over iteration checkpoints"
f" from {filename_start} (excluded) to {filename_end}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
else:
assert params.avg > 0, params.avg
start = params.epoch - params.avg
assert start >= 1, start
filename_start = f"{params.exp_dir}/epoch-{start}.pt"
filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
logging.info(
f"Calculating the averaged model over epoch range from "
f"{start} (excluded) to {params.epoch}"
)
model.to(device)
model.load_state_dict(
average_checkpoints_with_averaged_model(
filename_start=filename_start,
filename_end=filename_end,
device=device,
)
)
model.to(device)
model.eval()
if "fast_beam_search" in params.decoding_method:
if "LG" in params.decoding_method:
lexicon = Lexicon(params.lang_dir)
lg_filename = params.lang_dir / "LG.pt"
logging.info(f"Loading {lg_filename}")
decoding_graph = k2.Fsa.from_dict(
torch.load(lg_filename, map_location=device)
)
decoding_graph.scores *= params.ngram_lm_scale
else:
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
else:
decoding_graph = None
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
# we need cut ids to display recognition results.
args.return_cuts = True
aishell = AishellAsrDataModule(args)
def remove_short_utt(c: Cut):
T = ((c.num_frames - 7) // 2 + 1) // 2
if T <= 0:
logging.warning(
f"Exclude cut with ID {c.id} from decoding, num_frames : {c.num_frames}."
)
return T > 0
dev_cuts = aishell.valid_cuts()
dev_cuts = dev_cuts.filter(remove_short_utt)
dev_dl = aishell.valid_dataloaders(dev_cuts)
test_cuts = aishell.test_cuts()
test_cuts = test_cuts.filter(remove_short_utt)
test_dl = aishell.test_dataloaders(test_cuts)
test_sets = ["dev", "test"]
test_dls = [dev_dl, test_dl]
for test_set, test_dl in zip(test_sets, test_dls):
results_dict = decode_dataset(
dl=test_dl,
params=params,
model=model,
lexicon=lexicon,
sp=sp,
decoding_graph=decoding_graph,
)
save_results(
params=params,
test_set_name=test_set,
results_dict=results_dict,
)
logging.info("Done!")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,279 @@
#!/usr/bin/env python3
# Copyright 2021-2024 Xiaomi Corporation (Author: Fangjun Kuang,
# Zengwei Yao,
# Zengrui Jin,)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script loads torchscript models, exported by `torch.jit.script()`
and uses them to decode waves.
You can use the following command to get the exported models:
./zipformer/export.py \
--exp-dir ./zipformer_bbpe/exp \
--bpe ./data/lang_bbpe_500/bbpe.model \
--epoch 30 \
--avg 9 \
--jit 1
Usage of this script:
./zipformer/jit_pretrained.py \
--nn-model-filename ./zipformer_bbpe/exp/cpu_jit.pt \
--bpe ./data/lang_bbpe_500/bbpe.model \
/path/to/foo.wav \
/path/to/bar.wav
"""
import argparse
import logging
import math
from typing import List
import kaldifeat
import sentencepiece as spm
import torch
import torchaudio
from torch.nn.utils.rnn import pad_sequence
from icefall import smart_byte_decode
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--nn-model-filename",
type=str,
required=True,
help="Path to the torchscript model cpu_jit.pt",
)
parser.add_argument(
"--bpe-model",
type=str,
required=True,
help="""Path to the bbpe.model.""",
)
parser.add_argument(
"sound_files",
type=str,
nargs="+",
help="The input sound file(s) to transcribe. "
"Supported formats are those supported by torchaudio.load(). "
"For example, wav and flac are supported. "
"The sample rate has to be 16kHz.",
)
return parser
def read_sound_files(
filenames: List[str], expected_sample_rate: float = 16000
) -> List[torch.Tensor]:
"""Read a list of sound files into a list 1-D float32 torch tensors.
Args:
filenames:
A list of sound filenames.
expected_sample_rate:
The expected sample rate of the sound files.
Returns:
Return a list of 1-D float32 torch tensors.
"""
ans = []
for f in filenames:
wave, sample_rate = torchaudio.load(f)
assert (
sample_rate == expected_sample_rate
), f"expected sample rate: {expected_sample_rate}. Given: {sample_rate}"
# We use only the first channel
ans.append(wave[0].contiguous())
return ans
def greedy_search(
model: torch.jit.ScriptModule,
encoder_out: torch.Tensor,
encoder_out_lens: torch.Tensor,
) -> List[List[int]]:
"""Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
Args:
model:
The transducer model.
encoder_out:
A 3-D tensor of shape (N, T, C)
encoder_out_lens:
A 1-D tensor of shape (N,).
Returns:
Return the decoded results for each utterance.
"""
assert encoder_out.ndim == 3
assert encoder_out.size(0) >= 1, encoder_out.size(0)
packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
input=encoder_out,
lengths=encoder_out_lens.cpu(),
batch_first=True,
enforce_sorted=False,
)
device = encoder_out.device
blank_id = model.decoder.blank_id
batch_size_list = packed_encoder_out.batch_sizes.tolist()
N = encoder_out.size(0)
assert torch.all(encoder_out_lens > 0), encoder_out_lens
assert N == batch_size_list[0], (N, batch_size_list)
context_size = model.decoder.context_size
hyps = [[blank_id] * context_size for _ in range(N)]
decoder_input = torch.tensor(
hyps,
device=device,
dtype=torch.int64,
) # (N, context_size)
decoder_out = model.decoder(
decoder_input,
need_pad=torch.tensor([False]),
).squeeze(1)
offset = 0
for batch_size in batch_size_list:
start = offset
end = offset + batch_size
current_encoder_out = packed_encoder_out.data[start:end]
# current_encoder_out's shape: (batch_size, encoder_out_dim)
offset = end
decoder_out = decoder_out[:batch_size]
logits = model.joiner(
current_encoder_out,
decoder_out,
)
# logits' shape: (batch_size, vocab_size)
assert logits.ndim == 2, logits.shape
y = logits.argmax(dim=1).tolist()
emitted = False
for i, v in enumerate(y):
if v != blank_id:
hyps[i].append(v)
emitted = True
if emitted:
# update decoder output
decoder_input = [h[-context_size:] for h in hyps[:batch_size]]
decoder_input = torch.tensor(
decoder_input,
device=device,
dtype=torch.int64,
)
decoder_out = model.decoder(
decoder_input,
need_pad=torch.tensor([False]),
)
decoder_out = decoder_out.squeeze(1)
sorted_ans = [h[context_size:] for h in hyps]
ans = []
unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
for i in range(N):
ans.append(sorted_ans[unsorted_indices[i]])
return ans
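# Worked example (illustrative): with encoder_out_lens = [4, 2, 3], packing
# reorders the utterances by length (4, 3, 2) and gives batch_sizes =
# [3, 3, 2, 1]; at time step t only the utterances longer than t are advanced,
# and unsorted_indices restores the original utterance order at the end.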
@torch.no_grad()
def main():
parser = get_parser()
args = parser.parse_args()
logging.info(vars(args))
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
logging.info(f"device: {device}")
model = torch.jit.load(args.nn_model_filename)
model.eval()
model.to(device)
sp = spm.SentencePieceProcessor()
sp.load(args.bpe_model)
logging.info("Constructing Fbank computer")
opts = kaldifeat.FbankOptions()
opts.device = device
opts.frame_opts.dither = 0
opts.frame_opts.snip_edges = False
opts.frame_opts.samp_freq = 16000
opts.mel_opts.num_bins = 80
opts.mel_opts.high_freq = -400
fbank = kaldifeat.Fbank(opts)
logging.info(f"Reading sound files: {args.sound_files}")
waves = read_sound_files(
filenames=args.sound_files,
)
waves = [w.to(device) for w in waves]
logging.info("Decoding started")
features = fbank(waves)
feature_lengths = [f.size(0) for f in features]
features = pad_sequence(
features,
batch_first=True,
padding_value=math.log(1e-10),
)
feature_lengths = torch.tensor(feature_lengths, device=device)
encoder_out, encoder_out_lens = model.encoder(
features=features,
feature_lengths=feature_lengths,
)
hyps = greedy_search(
model=model,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
)
s = "\n"
for filename, hyp in zip(args.sound_files, hyps):
words = smart_byte_decode(sp.decode(hyp))
s += f"{filename}:\n{words}\n\n"
logging.info(s)
logging.info("Decoding Done")
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
main()

View File

@ -0,0 +1,403 @@
#!/usr/bin/env python3
# Copyright 2021-2024 Xiaomi Corporation (Author: Fangjun Kuang,
# Zengwei Yao,
# Zengrui Jin,)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script loads a checkpoint and uses it to decode waves.
You can generate the checkpoint with the following command:
Note: This is an example for the librispeech dataset; if you are using a
different dataset, you should change the argument values according to your dataset.
- For non-streaming model:
./zipformer/export.py \
--exp-dir ./zipformer/exp_bbpe \
--tokens ./data/lang_bbpe_500/tokens.txt \
--epoch 30 \
--avg 9
- For streaming model:
./zipformer/export.py \
--exp-dir ./zipformer/exp_bbpe \
--causal 1 \
--tokens ./data/lang_bbpe_500/tokens.txt \
--epoch 30 \
--avg 9
Usage of this script:
- For non-streaming model:
(1) greedy search
./zipformer/pretrained_bbpe.py \
--checkpoint ./zipformer/exp_bbpe/pretrained.pt \
--bpe ./data/lang_bbpe_500/bbpe.model \
--method greedy_search \
/path/to/foo.wav \
/path/to/bar.wav
(2) modified beam search
./zipformer/pretrained_bbpe.py \
--checkpoint ./zipformer/exp_bbpe/pretrained.pt \
--bpe ./data/lang_bbpe_500/bbpe.model \
--method modified_beam_search \
/path/to/foo.wav \
/path/to/bar.wav
(3) fast beam search
./zipformer/pretrained_bbpe.py \
--checkpoint ./zipformer/exp_bbpe/pretrained.pt \
--bpe ./data/lang_bbpe_500/bbpe.model \
--method fast_beam_search \
/path/to/foo.wav \
/path/to/bar.wav
- For streaming model:
(1) greedy search
./zipformer/pretrained_bbpe.py \
--checkpoint ./zipformer/exp_bbpe/pretrained.pt \
--causal 1 \
--chunk-size 16 \
--left-context-frames 128 \
--bpe ./data/lang_bbpe_500/bbpe.model \
--method greedy_search \
/path/to/foo.wav \
/path/to/bar.wav
(2) modified beam search
./zipformer/pretrained_bbpe.py \
--checkpoint ./zipformer/exp_bbpe/pretrained.pt \
--causal 1 \
--chunk-size 16 \
--left-context-frames 128 \
--bpe ./data/lang_bbpe_500/bbpe.model \
--method modified_beam_search \
/path/to/foo.wav \
/path/to/bar.wav
(3) fast beam search
./zipformer/pretrained_bbpe.py \
--checkpoint ./zipformer/exp_bbpe/pretrained.pt \
--causal 1 \
--chunk-size 16 \
--left-context-frames 128 \
--bpe ./data/lang_bbpe_500/bbpe.model \
--method fast_beam_search \
/path/to/foo.wav \
/path/to/bar.wav
You can also use `./zipformer/exp_bbpe/epoch-xx.pt`.
Note: ./zipformer/exp_bbpe/pretrained.pt is generated by ./zipformer/export_bbpe.py
"""
import argparse
import logging
import math
from typing import List
import k2
import kaldifeat
import sentencepiece as spm
import torch
import torchaudio
from beam_search import (
beam_search,
fast_beam_search_one_best,
greedy_search,
greedy_search_batch,
modified_beam_search,
)
from torch.nn.utils.rnn import pad_sequence
from train import add_model_arguments, get_model, get_params
from icefall import smart_byte_decode
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--checkpoint",
type=str,
required=True,
help="Path to the checkpoint. "
"The checkpoint is assumed to be saved by "
"icefall.checkpoint.save_checkpoint().",
)
parser.add_argument(
"--bpe-model",
type=str,
required=True,
help="""Path to the bbpe.model.""",
)
parser.add_argument(
"--method",
type=str,
default="greedy_search",
help="""Possible values are:
- greedy_search
- modified_beam_search
- fast_beam_search
""",
)
parser.add_argument(
"sound_files",
type=str,
nargs="+",
help="The input sound file(s) to transcribe. "
"Supported formats are those supported by torchaudio.load(). "
"For example, wav and flac are supported. "
"The sample rate has to be 16kHz.",
)
parser.add_argument(
"--sample-rate",
type=int,
default=16000,
help="The sample rate of the input sound file",
)
parser.add_argument(
"--beam-size",
type=int,
default=4,
help="""An integer indicating how many candidates we will keep for each
frame. Used only when --method is beam_search or
modified_beam_search.""",
)
parser.add_argument(
"--beam",
type=float,
default=4,
help="""A floating point value to calculate the cutoff score during beam
search (i.e., `cutoff = max-score - beam`), which is the same as the
`beam` in Kaldi.
Used only when --method is fast_beam_search""",
)
parser.add_argument(
"--max-contexts",
type=int,
default=4,
help="""Used only when --method is fast_beam_search""",
)
parser.add_argument(
"--max-states",
type=int,
default=8,
help="""Used only when --method is fast_beam_search""",
)
parser.add_argument(
"--context-size",
type=int,
default=2,
help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
)
parser.add_argument(
"--max-sym-per-frame",
type=int,
default=1,
help="""Maximum number of symbols per frame. Used only when
--method is greedy_search.
""",
)
add_model_arguments(parser)
return parser
def read_sound_files(
filenames: List[str], expected_sample_rate: float
) -> List[torch.Tensor]:
"""Read a list of sound files into a list 1-D float32 torch tensors.
Args:
filenames:
A list of sound filenames.
expected_sample_rate:
The expected sample rate of the sound files.
Returns:
Return a list of 1-D float32 torch tensors.
"""
ans = []
for f in filenames:
wave, sample_rate = torchaudio.load(f)
assert (
sample_rate == expected_sample_rate
), f"expected sample rate: {expected_sample_rate}. Given: {sample_rate}"
# We use only the first channel
ans.append(wave[0].contiguous())
return ans
@torch.no_grad()
def main():
parser = get_parser()
args = parser.parse_args()
params = get_params()
params.update(vars(args))
sp = spm.SentencePieceProcessor()
sp.load(params.bpe_model)
# <blk> is defined in local/train_bpe_model.py
params.blank_id = sp.piece_to_id("<blk>")
params.unk_id = sp.piece_to_id("<unk>")
params.vocab_size = sp.get_piece_size()
logging.info(f"{params}")
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
logging.info(f"device: {device}")
if params.causal:
assert (
"," not in params.chunk_size
), "chunk_size should be one value in decoding."
assert (
"," not in params.left_context_frames
), "left_context_frames should be one value in decoding."
logging.info("Creating model")
model = get_model(params)
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
checkpoint = torch.load(args.checkpoint, map_location="cpu")
model.load_state_dict(checkpoint["model"], strict=False)
model.to(device)
model.eval()
logging.info("Constructing Fbank computer")
opts = kaldifeat.FbankOptions()
opts.device = device
opts.frame_opts.dither = 0
opts.frame_opts.snip_edges = False
opts.frame_opts.samp_freq = params.sample_rate
opts.mel_opts.num_bins = params.feature_dim
opts.mel_opts.high_freq = -400
fbank = kaldifeat.Fbank(opts)
logging.info(f"Reading sound files: {params.sound_files}")
waves = read_sound_files(
filenames=params.sound_files, expected_sample_rate=params.sample_rate
)
waves = [w.to(device) for w in waves]
logging.info("Decoding started")
features = fbank(waves)
feature_lengths = [f.size(0) for f in features]
features = pad_sequence(features, batch_first=True, padding_value=math.log(1e-10))
feature_lengths = torch.tensor(feature_lengths, device=device)
# model forward
encoder_out, encoder_out_lens = model.forward_encoder(features, feature_lengths)
num_waves = encoder_out.size(0)
hyps = []
msg = f"Using {params.method}"
logging.info(msg)
if params.method == "fast_beam_search":
decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
hyp_tokens = fast_beam_search_one_best(
model=model,
decoding_graph=decoding_graph,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
beam=params.beam,
max_contexts=params.max_contexts,
max_states=params.max_states,
)
for hyp in sp.decode(hyp_tokens):
hyps.append(smart_byte_decode(hyp).split())
elif params.method == "modified_beam_search":
hyp_tokens = modified_beam_search(
model=model,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
beam=params.beam_size,
)
for hyp in sp.decode(hyp_tokens):
hyps.append(smart_byte_decode(hyp).split())
elif params.method == "greedy_search" and params.max_sym_per_frame == 1:
hyp_tokens = greedy_search_batch(
model=model,
encoder_out=encoder_out,
encoder_out_lens=encoder_out_lens,
)
for hyp in sp.decode(hyp_tokens):
hyps.append(smart_byte_decode(hyp).split())
else:
for i in range(num_waves):
# fmt: off
encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
# fmt: on
if params.method == "greedy_search":
hyp = greedy_search(
model=model,
encoder_out=encoder_out_i,
max_sym_per_frame=params.max_sym_per_frame,
)
elif params.method == "beam_search":
hyp = beam_search(
model=model,
encoder_out=encoder_out_i,
beam=params.beam_size,
)
else:
raise ValueError(f"Unsupported method: {params.method}")
hyps.append(smart_byte_decode(sp.decode(hyp)).split())
s = "\n"
for filename, hyp in zip(params.sound_files, hyps):
words = " ".join(hyp)
s += f"{filename}:\n{words}\n\n"
logging.info(s)
logging.info("Decoding Done")
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
main()

View File

@ -86,6 +86,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import (
@ -985,9 +986,7 @@ def train_one_epoch(
 logging.warning(f"Grad scale is small: {cur_grad_scale}")
 if cur_grad_scale < 1.0e-05:
 save_bad_model()
-raise RuntimeError(
-    f"grad_scale is too small, exiting: {cur_grad_scale}"
-)
+raise_grad_scale_is_too_small_error(cur_grad_scale)
 if batch_idx % params.log_interval == 0:
 cur_lr = max(scheduler.get_last_lr())

View File

@ -0,0 +1,941 @@
#!/usr/bin/env python3
# Copyright 2021-2024 Xiaomi Corp. (authors: Fangjun Kuang,
# Wei Kang,
# Mingshuang Luo,
# Zengwei Yao,
# Daniel Povey,
# Zengrui Jin,)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
./zipformer/train_bbpe.py \
--world-size 8 \
--num-epochs 12 \
--start-epoch 1 \
--exp-dir zipformer/exp_bbpe \
--max-duration 350
# For mixed precision training:
./zipformer/train_bbpe.py \
--world-size 8 \
--num-epochs 12 \
--start-epoch 1 \
--use-fp16 1 \
--exp-dir zipformer/exp_bbpe \
--max-duration 750
"""
import argparse
import copy
import logging
import warnings
from pathlib import Path
from typing import Optional, Tuple, Union
import k2
import sentencepiece as spm
import torch
import torch.multiprocessing as mp
import torch.nn as nn
from asr_datamodule import AishellAsrDataModule
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed
from optim import Eden, ScaledAdam
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter
from train import (
LRSchedulerType,
add_model_arguments,
get_adjusted_batch_count,
get_model,
get_params,
load_checkpoint_if_available,
save_checkpoint,
set_batch_count,
)
from icefall import byte_encode, diagnostics
from icefall.checkpoint import remove_checkpoints
from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
from icefall.checkpoint import (
save_checkpoint_with_global_batch_idx,
update_averaged_model,
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import (
AttributeDict,
MetricsTracker,
get_parameter_groups_with_lrs,
setup_logger,
str2bool,
tokenize_by_CJK_char,
)
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--world-size",
type=int,
default=1,
help="Number of GPUs for DDP training.",
)
parser.add_argument(
"--master-port",
type=int,
default=12354,
help="Master port to use for DDP training.",
)
parser.add_argument(
"--tensorboard",
type=str2bool,
default=True,
help="Should various information be logged in tensorboard.",
)
parser.add_argument(
"--num-epochs",
type=int,
default=30,
help="Number of epochs to train.",
)
parser.add_argument(
"--start-epoch",
type=int,
default=1,
help="""Resume training from this epoch. It should be positive.
If larger than 1, it will load checkpoint from
exp-dir/epoch-{start_epoch-1}.pt
""",
)
parser.add_argument(
"--start-batch",
type=int,
default=0,
help="""If positive, --start-epoch is ignored and
it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
""",
)
parser.add_argument(
"--exp-dir",
type=str,
default="zipformer_bbpe/exp",
help="""The experiment dir.
It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved
""",
)
parser.add_argument(
"--bpe-model",
type=str,
default="data/lang_bbpe_500/bbpe.model",
help="Path to the Byte BPE model",
)
parser.add_argument(
"--base-lr", type=float, default=0.045, help="The base learning rate."
)
parser.add_argument(
"--lr-batches",
type=float,
default=7500,
help="""Number of steps that affects how rapidly the learning rate
decreases. We suggest not changing this.""",
)
parser.add_argument(
"--lr-epochs",
type=float,
default=3.5,
help="""Number of epochs that affects how rapidly the learning rate decreases.
""",
)
parser.add_argument(
"--ref-duration",
type=float,
default=600,
help="""Reference batch duration for purposes of adjusting batch counts for setting various schedules inside the model""",
)
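# For example (assuming the usual icefall definition of
# get_adjusted_batch_count): with --max-duration 750, --world-size 8 and the
# default --ref-duration 600, each training batch counts as
# 750 * 8 / 600 = 10 reference batches for the in-model schedules.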
parser.add_argument(
"--context-size",
type=int,
default=2,
help="""The context size in the decoder. 1 means bigram; 2 means tri-gram""",
)
parser.add_argument(
"--prune-range",
type=int,
default=5,
help="""The prune range for rnnt loss, it means how many symbols(context)
we are using to compute the loss""",
)
parser.add_argument(
"--lm-scale",
type=float,
default=0.25,
help="""The scale to smooth the loss with lm
(output of prediction network) part.""",
)
parser.add_argument(
"--am-scale",
type=float,
default=0.0,
help="""The scale to smooth the loss with am (output of encoder network) part.""",
)
parser.add_argument(
"--simple-loss-scale",
type=float,
default=0.5,
help="""To get pruning ranges, we will calculate a simple version
loss(joiner is just addition), this simple loss also uses for
training (as a regularization item). We will scale the simple loss
with this parameter before adding to the final loss.""",
)
parser.add_argument(
"--seed",
type=int,
default=42,
help="The seed for random generators intended for reproducibility",
)
parser.add_argument(
"--print-diagnostics",
type=str2bool,
default=False,
help="Accumulate stats on activations, print them and exit.",
)
parser.add_argument(
"--inf-check",
type=str2bool,
default=False,
help="Add hooks to check for infinite module outputs and gradients.",
)
parser.add_argument(
"--save-every-n",
type=int,
default=4000,
help="""Save checkpoint after processing this number of batches"
periodically. We save checkpoint to exp-dir/ whenever
params.batch_idx_train % save_every_n == 0. The checkpoint filename
has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
end of each epoch where `xxx` is the epoch number counting from 0.
""",
)
parser.add_argument(
"--keep-last-k",
type=int,
default=30,
help="""Only keep this number of checkpoints on disk.
For instance, if it is 3, there are only 3 checkpoints
in the exp-dir with filenames `checkpoint-xxx.pt`.
It does not affect checkpoints with name `epoch-xxx.pt`.
""",
)
parser.add_argument(
"--average-period",
type=int,
default=200,
help="""Update the averaged model, namely `model_avg`, after processing
this number of batches. `model_avg` is a separate version of model,
in which each floating-point parameter is the average of all the
parameters from the start of training. Each time we take the average,
we do: `model_avg = model * (average_period / batch_idx_train) +
model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
""",
)
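# Worked example (illustrative): with average_period = 200 and
# batch_idx_train = 1000, the update is
#     model_avg = model * (200 / 1000) + model_avg * (800 / 1000)
# which keeps model_avg equal to the running mean of the parameters seen
# since the start of training.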
parser.add_argument(
"--use-fp16",
type=str2bool,
default=False,
help="Whether to use half precision training.",
)
add_model_arguments(parser)
return parser
def compute_loss(
params: AttributeDict,
model: Union[nn.Module, DDP],
sp: spm.SentencePieceProcessor,
batch: dict,
is_training: bool,
) -> Tuple[Tensor, MetricsTracker]:
"""
Compute the pruned RNN-T loss given the model and its inputs.
Args:
params:
Parameters for training. See :func:`get_params`.
model:
The model for training. It is an instance of Zipformer in our case.
batch:
A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
for the content in it.
is_training:
True for training. False for validation. When it is True, this
function enables autograd during computation; when it is False, it
disables autograd.
"""
device = model.device if isinstance(model, DDP) else next(model.parameters()).device
feature = batch["inputs"]
# at entry, feature is (N, T, C)
assert feature.ndim == 3
feature = feature.to(device)
supervisions = batch["supervisions"]
feature_lens = supervisions["num_frames"].to(device)
batch_idx_train = params.batch_idx_train
warm_step = params.warm_step
texts = batch["supervisions"]["text"]
y = sp.encode(texts, out_type=int)
y = k2.RaggedTensor(y).to(device)
with torch.set_grad_enabled(is_training):
simple_loss, pruned_loss, _ = model(
x=feature,
x_lens=feature_lens,
y=y,
prune_range=params.prune_range,
am_scale=params.am_scale,
lm_scale=params.lm_scale,
)
s = params.simple_loss_scale
# take down the scale on the simple loss from 1.0 at the start
# to params.simple_loss_scale by warm_step.
simple_loss_scale = (
s
if batch_idx_train >= warm_step
else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
)
pruned_loss_scale = (
1.0
if batch_idx_train >= warm_step
else 0.1 + 0.9 * (batch_idx_train / warm_step)
)
loss = simple_loss_scale * simple_loss + pruned_loss_scale * pruned_loss
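# Worked example (illustrative): with s = 0.5 and warm_step = 2000, at
# batch_idx_train = 1000 we get simple_loss_scale = 1.0 - 0.5 * 0.5 = 0.75
# and pruned_loss_scale = 0.1 + 0.9 * 0.5 = 0.55; once batch_idx_train
# reaches warm_step the scales settle at 0.5 and 1.0 respectively.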
assert loss.requires_grad == is_training
info = MetricsTracker()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
# Note: We use reduction=sum while computing the loss.
info["loss"] = loss.detach().cpu().item()
info["simple_loss"] = simple_loss.detach().cpu().item()
info["pruned_loss"] = pruned_loss.detach().cpu().item()
return loss, info
def compute_validation_loss(
params: AttributeDict,
model: Union[nn.Module, DDP],
sp: spm.SentencePieceProcessor,
valid_dl: torch.utils.data.DataLoader,
world_size: int = 1,
) -> MetricsTracker:
"""Run the validation process."""
model.eval()
tot_loss = MetricsTracker()
for batch_idx, batch in enumerate(valid_dl):
loss, loss_info = compute_loss(
params=params,
model=model,
sp=sp,
batch=batch,
is_training=False,
)
assert loss.requires_grad is False
tot_loss = tot_loss + loss_info
if world_size > 1:
tot_loss.reduce(loss.device)
loss_value = tot_loss["loss"] / tot_loss["frames"]
if loss_value < params.best_valid_loss:
params.best_valid_epoch = params.cur_epoch
params.best_valid_loss = loss_value
return tot_loss
def train_one_epoch(
params: AttributeDict,
model: Union[nn.Module, DDP],
optimizer: torch.optim.Optimizer,
scheduler: LRSchedulerType,
sp: spm.SentencePieceProcessor,
train_dl: torch.utils.data.DataLoader,
valid_dl: torch.utils.data.DataLoader,
scaler: GradScaler,
model_avg: Optional[nn.Module] = None,
tb_writer: Optional[SummaryWriter] = None,
world_size: int = 1,
rank: int = 0,
) -> None:
"""Train the model for one epoch.
The training loss from the mean of all frames is saved in
`params.train_loss`. It runs the validation process every
`params.valid_interval` batches.
Args:
params:
It is returned by :func:`get_params`.
model:
The model for training.
optimizer:
The optimizer we are using.
scheduler:
The learning rate scheduler, we call step() every step.
train_dl:
Dataloader for the training dataset.
valid_dl:
Dataloader for the validation dataset.
scaler:
The scaler used for mixed precision training.
model_avg:
The stored model averaged from the start of training.
tb_writer:
Writer to write log messages to tensorboard.
world_size:
Number of nodes in DDP training. If it is 1, DDP is disabled.
rank:
The rank of the node in DDP training. If no DDP is used, it should
be set to 0.
"""
model.train()
tot_loss = MetricsTracker()
cur_batch_idx = params.get("cur_batch_idx", 0)
saved_bad_model = False
def save_bad_model(suffix: str = ""):
save_checkpoint_impl(
filename=params.exp_dir / f"bad-model{suffix}-{rank}.pt",
model=model,
model_avg=model_avg,
params=params,
optimizer=optimizer,
scheduler=scheduler,
sampler=train_dl.sampler,
scaler=scaler,
rank=0,
)
for batch_idx, batch in enumerate(train_dl):
if batch_idx % 10 == 0:
set_batch_count(model, get_adjusted_batch_count(params))
if batch_idx < cur_batch_idx:
continue
cur_batch_idx = batch_idx
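# Note (illustrative): when resuming from a mid-epoch checkpoint,
# cur_batch_idx was saved in the checkpoint, so the two lines above
# fast-forward through the already-seen batches to keep the sampler and
# the batch counter aligned.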
params.batch_idx_train += 1
batch_size = len(batch["supervisions"]["text"])
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
loss, loss_info = compute_loss(
params=params,
model=model,
sp=sp,
batch=batch,
is_training=True,
)
# summary stats
tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
# NOTE: We use reduction==sum and loss is computed over utterances
# in the batch and there is no normalization to it so far.
scaler.scale(loss).backward()
scheduler.step_batch(params.batch_idx_train)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
except: # noqa
save_bad_model()
display_and_save_batch(batch, params=params, sp=sp)
raise
if params.print_diagnostics and batch_idx == 5:
return
if (
rank == 0
and params.batch_idx_train > 0
and params.batch_idx_train % params.average_period == 0
):
update_averaged_model(
params=params,
model_cur=model,
model_avg=model_avg,
)
if (
params.batch_idx_train > 0
and params.batch_idx_train % params.save_every_n == 0
):
params.cur_batch_idx = batch_idx
save_checkpoint_with_global_batch_idx(
out_dir=params.exp_dir,
global_batch_idx=params.batch_idx_train,
model=model,
model_avg=model_avg,
params=params,
optimizer=optimizer,
scheduler=scheduler,
sampler=train_dl.sampler,
scaler=scaler,
rank=rank,
)
del params.cur_batch_idx
remove_checkpoints(
out_dir=params.exp_dir,
topk=params.keep_last_k,
rank=rank,
)
if batch_idx % 100 == 0 and params.use_fp16:
# If the grad scale was less than 1, try increasing it. The _growth_interval
# of the grad scaler is configurable, but we can't configure it to have different
# behavior depending on the current grad scale.
cur_grad_scale = scaler._scale.item()
if cur_grad_scale < 8.0 or (cur_grad_scale < 32.0 and batch_idx % 400 == 0):
scaler.update(cur_grad_scale * 2.0)
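# Worked example (illustrative): with use_fp16 enabled and
# cur_grad_scale = 4.0, the scale is doubled every 100 batches; once it is
# between 8.0 and 32.0 it is doubled only every 400 batches, and above 32.0
# it is left to the scaler's own growth schedule.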
if cur_grad_scale < 0.01:
if not saved_bad_model:
save_bad_model(suffix="-first-warning")
saved_bad_model = True
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
save_bad_model()
raise_grad_scale_is_too_small_error(cur_grad_scale)
if batch_idx % params.log_interval == 0:
cur_lr = max(scheduler.get_last_lr())
cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
logging.info(
f"Epoch {params.cur_epoch}, "
f"batch {batch_idx}, loss[{loss_info}], "
f"tot_loss[{tot_loss}], batch size: {batch_size}, "
f"lr: {cur_lr:.2e}, "
+ (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
)
if tb_writer is not None:
tb_writer.add_scalar(
"train/learning_rate", cur_lr, params.batch_idx_train
)
loss_info.write_summary(
tb_writer, "train/current_", params.batch_idx_train
)
tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
if params.use_fp16:
tb_writer.add_scalar(
"train/grad_scale", cur_grad_scale, params.batch_idx_train
)
if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
logging.info("Computing validation loss")
valid_info = compute_validation_loss(
params=params,
model=model,
sp=sp,
valid_dl=valid_dl,
world_size=world_size,
)
model.train()
logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
logging.info(
f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
)
if tb_writer is not None:
valid_info.write_summary(
tb_writer, "train/valid_", params.batch_idx_train
)
loss_value = tot_loss["loss"] / tot_loss["frames"]
params.train_loss = loss_value
if params.train_loss < params.best_train_loss:
params.best_train_epoch = params.cur_epoch
params.best_train_loss = params.train_loss
def run(rank, world_size, args):
"""
Args:
rank:
It is a value between 0 and `world_size-1`, which is
passed automatically by `mp.spawn()` in :func:`main`.
The node with rank 0 is responsible for saving checkpoint.
world_size:
Number of GPUs for DDP training.
      args:
        The return value of get_parser().parse_args().
"""
params = get_params()
params.update(vars(args))
fix_random_seed(params.seed)
if world_size > 1:
setup_dist(rank, world_size, params.master_port)
setup_logger(f"{params.exp_dir}/log/log-train")
logging.info("Training started")
if args.tensorboard and rank == 0:
tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
else:
tb_writer = None
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", rank)
logging.info(f"Device: {device}")
sp = spm.SentencePieceProcessor()
sp.load(params.bpe_model)
# <blk> is defined in local/train_bbpe_model.py
params.blank_id = sp.piece_to_id("<blk>")
params.vocab_size = sp.get_piece_size()
logging.info(params)
logging.info("About to create model")
model = get_model(params)
num_param = sum([p.numel() for p in model.parameters()])
logging.info(f"Number of model parameters: {num_param}")
assert params.save_every_n >= params.average_period
model_avg: Optional[nn.Module] = None
if rank == 0:
# model_avg is only used with rank 0
model_avg = copy.deepcopy(model).to(torch.float64)
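        # Keeping the averaged model in float64 helps avoid precision loss
        # from the many small incremental updates applied by
        # update_averaged_model().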
assert params.start_epoch > 0, params.start_epoch
checkpoints = load_checkpoint_if_available(
params=params, model=model, model_avg=model_avg
)
model.to(device)
if world_size > 1:
logging.info("Using DDP")
model = DDP(model, device_ids=[rank], find_unused_parameters=True)
optimizer = ScaledAdam(
get_parameter_groups_with_lrs(model, lr=params.base_lr, include_names=True),
lr=params.base_lr, # should have no effect
clipping_scale=2.0,
)
scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
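    # Eden decays the learning rate as a function of both the batch count and
    # the epoch count; lr_batches and lr_epochs control how fast it decays.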
if checkpoints and "optimizer" in checkpoints:
logging.info("Loading optimizer state dict")
optimizer.load_state_dict(checkpoints["optimizer"])
if (
checkpoints
and "scheduler" in checkpoints
and checkpoints["scheduler"] is not None
):
logging.info("Loading scheduler state dict")
scheduler.load_state_dict(checkpoints["scheduler"])
if params.print_diagnostics:
opts = diagnostics.TensorDiagnosticOptions(
512
) # allow 4 megabytes per sub-module
diagnostic = diagnostics.attach_diagnostics(model, opts)
if params.inf_check:
register_inf_check_hooks(model)
aishell = AishellAsrDataModule(args)
train_cuts = aishell.train_cuts()
valid_cuts = aishell.valid_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 15 seconds
#
# Caution: There is a reason to select 15.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
if c.duration < 1.0 or c.duration > 15.0:
# logging.warning(
# f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
# )
return False
# In pruned RNN-T, we require that T >= S
# where T is the number of feature frames after subsampling
# and S is the number of tokens in the utterance
# In ./zipformer.py, the conv module uses the following expression
# for subsampling
T = ((c.num_frames - 7) // 2 + 1) // 2
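        # Example: an utterance with 1000 feature frames gives
        # T = ((1000 - 7) // 2 + 1) // 2 = 248 frames after subsampling.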
tokens = sp.encode(c.supervisions[0].text, out_type=str)
if T < len(tokens):
logging.warning(
f"Exclude cut with ID {c.id} from training. "
f"Number of frames (before subsampling): {c.num_frames}. "
f"Number of frames (after subsampling): {T}. "
f"Text: {c.supervisions[0].text}. "
f"Tokens: {tokens}. "
f"Number of tokens: {len(tokens)}"
)
return False
return True
def tokenize_and_encode_text(c: Cut):
        # Normalize the text of each sample.
text = c.supervisions[0].text
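        # Split the text into CJK characters first, then byte-encode it so
        # that the byte-level BPE (BBPE) model operates on a byte sequence.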
text = byte_encode(tokenize_by_CJK_char(text))
c.supervisions[0].text = text
return c
train_cuts = train_cuts.filter(remove_short_and_long_utt)
train_cuts = train_cuts.map(tokenize_and_encode_text)
valid_cuts = valid_cuts.map(tokenize_and_encode_text)
if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
# We only load the sampler's state dict when it loads a checkpoint
# saved in the middle of an epoch
sampler_state_dict = checkpoints["sampler"]
else:
sampler_state_dict = None
train_dl = aishell.train_dataloaders(
train_cuts, sampler_state_dict=sampler_state_dict
)
valid_dl = aishell.valid_dataloaders(valid_cuts)
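    # The OOM sanity scan below is currently disabled by the `False` guard;
    # change it to `True` to scan the most pessimistic batches before training.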
if False and not params.print_diagnostics:
scan_pessimistic_batches_for_oom(
model=model,
train_dl=train_dl,
optimizer=optimizer,
sp=sp,
params=params,
)
scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
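    # init_scale=1.0 starts mixed-precision training conservatively; the scale
    # is then adjusted dynamically by the logic in train_one_epoch().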
if checkpoints and "grad_scaler" in checkpoints:
logging.info("Loading grad scaler state dict")
scaler.load_state_dict(checkpoints["grad_scaler"])
for epoch in range(params.start_epoch, params.num_epochs + 1):
scheduler.step_epoch(epoch - 1)
fix_random_seed(params.seed + epoch - 1)
train_dl.sampler.set_epoch(epoch - 1)
if tb_writer is not None:
tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
params.cur_epoch = epoch
train_one_epoch(
params=params,
model=model,
model_avg=model_avg,
optimizer=optimizer,
scheduler=scheduler,
sp=sp,
train_dl=train_dl,
valid_dl=valid_dl,
scaler=scaler,
tb_writer=tb_writer,
world_size=world_size,
rank=rank,
)
if params.print_diagnostics:
diagnostic.print_diagnostics()
break
save_checkpoint(
params=params,
model=model,
model_avg=model_avg,
optimizer=optimizer,
scheduler=scheduler,
sampler=train_dl.sampler,
scaler=scaler,
rank=rank,
)
logging.info("Done!")
if world_size > 1:
torch.distributed.barrier()
cleanup_dist()
def display_and_save_batch(
batch: dict,
params: AttributeDict,
sp: spm.SentencePieceProcessor,
) -> None:
"""Display the batch statistics and save the batch into disk.
Args:
batch:
A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
for the content in it.
params:
Parameters for training. See :func:`get_params`.
sp:
The sentence piece model.
"""
from lhotse.utils import uuid4
filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
logging.info(f"Saving batch to {filename}")
torch.save(batch, filename)
supervisions = batch["supervisions"]
features = batch["inputs"]
logging.info(f"features shape: {features.shape}")
y = sp.encode(supervisions["text"], out_type=int)
num_tokens = sum(len(i) for i in y)
logging.info(f"num tokens: {num_tokens}")
def scan_pessimistic_batches_for_oom(
model: Union[nn.Module, DDP],
train_dl: torch.utils.data.DataLoader,
optimizer: torch.optim.Optimizer,
sp: spm.SentencePieceProcessor,
params: AttributeDict,
):
from lhotse.dataset import find_pessimistic_batches
logging.info(
"Sanity check -- see if any of the batches in epoch 1 would cause OOM."
)
batches, crit_values = find_pessimistic_batches(train_dl.sampler)
for criterion, cuts in batches.items():
batch = train_dl.dataset[cuts]
try:
with torch.cuda.amp.autocast(enabled=params.use_fp16):
loss, _ = compute_loss(
params=params,
model=model,
sp=sp,
batch=batch,
is_training=True,
)
loss.backward()
optimizer.zero_grad()
except Exception as e:
if "CUDA out of memory" in str(e):
logging.error(
"Your GPU ran out of memory with the current "
"max_duration setting. We recommend decreasing "
"max_duration and trying again.\n"
f"Failing criterion: {criterion} "
f"(={crit_values[criterion]}) ..."
)
display_and_save_batch(batch, params=params, sp=sp)
raise
logging.info(
f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
)
def main():
parser = get_parser()
AishellAsrDataModule.add_arguments(parser)
args = parser.parse_args()
args.exp_dir = Path(args.exp_dir)
world_size = args.world_size
assert world_size >= 1
if world_size > 1:
mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
else:
run(rank=0, world_size=1, args=args)
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
if __name__ == "__main__":
main()

View File

@@ -29,7 +29,14 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor, str2bool
@@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 
 
-def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_aishell2(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
     src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())
 
     dataset_parts = (
         "train",
@@ -68,7 +77,11 @@ def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
         list(manifests.keys()),
         dataset_parts,
     )
-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
 
     with get_executor() as ex:  # Initialize the executor only once.
@@ -82,7 +95,7 @@ def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
                 supervisions=m["supervisions"],
             )
             if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                 cut_set = (
                     cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
@@ -111,7 +124,12 @@ def get_args():
         default=False,
         help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
     )
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use WhisperFbank instead of Fbank. Default: False.",
+    )
 
     return parser.parse_args()
@@ -122,5 +140,7 @@ if __name__ == "__main__":
     args = get_args()
 
     compute_fbank_aishell2(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
     )

View File

@@ -108,6 +108,16 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   fi
 fi
 
+whisper_mel_bins=80
+if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
+  log "Stage 30: Compute whisper fbank for aishell2"
+  if [ ! -f data/fbank/.aishell2.whisper.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_aishell2.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.aishell2.whisper.done
+  fi
+fi
+
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   log "Stage 4: Compute fbank for musan"
   if [ ! -f data/fbank/.msuan.done ]; then

View File

@@ -296,6 +296,8 @@ class AiShell2AsrDataModule:
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                buffer_size=self.args.num_buckets * 2000,
+                shuffle_buffer_size=self.args.num_buckets * 5000,
                 drop_last=self.args.drop_last,
             )
         else:

View File

@@ -22,7 +22,7 @@
 Usage:
 ./pruned_transducer_stateless5/export.py \
   --exp-dir ./pruned_transducer_stateless5/exp \
-  --lang-dir data/lang_char
+  --tokens ./data/lang_char/tokens.txt \
   --epoch 25 \
   --avg 5
@@ -48,6 +48,7 @@ import argparse
 import logging
 from pathlib import Path
 
+import k2
 import torch
 from train import add_model_arguments, get_params, get_transducer_model
@@ -57,8 +58,7 @@ from icefall.checkpoint import (
     find_checkpoints,
     load_checkpoint,
 )
-from icefall.lexicon import Lexicon
-from icefall.utils import str2bool
+from icefall.utils import num_tokens, str2bool
 
 
 def get_parser():
@@ -115,10 +115,10 @@ def get_parser():
     )
 
     parser.add_argument(
-        "--lang-dir",
+        "--tokens",
         type=str,
-        default="data/lang_char",
-        help="The lang dir",
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
     )
 
     parser.add_argument(
@@ -154,10 +154,10 @@ def main():
     logging.info(f"device: {device}")
 
-    lexicon = Lexicon(params.lang_dir)
-    params.blank_id = lexicon.token_table["<blk>"]
-    params.unk_id = lexicon.token_table["<unk>"]
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.unk_id = token_table["<unk>"]
+    params.vocab_size = num_tokens(token_table) + 1
 
     logging.info(params)

View File

@@ -29,7 +29,14 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor, str2bool
@@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 
 
-def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_aishell4(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
     src_dir = Path("data/manifests/aishell4")
     output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())
 
     dataset_parts = (
         "train_S",
@@ -70,6 +79,11 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
         dataset_parts,
     )
 
-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
 
     with get_executor() as ex:  # Initialize the executor only once.
@@ -84,7 +98,7 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
                 supervisions=m["supervisions"],
             )
             if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                 cut_set = (
                     cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
@@ -95,7 +109,7 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
 
     logging.info("About splitting cuts into smaller chunks")
@@ -121,7 +135,12 @@ def get_args():
         default=False,
         help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
     )
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use WhisperFbank instead of Fbank. Default: False.",
+    )
 
     return parser.parse_args()
@@ -132,5 +151,7 @@ if __name__ == "__main__":
     args = get_args()
 
     compute_fbank_aishell4(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
    )

View File

@@ -6,7 +6,7 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail
 
 stage=-1
-stop_stage=100
+stop_stage=7
 perturb_speed=true
@@ -76,11 +76,21 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 fi
 
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Process aishell4"
+  log "Stage 2: Compute fbank for aishell4"
   if [ ! -f data/fbank/aishell4/.fbank.done ]; then
-    mkdir -p data/fbank/aishell4
+    mkdir -p data/fbank
     ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed}
-    touch data/fbank/aishell4/.fbank.done
+    touch data/fbank/.fbank.done
+  fi
+fi
+
+whisper_mel_bins=80
+if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then
+  log "Stage 20: Compute whisper fbank for aishell4"
+  if [ ! -f data/fbank/aishell4/.fbank.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.fbank.done
   fi
 fi
@@ -106,16 +116,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute fbank for aishell4"
-  if [ ! -f data/fbank/.aishell4.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed}
-    touch data/fbank/.aishell4.done
-  fi
-fi
-
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare char based lang"
+  log "Stage 5: Prepare char based lang"
   lang_char_dir=data/lang_char
   mkdir -p $lang_char_dir

View File

@@ -306,7 +306,8 @@ class Aishell4AsrDataModule:
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                buffer_size=100000,
+                buffer_size=self.args.num_buckets * 2000,
+                shuffle_buffer_size=self.args.num_buckets * 5000,
                 drop_last=self.args.drop_last,
             )
         else:

View File

@@ -48,6 +48,7 @@ import argparse
 import logging
 from pathlib import Path
 
+import k2
 import torch
 from train import add_model_arguments, get_params, get_transducer_model
@@ -57,8 +58,7 @@ from icefall.checkpoint import (
     find_checkpoints,
     load_checkpoint,
 )
-from icefall.lexicon import Lexicon
-from icefall.utils import str2bool
+from icefall.utils import num_tokens, str2bool
 
 
 def get_parser():
@@ -115,13 +115,10 @@ def get_parser():
     )
 
     parser.add_argument(
-        "--lang-dir",
+        "--tokens",
         type=str,
-        default="data/lang_char",
-        help="""The lang dir
-        It contains language related input files such as
-        "lexicon.txt"
-        """,
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
     )
 
     parser.add_argument(
@@ -157,9 +154,9 @@ def main():
     logging.info(f"device: {device}")
 
-    lexicon = Lexicon(params.lang_dir)
-    params.blank_id = lexicon.token_table["<blk>"]
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1
 
     logging.info(params)

View File

@@ -29,7 +29,14 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor, str2bool
@@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 
 
-def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_alimeeting(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
     src_dir = Path("data/manifests/alimeeting")
     output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())
 
     dataset_parts = (
         "train",
@@ -53,7 +62,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
         "test",
     )
 
-    prefix = "alimeeting"
+    prefix = "alimeeting-far"
     suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
         dataset_parts=dataset_parts,
@@ -70,6 +79,11 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
         dataset_parts,
     )
 
-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
 
     with get_executor() as ex:  # Initialize the executor only once.
@@ -83,7 +97,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
                 supervisions=m["supervisions"],
             )
             if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                 cut_set = (
                     cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
@@ -121,7 +135,12 @@ def get_args():
         default=False,
         help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
     )
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use the Whisper Fbank feature extractor. Default: False.",
+    )
 
     return parser.parse_args()
@@ -132,5 +151,7 @@ if __name__ == "__main__":
     args = get_args()
 
     compute_fbank_alimeeting(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
     )

View File

@@ -6,7 +6,7 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail
 
 stage=-1
-stop_stage=100
+stop_stage=7
 perturb_speed=true
 
 # We assume dl_dir (download dir) contains the following
@@ -15,7 +15,7 @@ perturb_speed=true
 #
 #  - $dl_dir/alimeeting
 #      This directory contains the following files downloaded from
-#       https://openslr.org/62/
+#       https://openslr.org/119/
 #
 #    - Train_Ali_far.tar.gz
 #    - Train_Ali_near.tar.gz
@@ -66,10 +66,21 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 fi
 
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Process alimeeting"
-  if [ ! -f data/fbank/alimeeting/.fbank.done ]; then
-    mkdir -p data/fbank/alimeeting
+  log "Stage 2: compute fbank for alimeeting"
+  if [ ! -f data/fbank/.fbank.done ]; then
+    mkdir -p data/fbank
     ./local/compute_fbank_alimeeting.py --perturb-speed ${perturb_speed}
+    touch data/fbank/.fbank.done
+  fi
+fi
+
+whisper_mel_bins=80
+if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then
+  log "Stage 20: compute whisper fbank for alimeeting"
+  if [ ! -f data/fbank/.fbank.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_alimeeting.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.fbank.done
   fi
 fi
@@ -95,16 +106,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute fbank for alimeeting"
-  if [ ! -f data/fbank/.alimeeting.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_fbank_alimeeting.py --perturb-speed True
-    touch data/fbank/.alimeeting.done
-  fi
-fi
-
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare char based lang"
+  log "Stage 5: Prepare char based lang"
   lang_char_dir=data/lang_char
   mkdir -p $lang_char_dir

View File

@@ -288,7 +288,8 @@ class AlimeetingAsrDataModule:
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                buffer_size=30000,
+                buffer_size=self.args.num_buckets * 2000,
+                shuffle_buffer_size=self.args.num_buckets * 5000,
                 drop_last=True,
             )
         else:

View File

@@ -20,7 +20,7 @@
 Usage:
 ./pruned_transducer_stateless2/export.py \
   --exp-dir ./pruned_transducer_stateless2/exp \
-  --lang-dir data/lang_char \
+  --tokens ./data/lang_char/tokens.txt \
   --epoch 29 \
   --avg 18
@@ -45,12 +45,12 @@ import argparse
 import logging
 from pathlib import Path
 
+import k2
 import torch
 from train import get_params, get_transducer_model
 
 from icefall.checkpoint import average_checkpoints, load_checkpoint
-from icefall.lexicon import Lexicon
-from icefall.utils import str2bool
+from icefall.utils import num_tokens, str2bool
 
 
 def get_parser():
@@ -85,10 +85,10 @@ def get_parser():
     )
 
     parser.add_argument(
-        "--lang-dir",
+        "--tokens",
         type=str,
-        default="data/lang_char",
-        help="The lang dir",
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
     )
 
     parser.add_argument(
@@ -122,10 +122,9 @@ def main():
     logging.info(f"device: {device}")
 
-    lexicon = Lexicon(params.lang_dir)
-
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1
 
     logging.info(params)

Some files were not shown because too many files have changed in this diff.