Merge remote-tracking branch 'upstream/master' into reazonspeech-recipe

2025-08-27 10:44:19 +00:00 · 2024-05-01 23:21:38 +09:00 · 2024-05-01 23:21:38 +09:00 · 3505a8ec45
commit 3505a8ec45
parent 01325b58c8 6d7c1d13a5
644 changed files with 96131 additions and 1880 deletions
--- a/.github/scripts/.gitignore
+++ b/.github/scripts/.gitignore
@ -0,0 +1 @@
+piper_phonemize.html
--- a/.github/scripts/audioset/AT/run.sh
+++ b/.github/scripts/audioset/AT/run.sh
@ -0,0 +1,94 @@
+#!/usr/bin/env bash
+
+set -ex
+
+python3 -m pip install onnxoptimizer onnxsim
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/audioset/AT
+
+function test_pretrained() {
+  repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12
+  repo=$(basename $repo_url)
+  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+  pushd $repo/exp
+  git lfs pull --include pretrained.pt
+  ln -s pretrained.pt epoch-99.pt
+  ls -lh
+  popd
+
+  log "test pretrained.pt"
+
+  python3 zipformer/pretrained.py \
+    --checkpoint $repo/exp/pretrained.pt \
+    --label-dict $repo/data/class_labels_indices.csv \
+    $repo/test_wavs/1.wav \
+    $repo/test_wavs/2.wav \
+    $repo/test_wavs/3.wav \
+    $repo/test_wavs/4.wav
+
+  log "test jit export"
+  ls -lh $repo/exp/
+  python3 zipformer/export.py \
+      --exp-dir $repo/exp \
+      --epoch 99 \
+      --avg 1 \
+      --use-averaged-model 0 \
+      --jit 1
+  ls -lh $repo/exp/
+
+  log "test jit models"
+  python3 zipformer/jit_pretrained.py \
+      --nn-model-filename $repo/exp/jit_script.pt \
+      --label-dict $repo/data/class_labels_indices.csv \
+      $repo/test_wavs/1.wav \
+      $repo/test_wavs/2.wav \
+      $repo/test_wavs/3.wav \
+      $repo/test_wavs/4.wav
+
+  log "test onnx export"
+  ls -lh $repo/exp/
+  python3 zipformer/export-onnx.py \
+      --exp-dir $repo/exp \
+      --epoch 99 \
+      --avg 1 \
+      --use-averaged-model 0
+
+  ls -lh $repo/exp/
+
+  pushd $repo/exp/
+  mv model-epoch-99-avg-1.onnx model.onnx
+  mv model-epoch-99-avg-1.int8.onnx model.int8.onnx
+  popd
+
+  ls -lh $repo/exp/
+
+  log "test onnx models"
+  for m in model.onnx model.int8.onnx; do  # test each exported file, not only model.onnx
+    log "$m"
+    python3 zipformer/onnx_pretrained.py \
+        --model-filename $repo/exp/$m \
+        --label-dict $repo/data/class_labels_indices.csv \
+        $repo/test_wavs/1.wav \
+        $repo/test_wavs/2.wav \
+        $repo/test_wavs/3.wav \
+        $repo/test_wavs/4.wav
+  done
+
+  log "prepare data for uploading to huggingface"
+  dst=/icefall/model-onnx
+  mkdir -p $dst
+  cp -v $repo/exp/*.onnx $dst/
+  cp -v $repo/data/* $dst/
+  cp -av $repo/test_wavs $dst
+
+  ls -lh $dst
+  ls -lh $dst/test_wavs
+}
+
+test_pretrained
--- a/.github/scripts/docker/Dockerfile
+++ b/.github/scripts/docker/Dockerfile
@ -11,6 +11,7 @@ ARG _KALDIFEAT_VERSION="${KALDIFEAT_VERSION}+cpu.torch${TORCH_VERSION}"

 RUN apt-get update -y && \
    apt-get install -qq -y \
+    cmake \
    ffmpeg \
    git \
    git-lfs \
@ -35,7 +36,9 @@ RUN pip install --no-cache-dir \
      \
      git+https://github.com/lhotse-speech/lhotse \
      kaldifeat==${_KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cpu.html \
+      cython \
      dill \
+      espnet_tts_frontend \
      graphviz \
      kaldi-decoder \
      kaldi_native_io \
@ -44,10 +47,15 @@ RUN pip install --no-cache-dir \
      kaldilm \
      matplotlib \
      multi_quantization \
+      numba \
      numpy \
+      onnxoptimizer \
+      onnxsim \
      onnx \
      onnxmltools \
      onnxruntime \
+      piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html \
+      pypinyin==0.50.0 \
      pytest \
      sentencepiece>=0.1.96 \
      six \
--- a/.github/scripts/docker/generate_build_matrix.py
+++ b/.github/scripts/docker/generate_build_matrix.py
@ -6,8 +6,8 @@ import json


 def version_gt(a, b):
-    a_major, a_minor = a.split(".")[:2]
-    b_major, b_minor = b.split(".")[:2]
+    a_major, a_minor = list(map(int, a.split(".")))[:2]
+    b_major, b_minor = list(map(int, b.split(".")))[:2]
    if a_major > b_major:
        return True

@ -18,8 +18,8 @@ def version_gt(a, b):


 def version_ge(a, b):
-    a_major, a_minor = a.split(".")[:2]
-    b_major, b_minor = b.split(".")[:2]
+    a_major, a_minor = list(map(int, a.split(".")))[:2]
+    b_major, b_minor = list(map(int, b.split(".")))[:2]
    if a_major > b_major:
        return True

@ -43,11 +43,15 @@ def get_torchaudio_version(torch_version):


 def get_matrix():
-    k2_version = "1.24.4.dev20231220"
-    kaldifeat_version = "1.25.3.dev20231221"
-    version = "1.2"
-    python_version = ["3.8", "3.9", "3.10", "3.11"]
-    torch_version = ["1.13.0", "1.13.1", "2.0.0", "2.0.1", "2.1.0", "2.1.1", "2.1.2"]
+    k2_version = "1.24.4.dev20240223"
+    kaldifeat_version = "1.25.4.dev20240223"
+    version = "20240401"
+    python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
+    torch_version = []
+    torch_version += ["1.13.0", "1.13.1"]
+    torch_version += ["2.0.0", "2.0.1"]
+    torch_version += ["2.1.0", "2.1.1", "2.1.2"]
+    torch_version += ["2.2.0", "2.2.1", "2.2.2"]

    matrix = []
    for p in python_version:
@ -57,10 +61,21 @@ def get_matrix():
            if version_gt(p, "3.10") and not version_gt(t, "2.0"):
                continue

+            # only torch>=2.2.0 supports python 3.12
+            if version_gt(p, "3.11") and not version_gt(t, "2.1"):
+                continue
+
+            k2_version_2 = k2_version
+            kaldifeat_version_2 = kaldifeat_version
+
+            if t == "2.2.2":
+                k2_version_2 = "1.24.4.dev20240328"
+                kaldifeat_version_2 = "1.25.4.dev20240329"
+
            matrix.append(
                {
-                    "k2-version": k2_version,
-                    "kaldifeat-version": kaldifeat_version,
+                    "k2-version": k2_version_2,
+                    "kaldifeat-version": kaldifeat_version_2,
                    "version": version,
                    "python-version": p,
                    "torch-version": t,
--- a/.github/scripts/generate-piper-phonemize-page.py
+++ b/.github/scripts/generate-piper-phonemize-page.py
@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+
+def main():
+    prefix = (
+        "https://github.com/csukuangfj/piper-phonemize/releases/download/2023.12.5/"
+    )
+    files = [
+        "piper_phonemize-1.2.0-cp310-cp310-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.2.0-cp311-cp311-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.2.0-cp312-cp312-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.2.0-cp37-cp37m-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.2.0-cp38-cp38-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.2.0-cp39-cp39-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+    ]
+    with open("piper_phonemize.html", "w") as f:
+        for file in files:
+            url = prefix + file
+            f.write(f'<a href="{url}">{file}</a><br/>\n')
+
+
+if __name__ == "__main__":
+    main()
--- a/.github/scripts/librispeech/ASR/run.sh
+++ b/.github/scripts/librispeech/ASR/run.sh
@ -15,9 +15,9 @@ function prepare_data() {
  # cause OOM error for CI later.
  mkdir -p download/lm
  pushd download/lm
-  wget -q http://www.openslr.org/resources/11/librispeech-vocab.txt
-  wget -q http://www.openslr.org/resources/11/librispeech-lexicon.txt
-  wget -q http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-lm-norm.txt.gz
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-lexicon.txt
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-vocab.txt
  ls -lh
  gunzip librispeech-lm-norm.txt.gz

@ -64,6 +64,46 @@ function run_diagnostics() {
    --print-diagnostics 1
 }

+function test_streaming_zipformer_ctc_hlg() {
+  repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-streaming-zipformer-small-2024-03-18
+
+  log "Downloading pre-trained model from $repo_url"
+  git lfs install
+  git clone $repo_url
+  repo=$(basename $repo_url)
+
+  rm $repo/exp-ctc-rnnt-small/*.onnx
+  ls -lh $repo/exp-ctc-rnnt-small
+
+  # export models to onnx
+  ./zipformer/export-onnx-streaming-ctc.py \
+    --tokens $repo/data/lang_bpe_500/tokens.txt \
+    --epoch 30 \
+    --avg 3 \
+    --exp-dir $repo/exp-ctc-rnnt-small \
+    --causal 1 \
+    --use-ctc 1 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    \
+    --num-encoder-layers 2,2,2,2,2,2 \
+    --feedforward-dim 512,768,768,768,768,768 \
+    --encoder-dim 192,256,256,256,256,256 \
+    --encoder-unmasked-dim 192,192,192,192,192,192
+
+  ls -lh $repo/exp-ctc-rnnt-small
+
+  for wav in 0.wav 1.wav 8k.wav; do
+    python3 ./zipformer/onnx_pretrained_ctc_HLG_streaming.py \
+      --nn-model $repo/exp-ctc-rnnt-small/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx \
+      --words $repo/data/lang_bpe_500/words.txt \
+      --HLG $repo/data/lang_bpe_500/HLG.fst \
+      $repo/test_wavs/$wav
+  done
+
+  rm -rf $repo
+}
+
 function test_pruned_transducer_stateless_2022_03_12() {
  repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12

@ -1577,6 +1617,7 @@ function test_transducer_bpe_500_2021_12_23() {

 prepare_data
 run_diagnostics
+test_streaming_zipformer_ctc_hlg
 test_pruned_transducer_stateless_2022_03_12
 test_pruned_transducer_stateless2_2022_04_29
 test_pruned_transducer_stateless3_2022_04_29
--- a/.github/scripts/ljspeech/TTS/run.sh
+++ b/.github/scripts/ljspeech/TTS/run.sh
@ -0,0 +1,157 @@
+#!/usr/bin/env bash
+
+set -ex
+
+python3 -m pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
+python3 -m pip install espnet_tts_frontend
+python3 -m pip install numba
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/ljspeech/TTS
+
+sed -i.bak s/600/8/g ./prepare.sh
+sed -i.bak s/"first 100"/"first 3"/g ./prepare.sh
+sed -i.bak s/500/5/g ./prepare.sh
+git diff
+
+function prepare_data() {
+  # We have created a subset of the data for testing
+  #
+  mkdir download
+  pushd download
+  wget -q https://huggingface.co/csukuangfj/ljspeech-subset-for-ci-test/resolve/main/LJSpeech-1.1.tar.bz2
+  tar xvf LJSpeech-1.1.tar.bz2
+  popd
+
+  ./prepare.sh
+  tree .
+}
+
+function train() {
+  pushd ./vits
+  sed -i.bak s/200/3/g ./train.py
+  git diff .
+  popd
+
+  for t in low medium high; do
+    ./vits/train.py \
+      --exp-dir vits/exp-$t \
+      --model-type $t \
+      --num-epochs 1 \
+      --save-every-n 1 \
+      --num-buckets 2 \
+      --tokens data/tokens.txt \
+      --max-duration 20
+
+    ls -lh vits/exp-$t
+  done
+}
+
+function infer() {
+  for t in low medium high; do
+    ./vits/infer.py \
+      --num-buckets 2 \
+      --model-type $t \
+      --epoch 1 \
+      --exp-dir ./vits/exp-$t \
+      --tokens data/tokens.txt \
+      --max-duration 20
+  done
+}
+
+function export_onnx() {
+  for t in low medium high; do
+    ./vits/export-onnx.py \
+      --model-type $t \
+      --epoch 1 \
+      --exp-dir ./vits/exp-$t \
+      --tokens data/tokens.txt
+
+    ls -lh vits/exp-$t/
+  done
+}
+
+function test_medium() {
+  git clone https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-medium-2024-03-12
+
+  ./vits/export-onnx.py \
+    --model-type medium \
+    --epoch 820 \
+    --exp-dir ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp \
+    --tokens ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt
+
+  ls -lh ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp
+
+  ./vits/test_onnx.py \
+    --model-filename ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp/vits-epoch-820.onnx \
+    --tokens ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt \
+    --output-filename /icefall/test-medium.wav
+
+  ls -lh /icefall/test-medium.wav
+
+  d=/icefall/vits-icefall-en_US-ljspeech-medium
+  mkdir $d
+  cp -v ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt $d/
+  cp -v ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp/vits-epoch-820.onnx $d/model.onnx
+
+  rm -rf icefall-tts-ljspeech-vits-medium-2024-03-12
+
+  pushd $d
+  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+  tar xf espeak-ng-data.tar.bz2
+  rm espeak-ng-data.tar.bz2
+  cd ..
+  tar cjf vits-icefall-en_US-ljspeech-medium.tar.bz2 vits-icefall-en_US-ljspeech-medium
+  rm -rf vits-icefall-en_US-ljspeech-medium
+  ls -lh *.tar.bz2
+  popd
+}
+
+function test_low() {
+  git clone https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-low-2024-03-12
+
+  ./vits/export-onnx.py \
+    --model-type low \
+    --epoch 1600 \
+    --exp-dir ./icefall-tts-ljspeech-vits-low-2024-03-12/exp \
+    --tokens ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt
+
+  ls -lh ./icefall-tts-ljspeech-vits-low-2024-03-12/exp
+
+  ./vits/test_onnx.py \
+    --model-filename ./icefall-tts-ljspeech-vits-low-2024-03-12/exp/vits-epoch-1600.onnx \
+    --tokens ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt \
+    --output-filename /icefall/test-low.wav
+
+  ls -lh /icefall/test-low.wav
+
+  d=/icefall/vits-icefall-en_US-ljspeech-low
+  mkdir $d
+  cp -v ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt $d/
+  cp -v ./icefall-tts-ljspeech-vits-low-2024-03-12/exp/vits-epoch-1600.onnx $d/model.onnx
+
+  rm -rf icefall-tts-ljspeech-vits-low-2024-03-12
+
+  pushd $d
+  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+  tar xf espeak-ng-data.tar.bz2
+  rm espeak-ng-data.tar.bz2
+  cd ..
+  tar cjf vits-icefall-en_US-ljspeech-low.tar.bz2 vits-icefall-en_US-ljspeech-low
+  rm -rf vits-icefall-en_US-ljspeech-low
+  ls -lh *.tar.bz2
+  popd
+}
+
+prepare_data
+train
+infer
+export_onnx
+rm -rf vits/exp-{low,medium,high}
+test_medium
+test_low
--- a/.github/scripts/run-wenetspeech-pruned-transducer-stateless2.sh
+++ b/.github/scripts/run-wenetspeech-pruned-transducer-stateless2.sh
@ -30,7 +30,7 @@ log "Test exporting to ONNX format"

 ./pruned_transducer_stateless2/export-onnx.py \
  --exp-dir $repo/exp \
-  --lang-dir $repo/data/lang_char \
+  --tokens $repo/data/lang_char/tokens.txt \
  --epoch 99 \
  --avg 1

@ -38,14 +38,14 @@ log "Export to torchscript model"

 ./pruned_transducer_stateless2/export.py \
  --exp-dir $repo/exp \
-  --lang-dir $repo/data/lang_char \
+  --tokens $repo/data/lang_char/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --jit 1

 ./pruned_transducer_stateless2/export.py \
  --exp-dir $repo/exp \
-  --lang-dir $repo/data/lang_char \
+  --tokens $repo/data/lang_char/tokens.txt \
  --epoch 99 \
  --avg 1 \
  --jit-trace 1
--- a/.github/workflows/audioset.yml
+++ b/.github/workflows/audioset.yml
@ -0,0 +1,137 @@
+name: audioset
+
+on:
+  push:
+    branches:
+      - master
+
+  pull_request:
+    branches:
+      - master
+
+  workflow_dispatch:
+
+concurrency:
+  group: audioset-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  generate_build_matrix:
+    if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # outputting for debugging purposes
+          python ./.github/scripts/docker/generate_build_matrix.py
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"
+
+  audioset:
+    needs: generate_build_matrix
+    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Free space
+        shell: bash
+        run: |
+          ls -lh
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+          echo "pwd: $PWD"
+          echo "github.workspace ${{ github.workspace }}"
+
+      - name: Run tests
+        uses: addnab/docker-run-action@v3
+        with:
+            image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
+            options: |
+              --volume ${{ github.workspace }}/:/icefall
+            shell: bash
+            run: |
+              export PYTHONPATH=/icefall:$PYTHONPATH
+              cd /icefall
+              git config --global --add safe.directory /icefall
+
+              .github/scripts/audioset/AT/run.sh
+
+      - name: Show model files
+        shell: bash
+        run: |
+          sudo chown -R runner ./model-onnx
+          ls -lh ./model-onnx
+          chmod -x ./model-onnx/class_labels_indices.csv
+
+          echo "----------"
+          ls -lh ./model-onnx/*
+
+      - name: Upload model to huggingface
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            rm -rf huggingface
+            export GIT_LFS_SKIP_SMUDGE=1
+
+            git clone https://huggingface.co/k2-fsa/sherpa-onnx-zipformer-audio-tagging-2024-04-09 huggingface
+            cd huggingface
+            git fetch
+            git pull
+            git merge -m "merge remote" --ff origin main
+            cp ../model-onnx/*.onnx ./
+            cp ../model-onnx/*.csv ./
+            cp -a ../model-onnx/test_wavs ./
+            ls -lh
+            git add .
+            git status
+            git commit -m "update models"
+            git status
+
+            git push https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-onnx-zipformer-audio-tagging-2024-04-09 main || true
+            rm -rf huggingface
+
+      - name: Prepare for release
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
+        shell: bash
+        run: |
+          d=sherpa-onnx-zipformer-audio-tagging-2024-04-09
+          mv ./model-onnx $d
+          tar cjvf ${d}.tar.bz2 $d
+          ls -lh
+
+      - name: Release exported onnx models
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          overwrite: true
+          file: sherpa-onnx-*.tar.bz2
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: audio-tagging-models
+
--- a/.github/workflows/build-doc.yml
+++ b/.github/workflows/build-doc.yml
@ -56,11 +56,14 @@ jobs:
      - name: Build doc
        shell: bash
        run: |
+          .github/scripts/generate-piper-phonemize-page.py
          cd docs
          python3 -m pip install -r ./requirements.txt
          make html
          touch build/html/.nojekyll

+          cp -v ../piper_phonemize.html ./build/html/
+
      - name: Deploy
        uses: peaceiris/actions-gh-pages@v3
        with:
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@ -16,7 +16,7 @@ jobs:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
-        image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+        image: ["torch2.2.2-cuda12.1", "torch2.2.2-cuda11.8", "torch2.2.1-cuda12.1", "torch2.2.1-cuda11.8", "torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]

    steps:
      # refer to https://github.com/actions/checkout
--- a/.github/workflows/ljspeech.yml
+++ b/.github/workflows/ljspeech.yml
@ -0,0 +1,102 @@
+name: ljspeech
+
+on:
+  push:
+    branches:
+      - master
+
+  pull_request:
+    branches:
+      - master
+
+  workflow_dispatch:
+
+concurrency:
+  group: ljspeech-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  generate_build_matrix:
+    if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # outputting for debugging purposes
+          python ./.github/scripts/docker/generate_build_matrix.py
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"
+
+  ljspeech:
+    needs: generate_build_matrix
+    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Free space
+        shell: bash
+        run: |
+          ls -lh
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+          echo "pwd: $PWD"
+          echo "github.workspace ${{ github.workspace }}"
+
+      - name: Run tests
+        uses: addnab/docker-run-action@v3
+        with:
+            image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
+            options: |
+              --volume ${{ github.workspace }}/:/icefall
+            shell: bash
+            run: |
+              export PYTHONPATH=/icefall:$PYTHONPATH
+              cd /icefall
+              git config --global --add safe.directory /icefall
+
+              .github/scripts/ljspeech/TTS/run.sh
+
+      - name: display files
+        shell: bash
+        run: |
+          ls -lh
+
+      - uses: actions/upload-artifact@v4
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
+        with:
+          name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
+          path: ./*.wav
+
+      - uses: actions/upload-artifact@v4
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
+        with:
+          name: generated-models-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}
+          path: ./vits-icefall-*.tar.bz2  # model tarballs; the wavs are uploaded by the previous step
+
+      - name: Release exported onnx models
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          overwrite: true
+          file: vits-icefall-*.tar.bz2
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: tts-models
+
--- a/.github/workflows/run-docker-image.yml
+++ b/.github/workflows/run-docker-image.yml
@ -14,13 +14,20 @@ jobs:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
-        image: ["torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+        image: ["torch2.2.2-cuda12.1", "torch2.2.2-cuda11.8", "torch2.2.1-cuda12.1", "torch2.2.1-cuda11.8", "torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
    steps:
      # refer to https://github.com/actions/checkout
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0

+      - name: Free space
+        shell: bash
+        run: |
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+
      - name: Run the build process with Docker
        uses: addnab/docker-run-action@v3
        with:
--- a/.github/workflows/style_check.yml
+++ b/.github/workflows/style_check.yml
@ -49,7 +49,7 @@ jobs:

      - name: Install Python dependencies
        run: |
-          python3 -m pip install --upgrade pip black==22.3.0 flake8==5.0.4 click==8.1.0
+          python3 -m pip install --upgrade pip black==22.3.0 flake8==5.0.4 click==8.1.0 isort==5.10.1
          # Click issue fixed in https://github.com/psf/black/pull/2966

      - name: Run flake8
@ -67,3 +67,9 @@ jobs:
        working-directory: ${{github.workspace}}
        run: |
          black --check --diff .
+      
+      - name: Run isort
+        shell: bash
+        working-directory: ${{github.workspace}}
+        run: |
+          isort --check --diff .
--- a/.github/workflows/yesno.yml
+++ b/.github/workflows/yesno.yml
@ -59,4 +59,7 @@ jobs:
              cd /icefall
              git config --global --add safe.directory /icefall

+              python3 -m torch.utils.collect_env
+              python3 -m k2.version
+
              .github/scripts/yesno/ASR/run.sh
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -26,7 +26,7 @@ repos:
      # E121,E123,E126,E226,E24,E704,W503,W504

  - repo: https://github.com/pycqa/isort
-    rev: 5.11.5
+    rev: 5.12.0
    hooks:
      - id: isort
        args: ["--profile=black"]
--- a/README.md
+++ b/README.md
@ -2,46 +2,86 @@
 <img src="https://raw.githubusercontent.com/k2-fsa/icefall/master/docs/source/_static/logo.png" width=168>
 </div>

-## Introduction
+# Introduction

-icefall contains ASR recipes for various datasets
-using <https://github.com/k2-fsa/k2>.
+The icefall project contains speech-related recipes for various datasets
+using [k2-fsa](https://github.com/k2-fsa/k2) and [lhotse](https://github.com/lhotse-speech/lhotse).

-You can use <https://github.com/k2-fsa/sherpa> to deploy models
-trained with icefall.
+You can use [sherpa](https://github.com/k2-fsa/sherpa), [sherpa-ncnn](https://github.com/k2-fsa/sherpa-ncnn) or [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx) for deployment with models
+in icefall; these frameworks also support models not included in icefall; please refer to their respective documentation for more details.

 You can try pre-trained models from within your browser without the need
-to download or install anything by visiting <https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition>
-See <https://k2-fsa.github.io/icefall/huggingface/spaces.html> for more details.
+to download or install anything by visiting this [huggingface space](https://huggingface.co/spaces/k2-fsa/automatic-speech-recognition).
+Please refer to the [documentation](https://k2-fsa.github.io/icefall/huggingface/spaces.html) for more details.

-## Installation
+# Installation

-Please refer to <https://icefall.readthedocs.io/en/latest/installation/index.html>
+Please refer to the [documentation](https://icefall.readthedocs.io/en/latest/installation/index.html)
 for installation.

-## Recipes
+# Recipes

-Please refer to <https://icefall.readthedocs.io/en/latest/recipes/index.html>
-for more information.
+Please refer to the [documentation](https://icefall.readthedocs.io/en/latest/recipes/index.html)
+for more details.

-We provide the following recipes:
+## ASR: Automatic Speech Recognition

+### Supported Datasets
  - [yesno][yesno]
-  - [LibriSpeech][librispeech]
-  - [GigaSpeech][gigaspeech]
-  - [AMI][ami]
+  
+  - [Aidatatang_200zh][aidatatang_200zh]
  - [Aishell][aishell]
  - [Aishell2][aishell2]
  - [Aishell4][aishell4]
+  - [Alimeeting][alimeeting]
+  - [AMI][ami]
+  - [CommonVoice][commonvoice]
+  - [Corpus of Spontaneous Japanese][csj]
+  - [GigaSpeech][gigaspeech]
+  - [LibriCSS][libricss]
+  - [LibriSpeech][librispeech]
+  - [Libriheavy][libriheavy]
+  - [Multi-Dialect Broadcast News Arabic Speech Recognition][mgb2]
+  - [PeopleSpeech][peoplespeech]
+  - [SPGISpeech][spgispeech]
+  - [Switchboard][swbd]
  - [TIMIT][timit]
  - [TED-LIUM3][tedlium3]
-  - [Aidatatang_200zh][aidatatang_200zh]
-  - [WenetSpeech][wenetspeech]
-  - [Alimeeting][alimeeting]
-  - [Switchboard][swbd]
  - [TAL_CSASR][tal_csasr]
+  - [Voxpopuli][voxpopuli]
+  - [XBMU-AMDO31][xbmu-amdo31]
+  - [WenetSpeech][wenetspeech]
  
-### yesno
+More datasets will be added in the future.
+
+### Supported Models
+
+The [LibriSpeech][librispeech] recipe supports the most comprehensive set of models; you are welcome to try them out.
+
+#### CTC 
+  - TDNN LSTM CTC
+  - Conformer CTC
+  - Zipformer CTC
+
+#### MMI
+  - Conformer MMI
+  - Zipformer MMI
+
+#### Transducer
+  - Conformer-based Encoder
+  - LSTM-based Encoder
+  - Zipformer-based Encoder
+  - LSTM-based Predictor
+  - [Stateless Predictor](https://research.google/pubs/rnn-transducer-with-stateless-prediction-network/)
+
+#### Whisper
+  - [OpenAI Whisper](https://arxiv.org/abs/2212.04356) (We support fine-tuning on AiShell-1.)
+
+If you are willing to contribute to icefall, please refer to [contributing](https://icefall.readthedocs.io/en/latest/contributing/index.html) for more details.
+
+We would like to highlight the performance of some of the recipes here.
+
+### [yesno][yesno]

 This is the simplest ASR recipe in `icefall` and can be run on CPU.
 Training takes less than 30 seconds and gives you the following WER:
@ -52,350 +92,264 @@ Training takes less than 30 seconds and gives you the following WER:
 We provide a Colab notebook for this recipe: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing)


-### LibriSpeech
+### [LibriSpeech][librispeech]

-Please see <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>
+Please see [RESULTS.md](https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md)
 for the **latest** results.

-We provide 5 models for this recipe:
-
- [conformer CTC model][LibriSpeech_conformer_ctc]
- [TDNN LSTM CTC model][LibriSpeech_tdnn_lstm_ctc]
- [Transducer: Conformer encoder + LSTM decoder][LibriSpeech_transducer]
- [Transducer: Conformer encoder + Embedding decoder][LibriSpeech_transducer_stateless]
- [Transducer: Zipformer encoder + Embedding decoder][LibriSpeech_zipformer]
-
-#### Conformer CTC Model
-
-The best WER we currently have is:
+#### [Conformer CTC](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conformer_ctc)

 |     | test-clean | test-other |
 |-----|------------|------------|
 | WER | 2.42       | 5.73       |


-We provide a Colab notebook to run a pre-trained conformer CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1huyupXAcHsUrKaWfI83iMEJ6J0Nh0213?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1huyupXAcHsUrKaWfI83iMEJ6J0Nh0213?usp=sharing)

-#### TDNN LSTM CTC Model
-
-The WER for this model is:
+#### [TDNN LSTM CTC](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/tdnn_lstm_ctc)

 |     | test-clean | test-other |
 |-----|------------|------------|
 | WER | 6.59       | 17.69      |

-We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-iSfQMp2So-We_Uu49N4AAcMInB72u9z?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1-iSfQMp2So-We_Uu49N4AAcMInB72u9z?usp=sharing)


-#### Transducer: Conformer encoder + LSTM decoder
+#### [Transducer (Conformer Encoder + LSTM Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/transducer)

-Using Conformer as encoder and LSTM as decoder.
+|               | test-clean | test-other |
+|---------------|------------|------------|
+| greedy_search | 3.07       | 7.51       |

-The best WER with greedy search is:
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1_u6yK9jDkPwG_NLrZMN2XK7Aeq4suMO2?usp=sharing)

-|     | test-clean | test-other |
-|-----|------------|------------|
-| WER | 3.07       | 7.51       |
+#### [Transducer (Conformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/transducer_stateless)

-We provide a Colab notebook to run a pre-trained RNN-T conformer model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1_u6yK9jDkPwG_NLrZMN2XK7Aeq4suMO2?usp=sharing)
-
-#### Transducer: Conformer encoder + Embedding decoder
-
-Using Conformer as encoder. The decoder consists of 1 embedding layer
-and 1 convolutional layer.
-
-The best WER using modified beam search with beam size 4 is:
-
-|     | test-clean | test-other |
-|-----|------------|------------|
-| WER | 2.56       | 6.27       |
-
-Note: No auxiliary losses are used in the training and no LMs are used
-in the decoding.
-
-We provide a Colab notebook to run a pre-trained transducer conformer + stateless decoder model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CO1bXJ-2khDckZIW8zjOPHGSKLHpTDlp?usp=sharing)
+|                                       | test-clean | test-other |
+|---------------------------------------|------------|------------|
+| modified_beam_search (`beam_size=4`) | 2.56       | 6.27       |


-#### k2 pruned RNN-T
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CO1bXJ-2khDckZIW8zjOPHGSKLHpTDlp?usp=sharing)
+
+
+#### [Transducer (Zipformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/zipformer)
+
+WER (modified_beam_search `beam_size=4` unless otherwise stated)
+
+1. LibriSpeech-960hr

 | Encoder         | Params | test-clean | test-other | epochs  | devices    |
 |-----------------|--------|------------|------------|---------|------------|
-| zipformer       | 65.5M  | 2.21       | 4.79       | 50      | 4 32G-V100 |
-| zipformer-small | 23.2M  | 2.42       | 5.73       | 50      | 2 32G-V100 |
-| zipformer-large | 148.4M | 2.06       | 4.63       | 50      | 4 32G-V100 |
-| zipformer-large | 148.4M | 2.00       | 4.38       | 174     | 8 80G-A100 |
+| Zipformer       | 65.5M  | 2.21       | 4.79       | 50      | 4 32G-V100 |
+| Zipformer-small | 23.2M  | 2.42       | 5.73       | 50      | 2 32G-V100 |
+| Zipformer-large | 148.4M | 2.06       | 4.63       | 50      | 4 32G-V100 |
+| Zipformer-large | 148.4M | 2.00       | 4.38       | 174     | 8 80G-A100 |

-Note: No auxiliary losses are used in the training and no LMs are used
-in the decoding.
+2. LibriSpeech-960hr + GigaSpeech

-#### k2 pruned RNN-T + GigaSpeech
-
-|     | test-clean | test-other |
-|-----|------------|------------|
-| WER | 1.78       | 4.08       |
-
-Note: No auxiliary losses are used in the training and no LMs are used
-in the decoding.
-
-#### k2 pruned RNN-T + GigaSpeech + CommonVoice
-
-|     | test-clean | test-other |
-|-----|------------|------------|
-| WER | 1.90       | 3.98       |
-
-Note: No auxiliary losses are used in the training and no LMs are used
-in the decoding.
+| Encoder         | Params | test-clean | test-other |
+|-----------------|--------|------------|------------|
+| Zipformer       | 65.5M   | 1.78       | 4.08       |


-### GigaSpeech
+3. LibriSpeech-960hr + GigaSpeech + CommonVoice

-We provide three models for this recipe:
+| Encoder         | Params | test-clean | test-other |
+|-----------------|--------|------------|------------|
+| Zipformer       | 65.5M   | 1.90       | 3.98       |

- [Conformer CTC model][GigaSpeech_conformer_ctc]
- [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][GigaSpeech_pruned_transducer_stateless2].
- [Transducer: Zipformer encoder + Embedding decoder][GigaSpeech_zipformer]

-#### Conformer CTC
+### [GigaSpeech][gigaspeech]
+
+#### [Conformer CTC](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR/conformer_ctc)

 |     |  Dev  | Test  |
 |-----|-------|-------|
 | WER | 10.47 | 10.58 |

-#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
+#### [Transducer (pruned_transducer_stateless2)](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR/pruned_transducer_stateless2)
+
+Conformer Encoder + Stateless Predictor + k2 Pruned RNN-T Loss

 |                      |  Dev  | Test  |
 |----------------------|-------|-------|
-|    greedy search     | 10.51 | 10.73 |
-|   fast beam search   | 10.50 | 10.69 |
-| modified beam search | 10.40 | 10.51 |
+|    greedy_search     | 10.51 | 10.73 |
+|   fast_beam_search   | 10.50 | 10.69 |
+| modified_beam_search | 10.40 | 10.51 |

-#### Transducer: Zipformer encoder + Embedding decoder
+#### [Transducer (Zipformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR/zipformer)

 |                      |  Dev  | Test  |
 |----------------------|-------|-------|
-|    greedy search     | 10.31 | 10.50 |
-|   fast beam search   | 10.26 | 10.48 |
-| modified beam search | 10.25 | 10.38 |
+|    greedy_search     | 10.31 | 10.50 |
+|   fast_beam_search   | 10.26 | 10.48 |
+| modified_beam_search | 10.25 | 10.38 |


-### Aishell
+### [Aishell][aishell]

-We provide three models for this recipe: [conformer CTC model][Aishell_conformer_ctc],
-[TDNN LSTM CTC model][Aishell_tdnn_lstm_ctc], and [Transducer Stateless Model][Aishell_pruned_transducer_stateless7],
-
-#### Conformer CTC Model
-
-The best CER we currently have is:
-
-|     | test |
-|-----|------|
-| CER | 4.26 |
-
-#### TDNN LSTM CTC Model
-
-The CER for this model is:
+#### [TDNN LSTM CTC](https://github.com/k2-fsa/icefall/tree/master/egs/aishell/ASR/tdnn_lstm_ctc)

 |     | test  |
 |-----|-------|
 | CER | 10.16 |

-We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model:  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1jbyzYq3ytm6j2nlEt-diQm-6QVWyDDEa?usp=sharing)
+We provide a Colab notebook to test the pre-trained model:  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1jbyzYq3ytm6j2nlEt-diQm-6QVWyDDEa?usp=sharing)

-#### Transducer Stateless Model
-
-The best CER we currently have is:
+#### [Transducer (Conformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/aishell/ASR/transducer_stateless)

 |     | test |
 |-----|------|
 | CER | 4.38 |

-We provide a Colab notebook to run a pre-trained TransducerStateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14XaT2MhnBkK-3_RqqWq3K90Xlbin-GZC?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/14XaT2MhnBkK-3_RqqWq3K90Xlbin-GZC?usp=sharing)
+
+#### [Transducer (Zipformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/aishell/ASR/zipformer)
+
+CER (modified_beam_search `beam_size=4`)
+
+| Encoder         | Params | dev | test | epochs  |
+|-----------------|--------|-----|------|---------|
+| Zipformer       | 73.4M  | 4.13| 4.40 | 55      |
+| Zipformer-small | 30.2M  | 4.40| 4.67 | 55      |
+| Zipformer-large | 157.3M | 4.03| 4.28 | 56      |


-### Aishell2
+### [Aishell4][aishell4]

-We provide one model for this recipe: [Transducer Stateless Model][Aishell2_pruned_transducer_stateless5].
-
-#### Transducer Stateless Model
-
-The best WER we currently have is:
-
-|     |   dev-ios  |  test-ios  |
-|-----|------------|------------|
-| WER |    5.32    |    5.56    |
-
-
-### Aishell4
-
-We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Aishell4_pruned_transducer_stateless5].
-
-#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with all subsets)
-
-The best CER we currently have is:
+#### [Transducer (pruned_transducer_stateless5)](https://github.com/k2-fsa/icefall/tree/master/egs/aishell4/ASR/pruned_transducer_stateless5)

+1 Trained with all subsets: 
 |     |   test     |
 |-----|------------|
 | CER |   29.08    |

-
-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)


-### TIMIT
+### [TIMIT][timit]

-We provide two models for this recipe: [TDNN LSTM CTC model][TIMIT_tdnn_lstm_ctc]
-and [TDNN LiGRU CTC model][TIMIT_tdnn_ligru_ctc].
+#### [TDNN LSTM CTC](https://github.com/k2-fsa/icefall/tree/master/egs/timit/ASR/tdnn_lstm_ctc)

-#### TDNN LSTM CTC Model
-
-The best PER we currently have is:
-
-||TEST|
-|--|--|
+|   |TEST|
+|---|----|
 |PER| 19.71% |

-We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Hs9DA4V96uapw_30uNp32OMJgkuR5VVd?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Hs9DA4V96uapw_30uNp32OMJgkuR5VVd?usp=sharing)

-#### TDNN LiGRU CTC Model
+#### [TDNN LiGRU CTC](https://github.com/k2-fsa/icefall/tree/master/egs/timit/ASR/tdnn_ligru_ctc)

-The PER for this model is:
-
-||TEST|
-|--|--|
+|   |TEST|
+|---|----|
 |PER| 17.66% |

-We provide a Colab notebook to run a pre-trained TDNN LiGRU CTC model:  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1z3lkURVv9M7uTiIgf3Np9IntMHEknaks?usp=sharing)


-### TED-LIUM3
+### [TED-LIUM3][tedlium3]

-We provide two models for this recipe: [Transducer Stateless: Conformer encoder + Embedding decoder][TED-LIUM3_transducer_stateless] and [Pruned Transducer Stateless: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][TED-LIUM3_pruned_transducer_stateless].
+#### [Transducer (Conformer Encoder + Stateless Predictor)](https://github.com/k2-fsa/icefall/tree/master/egs/tedlium3/ASR/transducer_stateless)

-#### Transducer Stateless:  Conformer encoder + Embedding decoder
-
-The best WER using modified beam search with beam size 4 is:
-
-|     |  dev  |  test  |
-|-----|-------|--------|
-| WER |  6.91 |  6.33  |
-
-Note: No auxiliary losses are used in the training and no LMs are used in the decoding.
-
-We provide a Colab notebook to run a pre-trained Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1MmY5bBxwvKLNT4A2DJnwiqRXhdchUqPN?usp=sharing)
-
-#### Pruned Transducer Stateless: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
-
-The best WER using modified beam search with beam size 4 is:
-
-|     |  dev  |  test  |
-|-----|-------|--------|
-| WER |  6.77 |  6.14  |
-
-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1je_1zGrOkGVVd4WLzgkXRHxl-I27yWtz?usp=sharing)
+|                                      |  dev  |  test  |
+|--------------------------------------|-------|--------|
+| modified_beam_search (`beam_size=4`) |  6.91 |  6.33  |


-### Aidatatang_200zh
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1MmY5bBxwvKLNT4A2DJnwiqRXhdchUqPN?usp=sharing)

-We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Aidatatang_200zh_pruned_transducer_stateless2].
+#### [Transducer (pruned_transducer_stateless)](https://github.com/k2-fsa/icefall/tree/master/egs/tedlium3/ASR/pruned_transducer_stateless)

-#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
+|                                      |  dev  |  test  |
+|--------------------------------------|-------|--------|
+| modified_beam_search (`beam_size=4`) |  6.77 |  6.14  |
+
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1je_1zGrOkGVVd4WLzgkXRHxl-I27yWtz?usp=sharing)
+
+
+### [Aidatatang_200zh][aidatatang_200zh]
+
+#### [Transducer (pruned_transducer_stateless2)](https://github.com/k2-fsa/icefall/tree/master/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2)

 |                      |  Dev  | Test  |
 |----------------------|-------|-------|
-|    greedy search     | 5.53  | 6.59  |
-|   fast beam search   | 5.30  | 6.34  |
-| modified beam search | 5.27  | 6.33  |
+|    greedy_search     | 5.53  | 6.59  |
+|   fast_beam_search   | 5.30  | 6.34  |
+| modified_beam_search | 5.27  | 6.33  |

-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wNSnSj3T5oOctbh5IGCa393gKOoQw2GH?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wNSnSj3T5oOctbh5IGCa393gKOoQw2GH?usp=sharing)


-### WenetSpeech
+### [WenetSpeech][wenetspeech]

-We provide some models for this recipe: [Pruned stateless RNN-T_2: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless2] and [Pruned stateless RNN-T_5: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][WenetSpeech_pruned_transducer_stateless5].
-
-#### Pruned stateless RNN-T_2: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with L subset, offline ASR)
+#### [Transducer (pruned_transducer_stateless2)](https://github.com/k2-fsa/icefall/tree/master/egs/wenetspeech/ASR/pruned_transducer_stateless2)

 |                      |  Dev  | Test-Net | Test-Meeting |
 |----------------------|-------|----------|--------------|
-|    greedy search     | 7.80  |  8.75    |  13.49       |
-| modified beam search| 7.76  |  8.71    |  13.41       |
-|   fast beam search   | 7.94  |  8.74    |  13.80       |
+|    greedy_search     | 7.80  |  8.75    |  13.49       |
+|   fast_beam_search   | 7.94  |  8.74    |  13.80       |
+| modified_beam_search | 7.76  |  8.71    |  13.41       |
+
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1EV4e1CHa1GZgEF-bZgizqI9RyFFehIiN?usp=sharing)
+
+#### [Transducer **Streaming** (pruned_transducer_stateless5)](https://github.com/k2-fsa/icefall/tree/master/egs/wenetspeech/ASR/pruned_transducer_stateless5)

-#### Pruned stateless RNN-T_5: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with L subset)
-**Streaming**:
 |                      |  Dev  | Test-Net | Test-Meeting |
 |----------------------|-------|----------|--------------|
 | greedy_search | 8.78 | 10.12 | 16.16 |
-| modified_beam_search | 8.53| 9.95 | 15.81 |
 | fast_beam_search| 9.01 | 10.47 | 16.28 |
+| modified_beam_search | 8.53| 9.95 | 15.81 |

-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless2 model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1EV4e1CHa1GZgEF-bZgizqI9RyFFehIiN?usp=sharing)

-### Alimeeting
+### [Alimeeting][alimeeting]

-We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][Alimeeting_pruned_transducer_stateless2].
-
-#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss (trained with far subset)
+#### [Transducer (pruned_transducer_stateless2)](https://github.com/k2-fsa/icefall/tree/master/egs/alimeeting/ASR/pruned_transducer_stateless2)

 |                      |  Eval  | Test-Net |
 |----------------------|--------|----------|
-|    greedy search     | 31.77  |  34.66   |
-|   fast beam search   | 31.39  |  33.02   |
-| modified beam search | 30.38  |  34.25   |
+|    greedy_search     | 31.77  |  34.66   |
+|   fast_beam_search   | 31.39  |  33.02   |
+| modified_beam_search | 30.38  |  34.25   |

-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tKr3f0mL17uO_ljdHGKtR7HOmthYHwJG?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tKr3f0mL17uO_ljdHGKtR7HOmthYHwJG?usp=sharing)


-### TAL_CSASR
+### [TAL_CSASR][tal_csasr]

-We provide one model for this recipe: [Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss][TAL_CSASR_pruned_transducer_stateless5].

-#### Pruned stateless RNN-T: Conformer encoder + Embedding decoder + k2 pruned RNN-T loss
+#### [Transducer (pruned_transducer_stateless5)](https://github.com/k2-fsa/icefall/tree/master/egs/tal_csasr/ASR/pruned_transducer_stateless5)

 The best results for Chinese CER(%) and English WER(%) respectively (zh: Chinese, en: English):
 |decoding-method | dev | dev_zh | dev_en | test | test_zh | test_en |
 |--|--|--|--|--|--|--|
 |greedy_search| 7.30 | 6.48 | 19.19 |7.39| 6.66 | 19.13|
-|modified_beam_search| 7.15 | 6.35 | 18.95 | 7.22| 6.50 | 18.70 |
 |fast_beam_search| 7.18 | 6.39| 18.90 |  7.27| 6.55 | 18.77|
+|modified_beam_search| 7.15 | 6.35 | 18.95 | 7.22| 6.50 | 18.70 |

-We provide a Colab notebook to run a pre-trained Pruned Transducer Stateless model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1DmIx-NloI1CMU5GdZrlse7TRu4y3Dpf8?usp=sharing)
+We provide a Colab notebook to test the pre-trained model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1DmIx-NloI1CMU5GdZrlse7TRu4y3Dpf8?usp=sharing)

-## Deployment with C++
+## TTS: Text-to-Speech

-Once you have trained a model in icefall, you may want to deploy it with C++,
-without Python dependencies.
+### Supported Datasets

-Please refer to the documentation
-<https://icefall.readthedocs.io/en/latest/recipes/Non-streaming-ASR/librispeech/conformer_ctc.html#deployment-with-c>
+  - [LJSpeech][ljspeech]
+  - [VCTK][vctk]
+
+### Supported Models
+
+  - [VITS](https://arxiv.org/abs/2106.06103)
+
+# Deployment with C++
+
+Once you have trained a model in icefall, you may want to deploy it with C++ without Python dependencies.
+
+Please refer to the [documentation](https://icefall.readthedocs.io/en/latest/recipes/Non-streaming-ASR/librispeech/conformer_ctc.html#deployment-with-c)
 for how to do this.

 We also provide a Colab notebook, showing you how to run a torch scripted model in [k2][k2] with C++.
 Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1BIGLWzS36isskMXHKcqC9ysN6pspYXs_?usp=sharing)


-[LibriSpeech_tdnn_lstm_ctc]: egs/librispeech/ASR/tdnn_lstm_ctc
-[LibriSpeech_conformer_ctc]: egs/librispeech/ASR/conformer_ctc
-[LibriSpeech_transducer]: egs/librispeech/ASR/transducer
-[LibriSpeech_transducer_stateless]: egs/librispeech/ASR/transducer_stateless
-[LibriSpeech_zipformer]: egs/librispeech/ASR/zipformer
-[Aishell_tdnn_lstm_ctc]: egs/aishell/ASR/tdnn_lstm_ctc
-[Aishell_conformer_ctc]: egs/aishell/ASR/conformer_ctc
-[Aishell_pruned_transducer_stateless7]: egs/aishell/ASR/pruned_transducer_stateless7_bbpe
-[Aishell2_pruned_transducer_stateless5]: egs/aishell2/ASR/pruned_transducer_stateless5
-[Aishell4_pruned_transducer_stateless5]: egs/aishell4/ASR/pruned_transducer_stateless5
-[TIMIT_tdnn_lstm_ctc]: egs/timit/ASR/tdnn_lstm_ctc
-[TIMIT_tdnn_ligru_ctc]: egs/timit/ASR/tdnn_ligru_ctc
-[TED-LIUM3_transducer_stateless]: egs/tedlium3/ASR/transducer_stateless
-[TED-LIUM3_pruned_transducer_stateless]: egs/tedlium3/ASR/pruned_transducer_stateless
-[GigaSpeech_conformer_ctc]: egs/gigaspeech/ASR/conformer_ctc
-[GigaSpeech_pruned_transducer_stateless2]: egs/gigaspeech/ASR/pruned_transducer_stateless2
-[GigaSpeech_zipformer]: egs/gigaspeech/ASR/zipformer
-[Aidatatang_200zh_pruned_transducer_stateless2]: egs/aidatatang_200zh/ASR/pruned_transducer_stateless2
-[WenetSpeech_pruned_transducer_stateless2]: egs/wenetspeech/ASR/pruned_transducer_stateless2
-[WenetSpeech_pruned_transducer_stateless5]: egs/wenetspeech/ASR/pruned_transducer_stateless5
-[Alimeeting_pruned_transducer_stateless2]: egs/alimeeting/ASR/pruned_transducer_stateless2
-[TAL_CSASR_pruned_transducer_stateless5]: egs/tal_csasr/ASR/pruned_transducer_stateless5
 [yesno]: egs/yesno/ASR
 [librispeech]: egs/librispeech/ASR
 [aishell]: egs/aishell/ASR
@ -411,3 +365,15 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
 [ami]: egs/ami
 [swbd]: egs/swbd/ASR
 [k2]: https://github.com/k2-fsa/k2
+[commonvoice]: egs/commonvoice/ASR
+[csj]: egs/csj/ASR
+[libricss]: egs/libricss/SURT
+[libriheavy]: egs/libriheavy/ASR
+[mgb2]: egs/mgb2/ASR
+[peoplespeech]: egs/peoplespeech/ASR
+[spgispeech]: egs/spgispeech/ASR
+[voxpopuli]: egs/voxpopuli/ASR
+[xbmu-amdo31]: egs/xbmu-amdo31/ASR
+
+[vctk]: egs/vctk/TTS
+[ljspeech]: egs/ljspeech/TTS
--- a/docker/torch1.12.1-cuda11.3.dockerfile
+++ b/docker/torch1.12.1-cuda11.3.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.7
-ARG K2_VERSION="1.24.4.dev20230725+cuda11.3.torch1.12.1"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.3.torch1.12.1"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.3.torch1.12.1"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.3.torch1.12.1"
 ARG TORCHAUDIO_VERSION="0.12.1+cu113"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@ -55,6 +55,8 @@ RUN pip install --no-cache-dir \
      onnx \
      onnxruntime \
      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
      multi_quantization \
      typeguard \
      numpy \
--- a/docker/torch1.13.0-cuda11.6.dockerfile
+++ b/docker/torch1.13.0-cuda11.6.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.9
-ARG K2_VERSION="1.24.4.dev20231021+cuda11.6.torch1.13.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.6.torch1.13.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.6.torch1.13.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.6.torch1.13.0"
 ARG TORCHAUDIO_VERSION="0.13.0+cu116"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@ -55,6 +55,8 @@ RUN pip install --no-cache-dir \
      onnx \
      onnxruntime \
      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
      multi_quantization \
      typeguard \
      numpy \
--- a/docker/torch1.9.0-cuda10.2.dockerfile
+++ b/docker/torch1.9.0-cuda10.2.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.7
-ARG K2_VERSION="1.24.3.dev20230726+cuda10.2.torch1.9.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda10.2.torch1.9.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda10.2.torch1.9.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda10.2.torch1.9.0"
 ARG TORCHAUDIO_VERSION="0.9.0"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@ -69,6 +69,8 @@ RUN pip uninstall -y tqdm && \
      onnx \
      onnxruntime \
      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
      multi_quantization \
      typeguard \
      numpy \
--- a/docker/torch2.0.0-cuda11.7.dockerfile
+++ b/docker/torch2.0.0-cuda11.7.dockerfile
@ -1,12 +1,13 @@
 FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
+# python 3.10

 ENV LC_ALL C.UTF-8

 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20231021+cuda11.7.torch2.0.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.7.torch2.0.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.7.torch2.0.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.7.torch2.0.0"
 ARG TORCHAUDIO_VERSION="2.0.0+cu117"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@ -55,6 +56,8 @@ RUN pip install --no-cache-dir \
      onnx \
      onnxruntime \
      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
      multi_quantization \
      typeguard \
      numpy \
--- a/docker/torch2.1.0-cuda11.8.dockerfile
+++ b/docker/torch2.1.0-cuda11.8.dockerfile
@ -1,12 +1,13 @@
 FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel
+# python 3.10

 ENV LC_ALL C.UTF-8

 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20231021+cuda11.8.torch2.1.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda11.8.torch2.1.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.8.torch2.1.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.8.torch2.1.0"
 ARG TORCHAUDIO_VERSION="2.1.0+cu118"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@ -55,6 +56,8 @@ RUN pip install --no-cache-dir \
      onnx \
      onnxruntime \
      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
      multi_quantization \
      typeguard \
      numpy \
--- a/docker/torch2.1.0-cuda12.1.dockerfile
+++ b/docker/torch2.1.0-cuda12.1.dockerfile
@ -1,12 +1,13 @@
 FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel
+# python 3.10

 ENV LC_ALL C.UTF-8

 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20231021+cuda12.1.torch2.1.0"
-ARG KALDIFEAT_VERSION="1.25.1.dev20231022+cuda12.1.torch2.1.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda12.1.torch2.1.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda12.1.torch2.1.0"
 ARG TORCHAUDIO_VERSION="2.1.0+cu121"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
@ -55,6 +56,8 @@ RUN pip install --no-cache-dir \
      onnx \
      onnxruntime \
      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
      multi_quantization \
      typeguard \
      numpy \
--- a/docker/torch2.2.0-cuda11.8.dockerfile
+++ b/docker/torch2.2.0-cuda11.8.dockerfile
@ -0,0 +1,73 @@
+FROM pytorch/pytorch:2.2.0-cuda11.8-cudnn8-devel
+# python 3.10
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Pinned wheel versions; the "+cuda11.8.torch2.2.0" / "+cu118" suffixes correspond to the base image above.
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.8.torch2.2.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.8.torch2.2.0"
+ARG TORCHAUDIO_VERSION="2.2.0+cu118"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+        libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies. Version specifiers are quoted: an unquoted ">=" is parsed by the shell as a redirection.
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
+      multi_quantization \
+      numpy \
+      pytest \
+      graphviz
+
+# Fetch icefall and install the Python requirements listed in its requirements.txt.
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch2.2.0-cuda12.1.dockerfile
+++ b/docker/torch2.2.0-cuda12.1.dockerfile
@ -0,0 +1,73 @@
+FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-devel
+# python 3.10
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20240223+cuda12.1.torch2.2.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda12.1.torch2.2.0"
+ARG TORCHAUDIO_VERSION="2.2.0+cu121"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies. Specifiers containing '>' are quoted so /bin/sh does not parse them as output redirections.
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch2.2.1-cuda11.8.dockerfile
+++ b/docker/torch2.2.1-cuda11.8.dockerfile
@ -0,0 +1,73 @@
+FROM pytorch/pytorch:2.2.1-cuda11.8-cudnn8-devel
+# python 3.10
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.8.torch2.2.1"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.8.torch2.2.1"
+ARG TORCHAUDIO_VERSION="2.2.1+cu118"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies. Specifiers containing '>' are quoted so /bin/sh does not parse them as output redirections.
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch2.2.1-cuda12.1.dockerfile
+++ b/docker/torch2.2.1-cuda12.1.dockerfile
@ -0,0 +1,73 @@
+FROM pytorch/pytorch:2.2.1-cuda12.1-cudnn8-devel
+# python 3.10
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20240223+cuda12.1.torch2.2.1"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda12.1.torch2.2.1"
+ARG TORCHAUDIO_VERSION="2.2.1+cu121"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies. Specifiers containing '>' are quoted so /bin/sh does not parse them as output redirections.
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch2.2.2-cuda11.8.dockerfile
+++ b/docker/torch2.2.2-cuda11.8.dockerfile
@ -0,0 +1,73 @@
+FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-devel
+# python 3.10
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20240328+cuda11.8.torch2.2.2"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240329+cuda11.8.torch2.2.2"
+ARG TORCHAUDIO_VERSION="2.2.2+cu118"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies. Specifiers containing '>' are quoted so /bin/sh does not parse them as output redirections.
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch2.2.2-cuda12.1.dockerfile
+++ b/docker/torch2.2.2-cuda12.1.dockerfile
@ -0,0 +1,73 @@
+FROM pytorch/pytorch:2.2.2-cuda12.1-cudnn8-devel
+# python 3.10
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20240328+cuda12.1.torch2.2.2"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240329+cuda12.1.torch2.2.2"
+ARG TORCHAUDIO_VERSION="2.2.2+cu121"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies. Specifiers containing '>' are quoted so /bin/sh does not parse them as output redirections.
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      "sentencepiece>=0.1.96" \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      onnxoptimizer \
+      onnxsim \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docs/source/decoding-with-langugage-models/LODR.rst
+++ b/docs/source/decoding-with-langugage-models/LODR.rst
@ -30,7 +30,7 @@ of langugae model integration.
 First, let's have a look at some background information. As the predecessor of LODR, Density Ratio (DR) is first proposed `here <https://arxiv.org/abs/2002.11268>`_
 to address the language information mismatch between the training
 corpus (source domain) and the testing corpus (target domain). Assuming that the source domain and the test domain
-are acoustically similar, DR derives the following formular for decoding with Bayes' theorem:
+are acoustically similar, DR derives the following formula for decoding with Bayes' theorem:

 .. math::

@ -41,7 +41,7 @@ are acoustically similar, DR derives the following formular for decoding with Ba


 where :math:`\lambda_1` and :math:`\lambda_2` are the weights of LM scores for target domain and source domain respectively.
-Here, the source domain LM is trained on the training corpus. The only difference in the above formular compared to
+Here, the source domain LM is trained on the training corpus. The only difference in the above formula compared to
 shallow fusion is the subtraction of the source domain LM.

 Some works treat the predictor and the joiner of the neural transducer as its internal LM. However, the LM is
@ -58,7 +58,7 @@ during decoding for transducer model:

 In LODR, an additional bi-gram LM estimated on the source domain (e.g training corpus) is required. Compared to DR,
 the only difference lies in the choice of source domain LM. According to the original `paper <https://arxiv.org/abs/2203.16776>`_,
-LODR achieves similar performance compared DR in both intra-domain and cross-domain settings.
+LODR achieves similar performance compared to DR in both intra-domain and cross-domain settings.
 As a bi-gram is much faster to evaluate, LODR is usually much faster.

 Now, we will show you how to use LODR in ``icefall``.
--- a/docs/source/decoding-with-langugage-models/shallow-fusion.rst
+++ b/docs/source/decoding-with-langugage-models/shallow-fusion.rst
@ -139,7 +139,7 @@ A few parameters can be tuned to further boost the performance of shallow fusion
 - ``--lm-scale``

    Controls the scale of the LM. If too small, the external language model may not be fully utilized; if too large,
-    the LM score may dominant during decoding, leading to bad WER. A typical value of this is around 0.3.
+    the LM score might be dominant during decoding, leading to bad WER. A typical value of this is around 0.3.

 - ``--beam-size``

--- a/docs/source/docker/intro.rst
+++ b/docs/source/docker/intro.rst
@ -34,6 +34,12 @@ which will give you something like below:

 .. code-block:: bash

+  "torch2.2.2-cuda12.1"
+  "torch2.2.2-cuda11.8"
+  "torch2.2.1-cuda12.1"
+  "torch2.2.1-cuda11.8"
+  "torch2.2.0-cuda12.1"
+  "torch2.2.0-cuda11.8"
  "torch2.1.0-cuda12.1"
  "torch2.1.0-cuda11.8"
  "torch2.0.0-cuda11.7"
--- a/docs/source/for-dummies/environment-setup.rst
+++ b/docs/source/for-dummies/environment-setup.rst
@ -74,6 +74,10 @@ to install dependencies of `icefall`_:

   pip install k2==1.24.4.dev20231220+cpu.torch2.0.0 -f https://k2-fsa.github.io/k2/cpu.html

+   # For users from China
+   # 中国国内用户，如果访问不了 huggingface, 请使用
+   # pip install k2==1.24.4.dev20231220+cpu.torch2.0.0 -f https://k2-fsa.github.io/k2/cpu-cn.html
+
   # Install the latest version of lhotse

   pip install git+https://github.com/lhotse-speech/lhotse
--- a/docs/source/installation/index.rst
+++ b/docs/source/installation/index.rst
@ -206,6 +206,9 @@ We will install `k2`_ from pre-compiled wheels by following
 .. code-block:: bash

  (test-icefall) kuangfangjun:~$ pip install k2==1.24.3.dev20230725+cuda11.6.torch1.13.0 -f https://k2-fsa.github.io/k2/cuda.html
+  # For users from China
+  # 中国国内用户，如果访问不了 huggingface, 请使用
+  # pip install k2==1.24.3.dev20230725+cuda11.6.torch1.13.0 -f https://k2-fsa.github.io/k2/cuda-cn.html

  Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
  Looking in links: https://k2-fsa.github.io/k2/cuda.html
--- a/docs/source/recipes/Finetune/adapter/finetune_adapter.rst
+++ b/docs/source/recipes/Finetune/adapter/finetune_adapter.rst
@ -0,0 +1,225 @@
+Finetune from a pre-trained Zipformer model with adapters
+=========================================================
+
+This tutorial shows you how to fine-tune a pre-trained **Zipformer**
+transducer model on a new dataset with adapters. 
+Adapters are compact and efficient modules that can be integrated into a pre-trained model
+to improve the model's performance on a new domain. Adapters are injected
+between different modules in the well-trained neural network. During training, only the parameters
+in the adapters will be updated. It achieves competitive performance
+while requiring much less GPU memory than full fine-tuning. For more details about adapters,
+please refer to the original `paper <https://arxiv.org/pdf/1902.00751.pdf#/>`_.
+
+.. HINT::
+
+  We assume you have read the page :ref:`install icefall` and have set up
+  the environment for ``icefall``.
+
+.. HINT::
+
+  We recommend that you use a GPU or several GPUs to run this recipe.
+
+For illustration purposes, we fine-tune the Zipformer transducer model
+pre-trained on `LibriSpeech`_ on the small subset of `GigaSpeech`_. You could use your
+own data for fine-tuning if you create a manifest for your new dataset.
+
+Data preparation
+----------------
+
+Please follow the instructions in the `GigaSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR>`_
+to prepare the fine-tune data used in this tutorial. We only require the small subset in GigaSpeech for this tutorial.
+
+
+Model preparation
+-----------------
+
+We are using the Zipformer model trained on full LibriSpeech (960 hours) as the initialization. The
+checkpoint of the model can be downloaded via the following command:
+
+.. code-block:: bash
+
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+    $ cd icefall-asr-librispeech-zipformer-2023-05-15/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt
+    $ cd ../data/lang_bpe_500
+    $ git lfs pull --include bpe.model
+    $ cd ../../..
+
+Before fine-tuning, let's test the model's WER on the new domain. The following command performs
+decoding on the GigaSpeech test sets:
+
+.. code-block:: bash
+
+    ./zipformer/decode_gigaspeech.py \
+        --epoch 99 \
+        --avg 1 \
+        --exp-dir icefall-asr-librispeech-zipformer-2023-05-15/exp \
+        --use-averaged-model 0 \
+        --max-duration 1000 \
+        --decoding-method greedy_search
+
+You should see the following numbers:
+
+.. code-block::
+
+    For dev, WER of different settings are:
+    greedy_search	20.06	best for dev
+
+    For test, WER of different settings are:
+    greedy_search	19.27	best for test
+
+
+Fine-tune with adapter
+----------------------
+
+We insert 4 adapters with residual connections in each ``Zipformer2EncoderLayer``.
+The original model parameters remain untouched during training and only the parameters of
+the adapters are updated. The following command starts a fine-tuning experiment with adapters:
+
+.. code-block:: bash
+    
+    $ do_finetune=1
+    $ use_adapters=1
+    $ adapter_dim=8
+
+    $ ./zipformer_adapter/train.py \
+        --world-size 2 \
+        --num-epochs 20 \
+        --start-epoch 1 \
+        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
+        --use-fp16 1 \
+        --base-lr 0.045 \
+        --use-adapters $use_adapters --adapter-dim $adapter_dim \
+        --bpe-model data/lang_bpe_500/bpe.model \
+        --do-finetune $do_finetune \
+        --master-port 13022 \
+        --finetune-ckpt icefall-asr-librispeech-zipformer-2023-05-15/exp/pretrained.pt \
+        --max-duration 1000
+
+The following arguments are related to fine-tuning:
+
+- ``--do-finetune``
+    If True, do fine-tuning by initializing the model from a pre-trained checkpoint.
+    **Note that if you want to resume your fine-tuning experiment from certain epochs, you
+    need to set this to False.**
+
+- ``--use-adapters``
+    Whether to use adapters during fine-tuning.
+
+- ``--adapter-dim``
+    The bottleneck dimension of the adapter module. Typically a small number.
+
+You should notice that in the training log, the total number of trainable parameters is shown:
+
+.. code-block::
+
+    2024-02-22 21:22:03,808 INFO [train.py:1277] A total of 761344 trainable parameters (1.148% of the whole model)
+    
+The trainable parameters make up only 1.15% of the entire model parameters, so the training will be much faster
+and require less memory than full fine-tuning.
+
+
+Decoding
+--------
+
+After training, let's test the WERs. To test the WERs on the GigaSpeech set,
+you can execute the following command:
+
+.. code-block:: bash
+
+    $ epoch=20
+    $ avg=10
+    $ use_adapters=1
+    $ adapter_dim=8
+    
+    $ ./zipformer/decode.py \
+        --epoch $epoch \
+        --avg $avg \
+        --use-averaged-model 1 \
+        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
+        --max-duration 600 \
+        --use-adapters $use_adapters \
+        --adapter-dim $adapter_dim \
+        --decoding-method greedy_search
+
+You should see the following numbers:
+
+.. code-block::
+
+    For dev, WER of different settings are:
+    greedy_search	15.44	best for dev
+
+    For test, WER of different settings are:
+    greedy_search	15.42	best for test
+
+
+The WER on the test set is improved from 19.27 to 15.42, demonstrating the effectiveness of adapters.
+
+The same model can be used to perform decoding on LibriSpeech test sets. You can deactivate the adapters
+to keep the same performance of the original model:
+
+.. code-block:: bash
+
+    $ epoch=20
+    $ avg=1
+    $ use_adapters=0
+    $ adapter_dim=8
+    
+    $ ./zipformer/decode.py \
+        --epoch $epoch \
+        --avg $avg \
+        --use-averaged-model 1 \
+        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
+        --max-duration 600 \
+        --use-adapters $use_adapters \
+        --adapter-dim $adapter_dim \
+        --decoding-method greedy_search
+
+
+.. code-block::
+
+    For dev, WER of different settings are:
+    greedy_search	2.23	best for test-clean
+
+    For test, WER of different settings are:
+    greedy_search	4.96	best for test-other
+
+The numbers are the same as reported in `icefall <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md#normal-scaled-model-number-of-model-parameters-65549011-ie-6555-m>`_. So adapter-based
+fine-tuning is also very flexible as the same model can be used for decoding on the original and target domain.
+
+
+Export the model
+----------------
+
+After training, the model can be exported to ``onnx`` format easily using the following command:
+
+.. code-block:: bash
+
+    $ use_adapters=1
+    $ adapter_dim=8
+
+    $ ./zipformer_adapter/export-onnx.py \
+        --tokens icefall-asr-librispeech-zipformer-2023-05-15/data/lang_bpe_500/tokens.txt \
+        --use-averaged-model 1 \
+        --epoch 20 \
+        --avg 10 \
+        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
+        --use-adapters $use_adapters \
+        --adapter-dim $adapter_dim \
+        --num-encoder-layers "2,2,3,4,3,2" \
+        --downsampling-factor "1,2,4,8,4,2" \
+        --feedforward-dim "512,768,1024,1536,1024,768" \
+        --num-heads "4,4,4,8,4,4" \
+        --encoder-dim "192,256,384,512,384,256" \
+        --query-head-dim 32 \
+        --value-head-dim 12 \
+        --pos-head-dim 4 \
+        --pos-dim 48 \
+        --encoder-unmasked-dim "192,192,256,256,256,192" \
+        --cnn-module-kernel "31,31,15,15,15,31" \
+        --decoder-dim 512 \
+        --joiner-dim 512 \
+        --causal False \
+        --chunk-size "16,32,64,-1" \
+        --left-context-frames "64,128,256,-1"
--- a/docs/source/recipes/Finetune/from_supervised/finetune_zipformer.rst
+++ b/docs/source/recipes/Finetune/from_supervised/finetune_zipformer.rst
@ -0,0 +1,140 @@
+Finetune from a supervised pre-trained Zipformer model
+======================================================
+
+This tutorial shows you how to fine-tune a supervised pre-trained **Zipformer**
+transducer model on a new dataset.
+
+.. HINT::
+
+  We assume you have read the page :ref:`install icefall` and have set up
+  the environment for ``icefall``.
+
+.. HINT::
+
+  We recommend that you use a GPU or several GPUs to run this recipe.
+
+
+For illustration purposes, we fine-tune the Zipformer transducer model
+pre-trained on `LibriSpeech`_ on the small subset of `GigaSpeech`_. You could use your
+own data for fine-tuning if you create a manifest for your new dataset.
+
+Data preparation
+----------------
+
+Please follow the instructions in the `GigaSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR>`_
+to prepare the fine-tune data used in this tutorial. We only require the small subset in GigaSpeech for this tutorial.
+
+
+Model preparation
+-----------------
+
+We are using the Zipformer model trained on full LibriSpeech (960 hours) as the initialization. The
+checkpoint of the model can be downloaded via the following command:
+
+.. code-block:: bash
+
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+    $ cd icefall-asr-librispeech-zipformer-2023-05-15/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt
+    $ cd ../data/lang_bpe_500
+    $ git lfs pull --include bpe.model
+    $ cd ../../..
+
+Before fine-tuning, let's test the model's WER on the new domain. The following command performs
+decoding on the GigaSpeech test sets:
+
+.. code-block:: bash
+
+    ./zipformer/decode_gigaspeech.py \
+        --epoch 99 \
+        --avg 1 \
+        --exp-dir icefall-asr-librispeech-zipformer-2023-05-15/exp \
+        --use-averaged-model 0 \
+        --max-duration 1000 \
+        --decoding-method greedy_search
+
+You should see the following numbers:
+
+.. code-block::
+
+    For dev, WER of different settings are:
+    greedy_search	20.06	best for dev
+
+    For test, WER of different settings are:
+    greedy_search	19.27	best for test
+
+
+Fine-tune
+---------
+
+Since LibriSpeech and GigaSpeech are both English datasets, we can initialize the whole
+Zipformer model with the checkpoint downloaded in the previous step (otherwise we should consider
+initializing the stateless decoder and joiner from scratch due to the mismatch of the output
+vocabulary). The following command starts a fine-tuning experiment:
+
+.. code-block:: bash
+
+    $ use_mux=0
+    $ do_finetune=1
+
+    $ ./zipformer/finetune.py \
+        --world-size 2 \
+        --num-epochs 20 \
+        --start-epoch 1 \
+        --exp-dir zipformer/exp_giga_finetune${do_finetune}_mux${use_mux} \
+        --use-fp16 1 \
+        --base-lr 0.0045 \
+        --bpe-model data/lang_bpe_500/bpe.model \
+        --do-finetune $do_finetune \
+        --use-mux $use_mux \
+        --master-port 13024 \
+        --finetune-ckpt icefall-asr-librispeech-zipformer-2023-05-15/exp/pretrained.pt \
+        --max-duration 1000
+
+The following arguments are related to fine-tuning:
+
+- ``--base-lr``
+    The learning rate used for fine-tuning. We suggest setting a **small** learning rate for fine-tuning,
+    otherwise the model may forget the initialization very quickly. A reasonable value should be around
+    1/10 of the original lr, i.e., 0.0045.
+
+- ``--do-finetune``
+    If True, do fine-tuning by initializing the model from a pre-trained checkpoint.
+    **Note that if you want to resume your fine-tuning experiment from certain epochs, you
+    need to set this to False.**
+
+- ``--finetune-ckpt``
+    The path to the pre-trained checkpoint (used for initialization).
+
+- ``--use-mux``
+    If True, mix the fine-tune data with the original training data by using `CutSet.mux <https://lhotse.readthedocs.io/en/latest/api.html#lhotse.supervision.SupervisionSet.mux>`_.
+    This helps maintain the model's performance on the original domain if the original training
+    data is available. **If you don't have the original training data, please set it to False.**
+
+After fine-tuning, let's test the WERs. You can do this via the following command:
+
+.. code-block:: bash
+
+    $ use_mux=0
+    $ do_finetune=1
+    $ ./zipformer/decode_gigaspeech.py \
+        --epoch 20 \
+        --avg 10 \
+        --exp-dir zipformer/exp_giga_finetune${do_finetune}_mux${use_mux} \
+        --use-averaged-model 1 \
+        --max-duration 1000 \
+        --decoding-method greedy_search
+
+You should see numbers similar to the ones below:
+
+.. code-block:: text
+
+    For dev, WER of different settings are:
+    greedy_search	13.47	best for dev
+
+    For test, WER of different settings are:
+    greedy_search	13.66	best for test
+
+Compared to the original checkpoint, the fine-tuned model achieves much lower WERs
+on the GigaSpeech test sets.
--- a/docs/source/recipes/Finetune/index.rst
+++ b/docs/source/recipes/Finetune/index.rst
@ -0,0 +1,16 @@
+Fine-tune a pre-trained model
+=============================
+
+After pre-training on publicly available datasets, the ASR model is already capable of
+performing general speech recognition with relatively high accuracy. However, the accuracy
+could still be low on certain domains that are quite different from the original training
+set. In this case, we can fine-tune the model with a small amount of additional labelled
+data to improve the performance on new domains.
+
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Table of Contents
+
+   from_supervised/finetune_zipformer
+   adapter/finetune_adapter
--- a/docs/source/recipes/TTS/ljspeech/vits.rst
+++ b/docs/source/recipes/TTS/ljspeech/vits.rst
@ -1,4 +1,4 @@
-VITS
+VITS-LJSpeech
 ===============

 This tutorial shows you how to train an VITS model
@ -13,6 +13,14 @@ with the `LJSpeech <https://keithito.com/LJ-Speech-Dataset/>`_ dataset.
   The VITS paper: `Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech <https://arxiv.org/pdf/2106.06103.pdf>`_


+Install extra dependencies
+--------------------------
+
+.. code-block:: bash
+
+  pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
+  pip install numba espnet_tts_frontend
+
 Data preparation
 ----------------

@ -56,7 +64,8 @@ Training
      --start-epoch 1 \
      --use-fp16 1 \
      --exp-dir vits/exp \
-      --tokens data/tokens.txt
+      --tokens data/tokens.txt \
+      --model-type high \
      --max-duration 500

 .. note::
@ -64,6 +73,11 @@ Training
    You can adjust the hyper-parameters to control the size of the VITS model and
    the training configurations. For more details, please run ``./vits/train.py --help``.

+.. warning::
+
+   If you want a model that runs faster on CPU, please use ``--model-type low``
+   or ``--model-type medium``.
+
 .. note::

    The training can take a long time (usually a couple of days).
@ -95,8 +109,8 @@ training part first. It will save the ground-truth and generated wavs to the dir
 Export models
 -------------

-Currently we only support ONNX model exporting. It will generate two files in the given ``exp-dir``:
-``vits-epoch-*.onnx`` and ``vits-epoch-*.int8.onnx``.
+Currently we only support ONNX model exporting. It will generate one file in the given ``exp-dir``:
+``vits-epoch-*.onnx``.

 .. code-block:: bash

@ -120,4 +134,68 @@ Download pretrained models
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following link:

-  - `<https://huggingface.co/Zengwei/icefall-tts-ljspeech-vits-2023-11-29>`_
+  - ``--model-type=high``: `<https://huggingface.co/Zengwei/icefall-tts-ljspeech-vits-2024-02-28>`_
+  - ``--model-type=medium``: `<https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-medium-2024-03-12>`_
+  - ``--model-type=low``: `<https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-low-2024-03-12>`_
+
+Usage in sherpa-onnx
+--------------------
+
+The following describes how to test the exported ONNX model in `sherpa-onnx`_.
+
+.. hint::
+
+   `sherpa-onnx`_ supports different programming languages, e.g., C++, C, Python,
+   Kotlin, Java, Swift, Go, C#, etc. It also supports Android and iOS.
+
+   We only describe how to use pre-built binaries from `sherpa-onnx`_ below.
+   Please refer to `<https://k2-fsa.github.io/sherpa/onnx/>`_
+   for more documentation.
+
+Install sherpa-onnx
+^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+   pip install sherpa-onnx
+
+To check that you have installed `sherpa-onnx`_ successfully, please run:
+
+.. code-block:: bash
+
+   which sherpa-onnx-offline-tts
+   sherpa-onnx-offline-tts --help
+
+Download lexicon files
+^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+   cd /tmp
+   wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+   tar xf espeak-ng-data.tar.bz2
+
+Run sherpa-onnx
+^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+  cd egs/ljspeech/TTS
+
+  sherpa-onnx-offline-tts \
+    --vits-model=vits/exp/vits-epoch-1000.onnx \
+    --vits-tokens=data/tokens.txt \
+    --vits-data-dir=/tmp/espeak-ng-data \
+    --num-threads=1 \
+    --output-filename=./high.wav \
+    "Ask not what your country can do for you; ask what you can do for your country."
+
+.. hint::
+
+   You can also use ``sherpa-onnx-offline-tts-play`` to play the audio
+   as it is being generated.
+
+You should get a file ``high.wav`` after running the above command.
+
+Congratulations! You have successfully trained and exported a text-to-speech
+model and run it with `sherpa-onnx`_.
--- a/docs/source/recipes/TTS/vctk/vits.rst
+++ b/docs/source/recipes/TTS/vctk/vits.rst
@ -1,4 +1,4 @@
-VITS
+VITS-VCTK
 ===============

 This tutorial shows you how to train an VITS model
--- a/docs/source/recipes/index.rst
+++ b/docs/source/recipes/index.rst
@ -17,3 +17,4 @@ We may add recipes for other tasks as well in the future.
   Streaming-ASR/index
   RNN-LM/index
   TTS/index
+   Finetune/index
--- a/egs/aidatatang_200zh/ASR/prepare.sh
+++ b/egs/aidatatang_200zh/ASR/prepare.sh
@ -16,8 +16,8 @@ perturb_speed=true
 #
 #  - $dl_dir/aidatatang_200zh
 #      You can find "corpus" and "transcript" inside it.
-#      You can download it at
-#       https://openslr.org/62/
+#      You can download it at https://openslr.org/62/
+#      If you download the data by yourself, DON'T FORGET to extract the *.tar.gz files under corpus.

 dl_dir=$PWD/download

--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py
@ -288,8 +288,9 @@ class Aidatatang_200zhAsrDataModule:
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
+                buffer_size=self.args.num_buckets * 2000,
+                shuffle_buffer_size=self.args.num_buckets * 5000,
                drop_last=True,
-                buffer_size=50000,
            )
        else:
            logging.info("Using SimpleCutSampler.")
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/export.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/export.py
@ -1,3 +1,4 @@
+#!/usr/bin/env python3
 # Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
@ -20,7 +21,7 @@
 Usage:
 ./pruned_transducer_stateless2/export.py \
  --exp-dir ./pruned_transducer_stateless2/exp \
-  --lang-dir data/lang_char \
+  --tokens data/lang_char/tokens.txt \
  --epoch 29 \
  --avg 19

@ -45,12 +46,13 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
+from scaling_converter import convert_scaled_to_non_scaled
 from train import get_params, get_transducer_model

 from icefall.checkpoint import average_checkpoints, load_checkpoint
-from icefall.lexicon import Lexicon
-from icefall.utils import str2bool
+from icefall.utils import num_tokens, str2bool


 def get_parser():
@ -85,10 +87,10 @@ def get_parser():
    )

    parser.add_argument(
-        "--lang-dir",
+        "--tokens",
        type=str,
-        default="data/lang_char",
-        help="The lang dir",
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt.",
    )

    parser.add_argument(
@ -122,10 +124,14 @@ def main():

    logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
+    # Load tokens.txt here
+    token_table = k2.SymbolTable.from_file(params.tokens)

-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    # Load id of the <blk> token and the vocab size
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = token_table["<blk>"]
+    params.unk_id = token_table["<unk>"]
+    params.vocab_size = num_tokens(token_table) + 1  # +1 for <blk>

    logging.info(params)

@ -152,6 +158,7 @@ def main():
    model.eval()

    if params.jit:
+        convert_scaled_to_non_scaled(model, inplace=True)
        # We won't use the forward() method of the model in C++, so just ignore
        # it here.
        # Otherwise, one of its arguments is a ragged tensor and is not
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/lstmp.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/lstmp.py
@ -0,0 +1 @@
+../../../librispeech/ASR/lstm_transducer_stateless2/lstmp.py
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/scaling_converter.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/scaling_converter.py
@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless3/scaling_converter.py
--- a/egs/aishell/ASR/README.md
+++ b/egs/aishell/ASR/README.md
@ -19,8 +19,17 @@ The following table lists the differences among them.
 | `transducer_stateless_modified`    | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer`                     |
 | `transducer_stateless_modified-2`  | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` + extra data      |
 | `pruned_transducer_stateless3`     | Conformer (reworked) | Embedding + Conv1d | pruned RNN-T + reworked model with random combiner + using aidatatang_20zh as extra data|
-| `pruned_transducer_stateless7`     | Zipformer | Embedding | pruned RNN-T + zipformer encoder + stateless decoder with context-size 1 |
+| `pruned_transducer_stateless7`     | Zipformer | Embedding | pruned RNN-T + zipformer encoder + stateless decoder with context-size set to 1 |
+| `zipformer`                           | Upgraded Zipformer | Embedding + Conv1d | The latest recipe with context-size set to 1 |
+

 The decoder in `transducer_stateless` is modified from the paper
 [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
 We place an additional Conv1d layer right after the input embedding layer.
+
+# Whisper
+
+Recipe to finetune large pretrained models
+|                                    | Encoder   | Decoder            | Comment                                                                           |
+|------------------------------------|-----------|--------------------|-----------------------------------------------------------------------------------|
+| `whisper`             | Transformer | Transformer | support fine-tuning using deepspeed |
--- a/egs/aishell/ASR/RESULTS.md
+++ b/egs/aishell/ASR/RESULTS.md
@ -1,10 +1,120 @@
 ## Results

+### Aishell training results (Fine-tuning Pretrained Models)
+#### Whisper
+[./whisper](./whisper)
+##### fine-tuning results on Aishell test set on whisper medium, large-v2, large-v3
+
+|                        | test (before fine-tuning) | test (after fine-tuning)  | comment                                 |
+|------------------------|------|------|-----------------------------------------|
+| medium         | 7.23 | 3.27 | --epoch 10 --avg 4,  ddp                         |
+| large-v2       | 6.56 | 2.47 | --epoch 10 --avg 6,  deepspeed zero stage1       |
+| large-v3       | 6.06 | 2.84 | --epoch 5 --avg 3,   deepspeed zero stage1       |
+
+Command for training is:
+```bash
+pip install -r whisper/requirements.txt
+
+./prepare.sh --stage 30 --stop_stage 30
+
+# fine-tuning with deepspeed zero stage 1
+torchrun --nproc-per-node 8 ./whisper/train.py \
+  --max-duration 200 \
+  --exp-dir whisper/exp_large_v2 \
+  --model-name large-v2 \
+  --deepspeed \
+  --deepspeed_config ./whisper/ds_config_zero1.json
+
+# fine-tuning with ddp
+torchrun --nproc-per-node 8 ./whisper/train.py \
+  --max-duration 200 \
+  --exp-dir whisper/exp_medium \
+  --base-lr 1e-5 \
+  --model-name medium
+```
+
+Command for decoding using fine-tuned models:
+```bash
+git lfs install
+git clone https://huggingface.co/yuekai/icefall_asr_aishell_whisper
+ln -s "$PWD/icefall_asr_aishell_whisper/exp_large_v2/epoch-10-avg6.pt" whisper/exp_large_v2/epoch-999.pt
+
+python3 ./whisper/decode.py \
+  --exp-dir whisper/exp_large_v2 \
+  --model-name large-v2 \
+  --epoch 999 --avg 1 \
+  --beam-size 10 --max-duration 50
+```
+Command for decoding using pretrained models (before fine-tuning):
+```bash
+python3 ./whisper/decode.py \
+  --exp-dir whisper/exp_large_v2 \
+  --model-name large-v2 \
+  --epoch -1 --avg 1 \
+  --remove-whisper-encoder-input-length-restriction False \
+  --beam-size 10 --max-duration 50
+```
+Fine-tuned models, training logs, decoding logs, tensorboard and decoding results
+are available at
+<https://huggingface.co/yuekai/icefall_asr_aishell_whisper>
+
 ### Aishell training result (Stateless Transducer)

+#### Zipformer (Byte-level BPE)
+
+[./zipformer](./zipformer/)
+
+It's reworked Zipformer with Pruned RNNT loss, trained with Byte-level BPE, `vocab_size` set to 500.
+
+##### normal-scaled model, number of model parameters: 65549011, i.e., 65.55 M
+
+|                        | test | dev  | comment                                 |
+|------------------------|------|------|-----------------------------------------|
+| greedy search          | 4.54 | 4.31 | --epoch 40 --avg 10                     |
+| modified beam search   | 4.37 | 4.11 | --epoch 40 --avg 10                     |
+| fast beam search       | 4.43 | 4.17 | --epoch 40 --avg 10                     |
+
+```bash
+./prepare.sh
+
+export CUDA_VISIBLE_DEVICES="0,1"
+
+./zipformer/train_bbpe.py \
+  --world-size 2 \
+  --num-epochs 40 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --context-size 2 \
+  --enable-musan 0 \
+  --exp-dir zipformer/exp_bbpe \
+  --max-duration 1000 \
+  --enable-musan 0 \
+  --base-lr 0.045 \
+  --lr-batches 7500 \
+  --lr-epochs 10 \
+  --spec-aug-time-warp-factor 20
+```
+
+Command for decoding is:
+```bash
+for m in greedy_search modified_beam_search fast_beam_search ; do
+  ./zipformer/decode_bbpe.py \
+    --epoch 40 \
+    --avg 10 \
+    --exp-dir ./zipformer/exp_bbpe \
+    --bpe-model data/lang_bbpe_500/bbpe.model \
+    --context-size 2 \
+    --decoding-method $m
+done
+```
+Pretrained models, training logs, decoding logs, tensorboard and decoding results
+are available at
+<https://huggingface.co/zrjin/icefall-asr-aishell-zipformer-bbpe-2024-01-16>
+
+
 #### Zipformer (Non-streaming)

-[./zipformer](./zipformer)
+[./zipformer](./zipformer/)

 It's reworked Zipformer with Pruned RNNT loss.
 **Caution**: It uses `--context-size=1`.
@ -260,7 +370,7 @@ done
 Pretrained models, training logs, decoding logs, and decoding results
 are available at
 <https://huggingface.co/marcoyang/icefall-asr-aishell-zipformer-pruned-transducer-stateless7-2023-03-21>
-#### Pruned transducer stateless 7 (zipformer)
+#### Pruned transducer stateless 7 (Byte-level BPE)

 See <https://github.com/k2-fsa/icefall/pull/986>

@ -703,7 +813,6 @@ python3 ./transducer_stateless/decode.py \
       --max-sym-per-frame 3
 ```

-### Aishell training results (Transducer-stateless)
 #### 2022-02-18
 (Pingfeng Luo) : The tensorboard log for training is available at <https://tensorboard.dev/experiment/k3QL6QMhRbCwCKYKM9po9w/>
 And pretrained model is available at <https://huggingface.co/pfluo/icefall-aishell-transducer-stateless-char-2021-12-29>
--- a/egs/aishell/ASR/conformer_ctc/README.md
+++ b/egs/aishell/ASR/conformer_ctc/README.md
@ -1,4 +1,4 @@

 Please visit
-<https://icefall.readthedocs.io/en/latest/recipes/aishell/conformer_ctc.html>
+<https://k2-fsa.github.io/icefall/recipes/Non-streaming-ASR/aishell/conformer_ctc.html>
 for how to run this recipe.
--- a/egs/aishell/ASR/conformer_ctc/decode.py
+++ b/egs/aishell/ASR/conformer_ctc/decode.py
@ -419,7 +419,7 @@ def save_results(
    for key, results in results_dict.items():
        recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
        if enable_log:
            logging.info(f"The transcripts are stored in {recog_path}")

@ -432,7 +432,11 @@ def save_results(
            results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=enable_log,
+                compute_CER=True,
            )
            test_set_wers[key] = wer

--- a/egs/aishell/ASR/conformer_mmi/decode.py
+++ b/egs/aishell/ASR/conformer_mmi/decode.py
@ -431,7 +431,7 @@ def save_results(
    for key, results in results_dict.items():
        recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
        if enable_log:
            logging.info(f"The transcripts are stored in {recog_path}")

@ -444,7 +444,11 @@ def save_results(
            results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=enable_log
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=enable_log,
+                compute_CER=True,
            )
            test_set_wers[key] = wer

--- a/egs/aishell/ASR/local/compute_fbank_aishell.py
+++ b/egs/aishell/ASR/local/compute_fbank_aishell.py
@ -29,7 +29,14 @@ import os
 from pathlib import Path

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor, str2bool
@ -42,9 +49,14 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)


-def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_aishell(
+    num_mel_bins: int = 80,
+    perturb_speed: bool = False,
+    whisper_fbank: bool = False,
+    output_dir: str = "data/fbank",
+):
    src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    output_dir = Path(output_dir)
    num_jobs = min(15, os.cpu_count())

    dataset_parts = (
@ -68,8 +80,12 @@ def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False):
        list(manifests.keys()),
        dataset_parts,
    )
-
-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        device = "cuda" if torch.cuda.is_available() else "cpu"  # CPU-only hosts
+        whisper_config = WhisperFbankConfig(num_filters=num_mel_bins, device=device)
+        extractor = WhisperFbank(whisper_config)
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
@ -82,7 +98,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False):
                supervisions=m["supervisions"],
            )
            if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
@ -111,6 +127,18 @@ def get_args():
        default=False,
        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
    )
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use WhisperFbank instead of Fbank. Default: False.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="data/fbank",
+        help="Output directory. Default: data/fbank.",
+    )
    return parser.parse_args()


@ -121,5 +149,8 @@ if __name__ == "__main__":

    args = get_args()
    compute_fbank_aishell(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
+        output_dir=args.output_dir,
    )
--- a/egs/aishell/ASR/prepare.sh
+++ b/egs/aishell/ASR/prepare.sh
@ -360,7 +360,7 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
 fi

 if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
-  log "Stage 11: Train RNN LM model"
+  log "Stage 12: Train RNN LM model"
  python ../../../icefall/rnn_lm/train.py \
    --start-epoch 0 \
    --world-size 1 \
@ -376,3 +376,16 @@ if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
    --vocab-size 4336 \
    --master-port 12345
 fi
+
+# whisper large-v3 using 128 mel bins, others using 80 mel bins
+whisper_mel_bins=80
+output_dir=data/fbank_whisper
+if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
+  log "Stage 30: Compute ${whisper_mel_bins} dim fbank for whisper model fine-tuning"
+  if [ ! -f $output_dir/.aishell.whisper.done ]; then
+    mkdir -p $output_dir
+    ./local/compute_fbank_aishell.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true --output-dir $output_dir
+    ./local/compute_fbank_musan.py --num-mel-bins ${whisper_mel_bins} --whisper-fbank true --output-dir $output_dir
+    touch $output_dir/.aishell.whisper.done
+  fi
+fi
--- a/egs/aishell/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless2/decode.py
@ -390,7 +390,7 @@ def save_results(
    for key, results in results_dict.items():
        recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
@ -402,7 +402,11 @@ def save_results(
            results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
            )
            test_set_wers[key] = wer

--- a/egs/aishell/ASR/pruned_transducer_stateless2/export.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless2/export.py
@ -47,12 +47,12 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
 from train import add_model_arguments, get_params, get_transducer_model

 from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint
-from icefall.lexicon import Lexicon
-from icefall.utils import str2bool
+from icefall.utils import num_tokens, str2bool


 def get_parser():
@ -106,10 +106,10 @@ def get_parser():
    )

    parser.add_argument(
-        "--lang-dir",
-        type=Path,
-        default=Path("data/lang_char"),
-        help="The lang dir",
+        "--tokens",
+        type=str,
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
    )

    parser.add_argument(
@ -136,10 +136,9 @@ def main():

    logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1

    logging.info(params)

--- a/egs/aishell/ASR/pruned_transducer_stateless3/decode.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless3/decode.py
@ -526,7 +526,7 @@ def save_results(
    for key, results in results_dict.items():
        recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
@ -538,7 +538,11 @@ def save_results(
            results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
            )
            test_set_wers[key] = wer

--- a/egs/aishell/ASR/pruned_transducer_stateless3/export.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless3/export.py
@ -47,6 +47,7 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
 from scaling_converter import convert_scaled_to_non_scaled
 from train import add_model_arguments, get_params, get_transducer_model
@ -57,8 +58,7 @@ from icefall.checkpoint import (
    find_checkpoints,
    load_checkpoint,
 )
-from icefall.lexicon import Lexicon
-from icefall.utils import str2bool
+from icefall.utils import num_tokens, str2bool


 def get_parser():
@ -123,10 +123,10 @@ def get_parser():
    )

    parser.add_argument(
-        "--lang-dir",
-        type=Path,
-        default=Path("data/lang_char"),
-        help="The lang dir",
+        "--tokens",
+        type=str,
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
    )

    parser.add_argument(
@ -153,10 +153,9 @@ def main():

    logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1
    params.datatang_prob = 0

    logging.info(params)
--- a/egs/aishell/ASR/pruned_transducer_stateless7/decode.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7/decode.py
@ -444,7 +444,7 @@ def save_results(
        for res in results:
            results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))

-        store_transcripts(filename=recog_path, texts=results_char)
+        store_transcripts(filename=recog_path, texts=results_char, char_level=True)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
@ -452,7 +452,11 @@ def save_results(
        errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
            )
            test_set_wers[key] = wer

--- a/egs/aishell/ASR/pruned_transducer_stateless7/do_not_use_it_directly.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7/do_not_use_it_directly.py
@ -89,6 +89,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.lexicon import Lexicon
 from icefall.utils import (
    AttributeDict,
@ -881,9 +882,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)
        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
            cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
--- a/egs/aishell/ASR/pruned_transducer_stateless7/export-onnx.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7/export-onnx.py
@ -49,14 +49,14 @@ import logging
 from pathlib import Path
 from typing import Dict, Tuple

+import k2
 import onnx
-import sentencepiece as spm
 import torch
 import torch.nn as nn
 from decoder2 import Decoder
+from do_not_use_it_directly import add_model_arguments, get_params, get_transducer_model
 from onnxruntime.quantization import QuantType, quantize_dynamic
 from scaling_converter import convert_scaled_to_non_scaled
-from do_not_use_it_directly import add_model_arguments, get_params, get_transducer_model
 from zipformer import Zipformer

 from icefall.checkpoint import (
@ -65,8 +65,7 @@ from icefall.checkpoint import (
    find_checkpoints,
    load_checkpoint,
 )
-from icefall.lexicon import Lexicon
-from icefall.utils import setup_logger, str2bool
+from icefall.utils import num_tokens, setup_logger, str2bool


 def get_parser():
@ -123,12 +122,10 @@ def get_parser():
    )

    parser.add_argument(
-        "--lang-dir",
+        "--tokens",
        type=str,
-        help="""The lang dir
-        It contains language related input files such as
-        "lexicon.txt"
-        """,
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
    )

    parser.add_argument(
@ -404,9 +401,9 @@ def main():

    logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1

    logging.info(params)

--- a/egs/aishell/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7/train.py
@ -85,6 +85,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import (
@ -878,9 +879,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)
        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
            cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
--- a/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/decode.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/decode.py
@ -581,7 +581,7 @@ def save_results(
    for key, results in results_dict.items():
        recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
@ -594,7 +594,11 @@ def save_results(

        with open(errs_filename, "w") as f:
            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
            )
            test_set_wers[key] = wer

--- a/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/train.py
@ -78,6 +78,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -871,9 +872,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/decode.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/decode.py
@ -250,7 +250,7 @@ def get_parser():
    parser.add_argument(
        "--context-size",
        type=int,
-        default=1,
+        default=2,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )
    parser.add_argument(
@ -492,7 +492,7 @@ def save_results(
    for key, results in results_dict.items():
        recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
@ -500,7 +500,11 @@ def save_results(
        errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results,
+                enable_log=True,
+                compute_CER=True,
            )
            test_set_wers[key] = wer

--- a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py
@ -78,6 +78,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@ -882,9 +883,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/train.py
@ -78,6 +78,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@ -881,9 +882,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
@ -275,6 +275,8 @@ class AishellAsrDataModule:
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
+                buffer_size=self.args.num_buckets * 2000,
+                shuffle_buffer_size=self.args.num_buckets * 5000,
                drop_last=self.args.drop_last,
            )
        else:
--- a/egs/aishell/ASR/tdnn_lstm_ctc/decode.py
+++ b/egs/aishell/ASR/tdnn_lstm_ctc/decode.py
@ -278,7 +278,7 @@ def save_results(
    for key, results in results_dict.items():
        recog_path = params.exp_dir / f"recogs-{test_set_name}-{key}.txt"
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
@ -289,7 +289,13 @@ def save_results(
        for res in results:
            results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
        with open(errs_filename, "w") as f:
-            wer = write_error_stats(f, f"{test_set_name}-{key}", results_char)
+            wer = write_error_stats(
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
+            )
            test_set_wers[key] = wer

        logging.info("Wrote detailed error stats to {}".format(errs_filename))
--- a/egs/aishell/ASR/transducer_stateless/decode.py
+++ b/egs/aishell/ASR/transducer_stateless/decode.py
@ -327,7 +327,7 @@ def save_results(
    for key, results in results_dict.items():
        recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)

        # The following prints out WERs, per-word error statistics and aligned
        # ref/hyp pairs.
@ -338,7 +338,11 @@ def save_results(
            results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
            )
            test_set_wers[key] = wer

--- a/egs/aishell/ASR/transducer_stateless/export.py
+++ b/egs/aishell/ASR/transducer_stateless/export.py
@ -23,7 +23,7 @@
 Usage:
 ./transducer_stateless/export.py \
  --exp-dir ./transducer_stateless/exp \
-  --lang-dir data/lang_char \
+  --tokens data/lang_char/tokens.txt \
  --epoch 20 \
  --avg 10

@ -47,6 +47,7 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
 import torch.nn as nn
 from conformer import Conformer
@ -56,8 +57,7 @@ from model import Transducer

 from icefall.checkpoint import average_checkpoints, load_checkpoint
 from icefall.env import get_env_info
-from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, str2bool
+from icefall.utils import AttributeDict, num_tokens, str2bool


 def get_parser():
@ -92,10 +92,10 @@ def get_parser():
    )

    parser.add_argument(
-        "--lang-dir",
+        "--tokens",
        type=str,
-        default="data/lang_char",
-        help="The lang dir",
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
    )

    parser.add_argument(
@ -192,10 +192,9 @@ def main():

    logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1

    logging.info(params)

--- a/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py
+++ b/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py
@ -226,6 +226,8 @@ class AsrDataModule:
            max_duration=self.args.max_duration,
            shuffle=self.args.shuffle,
            num_buckets=self.args.num_buckets,
+            buffer_size=self.args.num_buckets * 2000,
+            shuffle_buffer_size=self.args.num_buckets * 5000,
            drop_last=True,
        )

--- a/egs/aishell/ASR/transducer_stateless_modified-2/decode.py
+++ b/egs/aishell/ASR/transducer_stateless_modified-2/decode.py
@ -372,7 +372,7 @@ def save_results(
    for key, results in results_dict.items():
        recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
@ -384,7 +384,11 @@ def save_results(
            results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
            )
            test_set_wers[key] = wer

--- a/egs/aishell/ASR/transducer_stateless_modified-2/export.py
+++ b/egs/aishell/ASR/transducer_stateless_modified-2/export.py
@ -46,6 +46,7 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
 import torch.nn as nn
 from conformer import Conformer
@ -56,7 +57,7 @@ from model import Transducer
 from icefall.checkpoint import average_checkpoints, load_checkpoint
 from icefall.env import get_env_info
 from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, str2bool
+from icefall.utils import AttributeDict, num_tokens, str2bool


 def get_parser():
@ -99,10 +100,10 @@ def get_parser():
    )

    parser.add_argument(
-        "--lang-dir",
-        type=Path,
-        default=Path("data/lang_char"),
-        help="The lang dir",
+        "--tokens",
+        type=str,
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
    )

    parser.add_argument(
@ -190,10 +191,9 @@ def main():

    logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1

    logging.info(params)

--- a/egs/aishell/ASR/transducer_stateless_modified/decode.py
+++ b/egs/aishell/ASR/transducer_stateless_modified/decode.py
@ -376,7 +376,7 @@ def save_results(
    for key, results in results_dict.items():
        recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
@ -388,7 +388,11 @@ def save_results(
            results_char.append((res[0], list("".join(res[1])), list("".join(res[2]))))
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results_char, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=True,
+                compute_CER=True,
            )
            test_set_wers[key] = wer

--- a/egs/aishell/ASR/transducer_stateless_modified/export.py
+++ b/egs/aishell/ASR/transducer_stateless_modified/export.py
@ -46,6 +46,7 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
 import torch.nn as nn
 from conformer import Conformer
@ -55,8 +56,7 @@ from model import Transducer

 from icefall.checkpoint import average_checkpoints, load_checkpoint
 from icefall.env import get_env_info
-from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, str2bool
+from icefall.utils import AttributeDict, num_tokens, str2bool


 def get_parser():
@ -99,10 +99,10 @@ def get_parser():
    )

    parser.add_argument(
-        "--lang-dir",
-        type=Path,
-        default=Path("data/lang_char"),
-        help="The lang dir",
+        "--tokens",
+        type=str,
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
    )

    parser.add_argument(
@ -190,10 +190,9 @@ def main():

    logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1

    logging.info(params)

--- a/egs/aishell/ASR/whisper/asr_datamodule.py
+++ b/egs/aishell/ASR/whisper/asr_datamodule.py
@ -0,0 +1 @@
+../tdnn_lstm_ctc/asr_datamodule.py
--- a/egs/aishell/ASR/whisper/decode.py
+++ b/egs/aishell/ASR/whisper/decode.py
@ -0,0 +1,507 @@
+#!/usr/bin/env python3
+# Copyright 2021 Xiaomi Corporation (Author: Liyong Guo,
+#                                            Fangjun Kuang,
+#                                            Wei Kang)
+#           2024 Yuekai Zhang
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+# Command for decoding using fine-tuned models:
+git lfs install
+git clone https://huggingface.co/yuekai/icefall_asr_aishell_whisper
+ln -s icefall_asr_aishell_whisper/exp_large_v2/epoch-10-avg6.pt whisper/exp_large_v2/epoch-999.pt
+
+python3 ./whisper/decode.py \
+  --exp-dir whisper/exp_large_v2 \
+  --model-name large-v2 \
+  --epoch 999 --avg 1 \
+  --manifest-dir data/fbank_whisper \
+  --beam-size 10 --max-duration 50
+
+# Command for decoding using pretrained models (before fine-tuning):
+
+python3 ./whisper/decode.py \
+  --exp-dir whisper/exp_large_v2 \
+  --model-name large-v2 \
+  --epoch -1 --avg 1 \
+  --manifest-dir data/fbank_whisper \
+  --remove-whisper-encoder-input-length-restriction False \
+  --beam-size 10 --max-duration 50
+
+"""
+
+import argparse
+import logging
+import re
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+
+import k2
+import torch
+import torch.nn as nn
+import whisper
+from asr_datamodule import AishellAsrDataModule
+from tn.chinese.normalizer import Normalizer
+from whisper.normalizers import BasicTextNormalizer
+from whisper_encoder_forward_monkey_patch import replace_whisper_encoder_forward
+from zhconv import convert
+
+from icefall.checkpoint import average_checkpoints_with_averaged_model, load_checkpoint
+from icefall.env import get_env_info
+from icefall.utils import (
+    AttributeDict,
+    setup_logger,
+    store_transcripts,
+    str2bool,
+    write_error_stats,
+)
+
+
+def _load_model_state_dict(filename: Path, device: torch.device) -> dict:
+    """Load ``filename`` once and return the model state_dict stored in it.
+
+    A checkpoint is either a dict wrapping the state_dict under the key
+    "model", or the bare state_dict itself (e.g., a checkpoint converted
+    from deepspeed).
+    """
+    checkpoint = torch.load(filename, map_location=device)
+    if "model" in checkpoint:
+        return checkpoint["model"]
+    return checkpoint
+
+
+def average_checkpoints(
+    filenames: List[Path], device: torch.device = torch.device("cpu")
+) -> dict:
+    """Average the model state dicts of a list of checkpoints.
+
+    The function is mainly used for averaging deepspeed converted
+    checkpoints, which only contain the model state_dict. Checkpoints that
+    wrap the state_dict under the key "model" are supported as well.
+
+    Args:
+      filenames:
+        Filenames of the checkpoints to be averaged. It must contain at
+        least one entry.
+      device:
+        Move checkpoints to this device before averaging.
+    Returns:
+      Return a dict (i.e., state_dict) which is the average of all
+      model state dicts contained in the checkpoints. Floating point
+      tensors use true division; other tensors use floor division.
+    """
+    n = len(filenames)
+
+    # Each file is deserialized exactly once. The first state_dict doubles
+    # as the accumulator and is updated in place.
+    avg = _load_model_state_dict(filenames[0], device)
+
+    # Identify shared parameters. Two parameters are said to be shared
+    # if they have the same data_ptr. Only the first name seen for each
+    # storage is accumulated, so shared tensors are not summed twice.
+    uniqued: Dict[int, str] = dict()
+    for k, v in avg.items():
+        uniqued.setdefault(v.data_ptr(), k)
+
+    uniqued_names = list(uniqued.values())
+
+    for filename in filenames[1:]:
+        state_dict = _load_model_state_dict(filename, device)
+        for k in uniqued_names:
+            avg[k] += state_dict[k]
+
+    for k in uniqued_names:
+        if avg[k].is_floating_point():
+            avg[k] /= n
+        else:
+            avg[k] //= n
+
+    return avg
+
+
+def remove_punctuation(text: Union[str, List[str]]) -> Union[str, List[str]]:
+    """Modified from https://github.com/yeyupiaoling/Whisper-Finetune/blob/master/utils/data_utils.py
+
+    Delete the listed ASCII/Chinese punctuation marks and spaces, then strip
+    any remaining leading/trailing whitespace.
+
+    Args:
+        text: It can be a string or a list of strings.
+    Returns:
+        Return a string or a list of strings without any punctuation.
+    Raises:
+        TypeError: If ``text`` is neither a string nor a list.
+    """
+    punctuation = "!,.;:?、！，。；：？《》 "
+    # Compile once per call instead of once per list element.
+    pattern = re.compile(r"[{}]+".format(punctuation))
+    if isinstance(text, str):
+        return pattern.sub("", text).strip()
+    if isinstance(text, list):
+        return [pattern.sub("", t).strip() for t in text]
+    # TypeError is a subclass of Exception, so callers that catch Exception
+    # keep working.
+    raise TypeError(f"Not support type {type(text)}")
+
+
+def to_simple(text: Union[str, List[str]]) -> Union[str, List[str]]:
+    """Convert traditional Chinese to simplified Chinese.
+
+    Args:
+        text: It can be a string or a list of strings.
+    Returns:
+        Return a string or a list of strings converted to simplified Chinese.
+    Raises:
+        TypeError: If ``text`` is neither a string nor a list.
+    """
+    if isinstance(text, str):
+        return convert(text, "zh-cn")
+    if isinstance(text, list):
+        return [convert(t, "zh-cn") for t in text]
+    # TypeError is a subclass of Exception, so callers that catch Exception
+    # keep working. The message now matches remove_punctuation (a space was
+    # missing before the type).
+    raise TypeError(f"Not support type {type(text)}")
+
+
+def get_parser():
+    """Build the command-line parser holding the decoding options.
+
+    Returns:
+        An :class:`argparse.ArgumentParser` that shows defaults in ``--help``.
+        Data-related options are added separately by the caller.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    # The two help fragments used to be concatenated without a space, and the
+    # old note ("Epoch counts from 0") did not match main(), which only loads
+    # a checkpoint when the epoch is positive.
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=-1,
+        help="It specifies the checkpoint to use for decoding. "
+        "Note: A value <= 0 means that no checkpoint is loaded and the "
+        "pretrained model is used as it is.",
+    )
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=1,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch'. ",
+    )
+
+    parser.add_argument(
+        "--method",
+        type=str,
+        default="beam-search",
+        help="""Decoding method.
+        Supported values are:
+          - beam-search
+        """,
+    )
+
+    parser.add_argument(
+        "--beam-size",
+        type=int,
+        default=1,
+        help="beam size for beam search decoding",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="whisper/exp",
+        help="The experiment dir",
+    )
+
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default="large-v2",
+        choices=["large-v2", "large-v3", "medium", "small", "base", "tiny"],
+        help="""The model name to use.
+        """,
+    )
+
+    parser.add_argument(
+        "--remove-whisper-encoder-input-length-restriction",
+        type=str2bool,
+        default=True,
+        help="replace whisper encoder forward method to remove input length restriction",
+    )
+
+    return parser
+
+
+def get_params() -> AttributeDict:
+    """Return the parameters that do not come from the command line.
+
+    At present the only entry is "env_info", holding whatever
+    :func:`get_env_info` reports.
+    """
+    env_info = get_env_info()
+    return AttributeDict({"env_info": env_info})
+
+
+def decode_one_batch(
+    params: AttributeDict,
+    model: nn.Module,
+    batch: dict,
+) -> Dict[str, List[str]]:
+    """Decode one batch and return the result in a dict. The dict has the
+    following format:
+
+        - key: "beam-search"
+        - value: A list of strings. Each string is the post-processed
+                 hypothesis (punctuation removed, converted to simplified
+                 Chinese, then text-normalized) of one utterance.
+    Args:
+        params:
+            It is returned by :func:`get_params`. This function reads
+            `remove_whisper_encoder_input_length_restriction`,
+            `decoding_options` and `normalizer` from it.
+        model:
+            The neural model.
+        batch:
+            It is returned by :meth:`torch.utils.data.DataLoader.__iter__`.
+            Only `batch["inputs"]` is used here.
+    Returns:
+        Return a dict with the single key "beam-search".
+    """
+    # NOTE(review): device and dtype are hard-coded, so this function needs a
+    # CUDA device even though main() is able to fall back to CPU.
+    dtype = torch.float16
+    device = torch.device("cuda")
+
+    feature = batch["inputs"]
+    assert feature.ndim == 3
+    # Swap the last two axes -- presumably (N, T, C) -> (N, C, T); confirm
+    # against the dataloader.
+    feature = feature.to(device, dtype=dtype).transpose(1, 2)
+    if not params.remove_whisper_encoder_input_length_restriction:
+        # Without the patched encoder forward, inputs shorter than T frames
+        # are padded with zeros along the last axis up to T.
+        T = 3000
+        if feature.shape[2] < T:
+            feature = torch.cat(
+                [
+                    feature,
+                    torch.zeros(
+                        feature.shape[0], feature.shape[1], T - feature.shape[2]
+                    ).to(device, dtype=dtype),
+                ],
+                2,
+            )
+
+    # The frame counts in batch["supervisions"] used to be fetched and moved
+    # to the device here, but the value was never used, so it is dropped.
+    results = model.decode(feature, params.decoding_options)
+    hyps = [result.text for result in results]
+
+    hyps = remove_punctuation(hyps)
+    hyps = to_simple(hyps)
+    hyps = [params.normalizer.normalize(hyp) for hyp in hyps]
+
+    return {"beam-search": hyps}
+
+
+def decode_dataset(
+    dl: torch.utils.data.DataLoader,
+    params: AttributeDict,
+    model: nn.Module,
+) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
+    """Decode dataset.
+
+    Args:
+        dl:
+            The dataloader. Each batch must provide
+            `batch["supervisions"]["text"]` and `batch["supervisions"]["cut"]`.
+        params:
+            It is returned by :func:`get_params`.
+        model:
+            The neural model.
+    Returns:
+        Return a dict whose keys are the ones returned by
+        :func:`decode_one_batch` (e.g., "beam-search"). Each value is a list
+        of tuples (cut_id, ref_words, hyp), where ref_words is the reference
+        text split on whitespace and hyp is the hypothesis exactly as
+        returned by :func:`decode_one_batch`.
+    """
+    num_cuts = 0
+
+    # Some dataloaders do not support len(); in that case the total is
+    # shown as "?" in the progress log.
+    try:
+        num_batches = len(dl)
+    except TypeError:
+        num_batches = "?"
+
+    results = defaultdict(list)
+    for batch_idx, batch in enumerate(dl):
+        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
+
+        hyps_dict = decode_one_batch(
+            params=params,
+            model=model,
+            batch=batch,
+        )
+
+        for method, hyps in hyps_dict.items():
+            assert len(hyps) == len(texts)
+            results[method].extend(
+                (cut_id, ref_text.split(), hyp)
+                for cut_id, hyp, ref_text in zip(cut_ids, hyps, texts)
+            )
+
+        num_cuts += len(texts)
+
+        if batch_idx % 100 == 0:
+            batch_str = f"{batch_idx}/{num_batches}"
+
+            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
+    return results
+
+
+def save_results(
+    params: AttributeDict,
+    test_set_name: str,
+    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
+):
+    """Write transcripts, error statistics and a CER summary to params.exp_dir.
+
+    Args:
+        params:
+            Provides `exp_dir` (output directory) and `suffix` (used in the
+            output filenames).
+        test_set_name:
+            Name of the decoded set; it is part of every output filename.
+        results_dict:
+            Maps a setting name to a list of (cut_id, ref, hyp) tuples.
+    """
+    enable_log = True
+    cer_by_setting = dict()
+    for key, results in results_dict.items():
+        tag = f"{test_set_name}-{key}-{params.suffix}"
+
+        recog_path = params.exp_dir / f"recogs-{tag}.txt"
+        results = sorted(results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
+        if enable_log:
+            logging.info(f"The transcripts are stored in {recog_path}")
+
+        # The following prints out CERs, per-character error statistics and
+        # aligned ref/hyp pairs.
+        errs_filename = params.exp_dir / f"errs-{tag}.txt"
+        # we compute CER for aishell dataset, so ref and hyp are both
+        # flattened into lists of single characters.
+        results_char = [
+            (cut_id, list("".join(ref)), list("".join(hyp)))
+            for cut_id, ref, hyp in results
+        ]
+        with open(errs_filename, "w") as f:
+            cer_by_setting[key] = write_error_stats(
+                f,
+                f"{test_set_name}-{key}",
+                results_char,
+                enable_log=enable_log,
+                compute_CER=True,
+            )
+
+        if enable_log:
+            logging.info("Wrote detailed error stats to {}".format(errs_filename))
+
+    # Lowest error rate first.
+    ranked = sorted(cer_by_setting.items(), key=lambda x: x[1])
+    errs_info = params.exp_dir / f"cer-summary-{test_set_name}-{params.suffix}.txt"
+    with open(errs_info, "w") as f:
+        print("settings\tCER", file=f)
+        for key, val in ranked:
+            print("{}\t{}".format(key, val), file=f)
+
+    # Only the first (i.e., best) entry carries the "best for" note.
+    pieces = ["\nFor {}, CER of different settings are:\n".format(test_set_name)]
+    for rank, (key, val) in enumerate(ranked):
+        note = "\tbest for {}".format(test_set_name) if rank == 0 else ""
+        pieces.append("{}\t{}{}\n".format(key, val, note))
+    logging.info("".join(pieces))
+
+
+@torch.no_grad()
+def main():
+    """Decode the aishell valid and test sets with a Whisper model.
+
+    The steps are: parse the command line, build the decoding options, load
+    the Whisper model (optionally replacing its weights with one checkpoint
+    or with an average of `--avg` checkpoints from `--exp-dir`), decode both
+    sets and write the results via :func:`save_results`.
+    """
+    parser = get_parser()
+    AishellAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    params = get_params()
+    params.update(vars(args))
+    params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+    setup_logger(
+        f"{params.exp_dir}/log-{params.method}-beam{params.beam_size}/log-decode-{params.suffix}"
+    )
+
+    options = whisper.DecodingOptions(
+        task="transcribe",
+        language="zh",
+        without_timestamps=True,
+        beam_size=params.beam_size,
+    )
+    params.decoding_options = options
+    params.cleaner = BasicTextNormalizer()
+    params.normalizer = Normalizer()
+
+    logging.info("Decoding started")
+    logging.info(params)
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+
+    logging.info(f"device: {device}")
+
+    if params.remove_whisper_encoder_input_length_restriction:
+        replace_whisper_encoder_forward()
+    model = whisper.load_model(params.model_name, "cpu")
+    # A non-positive epoch means: decode with the pretrained model as it is.
+    if params.epoch > 0:
+        if params.avg > 1:
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            checkpoint = torch.load(
+                f"{params.exp_dir}/epoch-{params.epoch}.pt", map_location="cpu"
+            )
+            if "model" not in checkpoint:
+                # deepspeed converted checkpoint only contains model state_dict
+                # Average exactly `avg` checkpoints, i.e., epochs
+                # start + 1, ..., epoch. Epoch `start` is excluded, the same
+                # as in the other branch; including it would average
+                # avg + 1 checkpoints.
+                filenames = [
+                    f"{params.exp_dir}/epoch-{epoch}.pt"
+                    for epoch in range(start + 1, params.epoch + 1)
+                ]
+                model.load_state_dict(average_checkpoints(filenames))
+            else:
+                filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+                filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+                logging.info(
+                    f"Calculating the averaged model over epoch range from "
+                    f"{start} (excluded) to {params.epoch}"
+                )
+                model.to(device)
+                model.load_state_dict(
+                    average_checkpoints_with_averaged_model(
+                        filename_start=filename_start,
+                        filename_end=filename_end,
+                        device=device,
+                    )
+                )
+            # save checkpoints
+            filename = f"{params.exp_dir}/epoch-{params.epoch}-avg-{params.avg}.pt"
+            torch.save(model.state_dict(), filename)
+        else:
+            checkpoint = torch.load(
+                f"{params.exp_dir}/epoch-{params.epoch}.pt", map_location="cpu"
+            )
+            if "model" not in checkpoint:
+                # The file is a bare state_dict.
+                model.load_state_dict(checkpoint, strict=True)
+            else:
+                load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+    model.to(device)
+    model.eval()
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    # we need cut ids to display recognition results.
+    args.return_cuts = True
+    aishell = AishellAsrDataModule(args)
+    valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
+    test_dl = aishell.test_dataloaders(aishell.test_cuts())
+    test_sets = ["valid", "test"]
+    test_dls = [valid_dl, test_dl]
+
+    for test_set, test_dl in zip(test_sets, test_dls):
+        results_dict = decode_dataset(
+            dl=test_dl,
+            params=params,
+            model=model,
+        )
+
+        save_results(params=params, test_set_name=test_set, results_dict=results_dict)
+
+    logging.info("Done!")
+
+
+# Limit PyTorch to one intra-op and one inter-op CPU thread.
+# NOTE(review): presumably this avoids CPU oversubscription next to the
+# dataloader workers -- confirm.
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+# Run the decoding only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/egs/aishell/ASR/whisper/ds_config_zero1.json
+++ b/egs/aishell/ASR/whisper/ds_config_zero1.json
@ -0,0 +1,38 @@
+{
+    "fp16": {
+        "enabled": true,
+        "loss_scale": 0,
+        "loss_scale_window": 100,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 0.01
+    },
+    "zero_optimization": {
+        "stage": 1,
+        "allgather_partitions": true,
+        "allgather_bucket_size": 2e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 2e8,
+        "contiguous_gradients": true
+    },
+    "optimizer": {
+        "type": "Adam",
+        "params": {
+            "lr": 1e-5
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": 0,
+            "warmup_max_lr": 1e-5,
+            "warmup_num_steps": 100
+        }
+    },
+    "gradient_accumulation_steps": 1,
+    "gradient_clipping": 5,
+    "steps_per_print": 50,
+    "train_micro_batch_size_per_gpu": 1,
+    "wall_clock_breakdown": false
+}
--- a/egs/aishell/ASR/whisper/label_smoothing.py
+++ b/egs/aishell/ASR/whisper/label_smoothing.py
@ -0,0 +1 @@
+../../../librispeech/ASR/conformer_ctc/label_smoothing.py
--- a/egs/aishell/ASR/whisper/optim.py
+++ b/egs/aishell/ASR/whisper/optim.py
@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/optim.py
--- a/egs/aishell/ASR/whisper/requirements.txt
+++ b/egs/aishell/ASR/whisper/requirements.txt
@ -0,0 +1,10 @@
+k2
+kaldialign
+git+https://github.com/lhotse-speech/lhotse
+sentencepiece
+tensorboard
+librosa
+git+https://github.com/yuekaizhang/whisper.git
+zhconv
+WeTextProcessing
+deepspeed
--- a/egs/aishell/ASR/whisper/train.py
+++ b/egs/aishell/ASR/whisper/train.py
@ -0,0 +1,927 @@
+#!/usr/bin/env python3
+# Copyright    2023  Xiaomi Corp.        (authors: Xiaoyu Yang)
+#              2024  Yuekai Zhang
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+
+# fine-tuning with deepspeed zero stage 1
+torchrun --nproc_per_node 8 ./whisper/train.py \
+  --max-duration 200 \
+  --exp-dir whisper/exp_large_v2 \
+  --model-name large-v2 \
+  --manifest-dir data/fbank_whisper \
+  --deepspeed \
+  --deepspeed_config ./whisper/ds_config_zero1.json
+
+# fine-tuning with ddp
+torchrun --nproc_per_node 8 ./whisper/train.py \
+  --max-duration 200 \
+  --exp-dir whisper/exp_medium \
+  --manifest-dir data/fbank_whisper \
+  --base-lr 1e-5 \
+  --model-name medium
+"""
+
+
+import argparse
+import copy
+import logging
+import random
+import warnings
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import deepspeed
+import k2
+import optim
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+import whisper
+from asr_datamodule import AishellAsrDataModule
+from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
+from label_smoothing import LabelSmoothingLoss
+from lhotse import CutSet, load_manifest
+from lhotse.cut import Cut
+from lhotse.dataset.sampling.base import CutSampler
+from lhotse.utils import fix_random_seed
+from optim import Eden, ScaledAdam
+from torch import Tensor
+from torch.cuda.amp import GradScaler
+from torch.nn.functional import pad as pad_tensor
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+from whisper_encoder_forward_monkey_patch import replace_whisper_encoder_forward
+
+from icefall import diagnostics
+from icefall.checkpoint import load_checkpoint, remove_checkpoints
+from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
+from icefall.checkpoint import update_averaged_model
+from icefall.dist import cleanup_dist, get_rank, get_world_size, setup_dist
+from icefall.env import get_env_info
+from icefall.hooks import register_inf_check_hooks
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    filter_uneven_sized_batch,
+    setup_logger,
+    str2bool,
+)
+
+LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
+
+
+def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
+    if isinstance(model, DDP):
+        # get underlying nn.Module
+        model = model.module
+    for module in model.modules():
+        if hasattr(module, "batch_count"):
+            module.batch_count = batch_count
+
+
+def get_parser():
+    """Create the command-line parser for the training script.
+
+    The parser holds the training options defined below; the DeepSpeed
+    config arguments are appended at the end through
+    `deepspeed.add_config_arguments`.
+
+    Returns:
+      The parser returned by `deepspeed.add_config_arguments`.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--tensorboard",
+        type=str2bool,
+        default=True,
+        help="Should various information be logged in tensorboard.",
+    )
+
+    parser.add_argument(
+        "--num-epochs",
+        type=int,
+        default=10,
+        help="Number of epochs to train.",
+    )
+
+    parser.add_argument(
+        "--start-epoch",
+        type=int,
+        default=1,
+        help="""Resume training from this epoch. It should be positive.
+        If larger than 1, it will load checkpoint from
+        exp-dir/epoch-{start_epoch-1}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--start-batch",
+        type=int,
+        default=0,
+        help="""If positive, --start-epoch is ignored and
+        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="whisper/exp",
+        help="""The experiment dir.
+        It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--model-name",
+        type=str,
+        default="large-v2",
+        choices=["large-v2", "large-v3", "medium", "small", "base", "tiny"],
+        help="""The model name to use.
+        """,
+    )
+
+    parser.add_argument(
+        "--base-lr", type=float, default=1e-5, help="The base learning rate."
+    )
+
+    parser.add_argument(
+        "--lr-batches",
+        type=float,
+        default=5000,
+        help="""Number of steps that affects how rapidly the learning rate
+        decreases. We suggest not to change this.""",
+    )
+
+    parser.add_argument(
+        "--lr-epochs",
+        type=float,
+        default=6,
+        help="""Number of epochs that affects how rapidly the learning rate decreases.
+        """,
+    )
+
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="The seed for random generators intended for reproducibility",
+    )
+
+    parser.add_argument(
+        "--print-diagnostics",
+        type=str2bool,
+        default=False,
+        help="Accumulate stats on activations, print them and exit.",
+    )
+
+    parser.add_argument(
+        "--inf-check",
+        type=str2bool,
+        default=False,
+        help="Add hooks to check for infinite module outputs and gradients.",
+    )
+
+    parser.add_argument(
+        "--keep-last-k",
+        type=int,
+        default=30,
+        help="""Only keep this number of checkpoints on disk.
+        For instance, if it is 3, there are only 3 checkpoints
+        in the exp-dir with filenames `checkpoint-xxx.pt`.
+        It does not affect checkpoints with name `epoch-xxx.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--average-period",
+        type=int,
+        default=200,
+        help="""Update the averaged model, namely `model_avg`, after processing
+        this number of batches. `model_avg` is a separate version of model,
+        in which each floating-point parameter is the average of all the
+        parameters from the start of training. Each time we take the average,
+        we do: `model_avg = model * (average_period / batch_idx_train) +
+            model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
+        """,
+    )
+
+    parser.add_argument(
+        "--use-fp16",
+        type=str2bool,
+        default=True,
+        help="Whether to use half precision training.",
+    )
+
+    # Let DeepSpeed register its own options on the same parser; the usage
+    # text at the top of this file passes --deepspeed and --deepspeed_config.
+    parser = deepspeed.add_config_arguments(parser)
+
+    return parser
+
+
+def get_params() -> AttributeDict:
+    """Return a dict containing training parameters.
+
+    All training related parameters that are not passed from the commandline
+    are saved in the variable `params`.
+
+    Commandline options are merged into `params` after they are parsed, so
+    you can also access them via `params`.
+
+    Explanation of options saved in `params`:
+
+        - frame_shift_ms: The frame shift in milliseconds.
+        - allowed_excess_duration_ratio: The allowed excess duration ratio.
+        - best_train_loss: The best training loss so far.
+        - best_valid_loss: The best validation loss so far.
+        - best_train_epoch: The epoch where the best training loss is achieved.
+        - best_valid_epoch: The epoch where the best validation loss is achieved.
+        - batch_idx_train: The batch index of the current batch.
+        - log_interval: Log training stats every `log_interval` batches.
+        - reset_interval: Reset the stats every `reset_interval` batches.
+        - valid_interval: Run validation every `valid_interval` batches.
+        - env_info: The environment information.
+    """
+    params = AttributeDict(
+        {
+            "frame_shift_ms": 10.0,
+            "subsampling_factor": 2,
+            "allowed_excess_duration_ratio": 0.1,
+            "best_train_loss": float("inf"),
+            "best_valid_loss": float("inf"),
+            "best_train_epoch": -1,
+            "best_valid_epoch": -1,
+            "batch_idx_train": 0,
+            "log_interval": 50,
+            "reset_interval": 200,
+            "valid_interval": 5000,
+            "env_info": get_env_info(),
+        }
+    )
+
+    return params
+
+
+def load_checkpoint_if_available(
+    params: AttributeDict,
+    model: nn.Module,
+    model_avg: nn.Module = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+) -> Optional[Dict[str, Any]]:
+    """Load checkpoint from file.
+
+    If params.start_batch is positive, it will load the checkpoint from
+    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
+    params.start_epoch is larger than 1, it will load the checkpoint from
+    `params.start_epoch - 1`.
+
+    Apart from loading state dict for `model` and `optimizer` it also updates
+    `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
+    and `best_valid_loss` in `params`.
+
+    Args:
+      params:
+        The return value of :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer that we are using.
+      scheduler:
+        The scheduler that we are using.
+    Returns:
+      Return a dict containing previously saved training info.
+    """
+    if params.start_batch > 0:
+        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
+    elif params.start_epoch > 1:
+        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
+    else:
+        return None
+
+    assert filename.is_file(), f"{filename} does not exist!"
+
+    saved_params = load_checkpoint(
+        filename,
+        model=model,
+        model_avg=model_avg,
+        optimizer=optimizer,
+        scheduler=scheduler,
+    )
+
+    keys = [
+        "best_train_epoch",
+        "best_valid_epoch",
+        "batch_idx_train",
+        "best_train_loss",
+        "best_valid_loss",
+    ]
+    for k in keys:
+        params[k] = saved_params[k]
+
+    if params.start_batch > 0:
+        if "cur_epoch" in saved_params:
+            params["start_epoch"] = saved_params["cur_epoch"]
+
+    return saved_params
+
+
+def save_checkpoint(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    model_avg: Optional[nn.Module] = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+    sampler: Optional[CutSampler] = None,
+    scaler: Optional[GradScaler] = None,
+    rank: int = 0,
+) -> None:
+    """Save model, optimizer, scheduler and training stats to file.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer used in the training.
+      sampler:
+       The sampler for the training dataset.
+      scaler:
+        The scaler used for mix precision training.
+    """
+    if rank != 0:
+        return
+    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
+    save_checkpoint_impl(
+        filename=filename,
+        model=model,
+        model_avg=model_avg,
+        params=params,
+        optimizer=optimizer,
+        scheduler=scheduler,
+        sampler=sampler,
+        scaler=scaler,
+        rank=rank,
+    )
+
+    if params.best_train_epoch == params.cur_epoch:
+        best_train_filename = params.exp_dir / "best-train-loss.pt"
+        copyfile(src=filename, dst=best_train_filename)
+
+    if params.best_valid_epoch == params.cur_epoch:
+        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
+        copyfile(src=filename, dst=best_valid_filename)
+
+
+def compute_loss(
+    params: AttributeDict,
+    tokenizer: whisper.tokenizer.Tokenizer,
+    model: Union[nn.Module, DDP],
+    batch: dict,
+    is_training: bool,
+) -> Tuple[Tensor, MetricsTracker]:
+    """
+    Compute the loss for the given batch.
+    Args:
+        params:
+            It is returned by :func:`get_params`.
+        tokenizer:
+            The tokenizer used to encode the text.
+        model:
+            The model for training.
+        batch:
+            A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+            for the content in it.
+        is_training:
+            Whether it is training.
+    Returns:
+        Return a tuple of two elements. The first element is the loss tensor.
+    """
+    # For the uneven-sized batch, the total duration after padding would possibly
+    # cause OOM. Hence, for each batch, which is sorted descendingly by length,
+    # we simply drop the last few shortest samples, so that the retained total frames
+    # (after padding) would not exceed `allowed_max_frames`:
+    # `allowed_max_frames = int(max_frames * (1.0 + allowed_excess_duration_ratio))`,
+    # where `max_frames = max_duration * 1000 // frame_shift_ms`.
+    # We set allowed_excess_duration_ratio=0.1.
+    if isinstance(model, DDP):
+        # get underlying nn.Module
+        model = model.module
+
+    def _batch_tensors(tensors: List[Tensor], pad_value: Any) -> Tensor:
+        padding_size = max(tensor.shape[0] for tensor in tensors)
+        dims = len(tensors[0].shape)
+        padded_tensors = []
+        for tensor in tensors:
+            padding = [0] * 2 * dims
+            padding[-1] = padding_size - tensor.shape[0]
+            padded_tensors.append(pad_tensor(tensor, padding, "constant", pad_value))
+        return torch.stack([tensor for tensor in padded_tensors], dim=0)
+
+    max_frames = params.max_duration * 1000 // params.frame_shift_ms
+    allowed_max_frames = int(max_frames * (1.0 + params.allowed_excess_duration_ratio))
+    batch = filter_uneven_sized_batch(batch, allowed_max_frames)
+
+    device = model.device if isinstance(model, DDP) else next(model.parameters()).device
+    feature = batch["inputs"]
+
+    assert feature.ndim == 3
+    feature = feature.to(device)
+    feature = feature.transpose(1, 2)  # (N, C, T)
+
+    supervisions = batch["supervisions"]
+    feature_lens = supervisions["num_frames"].to(device)
+
+    batch_idx_train = params.batch_idx_train
+
+    texts = batch["supervisions"]["text"]
+    # remove spaces in texts
+    texts = [text.replace(" ", "") for text in texts]
+
+    text_tokens_list = [
+        list(tokenizer.sot_sequence_including_notimestamps)
+        + tokenizer.encode(text)
+        + [tokenizer.eot]
+        for text in texts
+    ]
+    # convert it to torch tensor
+    text_tokens_list = [
+        torch.LongTensor(text_tokens) for text_tokens in text_tokens_list
+    ]
+
+    # 50256 is the index of <pad> for all whisper models
+    prev_outputs_tokens = _batch_tensors(
+        [tokens[:-1] for tokens in text_tokens_list], pad_value=50256
+    )
+    target_tokens = _batch_tensors(
+        [tokens[1:] for tokens in text_tokens_list], pad_value=50256
+    )
+    target_lengths = torch.LongTensor(
+        [tokens.shape[0] - 1 for tokens in text_tokens_list]
+    )
+
+    decoder_criterion = LabelSmoothingLoss(
+        ignore_index=50256, label_smoothing=0.1, reduction="sum"
+    )
+
+    # ignore the first 3 tokens, which are always <|lang_id|>, <|transcibe|>, <|notimestampes|>
+    ignore_prefix_size = 3
+    with torch.set_grad_enabled(is_training):
+        encoder_out = model.encoder(feature)
+        text_logits = model.decoder(prev_outputs_tokens.to(device), encoder_out)
+        text_logits = text_logits[:, ignore_prefix_size:, :]
+        target_tokens = target_tokens[:, ignore_prefix_size:]
+        loss = decoder_criterion(text_logits, target_tokens.to(device))
+
+    assert loss.requires_grad == is_training
+
+    info = MetricsTracker()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+
+    # Note: We use reduction=sum while computing the loss.
+    info["loss"] = loss.detach().cpu().item()
+
+    return loss, info
+
+
+def compute_validation_loss(
+    params: AttributeDict,
+    tokenizer: whisper.tokenizer.Tokenizer,
+    model: Union[nn.Module, DDP],
+    valid_dl: torch.utils.data.DataLoader,
+    world_size: int = 1,
+) -> MetricsTracker:
+    """Run the validation process."""
+    model.eval()
+
+    tot_loss = MetricsTracker()
+
+    for batch_idx, batch in enumerate(valid_dl):
+        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            loss, loss_info = compute_loss(
+                params=params,
+                tokenizer=tokenizer,
+                model=model,
+                batch=batch,
+                is_training=False,
+            )
+        assert loss.requires_grad is False
+        tot_loss = tot_loss + loss_info
+
+    if world_size > 1:
+        tot_loss.reduce(loss.device)
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    if loss_value < params.best_valid_loss:
+        params.best_valid_epoch = params.cur_epoch
+        params.best_valid_loss = loss_value
+
+    return tot_loss
+
+
+def train_one_epoch(
+    params: AttributeDict,
+    tokenizer: whisper.tokenizer.Tokenizer,
+    model: Union[nn.Module, DDP],
+    optimizer: torch.optim.Optimizer,
+    scheduler: LRSchedulerType,
+    train_dl: torch.utils.data.DataLoader,
+    valid_dl: torch.utils.data.DataLoader,
+    scaler: GradScaler,
+    model_avg: Optional[nn.Module] = None,
+    tb_writer: Optional[SummaryWriter] = None,
+    world_size: int = 1,
+    rank: int = 0,
+) -> None:
+    """Train the model for one epoch.
+
+    The training loss from the mean of all frames is saved in
+    `params.train_loss`. It runs the validation process every
+    `params.valid_interval` batches.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      tokenizer:
+        The tokenizer used to encode the text; it is passed on to
+        :func:`compute_loss`.
+      model:
+        The model for training.
+      optimizer:
+        The optimizer we are using.
+      scheduler:
+        The learning rate scheduler, we call step() every step.
+      train_dl:
+        Dataloader for the training dataset.
+      valid_dl:
+        Dataloader for the validation dataset.
+      scaler:
+        The scaler used for mix precision training.
+      model_avg:
+        The stored model averaged from the start of training.
+      tb_writer:
+        Writer to write log messages to tensorboard.
+      world_size:
+        Number of nodes in DDP training. If it is 1, DDP is disabled.
+      rank:
+        The rank of the node in DDP training. If no DDP is used, it should
+        be set to 0.
+    """
+    model.train()
+
+    tot_loss = MetricsTracker()
+
+    for batch_idx, batch in enumerate(train_dl):
+        params.batch_idx_train += 1
+        batch_size = len(batch["supervisions"]["text"])
+        # Validation runs at the first batch of the epoch and then every
+        # `valid_interval` batches (counted within this epoch).
+        if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
+            logging.info("Computing validation loss")
+            valid_info = compute_validation_loss(
+                params=params,
+                tokenizer=tokenizer,
+                model=model,
+                valid_dl=valid_dl,
+                world_size=world_size,
+            )
+            # compute_validation_loss() leaves the model in eval mode.
+            model.train()
+            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
+            logging.info(
+                f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
+            )
+            if tb_writer is not None:
+                valid_info.write_summary(
+                    tb_writer, "train/valid_", params.batch_idx_train
+                )
+
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, loss_info = compute_loss(
+                    params=params,
+                    tokenizer=tokenizer,
+                    model=model,
+                    batch=batch,
+                    is_training=True,
+                )
+            # summary stats
+            tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
+
+            # NOTE: We use reduction==sum and loss is computed over utterances
+            # in the batch and there is no normalization to it so far.
+            if params.deepspeed:
+                # With DeepSpeed, `model` is the engine returned by
+                # deepspeed.initialize(): the loss is handed to the engine's
+                # backward() and the engine's step() is called instead of
+                # the scaler/optimizer calls used in the branch below.
+                model.backward(loss)
+                model.step()
+            else:
+                scaler.scale(loss).backward()
+                set_batch_count(model, params.batch_idx_train)
+                scheduler.step_batch(params.batch_idx_train)
+
+                scaler.step(optimizer)
+                scaler.update()
+                optimizer.zero_grad()
+        except:  # noqa
+            # Dump the offending batch to disk, then re-raise the error.
+            display_and_save_batch(batch, params=params)
+            raise
+
+        if params.print_diagnostics and batch_idx == 5:
+            return
+
+        # The averaged model only exists on rank 0 and is not maintained
+        # when DeepSpeed is used.
+        if (
+            rank == 0
+            and params.batch_idx_train > 0
+            and params.batch_idx_train % params.average_period == 0
+            and not params.deepspeed
+        ):
+            update_averaged_model(
+                params=params,
+                model_cur=model,
+                model_avg=model_avg,
+            )
+
+        if batch_idx % 100 == 0 and params.use_fp16 and not params.deepspeed:
+            # If the grad scale was less than 1, try increasing it.    The _growth_interval
+            # of the grad scaler is configurable, but we can't configure it to have different
+            # behavior depending on the current grad scale.
+            cur_grad_scale = scaler._scale.item()
+            if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
+                scaler.update(cur_grad_scale * 2.0)
+            if cur_grad_scale < 0.01:
+                logging.warning(f"Grad scale is small: {cur_grad_scale}")
+            if cur_grad_scale < 1.0e-05:
+                raise RuntimeError(
+                    f"grad_scale is too small, exiting: {cur_grad_scale}"
+                )
+        if batch_idx % params.log_interval == 0:
+            # Fall back to 0.0 when the scheduler cannot report a learning rate.
+            try:
+                cur_lr = scheduler.get_last_lr()[0]
+            except:  # noqa
+                cur_lr = 0.0
+            # The GradScaler is only consulted for fp16 training without
+            # DeepSpeed; otherwise a scale of 1.0 is reported.
+            cur_grad_scale = (
+                scaler._scale.item()
+                if (params.use_fp16 and not params.deepspeed)
+                else 1.0
+            )
+
+            logging.info(
+                f"Epoch {params.cur_epoch}, "
+                f"batch {batch_idx}, loss[{loss_info}], "
+                f"tot_loss[{tot_loss}], batch size: {batch_size}, "
+                f"lr: {cur_lr:.2e}, "
+                + (
+                    f"grad_scale: {scaler._scale.item()}"
+                    if (params.use_fp16 and not params.deepspeed)
+                    else ""
+                )
+            )
+
+            if tb_writer is not None:
+                tb_writer.add_scalar(
+                    "train/learning_rate", cur_lr, params.batch_idx_train
+                )
+
+                loss_info.write_summary(
+                    tb_writer, "train/current_", params.batch_idx_train
+                )
+                tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
+                if params.use_fp16:
+                    tb_writer.add_scalar(
+                        "train/grad_scale",
+                        cur_grad_scale,
+                        params.batch_idx_train,
+                    )
+
+    # Per-frame loss of the decayed running statistics at the end of the epoch.
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    params.train_loss = loss_value
+    if params.train_loss < params.best_train_loss:
+        params.best_train_epoch = params.cur_epoch
+        params.best_train_loss = params.train_loss
+
+
+def run(rank, world_size, args):
+    """Set up the model, optimizer and data, then train epoch by epoch.
+
+    Args:
+      rank:
+        The rank of the current process. :func:`main` obtains it from
+        `get_rank()` and passes it in.
+        The process with rank 0 is responsible for saving checkpoint.
+      world_size:
+        Number of processes taking part in training; :func:`main` obtains
+        it from `get_world_size()`.
+      args:
+        The return value of get_parser().parse_args()
+    """
+    params = get_params()
+    params.update(vars(args))
+
+    fix_random_seed(params.seed)
+
+    setup_logger(f"{params.exp_dir}/log/log-train")
+    logging.info(params)
+
+    logging.info("About to create model")
+
+    # Patch the encoder's forward() before the model is instantiated.
+    replace_whisper_encoder_forward()
+    model = whisper.load_model(params.model_name, "cpu")
+    del model.alignment_heads
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    tokenizer = whisper.tokenizer.get_tokenizer(
+        model.is_multilingual,
+        num_languages=model.num_languages,
+        language="zh",
+        task="transcribe",
+    )
+
+    model_avg: Optional[nn.Module] = None
+    if rank == 0:
+        # model_avg is only used with rank 0
+        model_avg = copy.deepcopy(model).to(torch.float64)
+
+    assert params.start_epoch > 0, params.start_epoch
+    checkpoints = load_checkpoint_if_available(
+        params=params, model=model, model_avg=model_avg
+    )
+
+    if torch.cuda.is_available():
+        device = torch.device("cuda", rank)
+    else:
+        device = torch.device("cpu")
+    logging.info(f"Device: {device}")
+    model.to(device)
+
+    optimizer = torch.optim.AdamW(model.parameters(), lr=params.base_lr)
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
+
+    if checkpoints and "optimizer" in checkpoints:
+        logging.info("Loading optimizer state dict")
+        optimizer.load_state_dict(checkpoints["optimizer"])
+
+    if (
+        checkpoints
+        and "scheduler" in checkpoints
+        and checkpoints["scheduler"] is not None
+    ):
+        logging.info("Loading scheduler state dict")
+        scheduler.load_state_dict(checkpoints["scheduler"])
+
+    # NOTE(review): DeepSpeed is only initialized when world_size > 1, while
+    # train_one_epoch() takes the DeepSpeed path whenever params.deepspeed is
+    # set - confirm that --deepspeed is never used with a single process.
+    if world_size > 1:
+        if params.deepspeed:
+            logging.info("Using DeepSpeed")
+            # The optimizer and scheduler created above are replaced by the
+            # ones returned by deepspeed.initialize().
+            model, optimizer, _, scheduler = deepspeed.initialize(
+                args=params, model=model, model_parameters=model.parameters()
+            )
+        else:
+            logging.info("Using DDP")
+            setup_dist(use_ddp_launch=True)
+            model = DDP(model, device_ids=[rank], find_unused_parameters=True)
+
+    if params.print_diagnostics:
+        opts = diagnostics.TensorDiagnosticOptions(
+            512
+        )  # allow 4 megabytes per sub-module
+        diagnostic = diagnostics.attach_diagnostics(model, opts)
+
+    if params.inf_check:
+        register_inf_check_hooks(model)
+
+    aishell = AishellAsrDataModule(args)
+
+    # NOTE(review): `sampler_state_dict` is computed here but is not passed
+    # to train_dataloaders() below, so the sampler state stored in a batch
+    # checkpoint is currently not restored - confirm whether that is intended.
+    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
+        # We only load the sampler's state dict when it loads a checkpoint
+        # saved in the middle of an epoch
+        sampler_state_dict = checkpoints["sampler"]
+    else:
+        sampler_state_dict = None
+
+    train_dl = aishell.train_dataloaders(aishell.train_cuts())
+    valid_dl = aishell.valid_dataloaders(aishell.valid_cuts())
+
+    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
+    if checkpoints and "grad_scaler" in checkpoints:
+        logging.info("Loading grad scaler state dict")
+        scaler.load_state_dict(checkpoints["grad_scaler"])
+
+    if args.tensorboard and rank == 0:
+        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
+    else:
+        tb_writer = None
+
+    logging.info(f"start training from epoch {params.start_epoch}")
+    for epoch in range(params.start_epoch, params.num_epochs + 1):
+        # With DeepSpeed the scheduler comes from deepspeed.initialize() and
+        # is not stepped per epoch here.
+        if not params.deepspeed:
+            scheduler.step_epoch(epoch - 1)
+        fix_random_seed(params.seed + epoch - 1)
+        train_dl.sampler.set_epoch(epoch - 1)
+
+        if tb_writer is not None:
+            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
+
+        params.cur_epoch = epoch
+
+        train_one_epoch(
+            params=params,
+            tokenizer=tokenizer,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            train_dl=train_dl,
+            valid_dl=valid_dl,
+            scaler=scaler,
+            tb_writer=tb_writer,
+            world_size=world_size,
+            rank=rank,
+        )
+
+        if params.print_diagnostics:
+            diagnostic.print_diagnostics()
+            break
+
+        if params.deepspeed:
+            # Every rank calls the engine's save_checkpoint(); rank 0 then
+            # converts the saved ZeRO checkpoint into a single fp32
+            # state dict file named epoch-{cur_epoch}.pt.
+            model.save_checkpoint(
+                save_dir=params.exp_dir,
+                tag=f"epoch-{params.cur_epoch}",
+                client_state={},
+            )
+            if rank == 0:
+                convert_zero_checkpoint_to_fp32_state_dict(
+                    params.exp_dir,
+                    f"{params.exp_dir}/epoch-{params.cur_epoch}.pt",
+                    tag=f"epoch-{params.cur_epoch}",
+                )
+        else:
+            save_checkpoint(
+                params=params,
+                model=model,
+                model_avg=model_avg,
+                optimizer=optimizer,
+                scheduler=scheduler,
+                sampler=train_dl.sampler,
+                scaler=scaler,
+                rank=rank,
+            )
+
+    logging.info("Done!")
+
+    # The process group set up by setup_dist() is only torn down on the DDP path.
+    if world_size > 1 and not params.deepspeed:
+        torch.distributed.barrier()
+        cleanup_dist()
+
+
+def display_and_save_batch(
+    batch: dict,
+    params: AttributeDict,
+) -> None:
+    """Display the batch statistics and save the batch into disk.
+
+    Args:
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      params:
+        Parameters for training. See :func:`get_params`.
+    """
+    from lhotse.utils import uuid4
+
+    filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
+    logging.info(f"Saving batch to {filename}")
+    torch.save(batch, filename)
+
+    supervisions = batch["supervisions"]
+    features = batch["inputs"]
+
+    logging.info(f"features shape: {features.shape}")
+
+
+def main():
+    parser = get_parser()
+    AishellAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    world_size = get_world_size()
+    rank = get_rank()
+
+    torch.set_num_threads(1)
+    torch.set_num_interop_threads(1)
+    run(rank=rank, world_size=world_size, args=args)
+
+
+if __name__ == "__main__":
+    main()
--- a/egs/aishell/ASR/whisper/whisper_encoder_forward_monkey_patch.py
+++ b/egs/aishell/ASR/whisper/whisper_encoder_forward_monkey_patch.py
@ -0,0 +1,29 @@
+import torch
+import torch.nn.functional as F
+import whisper
+
+
+def forward(self, x: torch.Tensor):
+    """
+    x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
+        the mel spectrogram of the audio
+    """
+    x = F.gelu(self.conv1(x))
+    x = F.gelu(self.conv2(x))
+    x = x.permute(0, 2, 1)
+
+    x = (x + self.positional_embedding[: x.shape[1], :]).to(x.dtype)
+
+    for block in self.blocks:
+        x = block(x)
+
+    x = self.ln_post(x)
+    return x
+
+
+def replace_whisper_encoder_forward():
+    """
+    This function monkey patches the forward method of the whisper encoder.
+    To be called before the model is loaded, it changes whisper to process audio with any length < 30s.
+    """
+    whisper.model.AudioEncoder.forward = forward
--- a/egs/aishell/ASR/zipformer/decode.py
+++ b/egs/aishell/ASR/zipformer/decode.py
@ -560,7 +560,7 @@ def save_results(
            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
        )
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recog_path, texts=results, char_level=True)
        logging.info(f"The transcripts are stored in {recog_path}")

        # The following prints out WERs, per-word error statistics and aligned
@ -570,7 +570,11 @@ def save_results(
        )
        with open(errs_filename, "w") as f:
            wer = write_error_stats(
-                f, f"{test_set_name}-{key}", results, enable_log=True
+                f,
+                f"{test_set_name}-{key}",
+                results,
+                enable_log=True,
+                compute_CER=True,
            )
            test_set_wers[key] = wer

--- a/egs/aishell/ASR/zipformer/decode_bbpe.py
+++ b/egs/aishell/ASR/zipformer/decode_bbpe.py
@ -0,0 +1,840 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021-2024 Xiaomi Corporation (Author: Fangjun Kuang,
+#                                                 Zengwei Yao,
+#                                                 Mingshuang Luo,
+#                                                 Zengrui Jin,)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+(1) greedy search
+./zipformer/decode_bbpe.py \
+    --epoch 35 \
+    --avg 15 \
+    --exp-dir ./zipformer/exp_bbpe \
+    --lang-dir data/lang_bbpe_500 \
+    --bpe-model data/lang_bbpe_500/bbpe.model \
+    --max-duration 600 \
+    --decoding-method greedy_search
+
+(2) modified beam search
+./zipformer/decode_bbpe.py \
+    --epoch 35 \
+    --avg 15 \
+    --exp-dir ./zipformer/exp_bbpe \
+    --lang-dir data/lang_bbpe_500 \
+    --bpe-model data/lang_bbpe_500/bbpe.model \
+    --max-duration 600 \
+    --decoding-method modified_beam_search \
+    --beam-size 4
+
+(3) fast beam search (trivial_graph)
+./zipformer/decode_bbpe.py \
+    --epoch 35 \
+    --avg 15 \
+    --exp-dir ./zipformer/exp_bbpe \
+    --lang-dir data/lang_bbpe_500 \
+    --bpe-model data/lang_bbpe_500/bbpe.model \
+    --max-duration 600 \
+    --decoding-method fast_beam_search \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64
+
+(4) fast beam search (LG)
+./zipformer/decode_bbpe.py \
+    --epoch 30 \
+    --avg 15 \
+    --exp-dir ./zipformer/exp_bbpe \
+    --lang-dir data/lang_bbpe_500 \
+    --bpe-model data/lang_bbpe_500/bbpe.model \
+    --max-duration 600 \
+    --decoding-method fast_beam_search_LG \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64
+
+(5) fast beam search (nbest oracle WER)
+./zipformer/decode_bbpe.py \
+    --epoch 35 \
+    --avg 15 \
+    --exp-dir ./zipformer/exp_bbpe \
+    --lang-dir data/lang_bbpe_500 \
+    --bpe-model data/lang_bbpe_500/bbpe.model \
+    --max-duration 600 \
+    --decoding-method fast_beam_search_nbest_oracle \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64 \
+    --num-paths 200 \
+    --nbest-scale 0.5
+"""
+
+
+import argparse
+import logging
+import math
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import k2
+import sentencepiece as spm
+import torch
+import torch.nn as nn
+from asr_datamodule import AishellAsrDataModule
+from beam_search import (
+    beam_search,
+    fast_beam_search_nbest_oracle,
+    fast_beam_search_one_best,
+    greedy_search,
+    greedy_search_batch,
+    modified_beam_search,
+)
+from lhotse.cut import Cut
+from train import add_model_arguments, get_model, get_params
+
+from icefall import byte_encode, smart_byte_decode, tokenize_by_CJK_char
+from icefall.checkpoint import (
+    average_checkpoints,
+    average_checkpoints_with_averaged_model,
+    find_checkpoints,
+    load_checkpoint,
+)
+from icefall.lexicon import Lexicon
+from icefall.utils import (
+    AttributeDict,
+    make_pad_mask,
+    setup_logger,
+    store_transcripts,
+    str2bool,
+    write_error_stats,
+)
+
+LOG_EPS = math.log(1e-10)
+
+
+def get_parser():
+    """Build the command-line parser for byte-level BPE decoding.
+
+    Returns:
+      An `argparse.ArgumentParser` holding the checkpoint-selection and
+      decoding options defined below, plus whatever
+      :func:`add_model_arguments` registers on it.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=30,
+        help="""It specifies the checkpoint to use for decoding.
+        Note: Epoch counts from 1.
+        You can specify --avg to use more checkpoints for model averaging.""",
+    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=15,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch' and '--iter'",
+    )
+
+    parser.add_argument(
+        "--use-averaged-model",
+        type=str2bool,
+        default=True,
+        help="Whether to load averaged model. Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`. "
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="zipformer_bbpe/exp",
+        help="The experiment dir",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bbpe_500/bbpe.model",
+        help="Path to the byte BPE model",
+    )
+
+    parser.add_argument(
+        "--lang-dir",
+        type=Path,
+        default="data/lang_bbpe_500/",
+        help="The lang dir containing word table and LG graph",
+    )
+
+    # The list below mirrors the methods accepted by the assertion in main().
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        help="""Possible values are:
+          - greedy_search
+          - beam_search
+          - modified_beam_search
+          - fast_beam_search
+          - fast_beam_search_LG
+          - fast_beam_search_nbest_oracle
+        If you use fast_beam_search_LG, you have to specify
+        `--lang-dir`, which should contain `LG.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--beam-size",
+        type=int,
+        default=4,
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --decoding-method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=20.0,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --decoding-method is fast_beam_search,
+        fast_beam_search_LG, and fast_beam_search_nbest_oracle
+        """,
+    )
+
+    parser.add_argument(
+        "--ngram-lm-scale",
+        type=float,
+        default=0.01,
+        help="""
+        Used only when --decoding_method is fast_beam_search_LG.
+        It specifies the scale for n-gram LM scores.
+        """,
+    )
+
+    parser.add_argument(
+        "--ilme-scale",
+        type=float,
+        default=0.2,
+        help="""
+        Used only when --decoding_method is fast_beam_search_LG.
+        It specifies the scale for the internal language model estimation.
+        """,
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=8,
+        help="""Used only when --decoding-method is
+        fast_beam_search, fast_beam_search_LG,
+        and fast_beam_search_nbest_oracle""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=64,
+        help="""Used only when --decoding-method is
+        fast_beam_search, fast_beam_search_LG,
+        and fast_beam_search_nbest_oracle""",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+
+    parser.add_argument(
+        "--max-sym-per-frame",
+        type=int,
+        default=1,
+        help="""Maximum number of symbols per frame.
+        Used only when --decoding_method is greedy_search""",
+    )
+
+    parser.add_argument(
+        "--num-paths",
+        type=int,
+        default=200,
+        help="""Number of paths for nbest decoding.
+        Used only when the decoding method is fast_beam_search_nbest_oracle""",
+    )
+
+    parser.add_argument(
+        "--nbest-scale",
+        type=float,
+        default=0.5,
+        help="""Scale applied to lattice scores when computing nbest paths.
+        Used only when the decoding method is fast_beam_search_nbest_oracle""",
+    )
+
+    parser.add_argument(
+        "--blank-penalty",
+        type=float,
+        default=0.0,
+        help="""
+        The penalty applied on blank symbol during decoding.
+        Note: It is a positive value that would be applied to logits like
+        this `logits[:, 0] -= blank_penalty` (suppose logits.shape is
+        [batch_size, vocab] and blank id is 0).
+        """,
+    )
+
+    # Model-architecture options are shared with the training script.
+    add_model_arguments(parser)
+
+    return parser
+
+
+def decode_one_batch(
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+    lexicon: Lexicon,
+    batch: dict,
+    decoding_graph: Optional[k2.Fsa] = None,
+) -> Dict[str, List[List[str]]]:
+    """Decode one batch and return the result in a dict. The dict has the
+    following format:
+
+        - key: It indicates the setting used for decoding. It always embeds
+               the blank penalty:
+               for greedy search it is
+               "greedy_search_blank_penalty_<blank_penalty>";
+               for the fast_beam_search* methods it is
+               "blank_penalty_<..>_beam_<..>_max_contexts_<..>_max_states_<..>"
+               followed by the num_paths/nbest_scale part for "nbest" methods
+               and the ilme_scale/ngram_lm_scale part for "LG" methods;
+               otherwise it is "beam_size_<beam_size>_blank_penalty_<..>".
+        - value: It contains the decoding result. `len(value)` equals to
+                 batch size. `value[i]` is the decoding result for the i-th
+                 utterance in the given batch.
+    Args:
+      params:
+        It's the return value of :func:`get_params`.
+      model:
+        The neural model.
+      sp:
+        The byte-level BPE model. It converts token IDs back to byte-encoded
+        text, which is then decoded with `smart_byte_decode`.
+      lexicon:
+        Its `word_table` maps the IDs produced by fast_beam_search_LG to
+        words. It is not used by the other decoding methods.
+      batch:
+        It is the return value from iterating
+        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
+        for the format of the `batch`.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or LG, Used
+        only when --decoding-method is fast_beam_search,
+        fast_beam_search_LG, or fast_beam_search_nbest_oracle.
+    Returns:
+      Return the decoding result. See above description for the format of
+      the returned dict.
+    Raises:
+      ValueError: If `params.decoding_method` is not a supported method.
+    """
+    device = next(model.parameters()).device
+    feature = batch["inputs"]
+    assert feature.ndim == 3
+
+    feature = feature.to(device)
+    # at entry, feature is (N, T, C)
+
+    supervisions = batch["supervisions"]
+    feature_lens = supervisions["num_frames"].to(device)
+
+    if params.causal:
+        # this seems to cause insertions at the end of the utterance if used with zipformer.
+        pad_len = 30
+        # Out-of-place add: `.to(device)` returns the very same tensor when it
+        # already lives on `device`, so `+=` would silently modify
+        # batch["supervisions"]["num_frames"] for the caller.
+        feature_lens = feature_lens + pad_len
+        feature = torch.nn.functional.pad(
+            feature,
+            pad=(0, 0, 0, pad_len),
+            value=LOG_EPS,
+        )
+
+    x, x_lens = model.encoder_embed(feature, feature_lens)
+
+    src_key_padding_mask = make_pad_mask(x_lens)
+    x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
+
+    encoder_out, encoder_out_lens = model.encoder(x, x_lens, src_key_padding_mask)
+    encoder_out = encoder_out.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
+
+    hyps = []
+
+    if params.decoding_method == "fast_beam_search":
+        hyp_tokens = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            blank_penalty=params.blank_penalty,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(smart_byte_decode(hyp).split())
+    elif params.decoding_method == "fast_beam_search_LG":
+        hyp_tokens = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            blank_penalty=params.blank_penalty,
+            ilme_scale=params.ilme_scale,
+        )
+        # With an LG graph the output IDs are word IDs, not BPE token IDs.
+        for hyp in hyp_tokens:
+            hyps.append([lexicon.word_table[i] for i in hyp])
+    elif params.decoding_method == "fast_beam_search_nbest_oracle":
+        # References are converted to the same byte-encoded form used by the
+        # BPE model before they are tokenized.
+        ref_texts = []
+        for tx in supervisions["text"]:
+            ref_texts.append(byte_encode(tokenize_by_CJK_char(tx)))
+
+        hyp_tokens = fast_beam_search_nbest_oracle(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            num_paths=params.num_paths,
+            ref_texts=sp.encode(ref_texts),
+            nbest_scale=params.nbest_scale,
+            blank_penalty=params.blank_penalty,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(smart_byte_decode(hyp).split())
+    elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
+        hyp_tokens = greedy_search_batch(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            blank_penalty=params.blank_penalty,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(smart_byte_decode(hyp).split())
+    elif params.decoding_method == "modified_beam_search":
+        hyp_tokens = modified_beam_search(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            blank_penalty=params.blank_penalty,
+            beam=params.beam_size,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(smart_byte_decode(hyp).split())
+    else:
+        # Remaining methods are decoded one utterance at a time.
+        batch_size = encoder_out.size(0)
+
+        for i in range(batch_size):
+            # fmt: off
+            encoder_out_i = encoder_out[i:i + 1, :encoder_out_lens[i]]
+            # fmt: on
+            if params.decoding_method == "greedy_search":
+                hyp = greedy_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    max_sym_per_frame=params.max_sym_per_frame,
+                    blank_penalty=params.blank_penalty,
+                )
+            elif params.decoding_method == "beam_search":
+                hyp = beam_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    beam=params.beam_size,
+                    blank_penalty=params.blank_penalty,
+                )
+            else:
+                raise ValueError(
+                    f"Unsupported decoding method: {params.decoding_method}"
+                )
+            hyps.append(smart_byte_decode(sp.decode(hyp)).split())
+
+    key = f"blank_penalty_{params.blank_penalty}"
+    if params.decoding_method == "greedy_search":
+        return {"greedy_search_" + key: hyps}
+    elif "fast_beam_search" in params.decoding_method:
+        key += f"_beam_{params.beam}_"
+        key += f"max_contexts_{params.max_contexts}_"
+        key += f"max_states_{params.max_states}"
+        if "nbest" in params.decoding_method:
+            key += f"_num_paths_{params.num_paths}_"
+            key += f"nbest_scale_{params.nbest_scale}"
+        if "LG" in params.decoding_method:
+            key += f"_ilme_scale_{params.ilme_scale}"
+            key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
+
+        return {key: hyps}
+    else:
+        return {f"beam_size_{params.beam_size}_" + key: hyps}
+
+
+def decode_dataset(
+    dl: torch.utils.data.DataLoader,
+    params: AttributeDict,
+    model: nn.Module,
+    lexicon: Lexicon,
+    sp: spm.SentencePieceProcessor,
+    decoding_graph: Optional[k2.Fsa] = None,
+) -> Dict[str, List[Tuple[List[str], List[str]]]]:
+    """Decode every batch yielded by a dataloader.
+
+    Args:
+      dl:
+        PyTorch's dataloader containing the dataset to decode.
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The neural model.
+      lexicon:
+        The lexicon; forwarded unchanged to :func:`decode_one_batch`.
+      sp:
+        SentencePiece model; forwarded unchanged to :func:`decode_one_batch`.
+      decoding_graph:
+        The decoding graph; forwarded unchanged to :func:`decode_one_batch`.
+    Returns:
+      Return a dict keyed by the setting names produced by
+      :func:`decode_one_batch`. Each value is a list with one tuple per cut,
+      laid out as (cut id, reference text with all whitespace removed,
+      hypothesis as returned by :func:`decode_one_batch`).
+    """
+    try:
+        total_batches = len(dl)
+    except TypeError:
+        # The dataloader does not define a length.
+        total_batches = "?"
+
+    log_interval = 50 if params.decoding_method == "greedy_search" else 20
+
+    processed_cuts = 0
+    results = defaultdict(list)
+    for batch_idx, batch in enumerate(dl):
+        supervisions = batch["supervisions"]
+        texts = supervisions["text"]
+        cut_ids = [cut.id for cut in supervisions["cut"]]
+
+        hyps_dict = decode_one_batch(
+            params=params,
+            model=model,
+            sp=sp,
+            lexicon=lexicon,
+            decoding_graph=decoding_graph,
+            batch=batch,
+        )
+
+        for name, hyps in hyps_dict.items():
+            assert len(hyps) == len(texts)
+            results[name].extend(
+                [
+                    (cut_id, "".join(ref_text.split()), hyp_words)
+                    for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts)
+                ]
+            )
+
+        processed_cuts += len(texts)
+
+        if batch_idx % log_interval == 0:
+            progress = f"{batch_idx}/{total_batches}"
+            logging.info(
+                f"batch {progress}, cuts processed until now is {processed_cuts}"
+            )
+    return results
+
+
+def save_results(
+    params: AttributeDict,
+    test_set_name: str,
+    results_dict: Dict[str, List[Tuple[str, str, List[str]]]],
+):
+    """Write transcripts, error statistics and an error-rate summary to disk.
+
+    For every decoding setting in `results_dict` this writes a
+    `recogs-*.txt` file with the transcripts and an `errs-*.txt` file with
+    the error statistics, both under `params.res_dir`. References and
+    hypotheses are split into single characters before scoring, so the
+    figure labelled "WER" is computed over characters. A `wer-summary-*.txt`
+    file listing all settings, best first, is written at the end.
+
+    Args:
+      params:
+        Must provide `res_dir` (a `Path`) and `suffix` (a str).
+      test_set_name:
+        Name of the test set; it is embedded in file names and log messages.
+      results_dict:
+        Maps a decoding setting to a list of (cut id, reference, hypothesis)
+        tuples, as produced by :func:`decode_dataset`. If it is empty,
+        nothing is written.
+    """
+    if not results_dict:
+        # There is no setting to name the summary file after.
+        logging.warning(f"No decoding results to save for {test_set_name}")
+        return
+
+    test_set_wers = dict()
+    last_key = None
+    for key, results in results_dict.items():
+        last_key = key
+        recog_path = (
+            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        results = sorted(results)
+        store_transcripts(filename=recog_path, texts=results)
+        logging.info(f"The transcripts are stored in {recog_path}")
+
+        # The following prints out WERs, per-word error statistics and aligned
+        # ref/hyp pairs.
+        errs_filename = (
+            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+
+        # Split both sides into single characters so that the statistics are
+        # computed per character.
+        results_char = [
+            (cut_id, list("".join(ref)), list("".join(hyp)))
+            for cut_id, ref, hyp in results
+        ]
+
+        with open(errs_filename, "w") as f:
+            wer = write_error_stats(
+                f, f"{test_set_name}-{key}", results_char, enable_log=True
+            )
+            test_set_wers[key] = wer
+
+        logging.info("Wrote detailed error stats to {}".format(errs_filename))
+
+    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
+    # The summary file has always been named after the last decoding setting;
+    # keep that name, but track the setting explicitly instead of relying on
+    # the loop variable leaking out of the loop above.
+    errs_info = (
+        params.res_dir / f"wer-summary-{test_set_name}-{last_key}-{params.suffix}.txt"
+    )
+    with open(errs_info, "w") as f:
+        print("settings\tWER", file=f)
+        for key, val in test_set_wers:
+            print("{}\t{}".format(key, val), file=f)
+
+    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
+    note = "\tbest for {}".format(test_set_name)
+    for key, val in test_set_wers:
+        s += "{}\t{}{}\n".format(key, val, note)
+        note = ""
+    logging.info(s)
+
+
+@torch.no_grad()
+def main():
+    """Decode the AIShell dev and test sets with a byte-level BPE model.
+
+    Steps: parse the command line, build the file-name suffix describing the
+    decoding configuration, load (and optionally average) checkpoints, build
+    the decoding graph when a fast_beam_search* method is selected, then
+    decode both sets and store the results with :func:`save_results`.
+    """
+    parser = get_parser()
+    AishellAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    # Command-line values override the defaults returned by get_params().
+    params = get_params()
+    params.update(vars(args))
+
+    assert params.decoding_method in (
+        "greedy_search",
+        "beam_search",
+        "modified_beam_search",
+        "fast_beam_search",
+        "fast_beam_search_LG",
+        "fast_beam_search_nbest_oracle",
+    )
+    # Results for each decoding method go to their own sub-directory.
+    params.res_dir = params.exp_dir / params.decoding_method
+
+    # Build the suffix that is embedded in the log and result file names.
+    if params.iter > 0:
+        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
+    else:
+        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+
+    if params.causal:
+        # Both options are checked for commas, i.e. each must hold a single
+        # value when decoding.
+        assert (
+            "," not in params.chunk_size
+        ), "chunk_size should be one value in decoding."
+        assert (
+            "," not in params.left_context_frames
+        ), "left_context_frames should be one value in decoding."
+        params.suffix += f"-chunk-{params.chunk_size}"
+        params.suffix += f"-left-context-{params.left_context_frames}"
+
+    if "fast_beam_search" in params.decoding_method:
+        params.suffix += f"-beam-{params.beam}"
+        params.suffix += f"-max-contexts-{params.max_contexts}"
+        params.suffix += f"-max-states-{params.max_states}"
+        if "nbest" in params.decoding_method:
+            params.suffix += f"-nbest-scale-{params.nbest_scale}"
+            params.suffix += f"-num-paths-{params.num_paths}"
+        if "LG" in params.decoding_method:
+            params.suffix += f"_ilme_scale_{params.ilme_scale}"
+            params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
+    elif "beam_search" in params.decoding_method:
+        params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
+    else:
+        params.suffix += f"-context-{params.context_size}"
+        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
+    params.suffix += f"-blank-penalty-{params.blank_penalty}"
+
+    if params.use_averaged_model:
+        params.suffix += "-use-averaged-model"
+
+    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
+    logging.info("Decoding started")
+
+    # Use the first CUDA device when one is available, otherwise the CPU.
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> and <unk> are defined in local/train_bbpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.piece_to_id("<unk>")
+    params.vocab_size = sp.get_piece_size()
+
+    lexicon = Lexicon(params.lang_dir)
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_model(params)
+
+    # Load the model weights. There are two modes, each with an
+    # iteration-based (--iter > 0) and an epoch-based variant.
+    if not params.use_averaged_model:
+        # Plain averaging over `--avg` individual checkpoints.
+        if params.iter > 0:
+            # NOTE(review): the negated iteration presumably selects
+            # checkpoints at or before --iter; confirm against
+            # icefall.checkpoint.find_checkpoints.
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            # A single epoch checkpoint: nothing to average.
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            # Average epochs (epoch - avg + 1) .. epoch, skipping epochs < 1.
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        # Averaged-model mode: only a start and an end checkpoint are passed
+        # to average_checkpoints_with_averaged_model.
+        if params.iter > 0:
+            # One extra checkpoint is needed because the start checkpoint is
+            # excluded from the averaged range (see the log message below).
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            # The start epoch's checkpoint must exist, and epochs count from 1.
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+
+    model.to(device)
+    model.eval()
+
+    # Only the fast_beam_search* methods need a decoding graph.
+    if "fast_beam_search" in params.decoding_method:
+        if "LG" in params.decoding_method:
+            # NOTE(review): `lexicon` was already built from the same
+            # lang_dir above; this second construction looks redundant.
+            lexicon = Lexicon(params.lang_dir)
+            lg_filename = params.lang_dir / "LG.pt"
+            logging.info(f"Loading {lg_filename}")
+            decoding_graph = k2.Fsa.from_dict(
+                torch.load(lg_filename, map_location=device)
+            )
+            # Scale the graph scores by the n-gram LM scale.
+            decoding_graph.scores *= params.ngram_lm_scale
+        else:
+            decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+    else:
+        decoding_graph = None
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    # we need cut ids to display recognition results.
+    args.return_cuts = True
+    aishell = AishellAsrDataModule(args)
+
+    def remove_short_utt(c: Cut):
+        # Keep a cut only if T, derived from its frame count, is positive.
+        # NOTE(review): the formula presumably mirrors the frame reduction
+        # done by the model front-end; confirm against encoder_embed.
+        T = ((c.num_frames - 7) // 2 + 1) // 2
+        if T <= 0:
+            logging.warning(
+                f"Exclude cut with ID {c.id} from decoding, num_frames : {c.num_frames}."
+            )
+        return T > 0
+
+    dev_cuts = aishell.valid_cuts()
+    dev_cuts = dev_cuts.filter(remove_short_utt)
+    dev_dl = aishell.valid_dataloaders(dev_cuts)
+
+    test_cuts = aishell.test_cuts()
+    test_cuts = test_cuts.filter(remove_short_utt)
+    test_dl = aishell.test_dataloaders(test_cuts)
+
+    test_sets = ["dev", "test"]
+    test_dls = [dev_dl, test_dl]
+
+    # Decode each set and write its transcripts and error statistics.
+    for test_set, test_dl in zip(test_sets, test_dls):
+        results_dict = decode_dataset(
+            dl=test_dl,
+            params=params,
+            model=model,
+            lexicon=lexicon,
+            sp=sp,
+            decoding_graph=decoding_graph,
+        )
+
+        save_results(
+            params=params,
+            test_set_name=test_set,
+            results_dict=results_dict,
+        )
+
+    logging.info("Done!")
+
+
+if __name__ == "__main__":
+    main()
--- a/egs/aishell/ASR/zipformer/jit_pretrained_bbpe.py
+++ b/egs/aishell/ASR/zipformer/jit_pretrained_bbpe.py
@ -0,0 +1,279 @@
+#!/usr/bin/env python3
+# Copyright 2021-2024 Xiaomi Corporation (Author: Fangjun Kuang,
+#                                                 Zengwei Yao,
+#                                                 Zengrui Jin,)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script loads torchscript models, exported by `torch.jit.script()`
+and uses them to decode waves.
+You can use the following command to get the exported models:
+
+./zipformer/export.py \
+  --exp-dir ./zipformer_bbpe/exp \
+  --bpe ./data/lang_bbpe_500/bbpe.model \
+  --epoch 30 \
+  --avg 9 \
+  --jit 1
+
+Usage of this script:
+
+./zipformer/jit_pretrained_bbpe.py \
+  --nn-model-filename ./zipformer_bbpe/exp/cpu_jit.pt \
+  --bpe ./data/lang_bbpe_500/bbpe.model \
+  /path/to/foo.wav \
+  /path/to/bar.wav
+"""
+
+import argparse
+import logging
+import math
+from typing import List
+
+import kaldifeat
+import sentencepiece as spm
+import torch
+import torchaudio
+from torch.nn.utils.rnn import pad_sequence
+
+from icefall import smart_byte_decode
+
+
+def get_parser():
+    """Build the command-line parser for this script.
+
+    Returns:
+      An ``argparse.ArgumentParser`` accepting the torchscript model path,
+      the bbpe model path and one or more sound files.
+    """
+    sound_files_help = (
+        "The input sound file(s) to transcribe. "
+        "Supported formats are those supported by torchaudio.load(). "
+        "For example, wav and flac are supported. "
+        "The sample rate has to be 16kHz."
+    )
+
+    arg_parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    arg_parser.add_argument(
+        "--nn-model-filename",
+        type=str,
+        required=True,
+        help="Path to the torchscript model cpu_jit.pt",
+    )
+    arg_parser.add_argument(
+        "--bpe-model",
+        type=str,
+        required=True,
+        help="Path to the bbpe.model.",
+    )
+    arg_parser.add_argument(
+        "sound_files",
+        type=str,
+        nargs="+",
+        help=sound_files_help,
+    )
+    return arg_parser
+
+
+def read_sound_files(
+    filenames: List[str], expected_sample_rate: float = 16000
+) -> List[torch.Tensor]:
+    """Load sound files and keep the first channel of each one.
+
+    Args:
+      filenames:
+        Paths of the sound files, loaded with ``torchaudio.load()``.
+      expected_sample_rate:
+        The sample rate every file must have; a mismatch fails an assertion.
+    Returns:
+      A list with one contiguous 1-D tensor per input file, in input order.
+    """
+    waves = []
+    for path in filenames:
+        samples, rate = torchaudio.load(path)
+        assert (
+            rate == expected_sample_rate
+        ), f"expected sample rate: {expected_sample_rate}. Given: {rate}"
+        # Only the first channel of a multi-channel file is used.
+        first_channel = samples[0]
+        waves.append(first_channel.contiguous())
+    return waves
+
+
+def greedy_search(
+    model: torch.jit.ScriptModule,
+    encoder_out: torch.Tensor,
+    encoder_out_lens: torch.Tensor,
+) -> List[List[int]]:
+    """Batched greedy search that emits at most one symbol per frame.
+
+    Args:
+      model:
+        The transducer model; its ``decoder`` and ``joiner`` are used.
+      encoder_out:
+        A 3-D tensor of shape (N, T, C)
+      encoder_out_lens:
+        A 1-D tensor of shape (N,).
+    Returns:
+      One list of token IDs per utterance, in the order of the input batch.
+    """
+    assert encoder_out.ndim == 3
+    assert encoder_out.size(0) >= 1, encoder_out.size(0)
+
+    # Packing regroups the frames by time step, longest utterance first, so
+    # that step t only contains the utterances that still have a frame at t.
+    packed = torch.nn.utils.rnn.pack_padded_sequence(
+        input=encoder_out,
+        lengths=encoder_out_lens.cpu(),
+        batch_first=True,
+        enforce_sorted=False,
+    )
+
+    device = encoder_out.device
+    blank_id = model.decoder.blank_id
+
+    step_sizes = packed.batch_sizes.tolist()
+    num_utts = encoder_out.size(0)
+
+    assert torch.all(encoder_out_lens > 0), encoder_out_lens
+    assert num_utts == step_sizes[0], (num_utts, step_sizes)
+
+    context_size = model.decoder.context_size
+    # Hypotheses are kept in the packed (length-sorted) order and start with
+    # `context_size` blanks that are stripped again before returning.
+    hyps = [[blank_id] * context_size for _ in range(num_utts)]
+
+    def run_decoder(contexts):
+        # contexts: one list of `context_size` token IDs per active utterance.
+        tokens = torch.tensor(contexts, device=device, dtype=torch.int64)
+        return model.decoder(tokens, need_pad=torch.tensor([False])).squeeze(1)
+
+    decoder_out = run_decoder(hyps)
+
+    start = 0
+    for batch_size in step_sizes:
+        end = start + batch_size
+        # frame's shape: (batch_size, encoder_out_dim)
+        frame = packed.data[start:end]
+        start = end
+
+        # Utterances that have already ended sit at the tail; drop them.
+        decoder_out = decoder_out[:batch_size]
+
+        logits = model.joiner(frame, decoder_out)
+        # logits' shape: (batch_size, vocab_size)
+        assert logits.ndim == 2, logits.shape
+
+        emitted = False
+        for hyp, token in zip(hyps, logits.argmax(dim=1).tolist()):
+            if token != blank_id:
+                hyp.append(token)
+                emitted = True
+
+        if emitted:
+            # The decoder output only changes when some hypothesis grew.
+            decoder_out = run_decoder([h[-context_size:] for h in hyps[:batch_size]])
+
+    # Map the results from packed order back to the caller's order.
+    restore = packed.unsorted_indices.tolist()
+    return [hyps[restore[i]][context_size:] for i in range(num_utts)]
+
+
+@torch.no_grad()
+def main():
+    """Transcribe the given sound files with a torchscript model.
+
+    Loads the torchscript model and the bbpe model named on the command line,
+    computes fbank features for all sound files, runs the encoder followed by
+    greedy search, and logs the decoded text of every file.
+    """
+    args = get_parser().parse_args()
+    logging.info(vars(args))
+
+    # Use the first GPU when CUDA is available, otherwise the CPU.
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+    else:
+        device = torch.device("cpu")
+
+    logging.info(f"device: {device}")
+
+    model = torch.jit.load(args.nn_model_filename)
+    model.eval()
+    model.to(device)
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(args.bpe_model)
+
+    logging.info("Constructing Fbank computer")
+    opts = kaldifeat.FbankOptions()
+    opts.device = device
+    opts.frame_opts.dither = 0
+    opts.frame_opts.snip_edges = False
+    opts.frame_opts.samp_freq = 16000
+    opts.mel_opts.num_bins = 80
+    opts.mel_opts.high_freq = -400
+    fbank = kaldifeat.Fbank(opts)
+
+    logging.info(f"Reading sound files: {args.sound_files}")
+    waves = [w.to(device) for w in read_sound_files(filenames=args.sound_files)]
+
+    logging.info("Decoding started")
+    features = fbank(waves)
+    # Record the unpadded lengths before the features are padded into a batch.
+    feature_lengths = torch.tensor([f.size(0) for f in features], device=device)
+    features = pad_sequence(
+        features,
+        batch_first=True,
+        padding_value=math.log(1e-10),
+    )
+
+    encoder_out, encoder_out_lens = model.encoder(
+        features=features,
+        feature_lengths=feature_lengths,
+    )
+
+    hyps = greedy_search(
+        model=model,
+        encoder_out=encoder_out,
+        encoder_out_lens=encoder_out_lens,
+    )
+
+    report = ["\n"]
+    for filename, hyp in zip(args.sound_files, hyps):
+        words = smart_byte_decode(sp.decode(hyp))
+        report.append(f"{filename}:\n{words}\n\n")
+    logging.info("".join(report))
+
+    logging.info("Decoding Done")
+
+
+if __name__ == "__main__":
+    # Configure logging before main() emits its first message.
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+    )
+    main()
--- a/egs/aishell/ASR/zipformer/pretrained_bbpe.py
+++ b/egs/aishell/ASR/zipformer/pretrained_bbpe.py
@ -0,0 +1,403 @@
+#!/usr/bin/env python3
+# Copyright 2021-2024 Xiaomi Corporation (Author: Fangjun Kuang,
+#                                                 Zengwei Yao,
+#                                                 Zengrui Jin,)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script loads a checkpoint and uses it to decode waves.
+You can generate the checkpoint with the following command:
+
+Note: This is an example for the librispeech dataset; if you are using a
+different dataset, you should change the argument values accordingly.
+
+- For non-streaming model:
+
+./zipformer/export.py \
+  --exp-dir ./zipformer/exp_bbpe \
+  --tokens ./data/lang_bbpe_500/tokens.txt \
+  --epoch 30 \
+  --avg 9
+
+- For streaming model:
+
+./zipformer/export.py \
+  --exp-dir ./zipformer/exp_bbpe \
+  --causal 1 \
+  --tokens ./data/lang_bbpe_500/tokens.txt \
+  --epoch 30 \
+  --avg 9
+
+Usage of this script:
+
+- For non-streaming model:
+
+(1) greedy search
+./zipformer/pretrained_bbpe.py \
+  --checkpoint ./zipformer/exp_bbpe/pretrained.pt \
+  --bpe ./data/lang_bbpe_500/bbpe.model \
+  --method greedy_search \
+  /path/to/foo.wav \
+  /path/to/bar.wav
+
+(2) modified beam search
+./zipformer/pretrained_bbpe.py \
+  --checkpoint ./zipformer/exp_bbpe/pretrained.pt \
+  --bpe ./data/lang_bbpe_500/bbpe.model \
+  --method modified_beam_search \
+  /path/to/foo.wav \
+  /path/to/bar.wav
+
+(3) fast beam search
+./zipformer/pretrained_bbpe.py \
+  --checkpoint ./zipformer/exp_bbpe/pretrained.pt \
+  --bpe ./data/lang_bbpe_500/bbpe.model \
+  --method fast_beam_search \
+  /path/to/foo.wav \
+  /path/to/bar.wav
+
+- For streaming model:
+
+(1) greedy search
+./zipformer/pretrained_bbpe.py \
+  --checkpoint ./zipformer/exp_bbpe/pretrained.pt \
+  --causal 1 \
+  --chunk-size 16 \
+  --left-context-frames 128 \
+  --bpe ./data/lang_bbpe_500/bbpe.model \
+  --method greedy_search \
+  /path/to/foo.wav \
+  /path/to/bar.wav
+
+(2) modified beam search
+./zipformer/pretrained_bbpe.py \
+  --checkpoint ./zipformer/exp_bbpe/pretrained.pt \
+  --causal 1 \
+  --chunk-size 16 \
+  --left-context-frames 128 \
+  --bpe ./data/lang_bbpe_500/bbpe.model \
+  --method modified_beam_search \
+  /path/to/foo.wav \
+  /path/to/bar.wav
+
+(3) fast beam search
+./zipformer/pretrained_bbpe.py \
+  --checkpoint ./zipformer/exp_bbpe/pretrained.pt \
+  --causal 1 \
+  --chunk-size 16 \
+  --left-context-frames 128 \
+  --bpe ./data/lang_bbpe_500/bbpe.model \
+  --method fast_beam_search \
+  /path/to/foo.wav \
+  /path/to/bar.wav
+
+
+You can also use `./zipformer/exp_bbpe/epoch-xx.pt`.
+
+Note: ./zipformer/exp_bbpe/pretrained.pt is generated by ./zipformer/export_bbpe.py
+"""
+
+
+import argparse
+import logging
+import math
+from typing import List
+
+import k2
+import kaldifeat
+import sentencepiece as spm
+import torch
+import torchaudio
+from beam_search import (
+    beam_search,
+    fast_beam_search_one_best,
+    greedy_search,
+    greedy_search_batch,
+    modified_beam_search,
+)
+from torch.nn.utils.rnn import pad_sequence
+from train import add_model_arguments, get_model, get_params
+
+from icefall import smart_byte_decode
+
+
+def get_parser():
+    """Build the command-line parser for this script.
+
+    Returns:
+      An ``argparse.ArgumentParser`` holding the decoding options defined
+      here plus the options registered by ``add_model_arguments()``.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--checkpoint",
+        type=str,
+        required=True,
+        help="Path to the checkpoint. "
+        "The checkpoint is assumed to be saved by "
+        "icefall.checkpoint.save_checkpoint().",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        required=True,
+        help="""Path to the bbpe.model.""",
+    )
+
+    # beam_search is listed because main() dispatches on it as well.
+    parser.add_argument(
+        "--method",
+        type=str,
+        default="greedy_search",
+        help="""Possible values are:
+          - greedy_search
+          - beam_search
+          - modified_beam_search
+          - fast_beam_search
+        """,
+    )
+
+    parser.add_argument(
+        "sound_files",
+        type=str,
+        nargs="+",
+        help="The input sound file(s) to transcribe. "
+        "Supported formats are those supported by torchaudio.load(). "
+        "For example, wav and flac are supported. "
+        "The sample rate has to be 16kHz.",
+    )
+
+    parser.add_argument(
+        "--sample-rate",
+        type=int,
+        default=16000,
+        help="The sample rate of the input sound file",
+    )
+
+    parser.add_argument(
+        "--beam-size",
+        type=int,
+        default=4,
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=4,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=4,
+        help="""Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=8,
+        help="""Used only when --method is fast_beam_search""",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+
+    parser.add_argument(
+        "--max-sym-per-frame",
+        type=int,
+        default=1,
+        help="""Maximum number of symbols per frame. Used only when
+        --method is greedy_search.
+        """,
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def read_sound_files(
+    filenames: List[str], expected_sample_rate: float
+) -> List[torch.Tensor]:
+    """Load each sound file and return its first channel.
+
+    Args:
+      filenames:
+        Paths of the sound files, loaded with ``torchaudio.load()``.
+      expected_sample_rate:
+        The sample rate each file is required to have; any other rate
+        fails an assertion.
+    Returns:
+      One contiguous 1-D tensor per file, in the order of ``filenames``.
+    """
+    channels = []
+    for sound_file in filenames:
+        audio, rate = torchaudio.load(sound_file)
+        assert (
+            rate == expected_sample_rate
+        ), f"expected sample rate: {expected_sample_rate}. Given: {rate}"
+        # Row 0 is the first channel; any further channels are ignored.
+        channels.append(audio[0].contiguous())
+    return channels
+
+
+@torch.no_grad()
+def main():
+    """Decode the given sound files with a checkpoint and log the results.
+
+    Builds the model from the parsed parameters, loads the weights stored
+    under the ``"model"`` key of the checkpoint, computes fbank features for
+    the sound files, runs ``model.forward_encoder`` and decodes with the
+    search selected by ``--method``.
+
+    Raises:
+      ValueError: If ``--method`` is not one of the supported searches.
+    """
+    args = get_parser().parse_args()
+
+    params = get_params()
+    params.update(vars(args))
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.piece_to_id("<unk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(f"{params}")
+
+    # Use the first GPU when CUDA is available, otherwise the CPU.
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+    else:
+        device = torch.device("cpu")
+
+    logging.info(f"device: {device}")
+
+    if params.causal:
+        assert (
+            "," not in params.chunk_size
+        ), "chunk_size should be one value in decoding."
+        assert (
+            "," not in params.left_context_frames
+        ), "left_context_frames should be one value in decoding."
+
+    logging.info("Creating model")
+    model = get_model(params)
+
+    num_param = sum(p.numel() for p in model.parameters())
+    logging.info(f"Number of model parameters: {num_param}")
+
+    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    model.load_state_dict(checkpoint["model"], strict=False)
+    model.to(device)
+    model.eval()
+
+    logging.info("Constructing Fbank computer")
+    opts = kaldifeat.FbankOptions()
+    opts.device = device
+    opts.frame_opts.dither = 0
+    opts.frame_opts.snip_edges = False
+    opts.frame_opts.samp_freq = params.sample_rate
+    opts.mel_opts.num_bins = params.feature_dim
+    opts.mel_opts.high_freq = -400
+    fbank = kaldifeat.Fbank(opts)
+
+    logging.info(f"Reading sound files: {params.sound_files}")
+    waves = [
+        w.to(device)
+        for w in read_sound_files(
+            filenames=params.sound_files, expected_sample_rate=params.sample_rate
+        )
+    ]
+
+    logging.info("Decoding started")
+    features = fbank(waves)
+    # Record the unpadded lengths before the features are padded into a batch.
+    feature_lengths = torch.tensor([f.size(0) for f in features], device=device)
+    features = pad_sequence(features, batch_first=True, padding_value=math.log(1e-10))
+
+    # model forward
+    encoder_out, encoder_out_lens = model.forward_encoder(features, feature_lengths)
+
+    logging.info(f"Using {params.method}")
+
+    def detokenize(batch_tokens):
+        # Turn a batch of token-ID lists into one word list per utterance.
+        return [smart_byte_decode(text).split() for text in sp.decode(batch_tokens)]
+
+    method = params.method
+    hyps = []
+    if method == "fast_beam_search":
+        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+        hyp_tokens = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+        )
+        hyps.extend(detokenize(hyp_tokens))
+    elif method == "modified_beam_search":
+        hyp_tokens = modified_beam_search(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+        )
+        hyps.extend(detokenize(hyp_tokens))
+    elif method == "greedy_search" and params.max_sym_per_frame == 1:
+        hyp_tokens = greedy_search_batch(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+        )
+        hyps.extend(detokenize(hyp_tokens))
+    else:
+        # The remaining searches are run on one utterance at a time.
+        for i in range(encoder_out.size(0)):
+            # fmt: off
+            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
+            # fmt: on
+            if method == "greedy_search":
+                hyp = greedy_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    max_sym_per_frame=params.max_sym_per_frame,
+                )
+            elif method == "beam_search":
+                hyp = beam_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    beam=params.beam_size,
+                )
+            else:
+                raise ValueError(f"Unsupported method: {params.method}")
+
+            hyps.append(smart_byte_decode(sp.decode(hyp)).split())
+
+    report = ["\n"]
+    for filename, hyp in zip(params.sound_files, hyps):
+        words = " ".join(hyp)
+        report.append(f"{filename}:\n{words}\n\n")
+    logging.info("".join(report))
+
+    logging.info("Decoding Done")
+
+
+if __name__ == "__main__":
+    # Configure logging before main() emits its first message.
+    logging.basicConfig(
+        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
+        level=logging.INFO,
+    )
+    main()
--- a/egs/aishell/ASR/zipformer/train.py
+++ b/egs/aishell/ASR/zipformer/train.py
@ -86,6 +86,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import (
@ -985,9 +986,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/aishell/ASR/zipformer/train_bbpe.py
+++ b/egs/aishell/ASR/zipformer/train_bbpe.py
@ -0,0 +1,941 @@
+#!/usr/bin/env python3
+# Copyright    2021-2024  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                       Wei Kang,
+#                                                       Mingshuang Luo,
+#                                                       Zengwei Yao,
+#                                                       Daniel Povey,
+#                                                       Zengrui Jin,)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+
+./zipformer/train_bbpe.py \
+  --world-size 8 \
+  --num-epochs 12 \
+  --start-epoch 1 \
+  --exp-dir zipformer/exp_bbpe \
+  --max-duration 350
+
+# For mixed precision training:
+
+./zipformer/train_bbpe.py \
+  --world-size 8 \
+  --num-epochs 12 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir zipformer/exp_bbpe \
+  --max-duration 750
+
+"""
+
+
+import argparse
+import copy
+import logging
+import warnings
+from pathlib import Path
+from typing import Optional, Tuple, Union
+
+import k2
+import sentencepiece as spm
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+from asr_datamodule import AishellAsrDataModule
+from lhotse.cut import Cut
+from lhotse.utils import fix_random_seed
+from optim import Eden, ScaledAdam
+from torch import Tensor
+from torch.cuda.amp import GradScaler
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+from train import (
+    LRSchedulerType,
+    add_model_arguments,
+    get_adjusted_batch_count,
+    get_model,
+    get_params,
+    load_checkpoint_if_available,
+    save_checkpoint,
+    set_batch_count,
+)
+
+from icefall import byte_encode, diagnostics
+from icefall.checkpoint import remove_checkpoints
+from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
+from icefall.checkpoint import (
+    save_checkpoint_with_global_batch_idx,
+    update_averaged_model,
+)
+from icefall.dist import cleanup_dist, setup_dist
+from icefall.err import raise_grad_scale_is_too_small_error
+from icefall.hooks import register_inf_check_hooks
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    get_parameter_groups_with_lrs,
+    setup_logger,
+    str2bool,
+    tokenize_by_CJK_char,
+)
+
+
+def get_parser():
+    """Build the command-line parser for byte-level BPE training.
+
+    Returns:
+      An ``argparse.ArgumentParser`` holding the training options defined
+      here plus the options registered by ``add_model_arguments()``.
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--world-size",
+        type=int,
+        default=1,
+        help="Number of GPUs for DDP training.",
+    )
+
+    parser.add_argument(
+        "--master-port",
+        type=int,
+        default=12354,
+        help="Master port to use for DDP training.",
+    )
+
+    parser.add_argument(
+        "--tensorboard",
+        type=str2bool,
+        default=True,
+        help="Should various information be logged in tensorboard.",
+    )
+
+    parser.add_argument(
+        "--num-epochs",
+        type=int,
+        default=30,
+        help="Number of epochs to train.",
+    )
+
+    parser.add_argument(
+        "--start-epoch",
+        type=int,
+        default=1,
+        help="""Resume training from this epoch. It should be positive.
+        If larger than 1, it will load checkpoint from
+        exp-dir/epoch-{start_epoch-1}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--start-batch",
+        type=int,
+        default=0,
+        help="""If positive, --start-epoch is ignored and
+        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="zipformer_bbpe/exp",
+        help="""The experiment dir.
+        It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bbpe_500/bbpe.model",
+        help="Path to the Byte BPE model",
+    )
+
+    parser.add_argument(
+        "--base-lr", type=float, default=0.045, help="The base learning rate."
+    )
+
+    parser.add_argument(
+        "--lr-batches",
+        type=float,
+        default=7500,
+        help="""Number of steps that affects how rapidly the learning rate
+        decreases. We suggest not to change this.""",
+    )
+
+    parser.add_argument(
+        "--lr-epochs",
+        type=float,
+        default=3.5,
+        help="""Number of epochs that affects how rapidly the learning rate decreases.
+        """,
+    )
+
+    parser.add_argument(
+        "--ref-duration",
+        type=float,
+        default=600,
+        help="""Reference batch duration for purposes of adjusting batch counts for setting various schedules inside the model""",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="""The context size in the decoder. 1 means bigram; 2 means tri-gram""",
+    )
+
+    parser.add_argument(
+        "--prune-range",
+        type=int,
+        default=5,
+        help="""The prune range for rnnt loss, it means how many symbols(context)
+        we are using to compute the loss""",
+    )
+
+    parser.add_argument(
+        "--lm-scale",
+        type=float,
+        default=0.25,
+        help="""The scale to smooth the loss with lm
+        (output of prediction network) part.""",
+    )
+
+    parser.add_argument(
+        "--am-scale",
+        type=float,
+        default=0.0,
+        help="""The scale to smooth the loss with am (output of encoder network) part.""",
+    )
+
+    parser.add_argument(
+        "--simple-loss-scale",
+        type=float,
+        default=0.5,
+        help="""To get pruning ranges, we will calculate a simple version
+        loss(joiner is just addition), this simple loss also uses for
+        training (as a regularization item). We will scale the simple loss
+        with this parameter before adding to the final loss.""",
+    )
+
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="The seed for random generators intended for reproducibility",
+    )
+
+    parser.add_argument(
+        "--print-diagnostics",
+        type=str2bool,
+        default=False,
+        help="Accumulate stats on activations, print them and exit.",
+    )
+
+    parser.add_argument(
+        "--inf-check",
+        type=str2bool,
+        default=False,
+        help="Add hooks to check for infinite module outputs and gradients.",
+    )
+
+    parser.add_argument(
+        "--save-every-n",
+        type=int,
+        default=4000,
+        help="""Save checkpoint after processing this number of batches
+        periodically. We save checkpoint to exp-dir/ whenever
+        params.batch_idx_train % save_every_n == 0. The checkpoint filename
+        has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
+        Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
+        end of each epoch where `xxx` is the epoch number counting from 0.
+        """,
+    )
+
+    parser.add_argument(
+        "--keep-last-k",
+        type=int,
+        default=30,
+        help="""Only keep this number of checkpoints on disk.
+        For instance, if it is 3, there are only 3 checkpoints
+        in the exp-dir with filenames `checkpoint-xxx.pt`.
+        It does not affect checkpoints with name `epoch-xxx.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--average-period",
+        type=int,
+        default=200,
+        help="""Update the averaged model, namely `model_avg`, after processing
+        this number of batches. `model_avg` is a separate version of model,
+        in which each floating-point parameter is the average of all the
+        parameters from the start of training. Each time we take the average,
+        we do: `model_avg = model * (average_period / batch_idx_train) +
+            model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
+        """,
+    )
+
+    parser.add_argument(
+        "--use-fp16",
+        type=str2bool,
+        default=False,
+        help="Whether to use half precision training.",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def compute_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    batch: dict,
+    is_training: bool,
+) -> Tuple[Tensor, MetricsTracker]:
+    """
+    Compute the pruned transducer loss given the model and its inputs.
+
+    The returned loss is a weighted sum of the simple loss and the pruned
+    loss produced by the model; the two weights are interpolated over the
+    first `params.warm_step` training batches.
+
+    Args:
+      params:
+        Parameters for training. See :func:`get_params`.
+      model:
+        The model for training. It is an instance of Zipformer in our case.
+      sp:
+        The sentencepiece model used to encode the supervision texts into
+        token IDs.
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      is_training:
+        True for training. False for validation. When it is True, this
+        function enables autograd during computation; when it is False, it
+        disables autograd.
+    Returns:
+      A tuple `(loss, info)`: `loss` is the combined loss tensor and `info`
+      is a `MetricsTracker` holding the `frames`, `loss`, `simple_loss` and
+      `pruned_loss` entries for this batch.
+    """
+    device = model.device if isinstance(model, DDP) else next(model.parameters()).device
+    feature = batch["inputs"]
+    # at entry, feature is (N, T, C)
+    assert feature.ndim == 3
+    feature = feature.to(device)
+
+    supervisions = batch["supervisions"]
+    feature_lens = supervisions["num_frames"].to(device)
+
+    batch_idx_train = params.batch_idx_train
+    warm_step = params.warm_step
+
+    texts = supervisions["text"]
+    y = sp.encode(texts, out_type=int)
+    y = k2.RaggedTensor(y).to(device)
+
+    with torch.set_grad_enabled(is_training):
+        simple_loss, pruned_loss, _ = model(
+            x=feature,
+            x_lens=feature_lens,
+            y=y,
+            prune_range=params.prune_range,
+            am_scale=params.am_scale,
+            lm_scale=params.lm_scale,
+        )
+
+        s = params.simple_loss_scale
+        # take down the scale on the simple loss from 1.0 at the start
+        # to params.simple_loss scale by warm_step.
+        simple_loss_scale = (
+            s
+            if batch_idx_train >= warm_step
+            else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
+        )
+        # ramp the scale on the pruned loss up from 0.1 at the start
+        # to 1.0 by warm_step.
+        pruned_loss_scale = (
+            1.0
+            if batch_idx_train >= warm_step
+            else 0.1 + 0.9 * (batch_idx_train / warm_step)
+        )
+
+        loss = simple_loss_scale * simple_loss + pruned_loss_scale * pruned_loss
+
+    assert loss.requires_grad == is_training
+
+    info = MetricsTracker()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+
+    # Note: We use reduction=sum while computing the loss.
+    info["loss"] = loss.detach().cpu().item()
+    info["simple_loss"] = simple_loss.detach().cpu().item()
+    info["pruned_loss"] = pruned_loss.detach().cpu().item()
+
+    return loss, info
+
+
+def compute_validation_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    valid_dl: torch.utils.data.DataLoader,
+    world_size: int = 1,
+) -> MetricsTracker:
+    """Evaluate the model on every batch of `valid_dl`.
+
+    The per-batch metrics are summed (`reduce()` is called on them when
+    `world_size > 1`) and returned. `params.best_valid_loss` and
+    `params.best_valid_epoch` are updated when the per-frame loss improves.
+    """
+    model.eval()
+
+    summed = MetricsTracker()
+    for batch in valid_dl:
+        loss, batch_info = compute_loss(
+            params=params, model=model, sp=sp, batch=batch, is_training=False
+        )
+        assert loss.requires_grad is False
+        summed = summed + batch_info
+
+    if world_size > 1:
+        summed.reduce(loss.device)
+
+    frame_loss = summed["loss"] / summed["frames"]
+    if frame_loss < params.best_valid_loss:
+        params.best_valid_epoch = params.cur_epoch
+        params.best_valid_loss = frame_loss
+
+    return summed
+
+
+def train_one_epoch(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    optimizer: torch.optim.Optimizer,
+    scheduler: LRSchedulerType,
+    sp: spm.SentencePieceProcessor,
+    train_dl: torch.utils.data.DataLoader,
+    valid_dl: torch.utils.data.DataLoader,
+    scaler: GradScaler,
+    model_avg: Optional[nn.Module] = None,
+    tb_writer: Optional[SummaryWriter] = None,
+    world_size: int = 1,
+    rank: int = 0,
+) -> None:
+    """Train the model for one epoch.
+
+    The per-frame training loss is saved in `params.train_loss`. Validation
+    is run whenever `batch_idx` is a multiple of `params.valid_interval`.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The model for training.
+      optimizer:
+        The optimizer we are using.
+      scheduler:
+        The learning rate scheduler; we call `step_batch()` on every batch.
+      sp:
+        The SentencePiece model, forwarded to :func:`compute_loss`.
+      train_dl:
+        Dataloader for the training dataset.
+      valid_dl:
+        Dataloader for the validation dataset.
+      scaler:
+        The scaler used for mixed precision training.
+      model_avg:
+        The stored model averaged from the start of training.
+      tb_writer:
+        Writer to write log messages to tensorboard.
+      world_size:
+        Number of nodes in DDP training. If it is 1, DDP is disabled.
+      rank:
+        The rank of the node in DDP training; 0 when DDP is not used.
+    """
+    model.train()
+
+    tot_loss = MetricsTracker()
+
+    cur_batch_idx = params.get("cur_batch_idx", 0)
+
+    saved_bad_model = False
+
+    def save_bad_model(suffix: str = ""):
+        save_checkpoint_impl(
+            filename=params.exp_dir / f"bad-model{suffix}-{rank}.pt",
+            model=model,
+            model_avg=model_avg,
+            params=params,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sampler=train_dl.sampler,
+            scaler=scaler,
+            rank=0,  # literal 0, not `rank`; presumably so every process saves its own file - confirm
+        )
+
+    for batch_idx, batch in enumerate(train_dl):
+        if batch_idx % 10 == 0:
+            set_batch_count(model, get_adjusted_batch_count(params))
+        if batch_idx < cur_batch_idx:  # skip batches below params.cur_batch_idx (set before mid-epoch saves)
+            continue
+        cur_batch_idx = batch_idx
+
+        params.batch_idx_train += 1
+        batch_size = len(batch["supervisions"]["text"])
+
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, loss_info = compute_loss(
+                    params=params,
+                    model=model,
+                    sp=sp,
+                    batch=batch,
+                    is_training=True,
+                )
+            # summary stats
+            tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
+
+            # NOTE: We use reduction==sum and loss is computed over utterances
+            # in the batch and there is no normalization to it so far.
+            scaler.scale(loss).backward()
+            scheduler.step_batch(params.batch_idx_train)
+
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.zero_grad()
+        except:  # noqa
+            save_bad_model()
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise
+
+        if params.print_diagnostics and batch_idx == 5:  # diagnostics mode: stop after batch index 5
+            return
+
+        if (
+            rank == 0
+            and params.batch_idx_train > 0
+            and params.batch_idx_train % params.average_period == 0
+        ):
+            update_averaged_model(
+                params=params,
+                model_cur=model,
+                model_avg=model_avg,
+            )
+
+        if (
+            params.batch_idx_train > 0
+            and params.batch_idx_train % params.save_every_n == 0
+        ):
+            params.cur_batch_idx = batch_idx
+            save_checkpoint_with_global_batch_idx(
+                out_dir=params.exp_dir,
+                global_batch_idx=params.batch_idx_train,
+                model=model,
+                model_avg=model_avg,
+                params=params,
+                optimizer=optimizer,
+                scheduler=scheduler,
+                sampler=train_dl.sampler,
+                scaler=scaler,
+                rank=rank,
+            )
+            del params.cur_batch_idx  # only set while the checkpoint above is written
+            remove_checkpoints(
+                out_dir=params.exp_dir,
+                topk=params.keep_last_k,
+                rank=rank,
+            )
+
+        if batch_idx % 100 == 0 and params.use_fp16:
+            # If the grad scale is small (< 8, or < 32 on every 400th batch), double it. The
+            # _growth_interval of the grad scaler is configurable, but we can't configure it to
+            # have different behavior depending on the current grad scale.
+            cur_grad_scale = scaler._scale.item()
+
+            if cur_grad_scale < 8.0 or (cur_grad_scale < 32.0 and batch_idx % 400 == 0):
+                scaler.update(cur_grad_scale * 2.0)
+            if cur_grad_scale < 0.01:
+                if not saved_bad_model:
+                    save_bad_model(suffix="-first-warning")
+                    saved_bad_model = True
+                logging.warning(f"Grad scale is small: {cur_grad_scale}")
+            if cur_grad_scale < 1.0e-05:
+                save_bad_model()
+                raise_grad_scale_is_too_small_error(cur_grad_scale)
+
+        if batch_idx % params.log_interval == 0:
+            cur_lr = max(scheduler.get_last_lr())
+            cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
+
+            logging.info(
+                f"Epoch {params.cur_epoch}, "
+                f"batch {batch_idx}, loss[{loss_info}], "
+                f"tot_loss[{tot_loss}], batch size: {batch_size}, "
+                f"lr: {cur_lr:.2e}, "
+                + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
+            )
+
+            if tb_writer is not None:
+                tb_writer.add_scalar(
+                    "train/learning_rate", cur_lr, params.batch_idx_train
+                )
+
+                loss_info.write_summary(
+                    tb_writer, "train/current_", params.batch_idx_train
+                )
+                tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
+                if params.use_fp16:
+                    tb_writer.add_scalar(
+                        "train/grad_scale", cur_grad_scale, params.batch_idx_train
+                    )
+
+        if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
+            logging.info("Computing validation loss")
+            valid_info = compute_validation_loss(
+                params=params,
+                model=model,
+                sp=sp,
+                valid_dl=valid_dl,
+                world_size=world_size,
+            )
+            model.train()  # compute_validation_loss() switched the model to eval mode
+            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
+            logging.info(
+                f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
+            )
+            if tb_writer is not None:
+                valid_info.write_summary(
+                    tb_writer, "train/valid_", params.batch_idx_train
+                )
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    params.train_loss = loss_value
+    if params.train_loss < params.best_train_loss:
+        params.best_train_epoch = params.cur_epoch
+        params.best_train_loss = params.train_loss
+
+
+def run(rank, world_size, args):
+    """
+    Args:
+      rank:
+        It is a value between 0 and `world_size-1`, which is
+        passed automatically by `mp.spawn()` in :func:`main`.
+        The node with rank 0 is responsible for saving checkpoint.
+      world_size:
+        Number of GPUs for DDP training.
+      args:
+        The return value of get_parser().parse_args()
+    """
+    params = get_params()
+    params.update(vars(args))
+
+    fix_random_seed(params.seed)
+    if world_size > 1:
+        setup_dist(rank, world_size, params.master_port)
+
+    setup_logger(f"{params.exp_dir}/log/log-train")
+    logging.info("Training started")
+
+    if args.tensorboard and rank == 0:
+        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
+    else:
+        tb_writer = None
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", rank)
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bbpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_model(params)
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    assert params.save_every_n >= params.average_period
+    model_avg: Optional[nn.Module] = None
+    if rank == 0:
+        # model_avg is only used with rank 0
+        model_avg = copy.deepcopy(model).to(torch.float64)
+
+    assert params.start_epoch > 0, params.start_epoch
+    checkpoints = load_checkpoint_if_available(
+        params=params, model=model, model_avg=model_avg
+    )
+
+    model.to(device)
+    if world_size > 1:
+        logging.info("Using DDP")
+        model = DDP(model, device_ids=[rank], find_unused_parameters=True)
+
+    optimizer = ScaledAdam(
+        get_parameter_groups_with_lrs(model, lr=params.base_lr, include_names=True),
+        lr=params.base_lr,  # should have no effect
+        clipping_scale=2.0,
+    )
+
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
+
+    if checkpoints and "optimizer" in checkpoints:
+        logging.info("Loading optimizer state dict")
+        optimizer.load_state_dict(checkpoints["optimizer"])
+
+    if (
+        checkpoints
+        and "scheduler" in checkpoints
+        and checkpoints["scheduler"] is not None
+    ):
+        logging.info("Loading scheduler state dict")
+        scheduler.load_state_dict(checkpoints["scheduler"])
+
+    if params.print_diagnostics:
+        opts = diagnostics.TensorDiagnosticOptions(
+            512
+        )  # NOTE(review): original note said "allow 4 megabytes per sub-module", but 512 is passed - confirm
+        diagnostic = diagnostics.attach_diagnostics(model, opts)
+
+    if params.inf_check:
+        register_inf_check_hooks(model)
+
+    aishell = AishellAsrDataModule(args)
+
+    train_cuts = aishell.train_cuts()
+    valid_cuts = aishell.valid_cuts()
+
+    def remove_short_and_long_utt(c: Cut):
+        # Keep only utterances with duration between 1 second and 15 seconds
+        #
+        # Caution: There is a reason to select 15.0 here. Please see
+        # ../local/display_manifest_statistics.py
+        #
+        # You should use ../local/display_manifest_statistics.py to get
+        # an utterance duration distribution for your dataset to select
+        # the threshold
+        if c.duration < 1.0 or c.duration > 15.0:
+            # logging.warning(
+            #    f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
+            # )
+            return False
+
+        # In pruned RNN-T, we require that T >= S
+        # where T is the number of feature frames after subsampling
+        # and S is the number of tokens in the utterance
+
+        # In ./zipformer.py, the conv module uses the following expression
+        # for subsampling
+        T = ((c.num_frames - 7) // 2 + 1) // 2
+        tokens = sp.encode(c.supervisions[0].text, out_type=str)
+
+        if T < len(tokens):
+            logging.warning(
+                f"Exclude cut with ID {c.id} from training. "
+                f"Number of frames (before subsampling): {c.num_frames}. "
+                f"Number of frames (after subsampling): {T}. "
+                f"Text: {c.supervisions[0].text}. "
+                f"Tokens: {tokens}. "
+                f"Number of tokens: {len(tokens)}"
+            )
+            return False
+
+        return True
+
+    def tokenize_and_encode_text(c: Cut):
+        # Apply tokenize_by_CJK_char, then byte_encode, to the first supervision's text
+        text = c.supervisions[0].text
+        text = byte_encode(tokenize_by_CJK_char(text))
+        c.supervisions[0].text = text
+        return c
+
+    train_cuts = train_cuts.filter(remove_short_and_long_utt)  # valid_cuts are not filtered
+
+    train_cuts = train_cuts.map(tokenize_and_encode_text)
+
+    valid_cuts = valid_cuts.map(tokenize_and_encode_text)
+
+    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
+        # We only load the sampler's state dict when it loads a checkpoint
+        # saved in the middle of an epoch
+        sampler_state_dict = checkpoints["sampler"]
+    else:
+        sampler_state_dict = None
+
+    train_dl = aishell.train_dataloaders(
+        train_cuts, sampler_state_dict=sampler_state_dict
+    )
+
+    valid_dl = aishell.valid_dataloaders(valid_cuts)
+
+    if False and not params.print_diagnostics:  # the hard-coded False disables the OOM pre-scan
+        scan_pessimistic_batches_for_oom(
+            model=model,
+            train_dl=train_dl,
+            optimizer=optimizer,
+            sp=sp,
+            params=params,
+        )
+
+    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
+    if checkpoints and "grad_scaler" in checkpoints:
+        logging.info("Loading grad scaler state dict")
+        scaler.load_state_dict(checkpoints["grad_scaler"])
+
+    for epoch in range(params.start_epoch, params.num_epochs + 1):
+        scheduler.step_epoch(epoch - 1)
+        fix_random_seed(params.seed + epoch - 1)
+        train_dl.sampler.set_epoch(epoch - 1)
+
+        if tb_writer is not None:
+            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
+
+        params.cur_epoch = epoch
+
+        train_one_epoch(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sp=sp,
+            train_dl=train_dl,
+            valid_dl=valid_dl,
+            scaler=scaler,
+            tb_writer=tb_writer,
+            world_size=world_size,
+            rank=rank,
+        )
+
+        if params.print_diagnostics:
+            diagnostic.print_diagnostics()
+            break
+
+        save_checkpoint(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sampler=train_dl.sampler,
+            scaler=scaler,
+            rank=rank,
+        )
+
+    logging.info("Done!")
+
+    if world_size > 1:
+        torch.distributed.barrier()
+        cleanup_dist()
+
+
+def display_and_save_batch(
+    batch: dict,
+    params: AttributeDict,
+    sp: spm.SentencePieceProcessor,
+) -> None:
+    """Dump a batch to `params.exp_dir` and log its basic statistics.
+
+    The feature shape and the total number of tokens are logged.
+
+    Args:
+      batch:
+        The batch to inspect; it must provide "inputs" and "supervisions".
+      params:
+        Training parameters; only `exp_dir` is read here.
+      sp:
+        The sentence piece model used to encode the transcripts.
+    """
+    from lhotse.utils import uuid4
+
+    filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
+    logging.info(f"Saving batch to {filename}")
+    torch.save(batch, filename)
+
+    sups = batch["supervisions"]
+    features = batch["inputs"]
+    logging.info(f"features shape: {features.shape}")
+
+    token_ids = sp.encode(sups["text"], out_type=int)
+    num_tokens = sum(map(len, token_ids))
+    logging.info(f"num tokens: {num_tokens}")
+
+
+def scan_pessimistic_batches_for_oom(
+    model: Union[nn.Module, DDP],
+    train_dl: torch.utils.data.DataLoader,
+    optimizer: torch.optim.Optimizer,
+    sp: spm.SentencePieceProcessor,
+    params: AttributeDict,
+):
+    """Run forward/backward on the batches picked by `find_pessimistic_batches`.
+
+    A batch that raises is saved with :func:`display_and_save_batch` and the
+    exception is re-raised; out-of-memory errors get an extra hint in the log.
+    """
+    from lhotse.dataset import find_pessimistic_batches
+
+    msg = "Sanity check -- see if any of the batches in epoch 1 would cause OOM."
+    logging.info(msg)
+    picked, crit_values = find_pessimistic_batches(train_dl.sampler)
+    for criterion, cuts in picked.items():
+        batch = train_dl.dataset[cuts]
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, _ = compute_loss(
+                    params=params, model=model, sp=sp, batch=batch, is_training=True
+                )
+            loss.backward()
+            optimizer.zero_grad()
+        except Exception as err:
+            if "CUDA out of memory" in str(err):
+                logging.error(
+                    "Your GPU ran out of memory with the current "
+                    "max_duration setting. We recommend decreasing "
+                    "max_duration and trying again.\n"
+                    f"Failing criterion: {criterion} "
+                    f"(={crit_values[criterion]}) ..."
+                )
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise
+        logging.info(
+            f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
+        )
+
+
+def main():
+    """Parse the command line and start training in one or several processes."""
+    arg_parser = get_parser()
+    AishellAsrDataModule.add_arguments(arg_parser)
+    args = arg_parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+    num_procs = args.world_size
+    assert num_procs >= 1
+    if num_procs <= 1:
+        run(rank=0, world_size=1, args=args)
+    else:
+        mp.spawn(run, args=(num_procs, args), nprocs=num_procs, join=True)
+
+
+torch.set_num_threads(1)  # one intra-op CPU thread per process
+torch.set_num_interop_threads(1)  # one inter-op CPU thread per process
+
+if __name__ == "__main__":
+    main()
--- a/egs/aishell2/ASR/local/compute_fbank_aishell2.py
+++ b/egs/aishell2/ASR/local/compute_fbank_aishell2.py
@ -29,7 +29,14 @@ import os
 from pathlib import Path

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor, str2bool
@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)


-def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_aishell2(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())

    dataset_parts = (
        "train",
@ -68,8 +77,12 @@ def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
        list(manifests.keys()),
        dataset_parts,
    )
-
-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
@ -82,7 +95,7 @@ def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
                supervisions=m["supervisions"],
            )
            if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
@ -111,7 +124,12 @@ def get_args():
        default=False,
        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
    )
-
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use WhisperFbank instead of Fbank. Default: False.",
+    )
    return parser.parse_args()


@ -122,5 +140,7 @@ if __name__ == "__main__":

    args = get_args()
    compute_fbank_aishell2(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
    )
--- a/egs/aishell2/ASR/prepare.sh
+++ b/egs/aishell2/ASR/prepare.sh
@ -108,6 +108,16 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  fi
 fi

+whisper_mel_bins=80
+if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
+  log "Stage 30: Compute whisper fbank for aishell2"
+  if [ ! -f data/fbank/.aishell2.whisper.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_aishell2.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.aishell2.whisper.done
+  fi
+fi
+
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
  if [ ! -f data/fbank/.msuan.done ]; then
--- a/egs/aishell2/ASR/pruned_transducer_stateless5/asr_datamodule.py
+++ b/egs/aishell2/ASR/pruned_transducer_stateless5/asr_datamodule.py
@ -296,6 +296,8 @@ class AiShell2AsrDataModule:
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
+                buffer_size=self.args.num_buckets * 2000,
+                shuffle_buffer_size=self.args.num_buckets * 5000,
                drop_last=self.args.drop_last,
            )
        else:
--- a/egs/aishell2/ASR/pruned_transducer_stateless5/export.py
+++ b/egs/aishell2/ASR/pruned_transducer_stateless5/export.py
@ -22,7 +22,7 @@
 Usage:
 ./pruned_transducer_stateless5/export.py \
  --exp-dir ./pruned_transducer_stateless5/exp \
-  --lang-dir data/lang_char
+  --tokens ./data/lang_char/tokens.txt \
  --epoch 25 \
  --avg 5

@ -48,6 +48,7 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
 from train import add_model_arguments, get_params, get_transducer_model

@ -57,8 +58,7 @@ from icefall.checkpoint import (
    find_checkpoints,
    load_checkpoint,
 )
-from icefall.lexicon import Lexicon
-from icefall.utils import str2bool
+from icefall.utils import num_tokens, str2bool


 def get_parser():
@ -115,10 +115,10 @@ def get_parser():
    )

    parser.add_argument(
-        "--lang-dir",
+        "--tokens",
        type=str,
-        default="data/lang_char",
-        help="The lang dir",
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
    )

    parser.add_argument(
@ -154,10 +154,10 @@ def main():

    logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-    params.blank_id = lexicon.token_table["<blk>"]
-    params.unk_id = lexicon.token_table["<unk>"]
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.unk_id = token_table["<unk>"]
+    params.vocab_size = num_tokens(token_table) + 1

    logging.info(params)

--- a/egs/aishell4/ASR/local/compute_fbank_aishell4.py
+++ b/egs/aishell4/ASR/local/compute_fbank_aishell4.py
@ -29,7 +29,14 @@ import os
 from pathlib import Path

 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor, str2bool
@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)


-def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_aishell4(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
    src_dir = Path("data/manifests/aishell4")
    output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())

    dataset_parts = (
        "train_S",
@ -70,7 +79,12 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
        dataset_parts,
    )

-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
@ -84,7 +98,7 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
                supervisions=m["supervisions"],
            )
            if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
@ -95,7 +109,7 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
            )

            logging.info("About splitting cuts into smaller chunks")
@ -121,7 +135,12 @@ def get_args():
        default=False,
        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
    )
-
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use WhisperFbank instead of Fbank. Default: False.",
+    )
    return parser.parse_args()


@ -132,5 +151,7 @@ if __name__ == "__main__":

    args = get_args()
    compute_fbank_aishell4(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
    )
--- a/egs/aishell4/ASR/prepare.sh
+++ b/egs/aishell4/ASR/prepare.sh
@ -6,7 +6,7 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail

 stage=-1
-stop_stage=100
+stop_stage=7
 perturb_speed=true


@ -76,11 +76,21 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Process aishell4"
+  log "Stage 2: Compute fbank for aishell4"
  if [ ! -f data/fbank/aishell4/.fbank.done ]; then
-    mkdir -p data/fbank/aishell4
+    mkdir -p data/fbank
    ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed}
-    touch data/fbank/aishell4/.fbank.done
+    touch data/fbank/.fbank.done
+  fi
+fi
+
+whisper_mel_bins=80
+if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then
+  log "Stage 20: Compute whisper fbank for aishell4"
+  if [ ! -f data/fbank/.aishell4.whisper.done ]; then  # dedicated marker: check and touch the same file
+    mkdir -p data/fbank
+    ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.aishell4.whisper.done
  fi
 fi

@ -106,16 +116,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute fbank for aishell4"
-  if [ ! -f data/fbank/.aishell4.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed}
-    touch data/fbank/.aishell4.done
-  fi
-fi
-
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare char based lang"
+  log "Stage 5: Prepare char based lang"
  lang_char_dir=data/lang_char
  mkdir -p $lang_char_dir

--- a/egs/aishell4/ASR/pruned_transducer_stateless5/asr_datamodule.py
+++ b/egs/aishell4/ASR/pruned_transducer_stateless5/asr_datamodule.py
@ -306,7 +306,8 @@ class Aishell4AsrDataModule:
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
-                buffer_size=100000,
+                buffer_size=self.args.num_buckets * 2000,
+                shuffle_buffer_size=self.args.num_buckets * 5000,
                drop_last=self.args.drop_last,
            )
        else:
--- a/egs/aishell4/ASR/pruned_transducer_stateless5/export.py
+++ b/egs/aishell4/ASR/pruned_transducer_stateless5/export.py
@ -48,6 +48,7 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
 from train import add_model_arguments, get_params, get_transducer_model

@ -57,8 +58,7 @@ from icefall.checkpoint import (
    find_checkpoints,
    load_checkpoint,
 )
-from icefall.lexicon import Lexicon
-from icefall.utils import str2bool
+from icefall.utils import num_tokens, str2bool


 def get_parser():
@ -115,13 +115,10 @@ def get_parser():
    )

    parser.add_argument(
-        "--lang-dir",
+        "--tokens",
        type=str,
-        default="data/lang_char",
-        help="""The lang dir
-        It contains language related input files such as
-        "lexicon.txt"
-        """,
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
    )

    parser.add_argument(
@ -157,9 +154,9 @@ def main():

    logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-    params.blank_id = lexicon.token_table["<blk>"]
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1

    logging.info(params)

--- a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
+++ b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
@ -29,7 +29,14 @@ import os
 from pathlib import Path

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor, str2bool
@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)


-def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_alimeeting(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
    src_dir = Path("data/manifests/alimeeting")
    output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())

    dataset_parts = (
        "train",
@ -53,7 +62,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
        "test",
    )

-    prefix = "alimeeting"
+    prefix = "alimeeting-far"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
@ -70,7 +79,12 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
        dataset_parts,
    )

-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
@ -83,7 +97,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
                supervisions=m["supervisions"],
            )
            if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
@ -121,7 +135,12 @@ def get_args():
        default=False,
        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
    )
-
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use the Whisper Fbank feature extractor. Default: False.",
+    )
    return parser.parse_args()


@ -132,5 +151,7 @@ if __name__ == "__main__":

    args = get_args()
    compute_fbank_alimeeting(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
    )
--- a/egs/alimeeting/ASR/prepare.sh
+++ b/egs/alimeeting/ASR/prepare.sh
@ -6,7 +6,7 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail

 stage=-1
-stop_stage=100
+stop_stage=7
 perturb_speed=true

 # We assume dl_dir (download dir) contains the following
@ -15,7 +15,7 @@ perturb_speed=true
 #
 #  - $dl_dir/alimeeting
 #     This directory contains the following files downloaded from
-#       https://openslr.org/62/
+#       https://openslr.org/119/
 #
 #     - Train_Ali_far.tar.gz
 #     - Train_Ali_near.tar.gz
@ -66,10 +66,21 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Process alimeeting"
-  if [ ! -f data/fbank/alimeeting/.fbank.done ]; then
-    mkdir -p data/fbank/alimeeting
+  log "Stage 2: compute fbank for alimeeting"
+  if [ ! -f data/fbank/.fbank.done ]; then
+    mkdir -p data/fbank
    ./local/compute_fbank_alimeeting.py --perturb-speed ${perturb_speed}
+    touch data/fbank/.fbank.done
+  fi
+fi
+
+whisper_mel_bins=80
+if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then
+  log "Stage 20: compute whisper fbank for alimeeting"
+  if [ ! -f data/fbank/.fbank.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_alimeeting.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.fbank.done
  fi
 fi

@ -95,16 +106,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute fbank for alimeeting"
-  if [ ! -f data/fbank/.alimeeting.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_fbank_alimeeting.py --perturb-speed True
-    touch data/fbank/.alimeeting.done
-  fi
-fi
-
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare char based lang"
+  log "Stage 5: Prepare char based lang"
  lang_char_dir=data/lang_char
  mkdir -p $lang_char_dir

--- a/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py
@ -288,7 +288,8 @@ class AlimeetingAsrDataModule:
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
-                buffer_size=30000,
+                buffer_size=self.args.num_buckets * 2000,
+                shuffle_buffer_size=self.args.num_buckets * 5000,
                drop_last=True,
            )
        else:
--- a/egs/alimeeting/ASR/pruned_transducer_stateless2/export.py
+++ b/egs/alimeeting/ASR/pruned_transducer_stateless2/export.py
@ -20,7 +20,7 @@
 Usage:
 ./pruned_transducer_stateless2/export.py \
  --exp-dir ./pruned_transducer_stateless2/exp \
-  --lang-dir data/lang_char \
+  --tokens ./data/lang_char/tokens.txt \
  --epoch 29 \
  --avg 18

@ -45,12 +45,12 @@ import argparse
 import logging
 from pathlib import Path

+import k2
 import torch
 from train import get_params, get_transducer_model

 from icefall.checkpoint import average_checkpoints, load_checkpoint
-from icefall.lexicon import Lexicon
-from icefall.utils import str2bool
+from icefall.utils import num_tokens, str2bool


 def get_parser():
@ -85,10 +85,10 @@ def get_parser():
    )

    parser.add_argument(
-        "--lang-dir",
+        "--tokens",
        type=str,
-        default="data/lang_char",
-        help="The lang dir",
+        default="data/lang_char/tokens.txt",
+        help="Path to the tokens.txt",
    )

    parser.add_argument(
@ -122,10 +122,9 @@ def main():

    logging.info(f"device: {device}")

-    lexicon = Lexicon(params.lang_dir)
-
-    params.blank_id = 0
-    params.vocab_size = max(lexicon.tokens) + 1
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1

    logging.info(params)

--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`../../../librispeech/ASR/lstm_transducer_stateless2/lstmp.py`
				`@ -0,0 +1 @@`
				`../../../librispeech/ASR/pruned_transducer_stateless3/scaling_converter.py`
				`@ -0,0 +1 @@`
				`../../../librispeech/ASR/conformer_ctc/label_smoothing.py`