Merge branch 'k2-fsa:master' into dev/tts/vctk/tokenizer

2025-08-26 18:24:18 +00:00 · 2024-03-18 09:29:13 +08:00 · 2024-03-18 09:29:13 +08:00 · 7ea100a26a
commit 7ea100a26a
parent e69b60e579 2dfd5dbf8b
207 changed files with 23079 additions and 366 deletions
--- a/.github/scripts/.gitignore
+++ b/.github/scripts/.gitignore
@ -0,0 +1 @@
+piper_phonemize.html
--- a/.github/scripts/generate-piper-phonemize-page.py
+++ b/.github/scripts/generate-piper-phonemize-page.py
@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+
+def main():
+    prefix = (
+        "https://github.com/csukuangfj/piper-phonemize/releases/download/2023.12.5/"
+    )
+    files = [
+        "piper_phonemize-1.2.0-cp310-cp310-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.2.0-cp311-cp311-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.2.0-cp312-cp312-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.2.0-cp37-cp37m-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.2.0-cp38-cp38-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.2.0-cp39-cp39-macosx_10_14_x86_64.whl",
+        "piper_phonemize-1.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+    ]
+    with open("piper_phonemize.html", "w") as f:
+        for file in files:
+            url = prefix + file
+            f.write(f'<a href="{url}">{file}</a><br/>\n')
+
+
+if __name__ == "__main__":
+    main()
--- a/.github/scripts/librispeech/ASR/run.sh
+++ b/.github/scripts/librispeech/ASR/run.sh
@ -15,9 +15,9 @@ function prepare_data() {
  # cause OOM error for CI later.
  mkdir -p download/lm
  pushd download/lm
-  wget -q http://www.openslr.org/resources/11/librispeech-vocab.txt
-  wget -q http://www.openslr.org/resources/11/librispeech-lexicon.txt
-  wget -q http://www.openslr.org/resources/11/librispeech-lm-norm.txt.gz
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-lm-norm.txt.gz
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-lexicon.txt
+  wget -q https://huggingface.co/csukuangfj/librispeech-for-ci/resolve/main/librispeech-vocab.txt
  ls -lh
  gunzip librispeech-lm-norm.txt.gz

--- a/.github/scripts/ljspeech/TTS/run.sh
+++ b/.github/scripts/ljspeech/TTS/run.sh
@ -0,0 +1,157 @@
+#!/usr/bin/env bash
+
+set -ex
+
+python3 -m pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
+python3 -m pip install espnet_tts_frontend
+python3 -m pip install numba
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/ljspeech/TTS
+
+sed -i.bak s/600/8/g ./prepare.sh
+sed -i.bak s/"first 100"/"first 3"/g ./prepare.sh
+sed -i.bak s/500/5/g ./prepare.sh
+git diff
+
+function prepare_data() {
+  # We have created a subset of the data for testing
+  #
+  mkdir download
+  pushd download
+  wget -q https://huggingface.co/csukuangfj/ljspeech-subset-for-ci-test/resolve/main/LJSpeech-1.1.tar.bz2
+  tar xvf LJSpeech-1.1.tar.bz2
+  popd
+
+  ./prepare.sh
+  tree .
+}
+
+function train() {
+  pushd ./vits
+  sed -i.bak s/200/3/g ./train.py
+  git diff .
+  popd
+
+  for t in low medium high; do
+    ./vits/train.py \
+      --exp-dir vits/exp-$t \
+      --model-type $t \
+      --num-epochs 1 \
+      --save-every-n 1 \
+      --num-buckets 2 \
+      --tokens data/tokens.txt \
+      --max-duration 20
+
+    ls -lh vits/exp-$t
+  done
+}
+
+function infer() {
+  for t in low medium high; do
+    ./vits/infer.py \
+      --num-buckets 2 \
+      --model-type $t \
+      --epoch 1 \
+      --exp-dir ./vits/exp-$t \
+      --tokens data/tokens.txt \
+      --max-duration 20
+  done
+}
+
+function export_onnx() {
+  for t in low medium high; do
+    ./vits/export-onnx.py \
+      --model-type $t \
+      --epoch 1 \
+      --exp-dir ./vits/exp-$t \
+      --tokens data/tokens.txt
+
+    ls -lh vits/exp-$t/
+  done
+}
+
+function test_medium() {
+  git clone https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-medium-2024-03-12
+
+  ./vits/export-onnx.py \
+    --model-type medium \
+    --epoch 820 \
+    --exp-dir ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp \
+    --tokens ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt
+
+  ls -lh ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp
+
+  ./vits/test_onnx.py \
+    --model-filename ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp/vits-epoch-820.onnx \
+    --tokens ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt \
+    --output-filename /icefall/test-medium.wav
+
+  ls -lh /icefall/test-medium.wav
+
+  d=/icefall/vits-icefall-en_US-ljspeech-medium
+  mkdir $d
+  cp -v ./icefall-tts-ljspeech-vits-medium-2024-03-12/data/tokens.txt $d/
+  cp -v ./icefall-tts-ljspeech-vits-medium-2024-03-12/exp/vits-epoch-820.onnx $d/model.onnx
+
+  rm -rf icefall-tts-ljspeech-vits-medium-2024-03-12
+
+  pushd $d
+  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+  tar xf espeak-ng-data.tar.bz2
+  rm espeak-ng-data.tar.bz2
+  cd ..
+  tar cjf vits-icefall-en_US-ljspeech-medium.tar.bz2 vits-icefall-en_US-ljspeech-medium
+  rm -rf vits-icefall-en_US-ljspeech-medium
+  ls -lh *.tar.bz2
+  popd
+}
+
+function test_low() {
+  git clone https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-low-2024-03-12
+
+  ./vits/export-onnx.py \
+    --model-type low \
+    --epoch 1600 \
+    --exp-dir ./icefall-tts-ljspeech-vits-low-2024-03-12/exp \
+    --tokens ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt
+
+  ls -lh ./icefall-tts-ljspeech-vits-low-2024-03-12/exp
+
+  ./vits/test_onnx.py \
+    --model-filename ./icefall-tts-ljspeech-vits-low-2024-03-12/exp/vits-epoch-1600.onnx \
+    --tokens ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt \
+    --output-filename /icefall/test-low.wav
+
+  ls -lh /icefall/test-low.wav
+
+  d=/icefall/vits-icefall-en_US-ljspeech-low
+  mkdir $d
+  cp -v ./icefall-tts-ljspeech-vits-low-2024-03-12/data/tokens.txt $d/
+  cp -v ./icefall-tts-ljspeech-vits-low-2024-03-12/exp/vits-epoch-1600.onnx $d/model.onnx
+
+  rm -rf icefall-tts-ljspeech-vits-low-2024-03-12
+
+  pushd $d
+  wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+  tar xf espeak-ng-data.tar.bz2
+  rm espeak-ng-data.tar.bz2
+  cd ..
+  tar cjf vits-icefall-en_US-ljspeech-low.tar.bz2 vits-icefall-en_US-ljspeech-low
+  rm -rf vits-icefall-en_US-ljspeech-low
+  ls -lh *.tar.bz2
+  popd
+}
+
+prepare_data
+train
+infer
+export_onnx
+rm -rf vits/exp-{low,medium,high}
+test_medium
+test_low
--- a/.github/workflows/build-doc.yml
+++ b/.github/workflows/build-doc.yml
@ -56,11 +56,14 @@ jobs:
      - name: Build doc
        shell: bash
        run: |
+          .github/scripts/generate-piper-phonemize-page.py
          cd docs
          python3 -m pip install -r ./requirements.txt
          make html
          touch build/html/.nojekyll

+          cp -v ../piper_phonemize.html ./build/html/
+
      - name: Deploy
        uses: peaceiris/actions-gh-pages@v3
        with:
--- a/.github/workflows/ljspeech.yml
+++ b/.github/workflows/ljspeech.yml
@ -0,0 +1,102 @@
+name: ljspeech
+
+on:
+  push:
+    branches:
+      - master
+
+  pull_request:
+    branches:
+      - master
+
+  workflow_dispatch:
+
+concurrency:
+  group: ljspeech-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  generate_build_matrix:
+    if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # outputting for debugging purposes
+          python ./.github/scripts/docker/generate_build_matrix.py
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+          echo "::set-output name=matrix::${MATRIX}"
+
+  ljspeech:
+    needs: generate_build_matrix
+    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Free space
+        shell: bash
+        run: |
+          ls -lh
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+          echo "pwd: $PWD"
+          echo "github.workspace ${{ github.workspace }}"
+
+      - name: Run tests
+        uses: addnab/docker-run-action@v3
+        with:
+            image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
+            options: |
+              --volume ${{ github.workspace }}/:/icefall
+            shell: bash
+            run: |
+              export PYTHONPATH=/icefall:$PYTHONPATH
+              cd /icefall
+              git config --global --add safe.directory /icefall
+
+              .github/scripts/ljspeech/TTS/run.sh
+
+      - name: display files
+        shell: bash
+        run: |
+          ls -lh
+
+      - uses: actions/upload-artifact@v4
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
+        with:
+          name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
+          path: ./*.wav
+
+      - uses: actions/upload-artifact@v4
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
+        with:
+          name: generated-models-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}
+          path: ./*.wav
+
+      - name: Release exported onnx models
+        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          overwrite: true
+          file: vits-icefall-*.tar.bz2
+          repo_name: k2-fsa/sherpa-onnx
+          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
+          tag: tts-models
+
--- a/docs/source/recipes/Finetune/adapter/finetune_adapter.rst
+++ b/docs/source/recipes/Finetune/adapter/finetune_adapter.rst
@ -0,0 +1,225 @@
+Finetune from a pre-trained Zipformer model with adapters
+=========================================================
+
+This tutorial shows you how to fine-tune a pre-trained **Zipformer**
+transducer model on a new dataset with adapters. 
+Adapters are compact and efficient module that can be integrated into a pre-trained model
+to improve the model's performance on a new domain. Adapters are injected
+between different modules in the well-trained neural network. During training, only the parameters
+in the adapters will be updated. It achieves competitive performance
+while requiring much less GPU memory than full fine-tuning. For more details about adapters,
+please refer to the original `paper <https://arxiv.org/pdf/1902.00751.pdf#/>`_ for more details.
+
+.. HINT::
+
+  We assume you have read the page :ref:`install icefall` and have setup
+  the environment for ``icefall``.
+
+.. HINT::
+
+  We recommend you to use a GPU or several GPUs to run this recipe
+
+For illustration purpose, we fine-tune the Zipformer transducer model
+pre-trained on `LibriSpeech`_ on the small subset of `GigaSpeech`_. You could use your
+own data for fine-tuning if you create a manifest for your new dataset.
+
+Data preparation
+----------------
+
+Please follow the instructions in the `GigaSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/gigaspeech/ASR>`_
+to prepare the fine-tune data used in this tutorial. We only require the small subset in GigaSpeech for this tutorial.
+
+
+Model preparation
+-----------------
+
+We are using the Zipformer model trained on full LibriSpeech (960 hours) as the intialization. The
+checkpoint of the model can be downloaded via the following command:
+
+.. code-block:: bash
+
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+    $ cd icefall-asr-librispeech-zipformer-2023-05-15/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt
+    $ cd ../data/lang_bpe_500
+    $ git lfs pull --include bpe.model
+    $ cd ../../..
+
+Before fine-tuning, let's test the model's WER on the new domain. The following command performs
+decoding on the GigaSpeech test sets:
+
+.. code-block:: bash
+
+    ./zipformer/decode_gigaspeech.py \
+        --epoch 99 \
+        --avg 1 \
+        --exp-dir icefall-asr-librispeech-zipformer-2023-05-15/exp \
+        --use-averaged-model 0 \
+        --max-duration 1000 \
+        --decoding-method greedy_search
+
+You should see the following numbers:
+
+.. code-block::
+
+    For dev, WER of different settings are:
+    greedy_search	20.06	best for dev
+
+    For test, WER of different settings are:
+    greedy_search	19.27	best for test
+
+
+Fine-tune with adapter
+----------------------
+
+We insert 4 adapters with residual connection in each ``Zipformer2EncoderLayer``. 
+The original model parameters remain untouched during training and only the parameters of
+the adapters are updated. The following command starts a fine-tuning experiment with adapters:
+
+.. code-block:: bash
+    
+    $ do_finetune=1
+    $ use_adapters=1
+    $ adapter_dim=8
+
+    $ ./zipformer_adapter/train.py \
+        --world-size 2 \
+        --num-epochs 20 \
+        --start-epoch 1 \
+        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
+        --use-fp16 1 \
+        --base-lr 0.045 \
+        --use-adapters $use_adapters --adapter-dim $adapter_dim \
+        --bpe-model data/lang_bpe_500/bpe.model \
+        --do-finetune $do_finetune \
+        --master-port 13022 \
+        --finetune-ckpt icefall-asr-librispeech-zipformer-2023-05-15/exp/pretrained.pt \
+        --max-duration 1000
+
+The following arguments are related to fine-tuning:
+
+- ``--do-finetune``
+    If True, do fine-tuning by initializing the model from a pre-trained checkpoint.
+    **Note that if you want to resume your fine-tuning experiment from certain epochs, you
+    need to set this to False.**
+
+- ``use-adapters``
+    If adapters are used during fine-tuning.
+
+- ``--adapter-dim``
+    The bottleneck dimension of the adapter module. Typically a small number.
+
+You should notice that in the training log, the total number of trainale parameters is shown:
+
+.. code-block::
+
+    2024-02-22 21:22:03,808 INFO [train.py:1277] A total of 761344 trainable parameters (1.148% of the whole model)
+    
+The trainable parameters only makes up 1.15% of the entire model parameters, so the training will be much faster
+and requires less memory than full fine-tuning.
+
+
+Decoding
+--------
+
+After training, let's test the WERs. To test the WERs on the GigaSpeech set,
+you can execute the following command:
+
+.. code-block:: bash
+
+    $ epoch=20
+    $ avg=10
+    $ use_adapters=1
+    $ adapter_dim=8
+    
+    % ./zipformer/decode.py \
+        --epoch $epoch \
+        --avg $avg \
+        --use-averaged-model 1 \
+        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
+        --max-duration 600 \
+        --use-adapters $use_adapters \
+        --adapter-dim $adapter_dim \
+        --decoding-method greedy_search
+
+You should see the following numbers:
+
+.. code-block::
+
+    For dev, WER of different settings are:
+    greedy_search	15.44	best for dev
+
+    For test, WER of different settings are:
+    greedy_search	15.42	best for test
+
+
+The WER on test set is improved from 19.27 to 15.42, demonstrating the effectiveness of adapters.
+
+The same model can be used to perform decoding on LibriSpeech test sets. You can deactivate the adapters
+to keep the same performance of the original model:
+
+.. code-block:: bash
+
+    $ epoch=20
+    $ avg=1
+    $ use_adapters=0
+    $ adapter_dim=8
+    
+    % ./zipformer/decode.py \
+        --epoch $epoch \
+        --avg $avg \
+        --use-averaged-model 1 \
+        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
+        --max-duration 600 \
+        --use-adapters $use_adapters \
+        --adapter-dim $adapter_dim \
+        --decoding-method greedy_search
+
+
+.. code-block::
+
+    For dev, WER of different settings are:
+    greedy_search	2.23	best for test-clean
+
+    For test, WER of different settings are:
+    greedy_search	4.96	best for test-other
+
+The numbers are the same as reported in `icefall <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md#normal-scaled-model-number-of-model-parameters-65549011-ie-6555-m>`_. So adapter-based
+fine-tuning is also very flexible as the same model can be used for decoding on the original and target domain.
+
+
+Export the model
+----------------
+
+After training, the model can be exported to ``onnx`` format easily using the following command:
+
+.. code-block:: bash
+
+    $ use_adapters=1
+    $ adapter_dim=16
+
+    $ ./zipformer_adapter/export-onnx.py \
+        --tokens icefall-asr-librispeech-zipformer-2023-05-15/data/lang_bpe_500/tokens.txt \
+        --use-averaged-model 1 \
+        --epoch 20 \
+        --avg 10 \
+        --exp-dir zipformer_adapter/exp_giga_finetune_adapters${use_adapters}_adapter_dim${adapter_dim} \
+        --use-adapters $use_adapters \
+        --adapter-dim $adapter_dim \
+        --num-encoder-layers "2,2,3,4,3,2" \
+        --downsampling-factor "1,2,4,8,4,2" \
+        --feedforward-dim "512,768,1024,1536,1024,768" \
+        --num-heads "4,4,4,8,4,4" \
+        --encoder-dim "192,256,384,512,384,256" \
+        --query-head-dim 32 \
+        --value-head-dim 12 \
+        --pos-head-dim 4 \
+        --pos-dim 48 \
+        --encoder-unmasked-dim "192,192,256,256,256,192" \
+        --cnn-module-kernel "31,31,15,15,15,31" \
+        --decoder-dim 512 \
+        --joiner-dim 512 \
+        --causal False \
+        --chunk-size "16,32,64,-1" \
+        --left-context-frames "64,128,256,-1"
--- a/docs/source/recipes/Finetune/index.rst
+++ b/docs/source/recipes/Finetune/index.rst
@ -13,3 +13,4 @@ data to improve the performance on new domains.
   :caption: Table of Contents

   from_supervised/finetune_zipformer
+   adapter/finetune_adapter
--- a/docs/source/recipes/TTS/ljspeech/vits.rst
+++ b/docs/source/recipes/TTS/ljspeech/vits.rst
@ -13,6 +13,14 @@ with the `LJSpeech <https://keithito.com/LJ-Speech-Dataset/>`_ dataset.
   The VITS paper: `Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech <https://arxiv.org/pdf/2106.06103.pdf>`_


+Install extra dependencies
+--------------------------
+
+.. code-block:: bash
+
+  pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
+  pip install numba espnet_tts_frontend
+
 Data preparation
 ----------------

@ -56,7 +64,8 @@ Training
      --start-epoch 1 \
      --use-fp16 1 \
      --exp-dir vits/exp \
-      --tokens data/tokens.txt
+      --tokens data/tokens.txt \
+      --model-type high \
      --max-duration 500

 .. note::
@ -64,6 +73,11 @@ Training
    You can adjust the hyper-parameters to control the size of the VITS model and
    the training configurations. For more details, please run ``./vits/train.py --help``.

+.. warning::
+
+   If you want a model that runs faster on CPU, please use ``--model-type low``
+   or ``--model-type medium``.
+
 .. note::

    The training can take a long time (usually a couple of days).
@ -95,8 +109,8 @@ training part first. It will save the ground-truth and generated wavs to the dir
 Export models
 -------------

-Currently we only support ONNX model exporting. It will generate two files in the given ``exp-dir``:
-``vits-epoch-*.onnx`` and ``vits-epoch-*.int8.onnx``.
+Currently we only support ONNX model exporting. It will generate one file in the given ``exp-dir``:
+``vits-epoch-*.onnx``.

 .. code-block:: bash

@ -120,4 +134,68 @@ Download pretrained models
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following link:

-  - `<https://huggingface.co/Zengwei/icefall-tts-ljspeech-vits-2024-02-28>`_
+  - ``--model-type=high``: `<https://huggingface.co/Zengwei/icefall-tts-ljspeech-vits-2024-02-28>`_
+  - ``--model-type=medium``: `<https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-medium-2024-03-12>`_
+  - ``--model-type=low``: `<https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-low-2024-03-12>`_
+
+Usage in sherpa-onnx
+--------------------
+
+The following describes how to test the exported ONNX model in `sherpa-onnx`_.
+
+.. hint::
+
+   `sherpa-onnx`_ supports different programming languages, e.g., C++, C, Python,
+   Kotlin, Java, Swift, Go, C#, etc. It also supports Android and iOS.
+
+   We only describe how to use pre-built binaries from `sherpa-onnx`_ below.
+   Please refer to `<https://k2-fsa.github.io/sherpa/onnx/>`_
+   for more documentation.
+
+Install sherpa-onnx
+^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+   pip install sherpa-onnx
+
+To check that you have installed `sherpa-onnx`_ successfully, please run:
+
+.. code-block:: bash
+
+   which sherpa-onnx-offline-tts
+   sherpa-onnx-offline-tts --help
+
+Download lexicon files
+^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+   cd /tmp
+   wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/espeak-ng-data.tar.bz2
+   tar xf espeak-ng-data.tar.bz2
+
+Run sherpa-onnx
+^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+  cd egs/ljspeech/TTS
+
+  sherpa-onnx-offline-tts \
+    --vits-model=vits/exp/vits-epoch-1000.onnx \
+    --vits-tokens=data/tokens.txt \
+    --vits-data-dir=/tmp/espeak-ng-data \
+    --num-threads=1 \
+    --output-filename=./high.wav \
+    "Ask not what your country can do for you; ask what you can do for your country."
+
+.. hint::
+
+   You can also use ``sherpa-onnx-offline-tts-play`` to play the audio
+   as it is generating.
+
+You should get a file ``high.wav`` after running the above command.
+
+Congratulations! You have successfully trained and exported a text-to-speech
+model and run it with `sherpa-onnx`_.
--- a/egs/aishell/ASR/README.md
+++ b/egs/aishell/ASR/README.md
@ -19,7 +19,9 @@ The following table lists the differences among them.
 | `transducer_stateless_modified`    | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer`                     |
 | `transducer_stateless_modified-2`  | Conformer | Embedding + Conv1d | with modified transducer from `optimized_transducer` + extra data      |
 | `pruned_transducer_stateless3`     | Conformer (reworked) | Embedding + Conv1d | pruned RNN-T + reworked model with random combiner + using aidatatang_20zh as extra data|
-| `pruned_transducer_stateless7`     | Zipformer | Embedding | pruned RNN-T + zipformer encoder + stateless decoder with context-size 1 |
+| `pruned_transducer_stateless7`     | Zipformer | Embedding | pruned RNN-T + zipformer encoder + stateless decoder with context-size set to 1 |
+| `zipformer`                           | Upgraded Zipformer | Embedding + Conv1d | The latest recipe with context-size set to 1 |
+

 The decoder in `transducer_stateless` is modified from the paper
 [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
--- a/egs/aishell/ASR/prepare.sh
+++ b/egs/aishell/ASR/prepare.sh
@ -360,7 +360,7 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
 fi

 if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
-  log "Stage 11: Train RNN LM model"
+  log "Stage 12: Train RNN LM model"
  python ../../../icefall/rnn_lm/train.py \
    --start-epoch 0 \
    --world-size 1 \
--- a/egs/aishell/ASR/pruned_transducer_stateless7/do_not_use_it_directly.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7/do_not_use_it_directly.py
@ -89,6 +89,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.lexicon import Lexicon
 from icefall.utils import (
    AttributeDict,
@ -881,9 +882,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error()
        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
            cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
--- a/egs/aishell/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7/train.py
@ -85,6 +85,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import (
@ -878,9 +879,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)
        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
            cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
--- a/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/train.py
@ -78,6 +78,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -871,9 +872,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/decode.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/decode.py
@ -250,7 +250,7 @@ def get_parser():
    parser.add_argument(
        "--context-size",
        type=int,
-        default=1,
+        default=2,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )
    parser.add_argument(
--- a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py
@ -78,6 +78,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@ -882,9 +883,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/train.py
@ -78,6 +78,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@ -881,9 +882,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/aishell/ASR/zipformer/train.py
+++ b/egs/aishell/ASR/zipformer/train.py
@ -86,6 +86,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import (
@ -985,9 +986,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/aishell/ASR/zipformer/train_bbpe.py
+++ b/egs/aishell/ASR/zipformer/train_bbpe.py
@ -83,6 +83,7 @@ from icefall.checkpoint import (
    update_averaged_model,
 )
 from icefall.dist import cleanup_dist, setup_dist
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -570,9 +571,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/aishell2/ASR/local/compute_fbank_aishell2.py
+++ b/egs/aishell2/ASR/local/compute_fbank_aishell2.py
@ -29,7 +29,14 @@ import os
 from pathlib import Path

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor, str2bool
@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)


-def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_aishell2(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())

    dataset_parts = (
        "train",
@ -68,7 +77,11 @@ def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
        list(manifests.keys()),
        dataset_parts,
    )
-
+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
@ -82,7 +95,7 @@ def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
                supervisions=m["supervisions"],
            )
            if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
@ -111,7 +124,12 @@ def get_args():
        default=False,
        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
    )
-
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use WhisperFbank instead of Fbank. Default: False.",
+    )
    return parser.parse_args()


@ -122,5 +140,7 @@ if __name__ == "__main__":

    args = get_args()
    compute_fbank_aishell2(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
    )
--- a/egs/aishell2/ASR/prepare.sh
+++ b/egs/aishell2/ASR/prepare.sh
@ -108,6 +108,16 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  fi
 fi

+whisper_mel_bins=80
+if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
+  log "Stage 30: Compute whisper fbank for aishell2"
+  if [ ! -f data/fbank/.aishell2.whisper.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_aishell2.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.aishell2.whisper.done
+  fi
+fi
+
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
  if [ ! -f data/fbank/.msuan.done ]; then
--- a/egs/aishell4/ASR/local/compute_fbank_aishell4.py
+++ b/egs/aishell4/ASR/local/compute_fbank_aishell4.py
@ -29,7 +29,14 @@ import os
 from pathlib import Path

 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor, str2bool
@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)


-def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_aishell4(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
    src_dir = Path("data/manifests/aishell4")
    output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())

    dataset_parts = (
        "train_S",
@ -70,6 +79,11 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
        dataset_parts,
    )

+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
@ -84,7 +98,7 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
                supervisions=m["supervisions"],
            )
            if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
@ -95,7 +109,7 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
            )

            logging.info("About splitting cuts into smaller chunks")
@ -121,7 +135,12 @@ def get_args():
        default=False,
        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
    )
-
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use WhisperFbank instead of Fbank. Default: False.",
+    )
    return parser.parse_args()


@ -132,5 +151,7 @@ if __name__ == "__main__":

    args = get_args()
    compute_fbank_aishell4(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
    )
--- a/egs/aishell4/ASR/prepare.sh
+++ b/egs/aishell4/ASR/prepare.sh
@ -6,7 +6,7 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail

 stage=-1
-stop_stage=100
+stop_stage=7
 perturb_speed=true


@ -76,11 +76,21 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Process aishell4"
+  log "Stage 2: Compute fbank for aishell4"
  if [ ! -f data/fbank/aishell4/.fbank.done ]; then
-    mkdir -p data/fbank/aishell4
+    mkdir -p data/fbank
    ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed}
-    touch data/fbank/aishell4/.fbank.done
+    touch data/fbank/.fbank.done
+  fi
+fi
+
+whisper_mel_bins=80
+if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then
+  log "Stage 20: Compute whisper fbank for aishell4"
+  if [ ! -f data/fbank/aishell4/.fbank.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.fbank.done
  fi
 fi

@ -106,16 +116,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute fbank for aishell4"
-  if [ ! -f data/fbank/.aishell4.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed}
-    touch data/fbank/.aishell4.done
-  fi
-fi
-
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare char based lang"
+  log "Stage 5: Prepare char based lang"
  lang_char_dir=data/lang_char
  mkdir -p $lang_char_dir

--- a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
+++ b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
@ -29,7 +29,14 @@ import os
 from pathlib import Path

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor, str2bool
@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)


-def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_alimeeting(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
    src_dir = Path("data/manifests/alimeeting")
    output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())

    dataset_parts = (
        "train",
@ -53,7 +62,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
        "test",
    )

-    prefix = "alimeeting"
+    prefix = "alimeeting-far"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
@ -70,6 +79,11 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
        dataset_parts,
    )

+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
@ -83,7 +97,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
                supervisions=m["supervisions"],
            )
            if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
@ -121,7 +135,12 @@ def get_args():
        default=False,
        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
    )
-
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use the Whisper Fbank feature extractor. Default: False.",
+    )
    return parser.parse_args()


@ -132,5 +151,7 @@ if __name__ == "__main__":

    args = get_args()
    compute_fbank_alimeeting(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
    )
--- a/egs/alimeeting/ASR/prepare.sh
+++ b/egs/alimeeting/ASR/prepare.sh
@ -6,7 +6,7 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail

 stage=-1
-stop_stage=100
+stop_stage=7
 perturb_speed=true

 # We assume dl_dir (download dir) contains the following
@ -15,7 +15,7 @@ perturb_speed=true
 #
 #  - $dl_dir/alimeeting
 #     This directory contains the following files downloaded from
-#       https://openslr.org/62/
+#       https://openslr.org/119/
 #
 #     - Train_Ali_far.tar.gz
 #     - Train_Ali_near.tar.gz
@ -66,10 +66,21 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Process alimeeting"
-  if [ ! -f data/fbank/alimeeting/.fbank.done ]; then
-    mkdir -p data/fbank/alimeeting
+  log "Stage 2: compute fbank for alimeeting"
+  if [ ! -f data/fbank/.fbank.done ]; then
+    mkdir -p data/fbank
    ./local/compute_fbank_alimeeting.py --perturb-speed ${perturb_speed}
+    touch data/fbank/.fbank.done
+  fi
+fi
+
+whisper_mel_bins=80
+if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then
+  log "Stage 20: compute whisper fbank for alimeeting"
+  if [ ! -f data/fbank/.fbank.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_alimeeting.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.fbank.done
  fi
 fi

@ -95,16 +106,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute fbank for alimeeting"
-  if [ ! -f data/fbank/.alimeeting.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_fbank_alimeeting.py --perturb-speed True
-    touch data/fbank/.alimeeting.done
-  fi
-fi
-
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare char based lang"
+  log "Stage 5: Prepare char based lang"
  lang_char_dir=data/lang_char
  mkdir -p $lang_char_dir

--- a/egs/alimeeting/ASR_v2/prepare.sh
+++ b/egs/alimeeting/ASR_v2/prepare.sh
@ -12,7 +12,7 @@ use_gss=true  # Use GSS-based enhancement with MDM setting
 #
 #  - $dl_dir/alimeeting
 #     This directory contains the following files downloaded from
-#       https://openslr.org/62/
+#       https://openslr.org/119/
 #
 #     - Train_Ali_far.tar.gz
 #     - Train_Ali_near.tar.gz
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/train.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/train.py
@ -70,6 +70,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@ -851,9 +852,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/ami/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/ami/ASR/pruned_transducer_stateless7/train.py
@ -69,6 +69,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@ -842,9 +843,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/ami/SURT/dprnn_zipformer/train.py
+++ b/egs/ami/SURT/dprnn_zipformer/train.py
@ -75,6 +75,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -1138,9 +1139,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/ami/SURT/dprnn_zipformer/train_adapt.py
+++ b/egs/ami/SURT/dprnn_zipformer/train_adapt.py
@ -75,6 +75,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -1129,9 +1130,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/commonvoice/ASR/local/compile_hlg.py
+++ b/egs/commonvoice/ASR/local/compile_hlg.py
@ -1 +0,0 @@
-../../../librispeech/ASR/local/compile_hlg.py
--- a/egs/commonvoice/ASR/local/compile_hlg.py
+++ b/egs/commonvoice/ASR/local/compile_hlg.py
@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+# Copyright    2021-2024  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                         Zengrui Jin,)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This script takes as input lang_dir and generates HLG from
+
+    - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
+    - L, the lexicon, built from lang_dir/L_disambig.pt
+
+        Caution: We use a lexicon that contains disambiguation symbols
+
+    - G, the LM, built from data/lm/G_n_gram.fst.txt
+
+The generated HLG is saved in $lang_dir/HLG.pt
+"""
+import argparse
+import logging
+from pathlib import Path
+
+import k2
+import torch
+
+from icefall.lexicon import Lexicon
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lm",
+        type=str,
+        default="G_3_gram",
+        help="""Stem name for LM used in HLG compiling.
+        """,
+    )
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def compile_HLG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
+    """
+    Args:
+      lang_dir:
+        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
+      lm:
+        The language stem base name.
+
+    Return:
+      An FSA representing HLG.
+    """
+    lexicon = Lexicon(lang_dir)
+    max_token_id = max(lexicon.tokens)
+    logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
+    H = k2.ctc_topo(max_token_id)
+    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
+
+    if Path(f"{lang_dir}/lm/{lm}.pt").is_file():
+        logging.info(f"Loading pre-compiled {lm}")
+        d = torch.load(f"{lang_dir}/lm/{lm}.pt")
+        G = k2.Fsa.from_dict(d)
+    else:
+        logging.info(f"Loading {lm}.fst.txt")
+        with open(f"{lang_dir}/lm/{lm}.fst.txt") as f:
+            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
+            torch.save(G.as_dict(), f"{lang_dir}/lm/{lm}.pt")
+
+    first_token_disambig_id = lexicon.token_table["#0"]
+    first_word_disambig_id = lexicon.word_table["#0"]
+
+    L = k2.arc_sort(L)
+    G = k2.arc_sort(G)
+
+    logging.info("Intersecting L and G")
+    LG = k2.compose(L, G)
+    logging.info(f"LG shape: {LG.shape}")
+
+    logging.info("Connecting LG")
+    LG = k2.connect(LG)
+    logging.info(f"LG shape after k2.connect: {LG.shape}")
+
+    logging.info(type(LG.aux_labels))
+    logging.info("Determinizing LG")
+
+    LG = k2.determinize(LG)
+    logging.info(type(LG.aux_labels))
+
+    logging.info("Connecting LG after k2.determinize")
+    LG = k2.connect(LG)
+
+    logging.info("Removing disambiguation symbols on LG")
+
+    # LG.labels[LG.labels >= first_token_disambig_id] = 0
+    # see https://github.com/k2-fsa/k2/pull/1140
+    labels = LG.labels
+    labels[labels >= first_token_disambig_id] = 0
+    LG.labels = labels
+
+    assert isinstance(LG.aux_labels, k2.RaggedTensor)
+    LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0
+
+    LG = k2.remove_epsilon(LG)
+    logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")
+
+    LG = k2.connect(LG)
+    LG.aux_labels = LG.aux_labels.remove_values_eq(0)
+
+    logging.info("Arc sorting LG")
+    LG = k2.arc_sort(LG)
+
+    logging.info("Composing H and LG")
+    # CAUTION: The name of the inner_labels is fixed
+    # to `tokens`. If you want to change it, please
+    # also change other places in icefall that are using
+    # it.
+    HLG = k2.compose(H, LG, inner_labels="tokens")
+
+    logging.info("Connecting LG")
+    HLG = k2.connect(HLG)
+
+    logging.info("Arc sorting LG")
+    HLG = k2.arc_sort(HLG)
+    logging.info(f"HLG.shape: {HLG.shape}")
+
+    return HLG
+
+
+def main():
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
+
+    if (lang_dir / "HLG.pt").is_file():
+        logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
+        return
+
+    logging.info(f"Processing {lang_dir}")
+
+    HLG = compile_HLG(lang_dir, args.lm)
+    logging.info(f"Saving HLG.pt to {lang_dir}")
+    torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt")
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    main()
--- a/egs/commonvoice/ASR/local/compile_lg.py
+++ b/egs/commonvoice/ASR/local/compile_lg.py
@ -1 +0,0 @@
-../../../librispeech/ASR/local/compile_lg.py
--- a/egs/commonvoice/ASR/local/compile_lg.py
+++ b/egs/commonvoice/ASR/local/compile_lg.py
@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+# Copyright    2021-2024  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                            Kang Wei,
+#                                                         Zengrui Jin,)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This script takes as input lang_dir and generates LG from
+
+    - L, the lexicon, built from lang_dir/L_disambig.pt
+
+        Caution: We use a lexicon that contains disambiguation symbols
+
+    - G, the LM, built from lang_dir/lm/G_3_gram.fst.txt
+
+The generated LG is saved in $lang_dir/LG.pt
+"""
+import argparse
+import logging
+from pathlib import Path
+
+import k2
+import torch
+
+from icefall.lexicon import Lexicon
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        """,
+    )
+    parser.add_argument(
+        "--lm",
+        type=str,
+        default="G_3_gram",
+        help="""Stem name for LM used in HLG compiling.
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def compile_LG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
+    """
+    Args:
+      lang_dir:
+        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
+
+    Return:
+      An FSA representing LG.
+    """
+    lexicon = Lexicon(lang_dir)
+    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
+
+    if Path(f"{lang_dir}/lm/{lm}.pt").is_file():
+        logging.info(f"Loading pre-compiled {lm}")
+        d = torch.load(f"{lang_dir}/lm/{lm}.pt")
+        G = k2.Fsa.from_dict(d)
+    else:
+        logging.info(f"Loading {lm}.fst.txt")
+        with open(f"{lang_dir}/lm/{lm}.fst.txt") as f:
+            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
+            torch.save(G.as_dict(), f"{lang_dir}/lm/{lm}.pt")
+
+    first_token_disambig_id = lexicon.token_table["#0"]
+    first_word_disambig_id = lexicon.word_table["#0"]
+
+    L = k2.arc_sort(L)
+    G = k2.arc_sort(G)
+
+    logging.info("Intersecting L and G")
+    LG = k2.compose(L, G)
+    logging.info(f"LG shape: {LG.shape}")
+
+    logging.info("Connecting LG")
+    LG = k2.connect(LG)
+    logging.info(f"LG shape after k2.connect: {LG.shape}")
+
+    logging.info(type(LG.aux_labels))
+    logging.info("Determinizing LG")
+
+    LG = k2.determinize(LG, k2.DeterminizeWeightPushingType.kLogWeightPushing)
+    logging.info(type(LG.aux_labels))
+
+    logging.info("Connecting LG after k2.determinize")
+    LG = k2.connect(LG)
+
+    logging.info("Removing disambiguation symbols on LG")
+
+    # LG.labels[LG.labels >= first_token_disambig_id] = 0
+    # see https://github.com/k2-fsa/k2/pull/1140
+    labels = LG.labels
+    labels[labels >= first_token_disambig_id] = 0
+    LG.labels = labels
+
+    assert isinstance(LG.aux_labels, k2.RaggedTensor)
+    LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0
+
+    LG = k2.remove_epsilon(LG)
+    logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")
+
+    LG = k2.connect(LG)
+    LG.aux_labels = LG.aux_labels.remove_values_eq(0)
+
+    logging.info("Arc sorting LG")
+    LG = k2.arc_sort(LG)
+
+    return LG
+
+
+def main():
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
+
+    if (lang_dir / "LG.pt").is_file():
+        logging.info(f"{lang_dir}/LG.pt already exists - skipping")
+        return
+
+    logging.info(f"Processing {lang_dir}")
+
+    LG = compile_LG(lang_dir, args.lm)
+    logging.info(f"Saving LG.pt to {lang_dir}")
+    torch.save(LG.as_dict(), f"{lang_dir}/LG.pt")
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    main()
--- a/egs/commonvoice/ASR/local/preprocess_commonvoice.py
+++ b/egs/commonvoice/ASR/local/preprocess_commonvoice.py
@ -48,8 +48,27 @@ def normalize_text(utt: str, language: str) -> str:
    utt = re.sub("’", "'", utt)
    if language == "en":
        return re.sub(r"[^a-zA-Z\s]", "", utt).upper()
-    if language == "fr":
+    elif language == "fr":
        return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper()
+    elif language == "pl":
+        return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper()
+    elif language == "yue":
+        return (
+            utt.replace(" ", "")
+            .replace("，", "")
+            .replace("。", " ")
+            .replace("？", "")
+            .replace("！", "")
+            .replace("?", "")
+        )
+    else:
+        raise NotImplementedError(
+            f"""
+            Text normalization not implemented for language: {language},
+            please consider implementing it in the local/preprocess_commonvoice.py
+            or raise an issue on GitHub to request it.
+            """
+        )


 def preprocess_commonvoice(
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7/asr_datamodule.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7/asr_datamodule.py
@ -381,9 +381,11 @@ class CommonVoiceAsrDataModule:
    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
-            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            input_strategy=(
+                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
                if self.args.on_the_fly_feats
-            else eval(self.args.input_strategy)(),
+                else eval(self.args.input_strategy)()
+            ),
            return_cuts=self.args.return_cuts,
        )
        sampler = DynamicBucketingSampler(
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7/train.py
@ -79,6 +79,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -871,9 +872,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/commonvoice_fr.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/commonvoice_fr.py
@ -31,7 +31,7 @@ from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
    DynamicBucketingSampler,
    K2SpeechRecognitionDataset,
    PrecomputedFeatures,
-    SingleCutSampler,
+    SimpleCutSampler,
    SpecAugment,
 )
 from lhotse.dataset.input_strategies import (  # noqa F401 For AudioSamples
@ -232,7 +232,7 @@ class CommonVoiceAsrDataModule:
            logging.info("About to get Musan cuts")
            cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
            transforms.append(
-                CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
+                CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True)
            )
        else:
            logging.info("Disable MUSAN")
@ -315,8 +315,8 @@ class CommonVoiceAsrDataModule:
                drop_last=self.args.drop_last,
            )
        else:
-            logging.info("Using SingleCutSampler.")
-            train_sampler = SingleCutSampler(
+            logging.info("Using SimpleCutSampler.")
+            train_sampler = SimpleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
@ -383,9 +383,11 @@ class CommonVoiceAsrDataModule:
    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
-            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            input_strategy=(
+                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
                if self.args.on_the_fly_feats
-            else eval(self.args.input_strategy)(),
+                else eval(self.args.input_strategy)()
+            ),
            return_cuts=self.args.return_cuts,
        )
        sampler = DynamicBucketingSampler(
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py
@ -889,9 +889,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise RuntimeError(f", exiting: {cur_grad_scale}")

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/finetune.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/finetune.py
@ -81,6 +81,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -965,9 +966,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/train.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/train.py
@ -78,6 +78,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@ -888,9 +889,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/csj/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py
+++ b/egs/csj/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py
@ -81,6 +81,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@ -909,9 +910,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/csj/ASR/pruned_transducer_stateless7_streaming/train.py
+++ b/egs/csj/ASR/pruned_transducer_stateless7_streaming/train.py
@ -81,6 +81,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@ -908,9 +909,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/gigaspeech/ASR/zipformer/train.py
+++ b/egs/gigaspeech/ASR/zipformer/train.py
@ -89,6 +89,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -1031,9 +1032,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/gigaspeech/KWS/zipformer/finetune.py
+++ b/egs/gigaspeech/KWS/zipformer/finetune.py
@ -100,6 +100,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -371,9 +372,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/gigaspeech/KWS/zipformer/train.py
+++ b/egs/gigaspeech/KWS/zipformer/train.py
@ -89,6 +89,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -1034,9 +1035,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/libricss/SURT/dprnn_zipformer/train.py
+++ b/egs/libricss/SURT/dprnn_zipformer/train.py
@ -85,6 +85,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -1169,9 +1170,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/libricss/SURT/dprnn_zipformer/train_adapt.py
+++ b/egs/libricss/SURT/dprnn_zipformer/train_adapt.py
@ -81,6 +81,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -1056,9 +1057,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/libriheavy/ASR/zipformer/asr_datamodule.py
+++ b/egs/libriheavy/ASR/zipformer/asr_datamodule.py
@ -232,7 +232,7 @@ class LibriHeavyAsrDataModule:
            logging.info("About to get Musan cuts")
            cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
            transforms.append(
-                CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
+                CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True)
            )
        else:
            logging.info("Disable MUSAN")
--- a/egs/libriheavy/ASR/zipformer/train.py
+++ b/egs/libriheavy/ASR/zipformer/train.py
@ -93,6 +93,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -1036,9 +1037,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/libriheavy/ASR/zipformer_prompt_asr/asr_datamodule.py
+++ b/egs/libriheavy/ASR/zipformer_prompt_asr/asr_datamodule.py
@ -425,9 +425,11 @@ class LibriHeavyAsrDataModule:
    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
-            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            input_strategy=(
+                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
                if self.args.on_the_fly_feats
-            else PrecomputedFeatures(),
+                else PrecomputedFeatures()
+            ),
            return_cuts=self.args.return_cuts,
        )
        sampler = DynamicBucketingSampler(
--- a/egs/libriheavy/ASR/zipformer_prompt_asr/train_baseline.py
+++ b/egs/libriheavy/ASR/zipformer_prompt_asr/train_baseline.py
@ -103,6 +103,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -1051,9 +1052,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/libriheavy/ASR/zipformer_prompt_asr/train_bert_encoder.py
+++ b/egs/libriheavy/ASR/zipformer_prompt_asr/train_bert_encoder.py
@ -117,6 +117,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -855,9 +856,9 @@ def get_joiner_model(params: AttributeDict) -> nn.Module:
        decoder_dim=params.decoder_dim,
        joiner_dim=params.joiner_dim,
        vocab_size=params.vocab_size,
-        context_dim=4 * 768
-        if params.context_injection
-        else -1,  # the output dim of text encoder
+        context_dim=(
+            4 * 768 if params.context_injection else -1
+        ),  # the output dim of text encoder
        context_injection=params.context_injection,
    )
    return joiner
@ -1398,9 +1399,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/librispeech/ASR/README.md
+++ b/egs/librispeech/ASR/README.md
@ -36,6 +36,7 @@ The following table lists the differences among them.
 | `lstm_transducer_stateless3`          | LSTM | Embedding + Conv1d | Using LSTM with mechanisms in reworked model + gradient filter + delay penalty |
 | `zipformer`                           | Upgraded Zipformer | Embedding + Conv1d | The latest recipe |
 | `zipformer_adapter`                           | Upgraded Zipformer | Embedding + Conv1d | It supports domain adaptation of Zipformer using parameter efficient adapters |
+| `zipformer_adapter`                           | Upgraded Zipformer | Embedding + Conv1d | Finetune Zipformer with LoRA  |

 The decoder in `transducer_stateless` is modified from the paper
 [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py
@ -80,6 +80,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -976,9 +977,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
@ -81,6 +81,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -878,9 +879,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/train.py
@ -81,6 +81,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -902,9 +903,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/train.py
@ -77,6 +77,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -891,9 +892,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/do_not_use_it_directly.py
@ -80,6 +80,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@ -880,9 +881,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/train.py
@ -80,6 +80,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@ -879,9 +880,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming_multi/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming_multi/train.py
@ -84,6 +84,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@ -946,9 +947,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/librispeech/ASR/pruned_transducer_stateless8/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless8/train.py
@ -89,6 +89,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@ -946,9 +947,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
@ -479,18 +479,14 @@ class LibriSpeechAsrDataModule:
    @lru_cache()
    def gigaspeech_subset_small_cuts(self) -> CutSet:
        logging.info("About to get Gigaspeech subset-S cuts")
-        return load_manifest_lazy(self.args.manifest_dir / "gigaspeech_cuts_S.jsonl.gz")
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_S.jsonl.gz")

    @lru_cache()
    def gigaspeech_dev_cuts(self) -> CutSet:
        logging.info("About to get Gigaspeech dev cuts")
-        return load_manifest_lazy(
-            self.args.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz"
-        )
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz")

    @lru_cache()
    def gigaspeech_test_cuts(self) -> CutSet:
        logging.info("About to get Gigaspeech test cuts")
-        return load_manifest_lazy(
-            self.args.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz"
-        )
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
--- a/egs/librispeech/ASR/tiny_transducer_ctc/train.py
+++ b/egs/librispeech/ASR/tiny_transducer_ctc/train.py
@ -66,6 +66,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import UniqLexicon
 from icefall.utils import (
@ -883,9 +884,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/librispeech/ASR/zipformer/finetune.py
+++ b/egs/librispeech/ASR/zipformer/finetune.py
@ -92,6 +92,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -1122,9 +1123,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/librispeech/ASR/zipformer/train.py
+++ b/egs/librispeech/ASR/zipformer/train.py
@ -90,6 +90,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -1021,9 +1022,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/librispeech/ASR/zipformer_adapter/train.py
+++ b/egs/librispeech/ASR/zipformer_adapter/train.py
@ -81,6 +81,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
@ -1125,9 +1126,7 @@ def train_one_epoch(
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
                save_bad_model()
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = max(scheduler.get_last_lr())
--- a/egs/librispeech/ASR/zipformer_ctc/train.py
+++ b/egs/librispeech/ASR/zipformer_ctc/train.py
@ -62,6 +62,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@ -797,9 +798,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/librispeech/ASR/zipformer_lora/asr_datamodule.py
+++ b/egs/librispeech/ASR/zipformer_lora/asr_datamodule.py
@ -0,0 +1 @@
+../tdnn_lstm_ctc/asr_datamodule.py
--- a/egs/librispeech/ASR/zipformer_lora/beam_search.py
+++ b/egs/librispeech/ASR/zipformer_lora/beam_search.py
@ -0,0 +1 @@
+../pruned_transducer_stateless2/beam_search.py
--- a/egs/librispeech/ASR/zipformer_lora/decode_gigaspeech.py
+++ b/egs/librispeech/ASR/zipformer_lora/decode_gigaspeech.py
--- a/egs/librispeech/ASR/zipformer_lora/decoder.py
+++ b/egs/librispeech/ASR/zipformer_lora/decoder.py
@ -0,0 +1 @@
+../zipformer/decoder.py
--- a/egs/librispeech/ASR/zipformer_lora/encoder_interface.py
+++ b/egs/librispeech/ASR/zipformer_lora/encoder_interface.py
@ -0,0 +1 @@
+../transducer_stateless/encoder_interface.py
--- a/egs/librispeech/ASR/zipformer_lora/export.py
+++ b/egs/librispeech/ASR/zipformer_lora/export.py
@ -0,0 +1,543 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang,
+#                                                 Zengwei Yao,
+#                                                 Wei Kang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script converts several saved checkpoints
+# to a single one using model averaging.
+"""
+
+Usage:
+
+Note: This is a example for librispeech dataset, if you are using different
+dataset, you should change the argument values according to your dataset.
+
+(1) Export to torchscript model using torch.jit.script()
+
+- For non-streaming model:
+
+./zipformer_lora/export.py \
+  --exp-dir ./zipformer_lora/exp \
+  --tokens data/lang_bpe_500/tokens.txt \
+  --epoch 30 \
+  --avg 9 \
+  --jit 1
+
+It will generate a file `jit_script.pt` in the given `exp_dir`. You can later
+load it by `torch.jit.load("jit_script.pt")`.
+
+Check ./jit_pretrained.py for its usage.
+
+Check https://github.com/k2-fsa/sherpa
+for how to use the exported models outside of icefall.
+
+- For streaming model:
+
+./zipformer_lora/export.py \
+  --exp-dir ./zipformer_lora/exp \
+  --causal 1 \
+  --chunk-size 16 \
+  --left-context-frames 128 \
+  --tokens data/lang_bpe_500/tokens.txt \
+  --epoch 30 \
+  --avg 9 \
+  --jit 1
+
+It will generate a file `jit_script_chunk_16_left_128.pt` in the given `exp_dir`.
+You can later load it by `torch.jit.load("jit_script_chunk_16_left_128.pt")`.
+
+Check ./jit_pretrained_streaming.py for its usage.
+
+Check https://github.com/k2-fsa/sherpa
+for how to use the exported models outside of icefall.
+
+(2) Export `model.state_dict()`
+
+- For non-streaming model:
+
+./zipformer_lora/export.py \
+  --exp-dir ./zipformer_lora/exp \
+  --tokens data/lang_bpe_500/tokens.txt \
+  --epoch 30 \
+  --avg 9
+
+- For streaming model:
+
+./zipformer_lora/export.py \
+  --exp-dir ./zipformer_lora/exp \
+  --causal 1 \
+  --tokens data/lang_bpe_500/tokens.txt \
+  --epoch 30 \
+  --avg 9
+
+It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
+load it by `icefall.checkpoint.load_checkpoint()`.
+
+- For non-streaming model:
+
+To use the generated file with `zipformer_lora/decode.py`,
+you can do:
+
+    cd /path/to/exp_dir
+    ln -s pretrained.pt epoch-9999.pt
+
+    cd /path/to/egs/librispeech/ASR
+    ./zipformer_lora/decode.py \
+        --exp-dir ./zipformer_lora/exp \
+        --epoch 9999 \
+        --avg 1 \
+        --max-duration 600 \
+        --decoding-method greedy_search \
+        --bpe-model data/lang_bpe_500/bpe.model
+
+- For streaming model:
+
+To use the generated file with `zipformer_lora/decode.py` and `zipformer_lora/streaming_decode.py`, you can do:
+
+    cd /path/to/exp_dir
+    ln -s pretrained.pt epoch-9999.pt
+
+    cd /path/to/egs/librispeech/ASR
+
+    # simulated streaming decoding
+    ./zipformer_lora/decode.py \
+        --exp-dir ./zipformer_lora/exp \
+        --epoch 9999 \
+        --avg 1 \
+        --max-duration 600 \
+        --causal 1 \
+        --chunk-size 16 \
+        --left-context-frames 128 \
+        --decoding-method greedy_search \
+        --bpe-model data/lang_bpe_500/bpe.model
+
+    # chunk-wise streaming decoding
+    ./zipformer_lora/streaming_decode.py \
+        --exp-dir ./zipformer_lora/exp \
+        --epoch 9999 \
+        --avg 1 \
+        --max-duration 600 \
+        --causal 1 \
+        --chunk-size 16 \
+        --left-context-frames 128 \
+        --decoding-method greedy_search \
+        --bpe-model data/lang_bpe_500/bpe.model
+
+Check ./pretrained.py for its usage.
+
+Note: If you don't want to train a model from scratch, we have
+provided one for you. You can get it at
+
+- non-streaming model:
+https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+
+- streaming model:
+https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17
+
+with the following commands:
+
+    sudo apt-get install git-lfs
+    git lfs install
+    git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+    git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17
+    # You will find the pre-trained models in exp dir
+"""
+
+import argparse
+import logging
+from pathlib import Path
+from typing import List, Tuple
+
+import k2
+import torch
+from finetune import add_finetune_arguments, add_model_arguments, get_model, get_params
+from scaling_converter import convert_scaled_to_non_scaled
+from torch import Tensor, nn
+
+from icefall.checkpoint import (
+    average_checkpoints,
+    average_checkpoints_with_averaged_model,
+    find_checkpoints,
+    load_checkpoint,
+)
+from icefall.utils import make_pad_mask, num_tokens, str2bool
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=30,
+        help="""It specifies the checkpoint to use for decoding.
+        Note: Epoch counts from 1.
+        You can specify --avg to use more checkpoints for model averaging.""",
+    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=9,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch' and '--iter'",
+    )
+
+    parser.add_argument(
+        "--use-averaged-model",
+        type=str2bool,
+        default=True,
+        help="Whether to load averaged model. Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="zipformer_lora/exp",
+        help="""It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        default="data/lang_bpe_500/tokens.txt",
+        help="Path to the tokens.txt",
+    )
+
+    parser.add_argument(
+        "--jit",
+        type=str2bool,
+        default=False,
+        help="""True to save a model after applying torch.jit.script.
+        It will generate a file named jit_script.pt.
+        Check ./jit_pretrained.py for how to use it.
+        """,
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+
+    add_model_arguments(parser)
+    add_finetune_arguments(parser)
+
+    return parser
+
+
+class EncoderModel(nn.Module):
+    """A wrapper for encoder and encoder_embed"""
+
+    def __init__(self, encoder: nn.Module, encoder_embed: nn.Module) -> None:
+        super().__init__()
+        self.encoder = encoder
+        self.encoder_embed = encoder_embed
+
+    def forward(
+        self, features: Tensor, feature_lengths: Tensor
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Args:
+            features: (N, T, C)
+            feature_lengths: (N,)
+        """
+        x, x_lens = self.encoder_embed(features, feature_lengths)
+
+        src_key_padding_mask = make_pad_mask(x_lens)
+        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
+
+        encoder_out, encoder_out_lens = self.encoder(x, x_lens, src_key_padding_mask)
+        encoder_out = encoder_out.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
+
+        return encoder_out, encoder_out_lens
+
+
+class StreamingEncoderModel(nn.Module):
+    """A wrapper for encoder and encoder_embed"""
+
+    def __init__(self, encoder: nn.Module, encoder_embed: nn.Module) -> None:
+        super().__init__()
+        assert len(encoder.chunk_size) == 1, encoder.chunk_size
+        assert len(encoder.left_context_frames) == 1, encoder.left_context_frames
+        self.chunk_size = encoder.chunk_size[0]
+        self.left_context_len = encoder.left_context_frames[0]
+
+        # The encoder_embed subsample features (T - 7) // 2
+        # The ConvNeXt module needs (7 - 1) // 2 = 3 frames of right padding after subsampling
+        self.pad_length = 7 + 2 * 3
+
+        self.encoder = encoder
+        self.encoder_embed = encoder_embed
+
+    def forward(
+        self, features: Tensor, feature_lengths: Tensor, states: List[Tensor]
+    ) -> Tuple[Tensor, Tensor, List[Tensor]]:
+        """Streaming forward for encoder_embed and encoder.
+
+        Args:
+            features: (N, T, C)
+            feature_lengths: (N,)
+            states: a list of Tensors
+
+        Returns encoder outputs, output lengths, and updated states.
+        """
+        chunk_size = self.chunk_size
+        left_context_len = self.left_context_len
+
+        cached_embed_left_pad = states[-2]
+        x, x_lens, new_cached_embed_left_pad = self.encoder_embed.streaming_forward(
+            x=features,
+            x_lens=feature_lengths,
+            cached_left_pad=cached_embed_left_pad,
+        )
+        assert x.size(1) == chunk_size, (x.size(1), chunk_size)
+
+        src_key_padding_mask = make_pad_mask(x_lens)
+
+        # processed_mask is used to mask out initial states
+        processed_mask = torch.arange(left_context_len, device=x.device).expand(
+            x.size(0), left_context_len
+        )
+        processed_lens = states[-1]  # (batch,)
+        # (batch, left_context_size)
+        processed_mask = (processed_lens.unsqueeze(1) <= processed_mask).flip(1)
+        # Update processed lengths
+        new_processed_lens = processed_lens + x_lens
+
+        # (batch, left_context_size + chunk_size)
+        src_key_padding_mask = torch.cat([processed_mask, src_key_padding_mask], dim=1)
+
+        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
+        encoder_states = states[:-2]
+
+        (
+            encoder_out,
+            encoder_out_lens,
+            new_encoder_states,
+        ) = self.encoder.streaming_forward(
+            x=x,
+            x_lens=x_lens,
+            states=encoder_states,
+            src_key_padding_mask=src_key_padding_mask,
+        )
+        encoder_out = encoder_out.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
+
+        new_states = new_encoder_states + [
+            new_cached_embed_left_pad,
+            new_processed_lens,
+        ]
+        return encoder_out, encoder_out_lens, new_states
+
+    @torch.jit.export
+    def get_init_states(
+        self,
+        batch_size: int = 1,
+        device: torch.device = torch.device("cpu"),
+    ) -> List[torch.Tensor]:
+        """
+        Returns a list of cached tensors of all encoder layers. For layer-i, states[i*6:(i+1)*6]
+        is (cached_key, cached_nonlin_attn, cached_val1, cached_val2, cached_conv1, cached_conv2).
+        states[-2] is the cached left padding for ConvNeXt module,
+        of shape (batch_size, num_channels, left_pad, num_freqs)
+        states[-1] is processed_lens of shape (batch,), which records the number
+        of processed frames (at 50hz frame rate, after encoder_embed) for each sample in batch.
+        """
+        states = self.encoder.get_init_states(batch_size, device)
+
+        embed_states = self.encoder_embed.get_init_states(batch_size, device)
+        states.append(embed_states)
+
+        processed_lens = torch.zeros(batch_size, dtype=torch.int32, device=device)
+        states.append(processed_lens)
+
+        return states
+
+
+@torch.no_grad()
+def main():
+    args = get_parser().parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    params = get_params()
+    params.update(vars(args))
+
+    device = torch.device("cpu")
+    # if torch.cuda.is_available():
+    #     device = torch.device("cuda", 0)
+
+    logging.info(f"device: {device}")
+
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_model(params)
+
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
+            )
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+
+    # merge the LoRA weights
+    model.eval()
+
+    params.use_lora = False
+    base_model = get_model(params)
+
+    new_state_dict = {}
+    state_dict = model.state_dict()
+    param_names = base_model.state_dict().keys()
+    for k in param_names:
+        assert k in state_dict.keys()
+        new_state_dict[k] = state_dict[k]
+
+    base_model.load_state_dict(new_state_dict, strict=True)
+
+    model = base_model
+    model.eval()
+
+    if params.jit is True:
+        convert_scaled_to_non_scaled(model, inplace=True)
+        # We won't use the forward() method of the model in C++, so just ignore
+        # it here.
+        # Otherwise, one of its arguments is a ragged tensor and is not
+        # torch scriptabe.
+        model.__class__.forward = torch.jit.ignore(model.__class__.forward)
+
+        # Wrap encoder and encoder_embed as a module
+        if params.causal:
+            model.encoder = StreamingEncoderModel(model.encoder, model.encoder_embed)
+            chunk_size = model.encoder.chunk_size
+            left_context_len = model.encoder.left_context_len
+            filename = f"jit_script_chunk_{chunk_size}_left_{left_context_len}.pt"
+        else:
+            model.encoder = EncoderModel(model.encoder, model.encoder_embed)
+            filename = "jit_script.pt"
+
+        logging.info("Using torch.jit.script")
+        model = torch.jit.script(model)
+        model.save(str(params.exp_dir / filename))
+        logging.info(f"Saved to {filename}")
+    else:
+        logging.info("Not using torchscript. Export model.state_dict()")
+        # Save it using a format so that it can be loaded
+        # by :func:`load_checkpoint`
+        filename = params.exp_dir / "pretrained.pt"
+        torch.save({"model": model.state_dict()}, str(filename))
+        logging.info(f"Saved to {filename}")
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    main()
--- a/egs/librispeech/ASR/zipformer_lora/finetune.py
+++ b/egs/librispeech/ASR/zipformer_lora/finetune.py
--- a/egs/librispeech/ASR/zipformer_lora/joiner.py
+++ b/egs/librispeech/ASR/zipformer_lora/joiner.py
@ -0,0 +1 @@
+../zipformer/joiner.py
--- a/egs/librispeech/ASR/zipformer_lora/model.py
+++ b/egs/librispeech/ASR/zipformer_lora/model.py
@ -0,0 +1 @@
+../zipformer/model.py
--- a/egs/librispeech/ASR/zipformer_lora/optim.py
+++ b/egs/librispeech/ASR/zipformer_lora/optim.py
@ -0,0 +1 @@
+../zipformer/optim.py
--- a/egs/librispeech/ASR/zipformer_lora/scaling.py
+++ b/egs/librispeech/ASR/zipformer_lora/scaling.py
--- a/egs/librispeech/ASR/zipformer_lora/scaling_converter.py
+++ b/egs/librispeech/ASR/zipformer_lora/scaling_converter.py
@ -0,0 +1 @@
+../zipformer/scaling_converter.py
--- a/egs/librispeech/ASR/zipformer_lora/subsampling.py
+++ b/egs/librispeech/ASR/zipformer_lora/subsampling.py
@ -0,0 +1 @@
+../zipformer/subsampling.py
--- a/egs/librispeech/ASR/zipformer_lora/train.py
+++ b/egs/librispeech/ASR/zipformer_lora/train.py
--- a/egs/librispeech/ASR/zipformer_lora/zipformer.py
+++ b/egs/librispeech/ASR/zipformer_lora/zipformer.py
--- a/egs/librispeech/ASR/zipformer_mmi/train.py
+++ b/egs/librispeech/ASR/zipformer_mmi/train.py
@ -79,6 +79,7 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
+from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon, UniqLexicon
 from icefall.mmi import LFMMILoss
@ -816,9 +817,7 @@ def train_one_epoch(
            if cur_grad_scale < 0.01:
                logging.warning(f"Grad scale is small: {cur_grad_scale}")
            if cur_grad_scale < 1.0e-05:
-                raise RuntimeError(
-                    f"grad_scale is too small, exiting: {cur_grad_scale}"
-                )
+                raise_grad_scale_is_too_small_error(cur_grad_scale)

        if batch_idx % params.log_interval == 0:
            cur_lr = scheduler.get_last_lr()[0]
--- a/egs/librispeech/WSASR/conformer_ctc2/train.py
+++ b/egs/librispeech/WSASR/conformer_ctc2/train.py
@ -31,6 +31,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3"
  --exp-dir conformer_ctc2/exp \
  --lang-dir data/lang_bpe_200 \
  --otc-token "<star>" \
+  --feature-dim 768 \
  --allow-bypass-arc true \
  --allow-self-loop-arc true \
  --initial-bypass-weight -19 \
@ -160,6 +161,14 @@ def get_parser():
        """,
    )

+    parser.add_argument(
+        "--feature-dim",
+        type=int,
+        default=768,
+        help="""Number of features extracted in feature extraction stage.last dimension of feature vector.
+        80 when using fbank features and 768 or 1024 whn using wave2vec""",
+    )
+
    parser.add_argument(
        "--initial-lr",
        type=float,
@ -385,7 +394,6 @@ def get_params() -> AttributeDict:
            "valid_interval": 800,  # For the 100h subset, use 800
            "alignment_interval": 25,
            # parameters for conformer
-            "feature_dim": 768,
            "subsampling_factor": 2,
            "encoder_dim": 512,
            "nhead": 8,
--- a/egs/ljspeech/TTS/README.md
+++ b/egs/ljspeech/TTS/README.md
@ -36,3 +36,68 @@ To inference, use:
  --epoch 1000 \
  --tokens data/tokens.txt
 ```
+
+## Quality vs speed
+
+If you feel that the trained model is slow at runtime, you can specify the
+argument `--model-type` during training. Possible values are:
+
+  - `low`, means **low** quality. The resulting model is very small in file size
+    and runs very fast. The following is a wave file generatd by a `low` quality model
+
+    https://github.com/k2-fsa/icefall/assets/5284924/d5758c24-470d-40ee-b089-e57fcba81633
+
+    The text is `Ask not what your country can do for you; ask what you can do for your country.`
+
+    The exported onnx model has a file size of ``26.8 MB`` (float32).
+
+  - `medium`, means **medium** quality.
+    The following is a wave file generatd by a `medium` quality model
+
+    https://github.com/k2-fsa/icefall/assets/5284924/b199d960-3665-4d0d-9ae9-a1bb69cbc8ac
+
+    The text is `Ask not what your country can do for you; ask what you can do for your country.`
+
+    The exported onnx model has a file size of ``70.9 MB`` (float32).
+
+  - `high`, means **high** quality. This is the default value.
+
+    The following is a wave file generatd by a `high` quality model
+
+    https://github.com/k2-fsa/icefall/assets/5284924/b39f3048-73a6-4267-bf95-df5abfdb28fc
+
+    The text is `Ask not what your country can do for you; ask what you can do for your country.`
+
+    The exported onnx model has a file size of ``113 MB`` (float32).
+
+
+A pre-trained `low` model trained using 4xV100 32GB GPU with the following command can be found at
+<https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-low-2024-03-12>
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+./vits/train.py \
+  --world-size 4 \
+  --num-epochs 1601 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir vits/exp \
+  --model-type low \
+  --max-duration 800
+```
+
+A pre-trained `medium` model trained using 4xV100 32GB GPU with the following command can be found at
+<https://huggingface.co/csukuangfj/icefall-tts-ljspeech-vits-medium-2024-03-12>
+```bash
+export CUDA_VISIBLE_DEVICES=4,5,6,7
+./vits/train.py \
+  --world-size 4 \
+  --num-epochs 1000 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir vits/exp-medium \
+  --model-type medium \
+  --max-duration 500
+
+# (Note it is killed after `epoch-820.pt`)
+```
--- a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
+++ b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
@ -23,7 +23,11 @@ This file reads the texts in given manifest and save the new cuts with phoneme t
 import logging
 from pathlib import Path

+try:
    import tacotron_cleaner.cleaners
+except ModuleNotFoundError as ex:
+    raise RuntimeError(f"{ex}\nPlease run\n  pip install espnet_tts_frontend\n")
+
 from lhotse import CutSet, load_manifest
 from piper_phonemize import phonemize_espeak

--- a/egs/ljspeech/TTS/prepare.sh
+++ b/egs/ljspeech/TTS/prepare.sh
@ -28,7 +28,7 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "Stage -1: build monotonic_align lib"
  if [ ! -d vits/monotonic_align/build ]; then
    cd vits/monotonic_align
-    python setup.py build_ext --inplace
+    python3 setup.py build_ext --inplace
    cd ../../
  else
    log "monotonic_align lib already built"
@ -54,7 +54,7 @@ fi
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare LJSpeech manifest"
  # We assume that you have downloaded the LJSpeech corpus
-  # to $dl_dir/LJSpeech
+  # to $dl_dir/LJSpeech-1.1
  mkdir -p data/manifests
  if [ ! -e data/manifests/.ljspeech.done ]; then
    lhotse prepare ljspeech $dl_dir/LJSpeech-1.1 data/manifests
@ -82,8 +82,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Prepare phoneme tokens for LJSpeech"
  # We assume you have installed piper_phonemize and espnet_tts_frontend.
  # If not, please install them with:
-  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize,
-  #                      could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
+  #   - piper_phonemize: pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html,
  #   - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
  if [ ! -e data/spectrogram/.ljspeech_with_token.done ]; then
    ./local/prepare_tokens_ljspeech.py
--- a/egs/ljspeech/TTS/vits/export-onnx.py
+++ b/egs/ljspeech/TTS/vits/export-onnx.py
@ -25,9 +25,8 @@ Export the model to ONNX:
  --exp-dir vits/exp \
  --tokens data/tokens.txt

-It will generate two files inside vits/exp:
+It will generate one file inside vits/exp:
  - vits-epoch-1000.onnx
-  - vits-epoch-1000.int8.onnx (quantizated model)

 See ./test_onnx.py for how to use the exported ONNX models.
 """
@ -40,7 +39,6 @@ from typing import Dict, Tuple
 import onnx
 import torch
 import torch.nn as nn
-from onnxruntime.quantization import QuantType, quantize_dynamic
 from tokenizer import Tokenizer
 from train import get_model, get_params

@ -75,6 +73,16 @@ def get_parser():
        help="""Path to vocabulary.""",
    )

+    parser.add_argument(
+        "--model-type",
+        type=str,
+        default="high",
+        choices=["low", "medium", "high"],
+        help="""If not empty, valid values are: low, medium, high.
+        It controls the model size. low -> runs faster.
+        """,
+    )
+
    return parser


@ -136,7 +144,7 @@ class OnnxModel(nn.Module):
          Return a tuple containing:
            - audio, generated wavform tensor, (B, T_wav)
        """
-        audio, _, _ = self.model.inference(
+        audio, _, _ = self.model.generator.inference(
            text=tokens,
            text_lengths=tokens_lens,
            noise_scale=noise_scale,
@ -198,6 +206,11 @@ def export_model_onnx(
        },
    )

+    if model.model.spks is None:
+        num_speakers = 1
+    else:
+        num_speakers = model.model.spks
+
    meta_data = {
        "model_type": "vits",
        "version": "1",
@ -206,8 +219,8 @@ def export_model_onnx(
        "language": "English",
        "voice": "en-us",  # Choose your language appropriately
        "has_espeak": 1,
-        "n_speakers": 1,
-        "sample_rate": 22050,  # Must match the real sample rate
+        "n_speakers": num_speakers,
+        "sample_rate": model.model.sampling_rate,  # Must match the real sample rate
    }
    logging.info(f"meta_data: {meta_data}")

@ -233,14 +246,13 @@ def main():

    load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)

-    model = model.generator
    model.to("cpu")
    model.eval()

    model = OnnxModel(model=model)

    num_param = sum([p.numel() for p in model.parameters()])
-    logging.info(f"generator parameters: {num_param}")
+    logging.info(f"generator parameters: {num_param}, or {num_param/1000/1000} M")

    suffix = f"epoch-{params.epoch}"

@ -256,18 +268,6 @@ def main():
    )
    logging.info(f"Exported generator to {model_filename}")

-    # Generate int8 quantization models
-    # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
-
-    logging.info("Generate int8 quantization models")
-
-    model_filename_int8 = params.exp_dir / f"vits-{suffix}.int8.onnx"
-    quantize_dynamic(
-        model_input=model_filename,
-        model_output=model_filename_int8,
-        weight_type=QuantType.QUInt8,
-    )
-

 if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
--- a/egs/ljspeech/TTS/vits/generator.py
+++ b/egs/ljspeech/TTS/vits/generator.py
@ -189,7 +189,7 @@ class VITSGenerator(torch.nn.Module):
        self.upsample_factor = int(np.prod(decoder_upsample_scales))
        self.spks = None
        if spks is not None and spks > 1:
-            assert global_channels > 0
+            assert global_channels > 0, global_channels
            self.spks = spks
            self.global_emb = torch.nn.Embedding(spks, global_channels)
        self.spk_embed_dim = None
--- a/egs/ljspeech/TTS/vits/infer.py
+++ b/egs/ljspeech/TTS/vits/infer.py
@ -72,6 +72,16 @@ def get_parser():
        help="""Path to vocabulary.""",
    )

+    parser.add_argument(
+        "--model-type",
+        type=str,
+        default="high",
+        choices=["low", "medium", "high"],
+        help="""If not empty, valid values are: low, medium, high.
+        It controls the model size. low -> runs faster.
+        """,
+    )
+
    return parser


@ -94,6 +104,7 @@ def infer_dataset(
      tokenizer:
        Used to convert text to phonemes.
    """
+
    #  Background worker save audios to disk.
    def _save_worker(
        batch_size: int,
--- a/egs/ljspeech/TTS/vits/monotonic_align/init.py
+++ b/egs/ljspeech/TTS/vits/monotonic_align/init.py
@ -10,7 +10,11 @@ import warnings

 import numpy as np
 import torch
+
+try:
    from numba import njit, prange
+except ModuleNotFoundError as ex:
+    raise RuntimeError(f"{ex}/nPlease run\n  pip install numba")

 try:
    from .core import maximum_path_c
--- a/egs/ljspeech/TTS/vits/test_model.py
+++ b/egs/ljspeech/TTS/vits/test_model.py
@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+# Copyright    2024  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from tokenizer import Tokenizer
+from train import get_model, get_params
+from vits import VITS
+
+
+def test_model_type(model_type):
+    tokens = "./data/tokens.txt"
+
+    params = get_params()
+
+    tokenizer = Tokenizer(tokens)
+    params.blank_id = tokenizer.pad_id
+    params.vocab_size = tokenizer.vocab_size
+    params.model_type = model_type
+
+    model = get_model(params)
+    generator = model.generator
+
+    num_param = sum([p.numel() for p in generator.parameters()])
+    print(
+        f"{model_type}: generator parameters: {num_param}, or {num_param/1000/1000} M"
+    )
+
+
+def main():
+    test_model_type("high")  # 35.63 M
+    test_model_type("low")  # 7.55 M
+    test_model_type("medium")  # 23.61 M
+
+
+if __name__ == "__main__":
+    main()
--- a/egs/ljspeech/TTS/vits/test_onnx.py
+++ b/egs/ljspeech/TTS/vits/test_onnx.py
@ -54,6 +54,20 @@ def get_parser():
        help="""Path to vocabulary.""",
    )

+    parser.add_argument(
+        "--text",
+        type=str,
+        default="Ask not what your country can do for you; ask what you can do for your country.",
+        help="Text to generate speech for",
+    )
+
+    parser.add_argument(
+        "--output-filename",
+        type=str,
+        default="test_onnx.wav",
+        help="Filename to save the generated wave file.",
+    )
+
    return parser


@ -61,7 +75,7 @@ class OnnxModel:
    def __init__(self, model_filename: str):
        session_opts = ort.SessionOptions()
        session_opts.inter_op_num_threads = 1
-        session_opts.intra_op_num_threads = 4
+        session_opts.intra_op_num_threads = 1

        self.session_opts = session_opts

@ -72,6 +86,9 @@ class OnnxModel:
        )
        logging.info(f"{self.model.get_modelmeta().custom_metadata_map}")

+        metadata = self.model.get_modelmeta().custom_metadata_map
+        self.sample_rate = int(metadata["sample_rate"])
+
    def __call__(self, tokens: torch.Tensor, tokens_lens: torch.Tensor) -> torch.Tensor:
        """
        Args:
@ -101,13 +118,14 @@ class OnnxModel:

 def main():
    args = get_parser().parse_args()
+    logging.info(vars(args))

    tokenizer = Tokenizer(args.tokens)

    logging.info("About to create onnx model")
    model = OnnxModel(args.model_filename)

-    text = "I went there to see the land, the people and how their system works, end quote."
+    text = args.text
    tokens = tokenizer.texts_to_token_ids(
        [text], intersperse_blank=True, add_sos=True, add_eos=True
    )
@ -115,8 +133,9 @@ def main():
    tokens_lens = torch.tensor([tokens.shape[1]], dtype=torch.int64)  # (1, T)
    audio = model(tokens, tokens_lens)  # (1, T')

-    torchaudio.save(str("test_onnx.wav"), audio, sample_rate=22050)
-    logging.info("Saved to test_onnx.wav")
+    output_filename = args.output_filename
+    torchaudio.save(output_filename, audio, sample_rate=model.sample_rate)
+    logging.info(f"Saved to {output_filename}")


 if __name__ == "__main__":
--- a/egs/ljspeech/TTS/vits/text_encoder.py
+++ b/egs/ljspeech/TTS/vits/text_encoder.py
@ -92,9 +92,9 @@ class TextEncoder(torch.nn.Module):
            x_lengths (Tensor): Length tensor (B,).

        Returns:
-            Tensor: Encoded hidden representation (B, attention_dim, T_text).
-            Tensor: Projected mean tensor (B, attention_dim, T_text).
-            Tensor: Projected scale tensor (B, attention_dim, T_text).
+            Tensor: Encoded hidden representation (B, embed_dim, T_text).
+            Tensor: Projected mean tensor (B, embed_dim, T_text).
+            Tensor: Projected scale tensor (B, embed_dim, T_text).
            Tensor: Mask tensor for input tensor (B, 1, T_text).

        """
@ -108,6 +108,7 @@ class TextEncoder(torch.nn.Module):

        # encoder assume the channel last (B, T_text, embed_dim)
        x = self.encoder(x, key_padding_mask=pad_mask)
+        # Note: attention_dim == embed_dim

        # convert the channel first (B, embed_dim, T_text)
        x = x.transpose(1, 2)
--- a/egs/ljspeech/TTS/vits/tokenizer.py
+++ b/egs/ljspeech/TTS/vits/tokenizer.py
@ -18,7 +18,15 @@ import logging
 from typing import Dict, List

 import tacotron_cleaner.cleaners
+
+try:
    from piper_phonemize import phonemize_espeak
+except Exception as ex:
+    raise RuntimeError(
+        f"{ex}\nPlease run\n"
+        "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
+    )
+
 from utils import intersperse


--- a/egs/ljspeech/TTS/vits/train.py
+++ b/egs/ljspeech/TTS/vits/train.py
@ -153,6 +153,16 @@ def get_parser():
        help="Whether to use half precision training.",
    )

+    parser.add_argument(
+        "--model-type",
+        type=str,
+        default="high",
+        choices=["low", "medium", "high"],
+        help="""If not empty, valid values are: low, medium, high.
+        It controls the model size. low -> runs faster.
+        """,
+    )
+
    return parser


@ -189,15 +199,6 @@ def get_params() -> AttributeDict:

        - feature_dim: The model input dim. It has to match the one used
                       in computing features.
-
-        - subsampling_factor:  The subsampling factor for the model.
-
-        - encoder_dim: Hidden dim for multi-head attention model.
-
-        - num_decoder_layers: Number of decoder layer of transformer decoder.
-
-        - warm_step: The warmup period that dictates the decay of the
-              scale on "simple" (un-pruned) loss.
    """
    params = AttributeDict(
        {
@ -278,6 +279,7 @@ def get_model(params: AttributeDict) -> nn.Module:
        vocab_size=params.vocab_size,
        feature_dim=params.feature_dim,
        sampling_rate=params.sampling_rate,
+        model_type=params.model_type,
        mel_loss_params=mel_loss_params,
        lambda_adv=params.lambda_adv,
        lambda_mel=params.lambda_mel,
@ -363,7 +365,7 @@ def train_one_epoch(
    model.train()
    device = model.device if isinstance(model, DDP) else next(model.parameters()).device

-    # used to summary the stats over iterations in one epoch
+    # used to track the stats over iterations in one epoch
    tot_loss = MetricsTracker()

    saved_bad_model = False
--- a/egs/ljspeech/TTS/vits/tts_datamodule.py
+++ b/egs/ljspeech/TTS/vits/tts_datamodule.py
@ -255,6 +255,7 @@ class LJSpeechTtsDataModule:
        valid_sampler = DynamicBucketingSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
+            num_buckets=self.args.num_buckets,
            shuffle=False,
        )
        logging.info("About to create valid dataloader")
@ -294,6 +295,7 @@ class LJSpeechTtsDataModule:
        test_sampler = DynamicBucketingSampler(
            cuts,
            max_duration=self.args.max_duration,
+            num_buckets=self.args.num_buckets,
            shuffle=False,
        )
        logging.info("About to create test dataloader")
--- a/egs/ljspeech/TTS/vits/vits.py
+++ b/egs/ljspeech/TTS/vits/vits.py
@ -5,6 +5,7 @@

 """VITS module for GAN-TTS task."""

+import copy
 from typing import Any, Dict, Optional, Tuple

 import torch
@ -38,6 +39,36 @@ AVAILABLE_DISCRIMINATORS = {
    "hifigan_multi_scale_multi_period_discriminator": HiFiGANMultiScaleMultiPeriodDiscriminator,  # NOQA
 }

+LOW_CONFIG = {
+    "hidden_channels": 96,
+    "decoder_upsample_scales": (8, 8, 4),
+    "decoder_channels": 256,
+    "decoder_upsample_kernel_sizes": (16, 16, 8),
+    "decoder_resblock_kernel_sizes": (3, 5, 7),
+    "decoder_resblock_dilations": ((1, 2), (2, 6), (3, 12)),
+    "text_encoder_cnn_module_kernel": 3,
+}
+
+MEDIUM_CONFIG = {
+    "hidden_channels": 192,
+    "decoder_upsample_scales": (8, 8, 4),
+    "decoder_channels": 256,
+    "decoder_upsample_kernel_sizes": (16, 16, 8),
+    "decoder_resblock_kernel_sizes": (3, 5, 7),
+    "decoder_resblock_dilations": ((1, 2), (2, 6), (3, 12)),
+    "text_encoder_cnn_module_kernel": 3,
+}
+
+HIGH_CONFIG = {
+    "hidden_channels": 192,
+    "decoder_upsample_scales": (8, 8, 2, 2),
+    "decoder_channels": 512,
+    "decoder_upsample_kernel_sizes": (16, 16, 4, 4),
+    "decoder_resblock_kernel_sizes": (3, 7, 11),
+    "decoder_resblock_dilations": ((1, 3, 5), (1, 3, 5), (1, 3, 5)),
+    "text_encoder_cnn_module_kernel": 5,
+}
+

 class VITS(nn.Module):
    """Implement VITS, `Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech`"""
@ -49,6 +80,7 @@ class VITS(nn.Module):
        feature_dim: int = 513,
        sampling_rate: int = 22050,
        generator_type: str = "vits_generator",
+        model_type: str = "",
        generator_params: Dict[str, Any] = {
            "hidden_channels": 192,
            "spks": None,
@ -155,12 +187,13 @@ class VITS(nn.Module):
        """Initialize VITS module.

        Args:
-            idim (int): Input vocabrary size.
+            idim (int): Input vocabulary size.
            odim (int): Acoustic feature dimension. The actual output channels will
                be 1 since VITS is the end-to-end text-to-wave model but for the
                compatibility odim is used to indicate the acoustic feature dimension.
            sampling_rate (int): Sampling rate, not used for the training but it will
                be referred in saving waveform during the inference.
+            model_type (str): If not empty, must be one of: low, medium, high
            generator_type (str): Generator type.
            generator_params (Dict[str, Any]): Parameter dict for generator.
            discriminator_type (str): Discriminator type.
@ -181,6 +214,24 @@ class VITS(nn.Module):
        """
        super().__init__()

+        generator_params = copy.deepcopy(generator_params)
+        discriminator_params = copy.deepcopy(discriminator_params)
+        generator_adv_loss_params = copy.deepcopy(generator_adv_loss_params)
+        discriminator_adv_loss_params = copy.deepcopy(discriminator_adv_loss_params)
+        feat_match_loss_params = copy.deepcopy(feat_match_loss_params)
+        mel_loss_params = copy.deepcopy(mel_loss_params)
+
+        if model_type != "":
+            assert model_type in ("low", "medium", "high"), model_type
+            if model_type == "low":
+                generator_params.update(LOW_CONFIG)
+            elif model_type == "medium":
+                generator_params.update(MEDIUM_CONFIG)
+            elif model_type == "high":
+                generator_params.update(HIGH_CONFIG)
+            else:
+                raise ValueError(f"Unknown model_type: ${model_type}")
+
        # define modules
        generator_class = AVAILABLE_GENERATERS[generator_type]
        if generator_type == "vits_generator":
--- a/egs/mdcc/ASR/README.md
+++ b/egs/mdcc/ASR/README.md
@ -0,0 +1,19 @@
+# Introduction
+
+Multi-Domain Cantonese Corpus (MDCC), consists of 73.6 hours of clean read speech paired with 
+transcripts, collected from Cantonese audiobooks from Hong Kong. It comprises philosophy, 
+politics, education, culture, lifestyle and family domains, covering a wide range of topics. 
+
+Manuscript can be found at: https://arxiv.org/abs/2201.02419
+
+# Transducers
+
+
+
+|                                       | Encoder             | Decoder            | Comment                     |
+|---------------------------------------|---------------------|--------------------|-----------------------------|
+| `zipformer`                           | Upgraded Zipformer | Embedding + Conv1d | The latest recipe with context-size set to 1 |
+
+The decoder is modified from the paper
+[Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
+We place an additional Conv1d layer right after the input embedding layer.
--- a/Show More
+++ b/Show More
				`@ -1 +0,0 @@`
				`../../../librispeech/ASR/local/compile_hlg.py`
				`@ -0,0 +1 @@`
				`../pruned_transducer_stateless2/beam_search.py`
				`@ -0,0 +1 @@`
				`../transducer_stateless/encoder_interface.py`