Merge branch 'k2-fsa:master' into k2ssl

2025-12-10 22:45:27 +00:00 · 2024-03-10 13:10:36 +08:00 · 2024-03-10 13:10:36 +08:00 · 660f647886
commit 660f647886
parent bed950dbcb 60986c3ac1
181 changed files with 16099 additions and 413 deletions
--- a/.github/workflows/build-docker-image.yml
+++ b/.github/workflows/build-docker-image.yml
@ -16,7 +16,7 @@ jobs:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
-        image: ["torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+        image: ["torch2.2.1-cuda12.1", "torch2.2.1-cuda11.8", "torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]

    steps:
      # refer to https://github.com/actions/checkout
--- a/.github/workflows/run-docker-image.yml
+++ b/.github/workflows/run-docker-image.yml
@ -14,7 +14,7 @@ jobs:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
-        image: ["torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+        image: ["torch2.2.1-cuda12.1", "torch2.2.1-cuda11.8", "torch2.2.0-cuda12.1", "torch2.2.0-cuda11.8", "torch2.1.0-cuda12.1", "torch2.1.0-cuda11.8", "torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
    steps:
      # refer to https://github.com/actions/checkout
      - uses: actions/checkout@v2
--- a/.github/workflows/style_check.yml
+++ b/.github/workflows/style_check.yml
@ -49,7 +49,7 @@ jobs:

      - name: Install Python dependencies
        run: |
-          python3 -m pip install --upgrade pip black==22.3.0 flake8==5.0.4 click==8.1.0
+          python3 -m pip install --upgrade pip black==22.3.0 flake8==5.0.4 click==8.1.0 isort==5.10.1
          # Click issue fixed in https://github.com/psf/black/pull/2966

      - name: Run flake8
@ -67,3 +67,9 @@ jobs:
        working-directory: ${{github.workspace}}
        run: |
          black --check --diff .
+      
+      - name: Run isort
+        shell: bash
+        working-directory: ${{github.workspace}}
+        run: |
+          isort --check --diff .
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -26,7 +26,7 @@ repos:
      # E121,E123,E126,E226,E24,E704,W503,W504

  - repo: https://github.com/pycqa/isort
-    rev: 5.11.5
+    rev: 5.10.1
    hooks:
      - id: isort
        args: ["--profile=black"]
--- a/docker/torch1.12.1-cuda11.3.dockerfile
+++ b/docker/torch1.12.1-cuda11.3.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.7
-ARG K2_VERSION="1.24.4.dev20240211+cuda11.3.torch1.12.1"
-ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.3.torch1.12.1"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.3.torch1.12.1"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.3.torch1.12.1"
 ARG TORCHAUDIO_VERSION="0.12.1+cu113"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch1.13.0-cuda11.6.dockerfile
+++ b/docker/torch1.13.0-cuda11.6.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.9
-ARG K2_VERSION="1.24.4.dev20240211+cuda11.6.torch1.13.0"
-ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.6.torch1.13.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.6.torch1.13.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.6.torch1.13.0"
 ARG TORCHAUDIO_VERSION="0.13.0+cu116"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch1.9.0-cuda10.2.dockerfile
+++ b/docker/torch1.9.0-cuda10.2.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.7
-ARG K2_VERSION="1.24.4.dev20240211+cuda10.2.torch1.9.0"
-ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda10.2.torch1.9.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda10.2.torch1.9.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda10.2.torch1.9.0"
 ARG TORCHAUDIO_VERSION="0.9.0"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch2.0.0-cuda11.7.dockerfile
+++ b/docker/torch2.0.0-cuda11.7.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20240211+cuda11.7.torch2.0.0"
-ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.7.torch2.0.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.7.torch2.0.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.7.torch2.0.0"
 ARG TORCHAUDIO_VERSION="2.0.0+cu117"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch2.1.0-cuda11.8.dockerfile
+++ b/docker/torch2.1.0-cuda11.8.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20240211+cuda11.8.torch2.1.0"
-ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.8.torch2.1.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.8.torch2.1.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.8.torch2.1.0"
 ARG TORCHAUDIO_VERSION="2.1.0+cu118"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch2.1.0-cuda12.1.dockerfile
+++ b/docker/torch2.1.0-cuda12.1.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20240211+cuda12.1.torch2.1.0"
-ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda12.1.torch2.1.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda12.1.torch2.1.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda12.1.torch2.1.0"
 ARG TORCHAUDIO_VERSION="2.1.0+cu121"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch2.2.0-cuda11.8.dockerfile
+++ b/docker/torch2.2.0-cuda11.8.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20240211+cuda11.8.torch2.2.0"
-ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda11.8.torch2.2.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.8.torch2.2.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.8.torch2.2.0"
 ARG TORCHAUDIO_VERSION="2.2.0+cu118"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch2.2.0-cuda12.1.dockerfile
+++ b/docker/torch2.2.0-cuda12.1.dockerfile
@ -5,8 +5,8 @@ ENV LC_ALL C.UTF-8
 ARG DEBIAN_FRONTEND=noninteractive

 # python 3.10
-ARG K2_VERSION="1.24.4.dev20240211+cuda12.1.torch2.2.0"
-ARG KALDIFEAT_VERSION="1.25.4.dev20240210+cuda12.1.torch2.2.0"
+ARG K2_VERSION="1.24.4.dev20240223+cuda12.1.torch2.2.0"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda12.1.torch2.2.0"
 ARG TORCHAUDIO_VERSION="2.2.0+cu121"

 LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
--- a/docker/torch2.2.1-cuda11.8.dockerfile
+++ b/docker/torch2.2.1-cuda11.8.dockerfile
@ -0,0 +1,70 @@
+FROM pytorch/pytorch:2.2.1-cuda11.8-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20240223+cuda11.8.torch2.2.1"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda11.8.torch2.2.1"
+ARG TORCHAUDIO_VERSION="2.2.1+cu118"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      sentencepiece>=0.1.96 \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docker/torch2.2.1-cuda12.1.dockerfile
+++ b/docker/torch2.2.1-cuda12.1.dockerfile
@ -0,0 +1,70 @@
+FROM pytorch/pytorch:2.2.1-cuda12.1-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# python 3.10
+ARG K2_VERSION="1.24.4.dev20240223+cuda12.1.torch2.2.1"
+ARG KALDIFEAT_VERSION="1.25.4.dev20240223+cuda12.1.torch2.2.1"
+ARG TORCHAUDIO_VERSION="2.2.1+cu121"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    	libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      sentencepiece>=0.1.96 \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
--- a/docs/source/docker/intro.rst
+++ b/docs/source/docker/intro.rst
@ -34,6 +34,8 @@ which will give you something like below:

 .. code-block:: bash

+  "torch2.2.1-cuda12.1"
+  "torch2.2.1-cuda11.8"
  "torch2.2.0-cuda12.1"
  "torch2.2.0-cuda11.8"
  "torch2.1.0-cuda12.1"
--- a/docs/source/recipes/TTS/ljspeech/vits.rst
+++ b/docs/source/recipes/TTS/ljspeech/vits.rst
@ -1,11 +1,11 @@
-VITS
+VITS-LJSpeech
 ===============

 This tutorial shows you how to train an VITS model
 with the `LJSpeech <https://keithito.com/LJ-Speech-Dataset/>`_ dataset.

 .. note::
-  
+
   TTS related recipes require packages in ``requirements-tts.txt``.

 .. note::
@ -120,4 +120,4 @@ Download pretrained models
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following link:

-  - `<https://huggingface.co/Zengwei/icefall-tts-ljspeech-vits-2023-11-29>`_
+  - `<https://huggingface.co/Zengwei/icefall-tts-ljspeech-vits-2024-02-28>`_
--- a/docs/source/recipes/TTS/vctk/vits.rst
+++ b/docs/source/recipes/TTS/vctk/vits.rst
@ -1,11 +1,11 @@
-VITS
+VITS-VCTK
 ===============

 This tutorial shows you how to train an VITS model
 with the `VCTK <https://datashare.ed.ac.uk/handle/10283/3443>`_ dataset.

 .. note::
-  
+
   TTS related recipes require packages in ``requirements-tts.txt``.

 .. note::
--- a/egs/aishell/ASR/RESULTS.md
+++ b/egs/aishell/ASR/RESULTS.md
@ -75,7 +75,7 @@ It's reworked Zipformer with Pruned RNNT loss, trained with Byte-level BPE, `voc
 | fast beam search       | 4.43 | 4.17 | --epoch 40 --avg 10                     |

 ```bash
-./prepare.sh 
+./prepare.sh

 export CUDA_VISIBLE_DEVICES="0,1"

--- a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/decode.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/decode.py
@ -250,7 +250,7 @@ def get_parser():
    parser.add_argument(
        "--context-size",
        type=int,
-        default=1,
+        default=2,
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )
    parser.add_argument(
--- a/egs/aishell2/ASR/RESULTS.md
+++ b/egs/aishell2/ASR/RESULTS.md
@ -1,6 +1,6 @@
 ## Results

-### Aishell2 char-based training results 
+### Aishell2 char-based training results

 #### Pruned transducer stateless 5

--- a/egs/aishell2/ASR/local/compute_fbank_aishell2.py
+++ b/egs/aishell2/ASR/local/compute_fbank_aishell2.py
@ -29,7 +29,14 @@ import os
 from pathlib import Path

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor, str2bool
@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)


-def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_aishell2(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())

    dataset_parts = (
        "train",
@ -68,8 +77,12 @@ def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
        list(manifests.keys()),
        dataset_parts,
    )
-
-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
@ -82,7 +95,7 @@ def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
                supervisions=m["supervisions"],
            )
            if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
@ -111,7 +124,12 @@ def get_args():
        default=False,
        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
    )
-
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use WhisperFbank instead of Fbank. Default: False.",
+    )
    return parser.parse_args()


@ -122,5 +140,7 @@ if __name__ == "__main__":

    args = get_args()
    compute_fbank_aishell2(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
    )
--- a/egs/aishell2/ASR/prepare.sh
+++ b/egs/aishell2/ASR/prepare.sh
@ -108,6 +108,16 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  fi
 fi

+whisper_mel_bins=80
+if [ $stage -le 30 ] && [ $stop_stage -ge 30 ]; then
+  log "Stage 30: Compute whisper fbank for aishell2"
+  if [ ! -f data/fbank/.aishell2.whisper.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_aishell2.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.aishell2.whisper.done
+  fi
+fi
+
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
  if [ ! -f data/fbank/.msuan.done ]; then
--- a/egs/aishell4/ASR/README.md
+++ b/egs/aishell4/ASR/README.md
@ -3,7 +3,7 @@

 This recipe contains some various ASR models trained with Aishell4 (including S, M and L three subsets).

-The AISHELL-4 is a sizable real-recorded Mandarin speech dataset collected by 8-channel circular microphone array for speech processing in conference scenarios. The dataset consists of 211 recorded meeting sessions, each containing 4 to 8 speakers, with a total length of 120 hours. This dataset aims to bridge the advanced research on multi-speaker processing and the practical application scenario in three aspects. With real recorded meetings, AISHELL-4 provides realistic acoustics and rich natural speech characteristics in conversation such as short pause, speech overlap, quick speaker turn, noise, etc. Meanwhile, the accurate transcription and speaker voice activity are provided for each meeting in AISHELL-4. This allows the researchers to explore different aspects in meeting processing, ranging from individual tasks such as speech front-end processing, speech recognition and speaker diarization, to multi-modality modeling and joint optimization of relevant tasks. 
+The AISHELL-4 is a sizable real-recorded Mandarin speech dataset collected by 8-channel circular microphone array for speech processing in conference scenarios. The dataset consists of 211 recorded meeting sessions, each containing 4 to 8 speakers, with a total length of 120 hours. This dataset aims to bridge the advanced research on multi-speaker processing and the practical application scenario in three aspects. With real recorded meetings, AISHELL-4 provides realistic acoustics and rich natural speech characteristics in conversation such as short pause, speech overlap, quick speaker turn, noise, etc. Meanwhile, the accurate transcription and speaker voice activity are provided for each meeting in AISHELL-4. This allows the researchers to explore different aspects in meeting processing, ranging from individual tasks such as speech front-end processing, speech recognition and speaker diarization, to multi-modality modeling and joint optimization of relevant tasks.

 (From [Open Speech and Language Resources](https://www.openslr.org/111/))

--- a/egs/aishell4/ASR/local/compute_fbank_aishell4.py
+++ b/egs/aishell4/ASR/local/compute_fbank_aishell4.py
@ -29,7 +29,14 @@ import os
 from pathlib import Path

 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor, str2bool
@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)


-def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_aishell4(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
    src_dir = Path("data/manifests/aishell4")
    output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())

    dataset_parts = (
        "train_S",
@ -70,7 +79,12 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
        dataset_parts,
    )

-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
@ -84,7 +98,7 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
                supervisions=m["supervisions"],
            )
            if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
@ -95,7 +109,7 @@ def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
            )

            logging.info("About splitting cuts into smaller chunks")
@ -121,7 +135,12 @@ def get_args():
        default=False,
        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
    )
-
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use WhisperFbank instead of Fbank. Default: False.",
+    )
    return parser.parse_args()


@ -132,5 +151,7 @@ if __name__ == "__main__":

    args = get_args()
    compute_fbank_aishell4(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
    )
--- a/egs/aishell4/ASR/prepare.sh
+++ b/egs/aishell4/ASR/prepare.sh
@ -6,7 +6,7 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail

 stage=-1
-stop_stage=100
+stop_stage=7
 perturb_speed=true


@ -76,11 +76,21 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Process aishell4"
+  log "Stage 2: Compute fbank for aishell4"
  if [ ! -f data/fbank/aishell4/.fbank.done ]; then
-    mkdir -p data/fbank/aishell4
+    mkdir -p data/fbank
    ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed}
-    touch data/fbank/aishell4/.fbank.done
+    touch data/fbank/.fbank.done
+  fi
+fi
+
+whisper_mel_bins=80
+if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then
+  log "Stage 20: Compute whisper fbank for aishell4"
+  if [ ! -f data/fbank/aishell4/.fbank.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.fbank.done
  fi
 fi

@ -106,16 +116,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute fbank for aishell4"
-  if [ ! -f data/fbank/.aishell4.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_fbank_aishell4.py --perturb-speed ${perturb_speed}
-    touch data/fbank/.aishell4.done
-  fi
-fi
-
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare char based lang"
+  log "Stage 5: Prepare char based lang"
  lang_char_dir=data/lang_char
  mkdir -p $lang_char_dir

--- a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
+++ b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
@ -29,7 +29,14 @@ import os
 from pathlib import Path

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
+from lhotse import (
+    CutSet,
+    Fbank,
+    FbankConfig,
+    LilcomChunkyWriter,
+    WhisperFbank,
+    WhisperFbankConfig,
+)
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor, str2bool
@ -42,10 +49,12 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)


-def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False):
+def compute_fbank_alimeeting(
+    num_mel_bins: int = 80, perturb_speed: bool = False, whisper_fbank: bool = False
+):
    src_dir = Path("data/manifests/alimeeting")
    output_dir = Path("data/fbank")
-    num_jobs = min(15, os.cpu_count())
+    num_jobs = min(8, os.cpu_count())

    dataset_parts = (
        "train",
@ -53,7 +62,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
        "test",
    )

-    prefix = "alimeeting"
+    prefix = "alimeeting-far"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
@ -70,7 +79,12 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
        dataset_parts,
    )

-    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
+    if whisper_fbank:
+        extractor = WhisperFbank(
+            WhisperFbankConfig(num_filters=num_mel_bins, device="cuda")
+        )
+    else:
+        extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
@ -83,7 +97,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False
                supervisions=m["supervisions"],
            )
            if "train" in partition and perturb_speed:
-                logging.info(f"Doing speed perturb")
+                logging.info("Doing speed perturb")
                cut_set = (
                    cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                )
@ -121,7 +135,12 @@ def get_args():
        default=False,
        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
    )
-
+    parser.add_argument(
+        "--whisper-fbank",
+        type=str2bool,
+        default=False,
+        help="Use the Whisper Fbank feature extractor. Default: False.",
+    )
    return parser.parse_args()


@ -132,5 +151,7 @@ if __name__ == "__main__":

    args = get_args()
    compute_fbank_alimeeting(
-        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+        num_mel_bins=args.num_mel_bins,
+        perturb_speed=args.perturb_speed,
+        whisper_fbank=args.whisper_fbank,
    )
--- a/egs/alimeeting/ASR/prepare.sh
+++ b/egs/alimeeting/ASR/prepare.sh
@ -6,7 +6,7 @@ export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail

 stage=-1
-stop_stage=100
+stop_stage=7
 perturb_speed=true

 # We assume dl_dir (download dir) contains the following
@ -66,10 +66,21 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
 fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Process alimeeting"
-  if [ ! -f data/fbank/alimeeting/.fbank.done ]; then
-    mkdir -p data/fbank/alimeeting
+  log "Stage 2: compute fbank for alimeeting"
+  if [ ! -f data/fbank/.fbank.done ]; then
+    mkdir -p data/fbank
    ./local/compute_fbank_alimeeting.py --perturb-speed ${perturb_speed}
+    touch data/fbank/.fbank.done
+  fi
+fi
+
+whisper_mel_bins=80
+if [ $stage -le 20 ] && [ $stop_stage -ge 20 ]; then
+  log "Stage 20: compute whisper fbank for alimeeting"
+  if [ ! -f data/fbank/.fbank.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_alimeeting.py --perturb-speed ${perturb_speed} --num-mel-bins ${whisper_mel_bins} --whisper-fbank true
+    touch data/fbank/.fbank.done
  fi
 fi

@ -95,16 +106,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute fbank for alimeeting"
-  if [ ! -f data/fbank/.alimeeting.done ]; then
-    mkdir -p data/fbank
-    ./local/compute_fbank_alimeeting.py --perturb-speed True
-    touch data/fbank/.alimeeting.done
-  fi
-fi
-
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Prepare char based lang"
+  log "Stage 5: Prepare char based lang"
  lang_char_dir=data/lang_char
  mkdir -p $lang_char_dir

--- a/egs/commonvoice/ASR/local/compile_hlg.py
+++ b/egs/commonvoice/ASR/local/compile_hlg.py
@ -1 +0,0 @@
-../../../librispeech/ASR/local/compile_hlg.py
--- a/egs/commonvoice/ASR/local/compile_hlg.py
+++ b/egs/commonvoice/ASR/local/compile_hlg.py
@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+# Copyright    2021-2024  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                         Zengrui Jin,)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This script takes as input lang_dir and generates HLG from
+
+    - H, the ctc topology, built from tokens contained in lang_dir/lexicon.txt
+    - L, the lexicon, built from lang_dir/L_disambig.pt
+
+        Caution: We use a lexicon that contains disambiguation symbols
+
+    - G, the LM, built from data/lm/G_n_gram.fst.txt
+
+The generated HLG is saved in $lang_dir/HLG.pt
+"""
+import argparse
+import logging
+from pathlib import Path
+
+import k2
+import torch
+
+from icefall.lexicon import Lexicon
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lm",
+        type=str,
+        default="G_3_gram",
+        help="""Stem name for LM used in HLG compiling.
+        """,
+    )
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def compile_HLG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
+    """
+    Args:
+      lang_dir:
+        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
+      lm:
+        The language stem base name.
+
+    Return:
+      An FSA representing HLG.
+    """
+    lexicon = Lexicon(lang_dir)
+    max_token_id = max(lexicon.tokens)
+    logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
+    H = k2.ctc_topo(max_token_id)
+    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
+
+    if Path(f"{lang_dir}/lm/{lm}.pt").is_file():
+        logging.info(f"Loading pre-compiled {lm}")
+        d = torch.load(f"{lang_dir}/lm/{lm}.pt")
+        G = k2.Fsa.from_dict(d)
+    else:
+        logging.info(f"Loading {lm}.fst.txt")
+        with open(f"{lang_dir}/lm/{lm}.fst.txt") as f:
+            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
+            torch.save(G.as_dict(), f"{lang_dir}/lm/{lm}.pt")
+
+    first_token_disambig_id = lexicon.token_table["#0"]
+    first_word_disambig_id = lexicon.word_table["#0"]
+
+    L = k2.arc_sort(L)
+    G = k2.arc_sort(G)
+
+    logging.info("Intersecting L and G")
+    LG = k2.compose(L, G)
+    logging.info(f"LG shape: {LG.shape}")
+
+    logging.info("Connecting LG")
+    LG = k2.connect(LG)
+    logging.info(f"LG shape after k2.connect: {LG.shape}")
+
+    logging.info(type(LG.aux_labels))
+    logging.info("Determinizing LG")
+
+    LG = k2.determinize(LG)
+    logging.info(type(LG.aux_labels))
+
+    logging.info("Connecting LG after k2.determinize")
+    LG = k2.connect(LG)
+
+    logging.info("Removing disambiguation symbols on LG")
+
+    # LG.labels[LG.labels >= first_token_disambig_id] = 0
+    # see https://github.com/k2-fsa/k2/pull/1140
+    labels = LG.labels
+    labels[labels >= first_token_disambig_id] = 0
+    LG.labels = labels
+
+    assert isinstance(LG.aux_labels, k2.RaggedTensor)
+    LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0
+
+    LG = k2.remove_epsilon(LG)
+    logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")
+
+    LG = k2.connect(LG)
+    LG.aux_labels = LG.aux_labels.remove_values_eq(0)
+
+    logging.info("Arc sorting LG")
+    LG = k2.arc_sort(LG)
+
+    logging.info("Composing H and LG")
+    # CAUTION: The name of the inner_labels is fixed
+    # to `tokens`. If you want to change it, please
+    # also change other places in icefall that are using
+    # it.
+    HLG = k2.compose(H, LG, inner_labels="tokens")
+
+    logging.info("Connecting LG")
+    HLG = k2.connect(HLG)
+
+    logging.info("Arc sorting LG")
+    HLG = k2.arc_sort(HLG)
+    logging.info(f"HLG.shape: {HLG.shape}")
+
+    return HLG
+
+
+def main():
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
+
+    if (lang_dir / "HLG.pt").is_file():
+        logging.info(f"{lang_dir}/HLG.pt already exists - skipping")
+        return
+
+    logging.info(f"Processing {lang_dir}")
+
+    HLG = compile_HLG(lang_dir, args.lm)
+    logging.info(f"Saving HLG.pt to {lang_dir}")
+    torch.save(HLG.as_dict(), f"{lang_dir}/HLG.pt")
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    main()
--- a/egs/commonvoice/ASR/local/compile_lg.py
+++ b/egs/commonvoice/ASR/local/compile_lg.py
@ -1 +0,0 @@
-../../../librispeech/ASR/local/compile_lg.py
--- a/egs/commonvoice/ASR/local/compile_lg.py
+++ b/egs/commonvoice/ASR/local/compile_lg.py
@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+# Copyright    2021-2024  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                            Kang Wei,
+#                                                         Zengrui Jin,)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This script takes as input lang_dir and generates LG from
+
+    - L, the lexicon, built from lang_dir/L_disambig.pt
+
+        Caution: We use a lexicon that contains disambiguation symbols
+
+    - G, the LM, built from lang_dir/lm/G_3_gram.fst.txt
+
+The generated LG is saved in $lang_dir/LG.pt
+"""
+import argparse
+import logging
+from pathlib import Path
+
+import k2
+import torch
+
+from icefall.lexicon import Lexicon
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        help="""Input and output directory.
+        """,
+    )
+    parser.add_argument(
+        "--lm",
+        type=str,
+        default="G_3_gram",
+        help="""Stem name for LM used in HLG compiling.
+        """,
+    )
+
+    return parser.parse_args()
+
+
+def compile_LG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
+    """
+    Args:
+      lang_dir:
+        The language directory, e.g., data/lang_phone or data/lang_bpe_5000.
+
+    Return:
+      An FSA representing LG.
+    """
+    lexicon = Lexicon(lang_dir)
+    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
+
+    if Path(f"{lang_dir}/lm/{lm}.pt").is_file():
+        logging.info(f"Loading pre-compiled {lm}")
+        d = torch.load(f"{lang_dir}/lm/{lm}.pt")
+        G = k2.Fsa.from_dict(d)
+    else:
+        logging.info(f"Loading {lm}.fst.txt")
+        with open(f"{lang_dir}/lm/{lm}.fst.txt") as f:
+            G = k2.Fsa.from_openfst(f.read(), acceptor=False)
+            torch.save(G.as_dict(), f"{lang_dir}/lm/{lm}.pt")
+
+    first_token_disambig_id = lexicon.token_table["#0"]
+    first_word_disambig_id = lexicon.word_table["#0"]
+
+    L = k2.arc_sort(L)
+    G = k2.arc_sort(G)
+
+    logging.info("Intersecting L and G")
+    LG = k2.compose(L, G)
+    logging.info(f"LG shape: {LG.shape}")
+
+    logging.info("Connecting LG")
+    LG = k2.connect(LG)
+    logging.info(f"LG shape after k2.connect: {LG.shape}")
+
+    logging.info(type(LG.aux_labels))
+    logging.info("Determinizing LG")
+
+    LG = k2.determinize(LG, k2.DeterminizeWeightPushingType.kLogWeightPushing)
+    logging.info(type(LG.aux_labels))
+
+    logging.info("Connecting LG after k2.determinize")
+    LG = k2.connect(LG)
+
+    logging.info("Removing disambiguation symbols on LG")
+
+    # LG.labels[LG.labels >= first_token_disambig_id] = 0
+    # see https://github.com/k2-fsa/k2/pull/1140
+    labels = LG.labels
+    labels[labels >= first_token_disambig_id] = 0
+    LG.labels = labels
+
+    assert isinstance(LG.aux_labels, k2.RaggedTensor)
+    LG.aux_labels.values[LG.aux_labels.values >= first_word_disambig_id] = 0
+
+    LG = k2.remove_epsilon(LG)
+    logging.info(f"LG shape after k2.remove_epsilon: {LG.shape}")
+
+    LG = k2.connect(LG)
+    LG.aux_labels = LG.aux_labels.remove_values_eq(0)
+
+    logging.info("Arc sorting LG")
+    LG = k2.arc_sort(LG)
+
+    return LG
+
+
+def main():
+    args = get_args()
+    lang_dir = Path(args.lang_dir)
+
+    if (lang_dir / "LG.pt").is_file():
+        logging.info(f"{lang_dir}/LG.pt already exists - skipping")
+        return
+
+    logging.info(f"Processing {lang_dir}")
+
+    LG = compile_LG(lang_dir, args.lm)
+    logging.info(f"Saving LG.pt to {lang_dir}")
+    torch.save(LG.as_dict(), f"{lang_dir}/LG.pt")
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    main()
--- a/egs/commonvoice/ASR/local/preprocess_commonvoice.py
+++ b/egs/commonvoice/ASR/local/preprocess_commonvoice.py
@ -48,8 +48,27 @@ def normalize_text(utt: str, language: str) -> str:
    utt = re.sub("’", "'", utt)
    if language == "en":
        return re.sub(r"[^a-zA-Z\s]", "", utt).upper()
-    if language == "fr":
+    elif language == "fr":
        return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper()
+    elif language == "pl":
+        return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper()
+    elif language == "yue":
+        return (
+            utt.replace(" ", "")
+            .replace("，", "")
+            .replace("。", " ")
+            .replace("？", "")
+            .replace("！", "")
+            .replace("?", "")
+        )
+    else:
+        raise NotImplementedError(
+            f"""
+            Text normalization not implemented for language: {language},
+            please consider implementing it in the local/preprocess_commonvoice.py
+            or raise an issue on GitHub to request it.
+            """
+        )


 def preprocess_commonvoice(
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7/asr_datamodule.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7/asr_datamodule.py
@ -381,9 +381,11 @@ class CommonVoiceAsrDataModule:
    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
-            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
-            if self.args.on_the_fly_feats
-            else eval(self.args.input_strategy)(),
+            input_strategy=(
+                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+                if self.args.on_the_fly_feats
+                else eval(self.args.input_strategy)()
+            ),
            return_cuts=self.args.return_cuts,
        )
        sampler = DynamicBucketingSampler(
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7/onnx_check.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7/onnx_check.py
@ -79,10 +79,10 @@ It will generate the following 3 files inside $repo/exp:
 import argparse
 import logging

-from icefall import is_module_available
+import torch
 from onnx_pretrained import OnnxModel

-import torch
+from icefall import is_module_available


 def get_parser():
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/commonvoice_fr.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/commonvoice_fr.py
@ -31,7 +31,7 @@ from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
    DynamicBucketingSampler,
    K2SpeechRecognitionDataset,
    PrecomputedFeatures,
-    SingleCutSampler,
+    SimpleCutSampler,
    SpecAugment,
 )
 from lhotse.dataset.input_strategies import (  # noqa F401 For AudioSamples
@ -315,8 +315,8 @@ class CommonVoiceAsrDataModule:
                drop_last=self.args.drop_last,
            )
        else:
-            logging.info("Using SingleCutSampler.")
-            train_sampler = SingleCutSampler(
+            logging.info("Using SimpleCutSampler.")
+            train_sampler = SimpleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
@ -383,9 +383,11 @@ class CommonVoiceAsrDataModule:
    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
-            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
-            if self.args.on_the_fly_feats
-            else eval(self.args.input_strategy)(),
+            input_strategy=(
+                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+                if self.args.on_the_fly_feats
+                else eval(self.args.input_strategy)()
+            ),
            return_cuts=self.args.return_cuts,
        )
        sampler = DynamicBucketingSampler(
--- a/egs/csj/ASR/pruned_transducer_stateless7_streaming/export-for-ncnn.py
+++ b/egs/csj/ASR/pruned_transducer_stateless7_streaming/export-for-ncnn.py
@ -70,9 +70,9 @@ import logging
 from pathlib import Path

 import torch
+from do_not_use_it_directly import add_model_arguments, get_params, get_transducer_model
 from scaling_converter import convert_scaled_to_non_scaled
 from tokenizer import Tokenizer
-from do_not_use_it_directly import add_model_arguments, get_params, get_transducer_model

 from icefall.checkpoint import (
    average_checkpoints,
--- a/egs/gigaspeech/ASR/local/preprocess_gigaspeech.py
+++ b/egs/gigaspeech/ASR/local/preprocess_gigaspeech.py
@ -23,6 +23,7 @@ from pathlib import Path

 from lhotse import CutSet, SupervisionSegment
 from lhotse.recipes.utils import read_manifests_if_cached
+
 from icefall.utils import str2bool

 # Similar text filtering and normalization procedure as in:
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/decode.py
@ -76,6 +76,7 @@ from beam_search import (
 )
 from gigaspeech_scoring import asr_text_post_processing
 from train import get_params, get_transducer_model
+
 from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
--- a/egs/gigaspeech/ASR/zipformer/ctc_decode.py
+++ b/egs/gigaspeech/ASR/zipformer/ctc_decode.py
@ -88,7 +88,7 @@ import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import GigaSpeechAsrDataModule
-from train import add_model_arguments, get_params, get_model
+from train import add_model_arguments, get_model, get_params

 from icefall.checkpoint import (
    average_checkpoints,
--- a/egs/gigaspeech/ASR/zipformer/streaming_decode.py
+++ b/egs/gigaspeech/ASR/zipformer/streaming_decode.py
@ -51,7 +51,7 @@ from streaming_beam_search import (
 )
 from torch import Tensor, nn
 from torch.nn.utils.rnn import pad_sequence
-from train import add_model_arguments, get_params, get_model
+from train import add_model_arguments, get_model, get_params

 from icefall.checkpoint import (
    average_checkpoints,
--- a/egs/gigaspeech/KWS/zipformer/decode.py
+++ b/egs/gigaspeech/KWS/zipformer/decode.py
@ -42,12 +42,10 @@ import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import GigaSpeechAsrDataModule
-from beam_search import (
-    keywords_search,
-)
+from beam_search import keywords_search
+from lhotse.cut import Cut
 from train import add_model_arguments, get_model, get_params

-from lhotse.cut import Cut
 from icefall import ContextGraph
 from icefall.checkpoint import (
    average_checkpoints,
--- a/egs/gigaspeech/KWS/zipformer/finetune.py
+++ b/egs/gigaspeech/KWS/zipformer/finetune.py
@ -76,6 +76,20 @@ from torch import Tensor
 from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter
+from train import (
+    add_model_arguments,
+    add_training_arguments,
+    compute_loss,
+    compute_validation_loss,
+    display_and_save_batch,
+    get_adjusted_batch_count,
+    get_model,
+    get_params,
+    load_checkpoint_if_available,
+    save_checkpoint,
+    scan_pessimistic_batches_for_oom,
+    set_batch_count,
+)

 from icefall import diagnostics
 from icefall.checkpoint import remove_checkpoints
@ -95,21 +109,6 @@ from icefall.utils import (
    str2bool,
 )

-from train import (
-    add_model_arguments,
-    add_training_arguments,
-    compute_loss,
-    compute_validation_loss,
-    display_and_save_batch,
-    get_adjusted_batch_count,
-    get_model,
-    get_params,
-    load_checkpoint_if_available,
-    save_checkpoint,
-    scan_pessimistic_batches_for_oom,
-    set_batch_count,
-)
-
 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]


--- a/egs/libriheavy/ASR/zipformer_prompt_asr/asr_datamodule.py
+++ b/egs/libriheavy/ASR/zipformer_prompt_asr/asr_datamodule.py
@ -425,9 +425,11 @@ class LibriHeavyAsrDataModule:
    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
-            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
-            if self.args.on_the_fly_feats
-            else PrecomputedFeatures(),
+            input_strategy=(
+                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+                if self.args.on_the_fly_feats
+                else PrecomputedFeatures()
+            ),
            return_cuts=self.args.return_cuts,
        )
        sampler = DynamicBucketingSampler(
--- a/egs/librispeech/ASR/README.md
+++ b/egs/librispeech/ASR/README.md
@ -35,6 +35,7 @@ The following table lists the differences among them.
 | `lstm_transducer_stateless2`          | LSTM | Embedding + Conv1d | Using LSTM with mechanisms in reworked model + gigaspeech (multi-dataset setup) |
 | `lstm_transducer_stateless3`          | LSTM | Embedding + Conv1d | Using LSTM with mechanisms in reworked model + gradient filter + delay penalty |
 | `zipformer`                           | Upgraded Zipformer | Embedding + Conv1d | The latest recipe |
+| `zipformer_adapter`                           | Upgraded Zipformer | Embedding + Conv1d | It supports domain adaptation of Zipformer using parameter efficient adapters |

 The decoder in `transducer_stateless` is modified from the paper
 [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
--- a/egs/librispeech/ASR/conformer_ctc3/test_model.py
+++ b/egs/librispeech/ASR/conformer_ctc3/test_model.py
@ -24,8 +24,7 @@ To run this file, do:
 """

 import torch
-
-from train import get_params, get_ctc_model
+from train import get_ctc_model, get_params


 def test_model():
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/export-onnx.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/export-onnx.py
@ -59,9 +59,9 @@ import onnx
 import torch
 import torch.nn as nn
 from decoder import Decoder
+from do_not_use_it_directly import add_model_arguments, get_params, get_transducer_model
 from emformer import Emformer
 from scaling_converter import convert_scaled_to_non_scaled
-from do_not_use_it_directly import add_model_arguments, get_params, get_transducer_model

 from icefall.checkpoint import (
    average_checkpoints,
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/jit_pretrained.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/jit_pretrained.py
@ -39,7 +39,7 @@ Usage of this script:
 import argparse
 import logging
 import math
-from typing import List
+from typing import List, Optional

 import kaldifeat
 import sentencepiece as spm
@ -47,7 +47,6 @@ import torch
 import torchaudio
 from kaldifeat import FbankOptions, OnlineFbank, OnlineFeature
 from torch.nn.utils.rnn import pad_sequence
-from typing import Optional, List


 def get_parser():
--- a/egs/librispeech/ASR/long_file_recog/recognize.py
+++ b/egs/librispeech/ASR/long_file_recog/recognize.py
@ -31,28 +31,28 @@ https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stat
 """

 import argparse
-import torch.multiprocessing as mp
-import torch
-import torch.nn as nn
 import logging
 from concurrent.futures import ThreadPoolExecutor
-from typing import List, Optional, Tuple
-
 from pathlib import Path
+from typing import List, Optional, Tuple

 import k2
 import sentencepiece as spm
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
 from asr_datamodule import AsrDataModule
 from beam_search import (
    fast_beam_search_one_best,
    greedy_search_batch,
    modified_beam_search,
 )
-from icefall.utils import AttributeDict, convert_timestamp, setup_logger
 from lhotse import CutSet, load_manifest_lazy
 from lhotse.cut import Cut
-from lhotse.supervision import AlignmentItem
 from lhotse.serialization import SequentialJsonlWriter
+from lhotse.supervision import AlignmentItem
+
+from icefall.utils import AttributeDict, convert_timestamp, setup_logger


 def get_parser():
--- a/egs/librispeech/ASR/lstm_transducer_stateless2/onnx_check.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless2/onnx_check.py
@ -73,12 +73,11 @@ It will generate the following 3 files inside $repo/exp:
 import argparse
 import logging

+import torch
 from onnx_pretrained import OnnxModel

 from icefall import is_module_available

-import torch
-

 def get_parser():
    parser = argparse.ArgumentParser(
--- a/egs/librispeech/ASR/pruned_transducer_stateless/my_profile.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/my_profile.py
@ -22,11 +22,12 @@ Usage: ./pruned_transducer_stateless/my_profile.py

 import argparse
 import logging
+
 import sentencepiece as spm
 import torch
+from train import add_model_arguments, get_encoder_model, get_params

 from icefall.profiler import get_model_profile
-from train import get_encoder_model, add_model_arguments, get_params


 def get_parser():
--- a/egs/librispeech/ASR/pruned_transducer_stateless/onnx_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/onnx_decode.py
@ -75,8 +75,7 @@ import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-
-from onnx_pretrained import greedy_search, OnnxModel
+from onnx_pretrained import OnnxModel, greedy_search

 from icefall.utils import setup_logger, store_transcripts, write_error_stats

--- a/egs/librispeech/ASR/pruned_transducer_stateless3/onnx_check.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/onnx_check.py
@ -78,10 +78,10 @@ It will generate the following 3 files inside $repo/exp:
 import argparse
 import logging

-from icefall import is_module_available
+import torch
 from onnx_pretrained import OnnxModel

-import torch
+from icefall import is_module_available


 def get_parser():
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/onnx_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/onnx_decode.py
@ -76,8 +76,7 @@ import torch
 import torch.nn as nn
 from asr_datamodule import AsrDataModule
 from librispeech import LibriSpeech
-
-from onnx_pretrained import greedy_search, OnnxModel
+from onnx_pretrained import OnnxModel, greedy_search

 from icefall.utils import setup_logger, store_transcripts, write_error_stats

--- a/egs/librispeech/ASR/pruned_transducer_stateless4/my_profile.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless4/my_profile.py
@ -22,15 +22,15 @@ Usage: ./pruned_transducer_stateless4/my_profile.py

 import argparse
 import logging
+from typing import Tuple
+
 import sentencepiece as spm
 import torch
-
-from typing import Tuple
+from scaling import BasicNorm, DoubleSwish
 from torch import Tensor, nn
+from train import add_model_arguments, get_encoder_model, get_joiner_model, get_params

 from icefall.profiler import get_model_profile
-from scaling import BasicNorm, DoubleSwish
-from train import get_encoder_model, get_joiner_model, add_model_arguments, get_params


 def get_parser():
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/onnx_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/onnx_decode.py
@ -82,8 +82,7 @@ import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-
-from onnx_pretrained import greedy_search, OnnxModel
+from onnx_pretrained import OnnxModel, greedy_search

 from icefall.utils import setup_logger, store_transcripts, write_error_stats

--- a/egs/librispeech/ASR/pruned_transducer_stateless7/alignment.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/alignment.py
@ -20,7 +20,6 @@ from typing import List

 import k2
 import torch
-
 from beam_search import Hypothesis, HypothesisList, get_hyps_shape

 # The force alignment problem can be formulated as finding
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/decode_gigaspeech.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/decode_gigaspeech.py
@ -107,9 +107,6 @@ import k2
 import sentencepiece as spm
 import torch
 import torch.nn as nn
-
-# from asr_datamodule import LibriSpeechAsrDataModule
-from gigaspeech import GigaSpeechAsrDataModule
 from beam_search import (
    beam_search,
    fast_beam_search_nbest,
@ -120,6 +117,9 @@ from beam_search import (
    greedy_search_batch,
    modified_beam_search,
 )
+
+# from asr_datamodule import LibriSpeechAsrDataModule
+from gigaspeech import GigaSpeechAsrDataModule
 from gigaspeech_scoring import asr_text_post_processing
 from train import add_model_arguments, get_params, get_transducer_model

--- a/egs/librispeech/ASR/pruned_transducer_stateless7/generate_model_from_checkpoint.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/generate_model_from_checkpoint.py
@ -65,16 +65,15 @@ from typing import Dict, List

 import sentencepiece as spm
 import torch
-
 from train import add_model_arguments, get_params, get_transducer_model

-from icefall.utils import str2bool
 from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
 )
+from icefall.utils import str2bool


 def get_parser():
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/my_profile.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/my_profile.py
@ -22,15 +22,15 @@ Usage: ./pruned_transducer_stateless7/my_profile.py

 import argparse
 import logging
+from typing import Tuple
+
 import sentencepiece as spm
 import torch
-
-from typing import Tuple
+from scaling import BasicNorm, DoubleSwish
 from torch import Tensor, nn
+from train import add_model_arguments, get_encoder_model, get_joiner_model, get_params

 from icefall.profiler import get_model_profile
-from scaling import BasicNorm, DoubleSwish
-from train import get_encoder_model, get_joiner_model, add_model_arguments, get_params


 def get_parser():
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/onnx_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/onnx_decode.py
@ -75,8 +75,7 @@ import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-
-from onnx_pretrained import greedy_search, OnnxModel
+from onnx_pretrained import OnnxModel, greedy_search

 from icefall.utils import setup_logger, store_transcripts, write_error_stats

--- a/egs/librispeech/ASR/pruned_transducer_stateless7/test_model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/test_model.py
@ -24,7 +24,6 @@ To run this file, do:
 """

 import torch
-
 from scaling_converter import convert_scaled_to_non_scaled
 from train import get_params, get_transducer_model

--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/ctc_guide_decode_bs.py
@ -118,8 +118,8 @@ from beam_search import (
    greedy_search_batch,
    modified_beam_search,
 )
-from train import add_model_arguments, get_params, get_transducer_model
 from torch.nn.utils.rnn import pad_sequence
+from train import add_model_arguments, get_params, get_transducer_model

 from icefall.checkpoint import (
    average_checkpoints,
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/lconv.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/lconv.py
@ -18,10 +18,7 @@ from typing import List, Optional, Tuple, Union

 import torch
 import torch.nn as nn
-from scaling import (
-    ActivationBalancer,
-    ScaledConv1d,
-)
+from scaling import ActivationBalancer, ScaledConv1d


 class LConv(nn.Module):
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/onnx_pretrained.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/onnx_pretrained.py
@ -52,7 +52,7 @@ import onnxruntime as ort
 import sentencepiece as spm
 import torch
 import torchaudio
-from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
+from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence

 from icefall.utils import make_pad_mask

--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/onnx_wrapper.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/onnx_wrapper.py
@ -14,6 +14,7 @@

 import torch
 from torch import nn
+
 from icefall.utils import make_pad_mask


--- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/ncnn_custom_layer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/ncnn_custom_layer.py
@ -4,7 +4,6 @@
 import ncnn
 import numpy as np

-
 layer_list = []


--- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/streaming-ncnn-decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/streaming-ncnn-decode.py
@ -42,7 +42,6 @@ import ncnn
 import torch
 import torchaudio
 from kaldifeat import FbankOptions, OnlineFbank, OnlineFeature
-
 from ncnn_custom_layer import RegisterCustomLayers


--- a/egs/librispeech/ASR/tiny_transducer_ctc/decode.py
+++ b/egs/librispeech/ASR/tiny_transducer_ctc/decode.py
@ -1,10 +1,11 @@
 import argparse
 import logging
 import math
+import pprint
 from collections import defaultdict
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple
-import pprint
+
 import k2
 import sentencepiece as spm
 import torch
--- a/egs/librispeech/ASR/zipformer/ctc_decode.py
+++ b/egs/librispeech/ASR/zipformer/ctc_decode.py
@ -88,7 +88,7 @@ import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-from train import add_model_arguments, get_params, get_model
+from train import add_model_arguments, get_model, get_params

 from icefall.checkpoint import (
    average_checkpoints,
--- a/egs/librispeech/ASR/zipformer/model.py
+++ b/egs/librispeech/ASR/zipformer/model.py
@ -22,9 +22,9 @@ import k2
 import torch
 import torch.nn as nn
 from encoder_interface import EncoderInterface
+from scaling import ScaledLinear

 from icefall.utils import add_sos, make_pad_mask
-from scaling import ScaledLinear


 class AsrModel(nn.Module):
--- a/egs/librispeech/ASR/zipformer/my_profile.py
+++ b/egs/librispeech/ASR/zipformer/my_profile.py
@ -22,24 +22,24 @@ Usage: ./zipformer/my_profile.py

 import argparse
 import logging
+from typing import Tuple
+
 import sentencepiece as spm
 import torch
-
-from typing import Tuple
-from torch import Tensor, nn
-
-from icefall.utils import make_pad_mask
-from icefall.profiler import get_model_profile
 from scaling import BiasNorm
+from torch import Tensor, nn
 from train import (
+    add_model_arguments,
    get_encoder_embed,
    get_encoder_model,
    get_joiner_model,
-    add_model_arguments,
    get_params,
 )
 from zipformer import BypassModule

+from icefall.profiler import get_model_profile
+from icefall.utils import make_pad_mask
+

 def get_parser():
    parser = argparse.ArgumentParser(
--- a/egs/librispeech/ASR/zipformer/onnx_decode.py
+++ b/egs/librispeech/ASR/zipformer/onnx_decode.py
@ -77,11 +77,10 @@ from typing import List, Tuple
 import torch
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-
-from onnx_pretrained import greedy_search, OnnxModel
+from k2 import SymbolTable
+from onnx_pretrained import OnnxModel, greedy_search

 from icefall.utils import setup_logger, store_transcripts, write_error_stats
-from k2 import SymbolTable


 def get_parser():
--- a/egs/librispeech/ASR/zipformer/onnx_pretrained_ctc_H.py
+++ b/egs/librispeech/ASR/zipformer/onnx_pretrained_ctc_H.py
@ -27,11 +27,10 @@ https://huggingface.co/csukuangfj/sherpa-onnx-zipformer-ctc-en-2023-10-02
 import argparse
 import logging
 import math
-from typing import List, Tuple
+from typing import Dict, List, Tuple

 import k2
 import kaldifeat
-from typing import Dict
 import kaldifst
 import onnxruntime as ort
 import torch
--- a/egs/librispeech/ASR/zipformer/onnx_pretrained_ctc_HL.py
+++ b/egs/librispeech/ASR/zipformer/onnx_pretrained_ctc_HL.py
@ -27,11 +27,10 @@ https://huggingface.co/csukuangfj/sherpa-onnx-zipformer-ctc-en-2023-10-02
 import argparse
 import logging
 import math
-from typing import List, Tuple
+from typing import Dict, List, Tuple

 import k2
 import kaldifeat
-from typing import Dict
 import kaldifst
 import onnxruntime as ort
 import torch
--- a/egs/librispeech/ASR/zipformer/onnx_pretrained_ctc_HLG.py
+++ b/egs/librispeech/ASR/zipformer/onnx_pretrained_ctc_HLG.py
@ -27,11 +27,10 @@ https://huggingface.co/csukuangfj/sherpa-onnx-zipformer-ctc-en-2023-10-02
 import argparse
 import logging
 import math
-from typing import List, Tuple
+from typing import Dict, List, Tuple

 import k2
 import kaldifeat
-from typing import Dict
 import kaldifst
 import onnxruntime as ort
 import torch
--- a/egs/librispeech/ASR/zipformer/scaling.py
+++ b/egs/librispeech/ASR/zipformer/scaling.py
@ -15,15 +15,16 @@
 # limitations under the License.


-from typing import Optional, Tuple, Union
 import logging
-import k2
-from torch.cuda.amp import custom_fwd, custom_bwd
-import random
-import torch
 import math
+import random
+from typing import Optional, Tuple, Union
+
+import k2
+import torch
 import torch.nn as nn
 from torch import Tensor
+from torch.cuda.amp import custom_bwd, custom_fwd


 def logaddexp_onnx(x: Tensor, y: Tensor) -> Tensor:
--- a/egs/librispeech/ASR/zipformer/streaming_decode.py
+++ b/egs/librispeech/ASR/zipformer/streaming_decode.py
@ -51,7 +51,7 @@ from streaming_beam_search import (
 )
 from torch import Tensor, nn
 from torch.nn.utils.rnn import pad_sequence
-from train import add_model_arguments, get_params, get_model
+from train import add_model_arguments, get_model, get_params

 from icefall.checkpoint import (
    average_checkpoints,
--- a/egs/librispeech/ASR/zipformer/subsampling.py
+++ b/egs/librispeech/ASR/zipformer/subsampling.py
@ -16,11 +16,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Tuple
 import warnings
+from typing import Tuple

 import torch
-from torch import Tensor, nn
 from scaling import (
    Balancer,
    BiasNorm,
@ -34,6 +33,7 @@ from scaling import (
    SwooshR,
    Whiten,
 )
+from torch import Tensor, nn


 class ConvNeXt(nn.Module):
--- a/egs/librispeech/ASR/zipformer_adapter/asr_datamodule.py
+++ b/egs/librispeech/ASR/zipformer_adapter/asr_datamodule.py
@ -0,0 +1 @@
+../tdnn_lstm_ctc/asr_datamodule.py
--- a/egs/librispeech/ASR/zipformer_adapter/beam_search.py
+++ b/egs/librispeech/ASR/zipformer_adapter/beam_search.py
@ -0,0 +1 @@
+../pruned_transducer_stateless2/beam_search.py
--- a/egs/librispeech/ASR/zipformer_adapter/decode.py
+++ b/egs/librispeech/ASR/zipformer_adapter/decode.py
--- a/egs/librispeech/ASR/zipformer_adapter/decode_gigaspeech.py
+++ b/egs/librispeech/ASR/zipformer_adapter/decode_gigaspeech.py
--- a/egs/librispeech/ASR/zipformer_adapter/decoder.py
+++ b/egs/librispeech/ASR/zipformer_adapter/decoder.py
@ -0,0 +1 @@
+../zipformer/decoder.py
--- a/egs/librispeech/ASR/zipformer_adapter/encoder_interface.py
+++ b/egs/librispeech/ASR/zipformer_adapter/encoder_interface.py
@ -0,0 +1 @@
+../transducer_stateless/encoder_interface.py
--- a/egs/librispeech/ASR/zipformer_adapter/export-onnx.py
+++ b/egs/librispeech/ASR/zipformer_adapter/export-onnx.py
@ -0,0 +1,621 @@
+#!/usr/bin/env python3
+#
+# Copyright 2023 Xiaomi Corporation (Author: Fangjun Kuang, Wei Kang)
+# Copyright 2023 Danqing Fu (danqing.fu@gmail.com)
+
+"""
+This script exports a transducer model from PyTorch to ONNX.
+
+We use the pre-trained model from
+https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+as an example to show how to use this file.
+
+1. Download the pre-trained model
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "exp/pretrained.pt"
+
+cd exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+2. Export the model to ONNX
+
+./zipformer/export-onnx.py \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  --use-averaged-model 0 \
+  --epoch 99 \
+  --avg 1 \
+  --exp-dir $repo/exp \
+  --num-encoder-layers "2,2,3,4,3,2" \
+  --downsampling-factor "1,2,4,8,4,2" \
+  --feedforward-dim "512,768,1024,1536,1024,768" \
+  --num-heads "4,4,4,8,4,4" \
+  --encoder-dim "192,256,384,512,384,256" \
+  --query-head-dim 32 \
+  --value-head-dim 12 \
+  --pos-head-dim 4 \
+  --pos-dim 48 \
+  --encoder-unmasked-dim "192,192,256,256,256,192" \
+  --cnn-module-kernel "31,31,15,15,15,31" \
+  --decoder-dim 512 \
+  --joiner-dim 512 \
+  --causal False \
+  --chunk-size "16,32,64,-1" \
+  --left-context-frames "64,128,256,-1"
+
+It will generate the following 3 files inside $repo/exp:
+
+  - encoder-epoch-99-avg-1.onnx
+  - decoder-epoch-99-avg-1.onnx
+  - joiner-epoch-99-avg-1.onnx
+
+See ./onnx_pretrained.py and ./onnx_check.py for how to
+use the exported ONNX models.
+"""
+
+import argparse
+import logging
+from pathlib import Path
+from typing import Dict, Tuple
+
+import k2
+import onnx
+import torch
+import torch.nn as nn
+from decoder import Decoder
+from onnxruntime.quantization import QuantType, quantize_dynamic
+from scaling_converter import convert_scaled_to_non_scaled
+from train import add_finetune_arguments, add_model_arguments, get_model, get_params
+from zipformer import Zipformer2
+
+from icefall.checkpoint import (
+    average_checkpoints,
+    average_checkpoints_with_averaged_model,
+    find_checkpoints,
+    load_checkpoint,
+)
+from icefall.utils import make_pad_mask, num_tokens, str2bool
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=28,
+        help="""It specifies the checkpoint to use for averaging.
+        Note: Epoch counts from 0.
+        You can specify --avg to use more checkpoints for model averaging.""",
+    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=15,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch' and '--iter'",
+    )
+
+    parser.add_argument(
+        "--use-averaged-model",
+        type=str2bool,
+        default=True,
+        help="Whether to load averaged model. Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="zipformer/exp",
+        help="""It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        default="data/lang_bpe_500/tokens.txt",
+        help="Path to the tokens.txt",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+
+    add_model_arguments(parser)
+    add_finetune_arguments(parser)
+
+    return parser
+
+
+def add_meta_data(filename: str, meta_data: Dict[str, str]):
+    """Add meta data to an ONNX model. It is changed in-place.
+
+    Args:
+      filename:
+        Filename of the ONNX model to be changed.
+      meta_data:
+        Key-value pairs.
+    """
+    model = onnx.load(filename)
+    for key, value in meta_data.items():
+        meta = model.metadata_props.add()
+        meta.key = key
+        meta.value = value
+
+    onnx.save(model, filename)
+
+
+class OnnxEncoder(nn.Module):
+    """A wrapper for Zipformer and the encoder_proj from the joiner"""
+
+    def __init__(
+        self, encoder: Zipformer2, encoder_embed: nn.Module, encoder_proj: nn.Linear
+    ):
+        """
+        Args:
+          encoder:
+            A Zipformer encoder.
+          encoder_proj:
+            The projection layer for encoder from the joiner.
+        """
+        super().__init__()
+        self.encoder = encoder
+        self.encoder_embed = encoder_embed
+        self.encoder_proj = encoder_proj
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Please see the help information of Zipformer.forward
+
+        Args:
+          x:
+            A 3-D tensor of shape (N, T, C)
+          x_lens:
+            A 1-D tensor of shape (N,). Its dtype is torch.int64
+        Returns:
+          Return a tuple containing:
+            - encoder_out, A 3-D tensor of shape (N, T', joiner_dim)
+            - encoder_out_lens, A 1-D tensor of shape (N,)
+        """
+        x, x_lens = self.encoder_embed(x, x_lens)
+        src_key_padding_mask = make_pad_mask(x_lens)
+        x = x.permute(1, 0, 2)
+        encoder_out, encoder_out_lens = self.encoder(x, x_lens, src_key_padding_mask)
+        encoder_out = encoder_out.permute(1, 0, 2)
+        encoder_out = self.encoder_proj(encoder_out)
+        # Now encoder_out is of shape (N, T, joiner_dim)
+
+        return encoder_out, encoder_out_lens
+
+
+class OnnxDecoder(nn.Module):
+    """A wrapper for Decoder and the decoder_proj from the joiner"""
+
+    def __init__(self, decoder: Decoder, decoder_proj: nn.Linear):
+        super().__init__()
+        self.decoder = decoder
+        self.decoder_proj = decoder_proj
+
+    def forward(self, y: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+          y:
+            A 2-D tensor of shape (N, context_size).
+        Returns
+          Return a 2-D tensor of shape (N, joiner_dim)
+        """
+        need_pad = False
+        decoder_output = self.decoder(y, need_pad=need_pad)
+        decoder_output = decoder_output.squeeze(1)
+        output = self.decoder_proj(decoder_output)
+
+        return output
+
+
+class OnnxJoiner(nn.Module):
+    """A wrapper for the joiner"""
+
+    def __init__(self, output_linear: nn.Linear):
+        super().__init__()
+        self.output_linear = output_linear
+
+    def forward(
+        self,
+        encoder_out: torch.Tensor,
+        decoder_out: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+          encoder_out:
+            A 2-D tensor of shape (N, joiner_dim)
+          decoder_out:
+            A 2-D tensor of shape (N, joiner_dim)
+        Returns:
+          Return a 2-D tensor of shape (N, vocab_size)
+        """
+        logit = encoder_out + decoder_out
+        logit = self.output_linear(torch.tanh(logit))
+        return logit
+
+
+def export_encoder_model_onnx(
+    encoder_model: OnnxEncoder,
+    encoder_filename: str,
+    opset_version: int = 11,
+) -> None:
+    """Export the given encoder model to ONNX format.
+    The exported model has two inputs:
+
+        - x, a tensor of shape (N, T, C); dtype is torch.float32
+        - x_lens, a tensor of shape (N,); dtype is torch.int64
+
+    and it has two outputs:
+
+        - encoder_out, a tensor of shape (N, T', joiner_dim)
+        - encoder_out_lens, a tensor of shape (N,)
+
+    Args:
+      encoder_model:
+        The input encoder model
+      encoder_filename:
+        The filename to save the exported ONNX model.
+      opset_version:
+        The opset version to use.
+    """
+    x = torch.zeros(1, 100, 80, dtype=torch.float32)
+    x_lens = torch.tensor([100], dtype=torch.int64)
+
+    encoder_model = torch.jit.trace(encoder_model, (x, x_lens))
+
+    torch.onnx.export(
+        encoder_model,
+        (x, x_lens),
+        encoder_filename,
+        verbose=False,
+        opset_version=opset_version,
+        input_names=["x", "x_lens"],
+        output_names=["encoder_out", "encoder_out_lens"],
+        dynamic_axes={
+            "x": {0: "N", 1: "T"},
+            "x_lens": {0: "N"},
+            "encoder_out": {0: "N", 1: "T"},
+            "encoder_out_lens": {0: "N"},
+        },
+    )
+
+    meta_data = {
+        "model_type": "zipformer2",
+        "version": "1",
+        "model_author": "k2-fsa",
+        "comment": "non-streaming zipformer2",
+    }
+    logging.info(f"meta_data: {meta_data}")
+
+    add_meta_data(filename=encoder_filename, meta_data=meta_data)
+
+
+def export_decoder_model_onnx(
+    decoder_model: OnnxDecoder,
+    decoder_filename: str,
+    opset_version: int = 11,
+) -> None:
+    """Export the decoder model to ONNX format.
+
+    The exported model has one input:
+
+        - y: a torch.int64 tensor of shape (N, decoder_model.context_size)
+
+    and has one output:
+
+        - decoder_out: a torch.float32 tensor of shape (N, joiner_dim)
+
+    Args:
+      decoder_model:
+        The decoder model to be exported.
+      decoder_filename:
+        Filename to save the exported ONNX model.
+      opset_version:
+        The opset version to use.
+    """
+    context_size = decoder_model.decoder.context_size
+    vocab_size = decoder_model.decoder.vocab_size
+
+    y = torch.zeros(10, context_size, dtype=torch.int64)
+    decoder_model = torch.jit.script(decoder_model)
+    torch.onnx.export(
+        decoder_model,
+        y,
+        decoder_filename,
+        verbose=False,
+        opset_version=opset_version,
+        input_names=["y"],
+        output_names=["decoder_out"],
+        dynamic_axes={
+            "y": {0: "N"},
+            "decoder_out": {0: "N"},
+        },
+    )
+
+    meta_data = {
+        "context_size": str(context_size),
+        "vocab_size": str(vocab_size),
+    }
+    add_meta_data(filename=decoder_filename, meta_data=meta_data)
+
+
+def export_joiner_model_onnx(
+    joiner_model: nn.Module,
+    joiner_filename: str,
+    opset_version: int = 11,
+) -> None:
+    """Export the joiner model to ONNX format.
+    The exported joiner model has two inputs:
+
+        - encoder_out: a tensor of shape (N, joiner_dim)
+        - decoder_out: a tensor of shape (N, joiner_dim)
+
+    and produces one output:
+
+        - logit: a tensor of shape (N, vocab_size)
+    """
+    joiner_dim = joiner_model.output_linear.weight.shape[1]
+    logging.info(f"joiner dim: {joiner_dim}")
+
+    projected_encoder_out = torch.rand(11, joiner_dim, dtype=torch.float32)
+    projected_decoder_out = torch.rand(11, joiner_dim, dtype=torch.float32)
+
+    torch.onnx.export(
+        joiner_model,
+        (projected_encoder_out, projected_decoder_out),
+        joiner_filename,
+        verbose=False,
+        opset_version=opset_version,
+        input_names=[
+            "encoder_out",
+            "decoder_out",
+        ],
+        output_names=["logit"],
+        dynamic_axes={
+            "encoder_out": {0: "N"},
+            "decoder_out": {0: "N"},
+            "logit": {0: "N"},
+        },
+    )
+    meta_data = {
+        "joiner_dim": str(joiner_dim),
+    }
+    add_meta_data(filename=joiner_filename, meta_data=meta_data)
+
+
+@torch.no_grad()
+def main():
+    args = get_parser().parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    params = get_params()
+    params.update(vars(args))
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"device: {device}")
+
+    token_table = k2.SymbolTable.from_file(params.tokens)
+    params.blank_id = token_table["<blk>"]
+    params.vocab_size = num_tokens(token_table) + 1
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_model(params)
+
+    model.to(device)
+
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+
+    model.to("cpu")
+    model.eval()
+
+    convert_scaled_to_non_scaled(model, inplace=True, is_onnx=True)
+
+    encoder = OnnxEncoder(
+        encoder=model.encoder,
+        encoder_embed=model.encoder_embed,
+        encoder_proj=model.joiner.encoder_proj,
+    )
+
+    decoder = OnnxDecoder(
+        decoder=model.decoder,
+        decoder_proj=model.joiner.decoder_proj,
+    )
+
+    joiner = OnnxJoiner(output_linear=model.joiner.output_linear)
+
+    encoder_num_param = sum([p.numel() for p in encoder.parameters()])
+    decoder_num_param = sum([p.numel() for p in decoder.parameters()])
+    joiner_num_param = sum([p.numel() for p in joiner.parameters()])
+    total_num_param = encoder_num_param + decoder_num_param + joiner_num_param
+    logging.info(f"encoder parameters: {encoder_num_param}")
+    logging.info(f"decoder parameters: {decoder_num_param}")
+    logging.info(f"joiner parameters: {joiner_num_param}")
+    logging.info(f"total parameters: {total_num_param}")
+
+    if params.iter > 0:
+        suffix = f"iter-{params.iter}"
+    else:
+        suffix = f"epoch-{params.epoch}"
+
+    suffix += f"-avg-{params.avg}"
+
+    opset_version = 13
+
+    logging.info("Exporting encoder")
+    encoder_filename = params.exp_dir / f"encoder-{suffix}.onnx"
+    export_encoder_model_onnx(
+        encoder,
+        encoder_filename,
+        opset_version=opset_version,
+    )
+    logging.info(f"Exported encoder to {encoder_filename}")
+
+    logging.info("Exporting decoder")
+    decoder_filename = params.exp_dir / f"decoder-{suffix}.onnx"
+    export_decoder_model_onnx(
+        decoder,
+        decoder_filename,
+        opset_version=opset_version,
+    )
+    logging.info(f"Exported decoder to {decoder_filename}")
+
+    logging.info("Exporting joiner")
+    joiner_filename = params.exp_dir / f"joiner-{suffix}.onnx"
+    export_joiner_model_onnx(
+        joiner,
+        joiner_filename,
+        opset_version=opset_version,
+    )
+    logging.info(f"Exported joiner to {joiner_filename}")
+
+    # Generate int8 quantization models
+    # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
+
+    logging.info("Generate int8 quantization models")
+
+    encoder_filename_int8 = params.exp_dir / f"encoder-{suffix}.int8.onnx"
+    quantize_dynamic(
+        model_input=encoder_filename,
+        model_output=encoder_filename_int8,
+        op_types_to_quantize=["MatMul"],
+        weight_type=QuantType.QInt8,
+    )
+
+    decoder_filename_int8 = params.exp_dir / f"decoder-{suffix}.int8.onnx"
+    quantize_dynamic(
+        model_input=decoder_filename,
+        model_output=decoder_filename_int8,
+        op_types_to_quantize=["MatMul", "Gather"],
+        weight_type=QuantType.QInt8,
+    )
+
+    joiner_filename_int8 = params.exp_dir / f"joiner-{suffix}.int8.onnx"
+    quantize_dynamic(
+        model_input=joiner_filename,
+        model_output=joiner_filename_int8,
+        op_types_to_quantize=["MatMul"],
+        weight_type=QuantType.QInt8,
+    )
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    main()
--- a/egs/librispeech/ASR/zipformer_adapter/joiner.py
+++ b/egs/librispeech/ASR/zipformer_adapter/joiner.py
@ -0,0 +1 @@
+../zipformer/joiner.py
--- a/egs/librispeech/ASR/zipformer_adapter/model.py
+++ b/egs/librispeech/ASR/zipformer_adapter/model.py
@ -0,0 +1 @@
+../zipformer/model.py
--- a/egs/librispeech/ASR/zipformer_adapter/onnx_decode.py
+++ b/egs/librispeech/ASR/zipformer_adapter/onnx_decode.py
@ -0,0 +1,386 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang,
+#                                                 Zengwei Yao,
+#                                                 Xiaoyu Yang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script loads ONNX exported models and uses them to decode the test sets.
+
+We use the pre-trained model from
+https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+as an example to show how to use this file.
+
+1. Download the pre-trained model
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/pretrained.pt"
+
+cd exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+2. Export the model to ONNX
+
+./zipformer/export-onnx.py \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  --use-averaged-model 0 \
+  --epoch 99 \
+  --avg 1 \
+  --exp-dir $repo/exp \
+  --causal False
+
+It will generate the following 3 files inside $repo/exp:
+
+  - encoder-epoch-99-avg-1.onnx
+  - decoder-epoch-99-avg-1.onnx
+  - joiner-epoch-99-avg-1.onnx
+
+2. Run this file
+
+./zipformer/onnx_decode.py \
+  --exp-dir $repo/exp \
+  --max-duration 600 \
+  --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+  --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+  --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+"""
+
+
+import argparse
+import logging
+import time
+from pathlib import Path
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+from asr_datamodule import LibriSpeechAsrDataModule
+from k2 import SymbolTable
+from onnx_pretrained import OnnxModel, greedy_search
+
+from icefall.utils import setup_logger, store_transcripts, write_error_stats
+
+conversational_filler = [
+    "UH",
+    "UHH",
+    "UM",
+    "EH",
+    "MM",
+    "HM",
+    "AH",
+    "HUH",
+    "HA",
+    "ER",
+    "OOF",
+    "HEE",
+    "ACH",
+    "EEE",
+    "EW",
+]
+unk_tags = ["<UNK>", "<unk>"]
+gigaspeech_punctuations = [
+    "<COMMA>",
+    "<PERIOD>",
+    "<QUESTIONMARK>",
+    "<EXCLAMATIONPOINT>",
+]
+gigaspeech_garbage_utterance_tags = ["<SIL>", "<NOISE>", "<MUSIC>", "<OTHER>"]
+non_scoring_words = (
+    conversational_filler
+    + unk_tags
+    + gigaspeech_punctuations
+    + gigaspeech_garbage_utterance_tags
+)
+
+
+def asr_text_post_processing(text: str) -> str:
+    # 1. convert to uppercase
+    text = text.upper()
+
+    # 2. remove hyphen
+    #   "E-COMMERCE" -> "E COMMERCE", "STATE-OF-THE-ART" -> "STATE OF THE ART"
+    text = text.replace("-", " ")
+
+    # 3. remove non-scoring words from evaluation
+    remaining_words = []
+    for word in text.split():
+        if word in non_scoring_words:
+            continue
+        remaining_words.append(word)
+
+    return " ".join(remaining_words)
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--encoder-model-filename",
+        type=str,
+        required=True,
+        help="Path to the encoder onnx model. ",
+    )
+
+    parser.add_argument(
+        "--decoder-model-filename",
+        type=str,
+        required=True,
+        help="Path to the decoder onnx model. ",
+    )
+
+    parser.add_argument(
+        "--joiner-model-filename",
+        type=str,
+        required=True,
+        help="Path to the joiner onnx model. ",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="zipformer/exp",
+        help="The experiment dir",
+    )
+
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        help="""Path to tokens.txt.""",
+    )
+
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        help="Valid values are greedy_search and modified_beam_search",
+    )
+
+    return parser
+
+
+def post_processing(
+    results: List[Tuple[str, List[str], List[str]]],
+) -> List[Tuple[str, List[str], List[str]]]:
+    new_results = []
+    for key, ref, hyp in results:
+        new_ref = asr_text_post_processing(" ".join(ref)).split()
+        new_hyp = asr_text_post_processing(" ".join(hyp)).split()
+        new_results.append((key, new_ref, new_hyp))
+    return new_results
+
+
+def decode_one_batch(
+    model: OnnxModel, token_table: SymbolTable, batch: dict
+) -> List[List[str]]:
+    """Decode one batch and return the result.
+    Currently it only greedy_search is supported.
+
+    Args:
+      model:
+        The neural model.
+      token_table:
+        The token table.
+      batch:
+        It is the return value from iterating
+        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
+        for the format of the `batch`.
+
+    Returns:
+      Return the decoded results for each utterance.
+    """
+    feature = batch["inputs"]
+    assert feature.ndim == 3
+    # at entry, feature is (N, T, C)
+
+    supervisions = batch["supervisions"]
+    feature_lens = supervisions["num_frames"].to(dtype=torch.int64)
+
+    encoder_out, encoder_out_lens = model.run_encoder(x=feature, x_lens=feature_lens)
+
+    hyps = greedy_search(
+        model=model, encoder_out=encoder_out, encoder_out_lens=encoder_out_lens
+    )
+
+    def token_ids_to_words(token_ids: List[int]) -> str:
+        text = ""
+        for i in token_ids:
+            text += token_table[i]
+        return text.replace("▁", " ").strip()
+
+    hyps = [token_ids_to_words(h).split() for h in hyps]
+    return hyps
+
+
+def decode_dataset(
+    dl: torch.utils.data.DataLoader,
+    model: nn.Module,
+    token_table: SymbolTable,
+) -> Tuple[List[Tuple[str, List[str], List[str]]], float]:
+    """Decode dataset.
+
+    Args:
+      dl:
+        PyTorch's dataloader containing the dataset to decode.
+      model:
+        The neural model.
+      token_table:
+        The token table.
+
+    Returns:
+      - A list of tuples. Each tuple contains three elements:
+         - cut_id,
+         - reference transcript,
+         - predicted result.
+      - The total duration (in seconds) of the dataset.
+    """
+    num_cuts = 0
+
+    try:
+        num_batches = len(dl)
+    except TypeError:
+        num_batches = "?"
+
+    log_interval = 10
+    total_duration = 0
+
+    results = []
+    for batch_idx, batch in enumerate(dl):
+        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
+        total_duration += sum([cut.duration for cut in batch["supervisions"]["cut"]])
+
+        hyps = decode_one_batch(model=model, token_table=token_table, batch=batch)
+
+        this_batch = []
+        assert len(hyps) == len(texts)
+        for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
+            ref_words = ref_text.split()
+            this_batch.append((cut_id, ref_words, hyp_words))
+
+        results.extend(this_batch)
+
+        num_cuts += len(texts)
+
+        if batch_idx % log_interval == 0:
+            batch_str = f"{batch_idx}/{num_batches}"
+
+            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
+
+    return results, total_duration
+
+
+def save_results(
+    res_dir: Path,
+    test_set_name: str,
+    results: List[Tuple[str, List[str], List[str]]],
+):
+    recog_path = res_dir / f"recogs-{test_set_name}.txt"
+    results = post_processing(results)
+    results = sorted(results)
+    store_transcripts(filename=recog_path, texts=results)
+    logging.info(f"The transcripts are stored in {recog_path}")
+
+    # The following prints out WERs, per-word error statistics and aligned
+    # ref/hyp pairs.
+    errs_filename = res_dir / f"errs-{test_set_name}.txt"
+    with open(errs_filename, "w") as f:
+        wer = write_error_stats(f, f"{test_set_name}", results, enable_log=True)
+
+    logging.info("Wrote detailed error stats to {}".format(errs_filename))
+
+    errs_info = res_dir / f"wer-summary-{test_set_name}.txt"
+    with open(errs_info, "w") as f:
+        print("WER", file=f)
+        print(wer, file=f)
+
+    s = "\nFor {}, WER is {}:\n".format(test_set_name, wer)
+    logging.info(s)
+
+
+@torch.no_grad()
+def main():
+    parser = get_parser()
+    LibriSpeechAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+
+    assert (
+        args.decoding_method == "greedy_search"
+    ), "Only supports greedy_search currently."
+    res_dir = Path(args.exp_dir) / f"onnx-{args.decoding_method}"
+
+    setup_logger(f"{res_dir}/log-decode")
+    logging.info("Decoding started")
+
+    device = torch.device("cpu")
+    logging.info(f"Device: {device}")
+
+    token_table = SymbolTable.from_file(args.tokens)
+
+    logging.info(vars(args))
+
+    logging.info("About to create model")
+    model = OnnxModel(
+        encoder_model_filename=args.encoder_model_filename,
+        decoder_model_filename=args.decoder_model_filename,
+        joiner_model_filename=args.joiner_model_filename,
+    )
+
+    # we need cut ids to display recognition results.
+    args.return_cuts = True
+    librispeech = LibriSpeechAsrDataModule(args)
+
+    gigaspeech_dev_cuts = librispeech.gigaspeech_dev_cuts()
+    gigaspeech_test_cuts = librispeech.gigaspeech_test_cuts()
+
+    dev_dl = librispeech.test_dataloaders(gigaspeech_dev_cuts)
+    test_dl = librispeech.test_dataloaders(gigaspeech_test_cuts)
+
+    test_sets = ["dev", "test"]
+    test_dl = [dev_dl, test_dl]
+
+    for test_set, test_dl in zip(test_sets, test_dl):
+        start_time = time.time()
+        results, total_duration = decode_dataset(
+            dl=test_dl, model=model, token_table=token_table
+        )
+        end_time = time.time()
+        elapsed_seconds = end_time - start_time
+        rtf = elapsed_seconds / total_duration
+
+        logging.info(f"Elapsed time: {elapsed_seconds:.3f} s")
+        logging.info(f"Wave duration: {total_duration:.3f} s")
+        logging.info(
+            f"Real time factor (RTF): {elapsed_seconds:.3f}/{total_duration:.3f} = {rtf:.3f}"
+        )
+
+        save_results(res_dir=res_dir, test_set_name=test_set, results=results)
+
+    logging.info("Done!")
+
+
+if __name__ == "__main__":
+    main()
--- a/egs/librispeech/ASR/zipformer_adapter/onnx_pretrained.py
+++ b/egs/librispeech/ASR/zipformer_adapter/onnx_pretrained.py
@ -0,0 +1 @@
+../zipformer/onnx_pretrained.py
--- a/egs/librispeech/ASR/zipformer_adapter/optim.py
+++ b/egs/librispeech/ASR/zipformer_adapter/optim.py
@ -0,0 +1 @@
+../zipformer/optim.py
--- a/egs/librispeech/ASR/zipformer_adapter/scaling.py
+++ b/egs/librispeech/ASR/zipformer_adapter/scaling.py
@ -0,0 +1 @@
+../zipformer/scaling.py
--- a/egs/librispeech/ASR/zipformer_adapter/scaling_converter.py
+++ b/egs/librispeech/ASR/zipformer_adapter/scaling_converter.py
@ -0,0 +1 @@
+../zipformer/scaling_converter.py
--- a/egs/librispeech/ASR/zipformer_adapter/subsampling.py
+++ b/egs/librispeech/ASR/zipformer_adapter/subsampling.py
@ -0,0 +1 @@
+../zipformer/subsampling.py
--- a/egs/librispeech/ASR/zipformer_adapter/train.py
+++ b/egs/librispeech/ASR/zipformer_adapter/train.py
--- a/egs/librispeech/ASR/zipformer_adapter/zipformer.py
+++ b/egs/librispeech/ASR/zipformer_adapter/zipformer.py
--- a/egs/librispeech/WSASR/conformer_ctc2/train.py
+++ b/egs/librispeech/WSASR/conformer_ctc2/train.py
@ -31,6 +31,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3"
  --exp-dir conformer_ctc2/exp \
  --lang-dir data/lang_bpe_200 \
  --otc-token "<star>" \
+  --feature-dim 768 \
  --allow-bypass-arc true \
  --allow-self-loop-arc true \
  --initial-bypass-weight -19 \
@ -160,6 +161,14 @@ def get_parser():
        """,
    )

+    parser.add_argument(
+        "--feature-dim",
+        type=int,
+        default=768,
+        help="""Number of features extracted in feature extraction stage.last dimension of feature vector.
+        80 when using fbank features and 768 or 1024 whn using wave2vec""",
+    )
+
    parser.add_argument(
        "--initial-lr",
        type=float,
@ -385,7 +394,6 @@ def get_params() -> AttributeDict:
            "valid_interval": 800,  # For the 100h subset, use 800
            "alignment_interval": 25,
            # parameters for conformer
-            "feature_dim": 768,
            "subsampling_factor": 2,
            "encoder_dim": 512,
            "nhead": 8,
--- a/egs/ljspeech/TTS/README.md
+++ b/egs/ljspeech/TTS/README.md
@ -0,0 +1,38 @@
+# Introduction
+
+This is a public domain speech dataset consisting of 13,100 short audio clips of a single speaker reading passages from 7 non-fiction books. 
+A transcription is provided for each clip. 
+Clips vary in length from 1 to 10 seconds and have a total length of approximately 24 hours.
+
+The texts were published between 1884 and 1964, and are in the public domain. 
+The audio was recorded in 2016-17 by the [LibriVox](https://librivox.org/) project and is also in the public domain.
+
+The above information is from the [LJSpeech website](https://keithito.com/LJ-Speech-Dataset/).
+
+# VITS
+
+This recipe provides a VITS model trained on the LJSpeech dataset.
+
+Pretrained model can be found [here](https://huggingface.co/Zengwei/icefall-tts-ljspeech-vits-2024-02-28).
+
+For tutorial and more details, please refer to the [VITS documentation](https://k2-fsa.github.io/icefall/recipes/TTS/ljspeech/vits.html).
+
+The training command is given below:
+```
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+./vits/train.py \
+  --world-size 4 \
+  --num-epochs 1000 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir vits/exp \
+  --max-duration 500
+```
+
+To inference, use:
+```
+./vits/infer.py \
+  --exp-dir vits/exp \
+  --epoch 1000 \
+  --tokens data/tokens.txt
+```
--- a/egs/ljspeech/TTS/local/prepare_token_file.py
+++ b/egs/ljspeech/TTS/local/prepare_token_file.py
@ -17,7 +17,7 @@


 """
-This file reads the texts in given manifest and generates the file that maps tokens to IDs.
+This file generates the file that maps tokens to IDs.
 """

 import argparse
@ -25,80 +25,38 @@ import logging
 from pathlib import Path
 from typing import Dict

-from lhotse import load_manifest
+from piper_phonemize import get_espeak_map


 def get_args():
    parser = argparse.ArgumentParser()

-    parser.add_argument(
-        "--manifest-file",
-        type=Path,
-        default=Path("data/spectrogram/ljspeech_cuts_train.jsonl.gz"),
-        help="Path to the manifest file",
-    )
-
    parser.add_argument(
        "--tokens",
        type=Path,
        default=Path("data/tokens.txt"),
-        help="Path to the tokens",
+        help="Path to the dict that maps the text tokens to IDs",
    )

    return parser.parse_args()


-def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
-    """Write a symbol to ID mapping to a file.
+def get_token2id(filename: Path) -> Dict[str, int]:
+    """Get a dict that maps token to IDs, and save it to the given filename."""
+    all_tokens = get_espeak_map()  # token: [token_id]
+    all_tokens = {token: token_id[0] for token, token_id in all_tokens.items()}
+    # sort by token_id
+    all_tokens = sorted(all_tokens.items(), key=lambda x: x[1])

-    Note:
-      No need to implement `read_mapping` as it can be done
-      through :func:`k2.SymbolTable.from_file`.
-
-    Args:
-      filename:
-        Filename to save the mapping.
-      sym2id:
-        A dict mapping symbols to IDs.
-    Returns:
-      Return None.
-    """
    with open(filename, "w", encoding="utf-8") as f:
-        for sym, i in sym2id.items():
-            f.write(f"{sym} {i}\n")
-
-
-def get_token2id(manifest_file: Path) -> Dict[str, int]:
-    """Return a dict that maps token to IDs."""
-    extra_tokens = [
-        "<blk>",  # 0 for blank
-        "<sos/eos>",  # 1 for sos and eos symbols.
-        "<unk>",  # 2 for OOV
-    ]
-    all_tokens = set()
-
-    cut_set = load_manifest(manifest_file)
-
-    for cut in cut_set:
-        # Each cut only contain one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
-        for t in cut.tokens:
-            all_tokens.add(t)
-
-    all_tokens = extra_tokens + list(all_tokens)
-
-    token2id: Dict[str, int] = {token: i for i, token in enumerate(all_tokens)}
-    return token2id
+        for token, token_id in all_tokens:
+            f.write(f"{token} {token_id}\n")


 if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
    logging.basicConfig(format=formatter, level=logging.INFO)

    args = get_args()
-    manifest_file = Path(args.manifest_file)
    out_file = Path(args.tokens)
-
-    token2id = get_token2id(manifest_file)
-    write_mapping(out_file, token2id)
+    get_token2id(out_file)
--- a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
+++ b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
@ -23,9 +23,9 @@ This file reads the texts in given manifest and save the new cuts with phoneme t
 import logging
 from pathlib import Path

-import g2p_en
 import tacotron_cleaner.cleaners
 from lhotse import CutSet, load_manifest
+from piper_phonemize import phonemize_espeak


 def prepare_tokens_ljspeech():
@ -35,17 +35,20 @@ def prepare_tokens_ljspeech():
    partition = "all"

    cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
-    g2p = g2p_en.G2p()

    new_cuts = []
    for cut in cut_set:
        # Each cut only contains one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
+        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
        text = cut.supervisions[0].normalized_text
        # Text normalization
        text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
        # Convert to phonemes
-        cut.tokens = g2p(text)
+        tokens_list = phonemize_espeak(text, "en-us")
+        tokens = []
+        for t in tokens_list:
+            tokens.extend(t)
+        cut.tokens = tokens
        new_cuts.append(cut)

    new_cut_set = CutSet.from_cuts(new_cuts)
--- a/egs/ljspeech/TTS/prepare.sh
+++ b/egs/ljspeech/TTS/prepare.sh
@ -30,7 +30,7 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
    cd vits/monotonic_align
    python setup.py build_ext --inplace
    cd ../../
-  else 
+  else
    log "monotonic_align lib already built"
  fi
 fi
@ -80,6 +80,11 @@ fi

 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Prepare phoneme tokens for LJSpeech"
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
+  # If not, please install them with:
+  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize,
+  #                      could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
+  #   - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
  if [ ! -e data/spectrogram/.ljspeech_with_token.done ]; then
    ./local/prepare_tokens_ljspeech.py
    mv data/spectrogram/ljspeech_cuts_with_tokens_all.jsonl.gz \
@ -113,13 +118,12 @@ fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Generate token file"
-  # We assume you have installed g2p_en and espnet_tts_frontend.
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
  # If not, please install them with:
-  #   - g2p_en: `pip install g2p_en`, refer to https://github.com/Kyubyong/g2p
+  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize,
+  #                      could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
  #   - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
  if [ ! -e data/tokens.txt ]; then
-    ./local/prepare_token_file.py \
-      --manifest-file data/spectrogram/ljspeech_cuts_train.jsonl.gz \
-      --tokens data/tokens.txt
+    ./local/prepare_token_file.py --tokens data/tokens.txt
  fi
 fi
--- a/Show More
+++ b/Show More
				`@ -1 +0,0 @@`
				`../../../librispeech/ASR/local/compile_hlg.py`
				`@ -0,0 +1 @@`
				`../pruned_transducer_stateless2/beam_search.py`
				`@ -0,0 +1 @@`
				`../transducer_stateless/encoder_interface.py`