diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh
similarity index 100%
rename from .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh
rename to .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh
index 761eb72e2..7d2853c17 100755
--- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh
+++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh
@@ -21,9 +21,9 @@ tree $repo/
 ls -lh $repo/test_wavs/*.wav
 
 pushd $repo/exp
-git lfs pull --include "data/lang_bpe_500/HLG.pt"
 git lfs pull --include "data/lang_bpe_500/L.pt"
 git lfs pull --include "data/lang_bpe_500/LG.pt"
+git lfs pull --include "data/lang_bpe_500/HLG.pt"
 git lfs pull --include "data/lang_bpe_500/Linv.pt"
 git lfs pull --include "data/lang_bpe_500/bpe.model"
 git lfs pull --include "exp/cpu_jit.pt"
diff --git a/.github/workflows/build-docker-image.yml b/.github/workflows/build-docker-image.yml
new file mode 100644
index 000000000..327f0ee45
--- /dev/null
+++ b/.github/workflows/build-docker-image.yml
@@ -0,0 +1,45 @@
+# see also
+# https://docs.github.com/en/actions/publishing-packages/publishing-docker-images#publishing-images-to-github-packages
+name: Build docker image
+on:
+  workflow_dispatch:
+
+concurrency:
+  group: build_docker-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-docker-image:
+    name: ${{ matrix.image }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        image: ["torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+
+    steps:
+      # refer to https://github.com/actions/checkout
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Rename
+        shell: bash
+        run: |
+          image=${{ matrix.image }}
+          mv -v ./docker/$image.dockerfile ./Dockerfile
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Build and push
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: k2fsa/icefall:${{ matrix.image }}
diff --git a/.github/workflows/run-aishell-2022-06-20.yml b/.github/workflows/run-aishell-2022-06-20.yml
index c46cea0f6..d14196f38 100644
--- a/.github/workflows/run-aishell-2022-06-20.yml
+++ b/.github/workflows/run-aishell-2022-06-20.yml
@@ -44,7 +44,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
@@ -119,5 +119,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: aishell-torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless3-2022-06-20
+          name: aishell-torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless3-2022-06-20
           path: egs/aishell/ASR/pruned_transducer_stateless3/exp/
diff --git a/.github/workflows/run-docker-image.yml b/.github/workflows/run-docker-image.yml
new file mode 100644
index 000000000..12604a132
--- /dev/null
+++ b/.github/workflows/run-docker-image.yml
@@ -0,0 +1,92 @@
+name: Run docker image
+on:
+  workflow_dispatch:
+
+concurrency:
+  group: run_docker_image-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  run-docker-image:
+    name: ${{ matrix.image }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        image: ["torch2.0.0-cuda11.7", "torch1.13.0-cuda11.6", "torch1.12.1-cuda11.3", "torch1.9.0-cuda10.2"]
+    steps:
+      # refer to https://github.com/actions/checkout
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Run the build process with Docker
+        uses: addnab/docker-run-action@v3
+        with:
+            image: k2fsa/icefall:${{ matrix.image }}
+            shell: bash
+            run: |
+              uname -a
+              cat /etc/*release
+
+              nvcc --version
+
+              # For torch1.9.0-cuda10.2
+              export LD_LIBRARY_PATH=/usr/local/cuda-10.2/compat:$LD_LIBRARY_PATH
+
+              # For torch1.12.1-cuda11.3
+              export LD_LIBRARY_PATH=/usr/local/cuda-11.3/compat:$LD_LIBRARY_PATH
+
+              # For torch2.0.0-cuda11.7
+              export LD_LIBRARY_PATH=/usr/local/cuda-11.7/compat:$LD_LIBRARY_PATH
+
+
+              which nvcc
+              cuda_dir=$(dirname $(which nvcc))
+              echo "cuda_dir: $cuda_dir"
+
+              find $cuda_dir -name libcuda.so*
+              echo "--------------------"
+
+              find / -name libcuda.so* 2>/dev/null
+
+              # for torch1.13.0-cuda11.6
+              if [ -e /opt/conda/lib/stubs/libcuda.so ]; then
+                cd /opt/conda/lib/stubs && ln -s libcuda.so libcuda.so.1 && cd -
+                export LD_LIBRARY_PATH=/opt/conda/lib/stubs:$LD_LIBRARY_PATH
+              fi
+
+              find / -name libcuda.so* 2>/dev/null
+              echo "LD_LIBRARY_PATH: $LD_LIBRARY_PATH"
+
+              python3 --version
+              which python3
+
+              python3 -m pip list
+
+              echo "----------torch----------"
+              python3 -m torch.utils.collect_env
+
+              echo "----------k2----------"
+              python3 -c "import k2; print(k2.__file__)"
+              python3 -c "import k2; print(k2.__dev_version__)"
+              python3 -m k2.version
+
+              echo "----------lhotse----------"
+              python3 -c "import lhotse; print(lhotse.__file__)"
+              python3 -c "import lhotse; print(lhotse.__version__)"
+
+              echo "----------kaldifeat----------"
+              python3 -c "import kaldifeat; print(kaldifeat.__file__)"
+              python3 -c "import kaldifeat; print(kaldifeat.__version__)"
+
+              echo "Test yesno recipe"
+
+              cd egs/yesno/ASR
+
+              ./prepare.sh
+
+              ./tdnn/train.py
+
+              ./tdnn/decode.py
diff --git a/.github/workflows/run-gigaspeech-2022-05-13.yml b/.github/workflows/run-gigaspeech-2022-05-13.yml
index f8ee25cc4..0e47f7538 100644
--- a/.github/workflows/run-gigaspeech-2022-05-13.yml
+++ b/.github/workflows/run-gigaspeech-2022-05-13.yml
@@ -43,7 +43,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
@@ -122,5 +122,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-gigaspeech-pruned_transducer_stateless2-2022-05-12
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-gigaspeech-pruned_transducer_stateless2-2022-05-12
           path: egs/gigaspeech/ASR/pruned_transducer_stateless2/exp/
diff --git a/.github/workflows/run-librispeech-2022-03-12.yml b/.github/workflows/run-librispeech-2022-03-12.yml
index d42202b79..3edbe43ec 100644
--- a/.github/workflows/run-librispeech-2022-03-12.yml
+++ b/.github/workflows/run-librispeech-2022-03-12.yml
@@ -43,7 +43,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
@@ -155,5 +155,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless-2022-03-12
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless-2022-03-12
           path: egs/librispeech/ASR/pruned_transducer_stateless/exp/
diff --git a/.github/workflows/run-librispeech-2022-04-29.yml b/.github/workflows/run-librispeech-2022-04-29.yml
index f42c8f27a..bb44a073b 100644
--- a/.github/workflows/run-librispeech-2022-04-29.yml
+++ b/.github/workflows/run-librispeech-2022-04-29.yml
@@ -43,7 +43,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
@@ -174,12 +174,12 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless2-2022-04-29
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless2-2022-04-29
           path: egs/librispeech/ASR/pruned_transducer_stateless2/exp/
 
       - name: Upload decoding results for pruned_transducer_stateless3
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless3-2022-04-29
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless3-2022-04-29
           path: egs/librispeech/ASR/pruned_transducer_stateless3/exp/
diff --git a/.github/workflows/run-librispeech-2022-05-13.yml b/.github/workflows/run-librispeech-2022-05-13.yml
index 1fbd96157..e7b53b21c 100644
--- a/.github/workflows/run-librispeech-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-2022-05-13.yml
@@ -43,7 +43,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
@@ -155,5 +155,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless5-2022-05-13
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless5-2022-05-13
           path: egs/librispeech/ASR/pruned_transducer_stateless5/exp/
diff --git a/.github/workflows/run-librispeech-2022-11-11-stateless7.yml b/.github/workflows/run-librispeech-2022-11-11-stateless7.yml
index 596596bd9..7e378c9a1 100644
--- a/.github/workflows/run-librispeech-2022-11-11-stateless7.yml
+++ b/.github/workflows/run-librispeech-2022-11-11-stateless7.yml
@@ -155,5 +155,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-2022-11-11
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-2022-11-11
           path: egs/librispeech/ASR/pruned_transducer_stateless7/exp/
diff --git a/.github/workflows/run-librispeech-2022-11-14-stateless8.yml b/.github/workflows/run-librispeech-2022-11-14-stateless8.yml
index dca7d6d25..a2c1a0ad6 100644
--- a/.github/workflows/run-librispeech-2022-11-14-stateless8.yml
+++ b/.github/workflows/run-librispeech-2022-11-14-stateless8.yml
@@ -155,5 +155,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless8-2022-11-14
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless8-2022-11-14
           path: egs/librispeech/ASR/pruned_transducer_stateless8/exp/
diff --git a/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml b/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml
index cd41e988e..500ab1736 100644
--- a/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml
+++ b/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml
@@ -159,5 +159,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-ctc-2022-12-01
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-ctc-2022-12-01
           path: egs/librispeech/ASR/pruned_transducer_stateless7_ctc/exp/
diff --git a/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml b/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml
index 91242c401..1a7f9f594 100644
--- a/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml
+++ b/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml
@@ -163,5 +163,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer_mmi-2022-12-08
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer_mmi-2022-12-08
           path: egs/librispeech/ASR/zipformer_mmi/exp/
diff --git a/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml b/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml
index 8490a62fc..68014e20c 100644
--- a/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml
+++ b/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml
@@ -168,5 +168,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-streaming-2022-12-29
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-streaming-2022-12-29
           path: egs/librispeech/ASR/pruned_transducer_stateless7_streaming/exp/
diff --git a/.github/workflows/run-librispeech-2022-12-15-stateless7-ctc-bs.yml b/.github/workflows/run-librispeech-2023-01-29-stateless7-ctc-bs.yml
similarity index 96%
rename from .github/workflows/run-librispeech-2022-12-15-stateless7-ctc-bs.yml
rename to .github/workflows/run-librispeech-2023-01-29-stateless7-ctc-bs.yml
index e0130a636..821abc25d 100644
--- a/.github/workflows/run-librispeech-2022-12-15-stateless7-ctc-bs.yml
+++ b/.github/workflows/run-librispeech-2023-01-29-stateless7-ctc-bs.yml
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-name: run-librispeech-2022-12-15-stateless7-ctc-bs
+name: run-librispeech-2023-01-29-stateless7-ctc-bs
 # zipformer
 
 on:
@@ -34,7 +34,7 @@ on:
     - cron: "50 15 * * *"
 
 jobs:
-  run_librispeech_2022_12_15_zipformer_ctc_bs:
+  run_librispeech_2023_01_29_zipformer_ctc_bs:
     if: github.event.label.name == 'run-decode' || github.event.label.name == 'blank-skip' || github.event_name == 'push' || github.event_name == 'schedule'
     runs-on: ${{ matrix.os }}
     strategy:
@@ -124,7 +124,7 @@ jobs:
           export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
           export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
 
-          .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh
+          .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh
 
       - name: Display decoding results for librispeech pruned_transducer_stateless7_ctc_bs
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
@@ -159,5 +159,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-ctc-bs-2022-12-15
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-ctc-bs-2023-01-29
           path: egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/exp/
diff --git a/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml b/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml
index 40a37da57..905515dc4 100644
--- a/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml
+++ b/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml
@@ -151,5 +151,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-conformer_ctc3-2022-11-28
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-conformer_ctc3-2022-11-28
           path: egs/librispeech/ASR/conformer_ctc3/exp/
diff --git a/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml b/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
index aba29d066..501fae38c 100644
--- a/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
+++ b/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml
@@ -26,7 +26,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.8]
 
       fail-fast: false
@@ -159,5 +159,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'shallow-fusion' || github.event.label.name == 'LODR'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-lstm_transducer_stateless2-2022-09-03
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-lstm_transducer_stateless2-2022-09-03
           path: egs/librispeech/ASR/lstm_transducer_stateless2/exp/
diff --git a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
index fd497601d..bf73d4f18 100644
--- a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
+++ b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml
@@ -43,7 +43,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
@@ -153,5 +153,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless3-2022-04-29
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless3-2022-04-29
           path: egs/librispeech/ASR/pruned_transducer_stateless3/exp/
diff --git a/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml b/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml
index 57fe5b999..6ea308468 100644
--- a/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml
+++ b/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml
@@ -43,7 +43,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
@@ -155,5 +155,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless2-2022-06-26
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless2-2022-06-26
           path: egs/librispeech/ASR/pruned_transducer_stateless2/exp/
diff --git a/.github/workflows/run-librispeech-streaming-zipformer-2023-05-18.yml b/.github/workflows/run-librispeech-streaming-zipformer-2023-05-18.yml
index ed934d56d..5145fb43c 100644
--- a/.github/workflows/run-librispeech-streaming-zipformer-2023-05-18.yml
+++ b/.github/workflows/run-librispeech-streaming-zipformer-2023-05-18.yml
@@ -170,5 +170,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer-2022-11-11
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer-2022-11-11
           path: egs/librispeech/ASR/zipformer/exp/
diff --git a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
index 515122a66..9fe2f0389 100644
--- a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
+++ b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml
@@ -43,7 +43,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
@@ -155,5 +155,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless2-2022-04-19
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless2-2022-04-19
           path: egs/librispeech/ASR/transducer_stateless2/exp/
diff --git a/.github/workflows/run-librispeech-zipformer-2023-05-18.yml b/.github/workflows/run-librispeech-zipformer-2023-05-18.yml
index 7ecf0d2a0..e9d235ad1 100644
--- a/.github/workflows/run-librispeech-zipformer-2023-05-18.yml
+++ b/.github/workflows/run-librispeech-zipformer-2023-05-18.yml
@@ -155,5 +155,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer-2022-11-11
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer-2022-11-11
           path: egs/librispeech/ASR/zipformer/exp/
diff --git a/.github/workflows/run-librispeech-zipformer-ctc-2023-06-14.yml b/.github/workflows/run-librispeech-zipformer-ctc-2023-06-14.yml
index 569ce48fc..48f0b1532 100644
--- a/.github/workflows/run-librispeech-zipformer-ctc-2023-06-14.yml
+++ b/.github/workflows/run-librispeech-zipformer-ctc-2023-06-14.yml
@@ -151,5 +151,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer-2022-11-11
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer-2022-11-11
           path: egs/librispeech/ASR/zipformer/exp/
diff --git a/.github/workflows/run-pretrained-conformer-ctc.yml b/.github/workflows/run-pretrained-conformer-ctc.yml
index 8aaea35f6..bcd326b9d 100644
--- a/.github/workflows/run-pretrained-conformer-ctc.yml
+++ b/.github/workflows/run-pretrained-conformer-ctc.yml
@@ -33,7 +33,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
index 03a1df48e..1e5b25f5c 100644
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml
@@ -42,7 +42,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
@@ -154,5 +154,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless_multi_datasets-100h-2022-02-21
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless_multi_datasets-100h-2022-02-21
           path: egs/librispeech/ASR/transducer_stateless_multi_datasets/exp/
diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
index 8da4ff56a..9063c0ed6 100644
--- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml
@@ -42,7 +42,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
@@ -154,5 +154,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless_multi_datasets-100h-2022-03-01
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless_multi_datasets-100h-2022-03-01
           path: egs/librispeech/ASR/transducer_stateless_multi_datasets/exp/
diff --git a/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml b/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
index 0b3e70d77..2d24528d3 100644
--- a/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml
@@ -33,7 +33,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
diff --git a/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml b/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
index a6a59d339..761b26131 100644
--- a/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml
@@ -33,7 +33,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
diff --git a/.github/workflows/run-pretrained-transducer-stateless.yml b/.github/workflows/run-pretrained-transducer-stateless.yml
index 98d84bf96..e46b9a849 100644
--- a/.github/workflows/run-pretrained-transducer-stateless.yml
+++ b/.github/workflows/run-pretrained-transducer-stateless.yml
@@ -42,7 +42,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
@@ -154,5 +154,5 @@ jobs:
         uses: actions/upload-artifact@v2
         if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
         with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless-2022-02-07
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless-2022-02-07
           path: egs/librispeech/ASR/transducer_stateless/exp/
diff --git a/.github/workflows/run-pretrained-transducer.yml b/.github/workflows/run-pretrained-transducer.yml
index 8c1a652e0..190e446bc 100644
--- a/.github/workflows/run-pretrained-transducer.yml
+++ b/.github/workflows/run-pretrained-transducer.yml
@@ -33,7 +33,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.7, 3.8, 3.9]
 
       fail-fast: false
diff --git a/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml b/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml
index 6c70c646b..319a5558a 100644
--- a/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml
+++ b/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml
@@ -33,7 +33,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-18.04]
+        os: [ubuntu-latest]
         python-version: [3.8]
 
       fail-fast: false
diff --git a/.github/workflows/run-yesno-recipe.yml b/.github/workflows/run-yesno-recipe.yml
index f997e634a..8a2c94829 100644
--- a/.github/workflows/run-yesno-recipe.yml
+++ b/.github/workflows/run-yesno-recipe.yml
@@ -33,7 +33,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        # os: [ubuntu-18.04, macos-10.15]
+        # os: [ubuntu-latest, macos-10.15]
         # TODO: enable macOS for CPU testing
         os: [ubuntu-latest]
         python-version: [3.8]
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e04fb5655..363556bb7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -35,9 +35,9 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         python-version: ["3.8"]
-        torch: ["1.10.0"]
-        torchaudio: ["0.10.0"]
-        k2-version: ["1.23.2.dev20221201"]
+        torch: ["1.13.0"]
+        torchaudio: ["0.13.0"]
+        k2-version: ["1.24.3.dev20230719"]
 
       fail-fast: false
 
@@ -66,14 +66,14 @@ jobs:
           pip install torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
           pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
 
-          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
+          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html
           pip install git+https://github.com/lhotse-speech/lhotse
           # icefall requirements
           pip uninstall -y protobuf
           pip install --no-binary protobuf protobuf==3.20.*
 
           pip install kaldifst
-          pip install onnxruntime
+          pip install onnxruntime matplotlib
           pip install -r requirements.txt
 
       - name: Install graphviz
@@ -83,13 +83,6 @@ jobs:
           python3 -m pip install -qq graphviz
           sudo apt-get -qq install graphviz
 
-      - name: Install graphviz
-        if: startsWith(matrix.os, 'macos')
-        shell: bash
-        run: |
-          python3 -m pip install -qq graphviz
-          brew install -q graphviz
-
       - name: Run tests
         if: startsWith(matrix.os, 'ubuntu')
         run: |
@@ -129,40 +122,10 @@ jobs:
           cd ../transducer_lstm
           pytest -v -s
 
-      - name: Run tests
-        if: startsWith(matrix.os, 'macos')
-        run: |
-          ls -lh
-          export PYTHONPATH=$PWD:$PWD/lhotse:$PYTHONPATH
-          lib_path=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")
-          echo "lib_path: $lib_path"
-          export DYLD_LIBRARY_PATH=$lib_path:$DYLD_LIBRARY_PATH
-          pytest -v -s ./test
-
-          # run tests for conformer ctc
-          cd egs/librispeech/ASR/conformer_ctc
+          cd ../zipformer
           pytest -v -s
 
-          cd ../pruned_transducer_stateless
-          pytest -v -s
-
-          cd ../pruned_transducer_stateless2
-          pytest -v -s
-
-          cd ../pruned_transducer_stateless3
-          pytest -v -s
-
-          cd ../pruned_transducer_stateless4
-          pytest -v -s
-
-          cd ../transducer_stateless
-          pytest -v -s
-
-          # cd ../transducer
-          # pytest -v -s
-
-          cd ../transducer_stateless2
-          pytest -v -s
-
-          cd ../transducer_lstm
-          pytest -v -s
+      - uses: actions/upload-artifact@v2
+        with:
+          path: egs/librispeech/ASR/zipformer/swoosh.pdf
+          name: swoosh.pdf
diff --git a/docker/README.md b/docker/README.md
index c14b9bf75..19959bfe6 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,5 +1,20 @@
 # icefall dockerfile
 
+## Download from dockerhub
+
+You can find pre-built docker image for icefall at the following address:
+
+  <https://hub.docker.com/r/k2fsa/icefall/tags>
+
+Example usage:
+
+```bash
+docker run --gpus all --rm -it  k2fsa/icefall:torch1.13.0-cuda11.6 /bin/bash
+```
+
+
+## Build from dockerfile
+
 2 sets of configuration are provided - (a) Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8, and (b) Ubuntu18.04-pytorch1.7.1-cuda11.0-cudnn8.
 
 If your NVIDIA driver supports CUDA Version: 11.3, please go for case (a) Ubuntu18.04-pytorch1.12.1-cuda11.3-cudnn8.
diff --git a/docker/torch1.12.1-cuda11.3.dockerfile b/docker/torch1.12.1-cuda11.3.dockerfile
new file mode 100644
index 000000000..5338bdca7
--- /dev/null
+++ b/docker/torch1.12.1-cuda11.3.dockerfile
@@ -0,0 +1,70 @@
+FROM pytorch/pytorch:1.12.1-cuda11.3-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+ARG K2_VERSION="1.24.3.dev20230725+cuda11.3.torch1.12.1"
+ARG KALDIFEAT_VERSION="1.25.0.dev20230726+cuda11.3.torch1.12.1"
+ARG TORCHAUDIO_VERSION="0.12.1+cu113"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    		libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      sentencepiece>=0.1.96 \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
diff --git a/docker/torch1.13.0-cuda11.6.dockerfile b/docker/torch1.13.0-cuda11.6.dockerfile
new file mode 100644
index 000000000..4d2f96c8e
--- /dev/null
+++ b/docker/torch1.13.0-cuda11.6.dockerfile
@@ -0,0 +1,72 @@
+FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+ARG K2_VERSION="1.24.3.dev20230725+cuda11.6.torch1.13.0"
+ARG KALDIFEAT_VERSION="1.25.0.dev20230726+cuda11.6.torch1.13.0"
+ARG TORCHAUDIO_VERSION="0.13.0+cu116"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    		libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      sentencepiece>=0.1.96 \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+ENV LD_LIBRARY_PATH /opt/conda/lib/stubs:$LD_LIBRARY_PATH
+
+WORKDIR /workspace/icefall
diff --git a/docker/torch1.9.0-cuda10.2.dockerfile b/docker/torch1.9.0-cuda10.2.dockerfile
new file mode 100644
index 000000000..a7cef6dc8
--- /dev/null
+++ b/docker/torch1.9.0-cuda10.2.dockerfile
@@ -0,0 +1,86 @@
+FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+ARG K2_VERSION="1.24.3.dev20230726+cuda10.2.torch1.9.0"
+ARG KALDIFEAT_VERSION="1.25.0.dev20230726+cuda10.2.torch1.9.0"
+ARG TORCHAUDIO_VERSION="0.9.0"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+# see https://developer.nvidia.com/blog/updating-the-cuda-linux-gpg-repository-key/
+
+RUN rm /etc/apt/sources.list.d/cuda.list && \
+	rm /etc/apt/sources.list.d/nvidia-ml.list && \
+	apt-key del 7fa2af80
+
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    		libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
+    dpkg -i cuda-keyring_1.0-1_all.deb && \
+    rm -v cuda-keyring_1.0-1_all.deb && \
+    apt-get update && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip uninstall -y tqdm && \
+    pip install -U --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      sentencepiece>=0.1.96 \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz \
+      tqdm>=4.63.0
+
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
diff --git a/docker/torch2.0.0-cuda11.7.dockerfile b/docker/torch2.0.0-cuda11.7.dockerfile
new file mode 100644
index 000000000..d91fbc24f
--- /dev/null
+++ b/docker/torch2.0.0-cuda11.7.dockerfile
@@ -0,0 +1,70 @@
+FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
+
+ENV LC_ALL C.UTF-8
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+ARG K2_VERSION="1.24.3.dev20230718+cuda11.7.torch2.0.0"
+ARG KALDIFEAT_VERSION="1.25.0.dev20230726+cuda11.7.torch2.0.0"
+ARG TORCHAUDIO_VERSION="2.0.0+cu117"
+
+LABEL authors="Fangjun Kuang <csukuangfj@gmail.com>"
+LABEL k2_version=${K2_VERSION}
+LABEL kaldifeat_version=${KALDIFEAT_VERSION}
+LABEL github_repo="https://github.com/k2-fsa/icefall"
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        vim \
+    		libssl-dev \
+        autoconf \
+        automake \
+        bzip2 \
+        ca-certificates \
+        ffmpeg \
+        g++ \
+        gfortran \
+        git \
+        libtool \
+        make \
+        patch \
+        sox \
+        subversion \
+        unzip \
+        valgrind \
+        wget \
+        zlib1g-dev \
+        && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies
+RUN pip install --no-cache-dir \
+      torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html \
+      k2==${K2_VERSION} -f https://k2-fsa.github.io/k2/cuda.html \
+      git+https://github.com/lhotse-speech/lhotse \
+      kaldifeat==${KALDIFEAT_VERSION} -f https://csukuangfj.github.io/kaldifeat/cuda.html \
+      \
+      kaldi_native_io \
+      kaldialign \
+      kaldifst \
+      kaldilm \
+      sentencepiece>=0.1.96 \
+      tensorboard \
+      typeguard \
+      dill \
+      onnx \
+      onnxruntime \
+      onnxmltools \
+      multi_quantization \
+      typeguard \
+      numpy \
+      pytest \
+      graphviz
+
+RUN git clone https://github.com/k2-fsa/icefall /workspace/icefall && \
+    cd /workspace/icefall && \
+    pip install --no-cache-dir -r requirements.txt
+
+ENV PYTHONPATH /workspace/icefall:$PYTHONPATH
+
+WORKDIR /workspace/icefall
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 6901dec02..bf231e3c1 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -86,7 +86,13 @@ rst_epilog = """
 .. _git-lfs: https://git-lfs.com/
 .. _ncnn: https://github.com/tencent/ncnn
 .. _LibriSpeech: https://www.openslr.org/12
+.. _Gigaspeech: https://github.com/SpeechColab/GigaSpeech
 .. _musan: http://www.openslr.org/17/
 .. _ONNX: https://github.com/onnx/onnx
 .. _onnxruntime: https://github.com/microsoft/onnxruntime
+.. _torch: https://github.com/pytorch/pytorch
+.. _torchaudio: https://github.com/pytorch/audio
+.. _k2: https://github.com/k2-fsa/k2
+.. _lhotse: https://github.com/lhotse-speech/lhotse
+.. _yesno: https://www.openslr.org/1/
 """
diff --git a/docs/source/decoding-with-langugage-models/LODR.rst b/docs/source/decoding-with-langugage-models/LODR.rst
new file mode 100644
index 000000000..b6625ee1d
--- /dev/null
+++ b/docs/source/decoding-with-langugage-models/LODR.rst
@@ -0,0 +1,184 @@
+.. _LODR:
+
+LODR for RNN Transducer
+=======================
+
+
+As a type of E2E model, neural transducers are usually considered as having an internal
+language model, which learns the language level information on the training corpus.
+In real-life scenario, there is often a mismatch between the training corpus and the target corpus space.
+This mismatch can be a problem when decoding for neural transducer models with language models as its internal
+language can act "against" the external LM. In this tutorial, we show how to use
+`Low-order Density Ratio <https://arxiv.org/abs/2203.16776>`_ to alleviate this effect to further improve the performance
+of langugae model integration.
+
+.. note::
+
+    This tutorial is based on the recipe
+    `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
+    which is a streaming transducer model trained on `LibriSpeech`_.
+    However, you can easily apply LODR to other recipes.
+    If you encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`__.
+
+
+.. note::
+
+    For simplicity, the training and testing corpus in this tutorial are the same (`LibriSpeech`_). However,
+    you can change the testing set to any other domains (e.g `GigaSpeech`_) and prepare the language models
+    using that corpus.
+
+First, let's have a look at some background information. As the predecessor of LODR, Density Ratio (DR) is first proposed `here <https://arxiv.org/abs/2002.11268>`_
+to address the language information mismatch between the training
+corpus (source domain) and the testing corpus (target domain). Assuming that the source domain and the test domain
+are acoustically similar, DR derives the following formular for decoding with Bayes' theorem:
+
+.. math::
+
+    \text{score}\left(y_u|\mathit{x},y\right) =
+    \log p\left(y_u|\mathit{x},y_{1:u-1}\right) +
+    \lambda_1 \log p_{\text{Target LM}}\left(y_u|\mathit{x},y_{1:u-1}\right) -
+    \lambda_2 \log p_{\text{Source LM}}\left(y_u|\mathit{x},y_{1:u-1}\right)
+
+
+where :math:`\lambda_1` and :math:`\lambda_2` are the weights of LM scores for target domain and source domain respectively.
+Here, the source domain LM is trained on the training corpus. The only difference in the above formular compared to
+shallow fusion is the subtraction of the source domain LM.
+
+Some works treat the predictor and the joiner of the neural transducer as its internal LM. However, the LM is
+considered to be weak and can only capture low-level language information. Therefore, `LODR <https://arxiv.org/abs/2203.16776>`__ proposed to use
+a low-order n-gram LM as an approximation of the ILM of the neural transducer. This leads to the following formula
+during decoding for transducer model:
+
+.. math::
+
+    \text{score}\left(y_u|\mathit{x},y\right) =
+    \log p_{rnnt}\left(y_u|\mathit{x},y_{1:u-1}\right) +
+    \lambda_1 \log p_{\text{Target LM}}\left(y_u|\mathit{x},y_{1:u-1}\right) -
+    \lambda_2 \log p_{\text{bi-gram}}\left(y_u|\mathit{x},y_{1:u-1}\right)
+
+In LODR, an additional bi-gram LM estimated on the source domain (e.g training corpus) is required. Comared to DR,
+the only difference lies in the choice of source domain LM. According to the original `paper <https://arxiv.org/abs/2203.16776>`_,
+LODR achieves similar performance compared DR in both intra-domain and cross-domain settings.
+As a bi-gram is much faster to evaluate, LODR is usually much faster.
+
+Now, we will show you how to use LODR in ``icefall``.
+For illustration purpose, we will use a pre-trained ASR model from this `link <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`_.
+If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
+The testing scenario here is intra-domain (we decode the model trained on `LibriSpeech`_ on `LibriSpeech`_ testing sets).
+
+As the initial step, let's download the pre-trained model.
+
+.. code-block:: bash
+
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+    $ pushd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
+
+To test the model, let's have a look at the decoding results **without** using LM. This can be done via the following command:
+
+.. code-block:: bash
+
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --exp-dir $exp_dir \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search
+
+The following WERs are achieved on test-clean and test-other:
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	3.11	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.93	best for test-other
+
+Then, we download the external language model and bi-gram LM that are necessary for LODR.
+Note that the bi-gram is estimated on the LibriSpeech 960 hours' text.
+
+.. code-block:: bash
+
+    $ # download the external LM
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm
+    $ # create a symbolic link so that the checkpoint can be loaded
+    $ pushd icefall-librispeech-rnn-lm/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt
+    $ popd
+    $
+    $ # download the bi-gram
+    $ git lfs install
+    $ git clone https://huggingface.co/marcoyang/librispeech_bigram
+    $ pushd data/lang_bpe_500
+    $ ln -s ../../librispeech_bigram/2gram.fst.txt .
+    $ popd
+
+Then, we perform LODR decoding by setting ``--decoding-method`` to ``modified_beam_search_lm_LODR``:
+
+.. code-block:: bash
+
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ lm_dir=./icefall-librispeech-rnn-lm/exp
+    $ lm_scale=0.42
+    $ LODR_scale=-0.24
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --beam-size 4 \
+        --exp-dir $exp_dir \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search_LODR \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --use-shallow-fusion 1 \
+        --lm-type rnn \
+        --lm-exp-dir $lm_dir \
+        --lm-epoch 99 \
+        --lm-scale $lm_scale \
+        --lm-avg 1 \
+        --rnn-lm-embedding-dim 2048 \
+        --rnn-lm-hidden-dim 2048 \
+        --rnn-lm-num-layers 3 \
+        --lm-vocab-size 500 \
+        --tokens-ngram 2 \
+        --ngram-lm-scale $LODR_scale
+
+There are two extra arguments that need to be given when doing LODR. ``--tokens-ngram`` specifies the order of n-gram. As we
+are using a bi-gram, we set it to 2. ``--ngram-lm-scale`` is the scale of the bi-gram, it should be a negative number
+as we are subtracting the bi-gram's score during decoding.
+
+The decoding results obtained with the above command are shown below:
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	2.61	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	6.74	best for test-other
+
+Recall that the lowest WER we obtained in :ref:`shallow_fusion` with beam size of 4 is ``2.77/7.08``, LODR
+indeed **further improves** the WER. We can do even better if we increase ``--beam-size``:
+
+.. list-table:: WER of LODR with different beam sizes
+   :widths: 25 25 50
+   :header-rows: 1
+
+   * - Beam size
+     - test-clean
+     - test-other
+   * - 4
+     - 2.61
+     - 6.74
+   * - 8
+     - 2.45
+     - 6.38
+   * - 12
+     - 2.4
+     - 6.23
diff --git a/docs/source/decoding-with-langugage-models/index.rst b/docs/source/decoding-with-langugage-models/index.rst
new file mode 100644
index 000000000..6e5e3a4d9
--- /dev/null
+++ b/docs/source/decoding-with-langugage-models/index.rst
@@ -0,0 +1,33 @@
+Decoding with language models
+=============================
+
+This section describes how to use external langugage models 
+during decoding to improve the WER of transducer models.
+
+The following decoding methods with external langugage models are available:
+
+
+.. list-table:: LM-rescoring-based methods vs shallow-fusion-based methods (The numbers in each field is WER on test-clean, WER on test-other and decoding time on test-clean)
+   :widths: 25 50
+   :header-rows: 1
+
+   * - Decoding method
+     - beam=4
+   * - ``modified_beam_search``
+     - Beam search (i.e. really n-best decoding, the "beam" is the value of n), similar to the original RNN-T paper. Note, this method does not use language model. 
+   * - ``modified_beam_search_lm_shallow_fusion``
+     - As ``modified_beam_search``, but interpolate RNN-T scores with language model scores, also known as shallow fusion
+   * - ``modified_beam_search_LODR``
+     - As ``modified_beam_search_lm_shallow_fusion``, but subtract score of a (BPE-symbol-level) bigram backoff language model used as an approximation to the internal language model of RNN-T.
+   * - ``modified_beam_search_lm_rescore``
+     - As ``modified_beam_search``, but rescore the n-best hypotheses with external language model (e.g. RNNLM) and re-rank them.
+   * - ``modified_beam_search_lm_rescore_LODR``
+     - As ``modified_beam_search_lm_rescore``, but also subtract the score of a (BPE-symbol-level) bigram backoff language model during re-ranking.
+
+
+.. toctree::
+   :maxdepth: 2
+
+   shallow-fusion
+   LODR
+   rescoring
diff --git a/docs/source/decoding-with-langugage-models/rescoring.rst b/docs/source/decoding-with-langugage-models/rescoring.rst
new file mode 100644
index 000000000..02eba9129
--- /dev/null
+++ b/docs/source/decoding-with-langugage-models/rescoring.rst
@@ -0,0 +1,252 @@
+.. _rescoring:
+
+LM rescoring for Transducer
+=================================
+
+LM rescoring is a commonly used approach to incorporate external LM information. Unlike shallow-fusion-based
+methods (see :ref:`shallow_fusion`, :ref:`LODR`), rescoring is usually performed to re-rank the n-best hypotheses after beam search.
+Rescoring is usually more efficient than shallow fusion since less computation is performed on the external LM.
+In this tutorial, we will show you how to use external LM to rescore the n-best hypotheses decoded from neural transducer models in
+`icefall <https://github.com/k2-fsa/icefall>`__.
+
+.. note::
+
+    This tutorial is based on the recipe 
+    `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
+    which is a streaming transducer model trained on `LibriSpeech`_. 
+    However, you can easily apply shallow fusion to other recipes.
+    If you encounter any problems, please open an issue `here <https://github.com/k2-fsa/icefall/issues>`_.
+
+.. note::
+
+    For simplicity, the training and testing corpus in this tutorial is the same (`LibriSpeech`_). However, you can change the testing set
+    to any other domains (e.g `GigaSpeech`_) and use an external LM trained on that domain.
+
+.. HINT::
+
+  We recommend you to use a GPU for decoding.
+
+For illustration purpose, we will use a pre-trained ASR model from this `link <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`__.
+If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
+
+As the initial step, let's download the pre-trained model.
+
+.. code-block:: bash
+
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+    $ pushd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
+
+As usual, we first test the model's performance without external LM. This can be done via the following command:
+
+.. code-block:: bash
+
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --exp-dir $exp_dir \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search
+
+The following WERs are achieved on test-clean and test-other:
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	3.11	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.93	best for test-other
+
+Now, we will try to improve the above WER numbers via external LM rescoring. We will download 
+a pre-trained LM from this `link <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm>`__.
+
+.. note::
+
+    This is an RNN LM trained on the LibriSpeech text corpus. So it might not be ideal for other corpus.
+    You may also train a RNN LM from scratch. Please refer to this `script <https://github.com/k2-fsa/icefall/blob/master/icefall/rnn_lm/train.py>`__
+    for training a RNN LM and this `script <https://github.com/k2-fsa/icefall/blob/master/icefall/transformer_lm/train.py>`__ to train a transformer LM.
+
+.. code-block:: bash
+
+    $ # download the external LM
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm 
+    $ # create a symbolic link so that the checkpoint can be loaded
+    $ pushd icefall-librispeech-rnn-lm/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt 
+    $ popd
+
+
+With the RNNLM available, we can rescore the n-best hypotheses generated from `modified_beam_search`. Here,
+`n` should be the number of beams, i.e ``--beam-size``. The command for LM rescoring is
+as follows. Note that the ``--decoding-method`` is set to `modified_beam_search_lm_rescore` and ``--use-shallow-fusion``
+is set to `False`.
+
+.. code-block:: bash
+    
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ lm_dir=./icefall-librispeech-rnn-lm/exp
+    $ lm_scale=0.43
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --beam-size 4 \
+        --exp-dir $exp_dir \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search_lm_rescore \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --use-shallow-fusion 0 \
+        --lm-type rnn \
+        --lm-exp-dir $lm_dir \
+        --lm-epoch 99 \
+        --lm-scale $lm_scale \
+        --lm-avg 1 \
+        --rnn-lm-embedding-dim 2048 \
+        --rnn-lm-hidden-dim 2048 \
+        --rnn-lm-num-layers 3 \
+        --lm-vocab-size 500
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	2.93	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.6	best for test-other
+
+Great! We made some improvements! Increasing the size of the n-best hypotheses will further boost the performance,
+see the following table:
+
+.. list-table:: WERs of LM rescoring with different beam sizes
+   :widths: 25 25 25
+   :header-rows: 1
+
+   * - Beam size
+     - test-clean
+     - test-other
+   * - 4
+     - 2.93
+     - 7.6
+   * - 8
+     - 2.67
+     - 7.11
+   * - 12
+     - 2.59
+     - 6.86
+
+In fact, we can also apply LODR (see :ref:`LODR`) when doing LM rescoring. To do so, we need to 
+download the bi-gram required by LODR:
+
+.. code-block:: bash
+
+    $ # download the bi-gram
+    $ git lfs install
+    $ git clone https://huggingface.co/marcoyang/librispeech_bigram
+    $ pushd data/lang_bpe_500
+    $ ln -s ../../librispeech_bigram/2gram.arpa .
+    $ popd
+
+Then we can performn LM rescoring + LODR by changing the decoding method to `modified_beam_search_lm_rescore_LODR`. 
+
+.. note:: 
+
+    This decoding method requires the dependency of `kenlm <https://github.com/kpu/kenlm>`_. You can install it
+    via this command: `pip install https://github.com/kpu/kenlm/archive/master.zip`. 
+
+.. code-block:: bash
+    
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ lm_dir=./icefall-librispeech-rnn-lm/exp
+    $ lm_scale=0.43
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --beam-size 4 \
+        --exp-dir $exp_dir \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search_lm_rescore_LODR \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --use-shallow-fusion 0 \
+        --lm-type rnn \
+        --lm-exp-dir $lm_dir \
+        --lm-epoch 99 \
+        --lm-scale $lm_scale \
+        --lm-avg 1 \
+        --rnn-lm-embedding-dim 2048 \
+        --rnn-lm-hidden-dim 2048 \
+        --rnn-lm-num-layers 3 \
+        --lm-vocab-size 500
+
+You should see the following WERs after executing the commands above:
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	2.9	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.57	best for test-other
+
+It's slightly better than LM rescoring. If we further increase the beam size, we will see
+further improvements from LM rescoring + LODR:
+
+.. list-table:: WERs of LM rescoring + LODR with different beam sizes
+   :widths: 25 25 25
+   :header-rows: 1
+
+   * - Beam size
+     - test-clean
+     - test-other
+   * - 4
+     - 2.9
+     - 7.57
+   * - 8
+     - 2.63
+     - 7.04
+   * - 12
+     - 2.52
+     - 6.73
+
+As mentioned earlier, LM rescoring is usually faster than shallow-fusion based methods.
+Here, we benchmark the WERs and decoding speed of them:
+
+.. list-table:: LM-rescoring-based methods vs shallow-fusion-based methods (The numbers in each field is WER on test-clean, WER on test-other and decoding time on test-clean)
+   :widths: 25 25 25 25
+   :header-rows: 1
+
+   * - Decoding method
+     - beam=4
+     - beam=8
+     - beam=12
+   * - ``modified_beam_search``
+     - 3.11/7.93; 132s
+     - 3.1/7.95; 177s
+     - 3.1/7.96; 210s
+   * - ``modified_beam_search_lm_shallow_fusion``
+     - 2.77/7.08; 262s
+     - 2.62/6.65; 352s
+     - 2.58/6.65; 488s
+   * - ``modified_beam_search_LODR``
+     - 2.61/6.74; 400s
+     - 2.45/6.38; 610s
+     - 2.4/6.23; 870s
+   * - ``modified_beam_search_lm_rescore``
+     - 2.93/7.6; 156s
+     - 2.67/7.11; 203s
+     - 2.59/6.86; 255s
+   * - ``modified_beam_search_lm_rescore_LODR``
+     - 2.9/7.57; 160s
+     - 2.63/7.04; 203s
+     - 2.52/6.73; 263s
+
+.. note::
+
+    Decoding is performed with a single 32G V100, we set ``--max-duration`` to 600. 
+    Decoding time here is only for reference and it may vary.
\ No newline at end of file
diff --git a/docs/source/decoding-with-langugage-models/shallow-fusion.rst b/docs/source/decoding-with-langugage-models/shallow-fusion.rst
new file mode 100644
index 000000000..f15e3f1d9
--- /dev/null
+++ b/docs/source/decoding-with-langugage-models/shallow-fusion.rst
@@ -0,0 +1,176 @@
+.. _shallow_fusion:
+
+Shallow fusion for Transducer
+=================================
+
+External language models (LM) are commonly used to improve WERs for E2E ASR models.
+This tutorial shows you how to perform ``shallow fusion`` with an external LM
+to improve the word-error-rate of a transducer model.
+
+.. note::
+
+    This tutorial is based on the recipe 
+    `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
+    which is a streaming transducer model trained on `LibriSpeech`_. 
+    However, you can easily apply shallow fusion to other recipes.
+    If you encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`_.
+
+.. note::
+
+    For simplicity, the training and testing corpus in this tutorial is the same (`LibriSpeech`_). However, you can change the testing set
+    to any other domains (e.g `GigaSpeech`_) and use an external LM trained on that domain.
+
+.. HINT::
+
+  We recommend you to use a GPU for decoding.
+
+For illustration purpose, we will use a pre-trained ASR model from this `link <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`__.
+If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`.
+
+As the initial step, let's download the pre-trained model.
+
+.. code-block:: bash
+
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29
+    $ pushd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded
+
+To test the model, let's have a look at the decoding results without using LM. This can be done via the following command:
+
+.. code-block:: bash
+
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --exp-dir $exp_dir \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search
+
+The following WERs are achieved on test-clean and test-other:
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	3.11	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.93	best for test-other
+
+These are already good numbers! But we can further improve it by using shallow fusion with external LM.
+Training a language model usually takes a long time, we can download a pre-trained LM from this `link <https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm>`__.
+
+.. code-block:: bash
+
+    $ # download the external LM
+    $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm 
+    $ # create a symbolic link so that the checkpoint can be loaded
+    $ pushd icefall-librispeech-rnn-lm/exp
+    $ git lfs pull --include "pretrained.pt"
+    $ ln -s pretrained.pt epoch-99.pt 
+    $ popd
+
+.. note::
+
+    This is an RNN LM trained on the LibriSpeech text corpus. So it might not be ideal for other corpus.
+    You may also train a RNN LM from scratch. Please refer to this `script <https://github.com/k2-fsa/icefall/blob/master/icefall/rnn_lm/train.py>`__
+    for training a RNN LM and this `script <https://github.com/k2-fsa/icefall/blob/master/icefall/transformer_lm/train.py>`__ to train a transformer LM.
+
+To use shallow fusion for decoding, we can execute the following command:
+
+.. code-block:: bash
+    
+    $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp
+    $ lm_dir=./icefall-librispeech-rnn-lm/exp
+    $ lm_scale=0.29
+    $ ./pruned_transducer_stateless7_streaming/decode.py \
+        --epoch 99 \
+        --avg 1 \
+        --use-averaged-model False \
+        --beam-size 4 \
+        --exp-dir $exp_dir \
+        --max-duration 600 \
+        --decode-chunk-len 32 \
+        --decoding-method modified_beam_search_lm_shallow_fusion \
+        --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \
+        --use-shallow-fusion 1 \
+        --lm-type rnn \
+        --lm-exp-dir $lm_dir \
+        --lm-epoch 99 \
+        --lm-scale $lm_scale \
+        --lm-avg 1 \
+        --rnn-lm-embedding-dim 2048 \
+        --rnn-lm-hidden-dim 2048 \
+        --rnn-lm-num-layers 3 \
+        --lm-vocab-size 500
+
+Note that we set ``--decoding-method modified_beam_search_lm_shallow_fusion`` and ``--use-shallow-fusion True``
+to use shallow fusion. ``--lm-type`` specifies the type of neural LM we are going to use, you can either choose
+between ``rnn`` or ``transformer``. The following three arguments are associated with the rnn:
+
+- ``--rnn-lm-embedding-dim``
+    The embedding dimension of the RNN LM
+
+- ``--rnn-lm-hidden-dim``
+    The hidden dimension of the RNN LM
+
+- ``--rnn-lm-num-layers``
+    The number of RNN layers in the RNN LM.
+
+
+The decoding result obtained with the above command are shown below.
+
+.. code-block:: text
+
+    $ For test-clean, WER of different settings are:
+    $ beam_size_4	2.77	best for test-clean
+    $ For test-other, WER of different settings are:
+    $ beam_size_4	7.08	best for test-other
+
+The improvement of shallow fusion is very obvious! The relative WER reduction on test-other is around 10.5%. 
+A few parameters can be tuned to further boost the performance of shallow fusion:
+
+- ``--lm-scale`` 
+
+    Controls the scale of the LM. If too small, the external language model may not be fully utilized; if too large, 
+    the LM score may dominant during decoding, leading to bad WER. A typical value of this is around 0.3.
+
+- ``--beam-size`` 
+    
+    The number of active paths in the search beam. It controls the trade-off between decoding efficiency and accuracy.
+
+Here, we also show how `--beam-size` effect the WER and decoding time:
+
+.. list-table:: WERs and decoding time (on test-clean) of shallow fusion with different beam sizes
+   :widths: 25 25 25 25
+   :header-rows: 1
+
+   * - Beam size
+     - test-clean
+     - test-other
+     - Decoding time on test-clean (s)
+   * - 4
+     - 2.77
+     - 7.08
+     - 262
+   * - 8
+     - 2.62
+     - 6.65
+     - 352
+   * - 12
+     - 2.58
+     - 6.65
+     - 488
+
+As we see, a larger beam size during shallow fusion improves the WER, but is also slower.
+
+
+
+
+
+
+
+ 
diff --git a/docs/source/docker/img/docker-hub.png b/docs/source/docker/img/docker-hub.png
new file mode 100644
index 000000000..a9e7715b0
Binary files /dev/null and b/docs/source/docker/img/docker-hub.png differ
diff --git a/docs/source/docker/index.rst b/docs/source/docker/index.rst
new file mode 100644
index 000000000..2c92a4cbc
--- /dev/null
+++ b/docs/source/docker/index.rst
@@ -0,0 +1,17 @@
+.. _icefall_docker:
+
+Docker
+======
+
+This section describes how to use pre-built docker images to run `icefall`_.
+
+.. hint::
+
+   If you only have CPUs available, you can still use the pre-built docker
+   images.
+
+.. toctree::
+   :maxdepth: 2
+
+   ./intro.rst
+
diff --git a/docs/source/docker/intro.rst b/docs/source/docker/intro.rst
new file mode 100644
index 000000000..b09247d85
--- /dev/null
+++ b/docs/source/docker/intro.rst
@@ -0,0 +1,171 @@
+Introduction
+=============
+
+We have pre-built docker images hosted at the following address:
+
+  `<https://hub.docker.com/repository/docker/k2fsa/icefall/general>`_
+
+.. figure:: img/docker-hub.png
+   :width: 600
+   :align: center
+
+You can find the ``Dockerfile`` at `<https://github.com/k2-fsa/icefall/tree/master/docker>`_.
+
+We describe the following items in this section:
+
+  - How to view available tags
+  - How to download pre-built docker images
+  - How to run the `yesno`_ recipe within a docker container on ``CPU``
+
+View available tags
+===================
+
+You can use the following command to view available tags:
+
+.. code-block:: bash
+
+   curl -s 'https://registry.hub.docker.com/v2/repositories/k2fsa/icefall/tags/'|jq '."results"[]["name"]'
+
+which will give you something like below:
+
+.. code-block:: bash
+
+  "torch2.0.0-cuda11.7"
+  "torch1.12.1-cuda11.3"
+  "torch1.9.0-cuda10.2"
+  "torch1.13.0-cuda11.6"
+
+.. hint::
+
+   Available tags will be updated when there are new releases of `torch`_.
+
+Please select an appropriate combination of `torch`_ and  CUDA.
+
+Download a docker image
+=======================
+
+Suppose that you select the tag ``torch1.13.0-cuda11.6``, you can use
+the following command to download it:
+
+.. code-block:: bash
+
+   sudo docker image pull k2fsa/icefall:torch1.13.0-cuda11.6
+
+Run a docker image with GPU
+===========================
+
+.. code-block:: bash
+
+  sudo docker run --gpus all --rm -it k2fsa/icefall:torch1.13.0-cuda11.6 /bin/bash
+
+Run a docker image with CPU
+===========================
+
+.. code-block:: bash
+
+  sudo docker run --rm -it k2fsa/icefall:torch1.13.0-cuda11.6 /bin/bash
+
+Run yesno within a docker container
+===================================
+
+After starting the container, the following interface is presented:
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall#
+
+It shows the current user is ``root`` and the current working directory
+is ``/workspace/icefall``.
+
+Update the code
+---------------
+
+Please first run:
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall# git pull
+
+so that your local copy contains the latest code.
+
+Data preparation
+----------------
+
+Now we can use
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall# cd egs/yesno/ASR/
+
+to switch to the ``yesno`` recipe and run
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./prepare.sh
+
+.. hint::
+
+   If you are running without GPU, it may report the following error:
+
+    .. code-block:: bash
+
+        File "/opt/conda/lib/python3.9/site-packages/k2/__init__.py", line 23, in <module>
+          from _k2 import DeterminizeWeightPushingType
+        ImportError: libcuda.so.1: cannot open shared object file: No such file or directory
+
+  We can use the following command to fix it:
+
+    .. code-block:: bash
+
+      root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ln -s /opt/conda/lib/stubs/libcuda.so /opt/conda/lib/stubs/libcuda.so.1
+
+The logs of running ``./prepare.sh`` are listed below:
+
+.. literalinclude:: ./log/log-preparation.txt
+
+Training
+--------
+
+After preparing the data, we can start training with the following command
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./tdnn/train.py
+
+All of the training logs are given below:
+
+.. hint::
+
+   It is running on CPU and it takes only 16 seconds for this run.
+
+.. literalinclude:: ./log/log-train-2023-08-01-01-55-27
+
+
+Decoding
+--------
+
+After training, we can decode the trained model with
+
+.. code-block:: bash
+
+  root@60c947eac59c:/workspace/icefall/egs/yesno/ASR# ./tdnn/decode.py
+
+The decoding logs are given below:
+
+.. code-block:: bash
+
+  2023-08-01 02:06:22,400 INFO [decode.py:263] Decoding started
+  2023-08-01 02:06:22,400 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d663.clean', 'torch-version': '1.13.0', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.9', 'icefall-git-branch': 'master', 'icefall-git-sha1': '375520d-clean', 'icefall-git-date': 'Fri Jul 28 07:43:08 2023', 'icefall-path': '/workspace/icefall', 'k2-path': '/opt/conda/lib/python3.9/site-packages/k2/__init__.py', 'lhotse-path': '/opt/conda/lib/python3.9/site-packages/lhotse/__init__.py', 'hostname': '60c947eac59c', 'IP address': '172.17.0.2'}}
+  2023-08-01 02:06:22,401 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+  2023-08-01 02:06:22,403 INFO [decode.py:273] device: cpu
+  2023-08-01 02:06:22,406 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+  2023-08-01 02:06:22,424 INFO [asr_datamodule.py:218] About to get test cuts
+  2023-08-01 02:06:22,425 INFO [asr_datamodule.py:252] About to get test cuts
+  2023-08-01 02:06:22,504 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
+  [W NNPACK.cpp:53] Could not initialize NNPACK! Reason: Unsupported hardware.
+  2023-08-01 02:06:22,687 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
+  2023-08-01 02:06:22,688 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
+  2023-08-01 02:06:22,690 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
+  2023-08-01 02:06:22,690 INFO [decode.py:316] Done!
+
+Congratulations! You have finished successfully running `icefall`_ within a docker container.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8d76eb68b..0fa8fdd1c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -21,9 +21,11 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
    :caption: Contents:
 
    installation/index
+   docker/index
    faqs
    model-export/index
 
+
 .. toctree::
    :maxdepth: 3
 
@@ -34,3 +36,8 @@ speech recognition recipes using `k2 <https://github.com/k2-fsa/k2>`_.
 
    contributing/index
    huggingface/index
+
+.. toctree::
+   :maxdepth: 2
+   
+   decoding-with-langugage-models/index
diff --git a/docs/source/installation/index.rst b/docs/source/installation/index.rst
index 738b24ab2..5a034ef5b 100644
--- a/docs/source/installation/index.rst
+++ b/docs/source/installation/index.rst
@@ -3,40 +3,28 @@
 Installation
 ============
 
+.. hint::
 
+   We also provide :ref:`icefall_docker` support, which has already setup
+   the environment for you.
 
-``icefall`` depends on `k2 <https://github.com/k2-fsa/k2>`_ and
-`lhotse <https://github.com/lhotse-speech/lhotse>`_.
+.. hint::
+
+  We have a colab notebook guiding you step by step to setup the environment.
+
+  |yesno colab notebook|
+
+  .. |yesno colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg
+     :target: https://colab.research.google.com/drive/1tIjjzaJc3IvGyKiMCDWO-TSnBgkcuN3B?usp=sharing
+
+`icefall`_ depends on `k2`_ and `lhotse`_.
 
 We recommend that you use the following steps to install the dependencies.
 
 - (0) Install CUDA toolkit and cuDNN
-- (1) Install PyTorch and torchaudio
-- (2) Install k2
-- (3) Install lhotse
-
-.. caution::
-
-   99% users who have issues about the installation are using conda.
-
-.. caution::
-
-   99% users who have issues about the installation are using conda.
-
-.. caution::
-
-   99% users who have issues about the installation are using conda.
-
-.. hint::
-
-   We suggest that you use ``pip install`` to install PyTorch.
-
-   You can use the following command to create a virutal environment in Python:
-
-    .. code-block:: bash
-
-        python3 -m venv ./my_env
-        source ./my_env/bin/activate
+- (1) Install `torch`_ and `torchaudio`_
+- (2) Install `k2`_
+- (3) Install `lhotse`_
 
 .. caution::
 
@@ -50,27 +38,20 @@ Please refer to
 to install CUDA and cuDNN.
 
 
-(1) Install PyTorch and torchaudio
-----------------------------------
+(1) Install torch and torchaudio
+--------------------------------
 
-Please refer `<https://pytorch.org/>`_ to install PyTorch
-and torchaudio.
-
-.. hint::
-
-   You can also go to  `<https://download.pytorch.org/whl/torch_stable.html>`_
-   to download pre-compiled wheels and install them.
+Please refer `<https://pytorch.org/>`_ to install `torch`_ and `torchaudio`_.
 
 .. caution::
 
    Please install torch and torchaudio at the same time.
 
-
 (2) Install k2
 --------------
 
 Please refer to `<https://k2-fsa.github.io/k2/installation/index.html>`_
-to install ``k2``.
+to install `k2`_.
 
 .. caution::
 
@@ -78,21 +59,18 @@ to install ``k2``.
 
 .. note::
 
-   We suggest that you install k2 from source by following
-   `<https://k2-fsa.github.io/k2/installation/from_source.html>`_
-   or
-   `<https://k2-fsa.github.io/k2/installation/for_developers.html>`_.
+   We suggest that you install k2 from pre-compiled wheels by following
+   `<https://k2-fsa.github.io/k2/installation/from_wheels.html>`_
 
 .. hint::
 
-   Please always install the latest version of k2.
+   Please always install the latest version of `k2`_.
 
 (3) Install lhotse
 ------------------
 
 Please refer to `<https://lhotse.readthedocs.io/en/latest/getting-started.html#installation>`_
-to install ``lhotse``.
-
+to install `lhotse`_.
 
 .. hint::
 
@@ -100,17 +78,16 @@ to install ``lhotse``.
 
       pip install git+https://github.com/lhotse-speech/lhotse
 
-    to install the latest version of lhotse.
+    to install the latest version of `lhotse`_.
 
 (4) Download icefall
 --------------------
 
-``icefall`` is a collection of Python scripts; what you need is to download it
+`icefall`_ is a collection of Python scripts; what you need is to download it
 and set the environment variable ``PYTHONPATH`` to point to it.
 
-Assume you want to place ``icefall`` in the folder ``/tmp``. The
-following commands show you how to setup ``icefall``:
-
+Assume you want to place `icefall`_ in the folder ``/tmp``. The
+following commands show you how to setup `icefall`_:
 
 .. code-block:: bash
 
@@ -122,285 +99,334 @@ following commands show you how to setup ``icefall``:
 
 .. HINT::
 
-  You can put several versions of ``icefall`` in the same virtual environment.
-  To switch among different versions of ``icefall``, just set ``PYTHONPATH``
+  You can put several versions of `icefall`_ in the same virtual environment.
+  To switch among different versions of `icefall`_, just set ``PYTHONPATH``
   to point to the version you want.
 
-
 Installation example
 --------------------
 
 The following shows an example about setting up the environment.
 
-
 (1) Create a virtual environment
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. code-block:: bash
 
-  $ virtualenv -p python3.8  test-icefall
+   kuangfangjun:~$ virtualenv -p python3.8 test-icefall
+   created virtual environment CPython3.8.0.final.0-64 in 9422ms
+     creator CPython3Posix(dest=/star-fj/fangjun/test-icefall, clear=False, no_vcs_ignore=False, global=False)
+     seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/star-fj/fangjun/.local/share/virtualenv)
+       added seed packages: pip==22.3.1, setuptools==65.6.3, wheel==0.38.4
+     activators BashActivator,CShellActivator,FishActivator,NushellActivator,PowerShellActivator,PythonActivator
 
-  created virtual environment CPython3.8.6.final.0-64 in 1540ms
-    creator CPython3Posix(dest=/ceph-fj/fangjun/test-icefall, clear=False, no_vcs_ignore=False, global=False)
-    seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/root/fangjun/.local/share/v
-  irtualenv)
-      added seed packages: pip==21.1.3, setuptools==57.4.0, wheel==0.36.2
-    activators BashActivator,CShellActivator,FishActivator,PowerShellActivator,PythonActivator,XonshActivator
+   kuangfangjun:~$ source test-icefall/bin/activate
 
+   (test-icefall) kuangfangjun:~$
 
-(2) Activate your virtual environment
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+(2) Install CUDA toolkit and cuDNN
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You need to determine the version of CUDA toolkit to install.
 
 .. code-block:: bash
 
-  $ source test-icefall/bin/activate
+   (test-icefall) kuangfangjun:~$ nvidia-smi | head -n 4
 
-(3) Install k2
+   Wed Jul 26 21:57:49 2023
+   +-----------------------------------------------------------------------------+
+   | NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
+   |-------------------------------+----------------------+----------------------+
+
+You can choose any CUDA version that is ``not`` greater than the version printed by ``nvidia-smi``.
+In our case, we can choose any version ``<= 11.6``.
+
+We will use ``CUDA 11.6`` in this example. Please follow
+`<https://k2-fsa.github.io/k2/installation/cuda-cudnn.html#cuda-11-6>`_
+to install CUDA toolkit and cuDNN if you have not done that before.
+
+After installing CUDA toolkit, you can use the following command to verify it:
+
+.. code-block:: bash
+
+  (test-icefall) kuangfangjun:~$ nvcc --version
+
+  nvcc: NVIDIA (R) Cuda compiler driver
+  Copyright (c) 2005-2019 NVIDIA Corporation
+  Built on Wed_Oct_23_19:24:38_PDT_2019
+  Cuda compilation tools, release 10.2, V10.2.89
+
+(3) Install torch and torchaudio
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since we have selected CUDA toolkit ``11.6``, we have to install a version of `torch`_
+that is compiled against CUDA ``11.6``. We select ``torch 1.13.0+cu116`` in this
+example.
+
+After selecting the version of `torch`_ to install, we need to also install
+a compatible version of `torchaudio`_, which is ``0.13.0+cu116`` in our case.
+
+Please refer to `<https://pytorch.org/audio/stable/installation.html#compatibility-matrix>`_
+to select an appropriate version of `torchaudio`_ to install if you use a different
+version of `torch`_.
+
+.. code-block:: bash
+
+  (test-icefall) kuangfangjun:~$ pip install torch==1.13.0+cu116 torchaudio==0.13.0+cu116 -f https://download.pytorch.org/whl/torch_stable.html
+
+  Looking in links: https://download.pytorch.org/whl/torch_stable.html
+  Collecting torch==1.13.0+cu116
+    Downloading https://download.pytorch.org/whl/cu116/torch-1.13.0%2Bcu116-cp38-cp38-linux_x86_64.whl (1983.0 MB)
+       ________________________________________ 2.0/2.0 GB 764.4 kB/s eta 0:00:00
+  Collecting torchaudio==0.13.0+cu116
+    Downloading https://download.pytorch.org/whl/cu116/torchaudio-0.13.0%2Bcu116-cp38-cp38-linux_x86_64.whl (4.2 MB)
+       ________________________________________ 4.2/4.2 MB 1.3 MB/s eta 0:00:00
+  Requirement already satisfied: typing-extensions in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from torch==1.13.0+cu116) (4.7.1)
+  Installing collected packages: torch, torchaudio
+  Successfully installed torch-1.13.0+cu116 torchaudio-0.13.0+cu116
+
+Verify that `torch`_ and `torchaudio`_ are successfully installed:
+
+.. code-block:: bash
+
+  (test-icefall) kuangfangjun:~$ python3 -c "import torch; print(torch.__version__)"
+
+  1.13.0+cu116
+
+  (test-icefall) kuangfangjun:~$ python3 -c "import torchaudio; print(torchaudio.__version__)"
+
+  0.13.0+cu116
+
+(4) Install k2
 ~~~~~~~~~~~~~~
 
+We will install `k2`_ from pre-compiled wheels by following
+`<https://k2-fsa.github.io/k2/installation/from_wheels.html>`_
+
 .. code-block:: bash
 
-  $ pip install k2==1.4.dev20210822+cpu.torch1.9.0 -f https://k2-fsa.org/nightly/index.html
+  (test-icefall) kuangfangjun:~$ pip install k2==1.24.3.dev20230725+cuda11.6.torch1.13.0 -f https://k2-fsa.github.io/k2/cuda.html
 
-  Looking in links: https://k2-fsa.org/nightly/index.html
-  Collecting k2==1.4.dev20210822+cpu.torch1.9.0
-    Downloading https://k2-fsa.org/nightly/whl/k2-1.4.dev20210822%2Bcpu.torch1.9.0-cp38-cp38-linux_x86_64.whl (1.6 MB)
-       |________________________________| 1.6 MB 185 kB/s
+  Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
+  Looking in links: https://k2-fsa.github.io/k2/cuda.html
+  Collecting k2==1.24.3.dev20230725+cuda11.6.torch1.13.0
+    Downloading https://huggingface.co/csukuangfj/k2/resolve/main/ubuntu-cuda/k2-1.24.3.dev20230725%2Bcuda11.6.torch1.13.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (104.3 MB)
+       ________________________________________ 104.3/104.3 MB 5.1 MB/s eta 0:00:00
+  Requirement already satisfied: torch==1.13.0 in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from k2==1.24.3.dev20230725+cuda11.6.torch1.13.0) (1.13.0+cu116)
   Collecting graphviz
-    Downloading graphviz-0.17-py3-none-any.whl (18 kB)
-  Collecting torch==1.9.0
-    Using cached torch-1.9.0-cp38-cp38-manylinux1_x86_64.whl (831.4 MB)
-  Collecting typing-extensions
-    Using cached typing_extensions-3.10.0.0-py3-none-any.whl (26 kB)
-  Installing collected packages: typing-extensions, torch, graphviz, k2
-  Successfully installed graphviz-0.17 k2-1.4.dev20210822+cpu.torch1.9.0 torch-1.9.0 typing-extensions-3.10.0.0
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/de/5e/fcbb22c68208d39edff467809d06c9d81d7d27426460ebc598e55130c1aa/graphviz-0.20.1-py3-none-any.whl (47 kB)
+  Requirement already satisfied: typing-extensions in /star-fj/fangjun/test-icefall/lib/python3.8/site-packages (from torch==1.13.0->k2==1.24.3.dev20230725+cuda11.6.torch1.13.0) (4.7.1)
+  Installing collected packages: graphviz, k2
+  Successfully installed graphviz-0.20.1 k2-1.24.3.dev20230725+cuda11.6.torch1.13.0
 
-.. WARNING::
+.. hint::
 
-  We choose to install a CPU version of k2 for testing. You would probably want to install
-  a CUDA version of k2.
+   Please refer to `<https://k2-fsa.github.io/k2/cuda.html>`_ for the available
+   pre-compiled wheels about `k2`_.
 
+Verify that `k2`_ has been installed successfully:
 
-(4) Install lhotse
+.. code-block:: bash
+
+  (test-icefall) kuangfangjun:~$ python3 -m k2.version
+
+  Collecting environment information...
+
+  k2 version: 1.24.3
+  Build type: Release
+  Git SHA1: 4c05309499a08454997adf500b56dcc629e35ae5
+  Git date: Tue Jul 25 16:23:36 2023
+  Cuda used to build k2: 11.6
+  cuDNN used to build k2: 8.3.2
+  Python version used to build k2: 3.8
+  OS used to build k2: CentOS Linux release 7.9.2009 (Core)
+  CMake version: 3.27.0
+  GCC version: 9.3.1
+  CMAKE_CUDA_FLAGS:  -Wno-deprecated-gpu-targets   -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_35,code=sm_35  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_50,code=sm_50  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_60,code=sm_60  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_61,code=sm_61  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_70,code=sm_70  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_75,code=sm_75  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_80,code=sm_80  -lineinfo --expt-extended-lambda -use_fast_math -Xptxas=-w  --expt-extended-lambda -gencode arch=compute_86,code=sm_86 -DONNX_NAMESPACE=onnx_c2 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_86,code=compute_86 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=integer_sign_change,--diag_suppress=useless_using_declaration,--diag_suppress=set_but_not_used,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=implicit_return_from_non_void_function,--diag_suppress=unsigned_compare_with_zero,--diag_suppress=declared_but_not_referenced,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda -D_GLIBCXX_USE_CXX11_ABI=0 --compiler-options -Wall  --compiler-options -Wno-strict-overflow  --compiler-options -Wno-unknown-pragmas
+  CMAKE_CXX_FLAGS:  -D_GLIBCXX_USE_CXX11_ABI=0 -Wno-unused-variable  -Wno-strict-overflow
+  PyTorch version used to build k2: 1.13.0+cu116
+  PyTorch is using Cuda: 11.6
+  NVTX enabled: True
+  With CUDA: True
+  Disable debug: True
+  Sync kernels : False
+  Disable checks: False
+  Max cpu memory allocate: 214748364800 bytes (or 200.0 GB)
+  k2 abort: False
+  __file__: /star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/version/version.py
+  _k2.__file__: /star-fj/fangjun/test-icefall/lib/python3.8/site-packages/_k2.cpython-38-x86_64-linux-gnu.so
+
+(5) Install lhotse
 ~~~~~~~~~~~~~~~~~~
 
-.. code-block::
+.. code-block:: bash
 
-  $ pip install git+https://github.com/lhotse-speech/lhotse
+  (test-icefall) kuangfangjun:~$ pip install git+https://github.com/lhotse-speech/lhotse
 
   Collecting git+https://github.com/lhotse-speech/lhotse
-    Cloning https://github.com/lhotse-speech/lhotse to /tmp/pip-req-build-7b1b76ge
-    Running command git clone -q https://github.com/lhotse-speech/lhotse /tmp/pip-req-build-7b1b76ge
-  Collecting audioread>=2.1.9
-    Using cached audioread-2.1.9-py3-none-any.whl
-  Collecting SoundFile>=0.10
-    Using cached SoundFile-0.10.3.post1-py2.py3-none-any.whl (21 kB)
-  Collecting click>=7.1.1
-    Using cached click-8.0.1-py3-none-any.whl (97 kB)
+    Cloning https://github.com/lhotse-speech/lhotse to /tmp/pip-req-build-vq12fd5i
+    Running command git clone --filter=blob:none --quiet https://github.com/lhotse-speech/lhotse /tmp/pip-req-build-vq12fd5i
+    Resolved https://github.com/lhotse-speech/lhotse to commit 7640d663469b22cd0b36f3246ee9b849cd25e3b7
+    Installing build dependencies ... done
+    Getting requirements to build wheel ... done
+    Preparing metadata (pyproject.toml) ... done
   Collecting cytoolz>=0.10.1
-    Using cached cytoolz-0.11.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.9 MB)
-  Collecting dataclasses
-    Using cached dataclasses-0.6-py3-none-any.whl (14 kB)
-  Collecting h5py>=2.10.0
-    Downloading h5py-3.4.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.5 MB)
-       |________________________________| 4.5 MB 684 kB/s
-  Collecting intervaltree>=3.1.0
-    Using cached intervaltree-3.1.0-py2.py3-none-any.whl
-  Collecting lilcom>=1.1.0
-    Using cached lilcom-1.1.1-cp38-cp38-linux_x86_64.whl
-  Collecting numpy>=1.18.1
-    Using cached numpy-1.21.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.8 MB)
-  Collecting packaging
-    Using cached packaging-21.0-py3-none-any.whl (40 kB)
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/1e/3b/a7828d575aa17fb7acaf1ced49a3655aa36dad7e16eb7e6a2e4df0dda76f/cytoolz-0.12.2-cp38-cp38-
+  manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
+       ________________________________________ 2.0/2.0 MB 33.2 MB/s eta 0:00:00
   Collecting pyyaml>=5.3.1
-    Using cached PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl (662 kB)
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c8/6b/6600ac24725c7388255b2f5add93f91e58a5d7efaf4af244fdbcc11a541b/PyYAML-6.0.1-cp38-cp38-ma
+  nylinux_2_17_x86_64.manylinux2014_x86_64.whl (736 kB)
+       ________________________________________ 736.6/736.6 kB 38.6 MB/s eta 0:00:00
+  Collecting dataclasses
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/26/2f/1095cdc2868052dd1e64520f7c0d5c8c550ad297e944e641dbf1ffbb9a5d/dataclasses-0.6-py3-none-
+  any.whl (14 kB)
+  Requirement already satisfied: torchaudio in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (0.13.0+cu116)
+  Collecting lilcom>=1.1.0
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a8/65/df0a69c52bd085ca1ad4e5c4c1a5c680e25f9477d8e49316c4ff1e5084a4/lilcom-1.7-cp38-cp38-many
+  linux_2_17_x86_64.manylinux2014_x86_64.whl (87 kB)
+       ________________________________________ 87.1/87.1 kB 8.7 MB/s eta 0:00:00
   Collecting tqdm
-    Downloading tqdm-4.62.1-py2.py3-none-any.whl (76 kB)
-       |________________________________| 76 kB 2.7 MB/s
-  Collecting torchaudio==0.9.0
-    Downloading torchaudio-0.9.0-cp38-cp38-manylinux1_x86_64.whl (1.9 MB)
-       |________________________________| 1.9 MB 73.1 MB/s
-  Requirement already satisfied: torch==1.9.0 in ./test-icefall/lib/python3.8/site-packages (from torchaudio==0.9.0->lhotse===0.8.0.dev
-  -2a1410b-clean) (1.9.0)
-  Requirement already satisfied: typing-extensions in ./test-icefall/lib/python3.8/site-packages (from torch==1.9.0->torchaudio==0.9.0-
-  >lhotse===0.8.0.dev-2a1410b-clean) (3.10.0.0)
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/e6/02/a2cff6306177ae6bc73bc0665065de51dfb3b9db7373e122e2735faf0d97/tqdm-4.65.0-py3-none-any
+  .whl (77 kB)
+  Requirement already satisfied: numpy>=1.18.1 in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (1.24.4)
+  Collecting audioread>=2.1.9
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/5d/cb/82a002441902dccbe427406785db07af10182245ee639ea9f4d92907c923/audioread-3.0.0.tar.gz (
+  377 kB)
+    Preparing metadata (setup.py) ... done
+  Collecting tabulate>=0.8.1
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-
+  any.whl (35 kB)
+  Collecting click>=7.1.1
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/1a/70/e63223f8116931d365993d4a6b7ef653a4d920b41d03de7c59499962821f/click-8.1.6-py3-none-any.
+  whl (97 kB)
+       ________________________________________ 97.9/97.9 kB 8.4 MB/s eta 0:00:00
+  Collecting packaging
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/ab/c3/57f0601a2d4fe15de7a553c00adbc901425661bf048f2a22dfc500caf121/packaging-23.1-py3-none-
+  any.whl (48 kB)
+  Collecting intervaltree>=3.1.0
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/50/fb/396d568039d21344639db96d940d40eb62befe704ef849b27949ded5c3bb/intervaltree-3.1.0.tar.gz
+   (32 kB)
+    Preparing metadata (setup.py) ... done
+  Requirement already satisfied: torch in ./test-icefall/lib/python3.8/site-packages (from lhotse==1.16.0.dev0+git.7640d66.clean) (1.13.0+cu116)
+  Collecting SoundFile>=0.10
+    Downloading https://pypi.tuna.tsinghua.edu.cn/packages/ad/bd/0602167a213d9184fc688b1086dc6d374b7ae8c33eccf169f9b50ce6568c/soundfile-0.12.1-py2.py3-
+  none-manylinux_2_17_x86_64.whl (1.3 MB)
+       ________________________________________ 1.3/1.3 MB 46.5 MB/s eta 0:00:00
   Collecting toolz>=0.8.0
-    Using cached toolz-0.11.1-py3-none-any.whl (55 kB)
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/7f/5c/922a3508f5bda2892be3df86c74f9cf1e01217c2b1f8a0ac4841d903e3e9/toolz-0.12.0-py3-none-any.whl (55 kB)
   Collecting sortedcontainers<3.0,>=2.0
-    Using cached sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl (29 kB)
   Collecting cffi>=1.0
-    Using cached cffi-1.14.6-cp38-cp38-manylinux1_x86_64.whl (411 kB)
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/b7/8b/06f30caa03b5b3ac006de4f93478dbd0239e2a16566d81a106c322dc4f79/cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (442 kB)
+  Requirement already satisfied: typing-extensions in ./test-icefall/lib/python3.8/site-packages (from torch->lhotse==1.16.0.dev0+git.7640d66.clean) (4.7.1)
   Collecting pycparser
-    Using cached pycparser-2.20-py2.py3-none-any.whl (112 kB)
-  Collecting pyparsing>=2.0.2
-    Using cached pyparsing-2.4.7-py2.py3-none-any.whl (67 kB)
-  Building wheels for collected packages: lhotse
-    Building wheel for lhotse (setup.py) ... done
-    Created wheel for lhotse: filename=lhotse-0.8.0.dev_2a1410b_clean-py3-none-any.whl size=342242 sha256=f683444afa4dc0881133206b4646a
-  9d0f774224cc84000f55d0a67f6e4a37997
-    Stored in directory: /tmp/pip-ephem-wheel-cache-ftu0qysz/wheels/7f/7a/8e/a0bf241336e2e3cb573e1e21e5600952d49f5162454f2e612f
-    WARNING: Built wheel for lhotse is invalid: Metadata 1.2 mandates PEP 440 version, but '0.8.0.dev-2a1410b-clean' is not
-  Failed to build lhotse
-  Installing collected packages: pycparser, toolz, sortedcontainers, pyparsing, numpy, cffi, tqdm, torchaudio, SoundFile, pyyaml, packa
-  ging, lilcom, intervaltree, h5py, dataclasses, cytoolz, click, audioread, lhotse
-      Running setup.py install for lhotse ... done
-    DEPRECATION: lhotse was installed using the legacy 'setup.py install' method, because a wheel could not be built for it. A possible
-   replacement is to fix the wheel build issue reported above. You can find discussion regarding this at https://github.com/pypa/pip/is
-  sues/8368.
-  Successfully installed SoundFile-0.10.3.post1 audioread-2.1.9 cffi-1.14.6 click-8.0.1 cytoolz-0.11.0 dataclasses-0.6 h5py-3.4.0 inter
-  valtree-3.1.0 lhotse-0.8.0.dev-2a1410b-clean lilcom-1.1.1 numpy-1.21.2 packaging-21.0 pycparser-2.20 pyparsing-2.4.7 pyyaml-5.4.1 sor
-  tedcontainers-2.4.0 toolz-0.11.1 torchaudio-0.9.0 tqdm-4.62.1
+    Using cached https://pypi.tuna.tsinghua.edu.cn/packages/62/d5/5f610ebe421e85889f2e55e33b7f9a6795bd982198517d912eb1c76e1a53/pycparser-2.21-py2.py3-none-any.whl (118 kB)
+  Building wheels for collected packages: lhotse, audioread, intervaltree
+    Building wheel for lhotse (pyproject.toml) ... done
+    Created wheel for lhotse: filename=lhotse-1.16.0.dev0+git.7640d66.clean-py3-none-any.whl size=687627 sha256=cbf0a4d2d0b639b33b91637a4175bc251d6a021a069644ecb1a9f2b3a83d072a
+    Stored in directory: /tmp/pip-ephem-wheel-cache-wwtk90_m/wheels/7f/7a/8e/a0bf241336e2e3cb573e1e21e5600952d49f5162454f2e612f
+    Building wheel for audioread (setup.py) ... done
+    Created wheel for audioread: filename=audioread-3.0.0-py3-none-any.whl size=23704 sha256=5e2d3537c96ce9cf0f645a654c671163707bf8cb8d9e358d0e2b0939a85ff4c2
+    Stored in directory: /star-fj/fangjun/.cache/pip/wheels/e2/c3/9c/f19ae5a03f8862d9f0776b0c0570f1fdd60a119d90954e3f39
+    Building wheel for intervaltree (setup.py) ... done
+    Created wheel for intervaltree: filename=intervaltree-3.1.0-py2.py3-none-any.whl size=26098 sha256=2604170976cfffe0d2f678cb1a6e5b525f561cd50babe53d631a186734fec9f9
+    Stored in directory: /star-fj/fangjun/.cache/pip/wheels/f3/ed/2b/c179ebfad4e15452d6baef59737f27beb9bfb442e0620f7271
+  Successfully built lhotse audioread intervaltree
+  Installing collected packages: sortedcontainers, dataclasses, tqdm, toolz, tabulate, pyyaml, pycparser, packaging, lilcom, intervaltree, click, audioread, cytoolz, cffi, SoundFile, lhotse
+  Successfully installed SoundFile-0.12.1 audioread-3.0.0 cffi-1.15.1 click-8.1.6 cytoolz-0.12.2 dataclasses-0.6 intervaltree-3.1.0 lhotse-1.16.0.dev0+git.7640d66.clean lilcom-1.7 packaging-23.1 pycparser-2.21 pyyaml-6.0.1 sortedcontainers-2.4.0 tabulate-0.9.0 toolz-0.12.0 tqdm-4.65.0
 
-(5) Download icefall
+
+Verify that `lhotse`_ has been installed successfully:
+
+.. code-block:: bash
+
+  (test-icefall) kuangfangjun:~$ python3 -c "import lhotse; print(lhotse.__version__)"
+
+  1.16.0.dev+git.7640d66.clean
+
+(6) Download icefall
 ~~~~~~~~~~~~~~~~~~~~
 
-.. code-block::
+.. code-block:: bash
 
-  $ cd /tmp
-  $ git clone https://github.com/k2-fsa/icefall
+  (test-icefall) kuangfangjun:~$ cd /tmp/
+
+  (test-icefall) kuangfangjun:tmp$ git clone https://github.com/k2-fsa/icefall
 
   Cloning into 'icefall'...
-  remote: Enumerating objects: 500, done.
-  remote: Counting objects: 100% (500/500), done.
-  remote: Compressing objects: 100% (308/308), done.
-  remote: Total 500 (delta 263), reused 307 (delta 102), pack-reused 0
-  Receiving objects: 100% (500/500), 172.49 KiB | 385.00 KiB/s, done.
-  Resolving deltas: 100% (263/263), done.
+  remote: Enumerating objects: 12942, done.
+  remote: Counting objects: 100% (67/67), done.
+  remote: Compressing objects: 100% (56/56), done.
+  remote: Total 12942 (delta 17), reused 35 (delta 6), pack-reused 12875
+  Receiving objects: 100% (12942/12942), 14.77 MiB | 9.29 MiB/s, done.
+  Resolving deltas: 100% (8835/8835), done.
 
-  $ cd icefall
-  $ pip install -r requirements.txt
-
-  Collecting kaldilm
-    Downloading kaldilm-1.8.tar.gz (48 kB)
-       |________________________________| 48 kB 574 kB/s
-  Collecting kaldialign
-    Using cached kaldialign-0.2-cp38-cp38-linux_x86_64.whl
-  Collecting sentencepiece>=0.1.96
-    Using cached sentencepiece-0.1.96-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
-  Collecting tensorboard
-    Using cached tensorboard-2.6.0-py3-none-any.whl (5.6 MB)
-  Requirement already satisfied: setuptools>=41.0.0 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r
-  requirements.txt (line 4)) (57.4.0)
-  Collecting absl-py>=0.4
-    Using cached absl_py-0.13.0-py3-none-any.whl (132 kB)
-  Collecting google-auth-oauthlib<0.5,>=0.4.1
-    Using cached google_auth_oauthlib-0.4.5-py2.py3-none-any.whl (18 kB)
-  Collecting grpcio>=1.24.3
-    Using cached grpcio-1.39.0-cp38-cp38-manylinux2014_x86_64.whl (4.3 MB)
-  Requirement already satisfied: wheel>=0.26 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r require
-  ments.txt (line 4)) (0.36.2)
-  Requirement already satisfied: numpy>=1.12.0 in /ceph-fj/fangjun/test-icefall/lib/python3.8/site-packages (from tensorboard->-r requi
-  rements.txt (line 4)) (1.21.2)
-  Collecting protobuf>=3.6.0
-    Using cached protobuf-3.17.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
-  Collecting werkzeug>=0.11.15
-    Using cached Werkzeug-2.0.1-py3-none-any.whl (288 kB)
-  Collecting tensorboard-data-server<0.7.0,>=0.6.0
-    Using cached tensorboard_data_server-0.6.1-py3-none-manylinux2010_x86_64.whl (4.9 MB)
-  Collecting google-auth<2,>=1.6.3
-    Downloading google_auth-1.35.0-py2.py3-none-any.whl (152 kB)
-       |________________________________| 152 kB 1.4 MB/s
-  Collecting requests<3,>=2.21.0
-    Using cached requests-2.26.0-py2.py3-none-any.whl (62 kB)
-  Collecting tensorboard-plugin-wit>=1.6.0
-    Using cached tensorboard_plugin_wit-1.8.0-py3-none-any.whl (781 kB)
-  Collecting markdown>=2.6.8
-    Using cached Markdown-3.3.4-py3-none-any.whl (97 kB)
-  Collecting six
-    Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
-  Collecting cachetools<5.0,>=2.0.0
-    Using cached cachetools-4.2.2-py3-none-any.whl (11 kB)
-  Collecting rsa<5,>=3.1.4
-    Using cached rsa-4.7.2-py3-none-any.whl (34 kB)
-  Collecting pyasn1-modules>=0.2.1
-    Using cached pyasn1_modules-0.2.8-py2.py3-none-any.whl (155 kB)
-  Collecting requests-oauthlib>=0.7.0
-    Using cached requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)
-  Collecting pyasn1<0.5.0,>=0.4.6
-    Using cached pyasn1-0.4.8-py2.py3-none-any.whl (77 kB)
-  Collecting urllib3<1.27,>=1.21.1
-    Using cached urllib3-1.26.6-py2.py3-none-any.whl (138 kB)
-  Collecting certifi>=2017.4.17
-    Using cached certifi-2021.5.30-py2.py3-none-any.whl (145 kB)
-  Collecting charset-normalizer~=2.0.0
-    Using cached charset_normalizer-2.0.4-py3-none-any.whl (36 kB)
-  Collecting idna<4,>=2.5
-    Using cached idna-3.2-py3-none-any.whl (59 kB)
-  Collecting oauthlib>=3.0.0
-    Using cached oauthlib-3.1.1-py2.py3-none-any.whl (146 kB)
-  Building wheels for collected packages: kaldilm
-    Building wheel for kaldilm (setup.py) ... done
-    Created wheel for kaldilm: filename=kaldilm-1.8-cp38-cp38-linux_x86_64.whl size=897233 sha256=eccb906cafcd45bf9a7e1a1718e4534254bfb
-  f4c0d0cbc66eee6c88d68a63862
-    Stored in directory: /root/fangjun/.cache/pip/wheels/85/7d/63/f2dd586369b8797cb36d213bf3a84a789eeb92db93d2e723c9
-  Successfully built kaldilm
-  Installing collected packages: urllib3, pyasn1, idna, charset-normalizer, certifi, six, rsa, requests, pyasn1-modules, oauthlib, cach
-  etools, requests-oauthlib, google-auth, werkzeug, tensorboard-plugin-wit, tensorboard-data-server, protobuf, markdown, grpcio, google
-  -auth-oauthlib, absl-py, tensorboard, sentencepiece, kaldilm, kaldialign
-  Successfully installed absl-py-0.13.0 cachetools-4.2.2 certifi-2021.5.30 charset-normalizer-2.0.4 google-auth-1.35.0 google-auth-oaut
-  hlib-0.4.5 grpcio-1.39.0 idna-3.2 kaldialign-0.2 kaldilm-1.8 markdown-3.3.4 oauthlib-3.1.1 protobuf-3.17.3 pyasn1-0.4.8 pyasn1-module
-  s-0.2.8 requests-2.26.0 requests-oauthlib-1.3.0 rsa-4.7.2 sentencepiece-0.1.96 six-1.16.0 tensorboard-2.6.0 tensorboard-data-server-0
-  .6.1 tensorboard-plugin-wit-1.8.0 urllib3-1.26.6 werkzeug-2.0.1
+  (test-icefall) kuangfangjun:tmp$ cd icefall/
 
+  (test-icefall) kuangfangjun:icefall$ pip install -r ./requirements.txt
 
 Test Your Installation
 ----------------------
 
 To test that your installation is successful, let us run
 the `yesno recipe <https://github.com/k2-fsa/icefall/tree/master/egs/yesno/ASR>`_
-on CPU.
+on ``CPU``.
 
 Data preparation
 ~~~~~~~~~~~~~~~~
 
 .. code-block:: bash
 
-  $ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
-  $ cd /tmp/icefall
-  $ cd egs/yesno/ASR
-  $ ./prepare.sh
+  (test-icefall) kuangfangjun:icefall$ export PYTHONPATH=/tmp/icefall:$PYTHONPATH
+
+  (test-icefall) kuangfangjun:icefall$ cd /tmp/icefall
+
+  (test-icefall) kuangfangjun:icefall$ cd egs/yesno/ASR
+
+  (test-icefall) kuangfangjun:ASR$ ./prepare.sh
+
 
 The log of running ``./prepare.sh`` is:
 
 .. code-block::
 
-   2023-05-12 17:55:21 (prepare.sh:27:main) dl_dir: /tmp/icefall/egs/yesno/ASR/download
-   2023-05-12 17:55:21 (prepare.sh:30:main) Stage 0: Download data
-   /tmp/icefall/egs/yesno/ASR/download/waves_yesno.tar.gz: 100%|_______________________________________________________________| 4.70M/4.70M [06:54<00:00, 11.4kB/s]
-   2023-05-12 18:02:19 (prepare.sh:39:main) Stage 1: Prepare yesno manifest
-   2023-05-12 18:02:21 (prepare.sh:45:main) Stage 2: Compute fbank for yesno
-   2023-05-12 18:02:23,199 INFO [compute_fbank_yesno.py:65] Processing train
-   Extracting and storing features: 100%|_______________________________________________________________| 90/90 [00:00<00:00, 212.60it/s]
-   2023-05-12 18:02:23,640 INFO [compute_fbank_yesno.py:65] Processing test
-   Extracting and storing features: 100%|_______________________________________________________________| 30/30 [00:00<00:00, 304.53it/s]
-   2023-05-12 18:02:24 (prepare.sh:51:main) Stage 3: Prepare lang
-   2023-05-12 18:02:26 (prepare.sh:66:main) Stage 4: Prepare G
-   /project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):79
-   [I] Reading \data\ section.
-   /project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):140
-   [I] Reading \1-grams: section.
-   2023-05-12 18:02:26 (prepare.sh:92:main) Stage 5: Compile HLG
-   2023-05-12 18:02:28,581 INFO [compile_hlg.py:124] Processing data/lang_phone
-   2023-05-12 18:02:28,582 INFO [lexicon.py:171] Converting L.pt to Linv.pt
-   2023-05-12 18:02:28,609 INFO [compile_hlg.py:48] Building ctc_topo. max_token_id: 3
-   2023-05-12 18:02:28,610 INFO [compile_hlg.py:52] Loading G.fst.txt
-   2023-05-12 18:02:28,611 INFO [compile_hlg.py:62] Intersecting L and G
-   2023-05-12 18:02:28,613 INFO [compile_hlg.py:64] LG shape: (4, None)
-   2023-05-12 18:02:28,613 INFO [compile_hlg.py:66] Connecting LG
-   2023-05-12 18:02:28,614 INFO [compile_hlg.py:68] LG shape after k2.connect: (4, None)
-   2023-05-12 18:02:28,614 INFO [compile_hlg.py:70] <class 'torch.Tensor'>
-   2023-05-12 18:02:28,614 INFO [compile_hlg.py:71] Determinizing LG
-   2023-05-12 18:02:28,615 INFO [compile_hlg.py:74] <class '_k2.ragged.RaggedTensor'>
-   2023-05-12 18:02:28,615 INFO [compile_hlg.py:76] Connecting LG after k2.determinize
-   2023-05-12 18:02:28,615 INFO [compile_hlg.py:79] Removing disambiguation symbols on LG
-   2023-05-12 18:02:28,616 INFO [compile_hlg.py:91] LG shape after k2.remove_epsilon: (6, None)
-   2023-05-12 18:02:28,617 INFO [compile_hlg.py:96] Arc sorting LG
-   2023-05-12 18:02:28,617 INFO [compile_hlg.py:99] Composing H and LG
-   2023-05-12 18:02:28,619 INFO [compile_hlg.py:106] Connecting LG
-   2023-05-12 18:02:28,619 INFO [compile_hlg.py:109] Arc sorting LG
-   2023-05-12 18:02:28,619 INFO [compile_hlg.py:111] HLG.shape: (8, None)
-   2023-05-12 18:02:28,619 INFO [compile_hlg.py:127] Saving HLG.pt to data/lang_phone
-
+  2023-07-27 12:41:39 (prepare.sh:27:main) dl_dir: /tmp/icefall/egs/yesno/ASR/download
+  2023-07-27 12:41:39 (prepare.sh:30:main) Stage 0: Download data
+  /tmp/icefall/egs/yesno/ASR/download/waves_yesno.tar.gz: 100%|___________________________________________________| 4.70M/4.70M [00:00<00:00, 11.1MB/s]
+  2023-07-27 12:41:46 (prepare.sh:39:main) Stage 1: Prepare yesno manifest
+  2023-07-27 12:41:50 (prepare.sh:45:main) Stage 2: Compute fbank for yesno
+  2023-07-27 12:41:55,718 INFO [compute_fbank_yesno.py:65] Processing train
+  Extracting and storing features: 100%|_______________________________________________________________________________| 90/90 [00:01<00:00, 87.82it/s]
+  2023-07-27 12:41:56,778 INFO [compute_fbank_yesno.py:65] Processing test
+  Extracting and storing features: 100%|______________________________________________________________________________| 30/30 [00:00<00:00, 256.92it/s]
+  2023-07-27 12:41:57 (prepare.sh:51:main) Stage 3: Prepare lang
+  2023-07-27 12:42:02 (prepare.sh:66:main) Stage 4: Prepare G
+  /project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):79
+  [I] Reading \data\ section.
+  /project/kaldilm/csrc/arpa_file_parser.cc:void kaldilm::ArpaFileParser::Read(std::istream&):140
+  [I] Reading \1-grams: section.
+  2023-07-27 12:42:02 (prepare.sh:92:main) Stage 5: Compile HLG
+  2023-07-27 12:42:07,275 INFO [compile_hlg.py:124] Processing data/lang_phone
+  2023-07-27 12:42:07,276 INFO [lexicon.py:171] Converting L.pt to Linv.pt
+  2023-07-27 12:42:07,309 INFO [compile_hlg.py:48] Building ctc_topo. max_token_id: 3
+  2023-07-27 12:42:07,310 INFO [compile_hlg.py:52] Loading G.fst.txt
+  2023-07-27 12:42:07,314 INFO [compile_hlg.py:62] Intersecting L and G
+  2023-07-27 12:42:07,323 INFO [compile_hlg.py:64] LG shape: (4, None)
+  2023-07-27 12:42:07,323 INFO [compile_hlg.py:66] Connecting LG
+  2023-07-27 12:42:07,323 INFO [compile_hlg.py:68] LG shape after k2.connect: (4, None)
+  2023-07-27 12:42:07,323 INFO [compile_hlg.py:70] <class 'torch.Tensor'>
+  2023-07-27 12:42:07,323 INFO [compile_hlg.py:71] Determinizing LG
+  2023-07-27 12:42:07,341 INFO [compile_hlg.py:74] <class '_k2.ragged.RaggedTensor'>
+  2023-07-27 12:42:07,341 INFO [compile_hlg.py:76] Connecting LG after k2.determinize
+  2023-07-27 12:42:07,341 INFO [compile_hlg.py:79] Removing disambiguation symbols on LG
+  2023-07-27 12:42:07,354 INFO [compile_hlg.py:91] LG shape after k2.remove_epsilon: (6, None)
+  2023-07-27 12:42:07,445 INFO [compile_hlg.py:96] Arc sorting LG
+  2023-07-27 12:42:07,445 INFO [compile_hlg.py:99] Composing H and LG
+  2023-07-27 12:42:07,446 INFO [compile_hlg.py:106] Connecting LG
+  2023-07-27 12:42:07,446 INFO [compile_hlg.py:109] Arc sorting LG
+  2023-07-27 12:42:07,447 INFO [compile_hlg.py:111] HLG.shape: (8, None)
+  2023-07-27 12:42:07,447 INFO [compile_hlg.py:127] Saving HLG.pt to data/lang_phone
 
 Training
 ~~~~~~~~
@@ -409,12 +435,13 @@ Now let us run the training part:
 
 .. code-block::
 
-  $ export CUDA_VISIBLE_DEVICES=""
-  $ ./tdnn/train.py
+  (test-icefall) kuangfangjun:ASR$ export CUDA_VISIBLE_DEVICES=""
+
+  (test-icefall) kuangfangjun:ASR$ ./tdnn/train.py
 
 .. CAUTION::
 
-  We use ``export CUDA_VISIBLE_DEVICES=""`` so that ``icefall`` uses CPU
+  We use ``export CUDA_VISIBLE_DEVICES=""`` so that `icefall`_ uses CPU
   even if there are GPUs available.
 
 .. hint::
@@ -432,53 +459,52 @@ The training log is given below:
 
 .. code-block::
 
-   2023-05-12 18:04:59,759 INFO [train.py:481] Training started
-   2023-05-12 18:04:59,759 INFO [train.py:482] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 
-   'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 
-   'reduction': 'sum', 'use_double_scores': True, 'world_size': 1, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'seed': 42, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0,
-   'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 
-   'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '3b7f09fa35e72589914f67089c0da9f196a92ca4', 'k2-git-date': 'Mon May 8 22:58:45 2023', 
-   'lhotse-version': '1.15.0.dev+git.6fcfced.clean', 'torch-version': '2.0.0+cu118', 'torch-cuda-available': False, 'torch-cuda-version': '11.8', 'python-version': '3.1', 'icefall-git-branch': 'master', 
-   'icefall-git-sha1': '30bde4b-clean', 'icefall-git-date': 'Thu May 11 17:37:47 2023', 'icefall-path': '/tmp/icefall', 
-   'k2-path': 'tmp/lib/python3.10/site-packages/k2-1.24.3.dev20230512+cuda11.8.torch2.0.0-py3.10-linux-x86_64.egg/k2/__init__.py', 
-   'lhotse-path': 'tmp/lib/python3.10/site-packages/lhotse/__init__.py', 'hostname': 'host', 'IP address': '0.0.0.0'}}
-   2023-05-12 18:04:59,761 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
-   2023-05-12 18:04:59,764 INFO [train.py:495] device: cpu
-   2023-05-12 18:04:59,791 INFO [asr_datamodule.py:146] About to get train cuts
-   2023-05-12 18:04:59,791 INFO [asr_datamodule.py:244] About to get train cuts
-   2023-05-12 18:04:59,852 INFO [asr_datamodule.py:149] About to create train dataset
-   2023-05-12 18:04:59,852 INFO [asr_datamodule.py:199] Using SingleCutSampler.
-   2023-05-12 18:04:59,852 INFO [asr_datamodule.py:205] About to create train dataloader
-   2023-05-12 18:04:59,853 INFO [asr_datamodule.py:218] About to get test cuts
-   2023-05-12 18:04:59,853 INFO [asr_datamodule.py:252] About to get test cuts
-   2023-05-12 18:04:59,986 INFO [train.py:422] Epoch 0, batch 0, loss[loss=1.065, over 2436.00 frames. ], tot_loss[loss=1.065, over 2436.00 frames. ], batch size: 4
-   2023-05-12 18:05:00,352 INFO [train.py:422] Epoch 0, batch 10, loss[loss=0.4561, over 2828.00 frames. ], tot_loss[loss=0.7076, over 22192.90 frames. ], batch size: 4
-   2023-05-12 18:05:00,691 INFO [train.py:444] Epoch 0, validation loss=0.9002, over 18067.00 frames.
-   2023-05-12 18:05:00,996 INFO [train.py:422] Epoch 0, batch 20, loss[loss=0.2555, over 2695.00 frames. ], tot_loss[loss=0.484, over 34971.47 frames. ], batch size: 5
-   2023-05-12 18:05:01,217 INFO [train.py:444] Epoch 0, validation loss=0.4688, over 18067.00 frames.
-   2023-05-12 18:05:01,251 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-0.pt
-   2023-05-12 18:05:01,389 INFO [train.py:422] Epoch 1, batch 0, loss[loss=0.2532, over 2436.00 frames. ], tot_loss[loss=0.2532, over 2436.00 frames. ], batch size: 4
-   2023-05-12 18:05:01,637 INFO [train.py:422] Epoch 1, batch 10, loss[loss=0.1139, over 2828.00 frames. ], tot_loss[loss=0.1592, over 22192.90 frames. ], batch size: 4
-   2023-05-12 18:05:01,859 INFO [train.py:444] Epoch 1, validation loss=0.1629, over 18067.00 frames.
-   2023-05-12 18:05:02,094 INFO [train.py:422] Epoch 1, batch 20, loss[loss=0.0767, over 2695.00 frames. ], tot_loss[loss=0.118, over 34971.47 frames. ], batch size: 5
-   2023-05-12 18:05:02,350 INFO [train.py:444] Epoch 1, validation loss=0.06778, over 18067.00 frames.
-   2023-05-12 18:05:02,395 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-1.pt
+    2023-07-27 12:50:51,936 INFO [train.py:481] Training started
+    2023-07-27 12:50:51,936 INFO [train.py:482] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lr': 0.01, 'feature_dim': 23, 'weight_decay': 1e-06, 'start_epoch': 0, 'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 10, 'reset_interval': 20, 'valid_interval': 10, 'beam_size': 10, 'reduction': 'sum', 'use_double_scores': True, 'world_size': 1, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 15, 'seed': 42, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d66.clean', 'torch-version': '1.13.0+cu116', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '3fb0a43-clean', 'icefall-git-date': 'Thu Jul 27 12:36:05 2023', 'icefall-path': '/tmp/icefall', 'k2-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-1-1220091118-57c4d55446-sph26', 'IP address': '10.177.77.20'}}
+    2023-07-27 12:50:51,941 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+    2023-07-27 12:50:51,949 INFO [train.py:495] device: cpu
+    2023-07-27 12:50:51,965 INFO [asr_datamodule.py:146] About to get train cuts
+    2023-07-27 12:50:51,965 INFO [asr_datamodule.py:244] About to get train cuts
+    2023-07-27 12:50:51,967 INFO [asr_datamodule.py:149] About to create train dataset
+    2023-07-27 12:50:51,967 INFO [asr_datamodule.py:199] Using SingleCutSampler.
+    2023-07-27 12:50:51,967 INFO [asr_datamodule.py:205] About to create train dataloader
+    2023-07-27 12:50:51,968 INFO [asr_datamodule.py:218] About to get test cuts
+    2023-07-27 12:50:51,968 INFO [asr_datamodule.py:252] About to get test cuts
+    2023-07-27 12:50:52,565 INFO [train.py:422] Epoch 0, batch 0, loss[loss=1.065, over 2436.00 frames. ], tot_loss[loss=1.065, over 2436.00 frames. ], batch size: 4
+    2023-07-27 12:50:53,681 INFO [train.py:422] Epoch 0, batch 10, loss[loss=0.4561, over 2828.00 frames. ], tot_loss[loss=0.7076, over 22192.90 frames.], batch size: 4
+    2023-07-27 12:50:54,167 INFO [train.py:444] Epoch 0, validation loss=0.9002, over 18067.00 frames.
+    2023-07-27 12:50:55,011 INFO [train.py:422] Epoch 0, batch 20, loss[loss=0.2555, over 2695.00 frames. ], tot_loss[loss=0.484, over 34971.47 frames. ], batch size: 5
+    2023-07-27 12:50:55,331 INFO [train.py:444] Epoch 0, validation loss=0.4688, over 18067.00 frames.
+    2023-07-27 12:50:55,368 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-0.pt
+    2023-07-27 12:50:55,633 INFO [train.py:422] Epoch 1, batch 0, loss[loss=0.2532, over 2436.00 frames. ], tot_loss[loss=0.2532, over 2436.00 frames. ],
+     batch size: 4
+    2023-07-27 12:50:56,242 INFO [train.py:422] Epoch 1, batch 10, loss[loss=0.1139, over 2828.00 frames. ], tot_loss[loss=0.1592, over 22192.90 frames.], batch size: 4
+    2023-07-27 12:50:56,522 INFO [train.py:444] Epoch 1, validation loss=0.1627, over 18067.00 frames.
+    2023-07-27 12:50:57,209 INFO [train.py:422] Epoch 1, batch 20, loss[loss=0.07055, over 2695.00 frames. ], tot_loss[loss=0.1175, over 34971.47 frames.], batch size: 5
+    2023-07-27 12:50:57,600 INFO [train.py:444] Epoch 1, validation loss=0.07091, over 18067.00 frames.
+    2023-07-27 12:50:57,640 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-1.pt
+    2023-07-27 12:50:57,847 INFO [train.py:422] Epoch 2, batch 0, loss[loss=0.07731, over 2436.00 frames. ], tot_loss[loss=0.07731, over 2436.00 frames.], batch size: 4
+    2023-07-27 12:50:58,427 INFO [train.py:422] Epoch 2, batch 10, loss[loss=0.04391, over 2828.00 frames. ], tot_loss[loss=0.05341, over 22192.90 frames. ], batch size: 4
+    2023-07-27 12:50:58,884 INFO [train.py:444] Epoch 2, validation loss=0.04384, over 18067.00 frames.
+    2023-07-27 12:50:59,387 INFO [train.py:422] Epoch 2, batch 20, loss[loss=0.03458, over 2695.00 frames. ], tot_loss[loss=0.04616, over 34971.47 frames. ], batch size: 5
+    2023-07-27 12:50:59,707 INFO [train.py:444] Epoch 2, validation loss=0.03379, over 18067.00 frames.
+    2023-07-27 12:50:59,758 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-2.pt
 
-  ... ...
+      ... ...
 
-   2023-05-12 18:05:14,789 INFO [train.py:422] Epoch 13, batch 0, loss[loss=0.01056, over 2436.00 frames. ], tot_loss[loss=0.01056, over 2436.00 frames. ], batch size: 4
-   2023-05-12 18:05:15,016 INFO [train.py:422] Epoch 13, batch 10, loss[loss=0.009022, over 2828.00 frames. ], tot_loss[loss=0.009985, over 22192.90 frames. ], batch size: 4
-   2023-05-12 18:05:15,271 INFO [train.py:444] Epoch 13, validation loss=0.01088, over 18067.00 frames.
-   2023-05-12 18:05:15,497 INFO [train.py:422] Epoch 13, batch 20, loss[loss=0.01174, over 2695.00 frames. ], tot_loss[loss=0.01077, over 34971.47 frames. ], batch size: 5
-   2023-05-12 18:05:15,747 INFO [train.py:444] Epoch 13, validation loss=0.01087, over 18067.00 frames.
-   2023-05-12 18:05:15,783 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-13.pt
-   2023-05-12 18:05:15,921 INFO [train.py:422] Epoch 14, batch 0, loss[loss=0.01045, over 2436.00 frames. ], tot_loss[loss=0.01045, over 2436.00 frames. ], batch size: 4
-   2023-05-12 18:05:16,146 INFO [train.py:422] Epoch 14, batch 10, loss[loss=0.008957, over 2828.00 frames. ], tot_loss[loss=0.009903, over 22192.90 frames. ], batch size: 4
-   2023-05-12 18:05:16,374 INFO [train.py:444] Epoch 14, validation loss=0.01092, over 18067.00 frames.
-   2023-05-12 18:05:16,598 INFO [train.py:422] Epoch 14, batch 20, loss[loss=0.01169, over 2695.00 frames. ], tot_loss[loss=0.01065, over 34971.47 frames. ], batch size: 5
-   2023-05-12 18:05:16,824 INFO [train.py:444] Epoch 14, validation loss=0.01077, over 18067.00 frames.
-   2023-05-12 18:05:16,862 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-14.pt
-   2023-05-12 18:05:16,865 INFO [train.py:555] Done!
+    2023-07-27 12:51:23,433 INFO [train.py:422] Epoch 13, batch 0, loss[loss=0.01054, over 2436.00 frames. ], tot_loss[loss=0.01054, over 2436.00 frames. ], batch size: 4
+    2023-07-27 12:51:23,980 INFO [train.py:422] Epoch 13, batch 10, loss[loss=0.009014, over 2828.00 frames. ], tot_loss[loss=0.009974, over 22192.90 frames. ], batch size: 4
+    2023-07-27 12:51:24,489 INFO [train.py:444] Epoch 13, validation loss=0.01085, over 18067.00 frames.
+    2023-07-27 12:51:25,258 INFO [train.py:422] Epoch 13, batch 20, loss[loss=0.01172, over 2695.00 frames. ], tot_loss[loss=0.01055, over 34971.47 frames. ], batch size: 5
+    2023-07-27 12:51:25,621 INFO [train.py:444] Epoch 13, validation loss=0.01074, over 18067.00 frames.
+    2023-07-27 12:51:25,699 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-13.pt
+    2023-07-27 12:51:25,866 INFO [train.py:422] Epoch 14, batch 0, loss[loss=0.01044, over 2436.00 frames. ], tot_loss[loss=0.01044, over 2436.00 frames. ], batch size: 4
+    2023-07-27 12:51:26,844 INFO [train.py:422] Epoch 14, batch 10, loss[loss=0.008942, over 2828.00 frames. ], tot_loss[loss=0.01, over 22192.90 frames. ], batch size: 4
+    2023-07-27 12:51:27,221 INFO [train.py:444] Epoch 14, validation loss=0.01082, over 18067.00 frames.
+    2023-07-27 12:51:27,970 INFO [train.py:422] Epoch 14, batch 20, loss[loss=0.01169, over 2695.00 frames. ], tot_loss[loss=0.01054, over 34971.47 frames. ], batch size: 5
+    2023-07-27 12:51:28,247 INFO [train.py:444] Epoch 14, validation loss=0.01073, over 18067.00 frames.
+    2023-07-27 12:51:28,323 INFO [checkpoint.py:75] Saving checkpoint to tdnn/exp/epoch-14.pt
+    2023-07-27 12:51:28,326 INFO [train.py:555] Done!
 
 Decoding
 ~~~~~~~~
@@ -487,42 +513,32 @@ Let us use the trained model to decode the test set:
 
 .. code-block::
 
-  $ ./tdnn/decode.py
+  (test-icefall) kuangfangjun:ASR$ ./tdnn/decode.py
 
-The decoding log is:
+  2023-07-27 12:55:12,840 INFO [decode.py:263] Decoding started
+  2023-07-27 12:55:12,840 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'), 'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '4c05309499a08454997adf500b56dcc629e35ae5', 'k2-git-date': 'Tue Jul 25 16:23:36 2023', 'lhotse-version': '1.16.0.dev+git.7640d66.clean', 'torch-version': '1.13.0+cu116', 'torch-cuda-available': False, 'torch-cuda-version': '11.6', 'python-version': '3.8', 'icefall-git-branch': 'master', 'icefall-git-sha1': '3fb0a43-clean', 'icefall-git-date': 'Thu Jul 27 12:36:05 2023', 'icefall-path': '/tmp/icefall', 'k2-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/k2/__init__.py', 'lhotse-path': '/star-fj/fangjun/test-icefall/lib/python3.8/site-packages/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-1-1220091118-57c4d55446-sph26', 'IP address': '10.177.77.20'}}
+  2023-07-27 12:55:12,841 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
+  2023-07-27 12:55:12,855 INFO [decode.py:273] device: cpu
+  2023-07-27 12:55:12,868 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
+  2023-07-27 12:55:12,882 INFO [asr_datamodule.py:218] About to get test cuts
+  2023-07-27 12:55:12,883 INFO [asr_datamodule.py:252] About to get test cuts
+  2023-07-27 12:55:13,157 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
+  2023-07-27 12:55:13,701 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
+  2023-07-27 12:55:13,702 INFO [utils.py:564] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
+  2023-07-27 12:55:13,704 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
+  2023-07-27 12:55:13,704 INFO [decode.py:316] Done!
 
-.. code-block::
 
-   2023-05-12 18:08:30,482 INFO [decode.py:263] Decoding started
-   2023-05-12 18:08:30,483 INFO [decode.py:264] {'exp_dir': PosixPath('tdnn/exp'), 'lang_dir': PosixPath('data/lang_phone'), 'lm_dir': PosixPath('data/lm'), 'feature_dim': 23, 
-   'search_beam': 20, 'output_beam': 8, 'min_active_states': 30, 'max_active_states': 10000, 'use_double_scores': True, 'epoch': 14, 'avg': 2, 'export': False, 'feature_dir': PosixPath('data/fbank'), 
-   'max_duration': 30.0, 'bucketing_sampler': False, 'num_buckets': 10, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': False, 'return_cuts': True, 
-   'num_workers': 2, 'env_info': {'k2-version': '1.24.3', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '3b7f09fa35e72589914f67089c0da9f196a92ca4', 'k2-git-date': 'Mon May 8 22:58:45 2023', 
-   'lhotse-version': '1.15.0.dev+git.6fcfced.clean', 'torch-version': '2.0.0+cu118', 'torch-cuda-available': False, 'torch-cuda-version': '11.8', 'python-version': '3.1', 'icefall-git-branch': 'master', 
-   'icefall-git-sha1': '30bde4b-clean', 'icefall-git-date': 'Thu May 11 17:37:47 2023', 'icefall-path': '/tmp/icefall', 
-   'k2-path': '/tmp/lib/python3.10/site-packages/k2-1.24.3.dev20230512+cuda11.8.torch2.0.0-py3.10-linux-x86_64.egg/k2/__init__.py', 
-   'lhotse-path': '/tmp/lib/python3.10/site-packages/lhotse/__init__.py', 'hostname': 'host', 'IP address': '0.0.0.0'}}
-   2023-05-12 18:08:30,483 INFO [lexicon.py:168] Loading pre-compiled data/lang_phone/Linv.pt
-   2023-05-12 18:08:30,487 INFO [decode.py:273] device: cpu
-   2023-05-12 18:08:30,513 INFO [decode.py:291] averaging ['tdnn/exp/epoch-13.pt', 'tdnn/exp/epoch-14.pt']
-   2023-05-12 18:08:30,521 INFO [asr_datamodule.py:218] About to get test cuts
-   2023-05-12 18:08:30,521 INFO [asr_datamodule.py:252] About to get test cuts
-   2023-05-12 18:08:30,675 INFO [decode.py:204] batch 0/?, cuts processed until now is 4
-   2023-05-12 18:08:30,923 INFO [decode.py:241] The transcripts are stored in tdnn/exp/recogs-test_set.txt
-   2023-05-12 18:08:30,924 INFO [utils.py:558] [test_set] %WER 0.42% [1 / 240, 0 ins, 1 del, 0 sub ]
-   2023-05-12 18:08:30,925 INFO [decode.py:249] Wrote detailed error stats to tdnn/exp/errs-test_set.txt
-   2023-05-12 18:08:30,925 INFO [decode.py:316] Done!
-
-**Congratulations!** You have successfully setup the environment and have run the first recipe in ``icefall``.
+**Congratulations!** You have successfully setup the environment and have run the first recipe in `icefall`_.
 
 Have fun with ``icefall``!
 
 YouTube Video
 -------------
 
-We provide the following YouTube video showing how to install ``icefall``.
+We provide the following YouTube video showing how to install `icefall`_.
 It also shows how to debug various problems that you may encounter while
-using ``icefall``.
+using `icefall`_.
 
 .. note::
 
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
index ea9f350cd..2e8d0893a 100644
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst
@@ -1,7 +1,7 @@
 Distillation with HuBERT
 ========================
 
-This tutorial shows you how to perform knowledge distillation in `icefall`_
+This tutorial shows you how to perform knowledge distillation in `icefall <https://github.com/k2-fsa/icefall>`_
 with the `LibriSpeech`_ dataset. The distillation method
 used here is called "Multi Vector Quantization Knowledge Distillation" (MVQ-KD).
 Please have a look at our paper `Predicting Multi-Codebook Vector Quantization Indexes for Knowledge Distillation <https://arxiv.org/abs/2211.00508>`_
@@ -13,7 +13,7 @@ for more details about MVQ-KD.
     `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_.
     Currently, we only implement MVQ-KD in this recipe. However, MVQ-KD is theoretically applicable to all recipes
     with only minor changes needed. Feel free to try out MVQ-KD in different recipes. If you
-    encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`_.
+    encounter any problems, please open an issue here `icefall <https://github.com/k2-fsa/icefall/issues>`__.
 
 .. note::
 
@@ -217,7 +217,7 @@ the following command.
     --exp-dir $exp_dir \
     --enable-distillation True
 
-You should get similar results as `here <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS-100hours.md#distillation-with-hubert>`_.
+You should get similar results as `here <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS-100hours.md#distillation-with-hubert>`__.
 
 That's all! Feel free to experiment with your own setups and report your results.
-If you encounter any problems during training, please open up an issue `here <https://github.com/k2-fsa/icefall/issues>`_.
+If you encounter any problems during training, please open up an issue `here <https://github.com/k2-fsa/icefall/issues>`__.
diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
index 42fd3df77..1bc1dd984 100644
--- a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
+++ b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst
@@ -8,10 +8,10 @@ with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 
 .. Note::
 
-   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
-   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
-   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
-   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_,
+   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`__,
+   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`__,
+   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`__,
+   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`__,
    We will take pruned_transducer_stateless4 as an example in this tutorial.
 
 .. HINT::
@@ -237,7 +237,7 @@ them, please modify ``./pruned_transducer_stateless4/train.py`` directly.
 
 .. NOTE::
 
-  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`_ are a little different from
+  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`__ are a little different from
   other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from commandline, so that you can train models with different size with pruned_transducer_stateless5.
 
 
@@ -529,13 +529,13 @@ Download pretrained models
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
 
-  - `pruned_transducer_stateless <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12>`_
+  - `pruned_transducer_stateless <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12>`__
 
-  - `pruned_transducer_stateless2 <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29>`_
+  - `pruned_transducer_stateless2 <https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless2-2022-04-29>`__
 
-  - `pruned_transducer_stateless4 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless4-2022-06-03>`_
+  - `pruned_transducer_stateless4 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless4-2022-06-03>`__
 
-  - `pruned_transducer_stateless5 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless5-2022-07-07>`_
+  - `pruned_transducer_stateless5 <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless5-2022-07-07>`__
 
   See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
   for the details of the above pretrained models
diff --git a/docs/source/recipes/Streaming-ASR/introduction.rst b/docs/source/recipes/Streaming-ASR/introduction.rst
index e1382e77d..ac77a51d1 100644
--- a/docs/source/recipes/Streaming-ASR/introduction.rst
+++ b/docs/source/recipes/Streaming-ASR/introduction.rst
@@ -45,9 +45,9 @@ the input features.
 
 We have three variants of Emformer models in ``icefall``.
 
- - ``pruned_stateless_emformer_rnnt2`` using Emformer from torchaudio, see `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2>`_.
+ - ``pruned_stateless_emformer_rnnt2`` using Emformer from torchaudio, see `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2>`__.
  - ``conv_emformer_transducer_stateless`` using ConvEmformer implemented by ourself. Different from the Emformer in torchaudio,
    ConvEmformer has a convolution in each layer and uses the mechanisms in our reworked conformer model.
-   See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless>`_.
+   See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless>`__.
  - ``conv_emformer_transducer_stateless2`` using ConvEmformer implemented by ourself. The only difference from the above one is that
    it uses a simplified memory bank. See `LibriSpeech recipe <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/conv_emformer_transducer_stateless2>`_.
diff --git a/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst b/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst
index de7102ba8..2ca70bcf3 100644
--- a/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst
@@ -6,10 +6,10 @@ with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 
 .. Note::
 
-   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`_,
-   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`_,
-   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`_,
-   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`_,
+   The tutorial is suitable for `pruned_transducer_stateless <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless>`__,
+   `pruned_transducer_stateless2 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless2>`__,
+   `pruned_transducer_stateless4 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless4>`__,
+   `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless5>`__,
    We will take pruned_transducer_stateless4 as an example in this tutorial.
 
 .. HINT::
@@ -264,7 +264,7 @@ them, please modify ``./pruned_transducer_stateless4/train.py`` directly.
 
 .. NOTE::
 
-  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`_ are a little different from
+  The options for `pruned_transducer_stateless5 <https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/pruned_transducer_stateless5/train.py>`__ are a little different from
   other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from commandline, so that you can train models with different size with pruned_transducer_stateless5.
 
 
diff --git a/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst b/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst
index f0e8961d7..8b75473c6 100644
--- a/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst
+++ b/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst
@@ -6,7 +6,7 @@ with the `LibriSpeech <https://www.openslr.org/12>`_ dataset.
 
 .. Note::
 
-   The tutorial is suitable for `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`_,
+   The tutorial is suitable for `pruned_transducer_stateless7_streaming <https://github.com/k2-fsa/icefall/tree/master/egs/librispeech/ASR/pruned_transducer_stateless7_streaming>`__,
 
 .. HINT::
 
@@ -642,7 +642,7 @@ Download pretrained models
 If you don't want to train from scratch, you can download the pretrained models
 by visiting the following links:
 
-  - `pruned_transducer_stateless7_streaming <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`_
+  - `pruned_transducer_stateless7_streaming <https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29>`__
 
   See `<https://github.com/k2-fsa/icefall/blob/master/egs/librispeech/ASR/RESULTS.md>`_
   for the details of the above pretrained models
diff --git a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
index 387c14acf..9caacb78b 100755
--- a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
+++ b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
@@ -32,7 +32,7 @@ import torch
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
-from icefall.utils import get_executor
+from icefall.utils import get_executor, str2bool
 
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
@@ -42,7 +42,7 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 
 
-def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
+def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80, perturb_speed: bool = False):
     src_dir = Path("data/manifests/aidatatang_200zh")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
@@ -85,7 +85,8 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
                 recordings=m["recordings"],
                 supervisions=m["supervisions"],
             )
-            if "train" in partition:
+            if "train" in partition and perturb_speed:
+                logging.info(f"Doing speed perturb")
                 cut_set = (
                     cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
@@ -109,7 +110,12 @@ def get_args():
         default=80,
         help="""The number of mel bins for Fbank""",
     )
-
+    parser.add_argument(
+        "--perturb-speed",
+        type=str2bool,
+        default=False,
+        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
+    )
     return parser.parse_args()
 
 
@@ -119,4 +125,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     args = get_args()
-    compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins)
+    compute_fbank_aidatatang_200zh(
+        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+    )
diff --git a/egs/aidatatang_200zh/ASR/prepare.sh b/egs/aidatatang_200zh/ASR/prepare.sh
index 46ecd5769..2eb0b3718 100755
--- a/egs/aidatatang_200zh/ASR/prepare.sh
+++ b/egs/aidatatang_200zh/ASR/prepare.sh
@@ -77,7 +77,7 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
   log "Stage 4: Compute fbank for aidatatang_200zh"
   if [ ! -f data/fbank/.aidatatang_200zh.done ]; then
     mkdir -p data/fbank
-    ./local/compute_fbank_aidatatang_200zh.py
+    ./local/compute_fbank_aidatatang_200zh.py --perturb-speed True
     touch data/fbank/.aidatatang_200zh.done
   fi
 fi
diff --git a/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py b/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py
index 037971927..6a9bb4f42 100755
--- a/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py
+++ b/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py
@@ -32,7 +32,7 @@ import torch
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
-from icefall.utils import get_executor
+from icefall.utils import get_executor, str2bool
 
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
@@ -42,7 +42,7 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 
 
-def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
+def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80, perturb_speed: bool = False):
     src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
@@ -85,7 +85,8 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
                 recordings=m["recordings"],
                 supervisions=m["supervisions"],
             )
-            if "train" in partition:
+            if "train" in partition and perturb_speed:
+                logging.info(f"Doing speed perturb")
                 cut_set = (
                     cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
@@ -109,7 +110,12 @@ def get_args():
         default=80,
         help="""The number of mel bins for Fbank""",
     )
-
+    parser.add_argument(
+        "--perturb-speed",
+        type=str2bool,
+        default=False,
+        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
+    )
     return parser.parse_args()
 
 
@@ -119,4 +125,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     args = get_args()
-    compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins)
+    compute_fbank_aidatatang_200zh(
+        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+    )
diff --git a/egs/aishell/ASR/local/compute_fbank_aishell.py b/egs/aishell/ASR/local/compute_fbank_aishell.py
index 115ca1031..c7000da1c 100755
--- a/egs/aishell/ASR/local/compute_fbank_aishell.py
+++ b/egs/aishell/ASR/local/compute_fbank_aishell.py
@@ -32,7 +32,7 @@ import torch
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
-from icefall.utils import get_executor
+from icefall.utils import get_executor, str2bool
 
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
@@ -42,7 +42,7 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 
 
-def compute_fbank_aishell(num_mel_bins: int = 80):
+def compute_fbank_aishell(num_mel_bins: int = 80, perturb_speed: bool = False):
     src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
@@ -81,7 +81,8 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
                 recordings=m["recordings"],
                 supervisions=m["supervisions"],
             )
-            if "train" in partition:
+            if "train" in partition and perturb_speed:
+                logging.info(f"Doing speed perturb")
                 cut_set = (
                     cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
@@ -104,7 +105,12 @@ def get_args():
         default=80,
         help="""The number of mel bins for Fbank""",
     )
-
+    parser.add_argument(
+        "--perturb-speed",
+        type=str2bool,
+        default=False,
+        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
+    )
     return parser.parse_args()
 
 
@@ -114,4 +120,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     args = get_args()
-    compute_fbank_aishell(num_mel_bins=args.num_mel_bins)
+    compute_fbank_aishell(
+        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+    )
diff --git a/egs/aishell/ASR/prepare.sh b/egs/aishell/ASR/prepare.sh
index b763d72c1..ff8e1301d 100755
--- a/egs/aishell/ASR/prepare.sh
+++ b/egs/aishell/ASR/prepare.sh
@@ -114,7 +114,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Compute fbank for aishell"
   if [ ! -f data/fbank/.aishell.done ]; then
     mkdir -p data/fbank
-    ./local/compute_fbank_aishell.py
+    ./local/compute_fbank_aishell.py --perturb-speed True
     touch data/fbank/.aishell.done
   fi
 fi
diff --git a/egs/aishell/ASR/prepare_aidatatang_200zh.sh b/egs/aishell/ASR/prepare_aidatatang_200zh.sh
index f1d4d18a7..ec89450df 100755
--- a/egs/aishell/ASR/prepare_aidatatang_200zh.sh
+++ b/egs/aishell/ASR/prepare_aidatatang_200zh.sh
@@ -53,7 +53,7 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Process aidatatang_200zh"
   if [ ! -f data/fbank/.aidatatang_200zh_fbank.done ]; then
     mkdir -p data/fbank
-    ./local/compute_fbank_aidatatang_200zh.py
+    ./local/compute_fbank_aidatatang_200zh.py --perturb-speed True
     touch data/fbank/.aidatatang_200zh_fbank.done
   fi
 fi
diff --git a/egs/aishell/ASR/pruned_transducer_stateless7/train.py b/egs/aishell/ASR/pruned_transducer_stateless7/train.py
index ef536c035..cbb7db086 100755
--- a/egs/aishell/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7/train.py
@@ -240,7 +240,7 @@ def get_parser():
     parser.add_argument(
         "--exp-dir",
         type=str,
-        default="pruned_transducer_stateless3/exp",
+        default="pruned_transducer_stateless7/exp",
         help="""The experiment dir.
         It specifies the directory where all training related
         files, e.g., checkpoints, log, etc, are saved
diff --git a/egs/aishell/ASR/pruned_transducer_stateless7/train2.py b/egs/aishell/ASR/pruned_transducer_stateless7/train2.py
index fb35a6c95..c30f6f960 100755
--- a/egs/aishell/ASR/pruned_transducer_stateless7/train2.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7/train2.py
@@ -243,7 +243,7 @@ def get_parser():
     parser.add_argument(
         "--exp-dir",
         type=str,
-        default="pruned_transducer_stateless3/exp",
+        default="pruned_transducer_stateless7/exp",
         help="""The experiment dir.
         It specifies the directory where all training related
         files, e.g., checkpoints, log, etc, are saved
diff --git a/egs/aishell2/ASR/local/compute_fbank_aishell2.py b/egs/aishell2/ASR/local/compute_fbank_aishell2.py
index ec0c584ca..1fb1621ff 100755
--- a/egs/aishell2/ASR/local/compute_fbank_aishell2.py
+++ b/egs/aishell2/ASR/local/compute_fbank_aishell2.py
@@ -32,7 +32,7 @@ import torch
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
-from icefall.utils import get_executor
+from icefall.utils import get_executor, str2bool
 
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
@@ -42,7 +42,7 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 
 
-def compute_fbank_aishell2(num_mel_bins: int = 80):
+def compute_fbank_aishell2(num_mel_bins: int = 80, perturb_speed: bool = False):
     src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
@@ -81,7 +81,8 @@ def compute_fbank_aishell2(num_mel_bins: int = 80):
                 recordings=m["recordings"],
                 supervisions=m["supervisions"],
             )
-            if "train" in partition:
+            if "train" in partition and perturb_speed:
+                logging.info(f"Doing speed perturb")
                 cut_set = (
                     cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
@@ -104,6 +105,12 @@ def get_args():
         default=80,
         help="""The number of mel bins for Fbank""",
     )
+    parser.add_argument(
+        "--perturb-speed",
+        type=str2bool,
+        default=False,
+        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
+    )
 
     return parser.parse_args()
 
@@ -114,4 +121,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     args = get_args()
-    compute_fbank_aishell2(num_mel_bins=args.num_mel_bins)
+    compute_fbank_aishell2(
+        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+    )
diff --git a/egs/aishell2/ASR/prepare.sh b/egs/aishell2/ASR/prepare.sh
index 3e8e840ab..42631c864 100755
--- a/egs/aishell2/ASR/prepare.sh
+++ b/egs/aishell2/ASR/prepare.sh
@@ -101,7 +101,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Compute fbank for aishell2"
   if [ ! -f data/fbank/.aishell2.done ]; then
     mkdir -p data/fbank
-    ./local/compute_fbank_aishell2.py
+    ./local/compute_fbank_aishell2.py --perturb-speed True
     touch data/fbank/.aishell2.done
   fi
 fi
diff --git a/egs/aishell4/ASR/local/compute_fbank_aishell4.py b/egs/aishell4/ASR/local/compute_fbank_aishell4.py
index 400c406f0..f19163988 100755
--- a/egs/aishell4/ASR/local/compute_fbank_aishell4.py
+++ b/egs/aishell4/ASR/local/compute_fbank_aishell4.py
@@ -32,7 +32,7 @@ import torch
 from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
 from lhotse.recipes.utils import read_manifests_if_cached
 
-from icefall.utils import get_executor
+from icefall.utils import get_executor, str2bool
 
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
@@ -42,7 +42,7 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 
 
-def compute_fbank_aishell4(num_mel_bins: int = 80):
+def compute_fbank_aishell4(num_mel_bins: int = 80, perturb_speed: bool = False):
     src_dir = Path("data/manifests/aishell4")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
@@ -83,10 +83,12 @@ def compute_fbank_aishell4(num_mel_bins: int = 80):
                 recordings=m["recordings"],
                 supervisions=m["supervisions"],
             )
-            if "train" in partition:
+            if "train" in partition and perturb_speed:
+                logging.info(f"Doing speed perturb")
                 cut_set = (
                     cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
+
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
                 storage_path=f"{output_dir}/{prefix}_feats_{partition}",
@@ -113,6 +115,12 @@ def get_args():
         default=80,
         help="""The number of mel bins for Fbank""",
     )
+    parser.add_argument(
+        "--perturb-speed",
+        type=str2bool,
+        default=False,
+        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
+    )
 
     return parser.parse_args()
 
@@ -123,4 +131,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     args = get_args()
-    compute_fbank_aishell4(num_mel_bins=args.num_mel_bins)
+    compute_fbank_aishell4(
+        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+    )
diff --git a/egs/aishell4/ASR/prepare.sh b/egs/aishell4/ASR/prepare.sh
index cb2b73a3e..1b1ec0005 100755
--- a/egs/aishell4/ASR/prepare.sh
+++ b/egs/aishell4/ASR/prepare.sh
@@ -107,7 +107,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Compute fbank for aishell4"
   if [ ! -f data/fbank/.aishell4.done ]; then
     mkdir -p data/fbank
-    ./local/compute_fbank_aishell4.py
+    ./local/compute_fbank_aishell4.py --perturb-speed True
     touch data/fbank/.aishell4.done
   fi
 fi
diff --git a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
index 96115a230..f8c10648a 100755
--- a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
+++ b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
@@ -32,7 +32,7 @@ import torch
 from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
-from icefall.utils import get_executor
+from icefall.utils import get_executor, str2bool
 
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
@@ -42,7 +42,7 @@ torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 
 
-def compute_fbank_alimeeting(num_mel_bins: int = 80):
+def compute_fbank_alimeeting(num_mel_bins: int = 80, perturb_speed: bool = False):
     src_dir = Path("data/manifests/alimeeting")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
@@ -82,7 +82,8 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
                 recordings=m["recordings"],
                 supervisions=m["supervisions"],
             )
-            if "train" in partition:
+            if "train" in partition and perturb_speed:
+                logging.info(f"Doing speed perturb")
                 cut_set = (
                     cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
                 )
@@ -114,6 +115,12 @@ def get_args():
         default=80,
         help="""The number of mel bins for Fbank""",
     )
+    parser.add_argument(
+        "--perturb-speed",
+        type=str2bool,
+        default=False,
+        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
+    )
 
     return parser.parse_args()
 
@@ -124,4 +131,6 @@ if __name__ == "__main__":
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     args = get_args()
-    compute_fbank_alimeeting(num_mel_bins=args.num_mel_bins)
+    compute_fbank_alimeeting(
+        num_mel_bins=args.num_mel_bins, perturb_speed=args.perturb_speed
+    )
diff --git a/egs/alimeeting/ASR/prepare.sh b/egs/alimeeting/ASR/prepare.sh
index 604cc92c6..1709733c7 100755
--- a/egs/alimeeting/ASR/prepare.sh
+++ b/egs/alimeeting/ASR/prepare.sh
@@ -97,7 +97,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Compute fbank for alimeeting"
   if [ ! -f data/fbank/.alimeeting.done ]; then
     mkdir -p data/fbank
-    ./local/compute_fbank_alimeeting.py
+    ./local/compute_fbank_alimeeting.py --perturb-speed True
     touch data/fbank/.alimeeting.done
   fi
 fi
diff --git a/egs/alimeeting/ASR_v2/local/compute_fbank_alimeeting.py b/egs/alimeeting/ASR_v2/local/compute_fbank_alimeeting.py
index c6aa2ab36..833d11c72 100755
--- a/egs/alimeeting/ASR_v2/local/compute_fbank_alimeeting.py
+++ b/egs/alimeeting/ASR_v2/local/compute_fbank_alimeeting.py
@@ -25,6 +25,7 @@ It looks for manifests in the directory data/manifests.
 
 The generated fbank features are saved in data/fbank.
 """
+import argparse
 import logging
 from pathlib import Path
 
@@ -39,6 +40,8 @@ from lhotse.features.kaldifeat import (
 )
 from lhotse.recipes.utils import read_manifests_if_cached
 
+from icefall.utils import str2bool
+
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
 # Do this outside of main() in case it needs to take effect
@@ -48,7 +51,7 @@ torch.set_num_interop_threads(1)
 torch.multiprocessing.set_sharing_strategy("file_system")
 
 
-def compute_fbank_ami():
+def compute_fbank_ami(perturb_speed: bool = False):
     src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
 
@@ -84,8 +87,12 @@ def compute_fbank_ami():
         suffix="jsonl.gz",
     )
 
-    def _extract_feats(cuts: CutSet, storage_path: Path, manifest_path: Path) -> None:
-        cuts = cuts + cuts.perturb_speed(0.9) + cuts.perturb_speed(1.1)
+    def _extract_feats(
+        cuts: CutSet, storage_path: Path, manifest_path: Path, speed_perturb: bool
+    ) -> None:
+        if speed_perturb:
+            logging.info(f"Doing speed perturb")
+            cuts = cuts + cuts.perturb_speed(0.9) + cuts.perturb_speed(1.1)
         _ = cuts.compute_and_store_features_batch(
             extractor=extractor,
             storage_path=storage_path,
@@ -109,6 +116,7 @@ def compute_fbank_ami():
         cuts_ihm,
         output_dir / "feats_train_ihm",
         src_dir / "cuts_train_ihm.jsonl.gz",
+        perturb_speed,
     )
 
     logging.info("Processing train split IHM + reverberated IHM")
@@ -117,6 +125,7 @@ def compute_fbank_ami():
         cuts_ihm_rvb,
         output_dir / "feats_train_ihm_rvb",
         src_dir / "cuts_train_ihm_rvb.jsonl.gz",
+        perturb_speed,
     )
 
     logging.info("Processing train split SDM")
@@ -129,6 +138,7 @@ def compute_fbank_ami():
         cuts_sdm,
         output_dir / "feats_train_sdm",
         src_dir / "cuts_train_sdm.jsonl.gz",
+        perturb_speed,
     )
 
     logging.info("Processing train split GSS")
@@ -141,6 +151,7 @@ def compute_fbank_ami():
         cuts_gss,
         output_dir / "feats_train_gss",
         src_dir / "cuts_train_gss.jsonl.gz",
+        perturb_speed,
     )
 
     logging.info("Preparing test cuts: IHM, SDM, GSS (optional)")
@@ -186,8 +197,21 @@ def compute_fbank_ami():
         )
 
 
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--perturb-speed",
+        type=str2bool,
+        default=False,
+        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
+    )
+    return parser.parse_args()
+
+
 if __name__ == "__main__":
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(format=formatter, level=logging.INFO)
 
-    compute_fbank_ami()
+    args = get_args()
+
+    compute_fbank_ami(perturb_speed=args.perturb_speed)
diff --git a/egs/alimeeting/ASR_v2/prepare.sh b/egs/alimeeting/ASR_v2/prepare.sh
index 76a108771..1098840f8 100755
--- a/egs/alimeeting/ASR_v2/prepare.sh
+++ b/egs/alimeeting/ASR_v2/prepare.sh
@@ -85,7 +85,7 @@ fi
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Compute fbank for alimeeting"
   mkdir -p data/fbank
-  python local/compute_fbank_alimeeting.py
+  python local/compute_fbank_alimeeting.py --perturb-speed True
   log "Combine features from train splits"
   lhotse combine data/manifests/cuts_train_{ihm,ihm_rvb,sdm,gss}.jsonl.gz - | shuf |\
     gzip -c > data/manifests/cuts_train_all.jsonl.gz
diff --git a/egs/ami/SURT/README.md b/egs/ami/SURT/README.md
new file mode 100644
index 000000000..74a8ba014
--- /dev/null
+++ b/egs/ami/SURT/README.md
@@ -0,0 +1,156 @@
+# Introduction
+
+This is a multi-talker ASR recipe for the AMI and ICSI datasets. We train a Streaming
+Unmixing and Recognition Transducer (SURT) model for the task.
+
+Please refer to the `egs/libricss/SURT` recipe README for details about the task and the
+model.
+
+## Description of the recipe
+
+### Pre-requisites
+
+The recipes in this directory need the following packages to be installed:
+
+- [meeteval](https://github.com/fgnt/meeteval)
+- [einops](https://github.com/arogozhnikov/einops)
+
+Additionally, we initialize the model with the pre-trained model from the LibriCSS recipe.
+Please download this checkpoint (see below) or train the LibriCSS recipe first.
+
+### Training
+
+To train the model, run the following from within `egs/ami/SURT`:
+
+```bash
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+python dprnn_zipformer/train.py \
+    --use-fp16 True \
+    --exp-dir dprnn_zipformer/exp/surt_base \
+    --world-size 4 \
+    --max-duration 500 \
+    --max-duration-valid 250 \
+    --max-cuts 200 \
+    --num-buckets 50 \
+    --num-epochs 30 \
+    --enable-spec-aug True \
+    --enable-musan False \
+    --ctc-loss-scale 0.2 \
+    --heat-loss-scale 0.2 \
+    --base-lr 0.004 \
+    --model-init-ckpt exp/libricss_base.pt \
+    --chunk-width-randomization True \
+    --num-mask-encoder-layers 4 \
+    --num-encoder-layers 2,2,2,2,2
+```
+
+The above is for SURT-base (~26M). For SURT-large (~38M), use:
+
+```bash
+    --model-init-ckpt exp/libricss_large.pt \
+    --num-mask-encoder-layers 6 \
+    --num-encoder-layers 2,4,3,2,4 \
+    --model-init-ckpt exp/zipformer_large.pt \
+```
+
+**NOTE:** You may need to decrease the `--max-duration` for SURT-large to avoid OOM.
+
+### Adaptation
+
+The training step above only trains on simulated mixtures. For best results, we also
+adapt the final model on the AMI+ICSI train set. For this, run the following from within
+`egs/ami/SURT`:
+
+```bash
+export CUDA_VISIBLE_DEVICES="0"
+
+python dprnn_zipformer/train_adapt.py \
+    --use-fp16 True \
+    --exp-dir dprnn_zipformer/exp/surt_base_adapt \
+    --world-size 4 \
+    --max-duration 500 \
+    --max-duration-valid 250 \
+    --max-cuts 200 \
+    --num-buckets 50 \
+    --num-epochs 8 \
+    --lr-epochs 2 \
+    --enable-spec-aug True \
+    --enable-musan False \
+    --ctc-loss-scale 0.2 \
+    --base-lr 0.0004 \
+    --model-init-ckpt dprnn_zipformer/exp/surt_base/epoch-30.pt \
+    --chunk-width-randomization True \
+    --num-mask-encoder-layers 4 \
+    --num-encoder-layers 2,2,2,2,2
+```
+
+For SURT-large, use the following config:
+
+```bash
+    --num-mask-encoder-layers 6 \
+    --num-encoder-layers 2,4,3,2,4 \
+    --model-init-ckpt dprnn_zipformer/exp/surt_large/epoch-30.pt \
+    --num-epochs 15 \
+    --lr-epochs 4 \
+```
+
+
+### Decoding
+
+To decode the model, run the following from within `egs/ami/SURT`:
+
+#### Greedy search
+
+```bash
+export CUDA_VISIBLE_DEVICES="0"
+
+python dprnn_zipformer/decode.py \
+    --epoch 20 --avg 1 --use-averaged-model False \
+    --exp-dir dprnn_zipformer/exp/surt_base_adapt \
+    --max-duration 250 \
+    --decoding-method greedy_search
+```
+
+#### Beam search
+
+```bash
+python dprnn_zipformer/decode.py \
+    --epoch 20 --avg 1 --use-averaged-model False \
+    --exp-dir dprnn_zipformer/exp/surt_base_adapt \
+    --max-duration 250 \
+    --decoding-method modified_beam_search \
+    --beam-size 4
+```
+
+## Results (using beam search)
+
+**AMI**
+
+| Model      | IHM-Mix |  SDM |  MDM |
+|------------|:-------:|:----:|:----:|
+| SURT-base  |   39.8  | 65.4 | 46.6 |
+|   + adapt  |   37.4  | 46.9 | 43.7 |
+| SURT-large |   36.8  | 62.5 | 44.4 |
+|    + adapt |   **35.1**  | **44.6** | **41.4** |
+
+**ICSI**
+
+| Model      | IHM-Mix |  SDM |
+|------------|:-------:|:----:|
+| SURT-base  |   28.3  | 60.0 |
+|   + adapt  |   26.3  | 33.9 |
+| SURT-large |   27.8  | 59.7 |
+|    + adapt |   **24.4**  | **32.3** |
+
+## Pre-trained models and logs
+
+* LibriCSS pre-trained model (for initialization): [base](https://huggingface.co/desh2608/icefall-surt-libricss-dprnn-zipformer/tree/main/exp/surt_base) [large](https://huggingface.co/desh2608/icefall-surt-libricss-dprnn-zipformer/tree/main/exp/surt_large)
+
+* Pre-trained models: <https://huggingface.co/desh2608/icefall-surt-ami-dprnn-zipformer>
+
+* Training logs:
+    - surt_base: <https://tensorboard.dev/experiment/8awy98VZSWegLmH4l2JWSA/>
+    - surt_base_adapt: <https://tensorboard.dev/experiment/aGVgXVzYRDKbGUbPekcNjg/>
+    - surt_large: <https://tensorboard.dev/experiment/ZXMkez0VSYKbPLqRk4clOQ/>
+    - surt_large_adapt: <https://tensorboard.dev/experiment/WLKL1e7bTVyEjSonYSNYwg/>
diff --git a/egs/ami/SURT/dprnn_zipformer/asr_datamodule.py b/egs/ami/SURT/dprnn_zipformer/asr_datamodule.py
new file mode 100644
index 000000000..ec8106bc3
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/asr_datamodule.py
@@ -0,0 +1,399 @@
+# Copyright      2021  Piotr Żelasko
+# Copyright      2022  Xiaomi Corporation     (Author: Mingshuang Luo)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import inspect
+import logging
+from functools import lru_cache
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
+from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
+    CutMix,
+    DynamicBucketingSampler,
+    K2SurtDataset,
+    PrecomputedFeatures,
+    SimpleCutSampler,
+    SpecAugment,
+)
+from lhotse.dataset.input_strategies import OnTheFlyFeatures
+from lhotse.utils import fix_random_seed
+from torch.utils.data import DataLoader
+
+from icefall.utils import str2bool
+
+
+class _SeedWorkers:
+    def __init__(self, seed: int):
+        self.seed = seed
+
+    def __call__(self, worker_id: int):
+        fix_random_seed(self.seed + worker_id)
+
+
+class AmiAsrDataModule:
+    """
+    DataModule for k2 SURT experiments.
+    It assumes there is always one train and valid dataloader,
+    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
+    and test-other).
+
+    It contains all the common data pipeline modules used in ASR
+    experiments, e.g.:
+    - dynamic batch size,
+    - bucketing samplers,
+    - augmentation,
+    - on-the-fly feature extraction
+
+    This class should be derived for specific corpora used in ASR tasks.
+    """
+
+    def __init__(self, args: argparse.Namespace):
+        self.args = args
+
+    @classmethod
+    def add_arguments(cls, parser: argparse.ArgumentParser):
+        group = parser.add_argument_group(
+            title="ASR data related options",
+            description="These options are used for the preparation of "
+            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
+            "effective batch sizes, sampling strategies, applied data "
+            "augmentations, etc.",
+        )
+        group.add_argument(
+            "--manifest-dir",
+            type=Path,
+            default=Path("data/manifests"),
+            help="Path to directory with train/valid/test cuts.",
+        )
+        group.add_argument(
+            "--max-duration",
+            type=int,
+            default=200.0,
+            help="Maximum pooled recordings duration (seconds) in a "
+            "single batch. You can reduce it if it causes CUDA OOM.",
+        )
+        group.add_argument(
+            "--max-duration-valid",
+            type=int,
+            default=200.0,
+            help="Maximum pooled recordings duration (seconds) in a "
+            "single batch. You can reduce it if it causes CUDA OOM.",
+        )
+        group.add_argument(
+            "--max-cuts",
+            type=int,
+            default=100,
+            help="Maximum number of cuts in a single batch. You can "
+            "reduce it if it causes CUDA OOM.",
+        )
+        group.add_argument(
+            "--bucketing-sampler",
+            type=str2bool,
+            default=True,
+            help="When enabled, the batches will come from buckets of "
+            "similar duration (saves padding frames).",
+        )
+        group.add_argument(
+            "--num-buckets",
+            type=int,
+            default=30,
+            help="The number of buckets for the DynamicBucketingSampler"
+            "(you might want to increase it for larger datasets).",
+        )
+        group.add_argument(
+            "--on-the-fly-feats",
+            type=str2bool,
+            default=False,
+            help=(
+                "When enabled, use on-the-fly cut mixing and feature "
+                "extraction. Will drop existing precomputed feature manifests "
+                "if available."
+            ),
+        )
+        group.add_argument(
+            "--shuffle",
+            type=str2bool,
+            default=True,
+            help="When enabled (=default), the examples will be "
+            "shuffled for each epoch.",
+        )
+        group.add_argument(
+            "--drop-last",
+            type=str2bool,
+            default=True,
+            help="Whether to drop last batch. Used by sampler.",
+        )
+        group.add_argument(
+            "--return-cuts",
+            type=str2bool,
+            default=True,
+            help="When enabled, each batch will have the "
+            "field: batch['supervisions']['cut'] with the cuts that "
+            "were used to construct it.",
+        )
+
+        group.add_argument(
+            "--num-workers",
+            type=int,
+            default=2,
+            help="The number of training dataloader workers that "
+            "collect the batches.",
+        )
+
+        group.add_argument(
+            "--enable-spec-aug",
+            type=str2bool,
+            default=True,
+            help="When enabled, use SpecAugment for training dataset.",
+        )
+
+        group.add_argument(
+            "--spec-aug-time-warp-factor",
+            type=int,
+            default=80,
+            help="Used only when --enable-spec-aug is True. "
+            "It specifies the factor for time warping in SpecAugment. "
+            "Larger values mean more warping. "
+            "A value less than 1 means to disable time warp.",
+        )
+
+        group.add_argument(
+            "--enable-musan",
+            type=str2bool,
+            default=True,
+            help="When enabled, select noise from MUSAN and mix it"
+            "with training dataset. ",
+        )
+
+    def train_dataloaders(
+        self,
+        cuts_train: CutSet,
+        sampler_state_dict: Optional[Dict[str, Any]] = None,
+        sources: bool = False,
+    ) -> DataLoader:
+        """
+        Args:
+          cuts_train:
+            CutSet for training.
+          sampler_state_dict:
+            The state dict for the training sampler.
+        """
+        transforms = []
+        if self.args.enable_musan:
+            logging.info("Enable MUSAN")
+            logging.info("About to get Musan cuts")
+            cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
+            transforms.append(
+                CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
+            )
+        else:
+            logging.info("Disable MUSAN")
+
+        input_transforms = []
+        if self.args.enable_spec_aug:
+            logging.info("Enable SpecAugment")
+            logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
+            # Set the value of num_frame_masks according to Lhotse's version.
+            # In different Lhotse's versions, the default of num_frame_masks is
+            # different.
+            num_frame_masks = 10
+            num_frame_masks_parameter = inspect.signature(
+                SpecAugment.__init__
+            ).parameters["num_frame_masks"]
+            if num_frame_masks_parameter.default == 1:
+                num_frame_masks = 2
+            logging.info(f"Num frame mask: {num_frame_masks}")
+            input_transforms.append(
+                SpecAugment(
+                    time_warp_factor=self.args.spec_aug_time_warp_factor,
+                    num_frame_masks=num_frame_masks,
+                    features_mask_size=27,
+                    num_feature_masks=2,
+                    frames_mask_size=100,
+                )
+            )
+        else:
+            logging.info("Disable SpecAugment")
+
+        logging.info("About to create train dataset")
+        train = K2SurtDataset(
+            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            if self.args.on_the_fly_feats
+            else PrecomputedFeatures(),
+            cut_transforms=transforms,
+            input_transforms=input_transforms,
+            return_cuts=self.args.return_cuts,
+            return_sources=sources,
+            strict=False,
+        )
+
+        if self.args.bucketing_sampler:
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                quadratic_duration=30.0,
+                max_cuts=self.args.max_cuts,
+                shuffle=self.args.shuffle,
+                num_buckets=self.args.num_buckets,
+                drop_last=self.args.drop_last,
+            )
+        else:
+            logging.info("Using SingleCutSampler.")
+            train_sampler = SimpleCutSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                max_cuts=self.args.max_cuts,
+                shuffle=self.args.shuffle,
+            )
+        logging.info("About to create train dataloader")
+
+        if sampler_state_dict is not None:
+            logging.info("Loading sampler state dict")
+            train_sampler.load_state_dict(sampler_state_dict)
+
+        # 'seed' is derived from the current random state, which will have
+        # previously been set in the main process.
+        seed = torch.randint(0, 100000, ()).item()
+        worker_init_fn = _SeedWorkers(seed)
+
+        train_dl = DataLoader(
+            train,
+            sampler=train_sampler,
+            batch_size=None,
+            num_workers=self.args.num_workers,
+            persistent_workers=False,
+            worker_init_fn=worker_init_fn,
+        )
+
+        return train_dl
+
+    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
+        transforms = []
+
+        logging.info("About to create dev dataset")
+        validate = K2SurtDataset(
+            input_strategy=OnTheFlyFeatures(
+                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            )
+            if self.args.on_the_fly_feats
+            else PrecomputedFeatures(),
+            cut_transforms=transforms,
+            return_cuts=self.args.return_cuts,
+            return_sources=False,
+            strict=False,
+        )
+        valid_sampler = DynamicBucketingSampler(
+            cuts_valid,
+            max_duration=self.args.max_duration_valid,
+            quadratic_duration=30.0,
+            max_cuts=self.args.max_cuts,
+            shuffle=False,
+        )
+        logging.info("About to create dev dataloader")
+
+        # 'seed' is derived from the current random state, which will have
+        # previously been set in the main process.
+        seed = torch.randint(0, 100000, ()).item()
+        worker_init_fn = _SeedWorkers(seed)
+
+        valid_dl = DataLoader(
+            validate,
+            sampler=valid_sampler,
+            batch_size=None,
+            num_workers=self.args.num_workers,
+            persistent_workers=False,
+            worker_init_fn=worker_init_fn,
+        )
+
+        return valid_dl
+
+    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
+        logging.debug("About to create test dataset")
+        test = K2SurtDataset(
+            input_strategy=OnTheFlyFeatures(
+                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            )
+            if self.args.on_the_fly_feats
+            else PrecomputedFeatures(),
+            return_cuts=self.args.return_cuts,
+            return_sources=False,
+            strict=False,
+        )
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration_valid,
+            max_cuts=self.args.max_cuts,
+            shuffle=False,
+        )
+
+        # 'seed' is derived from the current random state, which will have
+        # previously been set in the main process.
+        seed = torch.randint(0, 100000, ()).item()
+        worker_init_fn = _SeedWorkers(seed)
+
+        logging.debug("About to create test dataloader")
+        test_dl = DataLoader(
+            test,
+            batch_size=None,
+            sampler=sampler,
+            num_workers=self.args.num_workers,
+            persistent_workers=False,
+            worker_init_fn=worker_init_fn,
+        )
+        return test_dl
+
+    @lru_cache()
+    def aimix_train_cuts(
+        self,
+        rvb_affix: str = "clean",
+        sources: bool = True,
+    ) -> CutSet:
+        logging.info("About to get train cuts")
+        source_affix = "_sources" if sources else ""
+        cs = load_manifest_lazy(
+            self.args.manifest_dir / f"cuts_train_{rvb_affix}{source_affix}.jsonl.gz"
+        )
+        cs = cs.filter(lambda c: c.duration >= 1.0 and c.duration <= 30.0)
+        return cs
+
+    @lru_cache()
+    def train_cuts(
+        self,
+    ) -> CutSet:
+        logging.info("About to get train cuts")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "cuts_train_ami_icsi.jsonl.gz"
+        )
+
+    @lru_cache()
+    def ami_cuts(self, split: str = "dev", type: str = "sdm") -> CutSet:
+        logging.info(f"About to get AMI {split} {type} cuts")
+        return load_manifest_lazy(
+            self.args.manifest_dir / f"cuts_ami-{type}_{split}.jsonl.gz"
+        )
+
+    @lru_cache()
+    def icsi_cuts(self, split: str = "dev", type: str = "sdm") -> CutSet:
+        logging.info(f"About to get ICSI {split} {type} cuts")
+        return load_manifest_lazy(
+            self.args.manifest_dir / f"cuts_icsi-{type}_{split}.jsonl.gz"
+        )
diff --git a/egs/ami/SURT/dprnn_zipformer/beam_search.py b/egs/ami/SURT/dprnn_zipformer/beam_search.py
new file mode 120000
index 000000000..581b29833
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/beam_search.py
@@ -0,0 +1 @@
+../../../libricss/SURT/dprnn_zipformer/beam_search.py
\ No newline at end of file
diff --git a/egs/ami/SURT/dprnn_zipformer/decode.py b/egs/ami/SURT/dprnn_zipformer/decode.py
new file mode 100755
index 000000000..d1a1eddc9
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/decode.py
@@ -0,0 +1,622 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang,
+#                                                 Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+(1) greedy search
+./dprnn_zipformer/decode.py \
+    --epoch 20 \
+    --avg 1 \
+    --use-averaged-model false \
+    --exp-dir ./dprnn_zipformer/exp_adapt \
+    --max-duration 600 \
+    --decoding-method greedy_search
+
+(2) beam search (not recommended)
+./dprnn_zipformer/decode.py \
+    --epoch 20 \
+    --avg 1 \
+    --use-averaged-model false \
+    --exp-dir ./dprnn_zipformer/exp_adapt \
+    --max-duration 600 \
+    --decoding-method beam_search \
+    --beam-size 4
+
+(3) modified beam search
+./dprnn_zipformer/decode.py \
+    --epoch 20 \
+    --avg 1 \
+    --use-averaged-model false \
+    --exp-dir ./dprnn_zipformer/exp_adapt \
+    --max-duration 600 \
+    --decoding-method modified_beam_search \
+    --beam-size 4
+"""
+
+
+import argparse
+import logging
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import k2
+import sentencepiece as spm
+import torch
+import torch.nn as nn
+from asr_datamodule import AmiAsrDataModule
+from beam_search import (
+    beam_search,
+    greedy_search,
+    greedy_search_batch,
+    modified_beam_search,
+)
+from lhotse.utils import EPSILON
+from train import add_model_arguments, get_params, get_surt_model
+
+from icefall import LmScorer, NgramLm
+from icefall.checkpoint import (
+    average_checkpoints,
+    average_checkpoints_with_averaged_model,
+    find_checkpoints,
+    load_checkpoint,
+)
+from icefall.lexicon import Lexicon
+from icefall.utils import (
+    AttributeDict,
+    setup_logger,
+    store_transcripts,
+    str2bool,
+    write_surt_error_stats,
+)
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=20,
+        help="""It specifies the checkpoint to use for decoding.
+        Note: Epoch counts from 1.
+        You can specify --avg to use more checkpoints for model averaging.""",
+    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=1,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch' and '--iter'",
+    )
+
+    parser.add_argument(
+        "--use-averaged-model",
+        type=str2bool,
+        default=True,
+        help="Whether to load averaged model. Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="dprnn_zipformer/exp",
+        help="The experiment dir",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        help="""Possible values are:
+          - greedy_search
+          - beam_search
+          - modified_beam_search
+        """,
+    )
+
+    parser.add_argument(
+        "--beam-size",
+        type=int,
+        default=4,
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --decoding-method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+    parser.add_argument(
+        "--max-sym-per-frame",
+        type=int,
+        default=1,
+        help="""Maximum number of symbols per frame.
+        Used only when --decoding_method is greedy_search""",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def decode_one_batch(
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+    batch: dict,
+) -> Dict[str, List[List[str]]]:
+    """Decode one batch and return the result in a dict. The dict has the
+    following format:
+
+        - key: It indicates the setting used for decoding. For example,
+               if greedy_search is used, it would be "greedy_search"
+               If beam search with a beam size of 7 is used, it would be
+               "beam_7"
+        - value: It contains the decoding result. `len(value)` equals to
+                 batch size. `value[i]` is the decoding result for the i-th
+                 utterance in the given batch.
+    Args:
+      params:
+        It's the return value of :func:`get_params`.
+      model:
+        The neural model.
+      sp:
+        The BPE model.
+      batch:
+        It is the return value from iterating
+        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
+        for the format of the `batch`.
+    Returns:
+      Return the decoding result. See above description for the format of
+      the returned dict.
+    """
+    device = next(model.parameters()).device
+    feature = batch["inputs"]
+    assert feature.ndim == 3
+
+    feature = feature.to(device)
+    feature_lens = batch["input_lens"].to(device)
+
+    # Apply the mask encoder
+    B, T, F = feature.shape
+    processed = model.mask_encoder(feature)  # B,T,F*num_channels
+    masks = processed.view(B, T, F, params.num_channels).unbind(dim=-1)
+    x_masked = [feature * m for m in masks]
+
+    # Recognition
+    # Stack the inputs along the batch axis
+    h = torch.cat(x_masked, dim=0)
+    h_lens = torch.cat([feature_lens for _ in range(params.num_channels)], dim=0)
+    encoder_out, encoder_out_lens = model.encoder(x=h, x_lens=h_lens)
+
+    if model.joint_encoder_layer is not None:
+        encoder_out = model.joint_encoder_layer(encoder_out)
+
+    def _group_channels(hyps: List[str]) -> List[List[str]]:
+        """
+        Currently we have a batch of size M*B, where M is the number of
+        channels and B is the batch size. We need to group the hypotheses
+        into B groups, each of which contains M hypotheses.
+
+        Example:
+            hyps = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2']
+            _group_channels(hyps) = [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']]
+        """
+        assert len(hyps) == B * params.num_channels
+        out_hyps = []
+        for i in range(B):
+            out_hyps.append(hyps[i::B])
+        return out_hyps
+
+    hyps = []
+    if params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
+        hyp_tokens = greedy_search_batch(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp)
+    elif params.decoding_method == "modified_beam_search":
+        hyp_tokens = modified_beam_search(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp)
+    else:
+        batch_size = encoder_out.size(0)
+
+        for i in range(batch_size):
+            # fmt: off
+            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
+            # fmt: on
+            if params.decoding_method == "greedy_search":
+                hyp = greedy_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    max_sym_per_frame=params.max_sym_per_frame,
+                )
+            elif params.decoding_method == "beam_search":
+                hyp = beam_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    beam=params.beam_size,
+                )
+            else:
+                raise ValueError(
+                    f"Unsupported decoding method: {params.decoding_method}"
+                )
+            hyps.append(sp.decode(hyp))
+
+    if params.decoding_method == "greedy_search":
+        return {"greedy_search": _group_channels(hyps)}
+    elif "fast_beam_search" in params.decoding_method:
+        key = f"beam_{params.beam}_"
+        key += f"max_contexts_{params.max_contexts}_"
+        key += f"max_states_{params.max_states}"
+        if "nbest" in params.decoding_method:
+            key += f"_num_paths_{params.num_paths}_"
+            key += f"nbest_scale_{params.nbest_scale}"
+            if "LG" in params.decoding_method:
+                key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
+
+        return {key: _group_channels(hyps)}
+    else:
+        return {f"beam_size_{params.beam_size}": _group_channels(hyps)}
+
+
+def decode_dataset(
+    dl: torch.utils.data.DataLoader,
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
+    """Decode dataset.
+
+    Args:
+      dl:
+        PyTorch's dataloader containing the dataset to decode.
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The neural model.
+      sp:
+        The BPE model.
+    Returns:
+      Return a dict, whose key may be "greedy_search" if greedy search
+      is used, or it may be "beam_7" if beam size of 7 is used.
+      Its value is a list of tuples. Each tuple contains two elements:
+      The first is the reference transcript, and the second is the
+      predicted result.
+    """
+    num_cuts = 0
+
+    try:
+        num_batches = len(dl)
+    except TypeError:
+        num_batches = "?"
+
+    if params.decoding_method == "greedy_search":
+        log_interval = 50
+    else:
+        log_interval = 20
+
+    results = defaultdict(list)
+    for batch_idx, batch in enumerate(dl):
+        cut_ids = [cut.id for cut in batch["cuts"]]
+        cuts_batch = batch["cuts"]
+
+        hyps_dict = decode_one_batch(
+            params=params,
+            model=model,
+            sp=sp,
+        )
+
+        for name, hyps in hyps_dict.items():
+            this_batch = []
+            for cut_id, hyp_words in zip(cut_ids, hyps):
+                # Reference is a list of supervision texts sorted by start time.
+                ref_words = [
+                    s.text.strip()
+                    for s in sorted(
+                        cuts_batch[cut_id].supervisions, key=lambda s: s.start
+                    )
+                ]
+                this_batch.append((cut_id, ref_words, hyp_words))
+
+            results[name].extend(this_batch)
+
+        num_cuts += len(cut_ids)
+
+        if batch_idx % log_interval == 0:
+            batch_str = f"{batch_idx}/{num_batches}"
+
+            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
+    return results
+
+
+def save_results(
+    params: AttributeDict,
+    test_set_name: str,
+    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
+):
+    test_set_wers = dict()
+    for key, results in results_dict.items():
+        recog_path = (
+            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        results = sorted(results)
+        store_transcripts(filename=recog_path, texts=results)
+        logging.info(f"The transcripts are stored in {recog_path}")
+
+        # The following prints out WERs, per-word error statistics and aligned
+        # ref/hyp pairs.
+        errs_filename = (
+            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        with open(errs_filename, "w") as f:
+            wer = write_surt_error_stats(
+                f,
+                f"{test_set_name}-{key}",
+                results,
+                enable_log=True,
+                num_channels=params.num_channels,
+            )
+            test_set_wers[key] = wer
+
+        logging.info("Wrote detailed error stats to {}".format(errs_filename))
+
+    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
+    errs_info = (
+        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
+    )
+    with open(errs_info, "w") as f:
+        print("settings\tWER", file=f)
+        for key, val in test_set_wers:
+            print("{}\t{}".format(key, val), file=f)
+
+    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
+    note = "\tbest for {}".format(test_set_name)
+    for key, val in test_set_wers:
+        s += "{}\t{}{}\n".format(key, val, note)
+        note = ""
+    logging.info(s)
+
+
+@torch.no_grad()
+def main():
+    parser = get_parser()
+    LmScorer.add_arguments(parser)
+    AmiAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+    args.lang_dir = Path(args.lang_dir)
+
+    params = get_params()
+    params.update(vars(args))
+
+    assert params.decoding_method in (
+        "greedy_search",
+        "beam_search",
+        "modified_beam_search",
+    ), f"Decoding method {params.decoding_method} is not supported."
+    params.res_dir = params.exp_dir / params.decoding_method
+
+    if params.iter > 0:
+        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
+    else:
+        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+
+    if "beam_search" in params.decoding_method:
+        params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
+    else:
+        params.suffix += f"-context-{params.context_size}"
+        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
+
+    if params.use_averaged_model:
+        params.suffix += "-use-averaged-model"
+
+    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
+    logging.info("Decoding started")
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> and <unk> are defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.piece_to_id("<unk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_surt_model(params)
+    assert model.encoder.decode_chunk_size == params.decode_chunk_len // 2, (
+        model.encoder.decode_chunk_size,
+        params.decode_chunk_len,
+    )
+
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+
+    model.to(device)
+    model.eval()
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    # we need cut ids to display recognition results.
+    args.return_cuts = True
+    ami = AmiAsrDataModule(args)
+
+    # NOTE(@desh2608): we filter segments longer than 120s to avoid OOM errors in decoding.
+    # However, 99.9% of the segments are shorter than 120s, so this should not
+    # substantially affect the results. In future, we will implement an overlapped
+    # inference method to avoid OOM errors.
+
+    test_sets = {}
+    for split in ["dev", "test"]:
+        for type in ["ihm-mix", "sdm", "mdm8-bf"]:
+            test_sets[f"ami-{split}_{type}"] = (
+                ami.ami_cuts(split=split, type=type)
+                .trim_to_supervision_groups(max_pause=0.0)
+                .filter(lambda c: 0.1 < c.duration < 120.0)
+                .to_eager()
+            )
+
+    for split in ["dev", "test"]:
+        for type in ["ihm-mix", "sdm"]:
+            test_sets[f"icsi-{split}_{type}"] = (
+                ami.icsi_cuts(split=split, type=type)
+                .trim_to_supervision_groups(max_pause=0.0)
+                .filter(lambda c: 0.1 < c.duration < 120.0)
+                .to_eager()
+            )
+
+    for test_set, test_cuts in test_sets.items():
+        test_dl = ami.test_dataloaders(test_cuts)
+        results_dict = decode_dataset(
+            dl=test_dl,
+            params=params,
+            model=model,
+            sp=sp,
+        )
+
+        save_results(
+            params=params,
+            test_set_name=test_set,
+            results_dict=results_dict,
+        )
+
+    logging.info("Done!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/ami/SURT/dprnn_zipformer/decoder.py b/egs/ami/SURT/dprnn_zipformer/decoder.py
new file mode 120000
index 000000000..c34865c25
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/decoder.py
@@ -0,0 +1 @@
+../../../libricss/SURT/dprnn_zipformer/decoder.py
\ No newline at end of file
diff --git a/egs/ami/SURT/dprnn_zipformer/dprnn.py b/egs/ami/SURT/dprnn_zipformer/dprnn.py
new file mode 120000
index 000000000..8918beb32
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/dprnn.py
@@ -0,0 +1 @@
+../../../libricss/SURT/dprnn_zipformer/dprnn.py
\ No newline at end of file
diff --git a/egs/ami/SURT/dprnn_zipformer/encoder_interface.py b/egs/ami/SURT/dprnn_zipformer/encoder_interface.py
new file mode 120000
index 000000000..0ba945d0f
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/encoder_interface.py
@@ -0,0 +1 @@
+../../../libricss/SURT/dprnn_zipformer/encoder_interface.py
\ No newline at end of file
diff --git a/egs/ami/SURT/dprnn_zipformer/export.py b/egs/ami/SURT/dprnn_zipformer/export.py
new file mode 120000
index 000000000..3deae4471
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/export.py
@@ -0,0 +1 @@
+../../../libricss/SURT/dprnn_zipformer/export.py
\ No newline at end of file
diff --git a/egs/ami/SURT/dprnn_zipformer/joiner.py b/egs/ami/SURT/dprnn_zipformer/joiner.py
new file mode 120000
index 000000000..79fbe8769
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/joiner.py
@@ -0,0 +1 @@
+../../../libricss/SURT/dprnn_zipformer/joiner.py
\ No newline at end of file
diff --git a/egs/ami/SURT/dprnn_zipformer/model.py b/egs/ami/SURT/dprnn_zipformer/model.py
new file mode 120000
index 000000000..ae8c65c99
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/model.py
@@ -0,0 +1 @@
+../../../libricss/SURT/dprnn_zipformer/model.py
\ No newline at end of file
diff --git a/egs/ami/SURT/dprnn_zipformer/optim.py b/egs/ami/SURT/dprnn_zipformer/optim.py
new file mode 120000
index 000000000..366d0f7a2
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/optim.py
@@ -0,0 +1 @@
+../../../libricss/SURT/dprnn_zipformer/optim.py
\ No newline at end of file
diff --git a/egs/ami/SURT/dprnn_zipformer/scaling.py b/egs/ami/SURT/dprnn_zipformer/scaling.py
new file mode 120000
index 000000000..f11d49d77
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/scaling.py
@@ -0,0 +1 @@
+../../../libricss/SURT/dprnn_zipformer/scaling.py
\ No newline at end of file
diff --git a/egs/ami/SURT/dprnn_zipformer/scaling_converter.py b/egs/ami/SURT/dprnn_zipformer/scaling_converter.py
new file mode 120000
index 000000000..1533cbe0e
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/scaling_converter.py
@@ -0,0 +1 @@
+../../../libricss/SURT/dprnn_zipformer/scaling_converter.py
\ No newline at end of file
diff --git a/egs/ami/SURT/dprnn_zipformer/test_model.py b/egs/ami/SURT/dprnn_zipformer/test_model.py
new file mode 120000
index 000000000..1259849e0
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/test_model.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless7_streaming/test_model.py
\ No newline at end of file
diff --git a/egs/ami/SURT/dprnn_zipformer/train.py b/egs/ami/SURT/dprnn_zipformer/train.py
new file mode 100755
index 000000000..cd5fafc34
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/train.py
@@ -0,0 +1,1420 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                  Wei Kang,
+#                                                  Mingshuang Luo,)
+#                                                  Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+cd egs/ami/SURT/
+./prepare.sh
+
+./dprnn_zipformer/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir dprnn_zipformer/exp \
+  --max-duration 650
+"""
+
+import argparse
+import copy
+import logging
+import warnings
+from itertools import chain
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple, Union
+
+import k2
+import optim
+import sentencepiece as spm
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+from asr_datamodule import AmiAsrDataModule
+from decoder import Decoder
+from dprnn import DPRNN
+from einops.layers.torch import Rearrange
+from joiner import Joiner
+from lhotse.cut import Cut
+from lhotse.dataset.sampling.base import CutSampler
+from lhotse.utils import LOG_EPSILON, fix_random_seed
+from model import SURT
+from optim import Eden, ScaledAdam
+from scaling import ScaledLinear, ScaledLSTM
+from torch import Tensor
+from torch.cuda.amp import GradScaler
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+from zipformer import Zipformer
+
+from icefall import diagnostics
+from icefall.checkpoint import load_checkpoint, remove_checkpoints
+from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
+from icefall.checkpoint import (
+    save_checkpoint_with_global_batch_idx,
+    update_averaged_model,
+)
+from icefall.dist import cleanup_dist, setup_dist
+from icefall.env import get_env_info
+from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+
+LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
+
+
+def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
+    if isinstance(model, DDP):
+        # get underlying nn.Module
+        model = model.module
+    for module in model.modules():
+        if hasattr(module, "batch_count"):
+            module.batch_count = batch_count
+
+
+def add_model_arguments(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--num-mask-encoder-layers",
+        type=int,
+        default=4,
+        help="Number of layers in the DPRNN based mask encoder.",
+    )
+
+    parser.add_argument(
+        "--mask-encoder-dim",
+        type=int,
+        default=256,
+        help="Hidden dimension of the LSTM blocks in DPRNN.",
+    )
+
+    parser.add_argument(
+        "--mask-encoder-segment-size",
+        type=int,
+        default=32,
+        help="Segment size of the SegLSTM in DPRNN. Ideally, this should be equal to the "
+        "decode-chunk-length of the zipformer encoder.",
+    )
+
+    parser.add_argument(
+        "--chunk-width-randomization",
+        type=bool,
+        default=False,
+        help="Whether to randomize the chunk width in DPRNN.",
+    )
+
+    # Zipformer config is based on:
+    # https://github.com/k2-fsa/icefall/pull/745#issuecomment-1405282740
+    parser.add_argument(
+        "--num-encoder-layers",
+        type=str,
+        default="2,2,2,2,2",
+        help="Number of zipformer encoder layers, comma separated.",
+    )
+
+    parser.add_argument(
+        "--feedforward-dims",
+        type=str,
+        default="768,768,768,768,768",
+        help="Feedforward dimension of the zipformer encoder layers, comma separated.",
+    )
+
+    parser.add_argument(
+        "--nhead",
+        type=str,
+        default="8,8,8,8,8",
+        help="Number of attention heads in the zipformer encoder layers.",
+    )
+
+    parser.add_argument(
+        "--encoder-dims",
+        type=str,
+        default="256,256,256,256,256",
+        help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated",
+    )
+
+    parser.add_argument(
+        "--attention-dims",
+        type=str,
+        default="192,192,192,192,192",
+        help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated;
+        not the same as embedding dimension.""",
+    )
+
+    parser.add_argument(
+        "--encoder-unmasked-dims",
+        type=str,
+        default="192,192,192,192,192",
+        help="Unmasked dimensions in the encoders, relates to augmentation during training.  "
+        "Must be <= each of encoder_dims.  Empirically, less than 256 seems to make performance "
+        " worse.",
+    )
+
+    parser.add_argument(
+        "--zipformer-downsampling-factors",
+        type=str,
+        default="1,2,4,8,2",
+        help="Downsampling factor for each stack of encoder layers.",
+    )
+
+    parser.add_argument(
+        "--cnn-module-kernels",
+        type=str,
+        default="31,31,31,31,31",
+        help="Sizes of kernels in convolution modules",
+    )
+
+    parser.add_argument(
+        "--use-joint-encoder-layer",
+        type=str,
+        default="lstm",
+        choices=["linear", "lstm", "none"],
+        help="Whether to use a joint layer to combine all branches.",
+    )
+
+    parser.add_argument(
+        "--decoder-dim",
+        type=int,
+        default=512,
+        help="Embedding dimension in the decoder model.",
+    )
+
+    parser.add_argument(
+        "--joiner-dim",
+        type=int,
+        default=512,
+        help="""Dimension used in the joiner model.
+        Outputs from the encoder and decoder model are projected
+        to this dimension before adding.
+        """,
+    )
+
+    parser.add_argument(
+        "--short-chunk-size",
+        type=int,
+        default=50,
+        help="""Chunk length of dynamic training, the chunk size would be either
+        max sequence length of current batch or uniformly sampled from (1, short_chunk_size).
+        """,
+    )
+
+    parser.add_argument(
+        "--num-left-chunks",
+        type=int,
+        default=4,
+        help="How many left context can be seen in chunks when calculating attention.",
+    )
+
+    parser.add_argument(
+        "--decode-chunk-len",
+        type=int,
+        default=32,
+        help="The chunk size for decoding (in frames before subsampling)",
+    )
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--world-size",
+        type=int,
+        default=1,
+        help="Number of GPUs for DDP training.",
+    )
+
+    parser.add_argument(
+        "--master-port",
+        type=int,
+        default=12354,
+        help="Master port to use for DDP training.",
+    )
+
+    parser.add_argument(
+        "--tensorboard",
+        type=str2bool,
+        default=True,
+        help="Should various information be logged in tensorboard.",
+    )
+
+    parser.add_argument(
+        "--num-epochs",
+        type=int,
+        default=30,
+        help="Number of epochs to train.",
+    )
+
+    parser.add_argument(
+        "--start-epoch",
+        type=int,
+        default=1,
+        help="""Resume training from this epoch. It should be positive.
+        If larger than 1, it will load checkpoint from
+        exp-dir/epoch-{start_epoch-1}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--start-batch",
+        type=int,
+        default=0,
+        help="""If positive, --start-epoch is ignored and
+        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="conv_lstm_transducer_stateless_ctc/exp",
+        help="""The experiment dir.
+        It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--model-init-ckpt",
+        type=str,
+        default=None,
+        help="""The model checkpoint to initialize the model (either full or part).
+        If not specified, the model is randomly initialized.
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--base-lr", type=float, default=0.004, help="The base learning rate."
+    )
+
+    parser.add_argument(
+        "--lr-batches",
+        type=float,
+        default=5000,
+        help="""Number of steps that affects how rapidly the learning rate
+        decreases. We suggest not to change this.""",
+    )
+
+    parser.add_argument(
+        "--lr-epochs",
+        type=float,
+        default=5,
+        help="""Number of epochs that affects how rapidly the learning rate decreases.
+        """,
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+
+    parser.add_argument(
+        "--prune-range",
+        type=int,
+        default=5,
+        help="The prune range for rnnt loss, it means how many symbols(context)"
+        "we are using to compute the loss",
+    )
+
+    parser.add_argument(
+        "--lm-scale",
+        type=float,
+        default=0.25,
+        help="The scale to smooth the loss with lm "
+        "(output of prediction network) part.",
+    )
+
+    parser.add_argument(
+        "--am-scale",
+        type=float,
+        default=0.0,
+        help="The scale to smooth the loss with am (output of encoder network) part.",
+    )
+
+    parser.add_argument(
+        "--simple-loss-scale",
+        type=float,
+        default=0.5,
+        help="To get pruning ranges, we will calculate a simple version"
+        "loss(joiner is just addition), this simple loss also uses for"
+        "training (as a regularization item). We will scale the simple loss"
+        "with this parameter before adding to the final loss.",
+    )
+
+    parser.add_argument(
+        "--ctc-loss-scale",
+        type=float,
+        default=0.2,
+        help="Scale for CTC loss.",
+    )
+
+    parser.add_argument(
+        "--heat-loss-scale",
+        type=float,
+        default=0.2,
+        help="Scale for HEAT loss on separated sources.",
+    )
+
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="The seed for random generators intended for reproducibility",
+    )
+
+    parser.add_argument(
+        "--print-diagnostics",
+        type=str2bool,
+        default=False,
+        help="Accumulate stats on activations, print them and exit.",
+    )
+
+    parser.add_argument(
+        "--save-every-n",
+        type=int,
+        default=2000,
+        help="""Save checkpoint after processing this number of batches"
+        periodically. We save checkpoint to exp-dir/ whenever
+        params.batch_idx_train % save_every_n == 0. The checkpoint filename
+        has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
+        Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
+        end of each epoch where `xxx` is the epoch number counting from 0.
+        """,
+    )
+
+    parser.add_argument(
+        "--keep-last-k",
+        type=int,
+        default=1,
+        help="""Only keep this number of checkpoints on disk.
+        For instance, if it is 3, there are only 3 checkpoints
+        in the exp-dir with filenames `checkpoint-xxx.pt`.
+        It does not affect checkpoints with name `epoch-xxx.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--average-period",
+        type=int,
+        default=100,
+        help="""Update the averaged model, namely `model_avg`, after processing
+        this number of batches. `model_avg` is a separate version of model,
+        in which each floating-point parameter is the average of all the
+        parameters from the start of training. Each time we take the average,
+        we do: `model_avg = model * (average_period / batch_idx_train) +
+            model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
+        """,
+    )
+
+    parser.add_argument(
+        "--use-fp16",
+        type=str2bool,
+        default=False,
+        help="Whether to use half precision training.",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def get_params() -> AttributeDict:
+    """Return a dict containing training parameters.
+
+    All training related parameters that are not passed from the commandline
+    are saved in the variable `params`.
+
+    Commandline options are merged into `params` after they are parsed, so
+    you can also access them via `params`.
+
+    Explanation of options saved in `params`:
+
+        - best_train_loss: Best training loss so far. It is used to select
+                           the model that has the lowest training loss. It is
+                           updated during the training.
+
+        - best_valid_loss: Best validation loss so far. It is used to select
+                           the model that has the lowest validation loss. It is
+                           updated during the training.
+
+        - best_train_epoch: It is the epoch that has the best training loss.
+
+        - best_valid_epoch: It is the epoch that has the best validation loss.
+
+        - batch_idx_train: Used to writing statistics to tensorboard. It
+                           contains number of batches trained so far across
+                           epochs.
+
+        - log_interval:  Print training loss if batch_idx % log_interval` is 0
+
+        - reset_interval: Reset statistics if batch_idx % reset_interval is 0
+
+        - valid_interval:  Run validation if batch_idx % valid_interval is 0
+
+        - feature_dim: The model input dim. It has to match the one used
+                       in computing features.
+
+        - subsampling_factor:  The subsampling factor for the model.
+
+        - num_decoder_layers: Number of decoder layer of transformer decoder.
+
+        - warm_step: The warm_step for Noam optimizer.
+    """
+    params = AttributeDict(
+        {
+            "best_train_loss": float("inf"),
+            "best_valid_loss": float("inf"),
+            "best_train_epoch": -1,
+            "best_valid_epoch": -1,
+            "batch_idx_train": 0,
+            "log_interval": 50,
+            "reset_interval": 200,
+            "valid_interval": 2000,
+            # parameters for SURT
+            "num_channels": 2,
+            "feature_dim": 80,
+            "subsampling_factor": 4,  # not passed in, this is fixed
+            # parameters for Noam
+            "model_warm_step": 5000,  # arg given to model, not for lrate
+            # parameters for ctc loss
+            "beam_size": 10,
+            "use_double_scores": True,
+            "env_info": get_env_info(),
+        }
+    )
+
+    return params
+
+
+def get_mask_encoder_model(params: AttributeDict) -> nn.Module:
+    mask_encoder = DPRNN(
+        feature_dim=params.feature_dim,
+        input_size=params.mask_encoder_dim,
+        hidden_size=params.mask_encoder_dim,
+        output_size=params.feature_dim * params.num_channels,
+        segment_size=params.mask_encoder_segment_size,
+        num_blocks=params.num_mask_encoder_layers,
+        chunk_width_randomization=params.chunk_width_randomization,
+    )
+    return mask_encoder
+
+
+def get_encoder_model(params: AttributeDict) -> nn.Module:
+    # TODO: We can add an option to switch between Zipformer and Transformer
+    def to_int_tuple(s: str):
+        return tuple(map(int, s.split(",")))
+
+    encoder = Zipformer(
+        num_features=params.feature_dim,
+        output_downsampling_factor=2,
+        zipformer_downsampling_factors=to_int_tuple(
+            params.zipformer_downsampling_factors
+        ),
+        encoder_dims=to_int_tuple(params.encoder_dims),
+        attention_dim=to_int_tuple(params.attention_dims),
+        encoder_unmasked_dims=to_int_tuple(params.encoder_unmasked_dims),
+        nhead=to_int_tuple(params.nhead),
+        feedforward_dim=to_int_tuple(params.feedforward_dims),
+        cnn_module_kernels=to_int_tuple(params.cnn_module_kernels),
+        num_encoder_layers=to_int_tuple(params.num_encoder_layers),
+        num_left_chunks=params.num_left_chunks,
+        short_chunk_size=params.short_chunk_size,
+        decode_chunk_size=params.decode_chunk_len // 2,
+    )
+    return encoder
+
+
+def get_joint_encoder_layer(params: AttributeDict) -> nn.Module:
+    class TakeFirst(nn.Module):
+        def forward(self, x):
+            return x[0]
+
+    if params.use_joint_encoder_layer == "linear":
+        encoder_dim = int(params.encoder_dims.split(",")[-1])
+        joint_layer = nn.Sequential(
+            Rearrange("(c b) t d -> b t (c d)", c=params.num_channels),
+            nn.Linear(
+                params.num_channels * encoder_dim, params.num_channels * encoder_dim
+            ),
+            nn.ReLU(),
+            Rearrange("b t (c d) -> (c b) t d", c=params.num_channels),
+        )
+    elif params.use_joint_encoder_layer == "lstm":
+        encoder_dim = int(params.encoder_dims.split(",")[-1])
+        joint_layer = nn.Sequential(
+            Rearrange("(c b) t d -> b t (c d)", c=params.num_channels),
+            ScaledLSTM(
+                input_size=params.num_channels * encoder_dim,
+                hidden_size=params.num_channels * encoder_dim,
+                num_layers=1,
+                bias=True,
+                batch_first=True,
+                dropout=0.0,
+                bidirectional=False,
+            ),
+            TakeFirst(),
+            nn.ReLU(),
+            Rearrange("b t (c d) -> (c b) t d", c=params.num_channels),
+        )
+    elif params.use_joint_encoder_layer == "none":
+        joint_layer = None
+    else:
+        raise ValueError(
+            f"Unknown joint encoder layer type: {params.use_joint_encoder_layer}"
+        )
+    return joint_layer
+
+
+def get_decoder_model(params: AttributeDict) -> nn.Module:
+    decoder = Decoder(
+        vocab_size=params.vocab_size,
+        decoder_dim=params.decoder_dim,
+        blank_id=params.blank_id,
+        context_size=params.context_size,
+    )
+    return decoder
+
+
+def get_joiner_model(params: AttributeDict) -> nn.Module:
+    joiner = Joiner(
+        encoder_dim=int(params.encoder_dims.split(",")[-1]),
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+    return joiner
+
+
+def get_surt_model(
+    params: AttributeDict,
+) -> nn.Module:
+    mask_encoder = get_mask_encoder_model(params)
+    encoder = get_encoder_model(params)
+    joint_layer = get_joint_encoder_layer(params)
+    decoder = get_decoder_model(params)
+    joiner = get_joiner_model(params)
+
+    model = SURT(
+        mask_encoder=mask_encoder,
+        encoder=encoder,
+        joint_encoder_layer=joint_layer,
+        decoder=decoder,
+        joiner=joiner,
+        num_channels=params.num_channels,
+        encoder_dim=int(params.encoder_dims.split(",")[-1]),
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+    return model
+
+
+def load_checkpoint_if_available(
+    params: AttributeDict,
+    model: nn.Module,
+    model_avg: nn.Module = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+) -> Optional[Dict[str, Any]]:
+    """Load checkpoint from file.
+
+    If params.start_batch is positive, it will load the checkpoint from
+    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
+    params.start_epoch is larger than 1, it will load the checkpoint from
+    `params.start_epoch - 1`.
+
+    Apart from loading state dict for `model` and `optimizer` it also updates
+    `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
+    and `best_valid_loss` in `params`.
+
+    Args:
+      params:
+        The return value of :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer that we are using.
+      scheduler:
+        The scheduler that we are using.
+    Returns:
+      Return a dict containing previously saved training info.
+    """
+    if params.start_batch > 0:
+        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
+    elif params.start_epoch > 1:
+        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
+    else:
+        return None
+
+    assert filename.is_file(), f"{filename} does not exist!"
+
+    saved_params = load_checkpoint(
+        filename,
+        model=model,
+        model_avg=model_avg,
+        optimizer=optimizer,
+        scheduler=scheduler,
+    )
+
+    keys = [
+        "best_train_epoch",
+        "best_valid_epoch",
+        "batch_idx_train",
+        "best_train_loss",
+        "best_valid_loss",
+    ]
+    for k in keys:
+        params[k] = saved_params[k]
+
+    if params.start_batch > 0:
+        if "cur_epoch" in saved_params:
+            params["start_epoch"] = saved_params["cur_epoch"]
+
+    return saved_params
+
+
+def save_checkpoint(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    model_avg: Optional[nn.Module] = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+    sampler: Optional[CutSampler] = None,
+    scaler: Optional[GradScaler] = None,
+    rank: int = 0,
+) -> None:
+    """Save model, optimizer, scheduler and training stats to file.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer used in the training.
+      sampler:
+       The sampler for the training dataset.
+      scaler:
+        The scaler used for mix precision training.
+    """
+    if rank != 0:
+        return
+    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
+    save_checkpoint_impl(
+        filename=filename,
+        model=model,
+        model_avg=model_avg,
+        params=params,
+        optimizer=optimizer,
+        scheduler=scheduler,
+        sampler=sampler,
+        scaler=scaler,
+        rank=rank,
+    )
+
+    if params.best_train_epoch == params.cur_epoch:
+        best_train_filename = params.exp_dir / "best-train-loss.pt"
+        copyfile(src=filename, dst=best_train_filename)
+
+    if params.best_valid_epoch == params.cur_epoch:
+        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
+        copyfile(src=filename, dst=best_valid_filename)
+
+
+def compute_heat_loss(x_masked, batch, num_channels=2) -> Tensor:
+    """
+    Compute HEAT loss for separated sources using the output of mask encoder.
+    Args:
+      x_masked:
+        The output of mask encoder. It is a tensor of shape (B, T, C).
+      batch:
+        A batch of data. See `lhotse.dataset.K2SurtDatasetWithSources()`
+        for the content in it.
+      num_channels:
+        The number of output branches in the SURT model.
+    """
+    B, T, D = x_masked[0].shape
+    device = x_masked[0].device
+
+    # Create training targets for each channel.
+    targets = []
+    for i in range(num_channels):
+        target = torch.ones_like(x_masked[i]) * LOG_EPSILON
+        targets.append(target)
+
+    source_feats = batch["source_feats"]
+    source_boundaries = batch["source_boundaries"]
+    input_lens = batch["input_lens"].to(device)
+    # Assign sources to channels based on the HEAT criteria
+    for b in range(B):
+        cut_source_feats = source_feats[b]
+        cut_source_boundaries = source_boundaries[b]
+        last_seg_end = [0 for _ in range(num_channels)]
+        for source_feat, (start, end) in zip(cut_source_feats, cut_source_boundaries):
+            assigned = False
+            end = min(end, T)
+            source_feat = source_feat[: end - start, :]
+            for i in range(num_channels):
+                if start >= last_seg_end[i]:
+                    targets[i][b, start:end, :] += source_feat.to(device)
+                    last_seg_end[i] = max(end, last_seg_end[i])
+                    assigned = True
+                    break
+            if not assigned:
+                min_end_channel = last_seg_end.index(min(last_seg_end))
+                targets[min_end_channel][b, start:end, :] += source_feat.to(device)
+                last_seg_end[min_end_channel] = max(end, last_seg_end[min_end_channel])
+
+    # Get padding mask based on input lengths
+    pad_mask = torch.arange(T, device=device).expand(B, T) > input_lens.unsqueeze(1)
+    pad_mask = pad_mask.unsqueeze(-1)
+
+    # Compute masked loss for each channel
+    losses = torch.zeros((num_channels, B, T, D), device=device)
+    for i in range(num_channels):
+        loss = nn.functional.mse_loss(x_masked[i], targets[i], reduction="none")
+        # Apply padding mask to loss
+        loss.masked_fill_(pad_mask, 0)
+        losses[i] = loss
+
+    # loss: C x B x T x D. pad_mask: B x T x 1
+    # We want to compute loss for each item in the batch. Each item has loss given
+    # by the sum over C, and average over T and D. For T, we need to use the padding.
+    loss = losses.sum(0).mean(-1).sum(-1) / batch["input_lens"].to(device)
+    return loss
+
+
+def compute_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    batch: dict,
+    is_training: bool,
+) -> Tuple[Tensor, MetricsTracker]:
+    """
+    Compute RNN-T loss given the model and its inputs.
+
+    Args:
+      params:
+        Parameters for training. See :func:`get_params`.
+      model:
+        The model for training. It is an instance of Conformer in our case.
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      is_training:
+        True for training. False for validation. When it is True, this
+        function enables autograd during computation; when it is False, it
+        disables autograd.
+    """
+    device = model.device if isinstance(model, DDP) else next(model.parameters()).device
+    feature = batch["inputs"].to(device)
+    feature_lens = batch["input_lens"].to(device)
+
+    # at entry, feature is (N, T, C)
+    assert feature.ndim == 3
+
+    # The dataloader returns text as a list of cuts, each of which is a list of channel
+    # text. We flatten this to a list where all channels are together, i.e., it looks like
+    # [utt1_ch1, utt2_ch1, ..., uttN_ch1, utt1_ch2, ...., uttN,ch2].
+    text = [val for tup in zip(*batch["text"]) for val in tup]
+    assert len(text) == len(feature) * params.num_channels
+
+    # Convert all channel texts to token IDs and create a ragged tensor.
+    y = sp.encode(text, out_type=int)
+    y = k2.RaggedTensor(y).to(device)
+
+    batch_idx_train = params.batch_idx_train
+    warm_step = params.model_warm_step
+
+    with torch.set_grad_enabled(is_training):
+        (simple_loss, pruned_loss, ctc_loss, x_masked) = model(
+            x=feature,
+            x_lens=feature_lens,
+            y=y,
+            prune_range=params.prune_range,
+            am_scale=params.am_scale,
+            lm_scale=params.lm_scale,
+            reduction="none",
+            subsampling_factor=params.subsampling_factor,
+        )
+        simple_loss_is_finite = torch.isfinite(simple_loss)
+        pruned_loss_is_finite = torch.isfinite(pruned_loss)
+        ctc_loss_is_finite = torch.isfinite(ctc_loss)
+
+        # Compute HEAT loss
+        if is_training and params.heat_loss_scale > 0.0:
+            heat_loss = compute_heat_loss(
+                x_masked, batch, num_channels=params.num_channels
+            )
+        else:
+            heat_loss = torch.tensor(0.0, device=device)
+
+        heat_loss_is_finite = torch.isfinite(heat_loss)
+        is_finite = (
+            simple_loss_is_finite
+            & pruned_loss_is_finite
+            & ctc_loss_is_finite
+            & heat_loss_is_finite
+        )
+        if not torch.all(is_finite):
+            logging.info(
+                "Not all losses are finite!\n"
+                f"simple_losses: {simple_loss}\n"
+                f"pruned_losses: {pruned_loss}\n"
+                f"ctc_losses: {ctc_loss}\n"
+                f"heat_losses: {heat_loss}\n"
+            )
+            display_and_save_batch(batch, params=params, sp=sp)
+            simple_loss = simple_loss[simple_loss_is_finite]
+            pruned_loss = pruned_loss[pruned_loss_is_finite]
+            ctc_loss = ctc_loss[ctc_loss_is_finite]
+            heat_loss = heat_loss[heat_loss_is_finite]
+
+            # If either all simple_loss or pruned_loss is inf or nan,
+            # we stop the training process by raising an exception
+            if (
+                torch.all(~simple_loss_is_finite)
+                or torch.all(~pruned_loss_is_finite)
+                or torch.all(~ctc_loss_is_finite)
+                or torch.all(~heat_loss_is_finite)
+            ):
+                raise ValueError(
+                    "There are too many utterances in this batch "
+                    "leading to inf or nan losses."
+                )
+
+        simple_loss_sum = simple_loss.sum()
+        pruned_loss_sum = pruned_loss.sum()
+        ctc_loss_sum = ctc_loss.sum()
+        heat_loss_sum = heat_loss.sum()
+
+        s = params.simple_loss_scale
+        # take down the scale on the simple loss from 1.0 at the start
+        # to params.simple_loss scale by warm_step.
+        simple_loss_scale = (
+            s
+            if batch_idx_train >= warm_step
+            else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
+        )
+        pruned_loss_scale = (
+            1.0
+            if batch_idx_train >= warm_step
+            else 0.1 + 0.9 * (batch_idx_train / warm_step)
+        )
+        loss = (
+            simple_loss_scale * simple_loss_sum
+            + pruned_loss_scale * pruned_loss_sum
+            + params.ctc_loss_scale * ctc_loss_sum
+            + params.heat_loss_scale * heat_loss_sum
+        )
+
+    assert loss.requires_grad == is_training
+
+    info = MetricsTracker()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        # info["frames"] is an approximate number for two reasons:
+        # (1) The acutal subsampling factor is ((lens - 1) // 2 - 1) // 2
+        # (2) If some utterances in the batch lead to inf/nan loss, they
+        #     are filtered out.
+        info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+
+    # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances`  # noqa
+    info["utterances"] = feature.size(0)
+    # averaged input duration in frames over utterances
+    info["utt_duration"] = feature_lens.sum().item()
+    # averaged padding proportion over utterances
+    info["utt_pad_proportion"] = (
+        ((feature.size(1) - feature_lens) / feature.size(1)).sum().item()
+    )
+
+    # Note: We use reduction=sum while computing the loss.
+    info["loss"] = loss.detach().cpu().item()
+    info["simple_loss"] = simple_loss_sum.detach().cpu().item()
+    info["pruned_loss"] = pruned_loss_sum.detach().cpu().item()
+    if params.ctc_loss_scale > 0.0:
+        info["ctc_loss"] = ctc_loss_sum.detach().cpu().item()
+    if params.heat_loss_scale > 0.0:
+        info["heat_loss"] = heat_loss_sum.detach().cpu().item()
+
+    return loss, info
+
+
+def compute_validation_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    valid_dl: torch.utils.data.DataLoader,
+    world_size: int = 1,
+) -> MetricsTracker:
+    """Run the validation process."""
+    model.eval()
+
+    tot_loss = MetricsTracker()
+
+    for batch_idx, batch in enumerate(valid_dl):
+        loss, loss_info = compute_loss(
+            params=params,
+            model=model,
+            sp=sp,
+            batch=batch,
+            is_training=False,
+        )
+        assert loss.requires_grad is False
+        tot_loss = tot_loss + loss_info
+
+    if world_size > 1:
+        tot_loss.reduce(loss.device)
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    if loss_value < params.best_valid_loss:
+        params.best_valid_epoch = params.cur_epoch
+        params.best_valid_loss = loss_value
+
+    return tot_loss
+
+
+def train_one_epoch(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    optimizer: torch.optim.Optimizer,
+    scheduler: LRSchedulerType,
+    sp: spm.SentencePieceProcessor,
+    train_dl: torch.utils.data.DataLoader,
+    valid_dl: torch.utils.data.DataLoader,
+    scaler: GradScaler,
+    model_avg: Optional[nn.Module] = None,
+    tb_writer: Optional[SummaryWriter] = None,
+    world_size: int = 1,
+    rank: int = 0,
+) -> None:
+    """Train the model for one epoch.
+
+    The training loss from the mean of all frames is saved in
+    `params.train_loss`. It runs the validation process every
+    `params.valid_interval` batches.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The model for training.
+      optimizer:
+        The optimizer we are using.
+      scheduler:
+        The learning rate scheduler, we call step() every step.
+      train_dl:
+        Dataloader for the training dataset.
+      valid_dl:
+        Dataloader for the validation dataset.
+      scaler:
+        The scaler used for mix precision training.
+      model_avg:
+        The stored model averaged from the start of training.
+      tb_writer:
+        Writer to write log messages to tensorboard.
+      world_size:
+        Number of nodes in DDP training. If it is 1, DDP is disabled.
+      rank:
+        The rank of the node in DDP training. If no DDP is used, it should
+        be set to 0.
+    """
+    torch.cuda.empty_cache()
+    model.train()
+
+    tot_loss = MetricsTracker()
+
+    cur_batch_idx = params.get("cur_batch_idx", 0)
+
+    for batch_idx, batch in enumerate(train_dl):
+        if batch_idx < cur_batch_idx:
+            continue
+        cur_batch_idx = batch_idx
+
+        params.batch_idx_train += 1
+        batch_size = batch["inputs"].shape[0]
+
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, loss_info = compute_loss(
+                    params=params,
+                    model=model,
+                    sp=sp,
+                    batch=batch,
+                    is_training=True,
+                )
+            # summary stats
+            tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
+
+            # NOTE: We use reduction==sum and loss is computed over utterances
+            # in the batch and there is no normalization to it so far.
+            scaler.scale(loss).backward()
+            set_batch_count(model, params.batch_idx_train)
+            scheduler.step_batch(params.batch_idx_train)
+
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.zero_grad()
+        except:  # noqa
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise
+
+        if params.print_diagnostics and batch_idx == 5:
+            return
+
+        if (
+            rank == 0
+            and params.batch_idx_train > 0
+            and params.batch_idx_train % params.average_period == 0
+        ):
+            update_averaged_model(
+                params=params,
+                model_cur=model,
+                model_avg=model_avg,
+            )
+
+        if (
+            params.batch_idx_train > 0
+            and params.batch_idx_train % params.save_every_n == 0
+        ):
+            params.cur_batch_idx = batch_idx
+            save_checkpoint_with_global_batch_idx(
+                out_dir=params.exp_dir,
+                global_batch_idx=params.batch_idx_train,
+                model=model,
+                model_avg=model_avg,
+                params=params,
+                optimizer=optimizer,
+                scheduler=scheduler,
+                sampler=train_dl.sampler,
+                scaler=scaler,
+                rank=rank,
+            )
+            del params.cur_batch_idx
+            remove_checkpoints(
+                out_dir=params.exp_dir,
+                topk=params.keep_last_k,
+                rank=rank,
+            )
+
+        if batch_idx % 100 == 0 and params.use_fp16:
+            # If the grad scale was less than 1, try increasing it.    The _growth_interval
+            # of the grad scaler is configurable, but we can't configure it to have different
+            # behavior depending on the current grad scale.
+            cur_grad_scale = scaler._scale.item()
+            if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
+                scaler.update(cur_grad_scale * 2.0)
+            if cur_grad_scale < 0.01:
+                logging.warning(f"Grad scale is small: {cur_grad_scale}")
+            if cur_grad_scale < 1.0e-05:
+                raise RuntimeError(
+                    f"grad_scale is too small, exiting: {cur_grad_scale}"
+                )
+
+        if batch_idx % params.log_interval == 0:
+            cur_lr = scheduler.get_last_lr()[0]
+            cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
+
+            logging.info(
+                f"Epoch {params.cur_epoch}, "
+                f"batch {batch_idx}, loss[{loss_info}], "
+                f"tot_loss[{tot_loss}], batch size: {batch_size}, "
+                f"lr: {cur_lr:.2e}, "
+                + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
+            )
+
+            if tb_writer is not None:
+                tb_writer.add_scalar(
+                    "train/learning_rate", cur_lr, params.batch_idx_train
+                )
+
+                loss_info.write_summary(
+                    tb_writer, "train/current_", params.batch_idx_train
+                )
+                tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
+                if params.use_fp16:
+                    tb_writer.add_scalar(
+                        "train/grad_scale", cur_grad_scale, params.batch_idx_train
+                    )
+
+        if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
+            logging.info("Computing validation loss")
+            valid_info = compute_validation_loss(
+                params=params,
+                model=model,
+                sp=sp,
+                valid_dl=valid_dl,
+                world_size=world_size,
+            )
+            model.train()
+            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
+            logging.info(
+                f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
+            )
+            if tb_writer is not None:
+                valid_info.write_summary(
+                    tb_writer, "train/valid_", params.batch_idx_train
+                )
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    params.train_loss = loss_value
+    if params.train_loss < params.best_train_loss:
+        params.best_train_epoch = params.cur_epoch
+        params.best_train_loss = params.train_loss
+
+
+def run(rank, world_size, args):
+    """
+    Args:
+      rank:
+        It is a value between 0 and `world_size-1`, which is
+        passed automatically by `mp.spawn()` in :func:`main`.
+        The node with rank 0 is responsible for saving checkpoint.
+      world_size:
+        Number of GPUs for DDP training.
+      args:
+        The return value of get_parser().parse_args()
+    """
+    params = get_params()
+    params.update(vars(args))
+
+    fix_random_seed(params.seed)
+    if world_size > 1:
+        setup_dist(rank, world_size, params.master_port)
+
+    setup_logger(f"{params.exp_dir}/log/log-train")
+    logging.info("Training started")
+
+    if args.tensorboard and rank == 0:
+        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
+    else:
+        tb_writer = None
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", rank)
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_surt_model(params)
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    assert params.save_every_n >= params.average_period
+    model_avg: Optional[nn.Module] = None
+    if rank == 0:
+        # model_avg is only used with rank 0
+        model_avg = copy.deepcopy(model)
+
+    assert params.start_epoch > 0, params.start_epoch
+    checkpoints = load_checkpoint_if_available(
+        params=params, model=model, model_avg=model_avg
+    )
+
+    model.to(device)
+
+    if checkpoints is None and params.model_init_ckpt is not None:
+        logging.info(
+            f"Initializing model with checkpoint from {params.model_init_ckpt}"
+        )
+        init_ckpt = torch.load(params.model_init_ckpt, map_location=device)
+        model.load_state_dict(init_ckpt["model"], strict=False)
+
+    if world_size > 1:
+        logging.info("Using DDP")
+        model = DDP(model, device_ids=[rank], find_unused_parameters=True)
+
+    parameters_names = []
+    parameters_names.append(
+        [name_param_pair[0] for name_param_pair in model.named_parameters()]
+    )
+    optimizer = ScaledAdam(
+        model.parameters(),
+        lr=params.base_lr,
+        clipping_scale=2.0,
+        parameters_names=parameters_names,
+    )
+
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
+
+    if checkpoints and "optimizer" in checkpoints:
+        logging.info("Loading optimizer state dict")
+        optimizer.load_state_dict(checkpoints["optimizer"])
+
+    if (
+        checkpoints
+        and "scheduler" in checkpoints
+        and checkpoints["scheduler"] is not None
+    ):
+        logging.info("Loading scheduler state dict")
+        scheduler.load_state_dict(checkpoints["scheduler"])
+
+    if params.print_diagnostics:
+        diagnostic = diagnostics.attach_diagnostics(model)
+
+    ami = AmiAsrDataModule(args)
+
+    train_cuts = ami.aimix_train_cuts(rvb_affix="comb", sources=True)
+    dev_cuts = ami.ami_cuts(split="dev", type="ihm-mix")
+    dev_cuts = dev_cuts.trim_to_supervision_groups(max_pause=0.0).filter(
+        lambda c: 0.2 <= c.duration <= 60.0
+    )
+
+    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
+        # We only load the sampler's state dict when it loads a checkpoint
+        # saved in the middle of an epoch
+        sampler_state_dict = checkpoints["sampler"]
+    else:
+        sampler_state_dict = None
+
+    train_dl = ami.train_dataloaders(
+        train_cuts,
+        sampler_state_dict=sampler_state_dict,
+        sources=True,
+    )
+    valid_dl = ami.valid_dataloaders(dev_cuts)
+
+    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
+    if checkpoints and "grad_scaler" in checkpoints:
+        logging.info("Loading grad scaler state dict")
+        scaler.load_state_dict(checkpoints["grad_scaler"])
+
+    for epoch in range(params.start_epoch, params.num_epochs + 1):
+        scheduler.step_epoch(epoch - 1)
+        fix_random_seed(params.seed + epoch - 1)
+        train_dl.sampler.set_epoch(epoch - 1)
+
+        if tb_writer is not None:
+            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
+
+        params.cur_epoch = epoch
+
+        train_one_epoch(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sp=sp,
+            train_dl=train_dl,
+            valid_dl=valid_dl,
+            scaler=scaler,
+            tb_writer=tb_writer,
+            world_size=world_size,
+            rank=rank,
+        )
+
+        if params.print_diagnostics:
+            diagnostic.print_diagnostics()
+            break
+
+        save_checkpoint(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sampler=train_dl.sampler,
+            scaler=scaler,
+            rank=rank,
+        )
+
+    logging.info("Done!")
+
+    if world_size > 1:
+        torch.distributed.barrier()
+        cleanup_dist()
+
+
+def display_and_save_batch(
+    batch: dict,
+    params: AttributeDict,
+    sp: spm.SentencePieceProcessor,
+) -> None:
+    """Display the batch statistics and save the batch into disk.
+
+    Args:
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      params:
+        Parameters for training. See :func:`get_params`.
+      sp:
+        The BPE model.
+    """
+    from lhotse.utils import uuid4
+
+    filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
+    logging.info(f"Saving batch to {filename}")
+    torch.save(batch, filename)
+
+    features = batch["inputs"]
+
+    logging.info(f"features shape: {features.shape}")
+
+    y = [sp.encode(text_ch) for text_ch in batch["text"]]
+    num_tokens = [sum(len(yi) for yi in y_ch) for y_ch in y]
+    logging.info(f"num tokens: {num_tokens}")
+
+
+def main():
+    parser = get_parser()
+    AmiAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    world_size = args.world_size
+    assert world_size >= 1
+    if world_size > 1:
+        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
+    else:
+        run(rank=0, world_size=1, args=args)
+
+
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/ami/SURT/dprnn_zipformer/train_adapt.py b/egs/ami/SURT/dprnn_zipformer/train_adapt.py
new file mode 100755
index 000000000..9f3b4425f
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/train_adapt.py
@@ -0,0 +1,1411 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                  Wei Kang,
+#                                                  Mingshuang Luo,)
+#                                                  Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+
+# ./dprnn_zipformer/train.py should be run before this script.
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+./dprnn_zipformer/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir dprnn_zipformer/exp_adapt \
+  --model-init-ckpt dprnn_zipformer/exp/epoch-30.pt \
+  --max-duration 550
+"""
+
+import argparse
+import copy
+import logging
+import warnings
+from itertools import chain
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple, Union
+
+import k2
+import optim
+import sentencepiece as spm
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+from asr_datamodule import AmiAsrDataModule
+from decoder import Decoder
+from dprnn import DPRNN
+from einops.layers.torch import Rearrange
+from joiner import Joiner
+from lhotse.cut import Cut
+from lhotse.dataset.sampling.base import CutSampler
+from lhotse.utils import LOG_EPSILON, fix_random_seed
+from model import SURT
+from optim import Eden, ScaledAdam
+from scaling import ScaledLinear, ScaledLSTM
+from torch import Tensor
+from torch.cuda.amp import GradScaler
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+from zipformer import Zipformer
+
+from icefall import diagnostics
+from icefall.checkpoint import load_checkpoint, remove_checkpoints
+from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
+from icefall.checkpoint import (
+    save_checkpoint_with_global_batch_idx,
+    update_averaged_model,
+)
+from icefall.dist import cleanup_dist, setup_dist
+from icefall.env import get_env_info
+from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+
+LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
+
+
+def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
+    if isinstance(model, DDP):
+        # get underlying nn.Module
+        model = model.module
+    for module in model.modules():
+        if hasattr(module, "batch_count"):
+            module.batch_count = batch_count
+
+
+def add_model_arguments(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--num-mask-encoder-layers",
+        type=int,
+        default=4,
+        help="Number of layers in the DPRNN based mask encoder.",
+    )
+
+    parser.add_argument(
+        "--mask-encoder-dim",
+        type=int,
+        default=256,
+        help="Hidden dimension of the LSTM blocks in DPRNN.",
+    )
+
+    parser.add_argument(
+        "--mask-encoder-segment-size",
+        type=int,
+        default=32,
+        help="Segment size of the SegLSTM in DPRNN. Ideally, this should be equal to the "
+        "decode-chunk-length of the zipformer encoder.",
+    )
+
+    parser.add_argument(
+        "--chunk-width-randomization",
+        type=bool,
+        default=False,
+        help="Whether to randomize the chunk width in DPRNN.",
+    )
+
+    # Zipformer config is based on:
+    # https://github.com/k2-fsa/icefall/pull/745#issuecomment-1405282740
+    parser.add_argument(
+        "--num-encoder-layers",
+        type=str,
+        default="2,2,2,2,2",
+        help="Number of zipformer encoder layers, comma separated.",
+    )
+
+    parser.add_argument(
+        "--feedforward-dims",
+        type=str,
+        default="768,768,768,768,768",
+        help="Feedforward dimension of the zipformer encoder layers, comma separated.",
+    )
+
+    parser.add_argument(
+        "--nhead",
+        type=str,
+        default="8,8,8,8,8",
+        help="Number of attention heads in the zipformer encoder layers.",
+    )
+
+    parser.add_argument(
+        "--encoder-dims",
+        type=str,
+        default="256,256,256,256,256",
+        help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated",
+    )
+
+    parser.add_argument(
+        "--attention-dims",
+        type=str,
+        default="192,192,192,192,192",
+        help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated;
+        not the same as embedding dimension.""",
+    )
+
+    parser.add_argument(
+        "--encoder-unmasked-dims",
+        type=str,
+        default="192,192,192,192,192",
+        help="Unmasked dimensions in the encoders, relates to augmentation during training.  "
+        "Must be <= each of encoder_dims.  Empirically, less than 256 seems to make performance "
+        " worse.",
+    )
+
+    parser.add_argument(
+        "--zipformer-downsampling-factors",
+        type=str,
+        default="1,2,4,8,2",
+        help="Downsampling factor for each stack of encoder layers.",
+    )
+
+    parser.add_argument(
+        "--cnn-module-kernels",
+        type=str,
+        default="31,31,31,31,31",
+        help="Sizes of kernels in convolution modules",
+    )
+
+    parser.add_argument(
+        "--use-joint-encoder-layer",
+        type=str,
+        default="linear",
+        choices=["linear", "lstm", "none"],
+        help="Whether to use a joint layer to combine all branches.",
+    )
+
+    parser.add_argument(
+        "--decoder-dim",
+        type=int,
+        default=512,
+        help="Embedding dimension in the decoder model.",
+    )
+
+    parser.add_argument(
+        "--joiner-dim",
+        type=int,
+        default=512,
+        help="""Dimension used in the joiner model.
+        Outputs from the encoder and decoder model are projected
+        to this dimension before adding.
+        """,
+    )
+
+    parser.add_argument(
+        "--short-chunk-size",
+        type=int,
+        default=50,
+        help="""Chunk length of dynamic training, the chunk size would be either
+        max sequence length of current batch or uniformly sampled from (1, short_chunk_size).
+        """,
+    )
+
+    parser.add_argument(
+        "--num-left-chunks",
+        type=int,
+        default=4,
+        help="How many left context can be seen in chunks when calculating attention.",
+    )
+
+    parser.add_argument(
+        "--decode-chunk-len",
+        type=int,
+        default=32,
+        help="The chunk size for decoding (in frames before subsampling)",
+    )
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--world-size",
+        type=int,
+        default=1,
+        help="Number of GPUs for DDP training.",
+    )
+
+    parser.add_argument(
+        "--master-port",
+        type=int,
+        default=12354,
+        help="Master port to use for DDP training.",
+    )
+
+    parser.add_argument(
+        "--tensorboard",
+        type=str2bool,
+        default=True,
+        help="Should various information be logged in tensorboard.",
+    )
+
+    parser.add_argument(
+        "--num-epochs",
+        type=int,
+        default=20,
+        help="Number of epochs to train.",
+    )
+
+    parser.add_argument(
+        "--start-epoch",
+        type=int,
+        default=1,
+        help="""Resume training from this epoch. It should be positive.
+        If larger than 1, it will load checkpoint from
+        exp-dir/epoch-{start_epoch-1}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--start-batch",
+        type=int,
+        default=0,
+        help="""If positive, --start-epoch is ignored and
+        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="conv_lstm_transducer_stateless_ctc/exp",
+        help="""The experiment dir.
+        It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--model-init-ckpt",
+        type=str,
+        default=None,
+        help="""The model checkpoint to initialize the model (either full or part).
+        If not specified, the model is randomly initialized.
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--base-lr", type=float, default=0.0001, help="The base learning rate."
+    )
+
+    parser.add_argument(
+        "--lr-batches",
+        type=float,
+        default=5000,
+        help="""Number of steps that affects how rapidly the learning rate
+        decreases. We suggest not to change this.""",
+    )
+
+    parser.add_argument(
+        "--lr-epochs",
+        type=float,
+        default=2,
+        help="""Number of epochs that affects how rapidly the learning rate decreases.
+        """,
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+
+    parser.add_argument(
+        "--prune-range",
+        type=int,
+        default=5,
+        help="The prune range for rnnt loss, it means how many symbols(context)"
+        "we are using to compute the loss",
+    )
+
+    parser.add_argument(
+        "--lm-scale",
+        type=float,
+        default=0.25,
+        help="The scale to smooth the loss with lm "
+        "(output of prediction network) part.",
+    )
+
+    parser.add_argument(
+        "--am-scale",
+        type=float,
+        default=0.0,
+        help="The scale to smooth the loss with am (output of encoder network) part.",
+    )
+
+    parser.add_argument(
+        "--simple-loss-scale",
+        type=float,
+        default=0.5,
+        help="To get pruning ranges, we will calculate a simple version"
+        "loss(joiner is just addition), this simple loss also uses for"
+        "training (as a regularization item). We will scale the simple loss"
+        "with this parameter before adding to the final loss.",
+    )
+
+    parser.add_argument(
+        "--ctc-loss-scale",
+        type=float,
+        default=0.2,
+        help="Scale for CTC loss.",
+    )
+
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="The seed for random generators intended for reproducibility",
+    )
+
+    parser.add_argument(
+        "--print-diagnostics",
+        type=str2bool,
+        default=False,
+        help="Accumulate stats on activations, print them and exit.",
+    )
+
+    parser.add_argument(
+        "--save-every-n",
+        type=int,
+        default=2000,
+        help="""Save checkpoint after processing this number of batches"
+        periodically. We save checkpoint to exp-dir/ whenever
+        params.batch_idx_train % save_every_n == 0. The checkpoint filename
+        has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
+        Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
+        end of each epoch where `xxx` is the epoch number counting from 0.
+        """,
+    )
+
+    parser.add_argument(
+        "--keep-last-k",
+        type=int,
+        default=1,
+        help="""Only keep this number of checkpoints on disk.
+        For instance, if it is 3, there are only 3 checkpoints
+        in the exp-dir with filenames `checkpoint-xxx.pt`.
+        It does not affect checkpoints with name `epoch-xxx.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--average-period",
+        type=int,
+        default=100,
+        help="""Update the averaged model, namely `model_avg`, after processing
+        this number of batches. `model_avg` is a separate version of model,
+        in which each floating-point parameter is the average of all the
+        parameters from the start of training. Each time we take the average,
+        we do: `model_avg = model * (average_period / batch_idx_train) +
+            model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
+        """,
+    )
+
+    parser.add_argument(
+        "--use-fp16",
+        type=str2bool,
+        default=False,
+        help="Whether to use half precision training.",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def get_params() -> AttributeDict:
+    """Return a dict containing training parameters.
+
+    All training related parameters that are not passed from the commandline
+    are saved in the variable `params`.
+
+    Commandline options are merged into `params` after they are parsed, so
+    you can also access them via `params`.
+
+    Explanation of options saved in `params`:
+
+        - best_train_loss: Best training loss so far. It is used to select
+                           the model that has the lowest training loss. It is
+                           updated during the training.
+
+        - best_valid_loss: Best validation loss so far. It is used to select
+                           the model that has the lowest validation loss. It is
+                           updated during the training.
+
+        - best_train_epoch: It is the epoch that has the best training loss.
+
+        - best_valid_epoch: It is the epoch that has the best validation loss.
+
+        - batch_idx_train: Used to writing statistics to tensorboard. It
+                           contains number of batches trained so far across
+                           epochs.
+
+        - log_interval:  Print training loss if batch_idx % log_interval` is 0
+
+        - reset_interval: Reset statistics if batch_idx % reset_interval is 0
+
+        - valid_interval:  Run validation if batch_idx % valid_interval is 0
+
+        - feature_dim: The model input dim. It has to match the one used
+                       in computing features.
+
+        - subsampling_factor:  The subsampling factor for the model.
+
+        - num_decoder_layers: Number of decoder layer of transformer decoder.
+
+        - warm_step: The warm_step for Noam optimizer.
+    """
+    params = AttributeDict(
+        {
+            "best_train_loss": float("inf"),
+            "best_valid_loss": float("inf"),
+            "best_train_epoch": -1,
+            "best_valid_epoch": -1,
+            "batch_idx_train": 0,
+            "log_interval": 50,
+            "reset_interval": 200,
+            "valid_interval": 2000,
+            # parameters for SURT
+            "num_channels": 2,
+            "feature_dim": 80,
+            "subsampling_factor": 4,  # not passed in, this is fixed
+            # parameters for Noam
+            "model_warm_step": 5000,  # arg given to model, not for lrate
+            # parameters for ctc loss
+            "beam_size": 10,
+            "use_double_scores": True,
+            "env_info": get_env_info(),
+        }
+    )
+
+    return params
+
+
+def get_mask_encoder_model(params: AttributeDict) -> nn.Module:
+    mask_encoder = DPRNN(
+        feature_dim=params.feature_dim,
+        input_size=params.mask_encoder_dim,
+        hidden_size=params.mask_encoder_dim,
+        output_size=params.feature_dim * params.num_channels,
+        segment_size=params.mask_encoder_segment_size,
+        num_blocks=params.num_mask_encoder_layers,
+        chunk_width_randomization=params.chunk_width_randomization,
+    )
+    return mask_encoder
+
+
+def get_encoder_model(params: AttributeDict) -> nn.Module:
+    # TODO: We can add an option to switch between Zipformer and Transformer
+    def to_int_tuple(s: str):
+        return tuple(map(int, s.split(",")))
+
+    encoder = Zipformer(
+        num_features=params.feature_dim,
+        output_downsampling_factor=2,
+        zipformer_downsampling_factors=to_int_tuple(
+            params.zipformer_downsampling_factors
+        ),
+        encoder_dims=to_int_tuple(params.encoder_dims),
+        attention_dim=to_int_tuple(params.attention_dims),
+        encoder_unmasked_dims=to_int_tuple(params.encoder_unmasked_dims),
+        nhead=to_int_tuple(params.nhead),
+        feedforward_dim=to_int_tuple(params.feedforward_dims),
+        cnn_module_kernels=to_int_tuple(params.cnn_module_kernels),
+        num_encoder_layers=to_int_tuple(params.num_encoder_layers),
+        num_left_chunks=params.num_left_chunks,
+        short_chunk_size=params.short_chunk_size,
+        decode_chunk_size=params.decode_chunk_len // 2,
+    )
+    return encoder
+
+
+def get_joint_encoder_layer(params: AttributeDict) -> nn.Module:
+    class TakeFirst(nn.Module):
+        def forward(self, x):
+            return x[0]
+
+    if params.use_joint_encoder_layer == "linear":
+        encoder_dim = int(params.encoder_dims.split(",")[-1])
+        joint_layer = nn.Sequential(
+            Rearrange("(c b) t d -> b t (c d)", c=params.num_channels),
+            nn.Linear(
+                params.num_channels * encoder_dim, params.num_channels * encoder_dim
+            ),
+            nn.ReLU(),
+            Rearrange("b t (c d) -> (c b) t d", c=params.num_channels),
+        )
+    elif params.use_joint_encoder_layer == "lstm":
+        encoder_dim = int(params.encoder_dims.split(",")[-1])
+        joint_layer = nn.Sequential(
+            Rearrange("(c b) t d -> b t (c d)", c=params.num_channels),
+            ScaledLSTM(
+                input_size=params.num_channels * encoder_dim,
+                hidden_size=params.num_channels * encoder_dim,
+                num_layers=1,
+                bias=True,
+                batch_first=True,
+                dropout=0.0,
+                bidirectional=False,
+            ),
+            TakeFirst(),
+            nn.ReLU(),
+            Rearrange("b t (c d) -> (c b) t d", c=params.num_channels),
+        )
+    elif params.use_joint_encoder_layer == "none":
+        joint_layer = None
+    else:
+        raise ValueError(
+            f"Unknown joint encoder layer type: {params.use_joint_encoder_layer}"
+        )
+    return joint_layer
+
+
+def get_decoder_model(params: AttributeDict) -> nn.Module:
+    decoder = Decoder(
+        vocab_size=params.vocab_size,
+        decoder_dim=params.decoder_dim,
+        blank_id=params.blank_id,
+        context_size=params.context_size,
+    )
+    return decoder
+
+
+def get_joiner_model(params: AttributeDict) -> nn.Module:
+    joiner = Joiner(
+        encoder_dim=int(params.encoder_dims.split(",")[-1]),
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+    return joiner
+
+
+def get_surt_model(
+    params: AttributeDict,
+) -> nn.Module:
+    mask_encoder = get_mask_encoder_model(params)
+    encoder = get_encoder_model(params)
+    joint_layer = get_joint_encoder_layer(params)
+    decoder = get_decoder_model(params)
+    joiner = get_joiner_model(params)
+
+    model = SURT(
+        mask_encoder=mask_encoder,
+        encoder=encoder,
+        joint_encoder_layer=joint_layer,
+        decoder=decoder,
+        joiner=joiner,
+        num_channels=params.num_channels,
+        encoder_dim=int(params.encoder_dims.split(",")[-1]),
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+    return model
+
+
+def load_checkpoint_if_available(
+    params: AttributeDict,
+    model: nn.Module,
+    model_avg: nn.Module = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+) -> Optional[Dict[str, Any]]:
+    """Load checkpoint from file.
+
+    If params.start_batch is positive, it will load the checkpoint from
+    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
+    params.start_epoch is larger than 1, it will load the checkpoint from
+    `params.start_epoch - 1`.
+
+    Apart from loading state dict for `model` and `optimizer` it also updates
+    `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
+    and `best_valid_loss` in `params`.
+
+    Args:
+      params:
+        The return value of :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer that we are using.
+      scheduler:
+        The scheduler that we are using.
+    Returns:
+      Return a dict containing previously saved training info.
+    """
+    if params.start_batch > 0:
+        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
+    elif params.start_epoch > 1:
+        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
+    else:
+        return None
+
+    assert filename.is_file(), f"{filename} does not exist!"
+
+    saved_params = load_checkpoint(
+        filename,
+        model=model,
+        model_avg=model_avg,
+        optimizer=optimizer,
+        scheduler=scheduler,
+    )
+
+    keys = [
+        "best_train_epoch",
+        "best_valid_epoch",
+        "batch_idx_train",
+        "best_train_loss",
+        "best_valid_loss",
+    ]
+    for k in keys:
+        params[k] = saved_params[k]
+
+    if params.start_batch > 0:
+        if "cur_epoch" in saved_params:
+            params["start_epoch"] = saved_params["cur_epoch"]
+
+    return saved_params
+
+
+def save_checkpoint(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    model_avg: Optional[nn.Module] = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+    sampler: Optional[CutSampler] = None,
+    scaler: Optional[GradScaler] = None,
+    rank: int = 0,
+) -> None:
+    """Save model, optimizer, scheduler and training stats to file.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer used in the training.
+      sampler:
+       The sampler for the training dataset.
+      scaler:
+        The scaler used for mix precision training.
+    """
+    if rank != 0:
+        return
+    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
+    save_checkpoint_impl(
+        filename=filename,
+        model=model,
+        model_avg=model_avg,
+        params=params,
+        optimizer=optimizer,
+        scheduler=scheduler,
+        sampler=sampler,
+        scaler=scaler,
+        rank=rank,
+    )
+
+    if params.best_train_epoch == params.cur_epoch:
+        best_train_filename = params.exp_dir / "best-train-loss.pt"
+        copyfile(src=filename, dst=best_train_filename)
+
+    if params.best_valid_epoch == params.cur_epoch:
+        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
+        copyfile(src=filename, dst=best_valid_filename)
+
+
+def compute_heat_loss(x_masked, batch, num_channels=2) -> Tensor:
+    """
+    Compute HEAT loss for separated sources using the output of mask encoder.
+    Args:
+      x_masked:
+        The output of mask encoder. It is a tensor of shape (B, T, C).
+      batch:
+        A batch of data. See `lhotse.dataset.K2SurtDatasetWithSources()`
+        for the content in it.
+      num_channels:
+        The number of output branches in the SURT model.
+    """
+    B, T, D = x_masked[0].shape
+    device = x_masked[0].device
+
+    # Create training targets for each channel.
+    targets = []
+    for i in range(num_channels):
+        target = torch.ones_like(x_masked[i]) * LOG_EPSILON
+        targets.append(target)
+
+    source_feats = batch["source_feats"]
+    source_boundaries = batch["source_boundaries"]
+    input_lens = batch["input_lens"].to(device)
+    # Assign sources to channels based on the HEAT criteria
+    for b in range(B):
+        cut_source_feats = source_feats[b]
+        cut_source_boundaries = source_boundaries[b]
+        last_seg_end = [0 for _ in range(num_channels)]
+        for source_feat, (start, end) in zip(cut_source_feats, cut_source_boundaries):
+            assigned = False
+            for i in range(num_channels):
+                if start >= last_seg_end[i]:
+                    targets[i][b, start:end, :] += source_feat.to(device)
+                    last_seg_end[i] = max(end, last_seg_end[i])
+                    assigned = True
+                    break
+            if not assigned:
+                min_end_channel = last_seg_end.index(min(last_seg_end))
+                targets[min_end_channel][b, start:end, :] += source_feat
+                last_seg_end[min_end_channel] = max(end, last_seg_end[min_end_channel])
+
+    # Get padding mask based on input lengths
+    pad_mask = torch.arange(T, device=device).expand(B, T) > input_lens.unsqueeze(1)
+    pad_mask = pad_mask.unsqueeze(-1)
+
+    # Compute masked loss for each channel
+    losses = torch.zeros((num_channels, B, T, D), device=device)
+    for i in range(num_channels):
+        loss = nn.functional.mse_loss(x_masked[i], targets[i], reduction="none")
+        # Apply padding mask to loss
+        loss.masked_fill_(pad_mask, 0)
+        losses[i] = loss
+
+    # loss: C x B x T x D. pad_mask: B x T x 1
+    # We want to compute loss for each item in the batch. Each item has loss given
+    # by the sum over C, and average over T and D. For T, we need to use the padding.
+    loss = losses.sum(0).mean(-1).sum(-1) / batch["input_lens"].to(device)
+    return loss
+
+
+def compute_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    batch: dict,
+    is_training: bool,
+) -> Tuple[Tensor, MetricsTracker]:
+    """
+    Compute RNN-T loss given the model and its inputs.
+
+    Args:
+      params:
+        Parameters for training. See :func:`get_params`.
+      model:
+        The model for training. It is an instance of Conformer in our case.
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      is_training:
+        True for training. False for validation. When it is True, this
+        function enables autograd during computation; when it is False, it
+        disables autograd.
+    """
+    device = model.device if isinstance(model, DDP) else next(model.parameters()).device
+    feature = batch["inputs"].to(device)
+    feature_lens = batch["input_lens"].to(device)
+
+    # at entry, feature is (N, T, C)
+    assert feature.ndim == 3
+
+    # The dataloader returns text as a list of cuts, each of which is a list of channel
+    # text. We flatten this to a list where all channels are together, i.e., it looks like
+    # [utt1_ch1, utt2_ch1, ..., uttN_ch1, utt1_ch2, ...., uttN,ch2].
+    text = [val for tup in zip(*batch["text"]) for val in tup]
+    assert len(text) == len(feature) * params.num_channels
+
+    # Convert all channel texts to token IDs and create a ragged tensor.
+    y = sp.encode(text, out_type=int)
+    y = k2.RaggedTensor(y).to(device)
+
+    batch_idx_train = params.batch_idx_train
+    warm_step = params.model_warm_step
+
+    with torch.set_grad_enabled(is_training):
+        (simple_loss, pruned_loss, ctc_loss, x_masked) = model(
+            x=feature,
+            x_lens=feature_lens,
+            y=y,
+            prune_range=params.prune_range,
+            am_scale=params.am_scale,
+            lm_scale=params.lm_scale,
+            reduction="none",
+            subsampling_factor=params.subsampling_factor,
+        )
+        simple_loss_is_finite = torch.isfinite(simple_loss)
+        pruned_loss_is_finite = torch.isfinite(pruned_loss)
+        ctc_loss_is_finite = torch.isfinite(ctc_loss)
+
+        # Compute HEAT loss
+        if is_training and params.heat_loss_scale > 0.0:
+            heat_loss = compute_heat_loss(
+                x_masked, batch, num_channels=params.num_channels
+            )
+        else:
+            heat_loss = torch.tensor(0.0, device=device)
+
+        heat_loss_is_finite = torch.isfinite(heat_loss)
+        is_finite = (
+            simple_loss_is_finite
+            & pruned_loss_is_finite
+            & ctc_loss_is_finite
+            & heat_loss_is_finite
+        )
+        if not torch.all(is_finite):
+            # logging.info(
+            #     "Not all losses are finite!\n"
+            #     f"simple_losses: {simple_loss}\n"
+            #     f"pruned_losses: {pruned_loss}\n"
+            #     f"ctc_losses: {ctc_loss}\n"
+            #     f"heat_losses: {heat_loss}\n"
+            # )
+            # display_and_save_batch(batch, params=params, sp=sp)
+            simple_loss = simple_loss[simple_loss_is_finite]
+            pruned_loss = pruned_loss[pruned_loss_is_finite]
+            ctc_loss = ctc_loss[ctc_loss_is_finite]
+            heat_loss = heat_loss[heat_loss_is_finite]
+
+            # If either all simple_loss or pruned_loss is inf or nan,
+            # we stop the training process by raising an exception
+            if (
+                torch.all(~simple_loss_is_finite)
+                or torch.all(~pruned_loss_is_finite)
+                or torch.all(~ctc_loss_is_finite)
+                or torch.all(~heat_loss_is_finite)
+            ):
+                raise ValueError(
+                    "There are too many utterances in this batch "
+                    "leading to inf or nan losses."
+                )
+
+        simple_loss_sum = simple_loss.sum()
+        pruned_loss_sum = pruned_loss.sum()
+        ctc_loss_sum = ctc_loss.sum()
+        heat_loss_sum = heat_loss.sum()
+
+        s = params.simple_loss_scale
+        # take down the scale on the simple loss from 1.0 at the start
+        # to params.simple_loss scale by warm_step.
+        simple_loss_scale = (
+            s
+            if batch_idx_train >= warm_step
+            else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
+        )
+        pruned_loss_scale = (
+            1.0
+            if batch_idx_train >= warm_step
+            else 0.1 + 0.9 * (batch_idx_train / warm_step)
+        )
+        loss = (
+            simple_loss_scale * simple_loss_sum
+            + pruned_loss_scale * pruned_loss_sum
+            + params.ctc_loss_scale * ctc_loss_sum
+            + params.heat_loss_scale * heat_loss_sum
+        )
+
+    assert loss.requires_grad == is_training
+
+    info = MetricsTracker()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        # info["frames"] is an approximate number for two reasons:
+        # (1) The acutal subsampling factor is ((lens - 1) // 2 - 1) // 2
+        # (2) If some utterances in the batch lead to inf/nan loss, they
+        #     are filtered out.
+        info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+
+    # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances`  # noqa
+    info["utterances"] = feature.size(0)
+    # averaged input duration in frames over utterances
+    info["utt_duration"] = feature_lens.sum().item()
+    # averaged padding proportion over utterances
+    info["utt_pad_proportion"] = (
+        ((feature.size(1) - feature_lens) / feature.size(1)).sum().item()
+    )
+
+    # Note: We use reduction=sum while computing the loss.
+    info["loss"] = loss.detach().cpu().item()
+    info["simple_loss"] = simple_loss_sum.detach().cpu().item()
+    info["pruned_loss"] = pruned_loss_sum.detach().cpu().item()
+    if params.ctc_loss_scale > 0.0:
+        info["ctc_loss"] = ctc_loss_sum.detach().cpu().item()
+    if params.heat_loss_scale > 0.0:
+        info["heat_loss"] = heat_loss_sum.detach().cpu().item()
+
+    return loss, info
+
+
+def compute_validation_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    valid_dl: torch.utils.data.DataLoader,
+    world_size: int = 1,
+) -> MetricsTracker:
+    """Run the validation process."""
+    model.eval()
+
+    tot_loss = MetricsTracker()
+
+    for batch_idx, batch in enumerate(valid_dl):
+        loss, loss_info = compute_loss(
+            params=params,
+            model=model,
+            sp=sp,
+            batch=batch,
+            is_training=False,
+        )
+        assert loss.requires_grad is False
+        tot_loss = tot_loss + loss_info
+
+    if world_size > 1:
+        tot_loss.reduce(loss.device)
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    if loss_value < params.best_valid_loss:
+        params.best_valid_epoch = params.cur_epoch
+        params.best_valid_loss = loss_value
+
+    return tot_loss
+
+
+def train_one_epoch(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    optimizer: torch.optim.Optimizer,
+    scheduler: LRSchedulerType,
+    sp: spm.SentencePieceProcessor,
+    train_dl: torch.utils.data.DataLoader,
+    valid_dl: torch.utils.data.DataLoader,
+    scaler: GradScaler,
+    model_avg: Optional[nn.Module] = None,
+    tb_writer: Optional[SummaryWriter] = None,
+    world_size: int = 1,
+    rank: int = 0,
+) -> None:
+    """Train the model for one epoch.
+
+    The training loss from the mean of all frames is saved in
+    `params.train_loss`. It runs the validation process every
+    `params.valid_interval` batches.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The model for training.
+      optimizer:
+        The optimizer we are using.
+      scheduler:
+        The learning rate scheduler, we call step() every step.
+      train_dl:
+        Dataloader for the training dataset.
+      valid_dl:
+        Dataloader for the validation dataset.
+      scaler:
+        The scaler used for mix precision training.
+      model_avg:
+        The stored model averaged from the start of training.
+      tb_writer:
+        Writer to write log messages to tensorboard.
+      world_size:
+        Number of nodes in DDP training. If it is 1, DDP is disabled.
+      rank:
+        The rank of the node in DDP training. If no DDP is used, it should
+        be set to 0.
+    """
+    torch.cuda.empty_cache()
+    model.train()
+
+    tot_loss = MetricsTracker()
+
+    cur_batch_idx = params.get("cur_batch_idx", 0)
+
+    for batch_idx, batch in enumerate(train_dl):
+        if batch_idx < cur_batch_idx:
+            continue
+        cur_batch_idx = batch_idx
+
+        params.batch_idx_train += 1
+        batch_size = batch["inputs"].shape[0]
+
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, loss_info = compute_loss(
+                    params=params,
+                    model=model,
+                    sp=sp,
+                    batch=batch,
+                    is_training=True,
+                )
+            # summary stats
+            tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
+
+            # NOTE: We use reduction==sum and loss is computed over utterances
+            # in the batch and there is no normalization to it so far.
+            scaler.scale(loss).backward()
+            set_batch_count(model, params.batch_idx_train)
+            scheduler.step_batch(params.batch_idx_train)
+
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.zero_grad()
+        except:  # noqa
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise
+
+        if params.print_diagnostics and batch_idx == 5:
+            return
+
+        if (
+            rank == 0
+            and params.batch_idx_train > 0
+            and params.batch_idx_train % params.average_period == 0
+        ):
+            update_averaged_model(
+                params=params,
+                model_cur=model,
+                model_avg=model_avg,
+            )
+
+        if (
+            params.batch_idx_train > 0
+            and params.batch_idx_train % params.save_every_n == 0
+        ):
+            params.cur_batch_idx = batch_idx
+            save_checkpoint_with_global_batch_idx(
+                out_dir=params.exp_dir,
+                global_batch_idx=params.batch_idx_train,
+                model=model,
+                model_avg=model_avg,
+                params=params,
+                optimizer=optimizer,
+                scheduler=scheduler,
+                sampler=train_dl.sampler,
+                scaler=scaler,
+                rank=rank,
+            )
+            del params.cur_batch_idx
+            remove_checkpoints(
+                out_dir=params.exp_dir,
+                topk=params.keep_last_k,
+                rank=rank,
+            )
+
+        if batch_idx % 100 == 0 and params.use_fp16:
+            # If the grad scale was less than 1, try increasing it.    The _growth_interval
+            # of the grad scaler is configurable, but we can't configure it to have different
+            # behavior depending on the current grad scale.
+            cur_grad_scale = scaler._scale.item()
+            if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
+                scaler.update(cur_grad_scale * 2.0)
+            if cur_grad_scale < 0.01:
+                logging.warning(f"Grad scale is small: {cur_grad_scale}")
+            if cur_grad_scale < 1.0e-05:
+                raise RuntimeError(
+                    f"grad_scale is too small, exiting: {cur_grad_scale}"
+                )
+
+        if batch_idx % params.log_interval == 0:
+            cur_lr = scheduler.get_last_lr()[0]
+            cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
+
+            logging.info(
+                f"Epoch {params.cur_epoch}, "
+                f"batch {batch_idx}, loss[{loss_info}], "
+                f"tot_loss[{tot_loss}], batch size: {batch_size}, "
+                f"lr: {cur_lr:.2e}, "
+                + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
+            )
+
+            if tb_writer is not None:
+                tb_writer.add_scalar(
+                    "train/learning_rate", cur_lr, params.batch_idx_train
+                )
+
+                loss_info.write_summary(
+                    tb_writer, "train/current_", params.batch_idx_train
+                )
+                tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
+                if params.use_fp16:
+                    tb_writer.add_scalar(
+                        "train/grad_scale", cur_grad_scale, params.batch_idx_train
+                    )
+
+        if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
+            logging.info("Computing validation loss")
+            valid_info = compute_validation_loss(
+                params=params,
+                model=model,
+                sp=sp,
+                valid_dl=valid_dl,
+                world_size=world_size,
+            )
+            model.train()
+            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
+            logging.info(
+                f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
+            )
+            if tb_writer is not None:
+                valid_info.write_summary(
+                    tb_writer, "train/valid_", params.batch_idx_train
+                )
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    params.train_loss = loss_value
+    if params.train_loss < params.best_train_loss:
+        params.best_train_epoch = params.cur_epoch
+        params.best_train_loss = params.train_loss
+
+
+def run(rank, world_size, args):
+    """
+    Args:
+      rank:
+        It is a value between 0 and `world_size-1`, which is
+        passed automatically by `mp.spawn()` in :func:`main`.
+        The node with rank 0 is responsible for saving checkpoint.
+      world_size:
+        Number of GPUs for DDP training.
+      args:
+        The return value of get_parser().parse_args()
+    """
+    params = get_params()
+    params.update(vars(args))
+
+    fix_random_seed(params.seed)
+    if world_size > 1:
+        setup_dist(rank, world_size, params.master_port)
+
+    setup_logger(f"{params.exp_dir}/log/log-train")
+    logging.info("Training started")
+
+    if args.tensorboard and rank == 0:
+        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
+    else:
+        tb_writer = None
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", rank)
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_surt_model(params)
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    assert params.save_every_n >= params.average_period
+    model_avg: Optional[nn.Module] = None
+    if rank == 0:
+        # model_avg is only used with rank 0
+        model_avg = copy.deepcopy(model)
+
+    assert params.start_epoch > 0, params.start_epoch
+    checkpoints = load_checkpoint_if_available(
+        params=params, model=model, model_avg=model_avg
+    )
+
+    model.to(device)
+
+    if checkpoints is None and params.model_init_ckpt is not None:
+        logging.info(
+            f"Initializing model with checkpoint from {params.model_init_ckpt}"
+        )
+        init_ckpt = torch.load(params.model_init_ckpt, map_location=device)
+        model.load_state_dict(init_ckpt["model"], strict=False)
+
+    if world_size > 1:
+        logging.info("Using DDP")
+        model = DDP(model, device_ids=[rank], find_unused_parameters=True)
+
+    parameters_names = []
+    parameters_names.append(
+        [name_param_pair[0] for name_param_pair in model.named_parameters()]
+    )
+    optimizer = ScaledAdam(
+        model.parameters(),
+        lr=params.base_lr,
+        clipping_scale=2.0,
+        parameters_names=parameters_names,
+    )
+
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
+
+    if checkpoints and "optimizer" in checkpoints:
+        logging.info("Loading optimizer state dict")
+        optimizer.load_state_dict(checkpoints["optimizer"])
+
+    if (
+        checkpoints
+        and "scheduler" in checkpoints
+        and checkpoints["scheduler"] is not None
+    ):
+        logging.info("Loading scheduler state dict")
+        scheduler.load_state_dict(checkpoints["scheduler"])
+
+    if params.print_diagnostics:
+        diagnostic = diagnostics.attach_diagnostics(model)
+
+    ami = AmiAsrDataModule(args)
+
+    train_cuts = ami.train_cuts()
+    train_cuts = train_cuts.filter(lambda c: 0.5 <= c.duration <= 35.0)
+    dev_cuts = ami.ami_cuts(split="dev", type="ihm-mix")
+    dev_cuts = dev_cuts.trim_to_supervision_groups(max_pause=0.0).filter(
+        lambda c: 0.2 <= c.duration <= 60.0
+    )
+
+    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
+        # We only load the sampler's state dict when it loads a checkpoint
+        # saved in the middle of an epoch
+        sampler_state_dict = checkpoints["sampler"]
+    else:
+        sampler_state_dict = None
+
+    train_dl = ami.train_dataloaders(
+        train_cuts,
+        sampler_state_dict=sampler_state_dict,
+    )
+    valid_dl = ami.valid_dataloaders(dev_cuts)
+
+    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
+    if checkpoints and "grad_scaler" in checkpoints:
+        logging.info("Loading grad scaler state dict")
+        scaler.load_state_dict(checkpoints["grad_scaler"])
+
+    for epoch in range(params.start_epoch, params.num_epochs + 1):
+        scheduler.step_epoch(epoch - 1)
+        fix_random_seed(params.seed + epoch - 1)
+        train_dl.sampler.set_epoch(epoch - 1)
+
+        if tb_writer is not None:
+            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
+
+        params.cur_epoch = epoch
+
+        train_one_epoch(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sp=sp,
+            train_dl=train_dl,
+            valid_dl=valid_dl,
+            scaler=scaler,
+            tb_writer=tb_writer,
+            world_size=world_size,
+            rank=rank,
+        )
+
+        if params.print_diagnostics:
+            diagnostic.print_diagnostics()
+            break
+
+        save_checkpoint(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sampler=train_dl.sampler,
+            scaler=scaler,
+            rank=rank,
+        )
+
+    logging.info("Done!")
+
+    if world_size > 1:
+        torch.distributed.barrier()
+        cleanup_dist()
+
+
+def display_and_save_batch(
+    batch: dict,
+    params: AttributeDict,
+    sp: spm.SentencePieceProcessor,
+) -> None:
+    """Display the batch statistics and save the batch into disk.
+
+    Args:
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      params:
+        Parameters for training. See :func:`get_params`.
+      sp:
+        The BPE model.
+    """
+    from lhotse.utils import uuid4
+
+    filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
+    logging.info(f"Saving batch to {filename}")
+    torch.save(batch, filename)
+
+    features = batch["inputs"]
+
+    logging.info(f"features shape: {features.shape}")
+
+    y = [sp.encode(text_ch) for text_ch in batch["text"]]
+    num_tokens = [sum(len(yi) for yi in y_ch) for y_ch in y]
+    logging.info(f"num tokens: {num_tokens}")
+
+
+def main():
+    parser = get_parser()
+    AmiAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    world_size = args.world_size
+    assert world_size >= 1
+    if world_size > 1:
+        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
+    else:
+        run(rank=0, world_size=1, args=args)
+
+
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/ami/SURT/dprnn_zipformer/zipformer.py b/egs/ami/SURT/dprnn_zipformer/zipformer.py
new file mode 120000
index 000000000..59b772024
--- /dev/null
+++ b/egs/ami/SURT/dprnn_zipformer/zipformer.py
@@ -0,0 +1 @@
+../../../libricss/SURT/dprnn_zipformer/zipformer.py
\ No newline at end of file
diff --git a/egs/ami/SURT/local/add_source_feats.py b/egs/ami/SURT/local/add_source_feats.py
new file mode 100755
index 000000000..0917b88a6
--- /dev/null
+++ b/egs/ami/SURT/local/add_source_feats.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file adds source features as temporal arrays to the mixture manifests.
+It looks for manifests in the directory data/manifests.
+"""
+import logging
+from pathlib import Path
+
+import numpy as np
+from lhotse import CutSet, LilcomChunkyWriter, load_manifest, load_manifest_lazy
+from tqdm import tqdm
+
+
+def add_source_feats():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+
+    logging.info("Reading mixed cuts")
+    mixed_cuts_clean = load_manifest_lazy(src_dir / "cuts_train_clean.jsonl.gz")
+    mixed_cuts_reverb = load_manifest_lazy(src_dir / "cuts_train_reverb.jsonl.gz")
+
+    logging.info("Reading source cuts")
+    source_cuts = load_manifest(src_dir / "ihm_cuts_train_trimmed.jsonl.gz")
+
+    logging.info("Adding source features to the mixed cuts")
+    pbar = tqdm(total=len(mixed_cuts_clean), desc="Adding source features")
+    with CutSet.open_writer(
+        src_dir / "cuts_train_clean_sources.jsonl.gz"
+    ) as cut_writer_clean, CutSet.open_writer(
+        src_dir / "cuts_train_reverb_sources.jsonl.gz"
+    ) as cut_writer_reverb, LilcomChunkyWriter(
+        output_dir / "feats_train_clean_sources"
+    ) as source_feat_writer:
+        for cut_clean, cut_reverb in zip(mixed_cuts_clean, mixed_cuts_reverb):
+            assert cut_reverb.id == cut_clean.id + "_rvb"
+            source_feats = []
+            source_feat_offsets = []
+            cur_offset = 0
+            for sup in sorted(
+                cut_clean.supervisions, key=lambda s: (s.start, s.speaker)
+            ):
+                source_cut = source_cuts[sup.id]
+                source_feats.append(source_cut.load_features())
+                source_feat_offsets.append(cur_offset)
+                cur_offset += source_cut.num_frames
+            cut_clean.source_feats = source_feat_writer.store_array(
+                cut_clean.id, np.concatenate(source_feats, axis=0)
+            )
+            cut_clean.source_feat_offsets = source_feat_offsets
+            cut_writer_clean.write(cut_clean)
+            # Also write the reverb cut
+            cut_reverb.source_feats = cut_clean.source_feats
+            cut_reverb.source_feat_offsets = cut_clean.source_feat_offsets
+            cut_writer_reverb.write(cut_reverb)
+            pbar.update(1)
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    add_source_feats()
diff --git a/egs/ami/SURT/local/compute_fbank_aimix.py b/egs/ami/SURT/local/compute_fbank_aimix.py
new file mode 100755
index 000000000..91b3a060b
--- /dev/null
+++ b/egs/ami/SURT/local/compute_fbank_aimix.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the synthetically mixed AMI and ICSI
+train set.
+It looks for manifests in the directory data/manifests.
+
+The generated fbank features are saved in data/fbank.
+"""
+import logging
+import random
+import warnings
+from pathlib import Path
+
+import torch
+import torch.multiprocessing
+import torchaudio
+from lhotse import (
+    AudioSource,
+    LilcomChunkyWriter,
+    Recording,
+    load_manifest,
+    load_manifest_lazy,
+)
+from lhotse.audio import set_ffmpeg_torchaudio_info_enabled
+from lhotse.cut import MixedCut, MixTrack, MultiCut
+from lhotse.features.kaldifeat import (
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    KaldifeatFrameOptions,
+    KaldifeatMelOptions,
+)
+from lhotse.utils import fix_random_seed, uuid4
+from tqdm import tqdm
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+torchaudio.set_audio_backend("soundfile")
+set_ffmpeg_torchaudio_info_enabled(False)
+
+
+def compute_fbank_aimix():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+
+    sampling_rate = 16000
+    num_mel_bins = 80
+
+    extractor = KaldifeatFbank(
+        KaldifeatFbankConfig(
+            frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
+            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
+            device="cuda",
+        )
+    )
+
+    logging.info("Reading manifests")
+    train_cuts = load_manifest_lazy(src_dir / "ai-mix_cuts_clean_full.jsonl.gz")
+
+    # only uses RIRs and noises from REVERB challenge
+    real_rirs = load_manifest(src_dir / "real-rir_recordings_all.jsonl.gz").filter(
+        lambda r: "RVB2014" in r.id
+    )
+    noises = load_manifest(src_dir / "iso-noise_recordings_all.jsonl.gz").filter(
+        lambda r: "RVB2014" in r.id
+    )
+
+    # Apply perturbation to the training cuts
+    logging.info("Applying perturbation to the training cuts")
+    train_cuts_rvb = train_cuts.map(
+        lambda c: augment(
+            c, perturb_snr=True, rirs=real_rirs, noises=noises, perturb_loudness=True
+        )
+    )
+
+    logging.info("Extracting fbank features for training cuts")
+    _ = train_cuts.compute_and_store_features_batch(
+        extractor=extractor,
+        storage_path=output_dir / "ai-mix_feats_clean",
+        manifest_path=src_dir / "cuts_train_clean.jsonl.gz",
+        batch_duration=5000,
+        num_workers=4,
+        storage_type=LilcomChunkyWriter,
+        overwrite=True,
+    )
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        _ = train_cuts_rvb.compute_and_store_features_batch(
+            extractor=extractor,
+            storage_path=output_dir / "ai-mix_feats_reverb",
+            manifest_path=src_dir / "cuts_train_reverb.jsonl.gz",
+            batch_duration=5000,
+            num_workers=4,
+            storage_type=LilcomChunkyWriter,
+            overwrite=True,
+        )
+
+
+def augment(cut, perturb_snr=False, rirs=None, noises=None, perturb_loudness=False):
+    """
+    Given a mixed cut, this function optionally applies the following augmentations:
+    - Perturbing the SNRs of the tracks (in range [-5, 5] dB)
+    - Reverberation using a randomly selected RIR
+    - Adding noise
+    - Perturbing the loudness (in range [-20, -25] dB)
+    """
+    out_cut = cut.drop_features()
+
+    # Perturb the SNRs (optional)
+    if perturb_snr:
+        snrs = [random.uniform(-5, 5) for _ in range(len(cut.tracks))]
+        for i, (track, snr) in enumerate(zip(out_cut.tracks, snrs)):
+            if i == 0:
+                # Skip the first track since it is the reference
+                continue
+            track.snr = snr
+
+    # Reverberate the cut (optional)
+    if rirs is not None:
+        # Select an RIR at random
+        rir = random.choice(rirs)
+        # Select a channel at random
+        rir_channel = random.choice(list(range(rir.num_channels)))
+        # Reverberate the cut
+        out_cut = out_cut.reverb_rir(rir_recording=rir, rir_channels=[rir_channel])
+
+    # Add noise (optional)
+    if noises is not None:
+        # Select a noise recording at random
+        noise = random.choice(noises).to_cut()
+        if isinstance(noise, MultiCut):
+            noise = noise.to_mono()[0]
+        # Select an SNR at random
+        snr = random.uniform(10, 30)
+        # Repeat the noise to match the duration of the cut
+        noise = repeat_cut(noise, out_cut.duration)
+        out_cut = MixedCut(
+            id=out_cut.id,
+            tracks=[
+                MixTrack(cut=out_cut, type="MixedCut"),
+                MixTrack(cut=noise, type="DataCut", snr=snr),
+            ],
+        )
+
+    # Perturb the loudness (optional)
+    if perturb_loudness:
+        target_loudness = random.uniform(-20, -25)
+        out_cut = out_cut.normalize_loudness(target_loudness, mix_first=True)
+    return out_cut
+
+
+def repeat_cut(cut, duration):
+    while cut.duration < duration:
+        cut = cut.mix(cut, offset_other_by=cut.duration)
+    return cut.truncate(duration=duration)
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    fix_random_seed(42)
+    compute_fbank_aimix()
diff --git a/egs/ami/SURT/local/compute_fbank_ami.py b/egs/ami/SURT/local/compute_fbank_ami.py
new file mode 100755
index 000000000..351b41765
--- /dev/null
+++ b/egs/ami/SURT/local/compute_fbank_ami.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the AMI dataset.
+We compute features for full recordings (i.e., without trimming to supervisions).
+This way we can create arbitrary segmentations later.
+
+The generated fbank features are saved in data/fbank.
+"""
+import logging
+import math
+from pathlib import Path
+
+import torch
+import torch.multiprocessing
+from lhotse import CutSet, LilcomChunkyWriter
+from lhotse.features.kaldifeat import (
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    KaldifeatFrameOptions,
+    KaldifeatMelOptions,
+)
+from lhotse.recipes.utils import read_manifests_if_cached
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+
+def compute_fbank_ami():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+
+    sampling_rate = 16000
+    num_mel_bins = 80
+
+    extractor = KaldifeatFbank(
+        KaldifeatFbankConfig(
+            frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
+            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
+            device="cuda",
+        )
+    )
+
+    logging.info("Reading manifests")
+    manifests = {}
+    for part in ["ihm-mix", "sdm", "mdm8-bf"]:
+        manifests[part] = read_manifests_if_cached(
+            dataset_parts=["train", "dev", "test"],
+            output_dir=src_dir,
+            prefix=f"ami-{part}",
+            suffix="jsonl.gz",
+        )
+
+    for part in ["ihm-mix", "sdm", "mdm8-bf"]:
+        for split in ["train", "dev", "test"]:
+            logging.info(f"Processing {part} {split}")
+            cuts = CutSet.from_manifests(
+                **manifests[part][split]
+            ).compute_and_store_features_batch(
+                extractor=extractor,
+                storage_path=output_dir / f"ami-{part}_{split}_feats",
+                manifest_path=src_dir / f"cuts_ami-{part}_{split}.jsonl.gz",
+                batch_duration=5000,
+                num_workers=4,
+                storage_type=LilcomChunkyWriter,
+            )
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    compute_fbank_ami()
diff --git a/egs/ami/SURT/local/compute_fbank_icsi.py b/egs/ami/SURT/local/compute_fbank_icsi.py
new file mode 100755
index 000000000..4e2ff3f3b
--- /dev/null
+++ b/egs/ami/SURT/local/compute_fbank_icsi.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the ICSI dataset.
+We compute features for full recordings (i.e., without trimming to supervisions).
+This way we can create arbitrary segmentations later.
+
+The generated fbank features are saved in data/fbank.
+"""
+import logging
+import math
+from pathlib import Path
+
+import torch
+import torch.multiprocessing
+from lhotse import CutSet, LilcomChunkyWriter
+from lhotse.features.kaldifeat import (
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    KaldifeatFrameOptions,
+    KaldifeatMelOptions,
+)
+from lhotse.recipes.utils import read_manifests_if_cached
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+
+def compute_fbank_icsi():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+
+    sampling_rate = 16000
+    num_mel_bins = 80
+
+    extractor = KaldifeatFbank(
+        KaldifeatFbankConfig(
+            frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
+            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
+            device="cuda",
+        )
+    )
+
+    logging.info("Reading manifests")
+    manifests = {}
+    for part in ["ihm-mix", "sdm"]:
+        manifests[part] = read_manifests_if_cached(
+            dataset_parts=["train"],
+            output_dir=src_dir,
+            prefix=f"icsi-{part}",
+            suffix="jsonl.gz",
+        )
+
+    for part in ["ihm-mix", "sdm"]:
+        for split in ["train"]:
+            logging.info(f"Processing {part} {split}")
+            cuts = CutSet.from_manifests(
+                **manifests[part][split]
+            ).compute_and_store_features_batch(
+                extractor=extractor,
+                storage_path=output_dir / f"icsi-{part}_{split}_feats",
+                manifest_path=src_dir / f"cuts_icsi-{part}_{split}.jsonl.gz",
+                batch_duration=5000,
+                num_workers=4,
+                storage_type=LilcomChunkyWriter,
+                overwrite=True,
+            )
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    compute_fbank_icsi()
diff --git a/egs/ami/SURT/local/compute_fbank_ihm.py b/egs/ami/SURT/local/compute_fbank_ihm.py
new file mode 100755
index 000000000..56f54aa21
--- /dev/null
+++ b/egs/ami/SURT/local/compute_fbank_ihm.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the trimmed sub-segments which will be
+used for simulating the training mixtures.
+
+The generated fbank features are saved in data/fbank.
+"""
+import logging
+import math
+from pathlib import Path
+
+import torch
+import torch.multiprocessing
+import torchaudio
+from lhotse import CutSet, LilcomChunkyWriter, load_manifest
+from lhotse.audio import set_ffmpeg_torchaudio_info_enabled
+from lhotse.features.kaldifeat import (
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    KaldifeatFrameOptions,
+    KaldifeatMelOptions,
+)
+from lhotse.recipes.utils import read_manifests_if_cached
+from tqdm import tqdm
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+torchaudio.set_audio_backend("soundfile")
+set_ffmpeg_torchaudio_info_enabled(False)
+
+
+def compute_fbank_ihm():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+
+    sampling_rate = 16000
+    num_mel_bins = 80
+
+    extractor = KaldifeatFbank(
+        KaldifeatFbankConfig(
+            frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
+            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
+            device="cuda",
+        )
+    )
+
+    logging.info("Reading manifests")
+    manifests = {}
+    for data in ["ami", "icsi"]:
+        manifests[data] = read_manifests_if_cached(
+            dataset_parts=["train"],
+            output_dir=src_dir,
+            types=["recordings", "supervisions"],
+            prefix=f"{data}-ihm",
+            suffix="jsonl.gz",
+        )
+
+    logging.info("Computing features")
+    for data in ["ami", "icsi"]:
+        cs = CutSet.from_manifests(**manifests[data]["train"])
+        cs = cs.trim_to_supervisions(keep_overlapping=False)
+        cs = cs.normalize_loudness(target=-23.0, affix_id=False)
+        cs = cs + cs.perturb_speed(0.9) + cs.perturb_speed(1.1)
+        _ = cs.compute_and_store_features_batch(
+            extractor=extractor,
+            storage_path=output_dir / f"{data}-ihm_train_feats",
+            manifest_path=src_dir / f"{data}-ihm_cuts_train.jsonl.gz",
+            batch_duration=5000,
+            num_workers=4,
+            storage_type=LilcomChunkyWriter,
+            overwrite=True,
+        )
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    compute_fbank_ihm()
diff --git a/egs/ami/SURT/local/prepare_ami_train_cuts.py b/egs/ami/SURT/local/prepare_ami_train_cuts.py
new file mode 100755
index 000000000..72fced70d
--- /dev/null
+++ b/egs/ami/SURT/local/prepare_ami_train_cuts.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file creates AMI train segments.
+"""
+import logging
+import math
+from pathlib import Path
+
+import torch
+import torch.multiprocessing
+from lhotse import LilcomChunkyWriter, load_manifest_lazy
+from lhotse.cut import Cut, CutSet
+from lhotse.utils import EPSILON, add_durations
+from tqdm import tqdm
+
+
+def cut_into_windows(cuts: CutSet, duration: float):
+    """
+    This function takes a CutSet and cuts each cut into windows of roughly
+    `duration` seconds. By roughly, we mean that we try to adjust for the last supervision
+    that exceeds the duration, or is shorter than the duration.
+    """
+    res = []
+    with tqdm() as pbar:
+        for cut in cuts:
+            pbar.update(1)
+            sups = cut.index_supervisions()[cut.id]
+            sr = cut.sampling_rate
+            start = 0.0
+            end = duration
+            num_tries = 0
+            while start < cut.duration and num_tries < 2:
+                # Find the supervision that are cut by the window endpoint
+                hitlist = [iv for iv in sups.at(end) if iv.begin < end]
+                # If there are no supervisions, we are done
+                if not hitlist:
+                    res.append(
+                        cut.truncate(
+                            offset=start,
+                            duration=add_durations(end, -start, sampling_rate=sr),
+                            keep_excessive_supervisions=False,
+                        )
+                    )
+                    # Update the start and end for the next window
+                    start = end
+                    end = add_durations(end, duration, sampling_rate=sr)
+                else:
+                    # find ratio of durations cut by the window endpoint
+                    ratios = [
+                        add_durations(end, -iv.end, sampling_rate=sr) / iv.length()
+                        for iv in hitlist
+                    ]
+                    # we retain the supervisions that have >50% of their duration
+                    # in the window, and discard the others
+                    retained = []
+                    discarded = []
+                    for iv, ratio in zip(hitlist, ratios):
+                        if ratio > 0.5:
+                            retained.append(iv)
+                        else:
+                            discarded.append(iv)
+                    cur_end = max(iv.end for iv in retained) if retained else end
+                    res.append(
+                        cut.truncate(
+                            offset=start,
+                            duration=add_durations(cur_end, -start, sampling_rate=sr),
+                            keep_excessive_supervisions=False,
+                        )
+                    )
+                    # For the next window, we start at the earliest discarded supervision
+                    next_start = min(iv.begin for iv in discarded) if discarded else end
+                    next_end = add_durations(next_start, duration, sampling_rate=sr)
+                    # It may happen that next_start is the same as start, in which case
+                    # we will advance the window anyway
+                    if next_start == start:
+                        logging.warning(
+                            f"Next start is the same as start: {next_start} == {start} for cut {cut.id}"
+                        )
+                        start = end + EPSILON
+                        end = add_durations(start, duration, sampling_rate=sr)
+                        num_tries += 1
+                    else:
+                        start = next_start
+                        end = next_end
+    return CutSet.from_cuts(res)
+
+
+def prepare_train_cuts():
+    src_dir = Path("data/manifests")
+
+    logging.info("Loading the manifests")
+    train_cuts_ihm = load_manifest_lazy(
+        src_dir / "cuts_ami-ihm-mix_train.jsonl.gz"
+    ).map(lambda c: c.with_id(f"{c.id}_ihm-mix"))
+    train_cuts_sdm = load_manifest_lazy(src_dir / "cuts_ami-sdm_train.jsonl.gz").map(
+        lambda c: c.with_id(f"{c.id}_sdm")
+    )
+    train_cuts_mdm = load_manifest_lazy(
+        src_dir / "cuts_ami-mdm8-bf_train.jsonl.gz"
+    ).map(lambda c: c.with_id(f"{c.id}_mdm8-bf"))
+
+    # Combine all cuts into one CutSet
+    train_cuts = train_cuts_ihm + train_cuts_sdm + train_cuts_mdm
+
+    train_cuts_1 = train_cuts.trim_to_supervision_groups(max_pause=0.5)
+    train_cuts_2 = train_cuts.trim_to_supervision_groups(max_pause=0.0)
+
+    # Combine the two segmentations
+    train_all = train_cuts_1 + train_cuts_2
+
+    # At this point, some of the cuts may be very long. We will cut them into windows of
+    # roughly 30 seconds.
+    logging.info("Cutting the segments into windows of 30 seconds")
+    train_all_30 = cut_into_windows(train_all, duration=30.0)
+    logging.info(f"Number of cuts after cutting into windows: {len(train_all_30)}")
+
+    # Show statistics
+    train_all.describe(full=True)
+
+    # Save the cuts
+    logging.info("Saving the cuts")
+    train_all.to_file(src_dir / "cuts_train_ami.jsonl.gz")
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    prepare_train_cuts()
diff --git a/egs/ami/SURT/local/prepare_icsi_train_cuts.py b/egs/ami/SURT/local/prepare_icsi_train_cuts.py
new file mode 100755
index 000000000..818e26bfb
--- /dev/null
+++ b/egs/ami/SURT/local/prepare_icsi_train_cuts.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file creates ICSI train segments.
+"""
+import logging
+from pathlib import Path
+
+from lhotse import load_manifest_lazy
+from prepare_ami_train_cuts import cut_into_windows
+
+
+def prepare_train_cuts():
+    src_dir = Path("data/manifests")
+
+    logging.info("Loading the manifests")
+    train_cuts_ihm = load_manifest_lazy(
+        src_dir / "cuts_icsi-ihm-mix_train.jsonl.gz"
+    ).map(lambda c: c.with_id(f"{c.id}_ihm-mix"))
+    train_cuts_sdm = load_manifest_lazy(src_dir / "cuts_icsi-sdm_train.jsonl.gz").map(
+        lambda c: c.with_id(f"{c.id}_sdm")
+    )
+
+    # Combine all cuts into one CutSet
+    train_cuts = train_cuts_ihm + train_cuts_sdm
+
+    train_cuts_1 = train_cuts.trim_to_supervision_groups(max_pause=0.5)
+    train_cuts_2 = train_cuts.trim_to_supervision_groups(max_pause=0.0)
+
+    # Combine the two segmentations
+    train_all = train_cuts_1 + train_cuts_2
+
+    # At this point, some of the cuts may be very long. We will cut them into windows of
+    # roughly 30 seconds.
+    logging.info("Cutting the segments into windows of 30 seconds")
+    train_all_30 = cut_into_windows(train_all, duration=30.0)
+    logging.info(f"Number of cuts after cutting into windows: {len(train_all_30)}")
+
+    # Show statistics
+    train_all.describe(full=True)
+
+    # Save the cuts
+    logging.info("Saving the cuts")
+    train_all.to_file(src_dir / "cuts_train_icsi.jsonl.gz")
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    prepare_train_cuts()
diff --git a/egs/ami/SURT/local/prepare_lang_bpe.py b/egs/ami/SURT/local/prepare_lang_bpe.py
new file mode 120000
index 000000000..36b40e7fc
--- /dev/null
+++ b/egs/ami/SURT/local/prepare_lang_bpe.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/prepare_lang_bpe.py
\ No newline at end of file
diff --git a/egs/ami/SURT/local/train_bpe_model.py b/egs/ami/SURT/local/train_bpe_model.py
new file mode 120000
index 000000000..6fad36421
--- /dev/null
+++ b/egs/ami/SURT/local/train_bpe_model.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/train_bpe_model.py
\ No newline at end of file
diff --git a/egs/ami/SURT/prepare.sh b/egs/ami/SURT/prepare.sh
new file mode 100755
index 000000000..ea4e5baf2
--- /dev/null
+++ b/egs/ami/SURT/prepare.sh
@@ -0,0 +1,195 @@
+#!/usr/bin/env bash
+
+set -eou pipefail
+
+stage=-1
+stop_stage=100
+
+# We assume dl_dir (download dir) contains the following
+# directories and files. If not, they will be downloaded
+# by this script automatically.
+#
+#  - $dl_dir/ami
+#      You can find audio and transcripts for AMI in this path.
+#
+#  - $dl_dir/icsi
+#      You can find audio and transcripts for ICSI in this path.
+#
+#  - $dl_dir/rirs_noises
+#      This directory contains the RIRS_NOISES corpus downloaded from https://openslr.org/28/.
+#
+dl_dir=$PWD/download
+
+. shared/parse_options.sh || exit 1
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+mkdir -p data
+vocab_size=500
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+log "dl_dir: $dl_dir"
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Stage 0: Download data"
+
+  # If you have pre-downloaded it to /path/to/amicorpus,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/amicorpus $dl_dir/amicorpus
+  #
+  if [ ! -d $dl_dir/amicorpus ]; then
+    for mic in ihm ihm-mix sdm mdm8-bf; do
+      lhotse download ami --mic $mic $dl_dir/amicorpus
+    done
+  fi
+
+  # If you have pre-downloaded it to /path/to/icsi,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/icsi $dl_dir/icsi
+  #
+  if [ ! -d $dl_dir/icsi ]; then
+    lhotse download icsi $dl_dir/icsi
+  fi
+
+  # If you have pre-downloaded it to /path/to/rirs_noises,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/rirs_noises $dl_dir/
+  #
+  if [ ! -d $dl_dir/rirs_noises ]; then
+    lhotse download rirs_noises $dl_dir
+  fi
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare AMI manifests"
+  # We assume that you have downloaded the AMI corpus
+  # to $dl_dir/amicorpus. We perform text normalization for the transcripts.
+  mkdir -p data/manifests
+  for mic in ihm ihm-mix sdm mdm8-bf; do
+    log "Preparing AMI manifest for $mic"
+    lhotse prepare ami --mic $mic --max-words-per-segment 30 --merge-consecutive $dl_dir/amicorpus data/manifests/
+  done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Prepare ICSI manifests"
+  # We assume that you have downloaded the ICSI corpus
+  # to $dl_dir/icsi. We perform text normalization for the transcripts.
+  mkdir -p data/manifests
+  log "Preparing ICSI manifest"
+  for mic in ihm ihm-mix sdm; do
+    lhotse prepare icsi --mic $mic $dl_dir/icsi data/manifests/
+  done
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Prepare RIRs"
+  # We assume that you have downloaded the RIRS_NOISES corpus
+  # to $dl_dir/rirs_noises
+  lhotse prepare rir-noise -p real_rir -p iso_noise $dl_dir/rirs_noises data/manifests
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 3: Extract features for AMI and ICSI recordings"
+  python local/compute_fbank_ami.py
+  python local/compute_fbank_icsi.py
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Create sources for simulating mixtures"
+  # In the following script, we speed-perturb the IHM recordings and extract features.
+  python local/compute_fbank_ihm.py
+  lhotse combine data/manifests/ami-ihm_cuts_train.jsonl.gz \
+    data/manifests/icsi-ihm_cuts_train.jsonl.gz - |\
+    lhotse cut trim-to-alignments --type word --max-pause 0.5 - - |\
+    lhotse filter 'duration<=12.0' - - |\
+    shuf | gzip -c > data/manifests/ihm_cuts_train_trimmed.jsonl.gz
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Create training mixtures"
+  lhotse workflows simulate-meetings \
+    --method conversational \
+    --same-spk-pause 0.5 \
+    --diff-spk-pause 0.5 \
+    --diff-spk-overlap 1.0 \
+    --prob-diff-spk-overlap 0.8 \
+    --num-meetings 200000 \
+    --num-speakers-per-meeting 2,3 \
+    --max-duration-per-speaker 15.0 \
+    --max-utterances-per-speaker 3 \
+    --seed 1234 \
+    --num-jobs 2 \
+    data/manifests/ihm_cuts_train_trimmed.jsonl.gz \
+    data/manifests/ai-mix_cuts_clean.jsonl.gz
+
+  python local/compute_fbank_aimix.py
+
+  # Add source features to the manifest (will be used for masking loss)
+  # This may take ~2 hours.
+  python local/add_source_feats.py
+
+  # Combine clean and reverb
+  cat <(gunzip -c data/manifests/cuts_train_clean_sources.jsonl.gz) \
+    <(gunzip -c data/manifests/cuts_train_reverb_sources.jsonl.gz) |\
+    shuf | gzip -c > data/manifests/cuts_train_comb_sources.jsonl.gz
+fi
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  log "Stage 7: Create training mixtures from real sessions"
+  python local/prepare_ami_train_cuts.py
+  python local/prepare_icsi_train_cuts.py
+
+  # Combine AMI and ICSI
+  cat <(gunzip -c data/manifests/cuts_train_ami.jsonl.gz) \
+    <(gunzip -c data/manifests/cuts_train_icsi.jsonl.gz) |\
+    shuf | gzip -c > data/manifests/cuts_train_ami_icsi.jsonl.gz
+fi
+
+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+  log "Stage 8: Dump transcripts for BPE model training (using AMI and ICSI)."
+  mkdir -p data/lm
+  cat <(gunzip -c data/manifests/ami-sdm_supervisions_train.jsonl.gz | jq '.text' | sed 's:"::g') \
+      <(gunzip -c data/manifests/icsi-sdm_supervisions_train.jsonl.gz | jq '.text' | sed 's:"::g') \
+  > data/lm/transcript_words.txt
+fi
+
+if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
+  log "Stage 9: Prepare BPE based lang (combining AMI and ICSI)"
+
+  lang_dir=data/lang_bpe_${vocab_size}
+  mkdir -p $lang_dir
+
+  # Add special words to words.txt
+  echo "<eps> 0" > $lang_dir/words.txt
+  echo "!SIL 1" >> $lang_dir/words.txt
+  echo "<UNK> 2" >> $lang_dir/words.txt
+
+  # Add regular words to words.txt
+  cat data/lm/transcript_words.txt | grep -o -E '\w+' | sort -u | awk '{print $0,NR+2}' >> $lang_dir/words.txt
+
+  # Add remaining special word symbols expected by LM scripts.
+  num_words=$(cat $lang_dir/words.txt | wc -l)
+  echo "<s> ${num_words}" >> $lang_dir/words.txt
+  num_words=$(cat $lang_dir/words.txt | wc -l)
+  echo "</s> ${num_words}" >> $lang_dir/words.txt
+  num_words=$(cat $lang_dir/words.txt | wc -l)
+  echo "#0 ${num_words}" >> $lang_dir/words.txt
+
+  ./local/train_bpe_model.py \
+    --lang-dir $lang_dir \
+    --vocab-size $vocab_size \
+    --transcript data/lm/transcript_words.txt
+
+  if [ ! -f $lang_dir/L_disambig.pt ]; then
+    ./local/prepare_lang_bpe.py --lang-dir $lang_dir
+  fi
+fi
diff --git a/egs/ami/SURT/shared b/egs/ami/SURT/shared
new file mode 120000
index 000000000..4cbd91a7e
--- /dev/null
+++ b/egs/ami/SURT/shared
@@ -0,0 +1 @@
+../../../icefall/shared
\ No newline at end of file
diff --git a/egs/commonvoice/ASR/local/preprocess_commonvoice.py b/egs/commonvoice/ASR/local/preprocess_commonvoice.py
index c5ec14502..e60459765 100755
--- a/egs/commonvoice/ASR/local/preprocess_commonvoice.py
+++ b/egs/commonvoice/ASR/local/preprocess_commonvoice.py
@@ -45,7 +45,7 @@ def get_args():
 
 def normalize_text(utt: str) -> str:
     utt = re.sub(r"[{0}]+".format("-"), " ", utt)
-    return re.sub(r"[^a-zA-Z\s]", "", utt).upper()
+    return re.sub(r"[^a-zA-Z\s']", "", utt).upper()
 
 
 def preprocess_commonvoice(
diff --git a/egs/libricss/SURT/README.md b/egs/libricss/SURT/README.md
new file mode 100644
index 000000000..10a1aaad1
--- /dev/null
+++ b/egs/libricss/SURT/README.md
@@ -0,0 +1,249 @@
+# Introduction
+
+This is a multi-talker ASR recipe for the LibriCSS dataset. We train a Streaming
+Unmixing and Recognition Transducer (SURT) model for the task. In this README,
+we will describe the task, the model, and the training process. We will also
+provide links to pre-trained models and training logs.
+
+## Task
+
+LibriCSS is a multi-talker meeting corpus formed from mixing together LibriSpeech utterances
+and replaying in a real meeting room. It consists of 10 1-hour sessions of audio, each
+recorded on a 7-channel microphone. The sessions are recorded at a sampling rate of 16 kHz.
+For more information, refer to the paper:
+Z. Chen et al., "Continuous speech separation: dataset and analysis,"
+ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP),
+Barcelona, Spain, 2020
+
+In this recipe, we perform the "continuous, streaming, multi-talker ASR" task on LibriCSS.
+
+* By "continuous", we mean that the model should be able to transcribe unsegmented audio
+without the need of an external VAD.
+* By "streaming", we mean that the model has limited right context. We use a right-context
+of at most 32 frames (320 ms).
+* By "multi-talker", we mean that the model should be able to transcribe overlapping speech
+from multiple speakers.
+
+For now, we do not care about speaker attribution, i.e., the transcription is speaker
+agnostic. The evaluation depends on the particular model type. In this case, we use
+the optimal reference combination WER (ORC-WER) metric as implemented in the
+[meeteval](https://github.com/fgnt/meeteval) toolkit.
+
+## Model
+
+We use the Streaming Unmixing and Recognition Transducer (SURT) model for this task.
+The model is based on the papers:
+
+- Lu, Liang et al. “Streaming End-to-End Multi-Talker Speech Recognition.” IEEE Signal Processing Letters 28 (2020): 803-807.
+- Raj, Desh et al. “Continuous Streaming Multi-Talker ASR with Dual-Path Transducers.” ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2021): 7317-7321.
+
+The model is a combination of a speech separation model and a speech recognition model,
+but trained end-to-end with a single loss function. The overall architecture is shown
+in the figure below. Note that this architecture is slightly different from the one
+in the above papers. A detailed description of the model can be found in the following
+paper: [SURT 2.0: Advanced in transducer-based multi-talker ASR](https://arxiv.org/abs/2306.10559).
+
+<p align="center">
+
+  <img src="surt.png">
+  Streaming Unmixing and Recognition Transducer
+
+</p>
+
+In the [dprnn_zipformer](./dprnn_zipformer) recipe, for example, we use a DPRNN-based masking network
+and a Zipfomer-based recognition network. But other combinations are possible as well.
+
+## Training objective
+
+We train the model using the pruned transducer loss, similar to other ASR recipes in
+icefall. However, an important consideration is how to assign references to the output
+channels (2 in this case). For this, we use the heuristic error assignment training (HEAT)
+strategy, which assigns references to the first available channel based on their start
+times. An illustrative example is shown in the figure below:
+
+<p align="center">
+
+  <img src="heat.png">
+  Illustration of HEAT-based reference assignment.
+
+</p>
+
+## Description of the recipe
+
+### Pre-requisites
+
+The recipes in this directory need the following packages to be installed:
+
+- [meeteval](https://github.com/fgnt/meeteval)
+- [einops](https://github.com/arogozhnikov/einops)
+
+Additionally, we initialize the "recognition" transducer with a pre-trained model,
+trained on LibriSpeech. For this, please run the following from within `egs/librispeech/ASR`:
+
+```bash
+./prepare.sh
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+python pruned_transducer_stateless7_streaming/train.py \
+    --use-fp16 True \
+    --exp-dir pruned_transducer_stateless7_streaming/exp \
+    --world-size 4 \
+    --max-duration 800 \
+    --num-epochs 10 \
+    --keep-last-k 1 \
+    --manifest-dir data/manifests \
+    --enable-musan true \
+    --master-port 54321 \
+    --bpe-model data/lang_bpe_500/bpe.model \
+    --num-encoder-layers 2,2,2,2,2 \
+    --feedforward-dims 768,768,768,768,768 \
+    --nhead 8,8,8,8,8 \
+    --encoder-dims 256,256,256,256,256 \
+    --attention-dims 192,192,192,192,192 \
+    --encoder-unmasked-dims 192,192,192,192,192 \
+    --zipformer-downsampling-factors 1,2,4,8,2 \
+    --cnn-module-kernels 31,31,31,31,31 \
+    --decoder-dim 512 \
+    --joiner-dim 512
+```
+
+The above is for SURT-base (~26M). For SURT-large (~38M), use `--num-encoder-layers 2,4,3,2,4`.
+
+Once the above model is trained for 10 epochs, copy it to `egs/libricss/SURT/exp`:
+
+```bash
+cp -r pruned_transducer_stateless7_streaming/exp/epoch-10.pt exp/zipformer_base.pt
+```
+
+**NOTE:** We also provide this pre-trained checkpoint (see the section below), so you can skip
+the above step if you want.
+
+### Training
+
+To train the model, run the following from within `egs/libricss/SURT`:
+
+```bash
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+python dprnn_zipformer/train.py \
+    --use-fp16 True \
+    --exp-dir dprnn_zipformer/exp/surt_base \
+    --world-size 4 \
+    --max-duration 500 \
+    --max-duration-valid 250 \
+    --max-cuts 200 \
+    --num-buckets 50 \
+    --num-epochs 30 \
+    --enable-spec-aug True \
+    --enable-musan False \
+    --ctc-loss-scale 0.2 \
+    --heat-loss-scale 0.2 \
+    --base-lr 0.004 \
+    --model-init-ckpt exp/zipformer_base.pt \
+    --chunk-width-randomization True \
+    --num-mask-encoder-layers 4 \
+    --num-encoder-layers 2,2,2,2,2
+```
+
+The above is for SURT-base (~26M). For SURT-large (~38M), use:
+
+```bash
+    --num-mask-encoder-layers 6 \
+    --num-encoder-layers 2,4,3,2,4 \
+    --model-init-ckpt exp/zipformer_large.pt \
+```
+
+**NOTE:** You may need to decrease the `--max-duration` for SURT-large to avoid OOM.
+
+### Adaptation
+
+The training step above only trains on simulated mixtures. For best results, we also
+adapt the final model on the LibriCSS dev set. For this, run the following from within
+`egs/libricss/SURT`:
+
+```bash
+export CUDA_VISIBLE_DEVICES="0"
+
+python dprnn_zipformer/train_adapt.py \
+    --use-fp16 True \
+    --exp-dir dprnn_zipformer/exp/surt_base_adapt \
+    --world-size 1 \
+    --max-duration 500 \
+    --max-duration-valid 250 \
+    --max-cuts 200 \
+    --num-buckets 50 \
+    --num-epochs 8 \
+    --lr-epochs 2 \
+    --enable-spec-aug True \
+    --enable-musan False \
+    --ctc-loss-scale 0.2 \
+    --base-lr 0.0004 \
+    --model-init-ckpt dprnn_zipformer/exp/surt_base/epoch-30.pt \
+    --chunk-width-randomization True \
+    --num-mask-encoder-layers 4 \
+    --num-encoder-layers 2,2,2,2,2
+```
+
+For SURT-large, use the following config:
+
+```bash
+    --num-mask-encoder-layers 6 \
+    --num-encoder-layers 2,4,3,2,4 \
+    --model-init-ckpt dprnn_zipformer/exp/surt_large/epoch-30.pt \
+    --num-epochs 15 \
+    --lr-epochs 4 \
+```
+
+
+### Decoding
+
+To decode the model, run the following from within `egs/libricss/SURT`:
+
+#### Greedy search
+
+```bash
+export CUDA_VISIBLE_DEVICES="0"
+
+python dprnn_zipformer/decode.py \
+    --epoch 8 --avg 1 --use-averaged-model False \
+    --exp-dir dprnn_zipformer/exp/surt_base_adapt \
+    --max-duration 250 \
+    --decoding-method greedy_search
+```
+
+#### Beam search
+
+```bash
+python dprnn_zipformer/decode.py \
+    --epoch 8 --avg 1 --use-averaged-model False \
+    --exp-dir dprnn_zipformer/exp/surt_base_adapt \
+    --max-duration 250 \
+    --decoding-method modified_beam_search \
+    --beam-size 4
+```
+
+## Results (using beam search)
+
+#### IHM-Mix
+
+| Model | # params | 0L | 0S | OV10 | OV20 | OV30 | OV40 | Avg. |
+|------------|:-------:|:----:|:---:|----:|:----:|:----:|:----:|:----:|
+| dprnn_zipformer (base)  | 26.7 | 5.1 | 4.2 | 13.7 | 18.7 | 20.5 | 20.6 | 13.8 |
+| dprnn_zipformer (large) | 37.9 | 4.6 | 3.8 | 12.7 | 14.3 | 16.7 | 21.2 | 12.2 |
+
+#### SDM
+
+| Model | # params | 0L | 0S | OV10 | OV20 | OV30 | OV40 | Avg. |
+|------------|:-------:|:----:|:---:|----:|:----:|:----:|:----:|:----:|
+| dprnn_zipformer (base)  | 26.7 | 6.8 | 7.2 | 21.4 | 24.5 | 28.6 | 31.2 | 20.0 |
+| dprnn_zipformer (large) | 37.9 | 6.4 | 6.9 | 17.9 | 19.7 | 25.2 | 25.5 | 16.9 |
+
+## Pre-trained models and logs
+
+* Pre-trained models: <https://huggingface.co/desh2608/icefall-surt-libricss-dprnn-zipformer>
+
+* Training logs:
+    - surt_base: <https://tensorboard.dev/experiment/YLGJTkBETb2aqDQ61jbxvQ/>
+    - surt_base_adapt: <https://tensorboard.dev/experiment/pjXMFVL9RMej85rMHyd0EQ/>
+    - surt_large: <https://tensorboard.dev/experiment/82HvYqfrSOKZ4w8Jod2QMw/>
+    - surt_large_adapt: <https://tensorboard.dev/experiment/5oIdEgRqS9Wb6yVuxaExEw/>
diff --git a/egs/libricss/SURT/dprnn_zipformer/asr_datamodule.py b/egs/libricss/SURT/dprnn_zipformer/asr_datamodule.py
new file mode 100644
index 000000000..51df91598
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/asr_datamodule.py
@@ -0,0 +1,372 @@
+# Copyright      2021  Piotr Żelasko
+# Copyright      2022  Xiaomi Corporation     (Author: Mingshuang Luo)
+# Copyright      2023  Johns Hopkins Univrtsity (Author: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import inspect
+import logging
+from functools import lru_cache
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
+from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
+    CutMix,
+    DynamicBucketingSampler,
+    K2SurtDataset,
+    PrecomputedFeatures,
+    SimpleCutSampler,
+    SpecAugment,
+)
+from lhotse.dataset.input_strategies import OnTheFlyFeatures
+from lhotse.utils import fix_random_seed
+from torch.utils.data import DataLoader
+
+from icefall.utils import str2bool
+
+
+class _SeedWorkers:
+    def __init__(self, seed: int):
+        self.seed = seed
+
+    def __call__(self, worker_id: int):
+        fix_random_seed(self.seed + worker_id)
+
+
+class LibriCssAsrDataModule:
+    """
+    DataModule for k2 ASR experiments.
+    It assumes there is always one train and valid dataloader,
+    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
+    and test-other).
+
+    It contains all the common data pipeline modules used in ASR
+    experiments, e.g.:
+    - dynamic batch size,
+    - bucketing samplers,
+    - augmentation,
+    - on-the-fly feature extraction
+
+    This class should be derived for specific corpora used in ASR tasks.
+    """
+
+    def __init__(self, args: argparse.Namespace):
+        self.args = args
+
+    @classmethod
+    def add_arguments(cls, parser: argparse.ArgumentParser):
+        group = parser.add_argument_group(
+            title="ASR data related options",
+            description="These options are used for the preparation of "
+            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
+            "effective batch sizes, sampling strategies, applied data "
+            "augmentations, etc.",
+        )
+        group.add_argument(
+            "--manifest-dir",
+            type=Path,
+            default=Path("data/manifests"),
+            help="Path to directory with train/valid/test cuts.",
+        )
+        group.add_argument(
+            "--max-duration",
+            type=int,
+            default=200.0,
+            help="Maximum pooled recordings duration (seconds) in a "
+            "single batch. You can reduce it if it causes CUDA OOM.",
+        )
+        group.add_argument(
+            "--max-duration-valid",
+            type=int,
+            default=200.0,
+            help="Maximum pooled recordings duration (seconds) in a "
+            "single batch. You can reduce it if it causes CUDA OOM.",
+        )
+        group.add_argument(
+            "--max-cuts",
+            type=int,
+            default=100,
+            help="Maximum number of cuts in a single batch. You can "
+            "reduce it if it causes CUDA OOM.",
+        )
+        group.add_argument(
+            "--bucketing-sampler",
+            type=str2bool,
+            default=True,
+            help="When enabled, the batches will come from buckets of "
+            "similar duration (saves padding frames).",
+        )
+        group.add_argument(
+            "--num-buckets",
+            type=int,
+            default=30,
+            help="The number of buckets for the DynamicBucketingSampler"
+            "(you might want to increase it for larger datasets).",
+        )
+        group.add_argument(
+            "--on-the-fly-feats",
+            type=str2bool,
+            default=False,
+            help=(
+                "When enabled, use on-the-fly cut mixing and feature "
+                "extraction. Will drop existing precomputed feature manifests "
+                "if available."
+            ),
+        )
+        group.add_argument(
+            "--shuffle",
+            type=str2bool,
+            default=True,
+            help="When enabled (=default), the examples will be "
+            "shuffled for each epoch.",
+        )
+        group.add_argument(
+            "--drop-last",
+            type=str2bool,
+            default=True,
+            help="Whether to drop last batch. Used by sampler.",
+        )
+        group.add_argument(
+            "--return-cuts",
+            type=str2bool,
+            default=True,
+            help="When enabled, each batch will have the "
+            "field: batch['supervisions']['cut'] with the cuts that "
+            "were used to construct it.",
+        )
+
+        group.add_argument(
+            "--num-workers",
+            type=int,
+            default=2,
+            help="The number of training dataloader workers that "
+            "collect the batches.",
+        )
+
+        group.add_argument(
+            "--enable-spec-aug",
+            type=str2bool,
+            default=True,
+            help="When enabled, use SpecAugment for training dataset.",
+        )
+
+        group.add_argument(
+            "--spec-aug-time-warp-factor",
+            type=int,
+            default=80,
+            help="Used only when --enable-spec-aug is True. "
+            "It specifies the factor for time warping in SpecAugment. "
+            "Larger values mean more warping. "
+            "A value less than 1 means to disable time warp.",
+        )
+
+        group.add_argument(
+            "--enable-musan",
+            type=str2bool,
+            default=True,
+            help="When enabled, select noise from MUSAN and mix it"
+            "with training dataset. ",
+        )
+
+    def train_dataloaders(
+        self,
+        cuts_train: CutSet,
+        sampler_state_dict: Optional[Dict[str, Any]] = None,
+        return_sources: bool = True,
+        strict: bool = True,
+    ) -> DataLoader:
+        """
+        Args:
+          cuts_train:
+            CutSet for training.
+          sampler_state_dict:
+            The state dict for the training sampler.
+        """
+        transforms = []
+        if self.args.enable_musan:
+            logging.info("Enable MUSAN")
+            logging.info("About to get Musan cuts")
+            cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
+            transforms.append(
+                CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
+            )
+        else:
+            logging.info("Disable MUSAN")
+
+        input_transforms = []
+        if self.args.enable_spec_aug:
+            logging.info("Enable SpecAugment")
+            logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
+            # Set the value of num_frame_masks according to Lhotse's version.
+            # In different Lhotse's versions, the default of num_frame_masks is
+            # different.
+            num_frame_masks = 10
+            num_frame_masks_parameter = inspect.signature(
+                SpecAugment.__init__
+            ).parameters["num_frame_masks"]
+            if num_frame_masks_parameter.default == 1:
+                num_frame_masks = 2
+            logging.info(f"Num frame mask: {num_frame_masks}")
+            input_transforms.append(
+                SpecAugment(
+                    time_warp_factor=self.args.spec_aug_time_warp_factor,
+                    num_frame_masks=num_frame_masks,
+                    features_mask_size=27,
+                    num_feature_masks=2,
+                    frames_mask_size=100,
+                )
+            )
+        else:
+            logging.info("Disable SpecAugment")
+
+        logging.info("About to create train dataset")
+        train = K2SurtDataset(
+            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            if self.args.on_the_fly_feats
+            else PrecomputedFeatures(),
+            cut_transforms=transforms,
+            input_transforms=input_transforms,
+            return_cuts=self.args.return_cuts,
+            return_sources=return_sources,
+            strict=strict,
+        )
+
+        if self.args.bucketing_sampler:
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                quadratic_duration=30.0,
+                max_cuts=self.args.max_cuts,
+                shuffle=self.args.shuffle,
+                num_buckets=self.args.num_buckets,
+                drop_last=self.args.drop_last,
+            )
+        else:
+            logging.info("Using SingleCutSampler.")
+            train_sampler = SimpleCutSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                max_cuts=self.args.max_cuts,
+                shuffle=self.args.shuffle,
+            )
+        logging.info("About to create train dataloader")
+
+        if sampler_state_dict is not None:
+            logging.info("Loading sampler state dict")
+            train_sampler.load_state_dict(sampler_state_dict)
+
+        # 'seed' is derived from the current random state, which will have
+        # previously been set in the main process.
+        seed = torch.randint(0, 100000, ()).item()
+        worker_init_fn = _SeedWorkers(seed)
+
+        train_dl = DataLoader(
+            train,
+            sampler=train_sampler,
+            batch_size=None,
+            num_workers=self.args.num_workers,
+            persistent_workers=False,
+            worker_init_fn=worker_init_fn,
+        )
+
+        return train_dl
+
+    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
+        transforms = []
+
+        logging.info("About to create dev dataset")
+        validate = K2SurtDataset(
+            input_strategy=OnTheFlyFeatures(
+                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            )
+            if self.args.on_the_fly_feats
+            else PrecomputedFeatures(),
+            cut_transforms=transforms,
+            return_cuts=self.args.return_cuts,
+            return_sources=False,
+            strict=False,
+        )
+        valid_sampler = DynamicBucketingSampler(
+            cuts_valid,
+            max_duration=self.args.max_duration_valid,
+            max_cuts=self.args.max_cuts,
+            shuffle=False,
+        )
+        logging.info("About to create dev dataloader")
+        valid_dl = DataLoader(
+            validate,
+            sampler=valid_sampler,
+            batch_size=None,
+            num_workers=2,
+            persistent_workers=False,
+        )
+
+        return valid_dl
+
+    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
+        logging.debug("About to create test dataset")
+        test = K2SurtDataset(
+            input_strategy=OnTheFlyFeatures(
+                OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            )
+            if self.args.on_the_fly_feats
+            else PrecomputedFeatures(),
+            return_cuts=self.args.return_cuts,
+            return_sources=False,
+            strict=False,
+        )
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration_valid,
+            max_cuts=self.args.max_cuts,
+            shuffle=False,
+        )
+        logging.debug("About to create test dataloader")
+        test_dl = DataLoader(
+            test,
+            batch_size=None,
+            sampler=sampler,
+            num_workers=self.args.num_workers,
+        )
+        return test_dl
+
+    @lru_cache()
+    def lsmix_cuts(
+        self,
+        rvb_affix: str = "clean",
+        type_affix: str = "full",
+        sources: bool = True,
+    ) -> CutSet:
+        logging.info("About to get train cuts")
+        source_affix = "_sources" if sources else ""
+        cs = load_manifest_lazy(
+            self.args.manifest_dir
+            / f"cuts_train_{rvb_affix}_{type_affix}{source_affix}.jsonl.gz"
+        )
+        cs = cs.filter(lambda c: c.duration >= 1.0 and c.duration <= 30.0)
+        return cs
+
+    @lru_cache()
+    def libricss_cuts(self, split="dev", type="sdm") -> CutSet:
+        logging.info(f"About to get LibriCSS {split} {type} cuts")
+        cs = load_manifest_lazy(
+            self.args.manifest_dir / f"cuts_{split}_libricss-{type}.jsonl.gz"
+        )
+        return cs
diff --git a/egs/libricss/SURT/dprnn_zipformer/beam_search.py b/egs/libricss/SURT/dprnn_zipformer/beam_search.py
new file mode 100644
index 000000000..c8e4643d0
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/beam_search.py
@@ -0,0 +1,730 @@
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang
+#                                                  Xiaoyu Yang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple, Union
+
+import k2
+import torch
+from model import SURT
+
+from icefall import NgramLmStateCost
+from icefall.utils import DecodingResults
+
+
+def greedy_search(
+    model: SURT,
+    encoder_out: torch.Tensor,
+    max_sym_per_frame: int,
+    return_timestamps: bool = False,
+) -> Union[List[int], DecodingResults]:
+    """Greedy search for a single utterance.
+    Args:
+      model:
+        An instance of `SURT`.
+      encoder_out:
+        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
+      max_sym_per_frame:
+        Maximum number of symbols per frame. If it is set to 0, the WER
+        would be 100%.
+      return_timestamps:
+        Whether to return timestamps.
+    Returns:
+      If return_timestamps is False, return the decoded result.
+      Else, return a DecodingResults object containing
+      decoded result and corresponding timestamps.
+    """
+    assert encoder_out.ndim == 4
+
+    # support only batch_size == 1 for now
+    assert encoder_out.size(0) == 1, encoder_out.size(0)
+
+    blank_id = model.decoder.blank_id
+    context_size = model.decoder.context_size
+    unk_id = getattr(model, "unk_id", blank_id)
+
+    device = next(model.parameters()).device
+
+    decoder_input = torch.tensor(
+        [-1] * (context_size - 1) + [blank_id], device=device, dtype=torch.int64
+    ).reshape(1, context_size)
+
+    decoder_out = model.decoder(decoder_input, need_pad=False)
+    decoder_out = model.joiner.decoder_proj(decoder_out)
+
+    encoder_out = model.joiner.encoder_proj(encoder_out)
+
+    T = encoder_out.size(1)
+    t = 0
+    hyp = [blank_id] * context_size
+
+    # timestamp[i] is the frame index after subsampling
+    # on which hyp[i] is decoded
+    timestamp = []
+
+    # Maximum symbols per utterance.
+    max_sym_per_utt = 1000
+
+    # symbols per frame
+    sym_per_frame = 0
+
+    # symbols per utterance decoded so far
+    sym_per_utt = 0
+
+    while t < T and sym_per_utt < max_sym_per_utt:
+        if sym_per_frame >= max_sym_per_frame:
+            sym_per_frame = 0
+            t += 1
+            continue
+
+        # fmt: off
+        current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2)
+        # fmt: on
+        logits = model.joiner(
+            current_encoder_out, decoder_out.unsqueeze(1), project_input=False
+        )
+        # logits is (1, 1, 1, vocab_size)
+
+        y = logits.argmax().item()
+        if y not in (blank_id, unk_id):
+            hyp.append(y)
+            timestamp.append(t)
+            decoder_input = torch.tensor([hyp[-context_size:]], device=device).reshape(
+                1, context_size
+            )
+
+            decoder_out = model.decoder(decoder_input, need_pad=False)
+            decoder_out = model.joiner.decoder_proj(decoder_out)
+
+            sym_per_utt += 1
+            sym_per_frame += 1
+        else:
+            sym_per_frame = 0
+            t += 1
+    hyp = hyp[context_size:]  # remove blanks
+
+    if not return_timestamps:
+        return hyp
+    else:
+        return DecodingResults(
+            hyps=[hyp],
+            timestamps=[timestamp],
+        )
+
+
+def greedy_search_batch(
+    model: SURT,
+    encoder_out: torch.Tensor,
+    encoder_out_lens: torch.Tensor,
+    return_timestamps: bool = False,
+) -> Union[List[List[int]], DecodingResults]:
+    """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
+    Args:
+      model:
+        The SURT model.
+      encoder_out:
+        Output from the encoder. Its shape is (N, T, C), where N >= 1.
+      encoder_out_lens:
+        A 1-D tensor of shape (N,), containing number of valid frames in
+        encoder_out before padding.
+      return_timestamps:
+        Whether to return timestamps.
+    Returns:
+      If return_timestamps is False, return the decoded result.
+      Else, return a DecodingResults object containing
+      decoded result and corresponding timestamps.
+    """
+    assert encoder_out.ndim == 3
+    assert encoder_out.size(0) >= 1, encoder_out.size(0)
+
+    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
+        input=encoder_out,
+        lengths=encoder_out_lens.cpu(),
+        batch_first=True,
+        enforce_sorted=False,
+    )
+
+    device = next(model.parameters()).device
+
+    blank_id = model.decoder.blank_id
+    unk_id = getattr(model, "unk_id", blank_id)
+    context_size = model.decoder.context_size
+
+    batch_size_list = packed_encoder_out.batch_sizes.tolist()
+    N = encoder_out.size(0)
+    assert torch.all(encoder_out_lens > 0), encoder_out_lens
+    assert N == batch_size_list[0], (N, batch_size_list)
+
+    hyps = [[-1] * (context_size - 1) + [blank_id] for _ in range(N)]
+
+    # timestamp[n][i] is the frame index after subsampling
+    # on which hyp[n][i] is decoded
+    timestamps = [[] for _ in range(N)]
+
+    decoder_input = torch.tensor(
+        hyps,
+        device=device,
+        dtype=torch.int64,
+    )  # (N, context_size)
+
+    decoder_out = model.decoder(decoder_input, need_pad=False)
+    decoder_out = model.joiner.decoder_proj(decoder_out)
+    # decoder_out: (N, 1, decoder_out_dim)
+
+    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
+
+    offset = 0
+    for (t, batch_size) in enumerate(batch_size_list):
+        start = offset
+        end = offset + batch_size
+        current_encoder_out = encoder_out.data[start:end]
+        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
+        # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim)
+        offset = end
+
+        decoder_out = decoder_out[:batch_size]
+
+        logits = model.joiner(
+            current_encoder_out, decoder_out.unsqueeze(1), project_input=False
+        )
+        # logits'shape (batch_size, 1, 1, vocab_size)
+
+        logits = logits.squeeze(1).squeeze(1)  # (batch_size, vocab_size)
+        assert logits.ndim == 2, logits.shape
+        y = logits.argmax(dim=1).tolist()
+        emitted = False
+        for i, v in enumerate(y):
+            if v not in (blank_id, unk_id):
+                hyps[i].append(v)
+                timestamps[i].append(t)
+                emitted = True
+        if emitted:
+            # update decoder output
+            decoder_input = [h[-context_size:] for h in hyps[:batch_size]]
+            decoder_input = torch.tensor(
+                decoder_input,
+                device=device,
+                dtype=torch.int64,
+            )
+            decoder_out = model.decoder(decoder_input, need_pad=False)
+            decoder_out = model.joiner.decoder_proj(decoder_out)
+
+    sorted_ans = [h[context_size:] for h in hyps]
+    ans = []
+    ans_timestamps = []
+    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
+    for i in range(N):
+        ans.append(sorted_ans[unsorted_indices[i]])
+        ans_timestamps.append(timestamps[unsorted_indices[i]])
+
+    if not return_timestamps:
+        return ans
+    else:
+        return DecodingResults(
+            hyps=ans,
+            timestamps=ans_timestamps,
+        )
+
+
+def modified_beam_search(
+    model: SURT,
+    encoder_out: torch.Tensor,
+    encoder_out_lens: torch.Tensor,
+    beam: int = 4,
+    temperature: float = 1.0,
+    return_timestamps: bool = False,
+) -> Union[List[List[int]], DecodingResults]:
+    """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded.
+
+    Args:
+      model:
+        The SURT model.
+      encoder_out:
+        Output from the encoder. Its shape is (N, T, C).
+      encoder_out_lens:
+        A 1-D tensor of shape (N,), containing number of valid frames in
+        encoder_out before padding.
+      beam:
+        Number of active paths during the beam search.
+      temperature:
+        Softmax temperature.
+      return_timestamps:
+        Whether to return timestamps.
+    Returns:
+      If return_timestamps is False, return the decoded result.
+      Else, return a DecodingResults object containing
+      decoded result and corresponding timestamps.
+    """
+    assert encoder_out.ndim == 3, encoder_out.shape
+    assert encoder_out.size(0) >= 1, encoder_out.size(0)
+
+    packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence(
+        input=encoder_out,
+        lengths=encoder_out_lens.cpu(),
+        batch_first=True,
+        enforce_sorted=False,
+    )
+
+    blank_id = model.decoder.blank_id
+    unk_id = getattr(model, "unk_id", blank_id)
+    context_size = model.decoder.context_size
+    device = next(model.parameters()).device
+
+    batch_size_list = packed_encoder_out.batch_sizes.tolist()
+    N = encoder_out.size(0)
+    assert torch.all(encoder_out_lens > 0), encoder_out_lens
+    assert N == batch_size_list[0], (N, batch_size_list)
+
+    B = [HypothesisList() for _ in range(N)]
+    for i in range(N):
+        B[i].add(
+            Hypothesis(
+                ys=[blank_id] * context_size,
+                log_prob=torch.zeros(1, dtype=torch.float32, device=device),
+                timestamp=[],
+            )
+        )
+
+    encoder_out = model.joiner.encoder_proj(packed_encoder_out.data)
+
+    offset = 0
+    finalized_B = []
+    for (t, batch_size) in enumerate(batch_size_list):
+        start = offset
+        end = offset + batch_size
+        current_encoder_out = encoder_out.data[start:end]
+        current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1)
+        # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim)
+        offset = end
+
+        finalized_B = B[batch_size:] + finalized_B
+        B = B[:batch_size]
+
+        hyps_shape = get_hyps_shape(B).to(device)
+
+        A = [list(b) for b in B]
+        B = [HypothesisList() for _ in range(batch_size)]
+
+        ys_log_probs = torch.cat(
+            [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps]
+        )  # (num_hyps, 1)
+
+        decoder_input = torch.tensor(
+            [hyp.ys[-context_size:] for hyps in A for hyp in hyps],
+            device=device,
+            dtype=torch.int64,
+        )  # (num_hyps, context_size)
+
+        decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1)
+        decoder_out = model.joiner.decoder_proj(decoder_out)
+        # decoder_out is of shape (num_hyps, 1, 1, joiner_dim)
+
+        # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor
+        # as index, so we use `to(torch.int64)` below.
+        current_encoder_out = torch.index_select(
+            current_encoder_out,
+            dim=0,
+            index=hyps_shape.row_ids(1).to(torch.int64),
+        )  # (num_hyps, 1, 1, encoder_out_dim)
+
+        logits = model.joiner(
+            current_encoder_out,
+            decoder_out,
+            project_input=False,
+        )  # (num_hyps, 1, 1, vocab_size)
+
+        logits = logits.squeeze(1).squeeze(1)  # (num_hyps, vocab_size)
+
+        log_probs = (logits / temperature).log_softmax(dim=-1)  # (num_hyps, vocab_size)
+
+        log_probs.add_(ys_log_probs)
+
+        vocab_size = log_probs.size(-1)
+
+        log_probs = log_probs.reshape(-1)
+
+        row_splits = hyps_shape.row_splits(1) * vocab_size
+        log_probs_shape = k2.ragged.create_ragged_shape2(
+            row_splits=row_splits, cached_tot_size=log_probs.numel()
+        )
+        ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs)
+
+        for i in range(batch_size):
+            topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam)
+
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                topk_hyp_indexes = (topk_indexes // vocab_size).tolist()
+                topk_token_indexes = (topk_indexes % vocab_size).tolist()
+
+            for k in range(len(topk_hyp_indexes)):
+                hyp_idx = topk_hyp_indexes[k]
+                hyp = A[i][hyp_idx]
+
+                new_ys = hyp.ys[:]
+                new_token = topk_token_indexes[k]
+                new_timestamp = hyp.timestamp[:]
+                if new_token not in (blank_id, unk_id):
+                    new_ys.append(new_token)
+                    new_timestamp.append(t)
+
+                new_log_prob = topk_log_probs[k]
+                new_hyp = Hypothesis(
+                    ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp
+                )
+                B[i].add(new_hyp)
+
+    B = B + finalized_B
+    best_hyps = [b.get_most_probable(length_norm=True) for b in B]
+
+    sorted_ans = [h.ys[context_size:] for h in best_hyps]
+    sorted_timestamps = [h.timestamp for h in best_hyps]
+    ans = []
+    ans_timestamps = []
+    unsorted_indices = packed_encoder_out.unsorted_indices.tolist()
+    for i in range(N):
+        ans.append(sorted_ans[unsorted_indices[i]])
+        ans_timestamps.append(sorted_timestamps[unsorted_indices[i]])
+
+    if not return_timestamps:
+        return ans
+    else:
+        return DecodingResults(
+            hyps=ans,
+            timestamps=ans_timestamps,
+        )
+
+
+def beam_search(
+    model: SURT,
+    encoder_out: torch.Tensor,
+    beam: int = 4,
+    temperature: float = 1.0,
+    return_timestamps: bool = False,
+) -> Union[List[int], DecodingResults]:
+    """
+    It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf
+
+    espnet/nets/beam_search_SURT.py#L247 is used as a reference.
+
+    Args:
+      model:
+        An instance of `SURT`.
+      encoder_out:
+        A tensor of shape (N, T, C) from the encoder. Support only N==1 for now.
+      beam:
+        Beam size.
+      temperature:
+        Softmax temperature.
+      return_timestamps:
+        Whether to return timestamps.
+
+    Returns:
+      If return_timestamps is False, return the decoded result.
+      Else, return a DecodingResults object containing
+      decoded result and corresponding timestamps.
+    """
+    assert encoder_out.ndim == 3
+
+    # support only batch_size == 1 for now
+    assert encoder_out.size(0) == 1, encoder_out.size(0)
+    blank_id = model.decoder.blank_id
+    unk_id = getattr(model, "unk_id", blank_id)
+    context_size = model.decoder.context_size
+
+    device = next(model.parameters()).device
+
+    decoder_input = torch.tensor(
+        [blank_id] * context_size,
+        device=device,
+        dtype=torch.int64,
+    ).reshape(1, context_size)
+
+    decoder_out = model.decoder(decoder_input, need_pad=False)
+    decoder_out = model.joiner.decoder_proj(decoder_out)
+
+    encoder_out = model.joiner.encoder_proj(encoder_out)
+
+    T = encoder_out.size(1)
+    t = 0
+
+    B = HypothesisList()
+    B.add(Hypothesis(ys=[blank_id] * context_size, log_prob=0.0, timestamp=[]))
+
+    max_sym_per_utt = 20000
+
+    sym_per_utt = 0
+
+    decoder_cache: Dict[str, torch.Tensor] = {}
+
+    while t < T and sym_per_utt < max_sym_per_utt:
+        # fmt: off
+        current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2)
+        # fmt: on
+        A = B
+        B = HypothesisList()
+
+        joint_cache: Dict[str, torch.Tensor] = {}
+
+        # TODO(fangjun): Implement prefix search to update the `log_prob`
+        # of hypotheses in A
+
+        while True:
+            y_star = A.get_most_probable()
+            A.remove(y_star)
+
+            cached_key = y_star.key
+
+            if cached_key not in decoder_cache:
+                decoder_input = torch.tensor(
+                    [y_star.ys[-context_size:]],
+                    device=device,
+                    dtype=torch.int64,
+                ).reshape(1, context_size)
+
+                decoder_out = model.decoder(decoder_input, need_pad=False)
+                decoder_out = model.joiner.decoder_proj(decoder_out)
+                decoder_cache[cached_key] = decoder_out
+            else:
+                decoder_out = decoder_cache[cached_key]
+
+            cached_key += f"-t-{t}"
+            if cached_key not in joint_cache:
+                logits = model.joiner(
+                    current_encoder_out,
+                    decoder_out.unsqueeze(1),
+                    project_input=False,
+                )
+
+                # TODO(fangjun): Scale the blank posterior
+                log_prob = (logits / temperature).log_softmax(dim=-1)
+                # log_prob is (1, 1, 1, vocab_size)
+                log_prob = log_prob.squeeze()
+                # Now log_prob is (vocab_size,)
+                joint_cache[cached_key] = log_prob
+            else:
+                log_prob = joint_cache[cached_key]
+
+            # First, process the blank symbol
+            skip_log_prob = log_prob[blank_id]
+            new_y_star_log_prob = y_star.log_prob + skip_log_prob
+
+            # ys[:] returns a copy of ys
+            B.add(
+                Hypothesis(
+                    ys=y_star.ys[:],
+                    log_prob=new_y_star_log_prob,
+                    timestamp=y_star.timestamp[:],
+                )
+            )
+
+            # Second, process other non-blank labels
+            values, indices = log_prob.topk(beam + 1)
+            for i, v in zip(indices.tolist(), values.tolist()):
+                if i in (blank_id, unk_id):
+                    continue
+                new_ys = y_star.ys + [i]
+                new_log_prob = y_star.log_prob + v
+                new_timestamp = y_star.timestamp + [t]
+                A.add(
+                    Hypothesis(
+                        ys=new_ys,
+                        log_prob=new_log_prob,
+                        timestamp=new_timestamp,
+                    )
+                )
+
+            # Check whether B contains more than "beam" elements more probable
+            # than the most probable in A
+            A_most_probable = A.get_most_probable()
+
+            kept_B = B.filter(A_most_probable.log_prob)
+
+            if len(kept_B) >= beam:
+                B = kept_B.topk(beam)
+                break
+
+        t += 1
+
+    best_hyp = B.get_most_probable(length_norm=True)
+    ys = best_hyp.ys[context_size:]  # [context_size:] to remove blanks
+
+    if not return_timestamps:
+        return ys
+    else:
+        return DecodingResults(hyps=[ys], timestamps=[best_hyp.timestamp])
+
+
+@dataclass
+class Hypothesis:
+    # The predicted tokens so far.
+    # Newly predicted tokens are appended to `ys`.
+    ys: List[int]
+
+    # The log prob of ys.
+    # It contains only one entry.
+    log_prob: torch.Tensor
+
+    # timestamp[i] is the frame index after subsampling
+    # on which ys[i] is decoded
+    timestamp: List[int] = field(default_factory=list)
+
+    # the lm score for next token given the current ys
+    lm_score: Optional[torch.Tensor] = None
+
+    # the RNNLM states (h and c in LSTM)
+    state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
+
+    # N-gram LM state
+    state_cost: Optional[NgramLmStateCost] = None
+
+    @property
+    def key(self) -> str:
+        """Return a string representation of self.ys"""
+        return "_".join(map(str, self.ys))
+
+
+class HypothesisList(object):
+    def __init__(self, data: Optional[Dict[str, Hypothesis]] = None) -> None:
+        """
+        Args:
+          data:
+            A dict of Hypotheses. Its key is its `value.key`.
+        """
+        if data is None:
+            self._data = {}
+        else:
+            self._data = data
+
+    @property
+    def data(self) -> Dict[str, Hypothesis]:
+        return self._data
+
+    def add(self, hyp: Hypothesis) -> None:
+        """Add a Hypothesis to `self`.
+
+        If `hyp` already exists in `self`, its probability is updated using
+        `log-sum-exp` with the existed one.
+
+        Args:
+          hyp:
+            The hypothesis to be added.
+        """
+        key = hyp.key
+        if key in self:
+            old_hyp = self._data[key]  # shallow copy
+            torch.logaddexp(old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob)
+        else:
+            self._data[key] = hyp
+
+    def get_most_probable(self, length_norm: bool = False) -> Hypothesis:
+        """Get the most probable hypothesis, i.e., the one with
+        the largest `log_prob`.
+
+        Args:
+          length_norm:
+            If True, the `log_prob` of a hypothesis is normalized by the
+            number of tokens in it.
+        Returns:
+          Return the hypothesis that has the largest `log_prob`.
+        """
+        if length_norm:
+            return max(self._data.values(), key=lambda hyp: hyp.log_prob / len(hyp.ys))
+        else:
+            return max(self._data.values(), key=lambda hyp: hyp.log_prob)
+
+    def remove(self, hyp: Hypothesis) -> None:
+        """Remove a given hypothesis.
+
+        Caution:
+          `self` is modified **in-place**.
+
+        Args:
+          hyp:
+            The hypothesis to be removed from `self`.
+            Note: It must be contained in `self`. Otherwise,
+            an exception is raised.
+        """
+        key = hyp.key
+        assert key in self, f"{key} does not exist"
+        del self._data[key]
+
+    def filter(self, threshold: torch.Tensor) -> "HypothesisList":
+        """Remove all Hypotheses whose log_prob is less than threshold.
+
+        Caution:
+          `self` is not modified. Instead, a new HypothesisList is returned.
+
+        Returns:
+          Return a new HypothesisList containing all hypotheses from `self`
+          with `log_prob` being greater than the given `threshold`.
+        """
+        ans = HypothesisList()
+        for _, hyp in self._data.items():
+            if hyp.log_prob > threshold:
+                ans.add(hyp)  # shallow copy
+        return ans
+
+    def topk(self, k: int) -> "HypothesisList":
+        """Return the top-k hypothesis."""
+        hyps = list(self._data.items())
+
+        hyps = sorted(hyps, key=lambda h: h[1].log_prob, reverse=True)[:k]
+
+        ans = HypothesisList(dict(hyps))
+        return ans
+
+    def __contains__(self, key: str):
+        return key in self._data
+
+    def __iter__(self):
+        return iter(self._data.values())
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def __str__(self) -> str:
+        s = []
+        for key in self:
+            s.append(key)
+        return ", ".join(s)
+
+
+def get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape:
+    """Return a ragged shape with axes [utt][num_hyps].
+
+    Args:
+      hyps:
+        len(hyps) == batch_size. It contains the current hypothesis for
+        each utterance in the batch.
+    Returns:
+      Return a ragged shape with 2 axes [utt][num_hyps]. Note that
+      the shape is on CPU.
+    """
+    num_hyps = [len(h) for h in hyps]
+
+    # torch.cumsum() is inclusive sum, so we put a 0 at the beginning
+    # to get exclusive sum later.
+    num_hyps.insert(0, 0)
+
+    num_hyps = torch.tensor(num_hyps)
+    row_splits = torch.cumsum(num_hyps, dim=0, dtype=torch.int32)
+    ans = k2.ragged.create_ragged_shape2(
+        row_splits=row_splits, cached_tot_size=row_splits[-1].item()
+    )
+    return ans
diff --git a/egs/libricss/SURT/dprnn_zipformer/decode.py b/egs/libricss/SURT/dprnn_zipformer/decode.py
new file mode 100755
index 000000000..6abbffe00
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/decode.py
@@ -0,0 +1,654 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang,
+#                                                 Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+(1) greedy search
+./dprnn_zipformer/decode.py \
+    --epoch 30 \
+    --avg 9 \
+    --use-averaged-model true \
+    --exp-dir ./dprnn_zipformer/exp \
+    --max-duration 600 \
+    --decoding-method greedy_search
+
+(2) modified beam search
+./dprnn_zipformer/decode.py \
+    --epoch 30 \
+    --avg 9 \
+    --use-averaged-model true \
+    --exp-dir ./dprnn_zipformer/exp \
+    --max-duration 600 \
+    --decoding-method modified_beam_search \
+    --beam-size 4
+"""
+
+
+import argparse
+import logging
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import k2
+import sentencepiece as spm
+import torch
+import torch.nn as nn
+from asr_datamodule import LibriCssAsrDataModule
+from beam_search import (
+    beam_search,
+    greedy_search,
+    greedy_search_batch,
+    modified_beam_search,
+)
+from lhotse.utils import EPSILON
+from train import add_model_arguments, get_params, get_surt_model
+
+from icefall import LmScorer, NgramLm
+from icefall.checkpoint import (
+    average_checkpoints,
+    average_checkpoints_with_averaged_model,
+    find_checkpoints,
+    load_checkpoint,
+)
+from icefall.lexicon import Lexicon
+from icefall.utils import (
+    AttributeDict,
+    setup_logger,
+    store_transcripts,
+    str2bool,
+    write_surt_error_stats,
+)
+
+OVERLAP_RATIOS = ["0L", "0S", "OV10", "OV20", "OV30", "OV40"]
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=30,
+        help="""It specifies the checkpoint to use for decoding.
+        Note: Epoch counts from 1.
+        You can specify --avg to use more checkpoints for model averaging.""",
+    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=9,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch' and '--iter'",
+    )
+
+    parser.add_argument(
+        "--use-averaged-model",
+        type=str2bool,
+        default=True,
+        help="Whether to load averaged model. Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="dprnn_zipformer/exp",
+        help="The experiment dir",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--lang-dir",
+        type=Path,
+        default="data/lang_bpe_500",
+        help="The lang dir containing word table and LG graph",
+    )
+
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        help="""Possible values are:
+          - greedy_search
+          - beam_search
+          - modified_beam_search
+        """,
+    )
+
+    parser.add_argument(
+        "--beam-size",
+        type=int,
+        default=4,
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --decoding-method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+    parser.add_argument(
+        "--max-sym-per-frame",
+        type=int,
+        default=1,
+        help="""Maximum number of symbols per frame.
+        Used only when --decoding_method is greedy_search""",
+    )
+
+    parser.add_argument(
+        "--save-masks",
+        type=str2bool,
+        default=False,
+        help="""If true, save masks generated by unmixing module.""",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def decode_one_batch(
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+    batch: dict,
+) -> Dict[str, List[List[str]]]:
+    """Decode one batch and return the result in a dict. The dict has the
+    following format:
+
+        - key: It indicates the setting used for decoding. For example,
+               if greedy_search is used, it would be "greedy_search"
+               If beam search with a beam size of 7 is used, it would be
+               "beam_7"
+        - value: It contains the decoding result. `len(value)` equals to
+                 batch size. `value[i]` is the decoding result for the i-th
+                 utterance in the given batch.
+    Args:
+      params:
+        It's the return value of :func:`get_params`.
+      model:
+        The neural model.
+      sp:
+        The BPE model.
+      batch:
+        It is the return value from iterating
+        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
+        for the format of the `batch`.
+    Returns:
+      Return the decoding result. See above description for the format of
+      the returned dict.
+    """
+    device = next(model.parameters()).device
+    feature = batch["inputs"]
+    assert feature.ndim == 3
+
+    feature = feature.to(device)
+    feature_lens = batch["input_lens"].to(device)
+
+    # Apply the mask encoder
+    B, T, F = feature.shape
+    processed = model.mask_encoder(feature)  # B,T,F*num_channels
+    masks = processed.view(B, T, F, params.num_channels).unbind(dim=-1)
+    x_masked = [feature * m for m in masks]
+
+    masks_dict = {}
+    if params.save_masks:
+        # To save the masks, we split them by batch and trim each mask to the length of
+        # the corresponding feature. We save them in a dict, where the key is the
+        # cut ID and the value is the mask.
+        for i in range(B):
+            mask = torch.cat(
+                [x_masked[j][i, : feature_lens[i]] for j in range(params.num_channels)],
+                dim=-1,
+            )
+            mask = mask.cpu().numpy()
+            masks_dict[batch["cuts"][i].id] = mask
+
+    # Recognition
+    # Concatenate the inputs along the batch axis
+    h = torch.cat(x_masked, dim=0)
+    h_lens = feature_lens.repeat(params.num_channels)
+    encoder_out, encoder_out_lens = model.encoder(x=h, x_lens=h_lens)
+
+    if model.joint_encoder_layer is not None:
+        encoder_out = model.joint_encoder_layer(encoder_out)
+
+    def _group_channels(hyps: List[str]) -> List[List[str]]:
+        """
+        Currently we have a batch of size M*B, where M is the number of
+        channels and B is the batch size. We need to group the hypotheses
+        into B groups, each of which contains M hypotheses.
+
+        Example:
+            hyps = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2']
+            _group_channels(hyps) = [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']]
+        """
+        assert len(hyps) == B * params.num_channels
+        out_hyps = []
+        for i in range(B):
+            out_hyps.append(hyps[i::B])
+        return out_hyps
+
+    hyps = []
+    if params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
+        hyp_tokens = greedy_search_batch(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp)
+    elif params.decoding_method == "modified_beam_search":
+        hyp_tokens = modified_beam_search(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp)
+    else:
+        batch_size = encoder_out.size(0)
+
+        for i in range(batch_size):
+            # fmt: off
+            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
+            # fmt: on
+            if params.decoding_method == "greedy_search":
+                hyp = greedy_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    max_sym_per_frame=params.max_sym_per_frame,
+                )
+            elif params.decoding_method == "beam_search":
+                hyp = beam_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    beam=params.beam_size,
+                )
+            else:
+                raise ValueError(
+                    f"Unsupported decoding method: {params.decoding_method}"
+                )
+            hyps.append(sp.decode(hyp))
+
+    if params.decoding_method == "greedy_search":
+        return {"greedy_search": _group_channels(hyps)}, masks_dict
+    else:
+        return {f"beam_size_{params.beam_size}": _group_channels(hyps)}, masks_dict
+
+
+def decode_dataset(
+    dl: torch.utils.data.DataLoader,
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
+    """Decode dataset.
+
+    Args:
+      dl:
+        PyTorch's dataloader containing the dataset to decode.
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The neural model.
+      sp:
+        The BPE model.
+    Returns:
+      Return a dict, whose key may be "greedy_search" if greedy search
+      is used, or it may be "beam_7" if beam size of 7 is used.
+      Its value is a list of tuples. Each tuple contains two elements:
+      The first is the reference transcript, and the second is the
+      predicted result.
+    """
+    num_cuts = 0
+
+    try:
+        num_batches = len(dl)
+    except TypeError:
+        num_batches = "?"
+
+    if params.decoding_method == "greedy_search":
+        log_interval = 50
+    else:
+        log_interval = 20
+
+    results = defaultdict(list)
+    masks = {}
+    for batch_idx, batch in enumerate(dl):
+        cut_ids = [cut.id for cut in batch["cuts"]]
+        cuts_batch = batch["cuts"]
+
+        hyps_dict, masks_dict = decode_one_batch(
+            params=params,
+            model=model,
+            sp=sp,
+        )
+        masks.update(masks_dict)
+
+        for name, hyps in hyps_dict.items():
+            this_batch = []
+            for cut_id, hyp_words in zip(cut_ids, hyps):
+                # Reference is a list of supervision texts sorted by start time.
+                ref_words = [
+                    s.text.strip()
+                    for s in sorted(
+                        cuts_batch[cut_id].supervisions, key=lambda s: s.start
+                    )
+                ]
+                this_batch.append((cut_id, ref_words, hyp_words))
+
+            results[name].extend(this_batch)
+
+        num_cuts += len(cut_ids)
+
+        if batch_idx % log_interval == 0:
+            batch_str = f"{batch_idx}/{num_batches}"
+
+            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
+    return results, masks_dict
+
+
+def save_results(
+    params: AttributeDict,
+    test_set_name: str,
+    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
+):
+    test_set_wers = dict()
+    for key, results in results_dict.items():
+        recog_path = (
+            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        results = sorted(results)
+        store_transcripts(filename=recog_path, texts=results)
+        logging.info(f"The transcripts are stored in {recog_path}")
+
+        # The following prints out WERs, per-word error statistics and aligned
+        # ref/hyp pairs.
+        errs_filename = (
+            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        with open(errs_filename, "w") as f:
+            wer = write_surt_error_stats(
+                f,
+                f"{test_set_name}-{key}",
+                results,
+                enable_log=True,
+                num_channels=params.num_channels,
+            )
+            test_set_wers[key] = wer
+
+        logging.info("Wrote detailed error stats to {}".format(errs_filename))
+
+    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
+    errs_info = (
+        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
+    )
+    with open(errs_info, "w") as f:
+        print("settings\tWER", file=f)
+        for key, val in test_set_wers:
+            print("{}\t{}".format(key, val), file=f)
+
+    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
+    note = "\tbest for {}".format(test_set_name)
+    for key, val in test_set_wers:
+        s += "{}\t{}{}\n".format(key, val, note)
+        note = ""
+    logging.info(s)
+
+
+def save_masks(
+    params: AttributeDict,
+    test_set_name: str,
+    masks: List[torch.Tensor],
+):
+    masks_path = params.res_dir / f"masks-{test_set_name}.txt"
+    torch.save(masks, masks_path)
+    logging.info(f"The masks are stored in {masks_path}")
+
+
+@torch.no_grad()
+def main():
+    parser = get_parser()
+    LmScorer.add_arguments(parser)
+    LibriCssAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+    args.lang_dir = Path(args.lang_dir)
+
+    params = get_params()
+    params.update(vars(args))
+
+    assert params.decoding_method in (
+        "greedy_search",
+        "beam_search",
+        "modified_beam_search",
+    ), f"Decoding method {params.decoding_method} is not supported."
+    params.res_dir = params.exp_dir / params.decoding_method
+
+    if params.iter > 0:
+        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
+    else:
+        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+
+    if "beam_search" in params.decoding_method:
+        params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
+    else:
+        params.suffix += f"-context-{params.context_size}"
+        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
+
+    if params.use_averaged_model:
+        params.suffix += "-use-averaged-model"
+
+    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
+    logging.info("Decoding started")
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> and <unk> are defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.piece_to_id("<unk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_surt_model(params)
+    assert model.encoder.decode_chunk_size == params.decode_chunk_len // 2, (
+        model.encoder.decode_chunk_size,
+        params.decode_chunk_len,
+    )
+
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+
+    model.to(device)
+    model.eval()
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    # we need cut ids to display recognition results.
+    args.return_cuts = True
+    libricss = LibriCssAsrDataModule(args)
+
+    dev_cuts = libricss.libricss_cuts(split="dev", type="ihm-mix").to_eager()
+    dev_cuts_grouped = [dev_cuts.filter(lambda x: ol in x.id) for ol in OVERLAP_RATIOS]
+    test_cuts = libricss.libricss_cuts(split="test", type="ihm-mix").to_eager()
+    test_cuts_grouped = [
+        test_cuts.filter(lambda x: ol in x.id) for ol in OVERLAP_RATIOS
+    ]
+
+    for dev_set, ol in zip(dev_cuts_grouped, OVERLAP_RATIOS):
+        dev_dl = libricss.test_dataloaders(dev_set)
+        results_dict, masks = decode_dataset(
+            dl=dev_dl,
+            params=params,
+            model=model,
+            sp=sp,
+        )
+
+        save_results(
+            params=params,
+            test_set_name=f"dev_{ol}",
+            results_dict=results_dict,
+        )
+
+        if params.save_masks:
+            save_masks(
+                params=params,
+                test_set_name=f"dev_{ol}",
+                masks=masks,
+            )
+
+    for test_set, ol in zip(test_cuts_grouped, OVERLAP_RATIOS):
+        test_dl = libricss.test_dataloaders(test_set)
+        results_dict, masks = decode_dataset(
+            dl=test_dl,
+            params=params,
+            model=model,
+            sp=sp,
+        )
+
+        save_results(
+            params=params,
+            test_set_name=f"test_{ol}",
+            results_dict=results_dict,
+        )
+
+        if params.save_masks:
+            save_masks(
+                params=params,
+                test_set_name=f"test_{ol}",
+                masks=masks,
+            )
+
+    logging.info("Done!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/libricss/SURT/dprnn_zipformer/decoder.py b/egs/libricss/SURT/dprnn_zipformer/decoder.py
new file mode 120000
index 000000000..8283d8c5a
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/decoder.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless7/decoder.py
\ No newline at end of file
diff --git a/egs/libricss/SURT/dprnn_zipformer/dprnn.py b/egs/libricss/SURT/dprnn_zipformer/dprnn.py
new file mode 100644
index 000000000..440dea885
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/dprnn.py
@@ -0,0 +1,305 @@
+import random
+from typing import Optional, Tuple
+
+import torch
+import torch.nn as nn
+from einops import rearrange
+from scaling import ActivationBalancer, BasicNorm, DoubleSwish, ScaledLinear, ScaledLSTM
+from torch.autograd import Variable
+
+EPS = torch.finfo(torch.get_default_dtype()).eps
+
+
+def _pad_segment(input, segment_size):
+    # Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py#L342
+    # input is the features: (B, N, T)
+    batch_size, dim, seq_len = input.shape
+    segment_stride = segment_size // 2
+
+    rest = segment_size - (segment_stride + seq_len % segment_size) % segment_size
+    if rest > 0:
+        pad = Variable(torch.zeros(batch_size, dim, rest)).type(input.type())
+        input = torch.cat([input, pad], 2)
+
+    pad_aux = Variable(torch.zeros(batch_size, dim, segment_stride)).type(input.type())
+    input = torch.cat([pad_aux, input, pad_aux], 2)
+
+    return input, rest
+
+
+def split_feature(input, segment_size):
+    # Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py#L358
+    # split the feature into chunks of segment size
+    # input is the features: (B, N, T)
+
+    input, rest = _pad_segment(input, segment_size)
+    batch_size, dim, seq_len = input.shape
+    segment_stride = segment_size // 2
+
+    segments1 = (
+        input[:, :, :-segment_stride]
+        .contiguous()
+        .view(batch_size, dim, -1, segment_size)
+    )
+    segments2 = (
+        input[:, :, segment_stride:]
+        .contiguous()
+        .view(batch_size, dim, -1, segment_size)
+    )
+    segments = (
+        torch.cat([segments1, segments2], 3)
+        .view(batch_size, dim, -1, segment_size)
+        .transpose(2, 3)
+    )
+
+    return segments.contiguous(), rest
+
+
+def merge_feature(input, rest):
+    # Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py#L385
+    # merge the splitted features into full utterance
+    # input is the features: (B, N, L, K)
+
+    batch_size, dim, segment_size, _ = input.shape
+    segment_stride = segment_size // 2
+    input = (
+        input.transpose(2, 3).contiguous().view(batch_size, dim, -1, segment_size * 2)
+    )  # B, N, K, L
+
+    input1 = (
+        input[:, :, :, :segment_size]
+        .contiguous()
+        .view(batch_size, dim, -1)[:, :, segment_stride:]
+    )
+    input2 = (
+        input[:, :, :, segment_size:]
+        .contiguous()
+        .view(batch_size, dim, -1)[:, :, :-segment_stride]
+    )
+
+    output = input1 + input2
+    if rest > 0:
+        output = output[:, :, :-rest]
+
+    return output.contiguous()  # B, N, T
+
+
+class RNNEncoderLayer(nn.Module):
+    """
+    RNNEncoderLayer is made up of lstm and feedforward networks.
+    Args:
+      input_size:
+        The number of expected features in the input (required).
+      hidden_size:
+        The hidden dimension of rnn layer.
+      dropout:
+        The dropout value (default=0.1).
+      layer_dropout:
+        The dropout value for model-level warmup (default=0.075).
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        dropout: float = 0.1,
+        bidirectional: bool = False,
+    ) -> None:
+        super(RNNEncoderLayer, self).__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+
+        assert hidden_size >= input_size, (hidden_size, input_size)
+        self.lstm = ScaledLSTM(
+            input_size=input_size,
+            hidden_size=hidden_size // 2 if bidirectional else hidden_size,
+            proj_size=0,
+            num_layers=1,
+            dropout=0.0,
+            batch_first=True,
+            bidirectional=bidirectional,
+        )
+        self.norm_final = BasicNorm(input_size)
+
+        # try to ensure the output is close to zero-mean (or at least, zero-median).  # noqa
+        self.balancer = ActivationBalancer(
+            num_channels=input_size,
+            channel_dim=-1,
+            min_positive=0.45,
+            max_positive=0.55,
+            max_abs=6.0,
+        )
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(
+        self,
+        src: torch.Tensor,
+        states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        warmup: float = 1.0,
+    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Pass the input through the encoder layer.
+        Args:
+          src:
+            The sequence to the encoder layer (required).
+            Its shape is (S, N, E), where S is the sequence length,
+            N is the batch size, and E is the feature number.
+          states:
+            A tuple of 2 tensors (optional). It is for streaming inference.
+            states[0] is the hidden states of all layers,
+              with shape of (1, N, input_size);
+            states[1] is the cell states of all layers,
+              with shape of (1, N, hidden_size).
+        """
+        src_orig = src
+
+        # alpha = 1.0 means fully use this encoder layer, 0.0 would mean
+        # completely bypass it.
+        alpha = warmup if self.training else 1.0
+
+        # lstm module
+        src_lstm, new_states = self.lstm(src, states)
+        src = self.dropout(src_lstm) + src
+        src = self.norm_final(self.balancer(src))
+
+        if alpha != 1.0:
+            src = alpha * src + (1 - alpha) * src_orig
+
+        return src
+
+
+# dual-path RNN
+class DPRNN(nn.Module):
+    """Deep dual-path RNN.
+    Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py
+
+    args:
+        input_size: int, dimension of the input feature. The input should have shape
+                    (batch, seq_len, input_size).
+        hidden_size: int, dimension of the hidden state.
+        output_size: int, dimension of the output size.
+        dropout: float, dropout ratio. Default is 0.
+        num_blocks: int, number of stacked RNN layers. Default is 1.
+    """
+
+    def __init__(
+        self,
+        feature_dim,
+        input_size,
+        hidden_size,
+        output_size,
+        dropout=0.1,
+        num_blocks=1,
+        segment_size=50,
+        chunk_width_randomization=False,
+    ):
+        super().__init__()
+
+        self.input_size = input_size
+        self.output_size = output_size
+        self.hidden_size = hidden_size
+
+        self.segment_size = segment_size
+        self.chunk_width_randomization = chunk_width_randomization
+
+        self.input_embed = nn.Sequential(
+            ScaledLinear(feature_dim, input_size),
+            BasicNorm(input_size),
+            ActivationBalancer(
+                num_channels=input_size,
+                channel_dim=-1,
+                min_positive=0.45,
+                max_positive=0.55,
+            ),
+        )
+
+        # dual-path RNN
+        self.row_rnn = nn.ModuleList([])
+        self.col_rnn = nn.ModuleList([])
+        for _ in range(num_blocks):
+            # intra-RNN is non-causal
+            self.row_rnn.append(
+                RNNEncoderLayer(
+                    input_size, hidden_size, dropout=dropout, bidirectional=True
+                )
+            )
+            self.col_rnn.append(
+                RNNEncoderLayer(
+                    input_size, hidden_size, dropout=dropout, bidirectional=False
+                )
+            )
+
+        # output layer
+        self.out_embed = nn.Sequential(
+            ScaledLinear(input_size, output_size),
+            BasicNorm(output_size),
+            ActivationBalancer(
+                num_channels=output_size,
+                channel_dim=-1,
+                min_positive=0.45,
+                max_positive=0.55,
+            ),
+        )
+
+    def forward(self, input):
+        # input shape: B, T, F
+        input = self.input_embed(input)
+        B, T, D = input.shape
+
+        if self.chunk_width_randomization and self.training:
+            segment_size = random.randint(self.segment_size // 2, self.segment_size)
+        else:
+            segment_size = self.segment_size
+        input, rest = split_feature(input.transpose(1, 2), segment_size)
+        # input shape: batch, N, dim1, dim2
+        # apply RNN on dim1 first and then dim2
+        # output shape: B, output_size, dim1, dim2
+        # input = input.to(device)
+        batch_size, _, dim1, dim2 = input.shape
+        output = input
+        for i in range(len(self.row_rnn)):
+            row_input = (
+                output.permute(0, 3, 2, 1)
+                .contiguous()
+                .view(batch_size * dim2, dim1, -1)
+            )  # B*dim2, dim1, N
+            output = self.row_rnn[i](row_input)  # B*dim2, dim1, H
+            output = (
+                output.view(batch_size, dim2, dim1, -1).permute(0, 3, 2, 1).contiguous()
+            )  # B, N, dim1, dim2
+
+            col_input = (
+                output.permute(0, 2, 3, 1)
+                .contiguous()
+                .view(batch_size * dim1, dim2, -1)
+            )  # B*dim1, dim2, N
+            output = self.col_rnn[i](col_input)  # B*dim1, dim2, H
+            output = (
+                output.view(batch_size, dim1, dim2, -1).permute(0, 3, 1, 2).contiguous()
+            )  # B, N, dim1, dim2
+
+        output = merge_feature(output, rest)
+        output = output.transpose(1, 2)
+        output = self.out_embed(output)
+
+        # Apply ReLU to the output
+        output = torch.relu(output)
+
+        return output
+
+
+if __name__ == "__main__":
+
+    model = DPRNN(
+        80,
+        256,
+        256,
+        160,
+        dropout=0.1,
+        num_blocks=4,
+        segment_size=32,
+        chunk_width_randomization=True,
+    )
+    input = torch.randn(2, 1002, 80)
+    print(sum(p.numel() for p in model.parameters()))
+    print(model(input).shape)
diff --git a/egs/libricss/SURT/dprnn_zipformer/encoder_interface.py b/egs/libricss/SURT/dprnn_zipformer/encoder_interface.py
new file mode 120000
index 000000000..0c2673d46
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/encoder_interface.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless7/encoder_interface.py
\ No newline at end of file
diff --git a/egs/libricss/SURT/dprnn_zipformer/export.py b/egs/libricss/SURT/dprnn_zipformer/export.py
new file mode 100755
index 000000000..f51f2a7ab
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/export.py
@@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This script converts several saved checkpoints
+# to a single one using model averaging.
+"""
+
+Usage:
+
+(1) Export to torchscript model using torch.jit.script()
+
+./dprnn_zipformer/export.py \
+  --exp-dir ./dprnn_zipformer/exp \
+  --bpe-model data/lang_bpe_500/bpe.model \
+  --epoch 30 \
+  --avg 9 \
+  --jit 1
+
+It will generate a file `cpu_jit.pt` in the given `exp_dir`. You can later
+load it by `torch.jit.load("cpu_jit.pt")`.
+
+Note `cpu` in the name `cpu_jit.pt` means the parameters when loaded into Python
+are on CPU. You can use `to("cuda")` to move them to a CUDA device.
+
+Check
+https://github.com/k2-fsa/sherpa
+for how to use the exported models outside of icefall.
+
+(2) Export `model.state_dict()`
+
+./dprnn_zipformer/export.py \
+  --exp-dir ./dprnn_zipformer/exp \
+  --bpe-model data/lang_bpe_500/bpe.model \
+  --epoch 30 \
+  --avg 9
+
+It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
+load it by `icefall.checkpoint.load_checkpoint()`.
+
+To use the generated file with `dprnn_zipformer/decode.py`,
+you can do:
+
+    cd /path/to/exp_dir
+    ln -s pretrained.pt epoch-9999.pt
+
+    cd /path/to/egs/librispeech/ASR
+    ./dprnn_zipformer/decode.py \
+        --exp-dir ./dprnn_zipformer/exp \
+        --epoch 9999 \
+        --avg 1 \
+        --max-duration 600 \
+        --decoding-method greedy_search \
+        --bpe-model data/lang_bpe_500/bpe.model
+"""
+
+import argparse
+import logging
+from pathlib import Path
+
+import sentencepiece as spm
+import torch
+import torch.nn as nn
+from scaling_converter import convert_scaled_to_non_scaled
+from train import add_model_arguments, get_params, get_surt_model
+
+from icefall.checkpoint import (
+    average_checkpoints,
+    average_checkpoints_with_averaged_model,
+    find_checkpoints,
+    load_checkpoint,
+)
+from icefall.utils import str2bool
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=30,
+        help="""It specifies the checkpoint to use for decoding.
+        Note: Epoch counts from 1.
+        You can specify --avg to use more checkpoints for model averaging.""",
+    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=9,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch' and '--iter'",
+    )
+
+    parser.add_argument(
+        "--use-averaged-model",
+        type=str2bool,
+        default=True,
+        help="Whether to load averaged model. Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="dprnn_zipformer/exp",
+        help="""It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--jit",
+        type=str2bool,
+        default=False,
+        help="""True to save a model after applying torch.jit.script.
+        It will generate a file named cpu_jit.pt
+
+        Check ./jit_pretrained.py for how to use it.
+        """,
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+@torch.no_grad()
+def main():
+    args = get_parser().parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    params = get_params()
+    params.update(vars(args))
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_surt_model(params)
+
+    model.to(device)
+
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+
+    model.to("cpu")
+    model.eval()
+
+    if params.jit is True:
+        convert_scaled_to_non_scaled(model, inplace=True)
+        # We won't use the forward() method of the model in C++, so just ignore
+        # it here.
+        # Otherwise, one of its arguments is a ragged tensor and is not
+        # torch scriptabe.
+        model.__class__.forward = torch.jit.ignore(model.__class__.forward)
+        logging.info("Using torch.jit.script")
+        model = torch.jit.script(model)
+        filename = params.exp_dir / "cpu_jit.pt"
+        model.save(str(filename))
+        logging.info(f"Saved to {filename}")
+    else:
+        logging.info("Not using torchscript. Export model.state_dict()")
+        # Save it using a format so that it can be loaded
+        # by :func:`load_checkpoint`
+        filename = params.exp_dir / "pretrained.pt"
+        torch.save({"model": model.state_dict()}, str(filename))
+        logging.info(f"Saved to {filename}")
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    main()
diff --git a/egs/libricss/SURT/dprnn_zipformer/joiner.py b/egs/libricss/SURT/dprnn_zipformer/joiner.py
new file mode 120000
index 000000000..0f0c3c90a
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/joiner.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless7/joiner.py
\ No newline at end of file
diff --git a/egs/libricss/SURT/dprnn_zipformer/model.py b/egs/libricss/SURT/dprnn_zipformer/model.py
new file mode 100644
index 000000000..688e1e78d
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/model.py
@@ -0,0 +1,316 @@
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang, Wei Kang)
+# Copyright    2023  Johns Hopkins University (author: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional, Tuple
+
+import k2
+import torch
+import torch.nn as nn
+from encoder_interface import EncoderInterface
+
+from icefall.utils import add_sos
+
+
+class SURT(nn.Module):
+    """It implements Streaming Unmixing and Recognition Transducer (SURT).
+    https://arxiv.org/abs/2011.13148
+    """
+
+    def __init__(
+        self,
+        mask_encoder: nn.Module,
+        encoder: EncoderInterface,
+        joint_encoder_layer: Optional[nn.Module],
+        decoder: nn.Module,
+        joiner: nn.Module,
+        num_channels: int,
+        encoder_dim: int,
+        decoder_dim: int,
+        joiner_dim: int,
+        vocab_size: int,
+    ):
+        """
+        Args:
+          mask_encoder:
+            It is the masking network. It generates a mask for each channel of the
+            encoder. These masks are applied to the input features, and then passed
+            to the transcription network.
+          encoder:
+            It is the transcription network in the paper. Its accepts
+            two inputs: `x` of (N, T, encoder_dim) and `x_lens` of shape (N,).
+            It returns two tensors: `logits` of shape (N, T, encoder_dm) and
+            `logit_lens` of shape (N,).
+          decoder:
+            It is the prediction network in the paper. Its input shape
+            is (N, U) and its output shape is (N, U, decoder_dim).
+            It should contain one attribute: `blank_id`.
+          joiner:
+            It has two inputs with shapes: (N, T, encoder_dim) and (N, U, decoder_dim).
+            Its output shape is (N, T, U, vocab_size). Note that its output contains
+            unnormalized probs, i.e., not processed by log-softmax.
+          num_channels:
+            It is the number of channels that the input features will be split into.
+            In general, it should be equal to the maximum number of simultaneously
+            active speakers. For most real scenarios, using 2 channels is sufficient.
+        """
+        super().__init__()
+        assert isinstance(encoder, EncoderInterface), type(encoder)
+        assert hasattr(decoder, "blank_id")
+
+        self.mask_encoder = mask_encoder
+        self.encoder = encoder
+        self.joint_encoder_layer = joint_encoder_layer
+        self.decoder = decoder
+        self.joiner = joiner
+        self.num_channels = num_channels
+
+        self.simple_am_proj = nn.Linear(
+            encoder_dim,
+            vocab_size,
+        )
+        self.simple_lm_proj = nn.Linear(decoder_dim, vocab_size)
+
+        self.ctc_output = nn.Sequential(
+            nn.Dropout(p=0.1),
+            nn.Linear(encoder_dim, vocab_size),
+            nn.LogSoftmax(dim=-1),
+        )
+
+    def forward_helper(
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        y: k2.RaggedTensor,
+        prune_range: int = 5,
+        am_scale: float = 0.0,
+        lm_scale: float = 0.0,
+        reduction: str = "sum",
+        beam_size: int = 10,
+        use_double_scores: bool = False,
+        subsampling_factor: int = 1,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Compute transducer loss for one branch of the SURT model.
+        """
+        encoder_out, x_lens = self.encoder(x, x_lens)
+        assert torch.all(x_lens > 0)
+
+        if self.joint_encoder_layer is not None:
+            encoder_out = self.joint_encoder_layer(encoder_out)
+
+        # compute ctc log-probs
+        ctc_output = self.ctc_output(encoder_out)
+
+        # For the decoder, i.e., the prediction network
+        row_splits = y.shape.row_splits(1)
+        y_lens = row_splits[1:] - row_splits[:-1]
+
+        blank_id = self.decoder.blank_id
+        sos_y = add_sos(y, sos_id=blank_id)
+
+        # sos_y_padded: [B, S + 1], start with SOS.
+        sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id)
+
+        # decoder_out: [B, S + 1, decoder_dim]
+        decoder_out = self.decoder(sos_y_padded)
+
+        # Note: y does not start with SOS
+        # y_padded : [B, S]
+        y_padded = y.pad(mode="constant", padding_value=0)
+
+        y_padded = y_padded.to(torch.int64)
+        boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device)
+        boundary[:, 2] = y_lens
+        boundary[:, 3] = x_lens
+
+        lm = self.simple_lm_proj(decoder_out)
+        am = self.simple_am_proj(encoder_out)
+
+        with torch.cuda.amp.autocast(enabled=False):
+            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
+                lm=lm.float(),
+                am=am.float(),
+                symbols=y_padded,
+                termination_symbol=blank_id,
+                lm_only_scale=lm_scale,
+                am_only_scale=am_scale,
+                boundary=boundary,
+                reduction=reduction,
+                return_grad=True,
+            )
+
+        # ranges : [B, T, prune_range]
+        ranges = k2.get_rnnt_prune_ranges(
+            px_grad=px_grad,
+            py_grad=py_grad,
+            boundary=boundary,
+            s_range=prune_range,
+        )
+
+        # am_pruned : [B, T, prune_range, encoder_dim]
+        # lm_pruned : [B, T, prune_range, decoder_dim]
+        am_pruned, lm_pruned = k2.do_rnnt_pruning(
+            am=self.joiner.encoder_proj(encoder_out),
+            lm=self.joiner.decoder_proj(decoder_out),
+            ranges=ranges,
+        )
+
+        # logits : [B, T, prune_range, vocab_size]
+
+        # project_input=False since we applied the decoder's input projections
+        # prior to do_rnnt_pruning (this is an optimization for speed).
+        logits = self.joiner(am_pruned, lm_pruned, project_input=False)
+
+        with torch.cuda.amp.autocast(enabled=False):
+            pruned_loss = k2.rnnt_loss_pruned(
+                logits=logits.float(),
+                symbols=y_padded,
+                ranges=ranges,
+                termination_symbol=blank_id,
+                boundary=boundary,
+                reduction=reduction,
+            )
+
+        # Compute ctc loss
+        supervision_segments = torch.stack(
+            (
+                torch.arange(len(x_lens), device="cpu"),
+                torch.zeros_like(x_lens, device="cpu"),
+                torch.clone(x_lens).detach().cpu(),
+            ),
+            dim=1,
+        ).to(torch.int32)
+        # We need to sort supervision_segments in decreasing order of num_frames
+        indices = torch.argsort(supervision_segments[:, 2], descending=True)
+        supervision_segments = supervision_segments[indices]
+
+        # Works with a BPE model
+        decoding_graph = k2.ctc_graph(y, modified=False, device=x.device)
+        dense_fsa_vec = k2.DenseFsaVec(
+            ctc_output,
+            supervision_segments,
+            allow_truncate=subsampling_factor - 1,
+        )
+        ctc_loss = k2.ctc_loss(
+            decoding_graph=decoding_graph,
+            dense_fsa_vec=dense_fsa_vec,
+            output_beam=beam_size,
+            reduction="none",
+            use_double_scores=use_double_scores,
+        )
+
+        return (simple_loss, pruned_loss, ctc_loss)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        y: k2.RaggedTensor,
+        prune_range: int = 5,
+        am_scale: float = 0.0,
+        lm_scale: float = 0.0,
+        reduction: str = "sum",
+        beam_size: int = 10,
+        use_double_scores: bool = False,
+        subsampling_factor: int = 1,
+        return_masks: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Args:
+          x:
+            A 3-D tensor of shape (N, T, C).
+          x_lens:
+            A 1-D tensor of shape (N,). It contains the number of frames in `x`
+            before padding.
+          y:
+            A ragged tensor of shape (N*num_channels, S). It contains the labels
+            of the N utterances. The labels are in the range [0, vocab_size). All
+            the channels are concatenated together one after another.
+          prune_range:
+            The prune range for rnnt loss, it means how many symbols(context)
+            we are considering for each frame to compute the loss.
+          am_scale:
+            The scale to smooth the loss with am (output of encoder network)
+            part
+          lm_scale:
+            The scale to smooth the loss with lm (output of predictor network)
+            part
+          reduction:
+            "sum" to sum the losses over all utterances in the batch.
+            "none" to return the loss in a 1-D tensor for each utterance
+            in the batch.
+          beam_size:
+            The beam size used in CTC decoding.
+          use_double_scores:
+            If True, use double precision for CTC decoding.
+          subsampling_factor:
+            The subsampling factor of the model. It is used to compute the
+            supervision segments for CTC loss.
+          return_masks:
+            If True, return the masks as well as masked features.
+        Returns:
+          Return the transducer loss.
+
+        Note:
+           Regarding am_scale & lm_scale, it will make the loss-function one of
+           the form:
+              lm_scale * lm_probs + am_scale * am_probs +
+              (1-lm_scale-am_scale) * combined_probs
+        """
+        assert x.ndim == 3, x.shape
+        assert x_lens.ndim == 1, x_lens.shape
+        assert y.num_axes == 2, y.num_axes
+
+        assert x.size(0) == x_lens.size(0), (x.size(), x_lens.size())
+
+        # Apply the mask encoder
+        B, T, F = x.shape
+        processed = self.mask_encoder(x)  # B,T,F*num_channels
+        masks = processed.view(B, T, F, self.num_channels).unbind(dim=-1)
+        x_masked = [x * m for m in masks]
+
+        # Recognition
+        # Stack the inputs along the batch axis
+        h = torch.cat(x_masked, dim=0)
+        h_lens = torch.cat([x_lens for _ in range(self.num_channels)], dim=0)
+
+        simple_loss, pruned_loss, ctc_loss = self.forward_helper(
+            h,
+            h_lens,
+            y,
+            prune_range,
+            am_scale,
+            lm_scale,
+            reduction=reduction,
+            beam_size=beam_size,
+            use_double_scores=use_double_scores,
+            subsampling_factor=subsampling_factor,
+        )
+
+        # Chunks the outputs into 2 parts along batch axis and then stack them along a new axis.
+        simple_loss = torch.stack(
+            torch.chunk(simple_loss, self.num_channels, dim=0), dim=0
+        )
+        pruned_loss = torch.stack(
+            torch.chunk(pruned_loss, self.num_channels, dim=0), dim=0
+        )
+        ctc_loss = torch.stack(torch.chunk(ctc_loss, self.num_channels, dim=0), dim=0)
+
+        if return_masks:
+            return (simple_loss, pruned_loss, ctc_loss, x_masked, masks)
+        else:
+            return (simple_loss, pruned_loss, ctc_loss, x_masked)
diff --git a/egs/libricss/SURT/dprnn_zipformer/optim.py b/egs/libricss/SURT/dprnn_zipformer/optim.py
new file mode 120000
index 000000000..8a05abb5f
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/optim.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless7/optim.py
\ No newline at end of file
diff --git a/egs/libricss/SURT/dprnn_zipformer/scaling.py b/egs/libricss/SURT/dprnn_zipformer/scaling.py
new file mode 120000
index 000000000..5f9be9fe0
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/scaling.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless7/scaling.py
\ No newline at end of file
diff --git a/egs/libricss/SURT/dprnn_zipformer/scaling_converter.py b/egs/libricss/SURT/dprnn_zipformer/scaling_converter.py
new file mode 120000
index 000000000..f9960e5c6
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/scaling_converter.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless7/scaling_converter.py
\ No newline at end of file
diff --git a/egs/libricss/SURT/dprnn_zipformer/train.py b/egs/libricss/SURT/dprnn_zipformer/train.py
new file mode 100755
index 000000000..6598f8b5d
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/train.py
@@ -0,0 +1,1452 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                  Wei Kang,
+#                                                  Mingshuang Luo,)
+#                                                  Zengwei Yao)
+#              2023  Johns Hopkins University (author: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+cd egs/libricss/SURT
+./prepare.sh
+
+./dprnn_zipformer/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 1 \
+  --exp-dir dprnn_zipformer/exp \
+  --max-duration 300
+
+# For mix precision training:
+
+./dprnn_zipformer/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir dprnn_zipformer/exp \
+  --max-duration 550
+"""
+
+import argparse
+import copy
+import logging
+import warnings
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple, Union
+
+import k2
+import optim
+import sentencepiece as spm
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+from asr_datamodule import LibriCssAsrDataModule
+from decoder import Decoder
+from dprnn import DPRNN
+from einops.layers.torch import Rearrange
+from graph_pit.loss.optimized import optimized_graph_pit_mse_loss as gpit_mse
+from joiner import Joiner
+from lhotse.cut import Cut
+from lhotse.dataset.sampling.base import CutSampler
+from lhotse.utils import LOG_EPSILON, fix_random_seed
+from model import SURT
+from optim import Eden, ScaledAdam
+from scaling import ScaledLSTM
+from torch import Tensor
+from torch.cuda.amp import GradScaler
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+from zipformer import Zipformer
+
+from icefall import diagnostics
+from icefall.checkpoint import load_checkpoint, remove_checkpoints
+from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
+from icefall.checkpoint import (
+    save_checkpoint_with_global_batch_idx,
+    update_averaged_model,
+)
+from icefall.dist import cleanup_dist, setup_dist
+from icefall.env import get_env_info
+from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+
+LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
+
+
+def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
+    if isinstance(model, DDP):
+        # get underlying nn.Module
+        model = model.module
+    for module in model.modules():
+        if hasattr(module, "batch_count"):
+            module.batch_count = batch_count
+
+
+def add_model_arguments(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--num-mask-encoder-layers",
+        type=int,
+        default=4,
+        help="Number of layers in the DPRNN based mask encoder.",
+    )
+
+    parser.add_argument(
+        "--mask-encoder-dim",
+        type=int,
+        default=256,
+        help="Hidden dimension of the LSTM blocks in DPRNN.",
+    )
+
+    parser.add_argument(
+        "--mask-encoder-segment-size",
+        type=int,
+        default=32,
+        help="Segment size of the SegLSTM in DPRNN. Ideally, this should be equal to the "
+        "decode-chunk-length of the zipformer encoder.",
+    )
+
+    parser.add_argument(
+        "--chunk-width-randomization",
+        type=bool,
+        default=False,
+        help="Whether to randomize the chunk width in DPRNN.",
+    )
+
+    # Zipformer config is based on:
+    # https://github.com/k2-fsa/icefall/pull/745#issuecomment-1405282740
+    parser.add_argument(
+        "--num-encoder-layers",
+        type=str,
+        default="2,2,2,2,2",
+        help="Number of zipformer encoder layers, comma separated.",
+    )
+
+    parser.add_argument(
+        "--feedforward-dims",
+        type=str,
+        default="768,768,768,768,768",
+        help="Feedforward dimension of the zipformer encoder layers, comma separated.",
+    )
+
+    parser.add_argument(
+        "--nhead",
+        type=str,
+        default="8,8,8,8,8",
+        help="Number of attention heads in the zipformer encoder layers.",
+    )
+
+    parser.add_argument(
+        "--encoder-dims",
+        type=str,
+        default="256,256,256,256,256",
+        help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated",
+    )
+
+    parser.add_argument(
+        "--attention-dims",
+        type=str,
+        default="192,192,192,192,192",
+        help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated;
+        not the same as embedding dimension.""",
+    )
+
+    parser.add_argument(
+        "--encoder-unmasked-dims",
+        type=str,
+        default="192,192,192,192,192",
+        help="Unmasked dimensions in the encoders, relates to augmentation during training.  "
+        "Must be <= each of encoder_dims.  Empirically, less than 256 seems to make performance "
+        " worse.",
+    )
+
+    parser.add_argument(
+        "--zipformer-downsampling-factors",
+        type=str,
+        default="1,2,4,8,2",
+        help="Downsampling factor for each stack of encoder layers.",
+    )
+
+    parser.add_argument(
+        "--cnn-module-kernels",
+        type=str,
+        default="31,31,31,31,31",
+        help="Sizes of kernels in convolution modules",
+    )
+
+    parser.add_argument(
+        "--use-joint-encoder-layer",
+        type=str,
+        default="lstm",
+        choices=["linear", "lstm", "none"],
+        help="Whether to use a joint layer to combine all branches.",
+    )
+
+    parser.add_argument(
+        "--decoder-dim",
+        type=int,
+        default=512,
+        help="Embedding dimension in the decoder model.",
+    )
+
+    parser.add_argument(
+        "--joiner-dim",
+        type=int,
+        default=512,
+        help="""Dimension used in the joiner model.
+        Outputs from the encoder and decoder model are projected
+        to this dimension before adding.
+        """,
+    )
+
+    parser.add_argument(
+        "--short-chunk-size",
+        type=int,
+        default=50,
+        help="""Chunk length of dynamic training, the chunk size would be either
+        max sequence length of current batch or uniformly sampled from (1, short_chunk_size).
+        """,
+    )
+
+    parser.add_argument(
+        "--num-left-chunks",
+        type=int,
+        default=4,
+        help="How many left context can be seen in chunks when calculating attention.",
+    )
+
+    parser.add_argument(
+        "--decode-chunk-len",
+        type=int,
+        default=32,
+        help="The chunk size for decoding (in frames before subsampling)",
+    )
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--world-size",
+        type=int,
+        default=1,
+        help="Number of GPUs for DDP training.",
+    )
+
+    parser.add_argument(
+        "--master-port",
+        type=int,
+        default=12354,
+        help="Master port to use for DDP training.",
+    )
+
+    parser.add_argument(
+        "--tensorboard",
+        type=str2bool,
+        default=True,
+        help="Should various information be logged in tensorboard.",
+    )
+
+    parser.add_argument(
+        "--num-epochs",
+        type=int,
+        default=30,
+        help="Number of epochs to train.",
+    )
+
+    parser.add_argument(
+        "--start-epoch",
+        type=int,
+        default=1,
+        help="""Resume training from this epoch. It should be positive.
+        If larger than 1, it will load checkpoint from
+        exp-dir/epoch-{start_epoch-1}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--start-batch",
+        type=int,
+        default=0,
+        help="""If positive, --start-epoch is ignored and
+        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="conv_lstm_transducer_stateless_ctc/exp",
+        help="""The experiment dir.
+        It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--model-init-ckpt",
+        type=str,
+        default=None,
+        help="""The model checkpoint to initialize the model (either full or part).
+        If not specified, the model is randomly initialized.
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--base-lr", type=float, default=0.004, help="The base learning rate."
+    )
+
+    parser.add_argument(
+        "--lr-batches",
+        type=float,
+        default=5000,
+        help="""Number of steps that affects how rapidly the learning rate
+        decreases. We suggest not to change this.""",
+    )
+
+    parser.add_argument(
+        "--lr-epochs",
+        type=float,
+        default=6,
+        help="""Number of epochs that affects how rapidly the learning rate decreases.
+        """,
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+
+    parser.add_argument(
+        "--prune-range",
+        type=int,
+        default=5,
+        help="The prune range for rnnt loss, it means how many symbols(context)"
+        "we are using to compute the loss",
+    )
+
+    parser.add_argument(
+        "--lm-scale",
+        type=float,
+        default=0.25,
+        help="The scale to smooth the loss with lm "
+        "(output of prediction network) part.",
+    )
+
+    parser.add_argument(
+        "--am-scale",
+        type=float,
+        default=0.0,
+        help="The scale to smooth the loss with am (output of encoder network) part.",
+    )
+
+    parser.add_argument(
+        "--simple-loss-scale",
+        type=float,
+        default=0.5,
+        help="To get pruning ranges, we will calculate a simple version"
+        "loss(joiner is just addition), this simple loss also uses for"
+        "training (as a regularization item). We will scale the simple loss"
+        "with this parameter before adding to the final loss.",
+    )
+
+    parser.add_argument(
+        "--ctc-loss-scale",
+        type=float,
+        default=0.2,
+        help="Scale for CTC loss.",
+    )
+
+    parser.add_argument(
+        "--heat-loss-scale",
+        type=float,
+        default=0.0,
+        help="Scale for HEAT loss on separated sources.",
+    )
+
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="The seed for random generators intended for reproducibility",
+    )
+
+    parser.add_argument(
+        "--print-diagnostics",
+        type=str2bool,
+        default=False,
+        help="Accumulate stats on activations, print them and exit.",
+    )
+
+    parser.add_argument(
+        "--save-every-n",
+        type=int,
+        default=2000,
+        help="""Save checkpoint after processing this number of batches"
+        periodically. We save checkpoint to exp-dir/ whenever
+        params.batch_idx_train % save_every_n == 0. The checkpoint filename
+        has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
+        Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
+        end of each epoch where `xxx` is the epoch number counting from 0.
+        """,
+    )
+
+    parser.add_argument(
+        "--keep-last-k",
+        type=int,
+        default=1,
+        help="""Only keep this number of checkpoints on disk.
+        For instance, if it is 3, there are only 3 checkpoints
+        in the exp-dir with filenames `checkpoint-xxx.pt`.
+        It does not affect checkpoints with name `epoch-xxx.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--average-period",
+        type=int,
+        default=100,
+        help="""Update the averaged model, namely `model_avg`, after processing
+        this number of batches. `model_avg` is a separate version of model,
+        in which each floating-point parameter is the average of all the
+        parameters from the start of training. Each time we take the average,
+        we do: `model_avg = model * (average_period / batch_idx_train) +
+            model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
+        """,
+    )
+
+    parser.add_argument(
+        "--use-fp16",
+        type=str2bool,
+        default=False,
+        help="Whether to use half precision training.",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def get_params() -> AttributeDict:
+    """Return a dict containing training parameters.
+
+    All training related parameters that are not passed from the commandline
+    are saved in the variable `params`.
+
+    Commandline options are merged into `params` after they are parsed, so
+    you can also access them via `params`.
+
+    Explanation of options saved in `params`:
+
+        - best_train_loss: Best training loss so far. It is used to select
+                           the model that has the lowest training loss. It is
+                           updated during the training.
+
+        - best_valid_loss: Best validation loss so far. It is used to select
+                           the model that has the lowest validation loss. It is
+                           updated during the training.
+
+        - best_train_epoch: It is the epoch that has the best training loss.
+
+        - best_valid_epoch: It is the epoch that has the best validation loss.
+
+        - batch_idx_train: Used to writing statistics to tensorboard. It
+                           contains number of batches trained so far across
+                           epochs.
+
+        - log_interval:  Print training loss if batch_idx % log_interval` is 0
+
+        - reset_interval: Reset statistics if batch_idx % reset_interval is 0
+
+        - valid_interval:  Run validation if batch_idx % valid_interval is 0
+
+        - feature_dim: The model input dim. It has to match the one used
+                       in computing features.
+
+        - subsampling_factor:  The subsampling factor for the model.
+
+        - num_decoder_layers: Number of decoder layer of transformer decoder.
+
+        - warm_step: The warm_step for Noam optimizer.
+    """
+    params = AttributeDict(
+        {
+            "best_train_loss": float("inf"),
+            "best_valid_loss": float("inf"),
+            "best_train_epoch": -1,
+            "best_valid_epoch": -1,
+            "batch_idx_train": 0,
+            "log_interval": 50,
+            "reset_interval": 200,
+            "valid_interval": 2000,
+            # parameters for SURT
+            "num_channels": 2,
+            "feature_dim": 80,
+            "subsampling_factor": 4,  # not passed in, this is fixed
+            # parameters for Noam
+            "model_warm_step": 5000,  # arg given to model, not for lrate
+            # parameters for ctc loss
+            "beam_size": 10,
+            "use_double_scores": True,
+            "env_info": get_env_info(),
+        }
+    )
+
+    return params
+
+
+def get_mask_encoder_model(params: AttributeDict) -> nn.Module:
+    mask_encoder = DPRNN(
+        feature_dim=params.feature_dim,
+        input_size=params.mask_encoder_dim,
+        hidden_size=params.mask_encoder_dim,
+        output_size=params.feature_dim * params.num_channels,
+        segment_size=params.mask_encoder_segment_size,
+        num_blocks=params.num_mask_encoder_layers,
+        chunk_width_randomization=params.chunk_width_randomization,
+    )
+    return mask_encoder
+
+
+def get_encoder_model(params: AttributeDict) -> nn.Module:
+    # TODO: We can add an option to switch between Zipformer and Transformer
+    def to_int_tuple(s: str):
+        return tuple(map(int, s.split(",")))
+
+    encoder = Zipformer(
+        num_features=params.feature_dim,
+        output_downsampling_factor=2,
+        zipformer_downsampling_factors=to_int_tuple(
+            params.zipformer_downsampling_factors
+        ),
+        encoder_dims=to_int_tuple(params.encoder_dims),
+        attention_dim=to_int_tuple(params.attention_dims),
+        encoder_unmasked_dims=to_int_tuple(params.encoder_unmasked_dims),
+        nhead=to_int_tuple(params.nhead),
+        feedforward_dim=to_int_tuple(params.feedforward_dims),
+        cnn_module_kernels=to_int_tuple(params.cnn_module_kernels),
+        num_encoder_layers=to_int_tuple(params.num_encoder_layers),
+        num_left_chunks=params.num_left_chunks,
+        short_chunk_size=params.short_chunk_size,
+        decode_chunk_size=params.decode_chunk_len // 2,
+    )
+    return encoder
+
+
+def get_joint_encoder_layer(params: AttributeDict) -> nn.Module:
+    class TakeFirst(nn.Module):
+        def forward(self, x):
+            return x[0]
+
+    if params.use_joint_encoder_layer == "linear":
+        encoder_dim = int(params.encoder_dims.split(",")[-1])
+        joint_layer = nn.Sequential(
+            Rearrange("(c b) t d -> b t (c d)", c=params.num_channels),
+            nn.Linear(
+                params.num_channels * encoder_dim, params.num_channels * encoder_dim
+            ),
+            nn.ReLU(),
+            Rearrange("b t (c d) -> (c b) t d", c=params.num_channels),
+        )
+    elif params.use_joint_encoder_layer == "lstm":
+        encoder_dim = int(params.encoder_dims.split(",")[-1])
+        joint_layer = nn.Sequential(
+            Rearrange("(c b) t d -> b t (c d)", c=params.num_channels),
+            ScaledLSTM(
+                input_size=params.num_channels * encoder_dim,
+                hidden_size=params.num_channels * encoder_dim,
+                num_layers=1,
+                bias=True,
+                batch_first=True,
+                dropout=0.0,
+                bidirectional=False,
+            ),
+            TakeFirst(),
+            nn.ReLU(),
+            Rearrange("b t (c d) -> (c b) t d", c=params.num_channels),
+        )
+    elif params.use_joint_encoder_layer == "none":
+        joint_layer = None
+    else:
+        raise ValueError(
+            f"Unknown joint encoder layer type: {params.use_joint_encoder_layer}"
+        )
+    return joint_layer
+
+
+def get_decoder_model(params: AttributeDict) -> nn.Module:
+    decoder = Decoder(
+        vocab_size=params.vocab_size,
+        decoder_dim=params.decoder_dim,
+        blank_id=params.blank_id,
+        context_size=params.context_size,
+    )
+    return decoder
+
+
+def get_joiner_model(params: AttributeDict) -> nn.Module:
+    joiner = Joiner(
+        encoder_dim=int(params.encoder_dims.split(",")[-1]),
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+    return joiner
+
+
+def get_surt_model(
+    params: AttributeDict,
+) -> nn.Module:
+    mask_encoder = get_mask_encoder_model(params)
+    encoder = get_encoder_model(params)
+    joint_layer = get_joint_encoder_layer(params)
+    decoder = get_decoder_model(params)
+    joiner = get_joiner_model(params)
+
+    model = SURT(
+        mask_encoder=mask_encoder,
+        encoder=encoder,
+        joint_encoder_layer=joint_layer,
+        decoder=decoder,
+        joiner=joiner,
+        num_channels=params.num_channels,
+        encoder_dim=int(params.encoder_dims.split(",")[-1]),
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+    return model
+
+
+def load_checkpoint_if_available(
+    params: AttributeDict,
+    model: nn.Module,
+    model_avg: nn.Module = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+) -> Optional[Dict[str, Any]]:
+    """Load checkpoint from file.
+
+    If params.start_batch is positive, it will load the checkpoint from
+    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
+    params.start_epoch is larger than 1, it will load the checkpoint from
+    `params.start_epoch - 1`.
+
+    Apart from loading state dict for `model` and `optimizer` it also updates
+    `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
+    and `best_valid_loss` in `params`.
+
+    Args:
+      params:
+        The return value of :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer that we are using.
+      scheduler:
+        The scheduler that we are using.
+    Returns:
+      Return a dict containing previously saved training info.
+    """
+    if params.start_batch > 0:
+        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
+    elif params.start_epoch > 1:
+        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
+    else:
+        return None
+
+    assert filename.is_file(), f"{filename} does not exist!"
+
+    saved_params = load_checkpoint(
+        filename,
+        model=model,
+        model_avg=model_avg,
+        optimizer=optimizer,
+        scheduler=scheduler,
+    )
+
+    keys = [
+        "best_train_epoch",
+        "best_valid_epoch",
+        "batch_idx_train",
+        "best_train_loss",
+        "best_valid_loss",
+    ]
+    for k in keys:
+        params[k] = saved_params[k]
+
+    if params.start_batch > 0:
+        if "cur_epoch" in saved_params:
+            params["start_epoch"] = saved_params["cur_epoch"]
+
+    return saved_params
+
+
+def save_checkpoint(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    model_avg: Optional[nn.Module] = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+    sampler: Optional[CutSampler] = None,
+    scaler: Optional[GradScaler] = None,
+    rank: int = 0,
+) -> None:
+    """Save model, optimizer, scheduler and training stats to file.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer used in the training.
+      sampler:
+       The sampler for the training dataset.
+      scaler:
+        The scaler used for mix precision training.
+    """
+    if rank != 0:
+        return
+    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
+    save_checkpoint_impl(
+        filename=filename,
+        model=model,
+        model_avg=model_avg,
+        params=params,
+        optimizer=optimizer,
+        scheduler=scheduler,
+        sampler=sampler,
+        scaler=scaler,
+        rank=rank,
+    )
+
+    if params.best_train_epoch == params.cur_epoch:
+        best_train_filename = params.exp_dir / "best-train-loss.pt"
+        copyfile(src=filename, dst=best_train_filename)
+
+    if params.best_valid_epoch == params.cur_epoch:
+        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
+        copyfile(src=filename, dst=best_valid_filename)
+
+
+def compute_heat_loss(x_masked, batch, num_channels=2) -> Tensor:
+    """
+    Compute HEAT loss for separated sources using the output of mask encoder.
+    Args:
+      x_masked:
+        The output of mask encoder. It is a tensor of shape (B, T, C).
+      batch:
+        A batch of data. See `lhotse.dataset.K2SurtDatasetWithSources()`
+        for the content in it.
+      num_channels:
+        The number of output branches in the SURT model.
+    """
+    B, T, D = x_masked[0].shape
+    device = x_masked[0].device
+
+    # Create training targets for each channel.
+    targets = []
+    for i in range(num_channels):
+        target = torch.ones_like(x_masked[i]) * LOG_EPSILON
+        targets.append(target)
+
+    source_feats = batch["source_feats"]
+    source_boundaries = batch["source_boundaries"]
+    input_lens = batch["input_lens"].to(device)
+    # Assign sources to channels based on the HEAT criteria
+    for b in range(B):
+        cut_source_feats = source_feats[b]
+        cut_source_boundaries = source_boundaries[b]
+        last_seg_end = [0 for _ in range(num_channels)]
+        for source_feat, (start, end) in zip(cut_source_feats, cut_source_boundaries):
+            assigned = False
+            for i in range(num_channels):
+                if start >= last_seg_end[i]:
+                    targets[i][b, start:end, :] += source_feat.to(device)
+                    last_seg_end[i] = max(end, last_seg_end[i])
+                    assigned = True
+                    break
+            if not assigned:
+                min_end_channel = last_seg_end.index(min(last_seg_end))
+                targets[min_end_channel][b, start:end, :] += source_feat
+                last_seg_end[min_end_channel] = max(end, last_seg_end[min_end_channel])
+
+    # Get padding mask based on input lengths
+    pad_mask = torch.arange(T, device=device).expand(B, T) > input_lens.unsqueeze(1)
+    pad_mask = pad_mask.unsqueeze(-1)
+
+    # Compute masked loss for each channel
+    losses = torch.zeros((num_channels, B, T, D), device=device)
+    for i in range(num_channels):
+        loss = nn.functional.mse_loss(x_masked[i], targets[i], reduction="none")
+        # Apply padding mask to loss
+        loss.masked_fill_(pad_mask, 0)
+        losses[i] = loss
+
+    # loss: C x B x T x D. pad_mask: B x T x 1
+    # We want to compute loss for each item in the batch. Each item has loss given
+    # by the sum over C, and average over T and D. For T, we need to use the padding.
+    loss = losses.sum(0).mean(-1).sum(-1) / batch["input_lens"].to(device)
+    return loss
+
+
+def compute_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    batch: dict,
+    is_training: bool,
+) -> Tuple[Tensor, MetricsTracker]:
+    """
+    Compute RNN-T loss given the model and its inputs.
+
+    Args:
+      params:
+        Parameters for training. See :func:`get_params`.
+      model:
+        The model for training. It is an instance of Conformer in our case.
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      is_training:
+        True for training. False for validation. When it is True, this
+        function enables autograd during computation; when it is False, it
+        disables autograd.
+    """
+    device = model.device if isinstance(model, DDP) else next(model.parameters()).device
+    feature = batch["inputs"].to(device)
+    feature_lens = batch["input_lens"].to(device)
+
+    # at entry, feature is (N, T, C)
+    assert feature.ndim == 3
+
+    # The dataloader returns text as a list of cuts, each of which is a list of channel
+    # text. We flatten this to a list where all channels are together, i.e., it looks like
+    # [utt1_ch1, utt2_ch1, ..., uttN_ch1, utt1_ch2, ...., uttN,ch2].
+    text = [val for tup in zip(*batch["text"]) for val in tup]
+    assert len(text) == len(feature) * params.num_channels
+
+    # Convert all channel texts to token IDs and create a ragged tensor.
+    y = sp.encode(text, out_type=int)
+    y = k2.RaggedTensor(y).to(device)
+
+    batch_idx_train = params.batch_idx_train
+    warm_step = params.model_warm_step
+
+    with torch.set_grad_enabled(is_training):
+        (simple_loss, pruned_loss, ctc_loss, x_masked) = model(
+            x=feature,
+            x_lens=feature_lens,
+            y=y,
+            prune_range=params.prune_range,
+            am_scale=params.am_scale,
+            lm_scale=params.lm_scale,
+            reduction="none",
+            subsampling_factor=params.subsampling_factor,
+        )
+        simple_loss_is_finite = torch.isfinite(simple_loss)
+        pruned_loss_is_finite = torch.isfinite(pruned_loss)
+        ctc_loss_is_finite = torch.isfinite(ctc_loss)
+
+        # Compute HEAT loss
+        if is_training and params.heat_loss_scale > 0.0:
+            heat_loss = compute_heat_loss(
+                x_masked, batch, num_channels=params.num_channels
+            )
+        else:
+            heat_loss = torch.tensor(0.0, device=device)
+
+        heat_loss_is_finite = torch.isfinite(heat_loss)
+        is_finite = (
+            simple_loss_is_finite
+            & pruned_loss_is_finite
+            & ctc_loss_is_finite
+            & heat_loss_is_finite
+        )
+        if not torch.all(is_finite):
+            logging.info(
+                "Not all losses are finite!\n"
+                f"simple_losses: {simple_loss}\n"
+                f"pruned_losses: {pruned_loss}\n"
+                f"ctc_losses: {ctc_loss}\n"
+                f"heat_losses: {heat_loss}\n"
+            )
+            display_and_save_batch(batch, params=params, sp=sp)
+            simple_loss = simple_loss[simple_loss_is_finite]
+            pruned_loss = pruned_loss[pruned_loss_is_finite]
+            ctc_loss = ctc_loss[ctc_loss_is_finite]
+            heat_loss = heat_loss[heat_loss_is_finite]
+
+            # If either all simple_loss or pruned_loss is inf or nan,
+            # we stop the training process by raising an exception
+            if (
+                torch.all(~simple_loss_is_finite)
+                or torch.all(~pruned_loss_is_finite)
+                or torch.all(~ctc_loss_is_finite)
+                or torch.all(~heat_loss_is_finite)
+            ):
+                raise ValueError(
+                    "There are too many utterances in this batch "
+                    "leading to inf or nan losses."
+                )
+
+        simple_loss_sum = simple_loss.sum()
+        pruned_loss_sum = pruned_loss.sum()
+        ctc_loss_sum = ctc_loss.sum()
+        heat_loss_sum = heat_loss.sum()
+
+        s = params.simple_loss_scale
+        # take down the scale on the simple loss from 1.0 at the start
+        # to params.simple_loss scale by warm_step.
+        simple_loss_scale = (
+            s
+            if batch_idx_train >= warm_step
+            else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
+        )
+        pruned_loss_scale = (
+            1.0
+            if batch_idx_train >= warm_step
+            else 0.1 + 0.9 * (batch_idx_train / warm_step)
+        )
+        loss = (
+            simple_loss_scale * simple_loss_sum
+            + pruned_loss_scale * pruned_loss_sum
+            + params.ctc_loss_scale * ctc_loss_sum
+            + params.heat_loss_scale * heat_loss_sum
+        )
+
+    assert loss.requires_grad == is_training
+
+    info = MetricsTracker()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        # info["frames"] is an approximate number for two reasons:
+        # (1) The acutal subsampling factor is ((lens - 1) // 2 - 1) // 2
+        # (2) If some utterances in the batch lead to inf/nan loss, they
+        #     are filtered out.
+        info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+
+    # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances`  # noqa
+    info["utterances"] = feature.size(0)
+    # averaged input duration in frames over utterances
+    info["utt_duration"] = feature_lens.sum().item()
+    # averaged padding proportion over utterances
+    info["utt_pad_proportion"] = (
+        ((feature.size(1) - feature_lens) / feature.size(1)).sum().item()
+    )
+
+    # Note: We use reduction=sum while computing the loss.
+    info["loss"] = loss.detach().cpu().item()
+    info["simple_loss"] = simple_loss_sum.detach().cpu().item()
+    info["pruned_loss"] = pruned_loss_sum.detach().cpu().item()
+    if params.ctc_loss_scale > 0.0:
+        info["ctc_loss"] = ctc_loss_sum.detach().cpu().item()
+    if params.heat_loss_scale > 0.0:
+        info["heat_loss"] = heat_loss_sum.detach().cpu().item()
+
+    return loss, info
+
+
+def compute_validation_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    valid_dl: torch.utils.data.DataLoader,
+    world_size: int = 1,
+) -> MetricsTracker:
+    """Run the validation process."""
+    model.eval()
+
+    tot_loss = MetricsTracker()
+
+    for batch_idx, batch in enumerate(valid_dl):
+        loss, loss_info = compute_loss(
+            params=params,
+            model=model,
+            sp=sp,
+            batch=batch,
+            is_training=False,
+        )
+        assert loss.requires_grad is False
+        tot_loss = tot_loss + loss_info
+
+    if world_size > 1:
+        tot_loss.reduce(loss.device)
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    if loss_value < params.best_valid_loss:
+        params.best_valid_epoch = params.cur_epoch
+        params.best_valid_loss = loss_value
+
+    return tot_loss
+
+
+def train_one_epoch(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    optimizer: torch.optim.Optimizer,
+    scheduler: LRSchedulerType,
+    sp: spm.SentencePieceProcessor,
+    train_dl: torch.utils.data.DataLoader,
+    train_dl_warmup: Optional[torch.utils.data.DataLoader],
+    valid_dl: torch.utils.data.DataLoader,
+    scaler: GradScaler,
+    model_avg: Optional[nn.Module] = None,
+    tb_writer: Optional[SummaryWriter] = None,
+    world_size: int = 1,
+    rank: int = 0,
+) -> None:
+    """Train the model for one epoch.
+
+    The training loss from the mean of all frames is saved in
+    `params.train_loss`. It runs the validation process every
+    `params.valid_interval` batches.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The model for training.
+      optimizer:
+        The optimizer we are using.
+      scheduler:
+        The learning rate scheduler, we call step() every step.
+      train_dl:
+        Dataloader for the training dataset.
+      train_dl_warmup:
+        Dataloader for the training dataset with 2 speakers. This is used during the
+        warmup stage.
+      valid_dl:
+        Dataloader for the validation dataset.
+      scaler:
+        The scaler used for mix precision training.
+      model_avg:
+        The stored model averaged from the start of training.
+      tb_writer:
+        Writer to write log messages to tensorboard.
+      world_size:
+        Number of nodes in DDP training. If it is 1, DDP is disabled.
+      rank:
+        The rank of the node in DDP training. If no DDP is used, it should
+        be set to 0.
+    """
+    torch.cuda.empty_cache()
+    model.train()
+
+    tot_loss = MetricsTracker()
+
+    iter_train = iter(train_dl)
+    iter_train_warmup = iter(train_dl_warmup) if train_dl_warmup is not None else None
+
+    batch_idx = 0
+
+    while True:
+        # We first sample a batch from the main dataset. This is because we want to
+        # make sure all epochs have the same number of batches.
+        try:
+            batch = next(iter_train)
+        except StopIteration:
+            break
+
+        # If we are in warmup stage, get the batch from the warmup dataset.
+        if (
+            params.batch_idx_train <= params.model_warm_step
+            and iter_train_warmup is not None
+        ):
+            try:
+                batch = next(iter_train_warmup)
+            except StopIteration:
+                iter_train_warmup = iter(train_dl_warmup)
+                batch = next(iter_train_warmup)
+
+        batch_idx += 1
+
+        params.batch_idx_train += 1
+        batch_size = batch["inputs"].shape[0]
+
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, loss_info = compute_loss(
+                    params=params,
+                    model=model,
+                    sp=sp,
+                    batch=batch,
+                    is_training=True,
+                )
+            # summary stats
+            tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
+
+            # NOTE: We use reduction==sum and loss is computed over utterances
+            # in the batch and there is no normalization to it so far.
+            scaler.scale(loss).backward()
+            set_batch_count(model, params.batch_idx_train)
+            scheduler.step_batch(params.batch_idx_train)
+
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.zero_grad()
+        except:  # noqa
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise
+
+        if params.print_diagnostics and batch_idx == 5:
+            return
+
+        if (
+            rank == 0
+            and params.batch_idx_train > 0
+            and params.batch_idx_train % params.average_period == 0
+        ):
+            update_averaged_model(
+                params=params,
+                model_cur=model,
+                model_avg=model_avg,
+            )
+
+        if (
+            params.batch_idx_train > 0
+            and params.batch_idx_train % params.save_every_n == 0
+        ):
+            params.cur_batch_idx = batch_idx
+            save_checkpoint_with_global_batch_idx(
+                out_dir=params.exp_dir,
+                global_batch_idx=params.batch_idx_train,
+                model=model,
+                model_avg=model_avg,
+                params=params,
+                optimizer=optimizer,
+                scheduler=scheduler,
+                sampler=train_dl.sampler,
+                scaler=scaler,
+                rank=rank,
+            )
+            del params.cur_batch_idx
+            remove_checkpoints(
+                out_dir=params.exp_dir,
+                topk=params.keep_last_k,
+                rank=rank,
+            )
+
+        if batch_idx % 100 == 0 and params.use_fp16:
+            # If the grad scale was less than 1, try increasing it.    The _growth_interval
+            # of the grad scaler is configurable, but we can't configure it to have different
+            # behavior depending on the current grad scale.
+            cur_grad_scale = scaler._scale.item()
+            if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
+                scaler.update(cur_grad_scale * 2.0)
+            if cur_grad_scale < 0.01:
+                logging.warning(f"Grad scale is small: {cur_grad_scale}")
+            if cur_grad_scale < 1.0e-05:
+                raise RuntimeError(
+                    f"grad_scale is too small, exiting: {cur_grad_scale}"
+                )
+
+        if batch_idx % params.log_interval == 0:
+            cur_lr = scheduler.get_last_lr()[0]
+            cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
+
+            logging.info(
+                f"Epoch {params.cur_epoch}, "
+                f"batch {batch_idx}, loss[{loss_info}], "
+                f"tot_loss[{tot_loss}], batch size: {batch_size}, "
+                f"lr: {cur_lr:.2e}, "
+                + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
+            )
+
+            if tb_writer is not None:
+                tb_writer.add_scalar(
+                    "train/learning_rate", cur_lr, params.batch_idx_train
+                )
+
+                loss_info.write_summary(
+                    tb_writer, "train/current_", params.batch_idx_train
+                )
+                tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
+                if params.use_fp16:
+                    tb_writer.add_scalar(
+                        "train/grad_scale", cur_grad_scale, params.batch_idx_train
+                    )
+
+        if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
+            logging.info("Computing validation loss")
+            valid_info = compute_validation_loss(
+                params=params,
+                model=model,
+                sp=sp,
+                valid_dl=valid_dl,
+                world_size=world_size,
+            )
+            model.train()
+            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
+            logging.info(
+                f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
+            )
+            if tb_writer is not None:
+                valid_info.write_summary(
+                    tb_writer, "train/valid_", params.batch_idx_train
+                )
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    params.train_loss = loss_value
+    if params.train_loss < params.best_train_loss:
+        params.best_train_epoch = params.cur_epoch
+        params.best_train_loss = params.train_loss
+
+
+def run(rank, world_size, args):
+    """
+    Args:
+      rank:
+        It is a value between 0 and `world_size-1`, which is
+        passed automatically by `mp.spawn()` in :func:`main`.
+        The node with rank 0 is responsible for saving checkpoint.
+      world_size:
+        Number of GPUs for DDP training.
+      args:
+        The return value of get_parser().parse_args()
+    """
+    params = get_params()
+    params.update(vars(args))
+
+    fix_random_seed(params.seed)
+    if world_size > 1:
+        setup_dist(rank, world_size, params.master_port)
+
+    setup_logger(f"{params.exp_dir}/log/log-train")
+    logging.info("Training started")
+
+    if args.tensorboard and rank == 0:
+        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
+    else:
+        tb_writer = None
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", rank)
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_surt_model(params)
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    assert params.save_every_n >= params.average_period
+    model_avg: Optional[nn.Module] = None
+    if rank == 0:
+        # model_avg is only used with rank 0
+        model_avg = copy.deepcopy(model)
+
+    assert params.start_epoch > 0, params.start_epoch
+    checkpoints = load_checkpoint_if_available(
+        params=params, model=model, model_avg=model_avg
+    )
+
+    model.to(device)
+
+    if checkpoints is None and params.model_init_ckpt is not None:
+        logging.info(
+            f"Initializing model with checkpoint from {params.model_init_ckpt}"
+        )
+        init_ckpt = torch.load(params.model_init_ckpt, map_location=device)
+        model.load_state_dict(init_ckpt["model"], strict=False)
+
+    if world_size > 1:
+        logging.info("Using DDP")
+        model = DDP(model, device_ids=[rank], find_unused_parameters=True)
+
+    parameters_names = []
+    parameters_names.append(
+        [name_param_pair[0] for name_param_pair in model.named_parameters()]
+    )
+    optimizer = ScaledAdam(
+        model.parameters(),
+        lr=params.base_lr,
+        clipping_scale=2.0,
+        parameters_names=parameters_names,
+    )
+
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
+
+    if checkpoints and "optimizer" in checkpoints:
+        logging.info("Loading optimizer state dict")
+        optimizer.load_state_dict(checkpoints["optimizer"])
+
+    if (
+        checkpoints
+        and "scheduler" in checkpoints
+        and checkpoints["scheduler"] is not None
+    ):
+        logging.info("Loading scheduler state dict")
+        scheduler.load_state_dict(checkpoints["scheduler"])
+
+    if params.print_diagnostics:
+        diagnostic = diagnostics.attach_diagnostics(model)
+
+    libricss = LibriCssAsrDataModule(args)
+
+    train_cuts = libricss.lsmix_cuts(rvb_affix="comb", type_affix="full", sources=True)
+    train_cuts_ov40 = libricss.lsmix_cuts(
+        rvb_affix="comb", type_affix="ov40", sources=True
+    )
+    dev_cuts = libricss.libricss_cuts(split="dev", type="sdm")
+
+    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
+        # We only load the sampler's state dict when it loads a checkpoint
+        # saved in the middle of an epoch
+        sampler_state_dict = checkpoints["sampler"]
+    else:
+        sampler_state_dict = None
+
+    train_dl = libricss.train_dataloaders(
+        train_cuts,
+        sampler_state_dict=sampler_state_dict,
+    )
+    train_dl_ov40 = libricss.train_dataloaders(train_cuts_ov40)
+    valid_dl = libricss.valid_dataloaders(dev_cuts)
+
+    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
+    if checkpoints and "grad_scaler" in checkpoints:
+        logging.info("Loading grad scaler state dict")
+        scaler.load_state_dict(checkpoints["grad_scaler"])
+
+    for epoch in range(params.start_epoch, params.num_epochs + 1):
+        scheduler.step_epoch(epoch - 1)
+        fix_random_seed(params.seed + epoch - 1)
+        train_dl.sampler.set_epoch(epoch - 1)
+
+        if tb_writer is not None:
+            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
+
+        params.cur_epoch = epoch
+
+        train_one_epoch(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sp=sp,
+            train_dl=train_dl,
+            train_dl_warmup=train_dl_ov40,
+            valid_dl=valid_dl,
+            scaler=scaler,
+            tb_writer=tb_writer,
+            world_size=world_size,
+            rank=rank,
+        )
+
+        if params.print_diagnostics:
+            diagnostic.print_diagnostics()
+            break
+
+        save_checkpoint(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sampler=train_dl.sampler,
+            scaler=scaler,
+            rank=rank,
+        )
+
+    logging.info("Done!")
+
+    if world_size > 1:
+        torch.distributed.barrier()
+        cleanup_dist()
+
+
+def display_and_save_batch(
+    batch: dict,
+    params: AttributeDict,
+    sp: spm.SentencePieceProcessor,
+) -> None:
+    """Display the batch statistics and save the batch into disk.
+
+    Args:
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      params:
+        Parameters for training. See :func:`get_params`.
+      sp:
+        The BPE model.
+    """
+    from lhotse.utils import uuid4
+
+    filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
+    logging.info(f"Saving batch to {filename}")
+    torch.save(batch, filename)
+
+    features = batch["inputs"]
+
+    logging.info(f"features shape: {features.shape}")
+
+    y = [sp.encode(text_ch) for text_ch in batch["text"]]
+    num_tokens = [sum(len(yi) for yi in y_ch) for y_ch in y]
+    logging.info(f"num tokens: {num_tokens}")
+
+
+def main():
+    parser = get_parser()
+    LibriCssAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    world_size = args.world_size
+    assert world_size >= 1
+    if world_size > 1:
+        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
+    else:
+        run(rank=0, world_size=1, args=args)
+
+
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/libricss/SURT/dprnn_zipformer/train_adapt.py b/egs/libricss/SURT/dprnn_zipformer/train_adapt.py
new file mode 100755
index 000000000..1c1b0c28c
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/train_adapt.py
@@ -0,0 +1,1343 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                  Wei Kang,
+#                                                  Mingshuang Luo,)
+#                                                  Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+
+export CUDA_VISIBLE_DEVICES=0
+
+./dprnn_zipformer/train.py \
+  --world-size 1 \
+  --num-epochs 15 \
+  --start-epoch 1 \
+  --exp-dir dprnn_zipformer/exp \
+  --max-duration 300
+
+# For mix precision training:
+
+./dprnn_zipformer/train.py \
+  --world-size 1 \
+  --num-epochs 30 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir dprnn_zipformer/exp \
+  --max-duration 550
+"""
+
+import argparse
+import copy
+import logging
+import warnings
+from itertools import chain
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple, Union
+
+import k2
+import optim
+import sentencepiece as spm
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+from asr_datamodule import LibriCssAsrDataModule
+from decoder import Decoder
+from dprnn import DPRNN
+from einops.layers.torch import Rearrange
+from joiner import Joiner
+from lhotse.cut import Cut
+from lhotse.dataset.sampling.base import CutSampler
+from lhotse.utils import LOG_EPSILON, fix_random_seed
+from model import SURT
+from optim import Eden, ScaledAdam
+from scaling import ScaledLinear, ScaledLSTM
+from torch import Tensor
+from torch.cuda.amp import GradScaler
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+from zipformer import Zipformer
+
+from icefall import diagnostics
+from icefall.checkpoint import load_checkpoint, remove_checkpoints
+from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
+from icefall.checkpoint import (
+    save_checkpoint_with_global_batch_idx,
+    update_averaged_model,
+)
+from icefall.dist import cleanup_dist, setup_dist
+from icefall.env import get_env_info
+from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+
+LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
+
+
+def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
+    if isinstance(model, DDP):
+        # get underlying nn.Module
+        model = model.module
+    for module in model.modules():
+        if hasattr(module, "batch_count"):
+            module.batch_count = batch_count
+
+
+def add_model_arguments(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--num-mask-encoder-layers",
+        type=int,
+        default=4,
+        help="Number of layers in the DPRNN based mask encoder.",
+    )
+
+    parser.add_argument(
+        "--mask-encoder-dim",
+        type=int,
+        default=256,
+        help="Hidden dimension of the LSTM blocks in DPRNN.",
+    )
+
+    parser.add_argument(
+        "--mask-encoder-segment-size",
+        type=int,
+        default=32,
+        help="Segment size of the SegLSTM in DPRNN. Ideally, this should be equal to the "
+        "decode-chunk-length of the zipformer encoder.",
+    )
+
+    parser.add_argument(
+        "--chunk-width-randomization",
+        type=bool,
+        default=False,
+        help="Whether to randomize the chunk width in DPRNN.",
+    )
+
+    # Zipformer config is based on:
+    # https://github.com/k2-fsa/icefall/pull/745#issuecomment-1405282740
+    parser.add_argument(
+        "--num-encoder-layers",
+        type=str,
+        default="2,2,2,2,2",
+        help="Number of zipformer encoder layers, comma separated.",
+    )
+
+    parser.add_argument(
+        "--feedforward-dims",
+        type=str,
+        default="768,768,768,768,768",
+        help="Feedforward dimension of the zipformer encoder layers, comma separated.",
+    )
+
+    parser.add_argument(
+        "--nhead",
+        type=str,
+        default="8,8,8,8,8",
+        help="Number of attention heads in the zipformer encoder layers.",
+    )
+
+    parser.add_argument(
+        "--encoder-dims",
+        type=str,
+        default="256,256,256,256,256",
+        help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated",
+    )
+
+    parser.add_argument(
+        "--attention-dims",
+        type=str,
+        default="192,192,192,192,192",
+        help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated;
+        not the same as embedding dimension.""",
+    )
+
+    parser.add_argument(
+        "--encoder-unmasked-dims",
+        type=str,
+        default="192,192,192,192,192",
+        help="Unmasked dimensions in the encoders, relates to augmentation during training.  "
+        "Must be <= each of encoder_dims.  Empirically, less than 256 seems to make performance "
+        " worse.",
+    )
+
+    parser.add_argument(
+        "--zipformer-downsampling-factors",
+        type=str,
+        default="1,2,4,8,2",
+        help="Downsampling factor for each stack of encoder layers.",
+    )
+
+    parser.add_argument(
+        "--cnn-module-kernels",
+        type=str,
+        default="31,31,31,31,31",
+        help="Sizes of kernels in convolution modules",
+    )
+
+    parser.add_argument(
+        "--use-joint-encoder-layer",
+        type=str,
+        default="lstm",
+        choices=["linear", "lstm", "none"],
+        help="Whether to use a joint layer to combine all branches.",
+    )
+
+    parser.add_argument(
+        "--decoder-dim",
+        type=int,
+        default=512,
+        help="Embedding dimension in the decoder model.",
+    )
+
+    parser.add_argument(
+        "--joiner-dim",
+        type=int,
+        default=512,
+        help="""Dimension used in the joiner model.
+        Outputs from the encoder and decoder model are projected
+        to this dimension before adding.
+        """,
+    )
+
+    parser.add_argument(
+        "--short-chunk-size",
+        type=int,
+        default=50,
+        help="""Chunk length of dynamic training, the chunk size would be either
+        max sequence length of current batch or uniformly sampled from (1, short_chunk_size).
+        """,
+    )
+
+    parser.add_argument(
+        "--num-left-chunks",
+        type=int,
+        default=4,
+        help="How many left context can be seen in chunks when calculating attention.",
+    )
+
+    parser.add_argument(
+        "--decode-chunk-len",
+        type=int,
+        default=32,
+        help="The chunk size for decoding (in frames before subsampling)",
+    )
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--world-size",
+        type=int,
+        default=1,
+        help="Number of GPUs for DDP training.",
+    )
+
+    parser.add_argument(
+        "--master-port",
+        type=int,
+        default=12354,
+        help="Master port to use for DDP training.",
+    )
+
+    parser.add_argument(
+        "--tensorboard",
+        type=str2bool,
+        default=True,
+        help="Should various information be logged in tensorboard.",
+    )
+
+    parser.add_argument(
+        "--num-epochs",
+        type=int,
+        default=15,
+        help="Number of epochs to train.",
+    )
+
+    parser.add_argument(
+        "--start-epoch",
+        type=int,
+        default=1,
+        help="""Resume training from this epoch. It should be positive.
+        If larger than 1, it will load checkpoint from
+        exp-dir/epoch-{start_epoch-1}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--start-batch",
+        type=int,
+        default=0,
+        help="""If positive, --start-epoch is ignored and
+        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="conv_lstm_transducer_stateless_ctc/exp",
+        help="""The experiment dir.
+        It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--model-init-ckpt",
+        type=str,
+        default=None,
+        help="""The model checkpoint to initialize the model (either full or part).
+        If not specified, the model is randomly initialized.
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--base-lr", type=float, default=0.0004, help="The base learning rate."
+    )
+
+    parser.add_argument(
+        "--lr-batches",
+        type=float,
+        default=1000,
+        help="""Number of steps that affects how rapidly the learning rate
+        decreases. We suggest not to change this.""",
+    )
+
+    parser.add_argument(
+        "--lr-epochs",
+        type=float,
+        default=2,
+        help="""Number of epochs that affects how rapidly the learning rate decreases.
+        """,
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+
+    parser.add_argument(
+        "--prune-range",
+        type=int,
+        default=5,
+        help="The prune range for rnnt loss, it means how many symbols(context)"
+        "we are using to compute the loss",
+    )
+
+    parser.add_argument(
+        "--lm-scale",
+        type=float,
+        default=0.25,
+        help="The scale to smooth the loss with lm "
+        "(output of prediction network) part.",
+    )
+
+    parser.add_argument(
+        "--am-scale",
+        type=float,
+        default=0.0,
+        help="The scale to smooth the loss with am (output of encoder network) part.",
+    )
+
+    parser.add_argument(
+        "--simple-loss-scale",
+        type=float,
+        default=0.5,
+        help="To get pruning ranges, we will calculate a simple version"
+        "loss(joiner is just addition), this simple loss also uses for"
+        "training (as a regularization item). We will scale the simple loss"
+        "with this parameter before adding to the final loss.",
+    )
+
+    parser.add_argument(
+        "--ctc-loss-scale",
+        type=float,
+        default=0.2,
+        help="Scale for CTC loss.",
+    )
+
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="The seed for random generators intended for reproducibility",
+    )
+
+    parser.add_argument(
+        "--print-diagnostics",
+        type=str2bool,
+        default=False,
+        help="Accumulate stats on activations, print them and exit.",
+    )
+
+    parser.add_argument(
+        "--save-every-n",
+        type=int,
+        default=1000,
+        help="""Save checkpoint after processing this number of batches"
+        periodically. We save checkpoint to exp-dir/ whenever
+        params.batch_idx_train % save_every_n == 0. The checkpoint filename
+        has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
+        Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
+        end of each epoch where `xxx` is the epoch number counting from 0.
+        """,
+    )
+
+    parser.add_argument(
+        "--keep-last-k",
+        type=int,
+        default=5,
+        help="""Only keep this number of checkpoints on disk.
+        For instance, if it is 3, there are only 3 checkpoints
+        in the exp-dir with filenames `checkpoint-xxx.pt`.
+        It does not affect checkpoints with name `epoch-xxx.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--average-period",
+        type=int,
+        default=100,
+        help="""Update the averaged model, namely `model_avg`, after processing
+        this number of batches. `model_avg` is a separate version of model,
+        in which each floating-point parameter is the average of all the
+        parameters from the start of training. Each time we take the average,
+        we do: `model_avg = model * (average_period / batch_idx_train) +
+            model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
+        """,
+    )
+
+    parser.add_argument(
+        "--use-fp16",
+        type=str2bool,
+        default=False,
+        help="Whether to use half precision training.",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def get_params() -> AttributeDict:
+    """Return a dict containing training parameters.
+
+    All training related parameters that are not passed from the commandline
+    are saved in the variable `params`.
+
+    Commandline options are merged into `params` after they are parsed, so
+    you can also access them via `params`.
+
+    Explanation of options saved in `params`:
+
+        - best_train_loss: Best training loss so far. It is used to select
+                           the model that has the lowest training loss. It is
+                           updated during the training.
+
+        - best_valid_loss: Best validation loss so far. It is used to select
+                           the model that has the lowest validation loss. It is
+                           updated during the training.
+
+        - best_train_epoch: It is the epoch that has the best training loss.
+
+        - best_valid_epoch: It is the epoch that has the best validation loss.
+
+        - batch_idx_train: Used to writing statistics to tensorboard. It
+                           contains number of batches trained so far across
+                           epochs.
+
+        - log_interval:  Print training loss if batch_idx % log_interval` is 0
+
+        - reset_interval: Reset statistics if batch_idx % reset_interval is 0
+
+        - valid_interval:  Run validation if batch_idx % valid_interval is 0
+
+        - feature_dim: The model input dim. It has to match the one used
+                       in computing features.
+
+        - subsampling_factor:  The subsampling factor for the model.
+
+        - num_decoder_layers: Number of decoder layer of transformer decoder.
+
+        - warm_step: The warm_step for Noam optimizer.
+    """
+    params = AttributeDict(
+        {
+            "best_train_loss": float("inf"),
+            "best_valid_loss": float("inf"),
+            "best_train_epoch": -1,
+            "best_valid_epoch": -1,
+            "batch_idx_train": 0,
+            "log_interval": 10,
+            "reset_interval": 200,
+            "valid_interval": 100,
+            # parameters for SURT
+            "num_channels": 2,
+            "feature_dim": 80,
+            "subsampling_factor": 4,  # not passed in, this is fixed
+            # parameters for Noam
+            "model_warm_step": 5000,  # arg given to model, not for lrate
+            # parameters for ctc loss
+            "beam_size": 10,
+            "use_double_scores": True,
+            "env_info": get_env_info(),
+        }
+    )
+
+    return params
+
+
+def get_mask_encoder_model(params: AttributeDict) -> nn.Module:
+    mask_encoder = DPRNN(
+        feature_dim=params.feature_dim,
+        input_size=params.mask_encoder_dim,
+        hidden_size=params.mask_encoder_dim,
+        output_size=params.feature_dim * params.num_channels,
+        segment_size=params.mask_encoder_segment_size,
+        num_blocks=params.num_mask_encoder_layers,
+        chunk_width_randomization=params.chunk_width_randomization,
+    )
+    return mask_encoder
+
+
+def get_encoder_model(params: AttributeDict) -> nn.Module:
+    # TODO: We can add an option to switch between Zipformer and Transformer
+    def to_int_tuple(s: str):
+        return tuple(map(int, s.split(",")))
+
+    encoder = Zipformer(
+        num_features=params.feature_dim,
+        output_downsampling_factor=2,
+        zipformer_downsampling_factors=to_int_tuple(
+            params.zipformer_downsampling_factors
+        ),
+        encoder_dims=to_int_tuple(params.encoder_dims),
+        attention_dim=to_int_tuple(params.attention_dims),
+        encoder_unmasked_dims=to_int_tuple(params.encoder_unmasked_dims),
+        nhead=to_int_tuple(params.nhead),
+        feedforward_dim=to_int_tuple(params.feedforward_dims),
+        cnn_module_kernels=to_int_tuple(params.cnn_module_kernels),
+        num_encoder_layers=to_int_tuple(params.num_encoder_layers),
+        num_left_chunks=params.num_left_chunks,
+        short_chunk_size=params.short_chunk_size,
+        decode_chunk_size=params.decode_chunk_len // 2,
+    )
+    return encoder
+
+
+def get_joint_encoder_layer(params: AttributeDict) -> nn.Module:
+    class TakeFirst(nn.Module):
+        def forward(self, x):
+            return x[0]
+
+    if params.use_joint_encoder_layer == "linear":
+        encoder_dim = int(params.encoder_dims.split(",")[-1])
+        joint_layer = nn.Sequential(
+            Rearrange("(c b) t d -> b t (c d)", c=params.num_channels),
+            nn.Linear(
+                params.num_channels * encoder_dim, params.num_channels * encoder_dim
+            ),
+            nn.ReLU(),
+            Rearrange("b t (c d) -> (c b) t d", c=params.num_channels),
+        )
+    elif params.use_joint_encoder_layer == "lstm":
+        encoder_dim = int(params.encoder_dims.split(",")[-1])
+        joint_layer = nn.Sequential(
+            Rearrange("(c b) t d -> b t (c d)", c=params.num_channels),
+            ScaledLSTM(
+                input_size=params.num_channels * encoder_dim,
+                hidden_size=params.num_channels * encoder_dim,
+                num_layers=1,
+                bias=True,
+                batch_first=True,
+                dropout=0.0,
+                bidirectional=False,
+            ),
+            TakeFirst(),
+            nn.ReLU(),
+            Rearrange("b t (c d) -> (c b) t d", c=params.num_channels),
+        )
+    elif params.use_joint_encoder_layer == "none":
+        joint_layer = None
+    else:
+        raise ValueError(
+            f"Unknown joint encoder layer type: {params.use_joint_encoder_layer}"
+        )
+    return joint_layer
+
+
+def get_decoder_model(params: AttributeDict) -> nn.Module:
+    decoder = Decoder(
+        vocab_size=params.vocab_size,
+        decoder_dim=params.decoder_dim,
+        blank_id=params.blank_id,
+        context_size=params.context_size,
+    )
+    return decoder
+
+
+def get_joiner_model(params: AttributeDict) -> nn.Module:
+    joiner = Joiner(
+        encoder_dim=int(params.encoder_dims.split(",")[-1]),
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+    return joiner
+
+
+def get_surt_model(
+    params: AttributeDict,
+) -> nn.Module:
+    mask_encoder = get_mask_encoder_model(params)
+    encoder = get_encoder_model(params)
+    joint_layer = get_joint_encoder_layer(params)
+    decoder = get_decoder_model(params)
+    joiner = get_joiner_model(params)
+
+    model = SURT(
+        mask_encoder=mask_encoder,
+        encoder=encoder,
+        joint_encoder_layer=joint_layer,
+        decoder=decoder,
+        joiner=joiner,
+        num_channels=params.num_channels,
+        encoder_dim=int(params.encoder_dims.split(",")[-1]),
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+    return model
+
+
+def load_checkpoint_if_available(
+    params: AttributeDict,
+    model: nn.Module,
+    model_avg: nn.Module = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+) -> Optional[Dict[str, Any]]:
+    """Load checkpoint from file.
+
+    If params.start_batch is positive, it will load the checkpoint from
+    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
+    params.start_epoch is larger than 1, it will load the checkpoint from
+    `params.start_epoch - 1`.
+
+    Apart from loading state dict for `model` and `optimizer` it also updates
+    `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
+    and `best_valid_loss` in `params`.
+
+    Args:
+      params:
+        The return value of :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer that we are using.
+      scheduler:
+        The scheduler that we are using.
+    Returns:
+      Return a dict containing previously saved training info.
+    """
+    if params.start_batch > 0:
+        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
+    elif params.start_epoch > 1:
+        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
+    else:
+        return None
+
+    assert filename.is_file(), f"{filename} does not exist!"
+
+    saved_params = load_checkpoint(
+        filename,
+        model=model,
+        model_avg=model_avg,
+        optimizer=optimizer,
+        scheduler=scheduler,
+    )
+
+    keys = [
+        "best_train_epoch",
+        "best_valid_epoch",
+        "batch_idx_train",
+        "best_train_loss",
+        "best_valid_loss",
+    ]
+    for k in keys:
+        params[k] = saved_params[k]
+
+    if params.start_batch > 0:
+        if "cur_epoch" in saved_params:
+            params["start_epoch"] = saved_params["cur_epoch"]
+
+    return saved_params
+
+
+def save_checkpoint(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    model_avg: Optional[nn.Module] = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+    sampler: Optional[CutSampler] = None,
+    scaler: Optional[GradScaler] = None,
+    rank: int = 0,
+) -> None:
+    """Save model, optimizer, scheduler and training stats to file.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer used in the training.
+      sampler:
+       The sampler for the training dataset.
+      scaler:
+        The scaler used for mix precision training.
+    """
+    if rank != 0:
+        return
+    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
+    save_checkpoint_impl(
+        filename=filename,
+        model=model,
+        model_avg=model_avg,
+        params=params,
+        optimizer=optimizer,
+        scheduler=scheduler,
+        sampler=sampler,
+        scaler=scaler,
+        rank=rank,
+    )
+
+    if params.best_train_epoch == params.cur_epoch:
+        best_train_filename = params.exp_dir / "best-train-loss.pt"
+        copyfile(src=filename, dst=best_train_filename)
+
+    if params.best_valid_epoch == params.cur_epoch:
+        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
+        copyfile(src=filename, dst=best_valid_filename)
+
+
+def compute_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    batch: dict,
+    is_training: bool,
+) -> Tuple[Tensor, MetricsTracker]:
+    """
+    Compute RNN-T loss given the model and its inputs.
+
+    Args:
+      params:
+        Parameters for training. See :func:`get_params`.
+      model:
+        The model for training. It is an instance of Conformer in our case.
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      is_training:
+        True for training. False for validation. When it is True, this
+        function enables autograd during computation; when it is False, it
+        disables autograd.
+    """
+    device = model.device if isinstance(model, DDP) else next(model.parameters()).device
+    feature = batch["inputs"].to(device)
+    feature_lens = batch["input_lens"].to(device)
+
+    # at entry, feature is (N, T, C)
+    assert feature.ndim == 3
+
+    # The dataloader returns text as a list of cuts, each of which is a list of channel
+    # text. We flatten this to a list where all channels are together, i.e., it looks like
+    # [utt1_ch1, utt2_ch1, ..., uttN_ch1, utt1_ch2, ...., uttN,ch2].
+    text = [val for tup in zip(*batch["text"]) for val in tup]
+    assert len(text) == len(feature) * params.num_channels
+
+    # Convert all channel texts to token IDs and create a ragged tensor.
+    y = sp.encode(text, out_type=int)
+    y = k2.RaggedTensor(y).to(device)
+
+    batch_idx_train = params.batch_idx_train
+    warm_step = params.model_warm_step
+
+    with torch.set_grad_enabled(is_training):
+        (simple_loss, pruned_loss, ctc_loss, x_masked) = model(
+            x=feature,
+            x_lens=feature_lens,
+            y=y,
+            prune_range=params.prune_range,
+            am_scale=params.am_scale,
+            lm_scale=params.lm_scale,
+            reduction="none",
+            subsampling_factor=params.subsampling_factor,
+        )
+        simple_loss_is_finite = torch.isfinite(simple_loss)
+        pruned_loss_is_finite = torch.isfinite(pruned_loss)
+        ctc_loss_is_finite = torch.isfinite(ctc_loss)
+
+        is_finite = simple_loss_is_finite & pruned_loss_is_finite & ctc_loss_is_finite
+        if not torch.all(is_finite):
+            logging.info(
+                "Not all losses are finite!\n"
+                f"simple_losses: {simple_loss}\n"
+                f"pruned_losses: {pruned_loss}\n"
+                f"ctc_losses: {ctc_loss}\n"
+            )
+            display_and_save_batch(batch, params=params, sp=sp)
+            simple_loss = simple_loss[simple_loss_is_finite]
+            pruned_loss = pruned_loss[pruned_loss_is_finite]
+            ctc_loss = ctc_loss[ctc_loss_is_finite]
+
+            # If either all simple_loss or pruned_loss is inf or nan,
+            # we stop the training process by raising an exception
+            if (
+                torch.all(~simple_loss_is_finite)
+                or torch.all(~pruned_loss_is_finite)
+                or torch.all(~ctc_loss_is_finite)
+            ):
+                raise ValueError(
+                    "There are too many utterances in this batch "
+                    "leading to inf or nan losses."
+                )
+
+        simple_loss_sum = simple_loss.sum()
+        pruned_loss_sum = pruned_loss.sum()
+        ctc_loss_sum = ctc_loss.sum()
+
+        s = params.simple_loss_scale
+        # take down the scale on the simple loss from 1.0 at the start
+        # to params.simple_loss scale by warm_step.
+        simple_loss_scale = (
+            s
+            if batch_idx_train >= warm_step
+            else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
+        )
+        pruned_loss_scale = (
+            1.0
+            if batch_idx_train >= warm_step
+            else 0.1 + 0.9 * (batch_idx_train / warm_step)
+        )
+        loss = (
+            simple_loss_scale * simple_loss_sum
+            + pruned_loss_scale * pruned_loss_sum
+            + params.ctc_loss_scale * ctc_loss_sum
+        )
+
+    assert loss.requires_grad == is_training
+
+    info = MetricsTracker()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        # info["frames"] is an approximate number for two reasons:
+        # (1) The acutal subsampling factor is ((lens - 1) // 2 - 1) // 2
+        # (2) If some utterances in the batch lead to inf/nan loss, they
+        #     are filtered out.
+        info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+
+    # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances`  # noqa
+    info["utterances"] = feature.size(0)
+    # averaged input duration in frames over utterances
+    info["utt_duration"] = feature_lens.sum().item()
+    # averaged padding proportion over utterances
+    info["utt_pad_proportion"] = (
+        ((feature.size(1) - feature_lens) / feature.size(1)).sum().item()
+    )
+
+    # Note: We use reduction=sum while computing the loss.
+    info["loss"] = loss.detach().cpu().item()
+    info["simple_loss"] = simple_loss_sum.detach().cpu().item()
+    info["pruned_loss"] = pruned_loss_sum.detach().cpu().item()
+    if params.ctc_loss_scale > 0.0:
+        info["ctc_loss"] = ctc_loss_sum.detach().cpu().item()
+
+    return loss, info
+
+
+def compute_validation_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    valid_dl: torch.utils.data.DataLoader,
+    world_size: int = 1,
+) -> MetricsTracker:
+    """Run the validation process."""
+    model.eval()
+
+    tot_loss = MetricsTracker()
+
+    for batch_idx, batch in enumerate(valid_dl):
+        loss, loss_info = compute_loss(
+            params=params,
+            model=model,
+            sp=sp,
+            batch=batch,
+            is_training=False,
+        )
+        assert loss.requires_grad is False
+        tot_loss = tot_loss + loss_info
+
+    if world_size > 1:
+        tot_loss.reduce(loss.device)
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    if loss_value < params.best_valid_loss:
+        params.best_valid_epoch = params.cur_epoch
+        params.best_valid_loss = loss_value
+
+    return tot_loss
+
+
+def train_one_epoch(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    optimizer: torch.optim.Optimizer,
+    scheduler: LRSchedulerType,
+    sp: spm.SentencePieceProcessor,
+    train_dl: torch.utils.data.DataLoader,
+    valid_dl: torch.utils.data.DataLoader,
+    scaler: GradScaler,
+    model_avg: Optional[nn.Module] = None,
+    tb_writer: Optional[SummaryWriter] = None,
+    world_size: int = 1,
+    rank: int = 0,
+) -> None:
+    """Train the model for one epoch.
+
+    The training loss from the mean of all frames is saved in
+    `params.train_loss`. It runs the validation process every
+    `params.valid_interval` batches.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The model for training.
+      optimizer:
+        The optimizer we are using.
+      scheduler:
+        The learning rate scheduler, we call step() every step.
+      train_dl:
+        Dataloader for the training dataset.
+      train_dl_warmup:
+        Dataloader for the training dataset with 2 speakers. This is used during the
+        warmup stage.
+      valid_dl:
+        Dataloader for the validation dataset.
+      scaler:
+        The scaler used for mix precision training.
+      model_avg:
+        The stored model averaged from the start of training.
+      tb_writer:
+        Writer to write log messages to tensorboard.
+      world_size:
+        Number of nodes in DDP training. If it is 1, DDP is disabled.
+      rank:
+        The rank of the node in DDP training. If no DDP is used, it should
+        be set to 0.
+    """
+    torch.cuda.empty_cache()
+    model.train()
+
+    tot_loss = MetricsTracker()
+
+    cur_batch_idx = params.get("cur_batch_idx", 0)
+
+    for batch_idx, batch in enumerate(train_dl):
+        if batch_idx < cur_batch_idx:
+            continue
+        cur_batch_idx = batch_idx
+
+        params.batch_idx_train += 1
+        batch_size = batch["inputs"].shape[0]
+
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, loss_info = compute_loss(
+                    params=params,
+                    model=model,
+                    sp=sp,
+                    batch=batch,
+                    is_training=True,
+                )
+            # summary stats
+            tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
+
+            # NOTE: We use reduction==sum and loss is computed over utterances
+            # in the batch and there is no normalization to it so far.
+            scaler.scale(loss).backward()
+            set_batch_count(model, params.batch_idx_train)
+            scheduler.step_batch(params.batch_idx_train)
+
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.zero_grad()
+        except:  # noqa
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise
+
+        if params.print_diagnostics and batch_idx == 5:
+            return
+
+        if (
+            rank == 0
+            and params.batch_idx_train > 0
+            and params.batch_idx_train % params.average_period == 0
+        ):
+            update_averaged_model(
+                params=params,
+                model_cur=model,
+                model_avg=model_avg,
+            )
+
+        if (
+            params.batch_idx_train > 0
+            and params.batch_idx_train % params.save_every_n == 0
+        ):
+            params.cur_batch_idx = batch_idx
+            save_checkpoint_with_global_batch_idx(
+                out_dir=params.exp_dir,
+                global_batch_idx=params.batch_idx_train,
+                model=model,
+                model_avg=model_avg,
+                params=params,
+                optimizer=optimizer,
+                scheduler=scheduler,
+                sampler=train_dl.sampler,
+                scaler=scaler,
+                rank=rank,
+            )
+            del params.cur_batch_idx
+            remove_checkpoints(
+                out_dir=params.exp_dir,
+                topk=params.keep_last_k,
+                rank=rank,
+            )
+
+        if batch_idx % 100 == 0 and params.use_fp16:
+            # If the grad scale was less than 1, try increasing it.    The _growth_interval
+            # of the grad scaler is configurable, but we can't configure it to have different
+            # behavior depending on the current grad scale.
+            cur_grad_scale = scaler._scale.item()
+            if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
+                scaler.update(cur_grad_scale * 2.0)
+            if cur_grad_scale < 0.01:
+                logging.warning(f"Grad scale is small: {cur_grad_scale}")
+            if cur_grad_scale < 1.0e-05:
+                raise RuntimeError(
+                    f"grad_scale is too small, exiting: {cur_grad_scale}"
+                )
+
+        if batch_idx % params.log_interval == 0:
+            cur_lr = scheduler.get_last_lr()[0]
+            cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
+
+            logging.info(
+                f"Epoch {params.cur_epoch}, "
+                f"batch {batch_idx}, loss[{loss_info}], "
+                f"tot_loss[{tot_loss}], batch size: {batch_size}, "
+                f"lr: {cur_lr:.2e}, "
+                + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
+            )
+
+            if tb_writer is not None:
+                tb_writer.add_scalar(
+                    "train/learning_rate", cur_lr, params.batch_idx_train
+                )
+
+                loss_info.write_summary(
+                    tb_writer, "train/current_", params.batch_idx_train
+                )
+                tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
+                if params.use_fp16:
+                    tb_writer.add_scalar(
+                        "train/grad_scale", cur_grad_scale, params.batch_idx_train
+                    )
+
+        if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
+            logging.info("Computing validation loss")
+            valid_info = compute_validation_loss(
+                params=params,
+                model=model,
+                sp=sp,
+                valid_dl=valid_dl,
+                world_size=world_size,
+            )
+            model.train()
+            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
+            logging.info(
+                f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
+            )
+            if tb_writer is not None:
+                valid_info.write_summary(
+                    tb_writer, "train/valid_", params.batch_idx_train
+                )
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    params.train_loss = loss_value
+    if params.train_loss < params.best_train_loss:
+        params.best_train_epoch = params.cur_epoch
+        params.best_train_loss = params.train_loss
+
+
+def run(rank, world_size, args):
+    """
+    Args:
+      rank:
+        It is a value between 0 and `world_size-1`, which is
+        passed automatically by `mp.spawn()` in :func:`main`.
+        The node with rank 0 is responsible for saving checkpoint.
+      world_size:
+        Number of GPUs for DDP training.
+      args:
+        The return value of get_parser().parse_args()
+    """
+    params = get_params()
+    params.update(vars(args))
+
+    fix_random_seed(params.seed)
+    if world_size > 1:
+        setup_dist(rank, world_size, params.master_port)
+
+    setup_logger(f"{params.exp_dir}/log/log-train")
+    logging.info("Training started")
+
+    if args.tensorboard and rank == 0:
+        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
+    else:
+        tb_writer = None
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", rank)
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_surt_model(params)
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    assert params.save_every_n >= params.average_period
+    model_avg: Optional[nn.Module] = None
+    if rank == 0:
+        # model_avg is only used with rank 0
+        model_avg = copy.deepcopy(model)
+
+    assert params.start_epoch > 0, params.start_epoch
+    checkpoints = load_checkpoint_if_available(
+        params=params, model=model, model_avg=model_avg
+    )
+
+    model.to(device)
+
+    if checkpoints is None and params.model_init_ckpt is not None:
+        logging.info(
+            f"Initializing model with checkpoint from {params.model_init_ckpt}"
+        )
+        init_ckpt = torch.load(params.model_init_ckpt, map_location=device)
+        model.load_state_dict(init_ckpt["model"], strict=True)
+
+    if world_size > 1:
+        logging.info("Using DDP")
+        model = DDP(model, device_ids=[rank], find_unused_parameters=True)
+
+    parameters_names = []
+    parameters_names.append(
+        [name_param_pair[0] for name_param_pair in model.named_parameters()]
+    )
+    optimizer = ScaledAdam(
+        model.parameters(),
+        lr=params.base_lr,
+        clipping_scale=2.0,
+        parameters_names=parameters_names,
+    )
+
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
+
+    if checkpoints and "optimizer" in checkpoints:
+        logging.info("Loading optimizer state dict")
+        optimizer.load_state_dict(checkpoints["optimizer"])
+
+    if (
+        checkpoints
+        and "scheduler" in checkpoints
+        and checkpoints["scheduler"] is not None
+    ):
+        logging.info("Loading scheduler state dict")
+        scheduler.load_state_dict(checkpoints["scheduler"])
+
+    if params.print_diagnostics:
+        diagnostic = diagnostics.attach_diagnostics(model)
+
+    libricss = LibriCssAsrDataModule(args)
+
+    train_cuts_ihm = libricss.libricss_cuts(split="dev", type="ihm-mix")
+    train_cuts_sdm = libricss.libricss_cuts(split="dev", type="sdm")
+    train_cuts = train_cuts_ihm + train_cuts_sdm
+
+    # This will create 2 copies of the sessions with different segmentation
+    train_cuts = train_cuts.trim_to_supervision_groups(
+        max_pause=0.1
+    ) + train_cuts.trim_to_supervision_groups(max_pause=0.5)
+    dev_cuts = libricss.libricss_cuts(split="dev", type="sdm")
+
+    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
+        # We only load the sampler's state dict when it loads a checkpoint
+        # saved in the middle of an epoch
+        sampler_state_dict = checkpoints["sampler"]
+    else:
+        sampler_state_dict = None
+
+    train_dl = libricss.train_dataloaders(
+        train_cuts,
+        sampler_state_dict=sampler_state_dict,
+        return_sources=False,
+        strict=False,
+    )
+    valid_dl = libricss.valid_dataloaders(dev_cuts)
+
+    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
+    if checkpoints and "grad_scaler" in checkpoints:
+        logging.info("Loading grad scaler state dict")
+        scaler.load_state_dict(checkpoints["grad_scaler"])
+
+    for epoch in range(params.start_epoch, params.num_epochs + 1):
+        scheduler.step_epoch(epoch - 1)
+        fix_random_seed(params.seed + epoch - 1)
+        train_dl.sampler.set_epoch(epoch - 1)
+
+        if tb_writer is not None:
+            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
+
+        params.cur_epoch = epoch
+
+        train_one_epoch(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sp=sp,
+            train_dl=train_dl,
+            valid_dl=valid_dl,
+            scaler=scaler,
+            tb_writer=tb_writer,
+            world_size=world_size,
+            rank=rank,
+        )
+
+        if params.print_diagnostics:
+            diagnostic.print_diagnostics()
+            break
+
+        save_checkpoint(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sampler=train_dl.sampler,
+            scaler=scaler,
+            rank=rank,
+        )
+
+    logging.info("Done!")
+
+    if world_size > 1:
+        torch.distributed.barrier()
+        cleanup_dist()
+
+
+def display_and_save_batch(
+    batch: dict,
+    params: AttributeDict,
+    sp: spm.SentencePieceProcessor,
+) -> None:
+    """Display the batch statistics and save the batch into disk.
+
+    Args:
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      params:
+        Parameters for training. See :func:`get_params`.
+      sp:
+        The BPE model.
+    """
+    from lhotse.utils import uuid4
+
+    filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
+    logging.info(f"Saving batch to {filename}")
+    torch.save(batch, filename)
+
+    features = batch["inputs"]
+
+    logging.info(f"features shape: {features.shape}")
+
+    y = [sp.encode(text_ch) for text_ch in batch["text"]]
+    num_tokens = [sum(len(yi) for yi in y_ch) for y_ch in y]
+    logging.info(f"num tokens: {num_tokens}")
+
+
+def main():
+    parser = get_parser()
+    LibriCssAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    world_size = args.world_size
+    assert world_size >= 1
+    if world_size > 1:
+        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
+    else:
+        run(rank=0, world_size=1, args=args)
+
+
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/libricss/SURT/dprnn_zipformer/zipformer.py b/egs/libricss/SURT/dprnn_zipformer/zipformer.py
new file mode 120000
index 000000000..ec183baa7
--- /dev/null
+++ b/egs/libricss/SURT/dprnn_zipformer/zipformer.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer.py
\ No newline at end of file
diff --git a/egs/libricss/SURT/heat.png b/egs/libricss/SURT/heat.png
new file mode 100644
index 000000000..ac7ecfff4
Binary files /dev/null and b/egs/libricss/SURT/heat.png differ
diff --git a/egs/libricss/SURT/local/add_source_feats.py b/egs/libricss/SURT/local/add_source_feats.py
new file mode 100755
index 000000000..c9775561f
--- /dev/null
+++ b/egs/libricss/SURT/local/add_source_feats.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file adds source features as temporal arrays to the mixture manifests.
+It looks for manifests in the directory data/manifests.
+"""
+import logging
+from pathlib import Path
+
+import numpy as np
+from lhotse import CutSet, LilcomChunkyWriter, load_manifest, load_manifest_lazy
+from tqdm import tqdm
+
+
+def add_source_feats(num_jobs=1):
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+
+    for type_affix in ["full", "ov40"]:
+        logging.info(f"Adding source features for {type_affix}")
+        mixed_name_clean = f"train_clean_{type_affix}"
+        mixed_name_rvb = f"train_rvb_{type_affix}"
+
+        logging.info("Reading mixed cuts")
+        mixed_cuts_clean = load_manifest_lazy(
+            src_dir / f"cuts_{mixed_name_clean}.jsonl.gz"
+        )
+        mixed_cuts_rvb = load_manifest_lazy(src_dir / f"cuts_{mixed_name_rvb}.jsonl.gz")
+
+        logging.info("Reading source cuts")
+        source_cuts = load_manifest(src_dir / "librispeech_cuts_train_trimmed.jsonl.gz")
+
+        logging.info("Adding source features to the mixed cuts")
+        with tqdm() as pbar, CutSet.open_writer(
+            src_dir / f"cuts_{mixed_name_clean}_sources.jsonl.gz"
+        ) as cut_writer_clean, CutSet.open_writer(
+            src_dir / f"cuts_{mixed_name_rvb}_sources.jsonl.gz"
+        ) as cut_writer_rvb, LilcomChunkyWriter(
+            output_dir / f"feats_train_{type_affix}_sources"
+        ) as source_feat_writer:
+            for cut_clean, cut_rvb in zip(mixed_cuts_clean, mixed_cuts_rvb):
+                assert cut_rvb.id == cut_clean.id + "_rvb"
+                # Create source_feats and source_feat_offsets
+                # (See `lhotse.datasets.K2SurtDataset` for details)
+                source_feats = []
+                source_feat_offsets = []
+                cur_offset = 0
+                for sup in sorted(
+                    cut_clean.supervisions, key=lambda s: (s.start, s.speaker)
+                ):
+                    source_cut = source_cuts[sup.id]
+                    source_feats.append(source_cut.load_features())
+                    source_feat_offsets.append(cur_offset)
+                    cur_offset += source_cut.num_frames
+                cut_clean.source_feats = source_feat_writer.store_array(
+                    cut_clean.id, np.concatenate(source_feats, axis=0)
+                )
+                cut_clean.source_feat_offsets = source_feat_offsets
+                cut_writer_clean.write(cut_clean)
+                cut_rvb.source_feats = cut_clean.source_feats
+                cut_rvb.source_feat_offsets = cut_clean.source_feat_offsets
+                cut_writer_rvb.write(cut_rvb)
+                pbar.update(1)
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    add_source_feats()
diff --git a/egs/libricss/SURT/local/compute_fbank_libricss.py b/egs/libricss/SURT/local/compute_fbank_libricss.py
new file mode 100755
index 000000000..afd66899c
--- /dev/null
+++ b/egs/libricss/SURT/local/compute_fbank_libricss.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the LibriCSS dataset.
+It looks for manifests in the directory data/manifests.
+
+The generated fbank features are saved in data/fbank.
+"""
+import logging
+from pathlib import Path
+
+import pyloudnorm as pyln
+import torch
+import torch.multiprocessing
+from lhotse import LilcomChunkyWriter, load_manifest_lazy
+from lhotse.features.kaldifeat import (
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    KaldifeatFrameOptions,
+    KaldifeatMelOptions,
+)
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+
+def compute_fbank_libricss():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+
+    sampling_rate = 16000
+    num_mel_bins = 80
+
+    extractor = KaldifeatFbank(
+        KaldifeatFbankConfig(
+            frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
+            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
+            device="cuda",
+        )
+    )
+
+    logging.info("Reading manifests")
+    cuts_ihm_mix = load_manifest_lazy(
+        src_dir / "libricss-ihm-mix_segments_all.jsonl.gz"
+    )
+    cuts_sdm = load_manifest_lazy(src_dir / "libricss-sdm_segments_all.jsonl.gz")
+
+    for name, cuts in [("ihm-mix", cuts_ihm_mix), ("sdm", cuts_sdm)]:
+        dev_cuts = cuts.filter(lambda c: "session0" in c.id)
+        test_cuts = cuts.filter(lambda c: "session0" not in c.id)
+
+        # If SDM cuts, apply loudness normalization
+        if name == "sdm":
+            dev_cuts = dev_cuts.normalize_loudness(target=-23.0)
+            test_cuts = test_cuts.normalize_loudness(target=-23.0)
+
+        logging.info(f"Extracting fbank features for {name} dev cuts")
+        _ = dev_cuts.compute_and_store_features_batch(
+            extractor=extractor,
+            storage_path=output_dir / f"libricss-{name}_feats_dev",
+            manifest_path=src_dir / f"cuts_dev_libricss-{name}.jsonl.gz",
+            batch_duration=500,
+            num_workers=2,
+            storage_type=LilcomChunkyWriter,
+            overwrite=True,
+        )
+
+        logging.info(f"Extracting fbank features for {name} test cuts")
+        _ = test_cuts.compute_and_store_features_batch(
+            extractor=extractor,
+            storage_path=output_dir / f"libricss-{name}_feats_test",
+            manifest_path=src_dir / f"cuts_test_libricss-{name}.jsonl.gz",
+            batch_duration=2000,
+            num_workers=4,
+            storage_type=LilcomChunkyWriter,
+            overwrite=True,
+        )
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    compute_fbank_libricss()
diff --git a/egs/libricss/SURT/local/compute_fbank_librispeech.py b/egs/libricss/SURT/local/compute_fbank_librispeech.py
new file mode 100755
index 000000000..5c8aece9c
--- /dev/null
+++ b/egs/libricss/SURT/local/compute_fbank_librispeech.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the LibriSpeech dataset.
+It looks for manifests in the directory data/manifests.
+
+The generated fbank features are saved in data/fbank.
+"""
+
+import logging
+from pathlib import Path
+
+import torch
+from lhotse import CutSet, LilcomChunkyWriter
+from lhotse.features.kaldifeat import (
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    KaldifeatFrameOptions,
+    KaldifeatMelOptions,
+)
+from lhotse.recipes.utils import read_manifests_if_cached
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+
+def compute_fbank_librispeech():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+    num_mel_bins = 80
+
+    dataset_parts = (
+        "train-clean-100",
+        "train-clean-360",
+        "train-other-500",
+    )
+    prefix = "librispeech"
+    suffix = "jsonl.gz"
+    manifests = read_manifests_if_cached(
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
+    )
+    assert manifests is not None
+
+    assert len(manifests) == len(dataset_parts), (
+        len(manifests),
+        len(dataset_parts),
+        list(manifests.keys()),
+        dataset_parts,
+    )
+
+    extractor = KaldifeatFbank(
+        KaldifeatFbankConfig(
+            frame_opts=KaldifeatFrameOptions(sampling_rate=16000),
+            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
+            device="cuda",
+        )
+    )
+
+    for partition, m in manifests.items():
+        cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
+        if (output_dir / cuts_filename).is_file():
+            logging.info(f"{partition} already exists - skipping.")
+            continue
+        logging.info(f"Processing {partition}")
+        cut_set = CutSet.from_manifests(
+            recordings=m["recordings"],
+            supervisions=m["supervisions"],
+        )
+
+        cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1)
+
+        cut_set = cut_set.compute_and_store_features_batch(
+            extractor=extractor,
+            storage_path=f"{output_dir}/{prefix}_feats_{partition}",
+            manifest_path=f"{src_dir}/{cuts_filename}",
+            batch_duration=4000,
+            num_workers=2,
+            storage_type=LilcomChunkyWriter,
+            overwrite=True,
+        )
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    compute_fbank_librispeech()
diff --git a/egs/libricss/SURT/local/compute_fbank_lsmix.py b/egs/libricss/SURT/local/compute_fbank_lsmix.py
new file mode 100755
index 000000000..da42f8ba1
--- /dev/null
+++ b/egs/libricss/SURT/local/compute_fbank_lsmix.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+# Copyright    2022  Johns Hopkins University        (authors: Desh Raj)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the synthetically mixed LibriSpeech
+train and dev sets.
+It looks for manifests in the directory data/manifests.
+
+The generated fbank features are saved in data/fbank.
+"""
+import logging
+import random
+import warnings
+from pathlib import Path
+
+import torch
+import torch.multiprocessing
+from lhotse import LilcomChunkyWriter, load_manifest
+from lhotse.cut import MixedCut, MixTrack, MultiCut
+from lhotse.features.kaldifeat import (
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    KaldifeatFrameOptions,
+    KaldifeatMelOptions,
+)
+from lhotse.recipes.utils import read_manifests_if_cached
+from lhotse.utils import fix_random_seed, uuid4
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+torch.multiprocessing.set_sharing_strategy("file_system")
+
+
+def compute_fbank_lsmix():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+
+    sampling_rate = 16000
+    num_mel_bins = 80
+
+    extractor = KaldifeatFbank(
+        KaldifeatFbankConfig(
+            frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
+            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
+            device="cuda",
+        )
+    )
+
+    logging.info("Reading manifests")
+    manifests = read_manifests_if_cached(
+        dataset_parts=["train_clean_full", "train_clean_ov40"],
+        types=["cuts"],
+        output_dir=src_dir,
+        prefix="lsmix",
+        suffix="jsonl.gz",
+        lazy=True,
+    )
+
+    cs = {}
+    cs["clean_full"] = manifests["train_clean_full"]["cuts"]
+    cs["clean_ov40"] = manifests["train_clean_ov40"]["cuts"]
+
+    # only uses RIRs and noises from REVERB challenge
+    real_rirs = load_manifest(src_dir / "real-rir_recordings_all.jsonl.gz").filter(
+        lambda r: "RVB2014" in r.id
+    )
+    noises = load_manifest(src_dir / "iso-noise_recordings_all.jsonl.gz").filter(
+        lambda r: "RVB2014" in r.id
+    )
+
+    # Apply perturbation to the training cuts
+    logging.info("Applying perturbation to the training cuts")
+    cs["rvb_full"] = cs["clean_full"].map(
+        lambda c: augment(
+            c, perturb_snr=True, rirs=real_rirs, noises=noises, perturb_loudness=True
+        )
+    )
+    cs["rvb_ov40"] = cs["clean_ov40"].map(
+        lambda c: augment(
+            c, perturb_snr=True, rirs=real_rirs, noises=noises, perturb_loudness=True
+        )
+    )
+
+    for type_affix in ["full", "ov40"]:
+        for rvb_affix in ["clean", "rvb"]:
+            logging.info(
+                f"Extracting fbank features for {type_affix} {rvb_affix} training cuts"
+            )
+            cuts = cs[f"{rvb_affix}_{type_affix}"]
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                _ = cuts.compute_and_store_features_batch(
+                    extractor=extractor,
+                    storage_path=output_dir
+                    / f"lsmix_feats_train_{rvb_affix}_{type_affix}",
+                    manifest_path=src_dir
+                    / f"cuts_train_{rvb_affix}_{type_affix}.jsonl.gz",
+                    batch_duration=5000,
+                    num_workers=4,
+                    storage_type=LilcomChunkyWriter,
+                    overwrite=True,
+                )
+
+
+def augment(cut, perturb_snr=False, rirs=None, noises=None, perturb_loudness=False):
+    """
+    Given a mixed cut, this function optionally applies the following augmentations:
+    - Perturbing the SNRs of the tracks (in range [-5, 5] dB)
+    - Reverberation using a randomly selected RIR
+    - Adding noise
+    - Perturbing the loudness (in range [-20, -25] dB)
+    """
+    out_cut = cut.drop_features()
+
+    # Perturb the SNRs (optional)
+    if perturb_snr:
+        snrs = [random.uniform(-5, 5) for _ in range(len(cut.tracks))]
+        for i, (track, snr) in enumerate(zip(out_cut.tracks, snrs)):
+            if i == 0:
+                # Skip the first track since it is the reference
+                continue
+            track.snr = snr
+
+    # Reverberate the cut (optional)
+    if rirs is not None:
+        # Select an RIR at random
+        rir = random.choice(rirs)
+        # Select a channel at random
+        rir_channel = random.choice(list(range(rir.num_channels)))
+        # Reverberate the cut
+        out_cut = out_cut.reverb_rir(rir_recording=rir, rir_channels=[rir_channel])
+
+    # Add noise (optional)
+    if noises is not None:
+        # Select a noise recording at random
+        noise = random.choice(noises).to_cut()
+        if isinstance(noise, MultiCut):
+            noise = noise.to_mono()[0]
+        # Select an SNR at random
+        snr = random.uniform(10, 30)
+        # Repeat the noise to match the duration of the cut
+        noise = repeat_cut(noise, out_cut.duration)
+        out_cut = MixedCut(
+            id=out_cut.id,
+            tracks=[
+                MixTrack(cut=out_cut, type="MixedCut"),
+                MixTrack(cut=noise, type="DataCut", snr=snr),
+            ],
+        )
+
+    # Perturb the loudness (optional)
+    if perturb_loudness:
+        target_loudness = random.uniform(-20, -25)
+        out_cut = out_cut.normalize_loudness(target_loudness, mix_first=True)
+    return out_cut
+
+
+def repeat_cut(cut, duration):
+    while cut.duration < duration:
+        cut = cut.mix(cut, offset_other_by=cut.duration)
+    return cut.truncate(duration=duration)
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    fix_random_seed(42)
+    compute_fbank_lsmix()
diff --git a/egs/libricss/SURT/local/compute_fbank_musan.py b/egs/libricss/SURT/local/compute_fbank_musan.py
new file mode 100755
index 000000000..1fcf951f9
--- /dev/null
+++ b/egs/libricss/SURT/local/compute_fbank_musan.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+This file computes fbank features of the musan dataset.
+It looks for manifests in the directory data/manifests.
+
+The generated fbank features are saved in data/fbank.
+"""
+
+import logging
+from pathlib import Path
+
+import torch
+from lhotse import CutSet, LilcomChunkyWriter, combine
+from lhotse.features.kaldifeat import (
+    KaldifeatFbank,
+    KaldifeatFbankConfig,
+    KaldifeatFrameOptions,
+    KaldifeatMelOptions,
+)
+from lhotse.recipes.utils import read_manifests_if_cached
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
+def compute_fbank_musan():
+    src_dir = Path("data/manifests")
+    output_dir = Path("data/fbank")
+
+    sampling_rate = 16000
+    num_mel_bins = 80
+
+    dataset_parts = (
+        "music",
+        "speech",
+        "noise",
+    )
+    prefix = "musan"
+    suffix = "jsonl.gz"
+    manifests = read_manifests_if_cached(
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
+    )
+    assert manifests is not None
+
+    assert len(manifests) == len(dataset_parts), (
+        len(manifests),
+        len(dataset_parts),
+        list(manifests.keys()),
+        dataset_parts,
+    )
+
+    musan_cuts_path = src_dir / "musan_cuts.jsonl.gz"
+
+    if musan_cuts_path.is_file():
+        logging.info(f"{musan_cuts_path} already exists - skipping")
+        return
+
+    logging.info("Extracting features for Musan")
+
+    extractor = KaldifeatFbank(
+        KaldifeatFbankConfig(
+            frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate),
+            mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins),
+            device="cuda",
+        )
+    )
+
+    # create chunks of Musan with duration 5 - 10 seconds
+    _ = (
+        CutSet.from_manifests(
+            recordings=combine(part["recordings"] for part in manifests.values())
+        )
+        .cut_into_windows(10.0)
+        .filter(lambda c: c.duration > 5)
+        .compute_and_store_features_batch(
+            extractor=extractor,
+            storage_path=output_dir / "musan_feats",
+            manifest_path=musan_cuts_path,
+            batch_duration=500,
+            num_workers=4,
+            storage_type=LilcomChunkyWriter,
+        )
+    )
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    compute_fbank_musan()
diff --git a/egs/libricss/SURT/prepare.sh b/egs/libricss/SURT/prepare.sh
new file mode 100755
index 000000000..028240e44
--- /dev/null
+++ b/egs/libricss/SURT/prepare.sh
@@ -0,0 +1,204 @@
+#!/usr/bin/env bash
+
+set -eou pipefail
+
+stage=-1
+stop_stage=100
+
+# We assume dl_dir (download dir) contains the following
+# directories and files. If not, they will be downloaded
+# by this script automatically.
+#
+#  - $dl_dir/librispeech
+#      You can find audio and transcripts for LibriSpeech in this path.
+#
+#  - $dl_dir/libricss
+#      You can find audio and transcripts for LibriCSS in this path.
+#
+#  - $dl_dir/musan
+#      This directory contains the following directories downloaded from
+#       http://www.openslr.org/17/
+#
+#     - music
+#     - noise
+#     - speech
+#
+#  - $dl_dir/rirs_noises
+#      This directory contains the RIRS_NOISES corpus downloaded from https://openslr.org/28/.
+#
+dl_dir=$PWD/download
+
+. shared/parse_options.sh || exit 1
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+mkdir -p data
+vocab_size=500
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+log "dl_dir: $dl_dir"
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Stage 0: Download data"
+
+  # If you have pre-downloaded it to /path/to/librispeech,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/librispeech $dl_dir/librispeech
+  #
+  if [ ! -d $dl_dir/librispeech ]; then
+    lhotse download librispeech $dl_dir/librispeech
+  fi
+
+  # If you have pre-downloaded it to /path/to/libricss,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/libricss $dl_dir/libricss
+  #
+  if [ ! -d $dl_dir/libricss ]; then
+    lhotse download libricss $dl_dir/libricss
+  fi
+
+  # If you have pre-downloaded it to /path/to/musan,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/musan $dl_dir/
+  #
+  if [ ! -d $dl_dir/musan ]; then
+    lhotse download musan $dl_dir
+  fi
+
+  # If you have pre-downloaded it to /path/to/rirs_noises,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/rirs_noises $dl_dir/
+  #
+  if [ ! -d $dl_dir/rirs_noises ]; then
+    lhotse download rirs_noises $dl_dir
+  fi
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare LibriSpeech manifests"
+  # We assume that you have downloaded the LibriSpeech corpus
+  # to $dl_dir/librispeech. We perform text normalization for the transcripts.
+  # NOTE: Alignments are required for this recipe.
+  mkdir -p data/manifests
+  lhotse prepare librispeech -p train-clean-100 -p train-clean-360 -p train-other-500 -p dev-clean \
+    -j 4 --alignments-dir $dl_dir/libri_alignments/LibriSpeech $dl_dir/librispeech data/manifests/
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Prepare LibriCSS manifests"
+  # We assume that you have downloaded the LibriCSS corpus
+  # to $dl_dir/libricss. We perform text normalization for the transcripts.
+  mkdir -p data/manifests
+  for mic in sdm ihm-mix; do
+    lhotse prepare libricss --type $mic --segmented $dl_dir/libricss data/manifests/
+  done
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Prepare musan manifest and RIRs"
+  # We assume that you have downloaded the musan corpus
+  # to $dl_dir/musan
+  mkdir -p data/manifests
+  lhotse prepare musan $dl_dir/musan data/manifests
+
+  # We assume that you have downloaded the RIRS_NOISES corpus
+  # to $dl_dir/rirs_noises
+  lhotse prepare rir-noise -p real_rir -p iso_noise $dl_dir/rirs_noises data/manifests
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Extract features for LibriSpeech, trim to alignments, and shuffle the cuts"
+  python local/compute_fbank_librispeech.py
+  lhotse combine data/manifests/librispeech_cuts_train* - |\
+    lhotse cut trim-to-alignments --type word --max-pause 0.2 - - |\
+    shuf | gzip -c > data/manifests/librispeech_cuts_train_trimmed.jsonl.gz
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Create simulated mixtures from LibriSpeech (train and dev). This may take a while."
+  # We create a high overlap set which will be used during the model warmup phase, and a
+  # full training set that will be used for the subsequent training.
+
+  gunzip -c data/manifests/libricss-sdm_supervisions_all.jsonl.gz |\
+    grep -v "0L" | grep -v "OV10" |\
+    gzip -c > data/manifests/libricss-sdm_supervisions_all_v1.jsonl.gz
+
+  gunzip -c data/manifests/libricss-sdm_supervisions_all.jsonl.gz |\
+    grep "OV40" |\
+    gzip -c > data/manifests/libricss-sdm_supervisions_ov40.jsonl.gz
+
+  # Warmup mixtures (100k) based on high overlap (OV40)
+  log "Generating 100k anechoic train mixtures for warmup"
+  lhotse workflows simulate-meetings \
+    --method conversational \
+    --fit-to-supervisions data/manifests/libricss-sdm_supervisions_ov40.jsonl.gz \
+    --num-meetings 100000 \
+    --num-speakers-per-meeting 2,3 \
+    --max-duration-per-speaker 15.0 \
+    --max-utterances-per-speaker 3 \
+    --seed 1234 \
+    --num-jobs 4 \
+    data/manifests/librispeech_cuts_train_trimmed.jsonl.gz \
+    data/manifests/lsmix_cuts_train_clean_ov40.jsonl.gz
+
+  # Full training set (2,3 speakers) anechoic
+  log "Generating anechoic ${part} set (full)"
+  lhotse workflows simulate-meetings \
+    --method conversational \
+    --fit-to-supervisions data/manifests/libricss-sdm_supervisions_all_v1.jsonl.gz \
+    --num-repeats 1 \
+    --num-speakers-per-meeting 2,3 \
+    --max-duration-per-speaker 15.0 \
+    --max-utterances-per-speaker 3 \
+    --seed 1234 \
+    --num-jobs 4 \
+    data/manifests/librispeech_cuts_train_trimmed.jsonl.gz \
+    data/manifests/lsmix_cuts_train_clean_full.jsonl.gz
+fi
+
+if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
+  log "Stage 6: Compute fbank features for musan"
+  mkdir -p data/fbank
+  python local/compute_fbank_musan.py
+fi
+
+if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+  log "Stage 7: Compute fbank features for simulated Libri-mix"
+  mkdir -p data/fbank
+  python local/compute_fbank_lsmix.py
+fi
+
+if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
+  log "Stage 8: Add source feats to mixtures (useful for auxiliary tasks)"
+  python local/add_source_feats.py
+
+  log "Combining lsmix-clean and lsmix-rvb"
+  for type in full ov40; do
+    cat <(gunzip -c data/manifests/cuts_train_clean_${type}_sources.jsonl.gz) \
+      <(gunzip -c data/manifests/cuts_train_rvb_${type}_sources.jsonl.gz) |\
+      shuf | gzip -c > data/manifests/cuts_train_comb_${type}_sources.jsonl.gz
+  done
+fi
+
+if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
+  log "Stage 9: Compute fbank features for LibriCSS"
+  mkdir -p data/fbank
+  python local/compute_fbank_libricss.py
+fi
+
+if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
+  log "Stage 10: Download LibriSpeech BPE model from HuggingFace."
+  mkdir -p data/lang_bpe_500
+  pushd data/lang_bpe_500
+  wget https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/resolve/main/data/lang_bpe_500/bpe.model
+  popd
+fi
diff --git a/egs/libricss/SURT/shared b/egs/libricss/SURT/shared
new file mode 120000
index 000000000..4cbd91a7e
--- /dev/null
+++ b/egs/libricss/SURT/shared
@@ -0,0 +1 @@
+../../../icefall/shared
\ No newline at end of file
diff --git a/egs/libricss/SURT/surt.png b/egs/libricss/SURT/surt.png
new file mode 100644
index 000000000..fcc8119d4
Binary files /dev/null and b/egs/libricss/SURT/surt.png differ
diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index 1b8e690bd..b945f43fd 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -90,6 +90,11 @@ You can use <https://github.com/k2-fsa/sherpa> to deploy it.
 | greedy_search        | 2.23       | 4.96       | --epoch 40 --avg 16 |
 | modified_beam_search | 2.21       | 4.91       | --epoch 40 --avg 16 |
 | fast_beam_search     | 2.24       | 4.93       | --epoch 40 --avg 16 |
+| modified_beam_search_shallow_fusion | 2.01 | 4.37 | --epoch 40 --avg 16 --beam-size 12 --lm-scale 0.3 |
+| modified_beam_search_LODR | 1.94 | 4.17 | --epoch 40 --avg 16 --beam-size 12 --lm-scale 0.52 --LODR-scale -0.26 |
+| modified_beam_search_rescore | 2.04 | 4.39 | --epoch 40 --avg 16 --beam-size 12 |
+| modified_beam_search_rescore_LODR | 2.01  | 4.33 | --epoch 40 --avg 16 --beam-size 12  |
+
 
 The training command is:
 ```bash
@@ -119,6 +124,8 @@ for m in greedy_search modified_beam_search fast_beam_search; do
 done
 ```
 
+To decode with external language models, please refer to the documentation [here](https://k2-fsa.github.io/icefall/decoding-with-langugage-models/index.html).
+
 ##### small-scaled model, number of model parameters: 23285615, i.e., 23.3 M
 
 The tensorboard log can be found at
diff --git a/egs/librispeech/ASR/conformer_ctc2/train.py b/egs/librispeech/ASR/conformer_ctc2/train.py
index 3366af13e..c4a13b101 100755
--- a/egs/librispeech/ASR/conformer_ctc2/train.py
+++ b/egs/librispeech/ASR/conformer_ctc2/train.py
@@ -675,7 +675,6 @@ def train_one_epoch(
     for batch_idx, batch in enumerate(train_dl):
         params.batch_idx_train += 1
         batch_size = len(batch["supervisions"]["text"])
-        batch_name = batch["supervisions"]["uttid"]
 
         with torch.cuda.amp.autocast(enabled=params.use_fp16):
             loss, loss_info = compute_loss(
@@ -698,10 +697,7 @@ def train_one_epoch(
             scaler.scale(loss).backward()
         except RuntimeError as e:
             if "CUDA out of memory" in str(e):
-                logging.error(
-                    f"failing batch size:{batch_size} "
-                    f"failing batch names {batch_name}"
-                )
+                logging.error(f"failing batch size:{batch_size} ")
             raise
 
         scheduler.step_batch(params.batch_idx_train)
@@ -756,10 +752,7 @@ def train_one_epoch(
             if loss_info["ctc_loss"] == float("inf") or loss_info["att_loss"] == float(
                 "inf"
             ):
-                logging.error(
-                    "Your loss contains inf, something goes wrong"
-                    f"failing batch names {batch_name}"
-                )
+                logging.error("Your loss contains inf, something goes wrong")
             if tb_writer is not None:
                 tb_writer.add_scalar(
                     "train/learning_rate", cur_lr, params.batch_idx_train
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
index 17b63a659..fd59d4b7f 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
@@ -50,6 +50,7 @@ def fast_beam_search_one_best(
     ilme_scale: float = 0.0,
     blank_penalty: float = 0.0,
     return_timestamps: bool = False,
+    allow_partial: bool = False,
 ) -> Union[List[List[int]], DecodingResults]:
     """It limits the maximum number of symbols per frame to 1.
 
@@ -91,6 +92,7 @@ def fast_beam_search_one_best(
         max_contexts=max_contexts,
         temperature=temperature,
         ilme_scale=ilme_scale,
+        allow_partial=allow_partial,
         blank_penalty=blank_penalty,
     )
 
@@ -117,6 +119,7 @@ def fast_beam_search_nbest_LG(
     blank_penalty: float = 0.0,
     ilme_scale: float = 0.0,
     return_timestamps: bool = False,
+    allow_partial: bool = False,
 ) -> Union[List[List[int]], DecodingResults]:
     """It limits the maximum number of symbols per frame to 1.
 
@@ -170,6 +173,7 @@ def fast_beam_search_nbest_LG(
         max_states=max_states,
         max_contexts=max_contexts,
         temperature=temperature,
+        allow_partial=allow_partial,
         blank_penalty=blank_penalty,
         ilme_scale=ilme_scale,
     )
@@ -246,6 +250,7 @@ def fast_beam_search_nbest(
     temperature: float = 1.0,
     blank_penalty: float = 0.0,
     return_timestamps: bool = False,
+    allow_partial: bool = False,
 ) -> Union[List[List[int]], DecodingResults]:
     """It limits the maximum number of symbols per frame to 1.
 
@@ -300,6 +305,7 @@ def fast_beam_search_nbest(
         max_contexts=max_contexts,
         blank_penalty=blank_penalty,
         temperature=temperature,
+        allow_partial=allow_partial,
     )
 
     nbest = Nbest.from_lattice(
@@ -339,6 +345,7 @@ def fast_beam_search_nbest_oracle(
     temperature: float = 1.0,
     blank_penalty: float = 0.0,
     return_timestamps: bool = False,
+    allow_partial: bool = False,
 ) -> Union[List[List[int]], DecodingResults]:
     """It limits the maximum number of symbols per frame to 1.
 
@@ -396,6 +403,7 @@ def fast_beam_search_nbest_oracle(
         max_states=max_states,
         max_contexts=max_contexts,
         temperature=temperature,
+        allow_partial=allow_partial,
         blank_penalty=blank_penalty,
     )
 
@@ -440,7 +448,9 @@ def fast_beam_search(
     max_states: int,
     max_contexts: int,
     temperature: float = 1.0,
-    ilme_scale: float = 0.0,
+    subtract_ilme: bool = False,
+    ilme_scale: float = 0.1,
+    allow_partial: bool = False,
     blank_penalty: float = 0.0,
 ) -> k2.Fsa:
     """It limits the maximum number of symbols per frame to 1.
@@ -533,7 +543,9 @@ def fast_beam_search(
 
         decoding_streams.advance(log_probs)
     decoding_streams.terminate_and_flush_to_streams()
-    lattice = decoding_streams.format_output(encoder_out_lens.tolist())
+    lattice = decoding_streams.format_output(
+        encoder_out_lens.tolist(), allow_partial=allow_partial
+    )
 
     return lattice
 
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
index 9bac46004..bcd419fb7 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py
@@ -849,6 +849,8 @@ class RelPositionalEncoding(torch.nn.Module):
             torch.Tensor: Encoded tensor (batch, 2*time-1, `*`).
 
         """
+        if isinstance(left_context, torch.Tensor):
+            left_context = left_context.item()
         self.extend_pe(x, left_context)
         x_size_1 = x.size(1) + left_context
         pos_emb = self.pe[
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/test_onnx.py b/egs/librispeech/ASR/pruned_transducer_stateless3/test_onnx.py
index 598fcf344..810da8da6 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/test_onnx.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/test_onnx.py
@@ -113,7 +113,7 @@ def test_rel_pos():
 
     torch.onnx.export(
         encoder_pos,
-        x,
+        (x, torch.zeros(1, dtype=torch.int64)),
         filename,
         verbose=False,
         opset_version=opset_version,
@@ -139,7 +139,9 @@ def test_rel_pos():
     assert input_nodes[0].name == "x"
     assert input_nodes[0].shape == ["N", "T", num_features]
 
-    inputs = {input_nodes[0].name: x.numpy()}
+    inputs = {
+        input_nodes[0].name: x.numpy(),
+    }
     onnx_y, onnx_pos_emb = session.run(["y", "pos_emb"], inputs)
     onnx_y = torch.from_numpy(onnx_y)
     onnx_pos_emb = torch.from_numpy(onnx_pos_emb)
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/test_onnx.py b/egs/librispeech/ASR/pruned_transducer_stateless7/test_onnx.py
index 2440d267c..1e9b67226 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/test_onnx.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/test_onnx.py
@@ -265,7 +265,7 @@ def test_zipformer_encoder():
 
     torch.onnx.export(
         encoder,
-        (x),
+        (x, torch.ones(1, dtype=torch.float32)),
         filename,
         verbose=False,
         opset_version=opset_version,
@@ -289,6 +289,7 @@ def test_zipformer_encoder():
     input_nodes = session.get_inputs()
     inputs = {
         input_nodes[0].name: x.numpy(),
+        input_nodes[1].name: torch.ones(1, dtype=torch.float32).numpy(),
     }
     onnx_y = session.run(["y"], inputs)[0]
     onnx_y = torch.from_numpy(onnx_y)
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py
index 0841f7cf1..c44cb1eaf 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py
@@ -81,20 +81,20 @@ class FrameReducer(nn.Module):
             fake_limit_indexes = torch.topk(
                 ctc_output[:, :, blank_id], max_limit_len
             ).indices
-            T = (
+            T_arange = (
                 torch.arange(max_limit_len)
                 .expand_as(
                     fake_limit_indexes,
                 )
                 .to(device=x.device)
             )
-            T = torch.remainder(T, limit_lens.unsqueeze(1))
-            limit_indexes = torch.gather(fake_limit_indexes, 1, T)
+            T_arange = torch.remainder(T_arange, limit_lens.unsqueeze(1))
+            limit_indexes = torch.gather(fake_limit_indexes, 1, T_arange)
             limit_mask = torch.full_like(
                 non_blank_mask,
-                False,
+                0,
                 device=x.device,
-            ).scatter_(1, limit_indexes, True)
+            ).scatter_(1, limit_indexes, 1)
 
             non_blank_mask = non_blank_mask | ~limit_mask
 
@@ -108,9 +108,9 @@ class FrameReducer(nn.Module):
             )
             - out_lens
         )
-        max_pad_len = pad_lens_list.max()
+        max_pad_len = int(pad_lens_list.max())
 
-        out = F.pad(x, (0, 0, 0, max_pad_len))
+        out = F.pad(x, [0, 0, 0, max_pad_len])
 
         valid_pad_mask = ~make_pad_mask(pad_lens_list)
         total_valid_mask = torch.concat([non_blank_mask, valid_pad_mask], dim=1)
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/decode.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/decode.py
index 3444f8193..02029c108 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/decode.py
@@ -396,6 +396,12 @@ def decode_one_batch(
         The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
         only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
         fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
+      LM:
+        A neural network language model.
+      ngram_lm:
+        A ngram language model
+      ngram_lm_scale:
+        The scale for the ngram language model.
     Returns:
       Return the decoding result. See above description for the format of
       the returned dict.
@@ -907,6 +913,7 @@ def main():
         ngram_file_name = str(params.lang_dir / f"{params.tokens_ngram}gram.arpa")
         logging.info(f"lm filename: {ngram_file_name}")
         ngram_lm = kenlm.Model(ngram_file_name)
+        ngram_lm_scale = None  # use a list to search
 
     elif params.decoding_method == "modified_beam_search_LODR":
         lm_filename = f"{params.tokens_ngram}gram.fst.txt"
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/export.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/export.py
index 5735ee692..c191b5bcc 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/export.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/export.py
@@ -856,6 +856,10 @@ def main():
         # Otherwise, one of its arguments is a ragged tensor and is not
         # torch scriptabe.
         model.__class__.forward = torch.jit.ignore(model.__class__.forward)
+        model.encoder.__class__.non_streaming_forward = model.encoder.__class__.forward
+        model.encoder.__class__.non_streaming_forward = torch.jit.export(
+            model.encoder.__class__.non_streaming_forward
+        )
         model.encoder.__class__.forward = model.encoder.__class__.streaming_forward
         logging.info("Using torch.jit.script")
         model = torch.jit.script(model)
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/jit_pretrained.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/jit_pretrained.py
index 4fd5e1820..c8301b2da 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/jit_pretrained.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/jit_pretrained.py
@@ -252,7 +252,7 @@ def main():
 
     feature_lengths = torch.tensor(feature_lengths, device=device)
 
-    encoder_out, encoder_out_lens = model.encoder(
+    encoder_out, encoder_out_lens = model.encoder.non_streaming_forward(
         x=features,
         x_lens=feature_lengths,
     )
diff --git a/egs/librispeech/ASR/zipformer/.gitignore b/egs/librispeech/ASR/zipformer/.gitignore
new file mode 100644
index 000000000..e47ac1582
--- /dev/null
+++ b/egs/librispeech/ASR/zipformer/.gitignore
@@ -0,0 +1 @@
+swoosh.pdf
diff --git a/egs/librispeech/ASR/zipformer/decode.py b/egs/librispeech/ASR/zipformer/decode.py
index 93680602e..2cc157e7a 100755
--- a/egs/librispeech/ASR/zipformer/decode.py
+++ b/egs/librispeech/ASR/zipformer/decode.py
@@ -115,9 +115,14 @@ from beam_search import (
     greedy_search,
     greedy_search_batch,
     modified_beam_search,
+    modified_beam_search_lm_rescore,
+    modified_beam_search_lm_rescore_LODR,
+    modified_beam_search_lm_shallow_fusion,
+    modified_beam_search_LODR,
 )
-from train import add_model_arguments, get_params, get_model
+from train import add_model_arguments, get_model, get_params
 
+from icefall import LmScorer, NgramLm
 from icefall.checkpoint import (
     average_checkpoints,
     average_checkpoints_with_averaged_model,
@@ -273,8 +278,7 @@ def get_parser():
         "--context-size",
         type=int,
         default=2,
-        help="The context size in the decoder. 1 means bigram; "
-        "2 means tri-gram",
+        help="The context size in the decoder. 1 means bigram; " "2 means tri-gram",
     )
     parser.add_argument(
         "--max-sym-per-frame",
@@ -302,6 +306,47 @@ def get_parser():
         fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
     )
 
+    parser.add_argument(
+        "--use-shallow-fusion",
+        type=str2bool,
+        default=False,
+        help="""Use neural network LM for shallow fusion.
+        If you want to use LODR, you will also need to set this to true
+        """,
+    )
+
+    parser.add_argument(
+        "--lm-type",
+        type=str,
+        default="rnn",
+        help="Type of NN lm",
+        choices=["rnn", "transformer"],
+    )
+
+    parser.add_argument(
+        "--lm-scale",
+        type=float,
+        default=0.3,
+        help="""The scale of the neural network LM
+        Used only when `--use-shallow-fusion` is set to True.
+        """,
+    )
+
+    parser.add_argument(
+        "--tokens-ngram",
+        type=int,
+        default=2,
+        help="""The order of the ngram lm.
+        """,
+    )
+
+    parser.add_argument(
+        "--backoff-id",
+        type=int,
+        default=500,
+        help="ID of the backoff symbol in the ngram LM",
+    )
+
     add_model_arguments(parser)
 
     return parser
@@ -314,6 +359,9 @@ def decode_one_batch(
     batch: dict,
     word_table: Optional[k2.SymbolTable] = None,
     decoding_graph: Optional[k2.Fsa] = None,
+    LM: Optional[LmScorer] = None,
+    ngram_lm=None,
+    ngram_lm_scale: float = 0.0,
 ) -> Dict[str, List[List[str]]]:
     """Decode one batch and return the result in a dict. The dict has the
     following format:
@@ -342,6 +390,12 @@ def decode_one_batch(
         The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
         only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
         fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
+      LM:
+        A neural network language model.
+      ngram_lm:
+        A ngram language model
+      ngram_lm_scale:
+        The scale for the ngram language model.
     Returns:
       Return the decoding result. See above description for the format of
       the returned dict.
@@ -425,10 +479,7 @@ def decode_one_batch(
         )
         for hyp in sp.decode(hyp_tokens):
             hyps.append(hyp.split())
-    elif (
-        params.decoding_method == "greedy_search"
-        and params.max_sym_per_frame == 1
-    ):
+    elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
         hyp_tokens = greedy_search_batch(
             model=model,
             encoder_out=encoder_out,
@@ -445,6 +496,50 @@ def decode_one_batch(
         )
         for hyp in sp.decode(hyp_tokens):
             hyps.append(hyp.split())
+    elif params.decoding_method == "modified_beam_search_lm_shallow_fusion":
+        hyp_tokens = modified_beam_search_lm_shallow_fusion(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+            LM=LM,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif params.decoding_method == "modified_beam_search_LODR":
+        hyp_tokens = modified_beam_search_LODR(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+            LODR_lm=ngram_lm,
+            LODR_lm_scale=ngram_lm_scale,
+            LM=LM,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
+    elif params.decoding_method == "modified_beam_search_lm_rescore":
+        lm_scale_list = [0.01 * i for i in range(10, 50)]
+        ans_dict = modified_beam_search_lm_rescore(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+            LM=LM,
+            lm_scale_list=lm_scale_list,
+        )
+    elif params.decoding_method == "modified_beam_search_lm_rescore_LODR":
+        lm_scale_list = [0.02 * i for i in range(2, 30)]
+        ans_dict = modified_beam_search_lm_rescore_LODR(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+            LM=LM,
+            LODR_lm=ngram_lm,
+            sp=sp,
+            lm_scale_list=lm_scale_list,
+        )
     else:
         batch_size = encoder_out.size(0)
 
@@ -483,6 +578,16 @@ def decode_one_batch(
                 key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
 
         return {key: hyps}
+    elif params.decoding_method in (
+        "modified_beam_search_lm_rescore",
+        "modified_beam_search_lm_rescore_LODR",
+    ):
+        ans = dict()
+        assert ans_dict is not None
+        for key, hyps in ans_dict.items():
+            hyps = [sp.decode(hyp).split() for hyp in hyps]
+            ans[f"beam_size_{params.beam_size}_{key}"] = hyps
+        return ans
     else:
         return {f"beam_size_{params.beam_size}": hyps}
 
@@ -494,6 +599,9 @@ def decode_dataset(
     sp: spm.SentencePieceProcessor,
     word_table: Optional[k2.SymbolTable] = None,
     decoding_graph: Optional[k2.Fsa] = None,
+    LM: Optional[LmScorer] = None,
+    ngram_lm=None,
+    ngram_lm_scale: float = 0.0,
 ) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
     """Decode dataset.
 
@@ -543,6 +651,9 @@ def decode_dataset(
             decoding_graph=decoding_graph,
             word_table=word_table,
             batch=batch,
+            LM=LM,
+            ngram_lm=ngram_lm,
+            ngram_lm_scale=ngram_lm_scale,
         )
 
         for name, hyps in hyps_dict.items():
@@ -559,9 +670,7 @@ def decode_dataset(
         if batch_idx % log_interval == 0:
             batch_str = f"{batch_idx}/{num_batches}"
 
-            logging.info(
-                f"batch {batch_str}, cuts processed until now is {num_cuts}"
-            )
+            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
     return results
 
 
@@ -594,8 +703,7 @@ def save_results(
 
     test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
     errs_info = (
-        params.res_dir
-        / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
+        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
     )
     with open(errs_info, "w") as f:
         print("settings\tWER", file=f)
@@ -614,6 +722,7 @@ def save_results(
 def main():
     parser = get_parser()
     LibriSpeechAsrDataModule.add_arguments(parser)
+    LmScorer.add_arguments(parser)
     args = parser.parse_args()
     args.exp_dir = Path(args.exp_dir)
 
@@ -628,6 +737,10 @@ def main():
         "fast_beam_search_nbest_LG",
         "fast_beam_search_nbest_oracle",
         "modified_beam_search",
+        "modified_beam_search_LODR",
+        "modified_beam_search_lm_shallow_fusion",
+        "modified_beam_search_lm_rescore",
+        "modified_beam_search_lm_rescore_LODR",
     )
     params.res_dir = params.exp_dir / params.decoding_method
 
@@ -656,13 +769,19 @@ def main():
             if "LG" in params.decoding_method:
                 params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
     elif "beam_search" in params.decoding_method:
-        params.suffix += (
-            f"-{params.decoding_method}-beam-size-{params.beam_size}"
-        )
+        params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
     else:
         params.suffix += f"-context-{params.context_size}"
         params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
 
+    if params.use_shallow_fusion:
+        params.suffix += f"-{params.lm_type}-lm-scale-{params.lm_scale}"
+
+        if "LODR" in params.decoding_method:
+            params.suffix += (
+                f"-LODR-{params.tokens_ngram}gram-scale-{params.ngram_lm_scale}"
+            )
+
     if params.use_averaged_model:
         params.suffix += "-use-averaged-model"
 
@@ -690,9 +809,9 @@ def main():
 
     if not params.use_averaged_model:
         if params.iter > 0:
-            filenames = find_checkpoints(
-                params.exp_dir, iteration=-params.iter
-            )[: params.avg]
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
             if len(filenames) == 0:
                 raise ValueError(
                     f"No checkpoints found for"
@@ -719,9 +838,9 @@ def main():
             model.load_state_dict(average_checkpoints(filenames, device=device))
     else:
         if params.iter > 0:
-            filenames = find_checkpoints(
-                params.exp_dir, iteration=-params.iter
-            )[: params.avg + 1]
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
             if len(filenames) == 0:
                 raise ValueError(
                     f"No checkpoints found for"
@@ -768,6 +887,54 @@ def main():
     model.to(device)
     model.eval()
 
+    # only load the neural network LM if required
+    if params.use_shallow_fusion or params.decoding_method in (
+        "modified_beam_search_lm_rescore",
+        "modified_beam_search_lm_rescore_LODR",
+        "modified_beam_search_lm_shallow_fusion",
+        "modified_beam_search_LODR",
+    ):
+        LM = LmScorer(
+            lm_type=params.lm_type,
+            params=params,
+            device=device,
+            lm_scale=params.lm_scale,
+        )
+        LM.to(device)
+        LM.eval()
+    else:
+        LM = None
+
+    # only load N-gram LM when needed
+    if params.decoding_method == "modified_beam_search_lm_rescore_LODR":
+        try:
+            import kenlm
+        except ImportError:
+            print("Please install kenlm first. You can use")
+            print(" pip install https://github.com/kpu/kenlm/archive/master.zip")
+            print("to install it")
+            import sys
+
+            sys.exit(-1)
+        ngram_file_name = str(params.lang_dir / f"{params.tokens_ngram}gram.arpa")
+        logging.info(f"lm filename: {ngram_file_name}")
+        ngram_lm = kenlm.Model(ngram_file_name)
+        ngram_lm_scale = None  # use a list to search
+
+    elif params.decoding_method == "modified_beam_search_LODR":
+        lm_filename = f"{params.tokens_ngram}gram.fst.txt"
+        logging.info(f"Loading token level lm: {lm_filename}")
+        ngram_lm = NgramLm(
+            str(params.lang_dir / lm_filename),
+            backoff_id=params.backoff_id,
+            is_binary=False,
+        )
+        logging.info(f"num states: {ngram_lm.lm.num_states}")
+        ngram_lm_scale = params.ngram_lm_scale
+    else:
+        ngram_lm = None
+        ngram_lm_scale = None
+
     if "fast_beam_search" in params.decoding_method:
         if params.decoding_method == "fast_beam_search_nbest_LG":
             lexicon = Lexicon(params.lang_dir)
@@ -780,9 +947,7 @@ def main():
             decoding_graph.scores *= params.ngram_lm_scale
         else:
             word_table = None
-            decoding_graph = k2.trivial_graph(
-                params.vocab_size - 1, device=device
-            )
+            decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
     else:
         decoding_graph = None
         word_table = None
@@ -811,6 +976,9 @@ def main():
             sp=sp,
             word_table=word_table,
             decoding_graph=decoding_graph,
+            LM=LM,
+            ngram_lm=ngram_lm,
+            ngram_lm_scale=ngram_lm_scale,
         )
 
         save_results(
diff --git a/egs/librispeech/ASR/zipformer/decode_stream.py b/egs/librispeech/ASR/zipformer/decode_stream.py
index 946db275c..d6918bf32 100644
--- a/egs/librispeech/ASR/zipformer/decode_stream.py
+++ b/egs/librispeech/ASR/zipformer/decode_stream.py
@@ -79,12 +79,12 @@ class DecodeStream(object):
         self.pad_length = 7 + 2 * 3
 
         if params.decoding_method == "greedy_search":
-            self.hyp = [params.blank_id] * params.context_size
+            self.hyp = [-1] * (params.context_size - 1) + [params.blank_id]
         elif params.decoding_method == "modified_beam_search":
             self.hyps = HypothesisList()
             self.hyps.add(
                 Hypothesis(
-                    ys=[params.blank_id] * params.context_size,
+                    ys=[-1] * (params.context_size - 1) + [params.blank_id],
                     log_prob=torch.zeros(1, dtype=torch.float32, device=device),
                 )
             )
diff --git a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
index 80dc19b37..3eb06f68c 100755
--- a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
+++ b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
@@ -86,7 +86,7 @@ from icefall.checkpoint import (
     find_checkpoints,
     load_checkpoint,
 )
-from icefall.utils import make_pad_mask, str2bool
+from icefall.utils import str2bool
 
 
 def get_parser():
@@ -218,7 +218,7 @@ class OnnxEncoder(nn.Module):
         )
         assert x.size(1) == self.chunk_size, (x.size(1), self.chunk_size)
 
-        src_key_padding_mask = make_pad_mask(x_lens)
+        src_key_padding_mask = torch.zeros(N, self.chunk_size, dtype=torch.bool)
 
         # processed_mask is used to mask out initial states
         processed_mask = torch.arange(left_context_len, device=x.device).expand(
@@ -272,6 +272,7 @@ class OnnxEncoder(nn.Module):
         states = self.encoder.get_init_states(batch_size, device)
 
         embed_states = self.encoder_embed.get_init_states(batch_size, device)
+
         states.append(embed_states)
 
         processed_lens = torch.zeros(batch_size, dtype=torch.int64, device=device)
@@ -756,7 +757,7 @@ def main():
     quantize_dynamic(
         model_input=decoder_filename,
         model_output=decoder_filename_int8,
-        op_types_to_quantize=["MatMul"],
+        op_types_to_quantize=["MatMul", "Gather"],
         weight_type=QuantType.QInt8,
     )
 
diff --git a/egs/librispeech/ASR/zipformer/export-onnx.py b/egs/librispeech/ASR/zipformer/export-onnx.py
index 1bc10c896..724fdd2a6 100755
--- a/egs/librispeech/ASR/zipformer/export-onnx.py
+++ b/egs/librispeech/ASR/zipformer/export-onnx.py
@@ -602,7 +602,7 @@ def main():
     quantize_dynamic(
         model_input=decoder_filename,
         model_output=decoder_filename_int8,
-        op_types_to_quantize=["MatMul"],
+        op_types_to_quantize=["MatMul", "Gather"],
         weight_type=QuantType.QInt8,
     )
 
diff --git a/egs/librispeech/ASR/zipformer/jit_pretrained_ctc.py b/egs/librispeech/ASR/zipformer/jit_pretrained_ctc.py
index 14faeedd1..904d8cd76 100755
--- a/egs/librispeech/ASR/zipformer/jit_pretrained_ctc.py
+++ b/egs/librispeech/ASR/zipformer/jit_pretrained_ctc.py
@@ -264,7 +264,7 @@ def main():
     params.update(vars(args))
 
     token_table = k2.SymbolTable.from_file(params.tokens)
-    params.vocab_size = num_tokens(token_table)
+    params.vocab_size = num_tokens(token_table) + 1
 
     logging.info(f"{params}")
 
diff --git a/egs/librispeech/ASR/zipformer/model.py b/egs/librispeech/ASR/zipformer/model.py
index b541ee697..f2f86af47 100644
--- a/egs/librispeech/ASR/zipformer/model.py
+++ b/egs/librispeech/ASR/zipformer/model.py
@@ -320,7 +320,7 @@ class AsrModel(nn.Module):
         assert x_lens.ndim == 1, x_lens.shape
         assert y.num_axes == 2, y.num_axes
 
-        assert x.size(0) == x_lens.size(0) == y.dim0
+        assert x.size(0) == x_lens.size(0) == y.dim0, (x.shape, x_lens.shape, y.dim0)
 
         # Compute encoder outputs
         encoder_out, encoder_out_lens = self.forward_encoder(x, x_lens)
diff --git a/egs/librispeech/ASR/zipformer/onnx_decode.py b/egs/librispeech/ASR/zipformer/onnx_decode.py
new file mode 100755
index 000000000..2aca36ca9
--- /dev/null
+++ b/egs/librispeech/ASR/zipformer/onnx_decode.py
@@ -0,0 +1,323 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang,
+#                                                 Zengwei Yao,
+#                                                 Xiaoyu Yang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script loads ONNX exported models and uses them to decode the test sets.
+
+We use the pre-trained model from
+https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+as an example to show how to use this file.
+
+1. Download the pre-trained model
+
+cd egs/librispeech/ASR
+
+repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
+GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+repo=$(basename $repo_url)
+
+pushd $repo
+git lfs pull --include "data/lang_bpe_500/bpe.model"
+git lfs pull --include "exp/pretrained.pt"
+
+cd exp
+ln -s pretrained.pt epoch-99.pt
+popd
+
+2. Export the model to ONNX
+
+./zipformer/export-onnx.py \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+  --use-averaged-model 0 \
+  --epoch 99 \
+  --avg 1 \
+  --exp-dir $repo/exp \
+  --causal False
+
+It will generate the following 3 files inside $repo/exp:
+
+  - encoder-epoch-99-avg-1.onnx
+  - decoder-epoch-99-avg-1.onnx
+  - joiner-epoch-99-avg-1.onnx
+
+2. Run this file
+
+./zipformer/onnx_decode.py \
+  --exp-dir $repo/exp \
+  --max-duration 600 \
+  --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+  --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+  --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+  --tokens $repo/data/lang_bpe_500/tokens.txt \
+"""
+
+
+import argparse
+import logging
+import time
+from pathlib import Path
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+from asr_datamodule import LibriSpeechAsrDataModule
+
+from onnx_pretrained import greedy_search, OnnxModel
+
+from icefall.utils import setup_logger, store_transcripts, write_error_stats
+from k2 import SymbolTable
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--encoder-model-filename",
+        type=str,
+        required=True,
+        help="Path to the encoder onnx model. ",
+    )
+
+    parser.add_argument(
+        "--decoder-model-filename",
+        type=str,
+        required=True,
+        help="Path to the decoder onnx model. ",
+    )
+
+    parser.add_argument(
+        "--joiner-model-filename",
+        type=str,
+        required=True,
+        help="Path to the joiner onnx model. ",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="zipformer/exp",
+        help="The experiment dir",
+    )
+
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        help="""Path to tokens.txt.""",
+    )
+
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        help="Valid values are greedy_search and modified_beam_search",
+    )
+
+    return parser
+
+
+def decode_one_batch(
+    model: OnnxModel, token_table: SymbolTable, batch: dict
+) -> List[List[str]]:
+    """Decode one batch and return the result.
+    Currently it only greedy_search is supported.
+
+    Args:
+      model:
+        The neural model.
+      token_table:
+        The token table.
+      batch:
+        It is the return value from iterating
+        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
+        for the format of the `batch`.
+
+    Returns:
+      Return the decoded results for each utterance.
+    """
+    feature = batch["inputs"]
+    assert feature.ndim == 3
+    # at entry, feature is (N, T, C)
+
+    supervisions = batch["supervisions"]
+    feature_lens = supervisions["num_frames"].to(dtype=torch.int64)
+
+    encoder_out, encoder_out_lens = model.run_encoder(x=feature, x_lens=feature_lens)
+
+    hyps = greedy_search(
+        model=model, encoder_out=encoder_out, encoder_out_lens=encoder_out_lens
+    )
+
+    def token_ids_to_words(token_ids: List[int]) -> str:
+        text = ""
+        for i in token_ids:
+            text += token_table[i]
+        return text.replace("▁", " ").strip()
+
+    hyps = [token_ids_to_words(h).split() for h in hyps]
+    return hyps
+
+
+def decode_dataset(
+    dl: torch.utils.data.DataLoader,
+    model: nn.Module,
+    token_table: SymbolTable,
+) -> Tuple[List[Tuple[str, List[str], List[str]]], float]:
+    """Decode dataset.
+
+    Args:
+      dl:
+        PyTorch's dataloader containing the dataset to decode.
+      model:
+        The neural model.
+      token_table:
+        The token table.
+
+    Returns:
+      - A list of tuples. Each tuple contains three elements:
+         - cut_id,
+         - reference transcript,
+         - predicted result.
+      - The total duration (in seconds) of the dataset.
+    """
+    num_cuts = 0
+
+    try:
+        num_batches = len(dl)
+    except TypeError:
+        num_batches = "?"
+
+    log_interval = 10
+    total_duration = 0
+
+    results = []
+    for batch_idx, batch in enumerate(dl):
+        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
+        total_duration += sum([cut.duration for cut in batch["supervisions"]["cut"]])
+
+        hyps = decode_one_batch(model=model, token_table=token_table, batch=batch)
+
+        this_batch = []
+        assert len(hyps) == len(texts)
+        for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
+            ref_words = ref_text.split()
+            this_batch.append((cut_id, ref_words, hyp_words))
+
+        results.extend(this_batch)
+
+        num_cuts += len(texts)
+
+        if batch_idx % log_interval == 0:
+            batch_str = f"{batch_idx}/{num_batches}"
+
+            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
+
+    return results, total_duration
+
+
+def save_results(
+    res_dir: Path,
+    test_set_name: str,
+    results: List[Tuple[str, List[str], List[str]]],
+):
+    recog_path = res_dir / f"recogs-{test_set_name}.txt"
+    results = sorted(results)
+    store_transcripts(filename=recog_path, texts=results)
+    logging.info(f"The transcripts are stored in {recog_path}")
+
+    # The following prints out WERs, per-word error statistics and aligned
+    # ref/hyp pairs.
+    errs_filename = res_dir / f"errs-{test_set_name}.txt"
+    with open(errs_filename, "w") as f:
+        wer = write_error_stats(f, f"{test_set_name}", results, enable_log=True)
+
+    logging.info("Wrote detailed error stats to {}".format(errs_filename))
+
+    errs_info = res_dir / f"wer-summary-{test_set_name}.txt"
+    with open(errs_info, "w") as f:
+        print("WER", file=f)
+        print(wer, file=f)
+
+    s = "\nFor {}, WER is {}:\n".format(test_set_name, wer)
+    logging.info(s)
+
+
+@torch.no_grad()
+def main():
+    parser = get_parser()
+    LibriSpeechAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+
+    assert (
+        args.decoding_method == "greedy_search"
+    ), "Only supports greedy_search currently."
+    res_dir = Path(args.exp_dir) / f"onnx-{args.decoding_method}"
+
+    setup_logger(f"{res_dir}/log-decode")
+    logging.info("Decoding started")
+
+    device = torch.device("cpu")
+    logging.info(f"Device: {device}")
+
+    token_table = SymbolTable.from_file(args.tokens)
+
+    logging.info(vars(args))
+
+    logging.info("About to create model")
+    model = OnnxModel(
+        encoder_model_filename=args.encoder_model_filename,
+        decoder_model_filename=args.decoder_model_filename,
+        joiner_model_filename=args.joiner_model_filename,
+    )
+
+    # we need cut ids to display recognition results.
+    args.return_cuts = True
+    librispeech = LibriSpeechAsrDataModule(args)
+
+    test_clean_cuts = librispeech.test_clean_cuts()
+    test_other_cuts = librispeech.test_other_cuts()
+
+    test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
+    test_other_dl = librispeech.test_dataloaders(test_other_cuts)
+
+    test_sets = ["test-clean", "test-other"]
+    test_dl = [test_clean_dl, test_other_dl]
+
+    for test_set, test_dl in zip(test_sets, test_dl):
+        start_time = time.time()
+        results, total_duration = decode_dataset(dl=test_dl, model=model, token_table=token_table)
+        end_time = time.time()
+        elapsed_seconds = end_time - start_time
+        rtf = elapsed_seconds / total_duration
+
+        logging.info(f"Elapsed time: {elapsed_seconds:.3f} s")
+        logging.info(f"Wave duration: {total_duration:.3f} s")
+        logging.info(
+            f"Real time factor (RTF): {elapsed_seconds:.3f}/{total_duration:.3f} = {rtf:.3f}"
+        )
+
+        save_results(res_dir=res_dir, test_set_name=test_set, results=results)
+
+    logging.info("Done!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/ASR/zipformer/onnx_pretrained.py b/egs/librispeech/ASR/zipformer/onnx_pretrained.py
index b821c4e19..e8a521460 100755
--- a/egs/librispeech/ASR/zipformer/onnx_pretrained.py
+++ b/egs/librispeech/ASR/zipformer/onnx_pretrained.py
@@ -56,7 +56,7 @@ It will generate the following 3 files inside $repo/exp:
 
 3. Run this file
 
-./pruned_transducer_stateless3/onnx_pretrained.py \
+./zipformer/onnx_pretrained.py \
   --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
   --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
   --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
diff --git a/egs/librispeech/ASR/zipformer/scaling.py b/egs/librispeech/ASR/zipformer/scaling.py
index 9f23eeead..7c98ef045 100644
--- a/egs/librispeech/ASR/zipformer/scaling.py
+++ b/egs/librispeech/ASR/zipformer/scaling.py
@@ -25,6 +25,11 @@ import math
 import torch.nn as nn
 from torch import Tensor
 
+def logaddexp_onnx(x: Tensor, y: Tensor) -> Tensor:
+    max_value = torch.max(x, y)
+    diff = torch.abs(x - y)
+    return max_value + torch.log1p(torch.exp(-diff))
+
 
 # RuntimeError: Exporting the operator logaddexp to ONNX opset version
 # 14 is not supported. Please feel free to request support or submit
@@ -33,10 +38,22 @@ from torch import Tensor
 # The following function is to solve the above error when exporting
 # models to ONNX via torch.jit.trace()
 def logaddexp(x: Tensor, y: Tensor) -> Tensor:
-    if not torch.jit.is_tracing():
+    # Caution(fangjun): Put torch.jit.is_scripting() before
+    # torch.onnx.is_in_onnx_export();
+    # otherwise, it will cause errors for torch.jit.script().
+    #
+    # torch.logaddexp() works for both torch.jit.script() and
+    # torch.jit.trace() but it causes errors for ONNX export.
+    #
+    if torch.jit.is_scripting():
+        # Note: We cannot use torch.jit.is_tracing() here as it also
+        # matches torch.onnx.export().
         return torch.logaddexp(x, y)
+    elif torch.onnx.is_in_onnx_export():
+        return logaddexp_onnx(x, y)
     else:
-        return (x.exp() + y.exp()).log()
+        # for torch.jit.trace()
+        return torch.logaddexp(x, y)
 
 class PiecewiseLinear(object):
     """
@@ -108,7 +125,7 @@ class PiecewiseLinear(object):
                          p: 'PiecewiseLinear',
                          include_crossings: bool = False):
         """
-        Returns (self_mod, p_mod) which are equivalent piecewise lienar
+        Returns (self_mod, p_mod) which are equivalent piecewise linear
         functions to self and p, but with the same x values.
 
           p: the other piecewise linear function
@@ -149,7 +166,7 @@ class ScheduledFloat(torch.nn.Module):
     in, float(parent_module.whatever), and use it as something like a dropout prob.
 
     It is a floating point value whose value changes depending on the batch count of the
-    training loop.  It is a piecewise linear function where you specifiy the (x,y) pairs
+    training loop.  It is a piecewise linear function where you specify the (x,y) pairs
     in sorted order on x; x corresponds to the batch index.  For batch-index values before the
     first x or after the last x, we just use the first or last y value.
 
@@ -326,7 +343,7 @@ class MaxEigLimiterFunction(torch.autograd.Function):
 class BiasNormFunction(torch.autograd.Function):
     # This computes:
     #   scales = (torch.mean((x - bias) ** 2, keepdim=True)) ** -0.5 * log_scale.exp()
-    #   return (x - bias) * scales
+    #   return x * scales
     # (after unsqueezing the bias), but it does it in a memory-efficient way so that
     # it can just store the returned value (chances are, this will also be needed for
     # some other reason, related to the next operation, so we can save memory).
@@ -383,8 +400,8 @@ class BiasNorm(torch.nn.Module):
     Args:
        num_channels: the number of channels, e.g. 512.
        channel_dim: the axis/dimension corresponding to the channel,
-         interprted as an offset from the input's ndim if negative.
-         shis is NOT the num_channels; it should typically be one of
+         interpreted as an offset from the input's ndim if negative.
+         This is NOT the num_channels; it should typically be one of
          {-2, -1, 0, 1, 2, 3}.
       log_scale: the initial log-scale that we multiply the output by; this
          is learnable.
@@ -1269,7 +1286,7 @@ class Dropout3(nn.Module):
 
 class SwooshLFunction(torch.autograd.Function):
     """
-      swoosh(x) =  log(1 + exp(x-4)) - 0.08*x - 0.035
+      swoosh_l(x) =  log(1 + exp(x-4)) - 0.08*x - 0.035
     """
 
     @staticmethod
@@ -1334,10 +1351,17 @@ class SwooshL(torch.nn.Module):
             return k2.swoosh_l(x)
         # return SwooshLFunction.apply(x)
 
+class SwooshLOnnx(torch.nn.Module):
+    def forward(self, x: Tensor) -> Tensor:
+        """Return Swoosh-L activation.
+        """
+        zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
+        return logaddexp_onnx(zero, x - 4.0) - 0.08 * x - 0.035
+
 
 class SwooshRFunction(torch.autograd.Function):
     """
-      swoosh(x) =  log(1 + exp(x-1)) - 0.08*x - 0.313261687
+      swoosh_r(x) =  log(1 + exp(x-1)) - 0.08*x - 0.313261687
 
      derivatives are between -0.08 and 0.92.
     """
@@ -1400,6 +1424,13 @@ class SwooshR(torch.nn.Module):
             return k2.swoosh_r(x)
         # return SwooshRFunction.apply(x)
 
+class SwooshROnnx(torch.nn.Module):
+    def forward(self, x: Tensor) -> Tensor:
+        """Return Swoosh-R activation.
+        """
+        zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
+        return logaddexp_onnx(zero, x - 1.) - 0.08 * x - 0.313261687
+
 
 # simple version of SwooshL that does not redefine the backprop, used in
 # ActivationDropoutAndLinearFunction.
diff --git a/egs/librispeech/ASR/zipformer/scaling_converter.py b/egs/librispeech/ASR/zipformer/scaling_converter.py
index 54a5c2a6a..76622fa12 100644
--- a/egs/librispeech/ASR/zipformer/scaling_converter.py
+++ b/egs/librispeech/ASR/zipformer/scaling_converter.py
@@ -26,7 +26,16 @@ from typing import List, Tuple
 
 import torch
 import torch.nn as nn
-from scaling import Balancer, Dropout3, ScaleGrad, Whiten
+from scaling import (
+    Balancer,
+    Dropout3,
+    ScaleGrad,
+    SwooshL,
+    SwooshLOnnx,
+    SwooshR,
+    SwooshROnnx,
+    Whiten,
+)
 from zipformer import CompactRelPositionalEncoding
 
 
@@ -75,6 +84,10 @@ def convert_scaled_to_non_scaled(
     for name, m in model.named_modules():
         if isinstance(m, (Balancer, Dropout3, ScaleGrad, Whiten)):
             d[name] = nn.Identity()
+        elif is_onnx and isinstance(m, SwooshR):
+            d[name] = SwooshROnnx()
+        elif is_onnx and isinstance(m, SwooshL):
+            d[name] = SwooshLOnnx()
         elif is_onnx and isinstance(m, CompactRelPositionalEncoding):
             # We want to recreate the positional encoding vector when
             # the input changes, so we have to use torch.jit.script()
diff --git a/egs/librispeech/ASR/zipformer/subsampling.py b/egs/librispeech/ASR/zipformer/subsampling.py
index d6bf57db4..6532ddccb 100644
--- a/egs/librispeech/ASR/zipformer/subsampling.py
+++ b/egs/librispeech/ASR/zipformer/subsampling.py
@@ -138,9 +138,11 @@ class ConvNeXt(nn.Module):
 
         x = bypass + x
         x = self.out_balancer(x)
-        x = x.transpose(1, 3)  # (N, W, H, C); need channel dim to be last
-        x = self.out_whiten(x)
-        x = x.transpose(1, 3)  # (N, C, H, W)
+
+        if x.requires_grad:
+            x = x.transpose(1, 3)  # (N, W, H, C); need channel dim to be last
+            x = self.out_whiten(x)
+            x = x.transpose(1, 3)  # (N, C, H, W)
 
         return x
 
@@ -266,6 +268,7 @@ class Conv2dSubsampling(nn.Module):
         # just one convnext layer
         self.convnext = ConvNeXt(layer3_channels, kernel_size=(7, 7))
 
+        # (in_channels-3)//4
         self.out_width = (((in_channels - 1) // 2) - 1) // 2
         self.layer3_channels = layer3_channels
 
@@ -299,7 +302,7 @@ class Conv2dSubsampling(nn.Module):
             A tensor of shape (batch_size,) containing the number of frames in
 
         Returns:
-          - a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
+          - a tensor of shape (N, (T-7)//2, odim)
           - output lengths, of shape (batch_size,)
         """
         # On entry, x is (N, T, idim)
@@ -310,14 +313,14 @@ class Conv2dSubsampling(nn.Module):
         x = self.conv(x)
         x = self.convnext(x)
 
-        # Now x is of shape (N, odim, ((T-3)//2 - 1)//2, ((idim-1)//2 - 1)//2)
+        # Now x is of shape (N, odim, (T-7)//2, (idim-3)//4)
         b, c, t, f = x.size()
 
         x = x.transpose(1, 2).reshape(b, t, c * f)
-        # now x: (N, ((T-1)//2 - 1))//2, out_width * layer3_channels))
+        # now x: (N, (T-7)//2, out_width * layer3_channels))
 
         x = self.out(x)
-        # Now x is of shape (N, ((T-1)//2 - 1))//2, odim)
+        # Now x is of shape (N, (T-7)//2, odim)
         x = self.out_whiten(x)
         x = self.out_norm(x)
         x = self.dropout(x)
@@ -328,7 +331,7 @@ class Conv2dSubsampling(nn.Module):
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
                 x_lens = (x_lens - 7) // 2
-        assert x.size(1) == x_lens.max().item()
+        assert x.size(1) == x_lens.max().item() , (x.size(1), x_lens.max())
 
         return x, x_lens
 
@@ -347,7 +350,7 @@ class Conv2dSubsampling(nn.Module):
             A tensor of shape (batch_size,) containing the number of frames in
 
         Returns:
-          - a tensor of shape (N, ((T-1)//2 - 1)//2, odim)
+          - a tensor of shape (N, (T-7)//2, odim)
           - output lengths, of shape (batch_size,)
           - updated cache
         """
@@ -383,7 +386,7 @@ class Conv2dSubsampling(nn.Module):
                 assert self.convnext.padding[0] == 3
                 x_lens = (x_lens - 7) // 2 - 3
 
-        assert x.size(1) == x_lens.max().item()
+        assert x.size(1) == x_lens.max().item(), (x.shape, x_lens.max())
 
         return x, x_lens, cached_left_pad
 
diff --git a/egs/librispeech/ASR/zipformer/test_scaling.py b/egs/librispeech/ASR/zipformer/test_scaling.py
new file mode 100755
index 000000000..5c04291e7
--- /dev/null
+++ b/egs/librispeech/ASR/zipformer/test_scaling.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+
+import matplotlib.pyplot as plt
+import torch
+from scaling import PiecewiseLinear, ScheduledFloat, SwooshL, SwooshR
+
+
+def test_piecewise_linear():
+    # An identity map in the range [0, 1].
+    # 1 - identity map in the range [1, 2]
+    # x1=0, y1=0
+    # x2=1, y2=1
+    # x3=2, y3=0
+    pl = PiecewiseLinear((0, 0), (1, 1), (2, 0))
+    assert pl(0.25) == 0.25, pl(0.25)
+    assert pl(0.625) == 0.625, pl(0.625)
+    assert pl(1.25) == 0.75, pl(1.25)
+
+    assert pl(-10) == pl(0), pl(-10)  # out of range
+    assert pl(10) == pl(2), pl(10)  # out of range
+
+    # multiplication
+    pl10 = pl * 10
+    assert pl10(1) == 10 * pl(1)
+    assert pl10(0.5) == 10 * pl(0.5)
+
+
+def test_scheduled_float():
+    # Initial value is 0.2 and it decreases linearly towards 0 at 4000
+    dropout = ScheduledFloat((0, 0.2), (4000, 0.0), default=0.0)
+    dropout.batch_count = 0
+    assert float(dropout) == 0.2, (float(dropout), dropout.batch_count)
+
+    dropout.batch_count = 1000
+    assert abs(float(dropout) - 0.15) < 1e-5, (float(dropout), dropout.batch_count)
+
+    dropout.batch_count = 2000
+    assert float(dropout) == 0.1, (float(dropout), dropout.batch_count)
+
+    dropout.batch_count = 3000
+    assert abs(float(dropout) - 0.05) < 1e-5, (float(dropout), dropout.batch_count)
+
+    dropout.batch_count = 4000
+    assert float(dropout) == 0.0, (float(dropout), dropout.batch_count)
+
+    dropout.batch_count = 5000  # out of range
+    assert float(dropout) == 0.0, (float(dropout), dropout.batch_count)
+
+
+def test_swoosh():
+    x1 = torch.linspace(start=-10, end=0, steps=100, dtype=torch.float32)
+    x2 = torch.linspace(start=0, end=10, steps=100, dtype=torch.float32)
+    x = torch.cat([x1, x2[1:]])
+
+    left = SwooshL()(x)
+    r = SwooshR()(x)
+
+    relu = torch.nn.functional.relu(x)
+    print(left[x == 0], r[x == 0])
+    plt.plot(x, left, "k")
+    plt.plot(x, r, "r")
+    plt.plot(x, relu, "b")
+    plt.axis([-10, 10, -1, 10])  # [xmin, xmax, ymin, ymax]
+    plt.legend(
+        [
+            "SwooshL(x) = log(1 + exp(x-4)) - 0.08x - 0.035 ",
+            "SwooshR(x) = log(1 + exp(x-1)) - 0.08x - 0.313261687",
+            "ReLU(x) = max(0, x)",
+        ]
+    )
+    plt.grid()
+    plt.savefig("swoosh.pdf")
+
+
+def main():
+    test_piecewise_linear()
+    test_scheduled_float()
+    test_swoosh()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/ASR/zipformer/test_subsampling.py b/egs/librispeech/ASR/zipformer/test_subsampling.py
new file mode 100755
index 000000000..078227fb6
--- /dev/null
+++ b/egs/librispeech/ASR/zipformer/test_subsampling.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+
+import torch
+from scaling import ScheduledFloat
+from subsampling import Conv2dSubsampling
+
+
+def test_conv2d_subsampling():
+    layer1_channels = 8
+    layer2_channels = 32
+    layer3_channels = 128
+
+    out_channels = 192
+    encoder_embed = Conv2dSubsampling(
+        in_channels=80,
+        out_channels=out_channels,
+        layer1_channels=layer1_channels,
+        layer2_channels=layer2_channels,
+        layer3_channels=layer3_channels,
+        dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)),
+    )
+    N = 2
+    T = 200
+    num_features = 80
+    x = torch.rand(N, T, num_features)
+    x_copy = x.clone()
+
+    x = x.unsqueeze(1)  # (N, 1, T, num_features)
+
+    x = encoder_embed.conv[0](x)  # conv2d, in 1, out 8, kernel 3, padding (0,1)
+    assert x.shape == (N, layer1_channels, T - 2, num_features)
+    # (2, 8, 198, 80)
+
+    x = encoder_embed.conv[1](x)  # scale grad
+    x = encoder_embed.conv[2](x)  # balancer
+    x = encoder_embed.conv[3](x)  # swooshR
+
+    x = encoder_embed.conv[4](x)  # conv2d, in 8, out 32, kernel 3, stride 2
+    assert x.shape == (
+        N,
+        layer2_channels,
+        ((T - 2) - 3) // 2 + 1,
+        (num_features - 3) // 2 + 1,
+    )
+    # (2, 32, 98, 39)
+
+    x = encoder_embed.conv[5](x)  # balancer
+    x = encoder_embed.conv[6](x)  # swooshR
+
+    # conv2d:
+    # in 32, out 128, kernel 3, stride (1, 2)
+    x = encoder_embed.conv[7](x)
+    assert x.shape == (
+        N,
+        layer3_channels,
+        (((T - 2) - 3) // 2 + 1) - 2,
+        (((num_features - 3) // 2 + 1) - 3) // 2 + 1,
+    )
+    # (2, 128, 96, 19)
+
+    x = encoder_embed.conv[8](x)  # balancer
+    x = encoder_embed.conv[9](x)  # swooshR
+
+    # (((T - 2) - 3) // 2 + 1) - 2
+    # = (T - 2) - 3) // 2 + 1 - 2
+    # = ((T - 2) - 3) // 2 - 1
+    # = (T - 2 - 3) // 2 - 1
+    # = (T - 5) // 2 - 1
+    # = (T - 7) // 2
+    assert x.shape[2] == (x_copy.shape[1] - 7) // 2
+
+    # (((num_features - 3) // 2 + 1) - 3) // 2 + 1,
+    # = ((num_features - 3) // 2 + 1 - 3) // 2 + 1,
+    # = ((num_features - 3) // 2 - 2) // 2 + 1,
+    # = (num_features - 3 - 4) // 2 // 2 + 1,
+    # = (num_features - 7) // 2 // 2 + 1,
+    # = (num_features - 7) // 4 + 1,
+    # = (num_features - 3) // 4
+    assert x.shape[3] == (x_copy.shape[2] - 3) // 4
+
+    assert x.shape == (N, layer3_channels, (T - 7) // 2, (num_features - 3) // 4)
+
+    # Input shape to convnext is
+    #
+    # (N, layer3_channels, (T-7)//2, (num_features - 3)//4)
+
+    # conv2d: in layer3_channels, out layer3_channels, groups layer3_channels
+    # kernel_size 7, padding 3
+    x = encoder_embed.convnext.depthwise_conv(x)
+    assert x.shape == (N, layer3_channels, (T - 7) // 2, (num_features - 3) // 4)
+
+    # conv2d: in layer3_channels, out hidden_ratio * layer3_channels, kernel_size 1
+    x = encoder_embed.convnext.pointwise_conv1(x)
+    assert x.shape == (N, layer3_channels * 3, (T - 7) // 2, (num_features - 3) // 4)
+
+    x = encoder_embed.convnext.hidden_balancer(x)  # balancer
+    x = encoder_embed.convnext.activation(x)  # swooshL
+
+    # conv2d: in hidden_ratio * layer3_channels, out layer3_channels, kernel 1
+    x = encoder_embed.convnext.pointwise_conv2(x)
+    assert x.shape == (N, layer3_channels, (T - 7) // 2, (num_features - 3) // 4)
+
+    # bypass and layer drop, omitted here.
+    x = encoder_embed.convnext.out_balancer(x)
+
+    # Note: the input and output shape of ConvNeXt are the same
+
+    x = x.transpose(1, 2).reshape(N, (T - 7) // 2, -1)
+    assert x.shape == (N, (T - 7) // 2, layer3_channels * ((num_features - 3) // 4))
+
+    x = encoder_embed.out(x)
+    assert x.shape == (N, (T - 7) // 2, out_channels)
+
+    x = encoder_embed.out_whiten(x)
+    x = encoder_embed.out_norm(x)
+    # final layer is dropout
+
+    # test streaming forward
+
+    subsampling_factor = 2
+    cached_left_padding = encoder_embed.get_init_states(batch_size=N)
+    depthwise_conv_kernel_size = 7
+    pad_size = (depthwise_conv_kernel_size - 1) // 2
+
+    assert cached_left_padding.shape == (
+        N,
+        layer3_channels,
+        pad_size,
+        (num_features - 3) // 4,
+    )
+
+    chunk_size = 16
+    right_padding = pad_size * subsampling_factor
+    T = chunk_size * subsampling_factor + 7 + right_padding
+    x = torch.rand(N, T, num_features)
+    x_lens = torch.tensor([T] * N)
+    y, y_lens, next_cached_left_padding = encoder_embed.streaming_forward(
+        x, x_lens, cached_left_padding
+    )
+
+    assert y.shape == (N, chunk_size, out_channels), y.shape
+    assert next_cached_left_padding.shape == cached_left_padding.shape
+
+    assert y.shape[1] == y_lens[0] == y_lens[1]
+
+
+def main():
+    test_conv2d_subsampling()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/ASR/zipformer/zipformer.py b/egs/librispeech/ASR/zipformer/zipformer.py
index 7d98dbeb1..b39af02b8 100644
--- a/egs/librispeech/ASR/zipformer/zipformer.py
+++ b/egs/librispeech/ASR/zipformer/zipformer.py
@@ -219,7 +219,7 @@ class Zipformer2(EncoderInterface):
 
         (num_frames0, batch_size, _encoder_dims0) = x.shape
 
-        assert self.encoder_dim[0] == _encoder_dims0
+        assert self.encoder_dim[0] == _encoder_dims0, (self.encoder_dim[0], _encoder_dims0)
 
         feature_mask_dropout_prob = 0.125
 
@@ -334,7 +334,7 @@ class Zipformer2(EncoderInterface):
         x = self._get_full_dim_output(outputs)
         x = self.downsample_output(x)
         # class Downsample has this rounding behavior..
-        assert self.output_downsampling_factor == 2
+        assert self.output_downsampling_factor == 2, self.output_downsampling_factor
         if torch.jit.is_scripting() or torch.jit.is_tracing():
             lengths = (x_lens + 1) // 2
         else:
diff --git a/egs/tedlium3/ASR/RESULTS.md b/egs/tedlium3/ASR/RESULTS.md
index 38eaa8f44..bd8a5b43f 100644
--- a/egs/tedlium3/ASR/RESULTS.md
+++ b/egs/tedlium3/ASR/RESULTS.md
@@ -1,5 +1,111 @@
 ## Results
 
+### TedLium3 BPE training results (Zipformer)
+
+#### 2023-06-15 (Regular transducer)
+
+Using the codes from this PR https://github.com/k2-fsa/icefall/pull/1125.
+
+Number of model parameters: 65549011, i.e., 65.5 M
+
+The WERs are
+
+|                                    |     dev    |    test    | comment                                  |
+|------------------------------------|------------|------------|------------------------------------------|
+|          greedy search             | 6.74       | 6.16       | --epoch 50, --avg 22, --max-duration 500 |
+|      beam search (beam size 4)     | 6.56       | 5.95       | --epoch 50, --avg 22, --max-duration 500 |
+| modified beam search (beam size 4) | 6.54       | 6.00       | --epoch 50, --avg 22, --max-duration 500 |
+| fast beam search (set as default)  | 6.91       | 6.28       | --epoch 50, --avg 22, --max-duration 500 |
+
+The training command for reproducing is given below:
+
+```
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+./zipformer/train.py \
+  --use-fp16 true \
+  --world-size 4 \
+  --num-epochs 50 \
+  --start-epoch 0 \
+  --exp-dir zipformer/exp \
+  --max-duration 1000
+```
+
+The tensorboard training log can be found at
+https://tensorboard.dev/experiment/AKXbJha0S9aXyfmuvG4h5A/#scalars
+
+The decoding command is:
+```
+epoch=50
+avg=22
+
+## greedy search
+./zipformer/decode.py \
+  --epoch $epoch \
+  --avg $avg \
+  --exp-dir zipformer/exp \
+  --bpe-model ./data/lang_bpe_500/bpe.model \
+  --max-duration 500
+
+## beam search
+./zipformer/decode.py \
+  --epoch $epoch \
+  --avg $avg \
+  --exp-dir zipformer/exp \
+  --bpe-model ./data/lang_bpe_500/bpe.model \
+  --max-duration 500 \
+  --decoding-method beam_search \
+  --beam-size 4
+
+## modified beam search
+./zipformer/decode.py \
+  --epoch $epoch \
+  --avg $avg \
+  --exp-dir zipformer/exp \
+  --bpe-model ./data/lang_bpe_500/bpe.model \
+  --max-duration 500 \
+  --decoding-method modified_beam_search \
+  --beam-size 4
+
+## fast beam search
+./zipformer/decode.py \
+  --epoch $epoch \
+  --avg $avg \
+  --exp-dir ./zipformer/exp \
+  --bpe-model ./data/lang_bpe_500/bpe.model \
+  --max-duration 1500 \
+  --decoding-method fast_beam_search \
+  --beam 4 \
+  --max-contexts 4 \
+  --max-states 8
+```
+
+A pre-trained model and decoding logs can be found at <https://huggingface.co/desh2608/icefall-asr-tedlium3-zipformer>
+
+#### 2023-06-26 (Modified transducer)
+
+```
+./zipformer/train.py \
+  --use-fp16 true \
+  --world-size 4 \
+  --num-epochs 50 \
+  --start-epoch 0 \
+  --exp-dir zipformer/exp \
+  --max-duration 1000 \
+  --rnnt-type modified
+```
+
+The tensorboard training log can be found at
+https://tensorboard.dev/experiment/3d4bYmbJTGiWQQaW88CVEQ/#scalars
+
+|                                    |     dev    |    test    | comment                                  |
+|------------------------------------|------------|------------|------------------------------------------|
+|          greedy search             | 6.32       | 5.83       | --epoch 50, --avg 22, --max-duration 500 |
+| modified beam search (beam size 4) | 6.16       | 5.79       | --epoch 50, --avg 22, --max-duration 500 |
+| fast beam search (set as default)  | 6.30       | 5.89       | --epoch 50, --avg 22, --max-duration 500 |
+
+A pre-trained model and decoding logs can be found at <https://huggingface.co/desh2608/icefall-asr-tedlium3-zipformer>.
+
 ### TedLium3 BPE training results (Conformer-CTC 2)
 
 #### [conformer_ctc2](./conformer_ctc2)
diff --git a/egs/tedlium3/ASR/zipformer/__init__.py b/egs/tedlium3/ASR/zipformer/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/egs/tedlium3/ASR/zipformer/asr_datamodule.py b/egs/tedlium3/ASR/zipformer/asr_datamodule.py
new file mode 120000
index 000000000..49b2ee483
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/asr_datamodule.py
@@ -0,0 +1 @@
+../transducer_stateless/asr_datamodule.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/beam_search.py b/egs/tedlium3/ASR/zipformer/beam_search.py
new file mode 120000
index 000000000..e24eca39f
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/beam_search.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/pruned_transducer_stateless2/beam_search.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/decode.py b/egs/tedlium3/ASR/zipformer/decode.py
new file mode 100755
index 000000000..ea1cbba1b
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/decode.py
@@ -0,0 +1,833 @@
+#!/usr/bin/env python3
+#
+# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang,
+#                                                 Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+(1) greedy search
+./zipformer/decode.py \
+    --epoch 30 \
+    --avg 9 \
+    --exp-dir ./zipformer/exp \
+    --max-duration 600 \
+    --decoding-method greedy_search
+
+(2) beam search (not recommended)
+./zipformer/decode.py \
+    --epoch 30 \
+    --avg 9 \
+    --exp-dir ./zipformer/exp \
+    --max-duration 600 \
+    --decoding-method beam_search \
+    --beam-size 4
+
+(3) modified beam search
+./zipformer/decode.py \
+    --epoch 30 \
+    --avg 9 \
+    --exp-dir ./zipformer/exp \
+    --max-duration 600 \
+    --decoding-method modified_beam_search \
+    --beam-size 4
+
+(4) fast beam search (one best)
+./zipformer/decode.py \
+    --epoch 30 \
+    --avg 9 \
+    --exp-dir ./zipformer/exp \
+    --max-duration 600 \
+    --decoding-method fast_beam_search \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64
+
+(5) fast beam search (nbest)
+./zipformer/decode.py \
+    --epoch 30 \
+    --avg 9 \
+    --exp-dir ./zipformer/exp \
+    --max-duration 600 \
+    --decoding-method fast_beam_search_nbest \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64 \
+    --num-paths 200 \
+    --nbest-scale 0.5
+
+(6) fast beam search (nbest oracle WER)
+./zipformer/decode.py \
+    --epoch 30 \
+    --avg 9 \
+    --exp-dir ./zipformer/exp \
+    --max-duration 600 \
+    --decoding-method fast_beam_search_nbest_oracle \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64 \
+    --num-paths 200 \
+    --nbest-scale 0.5
+
+(7) fast beam search (with LG)
+./zipformer/decode.py \
+    --epoch 30 \
+    --avg 9 \
+    --exp-dir ./zipformer/exp \
+    --max-duration 600 \
+    --decoding-method fast_beam_search_nbest_LG \
+    --beam 20.0 \
+    --max-contexts 8 \
+    --max-states 64
+"""
+
+
+import argparse
+import logging
+import math
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import k2
+import sentencepiece as spm
+import torch
+import torch.nn as nn
+from asr_datamodule import TedLiumAsrDataModule
+from beam_search import (
+    beam_search,
+    fast_beam_search_nbest,
+    fast_beam_search_nbest_LG,
+    fast_beam_search_nbest_oracle,
+    fast_beam_search_one_best,
+    greedy_search,
+    greedy_search_batch,
+    modified_beam_search,
+)
+from train import add_model_arguments, get_params, get_transducer_model
+
+from icefall.checkpoint import (
+    average_checkpoints,
+    average_checkpoints_with_averaged_model,
+    find_checkpoints,
+    load_checkpoint,
+)
+from icefall.lexicon import Lexicon
+from icefall.utils import (
+    AttributeDict,
+    make_pad_mask,
+    setup_logger,
+    store_transcripts,
+    str2bool,
+    write_error_stats,
+)
+
+LOG_EPS = math.log(1e-10)
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--epoch",
+        type=int,
+        default=30,
+        help="""It specifies the checkpoint to use for decoding.
+        Note: Epoch counts from 1.
+        You can specify --avg to use more checkpoints for model averaging.""",
+    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
+    parser.add_argument(
+        "--avg",
+        type=int,
+        default=15,
+        help="Number of checkpoints to average. Automatically select "
+        "consecutive checkpoints before the checkpoint specified by "
+        "'--epoch' and '--iter'",
+    )
+
+    parser.add_argument(
+        "--use-averaged-model",
+        type=str2bool,
+        default=True,
+        help="Whether to load averaged model. Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="zipformer/exp",
+        help="The experiment dir",
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--lang-dir",
+        type=Path,
+        default="data/lang_bpe_500",
+        help="The lang dir containing word table and LG graph",
+    )
+
+    parser.add_argument(
+        "--decoding-method",
+        type=str,
+        default="greedy_search",
+        help="""Possible values are:
+          - greedy_search
+          - beam_search
+          - modified_beam_search
+          - fast_beam_search
+          - fast_beam_search_nbest
+          - fast_beam_search_nbest_oracle
+          - fast_beam_search_nbest_LG
+        If you use fast_beam_search_nbest_LG, you have to specify
+        `--lang-dir`, which should contain `LG.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--beam-size",
+        type=int,
+        default=4,
+        help="""An integer indicating how many candidates we will keep for each
+        frame. Used only when --decoding-method is beam_search or
+        modified_beam_search.""",
+    )
+
+    parser.add_argument(
+        "--beam",
+        type=float,
+        default=20.0,
+        help="""A floating point value to calculate the cutoff score during beam
+        search (i.e., `cutoff = max-score - beam`), which is the same as the
+        `beam` in Kaldi.
+        Used only when --decoding-method is fast_beam_search,
+        fast_beam_search_nbest, fast_beam_search_nbest_LG,
+        and fast_beam_search_nbest_oracle
+        """,
+    )
+
+    parser.add_argument(
+        "--ngram-lm-scale",
+        type=float,
+        default=0.01,
+        help="""
+        Used only when --decoding_method is fast_beam_search_nbest_LG.
+        It specifies the scale for n-gram LM scores.
+        """,
+    )
+
+    parser.add_argument(
+        "--max-contexts",
+        type=int,
+        default=8,
+        help="""Used only when --decoding-method is
+        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
+        and fast_beam_search_nbest_oracle""",
+    )
+
+    parser.add_argument(
+        "--max-states",
+        type=int,
+        default=64,
+        help="""Used only when --decoding-method is
+        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
+        and fast_beam_search_nbest_oracle""",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; " "2 means tri-gram",
+    )
+    parser.add_argument(
+        "--max-sym-per-frame",
+        type=int,
+        default=1,
+        help="""Maximum number of symbols per frame.
+        Used only when --decoding_method is greedy_search""",
+    )
+
+    parser.add_argument(
+        "--num-paths",
+        type=int,
+        default=200,
+        help="""Number of paths for nbest decoding.
+        Used only when the decoding method is fast_beam_search_nbest,
+        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
+    )
+
+    parser.add_argument(
+        "--nbest-scale",
+        type=float,
+        default=0.5,
+        help="""Scale applied to lattice scores when computing nbest paths.
+        Used only when the decoding method is fast_beam_search_nbest,
+        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def decode_one_batch(
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+    batch: dict,
+    word_table: Optional[k2.SymbolTable] = None,
+    decoding_graph: Optional[k2.Fsa] = None,
+) -> Dict[str, List[List[str]]]:
+    """Decode one batch and return the result in a dict. The dict has the
+    following format:
+
+        - key: It indicates the setting used for decoding. For example,
+               if greedy_search is used, it would be "greedy_search"
+               If beam search with a beam size of 7 is used, it would be
+               "beam_7"
+        - value: It contains the decoding result. `len(value)` equals to
+                 batch size. `value[i]` is the decoding result for the i-th
+                 utterance in the given batch.
+    Args:
+      params:
+        It's the return value of :func:`get_params`.
+      model:
+        The neural model.
+      sp:
+        The BPE model.
+      batch:
+        It is the return value from iterating
+        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
+        for the format of the `batch`.
+      word_table:
+        The word symbol table.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
+        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
+        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
+    Returns:
+      Return the decoding result. See above description for the format of
+      the returned dict.
+    """
+    device = next(model.parameters()).device
+    feature = batch["inputs"]
+    assert feature.ndim == 3
+
+    feature = feature.to(device)
+    # at entry, feature is (N, T, C)
+
+    supervisions = batch["supervisions"]
+    feature_lens = supervisions["num_frames"].to(device)
+
+    if params.causal:
+        # this seems to cause insertions at the end of the utterance if used with zipformer.
+        pad_len = 30
+        feature_lens += pad_len
+        feature = torch.nn.functional.pad(
+            feature,
+            pad=(0, 0, 0, pad_len),
+            value=LOG_EPS,
+        )
+
+    x, x_lens = model.encoder_embed(feature, feature_lens)
+
+    src_key_padding_mask = make_pad_mask(x_lens)
+    x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
+
+    encoder_out, encoder_out_lens = model.encoder(x, x_lens, src_key_padding_mask)
+    encoder_out = encoder_out.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
+
+    hyps = []
+    unk = sp.decode(sp.unk_id()).strip()
+
+    if params.decoding_method == "fast_beam_search":
+        hyp_tokens = fast_beam_search_one_best(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            allow_partial=True,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyp = [w for w in hyp.split() if w != unk]
+            hyps.append(hyp)
+    elif params.decoding_method == "fast_beam_search_nbest_LG":
+        hyp_tokens = fast_beam_search_nbest_LG(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            num_paths=params.num_paths,
+            nbest_scale=params.nbest_scale,
+            allow_partial=True,
+        )
+        for hyp in hyp_tokens:
+            hyp = [word_table[i] for i in hyp if word_table[i] != unk]
+            hyps.append(hyp)
+    elif params.decoding_method == "fast_beam_search_nbest":
+        hyp_tokens = fast_beam_search_nbest(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            num_paths=params.num_paths,
+            nbest_scale=params.nbest_scale,
+            allow_partial=True,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyp = [w for w in hyp.split() if w != unk]
+            hyps.append(hyp)
+    elif params.decoding_method == "fast_beam_search_nbest_oracle":
+        hyp_tokens = fast_beam_search_nbest_oracle(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            num_paths=params.num_paths,
+            ref_texts=sp.encode(supervisions["text"]),
+            nbest_scale=params.nbest_scale,
+            allow_partial=True,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyp = [w for w in hyp.split() if w != unk]
+            hyps.append(hyp)
+    elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
+        hyp_tokens = greedy_search_batch(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyp = [w for w in hyp.split() if w != unk]
+            hyps.append(hyp)
+    elif params.decoding_method == "modified_beam_search":
+        hyp_tokens = modified_beam_search(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam_size,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyp = [w for w in hyp.split() if w != unk]
+            hyps.append(hyp)
+    else:
+        batch_size = encoder_out.size(0)
+
+        for i in range(batch_size):
+            # fmt: off
+            encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]]
+            # fmt: on
+            if params.decoding_method == "greedy_search":
+                hyp = greedy_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    max_sym_per_frame=params.max_sym_per_frame,
+                )
+            elif params.decoding_method == "beam_search":
+                hyp = beam_search(
+                    model=model,
+                    encoder_out=encoder_out_i,
+                    beam=params.beam_size,
+                )
+            else:
+                raise ValueError(
+                    f"Unsupported decoding method: {params.decoding_method}"
+                )
+            hyp = [w for w in sp.decode(hyp).split() if w != unk]
+            hyps.append(hyp)
+
+    if params.decoding_method == "greedy_search":
+        return {"greedy_search": hyps}
+    elif "fast_beam_search" in params.decoding_method:
+        key = f"beam_{params.beam}_"
+        key += f"max_contexts_{params.max_contexts}_"
+        key += f"max_states_{params.max_states}"
+        if "nbest" in params.decoding_method:
+            key += f"_num_paths_{params.num_paths}_"
+            key += f"nbest_scale_{params.nbest_scale}"
+            if "LG" in params.decoding_method:
+                key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
+
+        return {key: hyps}
+    else:
+        return {f"beam_size_{params.beam_size}": hyps}
+
+
+def decode_dataset(
+    dl: torch.utils.data.DataLoader,
+    params: AttributeDict,
+    model: nn.Module,
+    sp: spm.SentencePieceProcessor,
+    word_table: Optional[k2.SymbolTable] = None,
+    decoding_graph: Optional[k2.Fsa] = None,
+) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
+    """Decode dataset.
+
+    Args:
+      dl:
+        PyTorch's dataloader containing the dataset to decode.
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The neural model.
+      sp:
+        The BPE model.
+      word_table:
+        The word symbol table.
+      decoding_graph:
+        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
+        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
+        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
+    Returns:
+      Return a dict, whose key may be "greedy_search" if greedy search
+      is used, or it may be "beam_7" if beam size of 7 is used.
+      Its value is a list of tuples. Each tuple contains two elements:
+      The first is the reference transcript, and the second is the
+      predicted result.
+    """
+    num_cuts = 0
+
+    try:
+        num_batches = len(dl)
+    except TypeError:
+        num_batches = "?"
+
+    if params.decoding_method == "greedy_search":
+        log_interval = 50
+    else:
+        log_interval = 20
+
+    results = defaultdict(list)
+    for batch_idx, batch in enumerate(dl):
+        texts = batch["supervisions"]["text"]
+        cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
+
+        hyps_dict = decode_one_batch(
+            params=params,
+            model=model,
+            sp=sp,
+            decoding_graph=decoding_graph,
+            word_table=word_table,
+            batch=batch,
+        )
+
+        for name, hyps in hyps_dict.items():
+            this_batch = []
+            assert len(hyps) == len(texts)
+            for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts):
+                ref_words = ref_text.split()
+                this_batch.append((cut_id, ref_words, hyp_words))
+
+            results[name].extend(this_batch)
+
+        num_cuts += len(texts)
+
+        if batch_idx % log_interval == 0:
+            batch_str = f"{batch_idx}/{num_batches}"
+
+            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
+    return results
+
+
+def save_results(
+    params: AttributeDict,
+    test_set_name: str,
+    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
+):
+    test_set_wers = dict()
+    for key, results in results_dict.items():
+        recog_path = (
+            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        results = sorted(results)
+        store_transcripts(filename=recog_path, texts=results)
+        logging.info(f"The transcripts are stored in {recog_path}")
+
+        # The following prints out WERs, per-word error statistics and aligned
+        # ref/hyp pairs.
+        errs_filename = (
+            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        with open(errs_filename, "w") as f:
+            wer = write_error_stats(
+                f, f"{test_set_name}-{key}", results, enable_log=True
+            )
+            test_set_wers[key] = wer
+
+        logging.info("Wrote detailed error stats to {}".format(errs_filename))
+
+    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
+    errs_info = (
+        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
+    )
+    with open(errs_info, "w") as f:
+        print("settings\tWER", file=f)
+        for key, val in test_set_wers:
+            print("{}\t{}".format(key, val), file=f)
+
+    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
+    note = "\tbest for {}".format(test_set_name)
+    for key, val in test_set_wers:
+        s += "{}\t{}{}\n".format(key, val, note)
+        note = ""
+    logging.info(s)
+
+
+@torch.no_grad()
+def main():
+    parser = get_parser()
+    TedLiumAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    params = get_params()
+    params.update(vars(args))
+
+    assert params.decoding_method in (
+        "greedy_search",
+        "beam_search",
+        "fast_beam_search",
+        "fast_beam_search_nbest",
+        "fast_beam_search_nbest_LG",
+        "fast_beam_search_nbest_oracle",
+        "modified_beam_search",
+    )
+    params.res_dir = params.exp_dir / params.decoding_method
+
+    if params.iter > 0:
+        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
+    else:
+        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+
+    if params.causal:
+        assert (
+            "," not in params.chunk_size
+        ), "chunk_size should be one value in decoding."
+        assert (
+            "," not in params.left_context_frames
+        ), "left_context_frames should be one value in decoding."
+        params.suffix += f"-chunk-{params.chunk_size}"
+        params.suffix += f"-left-context-{params.left_context_frames}"
+
+    if "fast_beam_search" in params.decoding_method:
+        params.suffix += f"-beam-{params.beam}"
+        params.suffix += f"-max-contexts-{params.max_contexts}"
+        params.suffix += f"-max-states-{params.max_states}"
+        if "nbest" in params.decoding_method:
+            params.suffix += f"-nbest-scale-{params.nbest_scale}"
+            params.suffix += f"-num-paths-{params.num_paths}"
+            if "LG" in params.decoding_method:
+                params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
+    elif "beam_search" in params.decoding_method:
+        params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
+    else:
+        params.suffix += f"-context-{params.context_size}"
+        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
+
+    if params.use_averaged_model:
+        params.suffix += "-use-averaged-model"
+
+    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
+    logging.info("Decoding started")
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> and <unk> are defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.piece_to_id("<unk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_transducer_model(params)
+
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+
+    model.to(device)
+    model.eval()
+
+    if "fast_beam_search" in params.decoding_method:
+        if params.decoding_method == "fast_beam_search_nbest_LG":
+            lexicon = Lexicon(params.lang_dir)
+            word_table = lexicon.word_table
+            lg_filename = params.lang_dir / "LG.pt"
+            logging.info(f"Loading {lg_filename}")
+            decoding_graph = k2.Fsa.from_dict(
+                torch.load(lg_filename, map_location=device)
+            )
+            decoding_graph.scores *= params.ngram_lm_scale
+        else:
+            word_table = None
+            decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+    else:
+        decoding_graph = None
+        word_table = None
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    # we need cut ids to display recognition results.
+    args.return_cuts = True
+    tedlium = TedLiumAsrDataModule(args)
+
+    dev_cuts = tedlium.dev_cuts()
+    test_cuts = tedlium.test_cuts()
+
+    dev_dl = tedlium.test_dataloaders(dev_cuts)
+    test_dl = tedlium.test_dataloaders(test_cuts)
+
+    test_sets = ["dev", "test"]
+    test_dls = [dev_dl, test_dl]
+
+    for name, dl in zip(test_sets, test_dls):
+        results_dict = decode_dataset(
+            dl=dl,
+            params=params,
+            model=model,
+            sp=sp,
+            word_table=word_table,
+            decoding_graph=decoding_graph,
+        )
+
+        save_results(
+            params=params,
+            test_set_name=name,
+            results_dict=results_dict,
+        )
+
+    logging.info("Done!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/tedlium3/ASR/zipformer/decoder.py b/egs/tedlium3/ASR/zipformer/decoder.py
new file mode 120000
index 000000000..5a8018680
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/decoder.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/decoder.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/encoder_interface.py b/egs/tedlium3/ASR/zipformer/encoder_interface.py
new file mode 120000
index 000000000..653c5b09a
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/encoder_interface.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/transducer_stateless/encoder_interface.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/export.py b/egs/tedlium3/ASR/zipformer/export.py
new file mode 120000
index 000000000..dfc1bec08
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/export.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/export.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/joiner.py b/egs/tedlium3/ASR/zipformer/joiner.py
new file mode 120000
index 000000000..5b8a36332
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/joiner.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/joiner.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/model.py b/egs/tedlium3/ASR/zipformer/model.py
new file mode 100644
index 000000000..90ec7e7aa
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/model.py
@@ -0,0 +1,223 @@
+# Copyright    2021  Xiaomi Corp.        (authors: Fangjun Kuang, Wei Kang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import k2
+import torch
+import torch.nn as nn
+from encoder_interface import EncoderInterface
+
+from icefall.utils import add_sos, make_pad_mask
+from scaling import ScaledLinear
+
+
+class Transducer(nn.Module):
+    """It implements https://arxiv.org/pdf/1211.3711.pdf
+    "Sequence Transduction with Recurrent Neural Networks"
+    """
+
+    def __init__(
+        self,
+        encoder_embed: nn.Module,
+        encoder: EncoderInterface,
+        decoder: nn.Module,
+        joiner: nn.Module,
+        encoder_dim: int,
+        decoder_dim: int,
+        joiner_dim: int,
+        vocab_size: int,
+    ):
+        """
+        Args:
+          encoder_embed:
+            It is a Convolutional 2D subsampling module. It converts
+            an input of shape (N, T, idim) to an output of of shape
+            (N, T', odim), where T' = (T-3)//2-2 = (T-7)//2.
+          encoder:
+            It is the transcription network in the paper. Its accepts
+            two inputs: `x` of (N, T, encoder_dim) and `x_lens` of shape (N,).
+            It returns two tensors: `logits` of shape (N, T, encoder_dim) and
+            `logit_lens` of shape (N,).
+          decoder:
+            It is the prediction network in the paper. Its input shape
+            is (N, U) and its output shape is (N, U, decoder_dim).
+            It should contain one attribute: `blank_id`.
+          joiner:
+            It has two inputs with shapes: (N, T, encoder_dim) and (N, U, decoder_dim).
+            Its output shape is (N, T, U, vocab_size). Note that its output contains
+            unnormalized probs, i.e., not processed by log-softmax.
+        """
+        super().__init__()
+        assert isinstance(encoder, EncoderInterface), type(encoder)
+        assert hasattr(decoder, "blank_id")
+
+        self.encoder_embed = encoder_embed
+        self.encoder = encoder
+        self.decoder = decoder
+        self.joiner = joiner
+
+        self.simple_am_proj = ScaledLinear(
+            encoder_dim,
+            vocab_size,
+            initial_scale=0.25,
+        )
+        self.simple_lm_proj = ScaledLinear(
+            decoder_dim,
+            vocab_size,
+            initial_scale=0.25,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        y: k2.RaggedTensor,
+        prune_range: int = 5,
+        am_scale: float = 0.0,
+        lm_scale: float = 0.0,
+        rnnt_type: str = "regular",
+    ) -> torch.Tensor:
+        """
+        Args:
+          x:
+            A 3-D tensor of shape (N, T, C).
+          x_lens:
+            A 1-D tensor of shape (N,). It contains the number of frames in `x`
+            before padding.
+          y:
+            A ragged tensor with 2 axes [utt][label]. It contains labels of each
+            utterance.
+          prune_range:
+            The prune range for rnnt loss, it means how many symbols(context)
+            we are considering for each frame to compute the loss.
+          am_scale:
+            The scale to smooth the loss with am (output of encoder network)
+            part
+          lm_scale:
+            The scale to smooth the loss with lm (output of predictor network)
+            part
+          rnnt_type:
+            The type of label topology to use for the transducer loss. One of "regular",
+            "modified", or "constrained".
+        Returns:
+          Return the transducer loss.
+
+        Note:
+           Regarding am_scale & lm_scale, it will make the loss-function one of
+           the form:
+              lm_scale * lm_probs + am_scale * am_probs +
+              (1-lm_scale-am_scale) * combined_probs
+        """
+        assert x.ndim == 3, x.shape
+        assert x_lens.ndim == 1, x_lens.shape
+        assert y.num_axes == 2, y.num_axes
+
+        assert x.size(0) == x_lens.size(0) == y.dim0
+
+        # logging.info(f"Memory allocated at entry: {torch.cuda.memory_allocated() // 1000000}M")
+        x, x_lens = self.encoder_embed(x, x_lens)
+        # logging.info(f"Memory allocated after encoder_embed: {torch.cuda.memory_allocated() // 1000000}M")
+
+        src_key_padding_mask = make_pad_mask(x_lens)
+        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
+
+        encoder_out, x_lens = self.encoder(x, x_lens, src_key_padding_mask)
+        encoder_out = encoder_out.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
+
+        assert torch.all(x_lens > 0)
+
+        # Now for the decoder, i.e., the prediction network
+        row_splits = y.shape.row_splits(1)
+        y_lens = row_splits[1:] - row_splits[:-1]
+
+        blank_id = self.decoder.blank_id
+        sos_y = add_sos(y, sos_id=blank_id)
+
+        # sos_y_padded: [B, S + 1], start with SOS.
+        sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id)
+
+        # decoder_out: [B, S + 1, decoder_dim]
+        decoder_out = self.decoder(sos_y_padded)
+
+        # Note: y does not start with SOS
+        # y_padded : [B, S]
+        y_padded = y.pad(mode="constant", padding_value=0)
+
+        y_padded = y_padded.to(torch.int64)
+        boundary = torch.zeros(
+            (encoder_out.size(0), 4),
+            dtype=torch.int64,
+            device=encoder_out.device,
+        )
+        boundary[:, 2] = y_lens
+        boundary[:, 3] = x_lens
+
+        lm = self.simple_lm_proj(decoder_out)
+        am = self.simple_am_proj(encoder_out)
+
+        # if self.training and random.random() < 0.25:
+        #    lm = penalize_abs_values_gt(lm, 100.0, 1.0e-04)
+        # if self.training and random.random() < 0.25:
+        #    am = penalize_abs_values_gt(am, 30.0, 1.0e-04)
+
+        with torch.cuda.amp.autocast(enabled=False):
+            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
+                lm=lm.float(),
+                am=am.float(),
+                symbols=y_padded,
+                termination_symbol=blank_id,
+                lm_only_scale=lm_scale,
+                am_only_scale=am_scale,
+                boundary=boundary,
+                reduction="sum",
+                return_grad=True,
+                rnnt_type=rnnt_type,
+            )
+
+        # ranges : [B, T, prune_range]
+        ranges = k2.get_rnnt_prune_ranges(
+            px_grad=px_grad,
+            py_grad=py_grad,
+            boundary=boundary,
+            s_range=prune_range,
+        )
+
+        # am_pruned : [B, T, prune_range, encoder_dim]
+        # lm_pruned : [B, T, prune_range, decoder_dim]
+        am_pruned, lm_pruned = k2.do_rnnt_pruning(
+            am=self.joiner.encoder_proj(encoder_out),
+            lm=self.joiner.decoder_proj(decoder_out),
+            ranges=ranges,
+        )
+
+        # logits : [B, T, prune_range, vocab_size]
+
+        # project_input=False since we applied the decoder's input projections
+        # prior to do_rnnt_pruning (this is an optimization for speed).
+        logits = self.joiner(am_pruned, lm_pruned, project_input=False)
+
+        with torch.cuda.amp.autocast(enabled=False):
+            pruned_loss = k2.rnnt_loss_pruned(
+                logits=logits.float(),
+                symbols=y_padded,
+                ranges=ranges,
+                termination_symbol=blank_id,
+                boundary=boundary,
+                reduction="sum",
+                rnnt_type=rnnt_type,
+            )
+
+        return (simple_loss, pruned_loss)
diff --git a/egs/tedlium3/ASR/zipformer/optim.py b/egs/tedlium3/ASR/zipformer/optim.py
new file mode 120000
index 000000000..5eaa3cffd
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/optim.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/optim.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/pretrained.py b/egs/tedlium3/ASR/zipformer/pretrained.py
new file mode 120000
index 000000000..0bd71dde4
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/pretrained.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/pretrained.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/profile.py b/egs/tedlium3/ASR/zipformer/profile.py
new file mode 120000
index 000000000..c93adbd14
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/profile.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/profile.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/scaling.py b/egs/tedlium3/ASR/zipformer/scaling.py
new file mode 120000
index 000000000..6f398f431
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/scaling.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/scaling.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/scaling_converter.py b/egs/tedlium3/ASR/zipformer/scaling_converter.py
new file mode 120000
index 000000000..b0ecee05e
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/scaling_converter.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/scaling_converter.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/subsampling.py b/egs/tedlium3/ASR/zipformer/subsampling.py
new file mode 120000
index 000000000..01ae9002c
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/subsampling.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/subsampling.py
\ No newline at end of file
diff --git a/egs/tedlium3/ASR/zipformer/train.py b/egs/tedlium3/ASR/zipformer/train.py
new file mode 100755
index 000000000..9271c8438
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/train.py
@@ -0,0 +1,1308 @@
+#!/usr/bin/env python3
+# Copyright    2021-2023  Xiaomi Corp.        (authors: Fangjun Kuang,
+#                                                       Wei Kang,
+#                                                       Mingshuang Luo,
+#                                                       Zengwei Yao,
+#                                                       Daniel Povey)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Usage:
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+# For non-streaming model training:
+./zipformer/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir zipformer/exp \
+  --full-libri 1 \
+  --max-duration 1000
+
+# For streaming model training:
+./zipformer/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir zipformer/exp \
+  --causal 1 \
+  --full-libri 1 \
+  --max-duration 1000
+
+"""
+
+
+import argparse
+import copy
+import logging
+import warnings
+from pathlib import Path
+from shutil import copyfile
+from typing import Any, Dict, Optional, Tuple, Union
+
+import k2
+import optim
+import sentencepiece as spm
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+from asr_datamodule import TedLiumAsrDataModule
+from decoder import Decoder
+from joiner import Joiner
+from lhotse.cut import Cut
+from lhotse.dataset.sampling.base import CutSampler
+from lhotse.utils import fix_random_seed
+from local.convert_transcript_words_to_bpe_ids import convert_texts_into_ids
+from model import Transducer
+from optim import Eden, ScaledAdam
+from scaling import ScheduledFloat
+from subsampling import Conv2dSubsampling
+from torch import Tensor
+from torch.cuda.amp import GradScaler
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+from zipformer import Zipformer2
+
+from icefall import diagnostics
+from icefall.checkpoint import load_checkpoint, remove_checkpoints
+from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
+from icefall.checkpoint import (
+    save_checkpoint_with_global_batch_idx,
+    update_averaged_model,
+)
+from icefall.dist import cleanup_dist, setup_dist
+from icefall.env import get_env_info
+from icefall.hooks import register_inf_check_hooks
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    get_parameter_groups_with_lrs,
+    setup_logger,
+    str2bool,
+)
+
+LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
+
+
+def get_adjusted_batch_count(params: AttributeDict) -> float:
+    # returns the number of batches we would have used so far if we had used the reference
+    # duration.  This is for purposes of set_batch_count().
+    return (
+        params.batch_idx_train
+        * (params.max_duration * params.world_size)
+        / params.ref_duration
+    )
+
+
+def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None:
+    if isinstance(model, DDP):
+        # get underlying nn.Module
+        model = model.module
+    for name, module in model.named_modules():
+        if hasattr(module, "batch_count"):
+            module.batch_count = batch_count
+        if hasattr(module, "name"):
+            module.name = name
+
+
+def add_model_arguments(parser: argparse.ArgumentParser):
+    parser.add_argument(
+        "--num-encoder-layers",
+        type=str,
+        default="2,2,3,4,3,2",
+        help="Number of zipformer encoder layers per stack, comma separated.",
+    )
+
+    parser.add_argument(
+        "--downsampling-factor",
+        type=str,
+        default="1,2,4,8,4,2",
+        help="Downsampling factor for each stack of encoder layers.",
+    )
+
+    parser.add_argument(
+        "--feedforward-dim",
+        type=str,
+        default="512,768,1024,1536,1024,768",
+        help="Feedforward dimension of the zipformer encoder layers, per stack, comma separated.",
+    )
+
+    parser.add_argument(
+        "--num-heads",
+        type=str,
+        default="4,4,4,8,4,4",
+        help="Number of attention heads in the zipformer encoder layers: a single int or comma-separated list.",
+    )
+
+    parser.add_argument(
+        "--encoder-dim",
+        type=str,
+        default="192,256,384,512,384,256",
+        help="Embedding dimension in encoder stacks: a single int or comma-separated list.",
+    )
+
+    parser.add_argument(
+        "--query-head-dim",
+        type=str,
+        default="32",
+        help="Query/key dimension per head in encoder stacks: a single int or comma-separated list.",
+    )
+
+    parser.add_argument(
+        "--value-head-dim",
+        type=str,
+        default="12",
+        help="Value dimension per head in encoder stacks: a single int or comma-separated list.",
+    )
+
+    parser.add_argument(
+        "--pos-head-dim",
+        type=str,
+        default="4",
+        help="Positional-encoding dimension per head in encoder stacks: a single int or comma-separated list.",
+    )
+
+    parser.add_argument(
+        "--pos-dim",
+        type=int,
+        default="48",
+        help="Positional-encoding embedding dimension",
+    )
+
+    parser.add_argument(
+        "--encoder-unmasked-dim",
+        type=str,
+        default="192,192,256,256,256,192",
+        help="Unmasked dimensions in the encoders, relates to augmentation during training.  "
+        "A single int or comma-separated list.  Must be <= each corresponding encoder_dim.",
+    )
+
+    parser.add_argument(
+        "--cnn-module-kernel",
+        type=str,
+        default="31,31,15,15,15,31",
+        help="Sizes of convolutional kernels in convolution modules in each encoder stack: "
+        "a single int or comma-separated list.",
+    )
+
+    parser.add_argument(
+        "--decoder-dim",
+        type=int,
+        default=512,
+        help="Embedding dimension in the decoder model.",
+    )
+
+    parser.add_argument(
+        "--joiner-dim",
+        type=int,
+        default=512,
+        help="""Dimension used in the joiner model.
+        Outputs from the encoder and decoder model are projected
+        to this dimension before adding.
+        """,
+    )
+
+    parser.add_argument(
+        "--causal",
+        type=str2bool,
+        default=False,
+        help="If True, use causal version of model.",
+    )
+
+    parser.add_argument(
+        "--chunk-size",
+        type=str,
+        default="16,32,64,-1",
+        help="Chunk sizes (at 50Hz frame rate) will be chosen randomly from this list during training. "
+        " Must be just -1 if --causal=False",
+    )
+
+    parser.add_argument(
+        "--left-context-frames",
+        type=str,
+        default="64,128,256,-1",
+        help="Maximum left-contexts for causal training, measured in frames which will "
+        "be converted to a number of chunks.  If splitting into chunks, "
+        "chunk left-context frames will be chosen randomly from this list; else not relevant.",
+    )
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--world-size",
+        type=int,
+        default=1,
+        help="Number of GPUs for DDP training.",
+    )
+
+    parser.add_argument(
+        "--master-port",
+        type=int,
+        default=12354,
+        help="Master port to use for DDP training.",
+    )
+
+    parser.add_argument(
+        "--tensorboard",
+        type=str2bool,
+        default=True,
+        help="Should various information be logged in tensorboard.",
+    )
+
+    parser.add_argument(
+        "--num-epochs",
+        type=int,
+        default=50,
+        help="Number of epochs to train.",
+    )
+
+    parser.add_argument(
+        "--start-epoch",
+        type=int,
+        default=1,
+        help="""Resume training from this epoch. It should be positive.
+        If larger than 1, it will load checkpoint from
+        exp-dir/epoch-{start_epoch-1}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--start-batch",
+        type=int,
+        default=0,
+        help="""If positive, --start-epoch is ignored and
+        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
+        """,
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="zipformer/exp",
+        help="""The experiment dir.
+        It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--base-lr", type=float, default=0.04, help="The base learning rate."
+    )
+
+    parser.add_argument(
+        "--lr-batches",
+        type=float,
+        default=7500,
+        help="""Number of steps that affects how rapidly the learning rate
+        decreases. We suggest not to change this.""",
+    )
+
+    parser.add_argument(
+        "--lr-epochs",
+        type=float,
+        default=5,
+        help="""Number of epochs that affects how rapidly the learning rate decreases.
+        """,
+    )
+
+    parser.add_argument(
+        "--ref-duration",
+        type=float,
+        default=600,
+        help="Reference batch duration for purposes of adjusting batch counts for setting various "
+        "schedules inside the model",
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; " "2 means tri-gram",
+    )
+
+    parser.add_argument(
+        "--prune-range",
+        type=int,
+        default=5,
+        help="The prune range for rnnt loss, it means how many symbols(context)"
+        "we are using to compute the loss",
+    )
+
+    parser.add_argument(
+        "--rnnt-type",
+        type=str,
+        default="regular",
+        choices=["regular", "modified", "constrained"],
+    )
+
+    parser.add_argument(
+        "--lm-scale",
+        type=float,
+        default=0.25,
+        help="The scale to smooth the loss with lm "
+        "(output of prediction network) part.",
+    )
+
+    parser.add_argument(
+        "--am-scale",
+        type=float,
+        default=0.0,
+        help="The scale to smooth the loss with am (output of encoder network)" "part.",
+    )
+
+    parser.add_argument(
+        "--simple-loss-scale",
+        type=float,
+        default=0.5,
+        help="To get pruning ranges, we will calculate a simple version"
+        "loss(joiner is just addition), this simple loss also uses for"
+        "training (as a regularization item). We will scale the simple loss"
+        "with this parameter before adding to the final loss.",
+    )
+
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="The seed for random generators intended for reproducibility",
+    )
+
+    parser.add_argument(
+        "--print-diagnostics",
+        type=str2bool,
+        default=False,
+        help="Accumulate stats on activations, print them and exit.",
+    )
+
+    parser.add_argument(
+        "--inf-check",
+        type=str2bool,
+        default=False,
+        help="Add hooks to check for infinite module outputs and gradients.",
+    )
+
+    parser.add_argument(
+        "--save-every-n",
+        type=int,
+        default=4000,
+        help="""Save checkpoint after processing this number of batches"
+        periodically. We save checkpoint to exp-dir/ whenever
+        params.batch_idx_train % save_every_n == 0. The checkpoint filename
+        has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt'
+        Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the
+        end of each epoch where `xxx` is the epoch number counting from 1.
+        """,
+    )
+
+    parser.add_argument(
+        "--keep-last-k",
+        type=int,
+        default=1,
+        help="""Only keep this number of checkpoints on disk.
+        For instance, if it is 3, there are only 3 checkpoints
+        in the exp-dir with filenames `checkpoint-xxx.pt`.
+        It does not affect checkpoints with name `epoch-xxx.pt`.
+        """,
+    )
+
+    parser.add_argument(
+        "--average-period",
+        type=int,
+        default=200,
+        help="""Update the averaged model, namely `model_avg`, after processing
+        this number of batches. `model_avg` is a separate version of model,
+        in which each floating-point parameter is the average of all the
+        parameters from the start of training. Each time we take the average,
+        we do: `model_avg = model * (average_period / batch_idx_train) +
+            model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
+        """,
+    )
+
+    parser.add_argument(
+        "--use-fp16",
+        type=str2bool,
+        default=False,
+        help="Whether to use half precision training.",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+def get_params() -> AttributeDict:
+    """Return a dict containing training parameters.
+
+    All training related parameters that are not passed from the commandline
+    are saved in the variable `params`.
+
+    Commandline options are merged into `params` after they are parsed, so
+    you can also access them via `params`.
+
+    Explanation of options saved in `params`:
+
+        - best_train_loss: Best training loss so far. It is used to select
+                           the model that has the lowest training loss. It is
+                           updated during the training.
+
+        - best_valid_loss: Best validation loss so far. It is used to select
+                           the model that has the lowest validation loss. It is
+                           updated during the training.
+
+        - best_train_epoch: It is the epoch that has the best training loss.
+
+        - best_valid_epoch: It is the epoch that has the best validation loss.
+
+        - batch_idx_train: Used to writing statistics to tensorboard. It
+                           contains number of batches trained so far across
+                           epochs.
+
+        - log_interval:  Print training loss if batch_idx % log_interval` is 0
+
+        - reset_interval: Reset statistics if batch_idx % reset_interval is 0
+
+        - valid_interval:  Run validation if batch_idx % valid_interval is 0
+
+        - feature_dim: The model input dim. It has to match the one used
+                       in computing features.
+
+        - subsampling_factor:  The subsampling factor for the model.
+
+        - encoder_dim: Hidden dim for multi-head attention model.
+
+        - num_decoder_layers: Number of decoder layer of transformer decoder.
+
+        - warm_step: The warmup period that dictates the decay of the
+              scale on "simple" (un-pruned) loss.
+    """
+    params = AttributeDict(
+        {
+            "best_train_loss": float("inf"),
+            "best_valid_loss": float("inf"),
+            "best_train_epoch": -1,
+            "best_valid_epoch": -1,
+            "batch_idx_train": 0,
+            "log_interval": 50,
+            "reset_interval": 200,
+            "valid_interval": 3000,  # For the 100h subset, use 800
+            # parameters for zipformer
+            "feature_dim": 80,
+            "subsampling_factor": 4,  # not passed in, this is fixed.
+            "warm_step": 2000,
+            "env_info": get_env_info(),
+        }
+    )
+
+    return params
+
+
+def _to_int_tuple(s: str):
+    return tuple(map(int, s.split(",")))
+
+
+def get_encoder_embed(params: AttributeDict) -> nn.Module:
+    # encoder_embed converts the input of shape (N, T, num_features)
+    # to the shape (N, (T - 7) // 2, encoder_dims).
+    # That is, it does two things simultaneously:
+    #   (1) subsampling: T -> (T - 7) // 2
+    #   (2) embedding: num_features -> encoder_dims
+    # In the normal configuration, we will downsample once more at the end
+    # by a factor of 2, and most of the encoder stacks will run at a lower
+    # sampling rate.
+    encoder_embed = Conv2dSubsampling(
+        in_channels=params.feature_dim,
+        out_channels=_to_int_tuple(params.encoder_dim)[0],
+        dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)),
+    )
+    return encoder_embed
+
+
+def get_encoder_model(params: AttributeDict) -> nn.Module:
+    encoder = Zipformer2(
+        output_downsampling_factor=2,
+        downsampling_factor=_to_int_tuple(params.downsampling_factor),
+        num_encoder_layers=_to_int_tuple(params.num_encoder_layers),
+        encoder_dim=_to_int_tuple(params.encoder_dim),
+        encoder_unmasked_dim=_to_int_tuple(params.encoder_unmasked_dim),
+        query_head_dim=_to_int_tuple(params.query_head_dim),
+        pos_head_dim=_to_int_tuple(params.pos_head_dim),
+        value_head_dim=_to_int_tuple(params.value_head_dim),
+        pos_dim=params.pos_dim,
+        num_heads=_to_int_tuple(params.num_heads),
+        feedforward_dim=_to_int_tuple(params.feedforward_dim),
+        cnn_module_kernel=_to_int_tuple(params.cnn_module_kernel),
+        dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)),
+        warmup_batches=4000.0,
+        causal=params.causal,
+        chunk_size=_to_int_tuple(params.chunk_size),
+        left_context_frames=_to_int_tuple(params.left_context_frames),
+    )
+    return encoder
+
+
+def get_decoder_model(params: AttributeDict) -> nn.Module:
+    decoder = Decoder(
+        vocab_size=params.vocab_size,
+        decoder_dim=params.decoder_dim,
+        blank_id=params.blank_id,
+        context_size=params.context_size,
+    )
+    return decoder
+
+
+def get_joiner_model(params: AttributeDict) -> nn.Module:
+    joiner = Joiner(
+        encoder_dim=max(_to_int_tuple(params.encoder_dim)),
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+    return joiner
+
+
+def get_transducer_model(params: AttributeDict) -> nn.Module:
+    encoder_embed = get_encoder_embed(params)
+    encoder = get_encoder_model(params)
+    decoder = get_decoder_model(params)
+    joiner = get_joiner_model(params)
+
+    model = Transducer(
+        encoder_embed=encoder_embed,
+        encoder=encoder,
+        decoder=decoder,
+        joiner=joiner,
+        encoder_dim=int(max(params.encoder_dim.split(","))),
+        decoder_dim=params.decoder_dim,
+        joiner_dim=params.joiner_dim,
+        vocab_size=params.vocab_size,
+    )
+    return model
+
+
+def load_checkpoint_if_available(
+    params: AttributeDict,
+    model: nn.Module,
+    model_avg: nn.Module = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+) -> Optional[Dict[str, Any]]:
+    """Load checkpoint from file.
+
+    If params.start_batch is positive, it will load the checkpoint from
+    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
+    params.start_epoch is larger than 1, it will load the checkpoint from
+    `params.start_epoch - 1`.
+
+    Apart from loading state dict for `model` and `optimizer` it also updates
+    `best_train_epoch`, `best_train_loss`, `best_valid_epoch`,
+    and `best_valid_loss` in `params`.
+
+    Args:
+      params:
+        The return value of :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer that we are using.
+      scheduler:
+        The scheduler that we are using.
+    Returns:
+      Return a dict containing previously saved training info.
+    """
+    if params.start_batch > 0:
+        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
+    elif params.start_epoch > 1:
+        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
+    else:
+        return None
+
+    assert filename.is_file(), f"{filename} does not exist!"
+
+    saved_params = load_checkpoint(
+        filename,
+        model=model,
+        model_avg=model_avg,
+        optimizer=optimizer,
+        scheduler=scheduler,
+    )
+
+    keys = [
+        "best_train_epoch",
+        "best_valid_epoch",
+        "batch_idx_train",
+        "best_train_loss",
+        "best_valid_loss",
+    ]
+    for k in keys:
+        params[k] = saved_params[k]
+
+    if params.start_batch > 0:
+        if "cur_epoch" in saved_params:
+            params["start_epoch"] = saved_params["cur_epoch"]
+
+    return saved_params
+
+
+def save_checkpoint(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    model_avg: Optional[nn.Module] = None,
+    optimizer: Optional[torch.optim.Optimizer] = None,
+    scheduler: Optional[LRSchedulerType] = None,
+    sampler: Optional[CutSampler] = None,
+    scaler: Optional[GradScaler] = None,
+    rank: int = 0,
+) -> None:
+    """Save model, optimizer, scheduler and training stats to file.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
+      optimizer:
+        The optimizer used in the training.
+      sampler:
+       The sampler for the training dataset.
+      scaler:
+        The scaler used for mix precision training.
+    """
+    if rank != 0:
+        return
+    filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt"
+    save_checkpoint_impl(
+        filename=filename,
+        model=model,
+        model_avg=model_avg,
+        params=params,
+        optimizer=optimizer,
+        scheduler=scheduler,
+        sampler=sampler,
+        scaler=scaler,
+        rank=rank,
+    )
+
+    if params.best_train_epoch == params.cur_epoch:
+        best_train_filename = params.exp_dir / "best-train-loss.pt"
+        copyfile(src=filename, dst=best_train_filename)
+
+    if params.best_valid_epoch == params.cur_epoch:
+        best_valid_filename = params.exp_dir / "best-valid-loss.pt"
+        copyfile(src=filename, dst=best_valid_filename)
+
+
+def compute_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    batch: dict,
+    is_training: bool,
+) -> Tuple[Tensor, MetricsTracker]:
+    """
+    Compute RNNT loss given the model and its inputs.
+
+    Args:
+      params:
+        Parameters for training. See :func:`get_params`.
+      model:
+        The model for training. It is an instance of Zipformer in our case.
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      is_training:
+        True for training. False for validation. When it is True, this
+        function enables autograd during computation; when it is False, it
+        disables autograd.
+     warmup: a floating point value which increases throughout training;
+        values >= 1.0 are fully warmed up and have all modules present.
+    """
+    device = model.device if isinstance(model, DDP) else next(model.parameters()).device
+    feature = batch["inputs"]
+    # at entry, feature is (N, T, C)
+    assert feature.ndim == 3
+    feature = feature.to(device)
+
+    supervisions = batch["supervisions"]
+    feature_lens = supervisions["num_frames"].to(device)
+
+    batch_idx_train = params.batch_idx_train
+    warm_step = params.warm_step
+
+    texts = batch["supervisions"]["text"]
+    y = convert_texts_into_ids(texts, sp)
+    y = k2.RaggedTensor(y).to(device)
+
+    with torch.set_grad_enabled(is_training):
+        simple_loss, pruned_loss = model(
+            x=feature,
+            x_lens=feature_lens,
+            y=y,
+            prune_range=params.prune_range,
+            am_scale=params.am_scale,
+            lm_scale=params.lm_scale,
+            rnnt_type=params.rnnt_type,
+        )
+
+        s = params.simple_loss_scale
+        # take down the scale on the simple loss from 1.0 at the start
+        # to params.simple_loss scale by warm_step.
+        simple_loss_scale = (
+            s
+            if batch_idx_train >= warm_step
+            else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
+        )
+        pruned_loss_scale = (
+            1.0
+            if batch_idx_train >= warm_step
+            else 0.1 + 0.9 * (batch_idx_train / warm_step)
+        )
+
+        loss = simple_loss_scale * simple_loss + pruned_loss_scale * pruned_loss
+
+    assert loss.requires_grad == is_training
+
+    info = MetricsTracker()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+
+    # Note: We use reduction=sum while computing the loss.
+    info["loss"] = loss.detach().cpu().item()
+    info["simple_loss"] = simple_loss.detach().cpu().item()
+    info["pruned_loss"] = pruned_loss.detach().cpu().item()
+
+    return loss, info
+
+
+def compute_validation_loss(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    sp: spm.SentencePieceProcessor,
+    valid_dl: torch.utils.data.DataLoader,
+    world_size: int = 1,
+) -> MetricsTracker:
+    """Run the validation process."""
+    model.eval()
+
+    tot_loss = MetricsTracker()
+
+    for batch_idx, batch in enumerate(valid_dl):
+        loss, loss_info = compute_loss(
+            params=params,
+            model=model,
+            sp=sp,
+            batch=batch,
+            is_training=False,
+        )
+        assert loss.requires_grad is False
+        tot_loss = tot_loss + loss_info
+
+    if world_size > 1:
+        tot_loss.reduce(loss.device)
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    if loss_value < params.best_valid_loss:
+        params.best_valid_epoch = params.cur_epoch
+        params.best_valid_loss = loss_value
+
+    return tot_loss
+
+
+def train_one_epoch(
+    params: AttributeDict,
+    model: Union[nn.Module, DDP],
+    optimizer: torch.optim.Optimizer,
+    scheduler: LRSchedulerType,
+    sp: spm.SentencePieceProcessor,
+    train_dl: torch.utils.data.DataLoader,
+    valid_dl: torch.utils.data.DataLoader,
+    scaler: GradScaler,
+    model_avg: Optional[nn.Module] = None,
+    tb_writer: Optional[SummaryWriter] = None,
+    world_size: int = 1,
+    rank: int = 0,
+) -> None:
+    """Train the model for one epoch.
+
+    The training loss from the mean of all frames is saved in
+    `params.train_loss`. It runs the validation process every
+    `params.valid_interval` batches.
+
+    Args:
+      params:
+        It is returned by :func:`get_params`.
+      model:
+        The model for training.
+      optimizer:
+        The optimizer we are using.
+      scheduler:
+        The learning rate scheduler, we call step() every step.
+      train_dl:
+        Dataloader for the training dataset.
+      valid_dl:
+        Dataloader for the validation dataset.
+      scaler:
+        The scaler used for mix precision training.
+      model_avg:
+        The stored model averaged from the start of training.
+      tb_writer:
+        Writer to write log messages to tensorboard.
+      world_size:
+        Number of nodes in DDP training. If it is 1, DDP is disabled.
+      rank:
+        The rank of the node in DDP training. If no DDP is used, it should
+        be set to 0.
+    """
+    model.train()
+
+    tot_loss = MetricsTracker()
+
+    saved_bad_model = False
+
+    def save_bad_model(suffix: str = ""):
+        save_checkpoint_impl(
+            filename=params.exp_dir / f"bad-model{suffix}-{rank}.pt",
+            model=model,
+            model_avg=model_avg,
+            params=params,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sampler=train_dl.sampler,
+            scaler=scaler,
+            rank=0,
+        )
+
+    for batch_idx, batch in enumerate(train_dl):
+        if batch_idx % 10 == 0:
+            set_batch_count(model, get_adjusted_batch_count(params))
+
+        params.batch_idx_train += 1
+        batch_size = len(batch["supervisions"]["text"])
+
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, loss_info = compute_loss(
+                    params=params,
+                    model=model,
+                    sp=sp,
+                    batch=batch,
+                    is_training=True,
+                )
+            # summary stats
+            tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
+
+            # NOTE: We use reduction==sum and loss is computed over utterances
+            # in the batch and there is no normalization to it so far.
+            scaler.scale(loss).backward()
+            scheduler.step_batch(params.batch_idx_train)
+
+            scaler.step(optimizer)
+            scaler.update()
+            optimizer.zero_grad()
+        except:  # noqa
+            save_bad_model()
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise
+
+        if params.print_diagnostics and batch_idx == 5:
+            return
+
+        if (
+            rank == 0
+            and params.batch_idx_train > 0
+            and params.batch_idx_train % params.average_period == 0
+        ):
+            update_averaged_model(
+                params=params,
+                model_cur=model,
+                model_avg=model_avg,
+            )
+
+        if (
+            params.batch_idx_train > 0
+            and params.batch_idx_train % params.save_every_n == 0
+        ):
+            save_checkpoint_with_global_batch_idx(
+                out_dir=params.exp_dir,
+                global_batch_idx=params.batch_idx_train,
+                model=model,
+                model_avg=model_avg,
+                params=params,
+                optimizer=optimizer,
+                scheduler=scheduler,
+                sampler=train_dl.sampler,
+                scaler=scaler,
+                rank=rank,
+            )
+            remove_checkpoints(
+                out_dir=params.exp_dir,
+                topk=params.keep_last_k,
+                rank=rank,
+            )
+
+        if batch_idx % 100 == 0 and params.use_fp16:
+            # If the grad scale was less than 1, try increasing it.    The _growth_interval
+            # of the grad scaler is configurable, but we can't configure it to have different
+            # behavior depending on the current grad scale.
+            cur_grad_scale = scaler._scale.item()
+
+            if cur_grad_scale < 8.0 or (cur_grad_scale < 32.0 and batch_idx % 400 == 0):
+                scaler.update(cur_grad_scale * 2.0)
+            if cur_grad_scale < 0.01:
+                if not saved_bad_model:
+                    save_bad_model(suffix="-first-warning")
+                    saved_bad_model = True
+                logging.warning(f"Grad scale is small: {cur_grad_scale}")
+            if cur_grad_scale < 1.0e-05:
+                save_bad_model()
+                raise RuntimeError(
+                    f"grad_scale is too small, exiting: {cur_grad_scale}"
+                )
+
+        if batch_idx % params.log_interval == 0:
+            cur_lr = max(scheduler.get_last_lr())
+            cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
+
+            logging.info(
+                f"Epoch {params.cur_epoch}, "
+                f"batch {batch_idx}, loss[{loss_info}], "
+                f"tot_loss[{tot_loss}], batch size: {batch_size}, "
+                f"lr: {cur_lr:.2e}, "
+                + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "")
+            )
+
+            if tb_writer is not None:
+                tb_writer.add_scalar(
+                    "train/learning_rate", cur_lr, params.batch_idx_train
+                )
+
+                loss_info.write_summary(
+                    tb_writer, "train/current_", params.batch_idx_train
+                )
+                tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train)
+                if params.use_fp16:
+                    tb_writer.add_scalar(
+                        "train/grad_scale", cur_grad_scale, params.batch_idx_train
+                    )
+
+        if batch_idx % params.valid_interval == 0 and not params.print_diagnostics:
+            logging.info("Computing validation loss")
+            valid_info = compute_validation_loss(
+                params=params,
+                model=model,
+                sp=sp,
+                valid_dl=valid_dl,
+                world_size=world_size,
+            )
+            model.train()
+            logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}")
+            logging.info(
+                f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
+            )
+            if tb_writer is not None:
+                valid_info.write_summary(
+                    tb_writer, "train/valid_", params.batch_idx_train
+                )
+
+    loss_value = tot_loss["loss"] / tot_loss["frames"]
+    params.train_loss = loss_value
+    if params.train_loss < params.best_train_loss:
+        params.best_train_epoch = params.cur_epoch
+        params.best_train_loss = params.train_loss
+
+
+def run(rank, world_size, args):
+    """
+    Args:
+      rank:
+        It is a value between 0 and `world_size-1`, which is
+        passed automatically by `mp.spawn()` in :func:`main`.
+        The node with rank 0 is responsible for saving checkpoint.
+      world_size:
+        Number of GPUs for DDP training.
+      args:
+        The return value of get_parser().parse_args()
+    """
+    params = get_params()
+    params.update(vars(args))
+
+    fix_random_seed(params.seed)
+    if world_size > 1:
+        setup_dist(rank, world_size, params.master_port)
+
+    setup_logger(f"{params.exp_dir}/log/log-train")
+    logging.info("Training started")
+
+    if args.tensorboard and rank == 0:
+        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
+    else:
+        tb_writer = None
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", rank)
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_transducer_model(params)
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    assert params.save_every_n >= params.average_period
+    model_avg: Optional[nn.Module] = None
+    if rank == 0:
+        # model_avg is only used with rank 0
+        model_avg = copy.deepcopy(model).to(torch.float64)
+
+    assert params.start_epoch > 0, params.start_epoch
+    checkpoints = load_checkpoint_if_available(
+        params=params, model=model, model_avg=model_avg
+    )
+
+    model.to(device)
+    if world_size > 1:
+        logging.info("Using DDP")
+        model = DDP(model, device_ids=[rank], find_unused_parameters=True)
+
+    optimizer = ScaledAdam(
+        get_parameter_groups_with_lrs(model, lr=params.base_lr, include_names=True),
+        lr=params.base_lr,  # should have no effect
+        clipping_scale=2.0,
+    )
+
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
+
+    if checkpoints and "optimizer" in checkpoints:
+        logging.info("Loading optimizer state dict")
+        optimizer.load_state_dict(checkpoints["optimizer"])
+
+    if (
+        checkpoints
+        and "scheduler" in checkpoints
+        and checkpoints["scheduler"] is not None
+    ):
+        logging.info("Loading scheduler state dict")
+        scheduler.load_state_dict(checkpoints["scheduler"])
+
+    if params.print_diagnostics:
+        opts = diagnostics.TensorDiagnosticOptions(
+            2**22
+        )  # allow 4 megabytes per sub-module
+        diagnostic = diagnostics.attach_diagnostics(model, opts)
+
+    if params.inf_check:
+        register_inf_check_hooks(model)
+
+    tedlium = TedLiumAsrDataModule(args)
+
+    train_cuts = tedlium.train_cuts()
+    train_cuts = train_cuts.filter(lambda c: 1.0 <= c.duration <= 20.0)
+
+    if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
+        # We only load the sampler's state dict when it loads a checkpoint
+        # saved in the middle of an epoch
+        sampler_state_dict = checkpoints["sampler"]
+    else:
+        sampler_state_dict = None
+
+    train_dl = tedlium.train_dataloaders(
+        train_cuts, sampler_state_dict=sampler_state_dict
+    )
+
+    valid_cuts = tedlium.dev_cuts()
+    valid_dl = tedlium.valid_dataloaders(valid_cuts)
+
+    if not params.print_diagnostics:
+        scan_pessimistic_batches_for_oom(
+            model=model,
+            train_dl=train_dl,
+            optimizer=optimizer,
+            sp=sp,
+            params=params,
+        )
+
+    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
+    if checkpoints and "grad_scaler" in checkpoints:
+        logging.info("Loading grad scaler state dict")
+        scaler.load_state_dict(checkpoints["grad_scaler"])
+
+    for epoch in range(params.start_epoch, params.num_epochs + 1):
+        scheduler.step_epoch(epoch - 1)
+        fix_random_seed(params.seed + epoch - 1)
+        train_dl.sampler.set_epoch(epoch - 1)
+
+        if tb_writer is not None:
+            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
+
+        params.cur_epoch = epoch
+
+        train_one_epoch(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sp=sp,
+            train_dl=train_dl,
+            valid_dl=valid_dl,
+            scaler=scaler,
+            tb_writer=tb_writer,
+            world_size=world_size,
+            rank=rank,
+        )
+
+        if params.print_diagnostics:
+            diagnostic.print_diagnostics()
+            break
+
+        save_checkpoint(
+            params=params,
+            model=model,
+            model_avg=model_avg,
+            optimizer=optimizer,
+            scheduler=scheduler,
+            sampler=train_dl.sampler,
+            scaler=scaler,
+            rank=rank,
+        )
+
+    logging.info("Done!")
+
+    if world_size > 1:
+        torch.distributed.barrier()
+        cleanup_dist()
+
+
+def display_and_save_batch(
+    batch: dict,
+    params: AttributeDict,
+    sp: spm.SentencePieceProcessor,
+) -> None:
+    """Display the batch statistics and save the batch into disk.
+
+    Args:
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      params:
+        Parameters for training. See :func:`get_params`.
+      sp:
+        The BPE model.
+    """
+    from lhotse.utils import uuid4
+
+    filename = f"{params.exp_dir}/batch-{uuid4()}.pt"
+    logging.info(f"Saving batch to {filename}")
+    torch.save(batch, filename)
+
+    supervisions = batch["supervisions"]
+    features = batch["inputs"]
+
+    logging.info(f"features shape: {features.shape}")
+
+    y = sp.encode(supervisions["text"], out_type=int)
+    num_tokens = sum(len(i) for i in y)
+    logging.info(f"num tokens: {num_tokens}")
+
+
+def scan_pessimistic_batches_for_oom(
+    model: Union[nn.Module, DDP],
+    train_dl: torch.utils.data.DataLoader,
+    optimizer: torch.optim.Optimizer,
+    sp: spm.SentencePieceProcessor,
+    params: AttributeDict,
+):
+    from lhotse.dataset import find_pessimistic_batches
+
+    logging.info(
+        "Sanity check -- see if any of the batches in epoch 1 would cause OOM."
+    )
+    batches, crit_values = find_pessimistic_batches(train_dl.sampler)
+    for criterion, cuts in batches.items():
+        batch = train_dl.dataset[cuts]
+        try:
+            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+                loss, _ = compute_loss(
+                    params=params,
+                    model=model,
+                    sp=sp,
+                    batch=batch,
+                    is_training=True,
+                )
+            loss.backward()
+            optimizer.zero_grad()
+        except Exception as e:
+            if "CUDA out of memory" in str(e):
+                logging.error(
+                    "Your GPU ran out of memory with the current "
+                    "max_duration setting. We recommend decreasing "
+                    "max_duration and trying again.\n"
+                    f"Failing criterion: {criterion} "
+                    f"(={crit_values[criterion]}) ..."
+                )
+            display_and_save_batch(batch, params=params, sp=sp)
+            raise
+        logging.info(
+            f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB"
+        )
+
+
+def main():
+    parser = get_parser()
+    TedLiumAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    world_size = args.world_size
+    assert world_size >= 1
+    if world_size > 1:
+        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
+    else:
+        run(rank=0, world_size=1, args=args)
+
+
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/tedlium3/ASR/zipformer/zipformer.py b/egs/tedlium3/ASR/zipformer/zipformer.py
new file mode 120000
index 000000000..23011dda7
--- /dev/null
+++ b/egs/tedlium3/ASR/zipformer/zipformer.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/zipformer.py
\ No newline at end of file
diff --git a/egs/wenetspeech/ASR/local/prepare_char_lm_training_data.py b/egs/wenetspeech/ASR/local/prepare_char_lm_training_data.py
new file mode 120000
index 000000000..2374cafdd
--- /dev/null
+++ b/egs/wenetspeech/ASR/local/prepare_char_lm_training_data.py
@@ -0,0 +1 @@
+../../../aishell/ASR/local/prepare_char_lm_training_data.py
\ No newline at end of file
diff --git a/egs/wenetspeech/ASR/local/preprocess_wenetspeech.py b/egs/wenetspeech/ASR/local/preprocess_wenetspeech.py
index 93ce750f8..5de3c23a9 100755
--- a/egs/wenetspeech/ASR/local/preprocess_wenetspeech.py
+++ b/egs/wenetspeech/ASR/local/preprocess_wenetspeech.py
@@ -16,6 +16,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import argparse
 import logging
 import re
 from pathlib import Path
@@ -24,6 +25,7 @@ from lhotse import CutSet, SupervisionSegment
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall import setup_logger
+from icefall.utils import str2bool
 
 # Similar text filtering and normalization procedure as in:
 # https://github.com/SpeechColab/WenetSpeech/blob/main/toolkits/kaldi/wenetspeech_data_prep.sh
@@ -45,7 +47,7 @@ def has_no_oov(
     return oov_pattern.search(sup.text) is None
 
 
-def preprocess_wenet_speech():
+def preprocess_wenet_speech(perturb_speed: bool = False):
     src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     output_dir.mkdir(exist_ok=True)
@@ -110,7 +112,7 @@ def preprocess_wenet_speech():
         )
         # Run data augmentation that needs to be done in the
         # time domain.
-        if partition not in ["DEV", "TEST_NET", "TEST_MEETING"]:
+        if partition not in ["DEV", "TEST_NET", "TEST_MEETING"] and perturb_speed:
             logging.info(
                 f"Speed perturb for {partition} with factors 0.9 and 1.1 "
                 "(Perturbing may take 8 minutes and saving may take 20 minutes)"
@@ -120,10 +122,22 @@ def preprocess_wenet_speech():
         cut_set.to_file(raw_cuts_path)
 
 
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--perturb-speed",
+        type=str2bool,
+        default=False,
+        help="Enable 0.9 and 1.1 speed perturbation for data augmentation. Default: False.",
+    )
+    return parser.parse_args()
+
+
 def main():
     setup_logger(log_filename="./log-preprocess-wenetspeech")
 
-    preprocess_wenet_speech()
+    args = get_args()
+    preprocess_wenet_speech(perturb_speed=args.perturb_speed)
     logging.info("Done")
 
 
diff --git a/egs/wenetspeech/ASR/local/sort_lm_training_data.py b/egs/wenetspeech/ASR/local/sort_lm_training_data.py
new file mode 120000
index 000000000..efef2c445
--- /dev/null
+++ b/egs/wenetspeech/ASR/local/sort_lm_training_data.py
@@ -0,0 +1 @@
+../../../aishell/ASR/local/sort_lm_training_data.py
\ No newline at end of file
diff --git a/egs/wenetspeech/ASR/prepare.sh b/egs/wenetspeech/ASR/prepare.sh
index f7b521794..097a59a5f 100755
--- a/egs/wenetspeech/ASR/prepare.sh
+++ b/egs/wenetspeech/ASR/prepare.sh
@@ -91,7 +91,7 @@ fi
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Preprocess WenetSpeech manifest"
   if [ ! -f data/fbank/.preprocess_complete ]; then
-    python3 ./local/preprocess_wenetspeech.py
+    python3 ./local/preprocess_wenetspeech.py --perturb-speed True
     touch data/fbank/.preprocess_complete
   fi
 fi
diff --git a/icefall/rnn_lm/train.py b/icefall/rnn_lm/train.py
index 0f0887859..3d206d139 100755
--- a/icefall/rnn_lm/train.py
+++ b/icefall/rnn_lm/train.py
@@ -99,6 +99,15 @@ def get_parser():
         """,
     )
 
+    parser.add_argument(
+        "--start-batch",
+        type=int,
+        default=0,
+        help="""If positive, --start-epoch is ignored and
+        it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt
+        """,
+    )
+
     parser.add_argument(
         "--exp-dir",
         type=str,
@@ -242,7 +251,9 @@ def load_checkpoint_if_available(
 ) -> None:
     """Load checkpoint from file.
 
-    If params.start_epoch is positive, it will load the checkpoint from
+    If params.start_batch is positive, it will load the checkpoint from
+    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
+    params.start_epoch is larger than 1, it will load the checkpoint from
     `params.start_epoch - 1`. Otherwise, this function does nothing.
 
     Apart from loading state dict for `model`, `optimizer` and `scheduler`,
@@ -261,10 +272,14 @@ def load_checkpoint_if_available(
     Returns:
       Return None.
     """
-    if params.start_epoch <= 0:
-        return
 
-    filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
+    if params.start_batch > 0:
+        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
+    elif params.start_epoch > 1:
+        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
+    else:
+        return None
+
     logging.info(f"Loading checkpoint: {filename}")
     saved_params = load_checkpoint(
         filename,
@@ -283,6 +298,13 @@ def load_checkpoint_if_available(
     for k in keys:
         params[k] = saved_params[k]
 
+    if params.start_batch > 0:
+        if "cur_epoch" in saved_params:
+            params["start_epoch"] = saved_params["cur_epoch"]
+
+        if "cur_batch_idx" in saved_params:
+            params["cur_batch_idx"] = saved_params["cur_batch_idx"]
+
     return saved_params
 
 
@@ -438,7 +460,14 @@ def train_one_epoch(
 
     tot_loss = MetricsTracker()
 
+    cur_batch_idx = params.get("cur_batch_idx", 0)
+
     for batch_idx, batch in enumerate(train_dl):
+
+        if batch_idx < cur_batch_idx:
+            continue
+        cur_batch_idx = batch_idx
+
         params.batch_idx_train += 1
         x, y, sentence_lengths = batch
         batch_size = x.size(0)
@@ -463,6 +492,7 @@ def train_one_epoch(
             params.batch_idx_train > 0
             and params.batch_idx_train % params.save_every_n == 0
         ):
+            params.cur_batch_idx = batch_idx
             save_checkpoint_with_global_batch_idx(
                 out_dir=params.exp_dir,
                 global_batch_idx=params.batch_idx_train,
@@ -471,6 +501,7 @@ def train_one_epoch(
                 optimizer=optimizer,
                 rank=rank,
             )
+            del params.cur_batch_idx
 
         if batch_idx % params.log_interval == 0:
             # Note: "frames" here means "num_tokens"
diff --git a/icefall/utils.py b/icefall/utils.py
index dfe9a7b42..0feff9dc8 100644
--- a/icefall/utils.py
+++ b/icefall/utils.py
@@ -429,6 +429,8 @@ def store_transcripts(
       texts:
         An iterable of tuples. The first element is the cur_id, the second is
         the reference transcript and the third element is the predicted result.
+        If it is a multi-talker ASR system, the ref and hyp may also be lists of
+        strings.
     Returns:
       Return None.
     """
@@ -886,8 +888,167 @@ def write_error_stats_with_timestamps(
         hyp_count = corr + hyp_sub + ins
 
         print(f"{word}   {corr} {tot_errs} {ref_count} {hyp_count}", file=f)
+    return float(tot_err_rate), float(mean_delay), float(var_delay)
 
-    return tot_err_rate, mean_delay, var_delay
+
+def write_surt_error_stats(
+    f: TextIO,
+    test_set_name: str,
+    results: List[Tuple[str, str]],
+    enable_log: bool = True,
+    num_channels: int = 2,
+) -> float:
+    """Write statistics based on predicted results and reference transcripts for SURT
+    multi-talker ASR systems. The difference between this and the `write_error_stats`
+    is that this function finds the optimal speaker-agnostic WER using the ``meeteval``
+    toolkit.
+
+    Args:
+        f: File to write the statistics to.
+        test_set_name: Name of the test set.
+        results: List of tuples containing the utterance ID and the predicted
+            transcript.
+        enable_log: Whether to enable logging.
+        num_channels: Number of output channels/branches. Defaults to 2.
+    Returns:
+      Return None.
+    """
+    from meeteval.wer import wer
+
+    subs: Dict[Tuple[str, str], int] = defaultdict(int)
+    ins: Dict[str, int] = defaultdict(int)
+    dels: Dict[str, int] = defaultdict(int)
+    ref_lens: List[int] = []
+
+    print(
+        "Search below for sections starting with PER-UTT DETAILS:, "
+        "SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS:",
+        file=f,
+    )
+
+    print("", file=f)
+    print("PER-UTT DETAILS: corr or (ref->hyp)  ", file=f)
+
+    # `words` stores counts per word, as follows:
+    #   corr, ref_sub, hyp_sub, ins, dels
+    words: Dict[str, List[int]] = defaultdict(lambda: [0, 0, 0, 0, 0])
+    num_corr = 0
+    ERR = "*"
+    for cut_id, ref, hyp in results:
+        # First compute the optimal assignment of references to output channels
+        orc_wer = wer.orc_word_error_rate(ref, hyp)
+        assignment = orc_wer.assignment
+        refs = [[] for _ in range(num_channels)]
+        # Assign references to channels
+        for i, ref_text in zip(assignment, ref):
+            refs[i] += ref_text.split()
+        hyps = [hyp_text.split() for hyp_text in hyp]
+        # Now compute the WER for each channel
+        for ref_c, hyp_c in zip(refs, hyps):
+            ref_lens.append(len(ref_c))
+            ali = kaldialign.align(ref_c, hyp_c, ERR)
+            for ref_word, hyp_word in ali:
+                if ref_word == ERR:
+                    ins[hyp_word] += 1
+                    words[hyp_word][3] += 1
+                elif hyp_word == ERR:
+                    dels[ref_word] += 1
+                    words[ref_word][4] += 1
+                elif hyp_word != ref_word:
+                    subs[(ref_word, hyp_word)] += 1
+                    words[ref_word][1] += 1
+                    words[hyp_word][2] += 1
+                else:
+                    words[ref_word][0] += 1
+                    num_corr += 1
+            combine_successive_errors = True
+            if combine_successive_errors:
+                ali = [[[x], [y]] for x, y in ali]
+                for i in range(len(ali) - 1):
+                    if ali[i][0] != ali[i][1] and ali[i + 1][0] != ali[i + 1][1]:
+                        ali[i + 1][0] = ali[i][0] + ali[i + 1][0]
+                        ali[i + 1][1] = ali[i][1] + ali[i + 1][1]
+                        ali[i] = [[], []]
+                ali = [
+                    [
+                        list(filter(lambda a: a != ERR, x)),
+                        list(filter(lambda a: a != ERR, y)),
+                    ]
+                    for x, y in ali
+                ]
+                ali = list(filter(lambda x: x != [[], []], ali))
+                ali = [
+                    [
+                        ERR if x == [] else " ".join(x),
+                        ERR if y == [] else " ".join(y),
+                    ]
+                    for x, y in ali
+                ]
+
+            print(
+                f"{cut_id}:\t"
+                + " ".join(
+                    (
+                        ref_word
+                        if ref_word == hyp_word
+                        else f"({ref_word}->{hyp_word})"
+                        for ref_word, hyp_word in ali
+                    )
+                ),
+                file=f,
+            )
+    ref_len = sum(ref_lens)
+    sub_errs = sum(subs.values())
+    ins_errs = sum(ins.values())
+    del_errs = sum(dels.values())
+    tot_errs = sub_errs + ins_errs + del_errs
+    tot_err_rate = "%.2f" % (100.0 * tot_errs / ref_len)
+
+    if enable_log:
+        logging.info(
+            f"[{test_set_name}] %WER {tot_errs / ref_len:.2%} "
+            f"[{tot_errs} / {ref_len}, {ins_errs} ins, "
+            f"{del_errs} del, {sub_errs} sub ]"
+        )
+
+    print(f"%WER = {tot_err_rate}", file=f)
+    print(
+        f"Errors: {ins_errs} insertions, {del_errs} deletions, "
+        f"{sub_errs} substitutions, over {ref_len} reference "
+        f"words ({num_corr} correct)",
+        file=f,
+    )
+
+    print("", file=f)
+    print("SUBSTITUTIONS: count ref -> hyp", file=f)
+
+    for count, (ref, hyp) in sorted([(v, k) for k, v in subs.items()], reverse=True):
+        print(f"{count}   {ref} -> {hyp}", file=f)
+
+    print("", file=f)
+    print("DELETIONS: count ref", file=f)
+    for count, ref in sorted([(v, k) for k, v in dels.items()], reverse=True):
+        print(f"{count}   {ref}", file=f)
+
+    print("", file=f)
+    print("INSERTIONS: count hyp", file=f)
+    for count, hyp in sorted([(v, k) for k, v in ins.items()], reverse=True):
+        print(f"{count}   {hyp}", file=f)
+
+    print("", file=f)
+    print("PER-WORD STATS: word  corr tot_errs count_in_ref count_in_hyp", file=f)
+    for _, word, counts in sorted(
+        [(sum(v[1:]), k, v) for k, v in words.items()], reverse=True
+    ):
+        (corr, ref_sub, hyp_sub, ins, dels) = counts
+        tot_errs = ref_sub + hyp_sub + ins + dels
+        ref_count = corr + ref_sub + dels
+        hyp_count = corr + hyp_sub + ins
+
+        print(f"{word}   {corr} {tot_errs} {ref_count} {hyp_count}", file=f)
+
+    print(f"%WER = {tot_err_rate}", file=f)
+    return float(tot_err_rate)
 
 
 class MetricsTracker(collections.defaultdict):