diff --git a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh similarity index 100% rename from .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh rename to .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh index 761eb72e2..7d2853c17 100755 --- a/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh +++ b/.github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh @@ -21,9 +21,9 @@ tree $repo/ ls -lh $repo/test_wavs/*.wav pushd $repo/exp -git lfs pull --include "data/lang_bpe_500/HLG.pt" git lfs pull --include "data/lang_bpe_500/L.pt" git lfs pull --include "data/lang_bpe_500/LG.pt" +git lfs pull --include "data/lang_bpe_500/HLG.pt" git lfs pull --include "data/lang_bpe_500/Linv.pt" git lfs pull --include "data/lang_bpe_500/bpe.model" git lfs pull --include "exp/cpu_jit.pt" diff --git a/.github/scripts/run-librispeech-streaming-zipformer-2023-05-18.sh b/.github/scripts/run-librispeech-streaming-zipformer-2023-05-18.sh index 45324cb27..f4e2124b1 100755 --- a/.github/scripts/run-librispeech-streaming-zipformer-2023-05-18.sh +++ b/.github/scripts/run-librispeech-streaming-zipformer-2023-05-18.sh @@ -23,6 +23,7 @@ ls -lh $repo/test_wavs/*.wav pushd $repo/exp git lfs pull --include "data/lang_bpe_500/bpe.model" +git lfs pull --include "data/lang_bpe_500/tokens.txt" git lfs pull --include "exp/jit_script_chunk_16_left_128.pt" git lfs pull --include "exp/pretrained.pt" ln -s pretrained.pt epoch-99.pt @@ -33,7 +34,7 @@ log "Export to torchscript model" ./zipformer/export.py \ --exp-dir $repo/exp \ --use-averaged-model false \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ --causal 1 \ --chunk-size 16 \ --left-context-frames 128 \ @@ -46,7 +47,7 @@ ls -lh $repo/exp/*.pt log "Decode with models exported by torch.jit.script()" ./zipformer/jit_pretrained_streaming.py \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ --nn-model-filename $repo/exp/jit_script_chunk_16_left_128.pt \ $repo/test_wavs/1089-134686-0001.wav @@ -60,7 +61,7 @@ for method in greedy_search modified_beam_search fast_beam_search; do --method $method \ --beam-size 4 \ --checkpoint $repo/exp/pretrained.pt \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ $repo/test_wavs/1089-134686-0001.wav \ $repo/test_wavs/1221-135766-0001.wav \ $repo/test_wavs/1221-135766-0002.wav diff --git a/.github/scripts/run-librispeech-zipformer-2023-05-18.sh b/.github/scripts/run-librispeech-zipformer-2023-05-18.sh index 6aac1793e..fb1a0149d 100755 --- a/.github/scripts/run-librispeech-zipformer-2023-05-18.sh +++ b/.github/scripts/run-librispeech-zipformer-2023-05-18.sh @@ -23,6 +23,7 @@ ls -lh $repo/test_wavs/*.wav pushd $repo/exp git lfs pull --include "data/lang_bpe_500/bpe.model" +git lfs pull --include "data/lang_bpe_500/tokens.txt" git lfs pull --include "exp/jit_script.pt" git lfs pull --include "exp/pretrained.pt" ln -s pretrained.pt epoch-99.pt @@ -33,7 +34,7 @@ log "Export to torchscript model" ./zipformer/export.py \ --exp-dir $repo/exp \ --use-averaged-model false \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ --epoch 99 \ --avg 1 \ --jit 1 @@ -43,7 +44,7 @@ ls -lh $repo/exp/*.pt log "Decode with models exported by 
torch.jit.script()" ./zipformer/jit_pretrained.py \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ --nn-model-filename $repo/exp/jit_script.pt \ $repo/test_wavs/1089-134686-0001.wav \ $repo/test_wavs/1221-135766-0001.wav \ @@ -56,7 +57,7 @@ for method in greedy_search modified_beam_search fast_beam_search; do --method $method \ --beam-size 4 \ --checkpoint $repo/exp/pretrained.pt \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ $repo/test_wavs/1089-134686-0001.wav \ $repo/test_wavs/1221-135766-0001.wav \ $repo/test_wavs/1221-135766-0002.wav diff --git a/.github/scripts/run-librispeech-zipformer-ctc-2023-06-14.sh b/.github/scripts/run-librispeech-zipformer-ctc-2023-06-14.sh index cfa9c420c..0026d2109 100755 --- a/.github/scripts/run-librispeech-zipformer-ctc-2023-06-14.sh +++ b/.github/scripts/run-librispeech-zipformer-ctc-2023-06-14.sh @@ -23,6 +23,7 @@ ls -lh $repo/test_wavs/*.wav pushd $repo/exp git lfs pull --include "data/lang_bpe_500/bpe.model" +git lfs pull --include "data/lang_bpe_500/tokens.txt" git lfs pull --include "data/lang_bpe_500/HLG.pt" git lfs pull --include "data/lang_bpe_500/L.pt" git lfs pull --include "data/lang_bpe_500/LG.pt" @@ -40,7 +41,7 @@ log "Export to torchscript model" --use-transducer 1 \ --use-ctc 1 \ --use-averaged-model false \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ --epoch 99 \ --avg 1 \ --jit 1 @@ -51,7 +52,7 @@ log "Decode with models exported by torch.jit.script()" for method in ctc-decoding 1best; do ./zipformer/jit_pretrained_ctc.py \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ --model-filename $repo/exp/jit_script.pt \ --HLG $repo/data/lang_bpe_500/HLG.pt \ --words-file $repo/data/lang_bpe_500/words.txt \ @@ -71,8 +72,7 @@ for method in ctc-decoding 1best; do --use-ctc 1 \ --method $method \ --checkpoint $repo/exp/pretrained.pt \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ - --words-file $repo/data/lang_bpe_500/words.txt \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ --HLG $repo/data/lang_bpe_500/HLG.pt \ --G $repo/data/lm/G_4_gram.pt \ --words-file $repo/data/lang_bpe_500/words.txt \ diff --git a/.github/scripts/test-ncnn-export.sh b/.github/scripts/test-ncnn-export.sh index 52491d2ea..ac16131d0 100755 --- a/.github/scripts/test-ncnn-export.sh +++ b/.github/scripts/test-ncnn-export.sh @@ -195,14 +195,14 @@ git lfs pull --include "data/lang_char_bpe/Linv.pt" git lfs pull --include "exp/pretrained.pt" cd exp -ln -s pretrained.pt epoch-99.pt +ln -s pretrained.pt epoch-9999.pt popd ./pruned_transducer_stateless7_streaming/export-for-ncnn-zh.py \ --lang-dir $repo/data/lang_char_bpe \ --exp-dir $repo/exp \ --use-averaged-model 0 \ - --epoch 99 \ + --epoch 9999 \ --avg 1 \ --decode-chunk-len 32 \ --num-encoder-layers "2,4,3,2,4" \ diff --git a/.github/workflows/run-aishell-2022-06-20.yml b/.github/workflows/run-aishell-2022-06-20.yml index c46cea0f6..d14196f38 100644 --- a/.github/workflows/run-aishell-2022-06-20.yml +++ b/.github/workflows/run-aishell-2022-06-20.yml @@ -44,7 +44,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false @@ -119,5 +119,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: aishell-torch-${{ matrix.torch }}-python-${{ 
matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless3-2022-06-20 + name: aishell-torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless3-2022-06-20 path: egs/aishell/ASR/pruned_transducer_stateless3/exp/ diff --git a/.github/workflows/run-gigaspeech-2022-05-13.yml b/.github/workflows/run-gigaspeech-2022-05-13.yml index f8ee25cc4..0e47f7538 100644 --- a/.github/workflows/run-gigaspeech-2022-05-13.yml +++ b/.github/workflows/run-gigaspeech-2022-05-13.yml @@ -43,7 +43,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false @@ -122,5 +122,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-gigaspeech-pruned_transducer_stateless2-2022-05-12 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-gigaspeech-pruned_transducer_stateless2-2022-05-12 path: egs/gigaspeech/ASR/pruned_transducer_stateless2/exp/ diff --git a/.github/workflows/run-librispeech-2022-03-12.yml b/.github/workflows/run-librispeech-2022-03-12.yml index d42202b79..3edbe43ec 100644 --- a/.github/workflows/run-librispeech-2022-03-12.yml +++ b/.github/workflows/run-librispeech-2022-03-12.yml @@ -43,7 +43,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false @@ -155,5 +155,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless-2022-03-12 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless-2022-03-12 path: egs/librispeech/ASR/pruned_transducer_stateless/exp/ diff --git a/.github/workflows/run-librispeech-2022-04-29.yml b/.github/workflows/run-librispeech-2022-04-29.yml index f42c8f27a..bb44a073b 100644 --- a/.github/workflows/run-librispeech-2022-04-29.yml +++ b/.github/workflows/run-librispeech-2022-04-29.yml @@ -43,7 +43,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false @@ -174,12 +174,12 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless2-2022-04-29 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless2-2022-04-29 path: egs/librispeech/ASR/pruned_transducer_stateless2/exp/ - name: Upload decoding results for pruned_transducer_stateless3 uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless3-2022-04-29 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless3-2022-04-29 path: egs/librispeech/ASR/pruned_transducer_stateless3/exp/ diff --git a/.github/workflows/run-librispeech-2022-05-13.yml b/.github/workflows/run-librispeech-2022-05-13.yml index 
1fbd96157..e7b53b21c 100644 --- a/.github/workflows/run-librispeech-2022-05-13.yml +++ b/.github/workflows/run-librispeech-2022-05-13.yml @@ -43,7 +43,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false @@ -155,5 +155,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless5-2022-05-13 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless5-2022-05-13 path: egs/librispeech/ASR/pruned_transducer_stateless5/exp/ diff --git a/.github/workflows/run-librispeech-2022-11-11-stateless7.yml b/.github/workflows/run-librispeech-2022-11-11-stateless7.yml index 596596bd9..7e378c9a1 100644 --- a/.github/workflows/run-librispeech-2022-11-11-stateless7.yml +++ b/.github/workflows/run-librispeech-2022-11-11-stateless7.yml @@ -155,5 +155,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-2022-11-11 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-2022-11-11 path: egs/librispeech/ASR/pruned_transducer_stateless7/exp/ diff --git a/.github/workflows/run-librispeech-2022-11-14-stateless8.yml b/.github/workflows/run-librispeech-2022-11-14-stateless8.yml index dca7d6d25..a2c1a0ad6 100644 --- a/.github/workflows/run-librispeech-2022-11-14-stateless8.yml +++ b/.github/workflows/run-librispeech-2022-11-14-stateless8.yml @@ -155,5 +155,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless8-2022-11-14 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless8-2022-11-14 path: egs/librispeech/ASR/pruned_transducer_stateless8/exp/ diff --git a/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml b/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml index cd41e988e..500ab1736 100644 --- a/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml +++ b/.github/workflows/run-librispeech-2022-12-01-stateless7-ctc.yml @@ -159,5 +159,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-ctc-2022-12-01 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-ctc-2022-12-01 path: egs/librispeech/ASR/pruned_transducer_stateless7_ctc/exp/ diff --git a/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml b/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml index 91242c401..1a7f9f594 100644 --- a/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml +++ b/.github/workflows/run-librispeech-2022-12-08-zipformer-mmi.yml @@ -163,5 +163,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch 
}}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer_mmi-2022-12-08 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer_mmi-2022-12-08 path: egs/librispeech/ASR/zipformer_mmi/exp/ diff --git a/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml b/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml index 8490a62fc..68014e20c 100644 --- a/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml +++ b/.github/workflows/run-librispeech-2022-12-29-stateless7-streaming.yml @@ -168,5 +168,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-streaming-2022-12-29 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-streaming-2022-12-29 path: egs/librispeech/ASR/pruned_transducer_stateless7_streaming/exp/ diff --git a/.github/workflows/run-librispeech-2022-12-15-stateless7-ctc-bs.yml b/.github/workflows/run-librispeech-2023-01-29-stateless7-ctc-bs.yml similarity index 96% rename from .github/workflows/run-librispeech-2022-12-15-stateless7-ctc-bs.yml rename to .github/workflows/run-librispeech-2023-01-29-stateless7-ctc-bs.yml index e0130a636..821abc25d 100644 --- a/.github/workflows/run-librispeech-2022-12-15-stateless7-ctc-bs.yml +++ b/.github/workflows/run-librispeech-2023-01-29-stateless7-ctc-bs.yml @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -name: run-librispeech-2022-12-15-stateless7-ctc-bs +name: run-librispeech-2023-01-29-stateless7-ctc-bs # zipformer on: @@ -34,7 +34,7 @@ on: - cron: "50 15 * * *" jobs: - run_librispeech_2022_12_15_zipformer_ctc_bs: + run_librispeech_2023_01_29_zipformer_ctc_bs: if: github.event.label.name == 'run-decode' || github.event.label.name == 'blank-skip' || github.event_name == 'push' || github.event_name == 'schedule' runs-on: ${{ matrix.os }} strategy: @@ -124,7 +124,7 @@ jobs: export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH - .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2022-12-15.sh + .github/scripts/run-librispeech-pruned-transducer-stateless7-ctc-bs-2023-01-29.sh - name: Display decoding results for librispeech pruned_transducer_stateless7_ctc_bs if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' @@ -159,5 +159,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless7-ctc-bs-2022-12-15 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless7-ctc-bs-2023-01-29 path: egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/exp/ diff --git a/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml b/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml index 40a37da57..905515dc4 100644 --- a/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml +++ b/.github/workflows/run-librispeech-conformer-ctc3-2022-11-28.yml @@ -151,5 +151,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 
'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-conformer_ctc3-2022-11-28 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-conformer_ctc3-2022-11-28 path: egs/librispeech/ASR/conformer_ctc3/exp/ diff --git a/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml b/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml index aba29d066..501fae38c 100644 --- a/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml +++ b/.github/workflows/run-librispeech-lstm-transducer-stateless2-2022-09-03.yml @@ -26,7 +26,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.8] fail-fast: false @@ -159,5 +159,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'shallow-fusion' || github.event.label.name == 'LODR' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-lstm_transducer_stateless2-2022-09-03 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-lstm_transducer_stateless2-2022-09-03 path: egs/librispeech/ASR/lstm_transducer_stateless2/exp/ diff --git a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml index fd497601d..bf73d4f18 100644 --- a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml +++ b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml @@ -43,7 +43,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false @@ -153,5 +153,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless3-2022-04-29 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless3-2022-04-29 path: egs/librispeech/ASR/pruned_transducer_stateless3/exp/ diff --git a/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml b/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml index 57fe5b999..6ea308468 100644 --- a/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml +++ b/.github/workflows/run-librispeech-streaming-transducer-stateless2-2022-06-26.yml @@ -43,7 +43,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false @@ -155,5 +155,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-pruned_transducer_stateless2-2022-06-26 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless2-2022-06-26 path: egs/librispeech/ASR/pruned_transducer_stateless2/exp/ diff --git a/.github/workflows/run-librispeech-streaming-zipformer-2023-05-18.yml b/.github/workflows/run-librispeech-streaming-zipformer-2023-05-18.yml index ed934d56d..5145fb43c 100644 --- 
a/.github/workflows/run-librispeech-streaming-zipformer-2023-05-18.yml +++ b/.github/workflows/run-librispeech-streaming-zipformer-2023-05-18.yml @@ -170,5 +170,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer-2022-11-11 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer-2022-11-11 path: egs/librispeech/ASR/zipformer/exp/ diff --git a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml index 515122a66..9fe2f0389 100644 --- a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml +++ b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml @@ -43,7 +43,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false @@ -155,5 +155,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless2-2022-04-19 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless2-2022-04-19 path: egs/librispeech/ASR/transducer_stateless2/exp/ diff --git a/.github/workflows/run-librispeech-zipformer-2023-05-18.yml b/.github/workflows/run-librispeech-zipformer-2023-05-18.yml index 7ecf0d2a0..e9d235ad1 100644 --- a/.github/workflows/run-librispeech-zipformer-2023-05-18.yml +++ b/.github/workflows/run-librispeech-zipformer-2023-05-18.yml @@ -155,5 +155,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer-2022-11-11 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer-2022-11-11 path: egs/librispeech/ASR/zipformer/exp/ diff --git a/.github/workflows/run-librispeech-zipformer-ctc-2023-06-14.yml b/.github/workflows/run-librispeech-zipformer-ctc-2023-06-14.yml index 569ce48fc..48f0b1532 100644 --- a/.github/workflows/run-librispeech-zipformer-ctc-2023-06-14.yml +++ b/.github/workflows/run-librispeech-zipformer-ctc-2023-06-14.yml @@ -151,5 +151,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-zipformer-2022-11-11 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-zipformer-2022-11-11 path: egs/librispeech/ASR/zipformer/exp/ diff --git a/.github/workflows/run-pretrained-conformer-ctc.yml b/.github/workflows/run-pretrained-conformer-ctc.yml index 8aaea35f6..bcd326b9d 100644 --- a/.github/workflows/run-pretrained-conformer-ctc.yml +++ b/.github/workflows/run-pretrained-conformer-ctc.yml @@ -33,7 +33,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml index 03a1df48e..1e5b25f5c 100644 --- 
a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml +++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml @@ -42,7 +42,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false @@ -154,5 +154,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless_multi_datasets-100h-2022-02-21 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless_multi_datasets-100h-2022-02-21 path: egs/librispeech/ASR/transducer_stateless_multi_datasets/exp/ diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml index 8da4ff56a..9063c0ed6 100644 --- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml +++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml @@ -42,7 +42,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false @@ -154,5 +154,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless_multi_datasets-100h-2022-03-01 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless_multi_datasets-100h-2022-03-01 path: egs/librispeech/ASR/transducer_stateless_multi_datasets/exp/ diff --git a/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml b/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml index 0b3e70d77..2d24528d3 100644 --- a/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml +++ b/.github/workflows/run-pretrained-transducer-stateless-modified-2-aishell.yml @@ -33,7 +33,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false diff --git a/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml b/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml index a6a59d339..761b26131 100644 --- a/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml +++ b/.github/workflows/run-pretrained-transducer-stateless-modified-aishell.yml @@ -33,7 +33,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false diff --git a/.github/workflows/run-pretrained-transducer-stateless.yml b/.github/workflows/run-pretrained-transducer-stateless.yml index 98d84bf96..e46b9a849 100644 --- a/.github/workflows/run-pretrained-transducer-stateless.yml +++ b/.github/workflows/run-pretrained-transducer-stateless.yml @@ -42,7 +42,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false @@ -154,5 +154,5 @@ jobs: uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' with: - name: torch-${{ matrix.torch }}-python-${{ 
matrix.python-version }}-ubuntu-18.04-cpu-transducer_stateless-2022-02-07 + name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-transducer_stateless-2022-02-07 path: egs/librispeech/ASR/transducer_stateless/exp/ diff --git a/.github/workflows/run-pretrained-transducer.yml b/.github/workflows/run-pretrained-transducer.yml index 8c1a652e0..190e446bc 100644 --- a/.github/workflows/run-pretrained-transducer.yml +++ b/.github/workflows/run-pretrained-transducer.yml @@ -33,7 +33,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.7, 3.8, 3.9] fail-fast: false diff --git a/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml b/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml index 6c70c646b..319a5558a 100644 --- a/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml +++ b/.github/workflows/run-wenetspeech-pruned-transducer-stateless2.yml @@ -33,7 +33,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-18.04] + os: [ubuntu-latest] python-version: [3.8] fail-fast: false diff --git a/.github/workflows/run-yesno-recipe.yml b/.github/workflows/run-yesno-recipe.yml index f997e634a..8a2c94829 100644 --- a/.github/workflows/run-yesno-recipe.yml +++ b/.github/workflows/run-yesno-recipe.yml @@ -33,7 +33,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - # os: [ubuntu-18.04, macos-10.15] + # os: [ubuntu-latest, macos-10.15] # TODO: enable macOS for CPU testing os: [ubuntu-latest] python-version: [3.8] diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e04fb5655..363556bb7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -35,9 +35,9 @@ jobs: matrix: os: [ubuntu-latest] python-version: ["3.8"] - torch: ["1.10.0"] - torchaudio: ["0.10.0"] - k2-version: ["1.23.2.dev20221201"] + torch: ["1.13.0"] + torchaudio: ["0.13.0"] + k2-version: ["1.24.3.dev20230719"] fail-fast: false @@ -66,14 +66,14 @@ jobs: pip install torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html pip install torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/ + pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.github.io/k2/cpu.html pip install git+https://github.com/lhotse-speech/lhotse # icefall requirements pip uninstall -y protobuf pip install --no-binary protobuf protobuf==3.20.* pip install kaldifst - pip install onnxruntime + pip install onnxruntime matplotlib pip install -r requirements.txt - name: Install graphviz @@ -83,13 +83,6 @@ jobs: python3 -m pip install -qq graphviz sudo apt-get -qq install graphviz - - name: Install graphviz - if: startsWith(matrix.os, 'macos') - shell: bash - run: | - python3 -m pip install -qq graphviz - brew install -q graphviz - - name: Run tests if: startsWith(matrix.os, 'ubuntu') run: | @@ -129,40 +122,10 @@ jobs: cd ../transducer_lstm pytest -v -s - - name: Run tests - if: startsWith(matrix.os, 'macos') - run: | - ls -lh - export PYTHONPATH=$PWD:$PWD/lhotse:$PYTHONPATH - lib_path=$(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())") - echo "lib_path: $lib_path" - export DYLD_LIBRARY_PATH=$lib_path:$DYLD_LIBRARY_PATH - pytest -v -s ./test - - # run tests for conformer ctc - cd egs/librispeech/ASR/conformer_ctc + cd ../zipformer pytest -v 
-s - cd ../pruned_transducer_stateless - pytest -v -s - - cd ../pruned_transducer_stateless2 - pytest -v -s - - cd ../pruned_transducer_stateless3 - pytest -v -s - - cd ../pruned_transducer_stateless4 - pytest -v -s - - cd ../transducer_stateless - pytest -v -s - - # cd ../transducer - # pytest -v -s - - cd ../transducer_stateless2 - pytest -v -s - - cd ../transducer_lstm - pytest -v -s + - uses: actions/upload-artifact@v2 + with: + path: egs/librispeech/ASR/zipformer/swoosh.pdf + name: swoosh.pdf diff --git a/docs/source/conf.py b/docs/source/conf.py index 6901dec02..0ff3f801c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -86,6 +86,7 @@ rst_epilog = """ .. _git-lfs: https://git-lfs.com/ .. _ncnn: https://github.com/tencent/ncnn .. _LibriSpeech: https://www.openslr.org/12 +.. _Gigaspeech: https://github.com/SpeechColab/GigaSpeech .. _musan: http://www.openslr.org/17/ .. _ONNX: https://github.com/onnx/onnx .. _onnxruntime: https://github.com/microsoft/onnxruntime diff --git a/docs/source/decoding-with-langugage-models/LODR.rst b/docs/source/decoding-with-langugage-models/LODR.rst new file mode 100644 index 000000000..7ffa0c128 --- /dev/null +++ b/docs/source/decoding-with-langugage-models/LODR.rst @@ -0,0 +1,184 @@ +.. _LODR: + +LODR for RNN Transducer +======================= + + +As a type of E2E model, neural transducers are usually considered to have an internal +language model, which learns language-level information from the training corpus. +In real-life scenarios, there is often a mismatch between the training corpus and the target domain. +This mismatch can be a problem when decoding neural transducer models with external language models, as the internal +language model can act "against" the external LM. In this tutorial, we show how to use +`Low-order Density Ratio `_ to alleviate this effect to further improve the performance +of language model integration. + +.. note:: + + This tutorial is based on the recipe + `pruned_transducer_stateless7_streaming `_, + which is a streaming transducer model trained on `LibriSpeech`_. + However, you can easily apply LODR to other recipes. + If you encounter any problems, please open an issue here `icefall `__. + + +.. note:: + + For simplicity, the training and testing corpora in this tutorial are the same (`LibriSpeech`_). However, + you can change the testing set to any other domains (e.g. `GigaSpeech`_) and prepare the language models + using that corpus. + +First, let's have a look at some background information. As the predecessor of LODR, Density Ratio (DR) was first proposed `here `_ +to address the language information mismatch between the training +corpus (source domain) and the testing corpus (target domain). Assuming that the source domain and the test domain +are acoustically similar, DR derives the following formula for decoding with Bayes' theorem: + +.. math:: + + \text{score}\left(y_u|\mathit{x},y\right) = + \log p\left(y_u|\mathit{x},y_{1:u-1}\right) + + \lambda_1 \log p_{\text{Target LM}}\left(y_u|\mathit{x},y_{1:u-1}\right) - + \lambda_2 \log p_{\text{Source LM}}\left(y_u|\mathit{x},y_{1:u-1}\right) + + +where :math:`\lambda_1` and :math:`\lambda_2` are the weights of LM scores for the target domain and source domain, respectively. +Here, the source domain LM is trained on the training corpus. The only difference in the above formula compared to +shallow fusion is the subtraction of the source domain LM. + +Some works treat the predictor and the joiner of the neural transducer as its internal LM. 
However, the LM is +considered to be weak and can only capture low-level language information. Therefore, `LODR `__ proposed to use +a low-order n-gram LM as an approximation of the ILM of the neural transducer. This leads to the following formula +during decoding for transducer models: + +.. math:: + + \text{score}\left(y_u|\mathit{x},y\right) = + \log p_{rnnt}\left(y_u|\mathit{x},y_{1:u-1}\right) + + \lambda_1 \log p_{\text{Target LM}}\left(y_u|\mathit{x},y_{1:u-1}\right) - + \lambda_2 \log p_{\text{bi-gram}}\left(y_u|\mathit{x},y_{1:u-1}\right) + +In LODR, an additional bi-gram LM estimated on the source domain (e.g. the training corpus) is required. Compared to DR, +the only difference lies in the choice of the source domain LM. According to the original `paper `_, +LODR achieves similar performance compared to DR in both intra-domain and cross-domain settings. +As a bi-gram is much faster to evaluate, LODR is usually much faster. + +Now, we will show you how to use LODR in ``icefall``. +For illustration purposes, we will use a pre-trained ASR model from this `link `_. +If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`. +The testing scenario here is intra-domain (we decode the model trained on `LibriSpeech`_ on `LibriSpeech`_ testing sets). + +As the initial step, let's download the pre-trained model. + +.. code-block:: bash + + $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + $ pushd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp + $ git lfs pull --include "pretrained.pt" + $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded + +To test the model, let's have a look at the decoding results **without** using an LM. This can be done via the following command: + +.. code-block:: bash + + $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/ + $ ./pruned_transducer_stateless7_streaming/decode.py \ + --epoch 99 \ + --avg 1 \ + --use-averaged-model False \ + --exp-dir $exp_dir \ + --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \ + --max-duration 600 \ + --decode-chunk-len 32 \ + --decoding-method modified_beam_search + +The following WERs are achieved on test-clean and test-other: + +.. code-block:: text + + $ For test-clean, WER of different settings are: + $ beam_size_4 3.11 best for test-clean + $ For test-other, WER of different settings are: + $ beam_size_4 7.93 best for test-other + +Then, we download the external language model and bi-gram LM that are necessary for LODR. +Note that the bi-gram is estimated on the LibriSpeech 960 hours' text. + +.. code-block:: bash + + $ # download the external LM + $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm + $ # create a symbolic link so that the checkpoint can be loaded + $ pushd icefall-librispeech-rnn-lm/exp + $ git lfs pull --include "pretrained.pt" + $ ln -s pretrained.pt epoch-99.pt + $ popd + $ + $ # download the bi-gram + $ git lfs install + $ git clone https://huggingface.co/marcoyang/librispeech_bigram + $ pushd data/lang_bpe_500 + $ ln -s ../../librispeech_bigram/2gram.fst.txt . + $ popd + +Then, we perform LODR decoding by setting ``--decoding-method`` to ``modified_beam_search_lm_LODR``: + +.. 
code-block:: bash + + $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp + $ lm_dir=./icefall-librispeech-rnn-lm/exp + $ lm_scale=0.42 + $ LODR_scale=-0.24 + $ ./pruned_transducer_stateless7_streaming/decode.py \ + --epoch 99 \ + --avg 1 \ + --use-averaged-model False \ + --beam-size 4 \ + --exp-dir $exp_dir \ + --max-duration 600 \ + --decode-chunk-len 32 \ + --decoding-method modified_beam_search_lm_LODR \ + --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \ + --use-shallow-fusion 1 \ + --lm-type rnn \ + --lm-exp-dir $lm_dir \ + --lm-epoch 99 \ + --lm-scale $lm_scale \ + --lm-avg 1 \ + --rnn-lm-embedding-dim 2048 \ + --rnn-lm-hidden-dim 2048 \ + --rnn-lm-num-layers 3 \ + --lm-vocab-size 500 \ + --tokens-ngram 2 \ + --ngram-lm-scale $LODR_scale + +There are two extra arguments that need to be given when doing LODR. ``--tokens-ngram`` specifies the order of the n-gram. As we +are using a bi-gram, we set it to 2. ``--ngram-lm-scale`` is the scale of the bi-gram; it should be a negative number +as we are subtracting the bi-gram's score during decoding. + +The decoding results obtained with the above command are shown below: + +.. code-block:: text + + $ For test-clean, WER of different settings are: + $ beam_size_4 2.61 best for test-clean + $ For test-other, WER of different settings are: + $ beam_size_4 6.74 best for test-other + +Recall that the lowest WER we obtained in :ref:`shallow_fusion` with a beam size of 4 is ``2.77/7.08``; LODR +indeed **further improves** the WER. We can do even better if we increase ``--beam-size``: + +.. list-table:: WER of LODR with different beam sizes + :widths: 25 25 50 + :header-rows: 1 + + * - Beam size + - test-clean + - test-other + * - 4 + - 2.61 + - 6.74 + * - 8 + - 2.45 + - 6.38 + * - 12 + - 2.4 + - 6.23 \ No newline at end of file diff --git a/docs/source/decoding-with-langugage-models/index.rst b/docs/source/decoding-with-langugage-models/index.rst new file mode 100644 index 000000000..6e5e3a4d9 --- /dev/null +++ b/docs/source/decoding-with-langugage-models/index.rst @@ -0,0 +1,33 @@ +Decoding with language models +============================= + +This section describes how to use external language models +during decoding to improve the WER of transducer models. + +The following decoding methods with external language models are available: + + +.. list-table:: Decoding methods that use external language models + :widths: 25 50 + :header-rows: 1 + + * - Decoding method + - Description + * - ``modified_beam_search`` + - Beam search (i.e. really n-best decoding, the "beam" is the value of n), similar to the original RNN-T paper. Note that this method does not use a language model. + * - ``modified_beam_search_lm_shallow_fusion`` + - As ``modified_beam_search``, but interpolate RNN-T scores with language model scores, also known as shallow fusion. + * - ``modified_beam_search_LODR`` + - As ``modified_beam_search_lm_shallow_fusion``, but subtract the score of a (BPE-symbol-level) bigram backoff language model used as an approximation to the internal language model of RNN-T. + * - ``modified_beam_search_lm_rescore`` + - As ``modified_beam_search``, but rescore the n-best hypotheses with an external language model (e.g. RNNLM) and re-rank them. 
+ * - ``modified_beam_search_lm_rescore_LODR`` + - As ``modified_beam_search_lm_rescore``, but also subtract the score of a (BPE-symbol-level) bigram backoff language model during re-ranking. + + +.. toctree:: + :maxdepth: 2 + + shallow-fusion + LODR + rescoring diff --git a/docs/source/decoding-with-langugage-models/rescoring.rst b/docs/source/decoding-with-langugage-models/rescoring.rst new file mode 100644 index 000000000..de7e700d0 --- /dev/null +++ b/docs/source/decoding-with-langugage-models/rescoring.rst @@ -0,0 +1,256 @@ +.. _rescoring: + +LM rescoring for Transducer +================================= + +LM rescoring is a commonly used approach to incorporate external LM information. Unlike shallow-fusion-based +methods (see :ref:`shallow_fusion`, :ref:`LODR`), rescoring is usually performed to re-rank the n-best hypotheses after beam search. +Rescoring is usually more efficient than shallow fusion since less computation is performed on the external LM. +In this tutorial, we will show you how to use an external LM to rescore the n-best hypotheses decoded from neural transducer models in +`icefall `__. + +.. note:: + + This tutorial is based on the recipe + `pruned_transducer_stateless7_streaming `_, + which is a streaming transducer model trained on `LibriSpeech`_. + However, you can easily apply LM rescoring to other recipes. + If you encounter any problems, please open an issue `here `_. + +.. note:: + + For simplicity, the training and testing corpora in this tutorial are the same (`LibriSpeech`_). However, you can change the testing set + to any other domains (e.g. `GigaSpeech`_) and use an external LM trained on that domain. + +.. HINT:: + + We recommend using a GPU for decoding. + +For illustration purposes, we will use a pre-trained ASR model from this `link `__. +If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`. + +As the initial step, let's download the pre-trained model. + +.. code-block:: bash + + $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + $ pushd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp + $ git lfs pull --include "pretrained.pt" + $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded + +As usual, we first test the model's performance without an external LM. This can be done via the following command: + +.. code-block:: bash + + $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/ + $ ./pruned_transducer_stateless7_streaming/decode.py \ + --epoch 99 \ + --avg 1 \ + --use-averaged-model False \ + --exp-dir $exp_dir \ + --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \ + --max-duration 600 \ + --decode-chunk-len 32 \ + --decoding-method modified_beam_search + +The following WERs are achieved on test-clean and test-other: + +.. code-block:: text + + $ For test-clean, WER of different settings are: + $ beam_size_4 3.11 best for test-clean + $ For test-other, WER of different settings are: + $ beam_size_4 7.93 best for test-other + +Now, we will try to improve the above WER numbers via external LM rescoring. 
We will download +a pre-trained LM from this `link `__. + +.. note:: + + This is an RNN LM trained on the LibriSpeech text corpus. So it might not be ideal for other corpora. + You may also train an RNN LM from scratch. Please refer to this `script `__ + for training an RNN LM and this `script `__ to train a transformer LM. + +.. code-block:: bash + + $ # download the external LM + $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm + $ # create a symbolic link so that the checkpoint can be loaded + $ pushd icefall-librispeech-rnn-lm/exp + $ git lfs pull --include "pretrained.pt" + $ ln -s pretrained.pt epoch-99.pt + $ popd + + +With the RNN LM available, we can rescore the n-best hypotheses generated from ``modified_beam_search``. Here, +``n`` should be the number of beams, i.e. ``--beam-size``. The command for LM rescoring is +as follows. Note that the ``--decoding-method`` is set to ``modified_beam_search_lm_rescore`` and ``--use-shallow-fusion`` +is set to ``False``. + +.. code-block:: bash + + $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp + $ lm_dir=./icefall-librispeech-rnn-lm/exp + $ lm_scale=0.43 + $ ./pruned_transducer_stateless7_streaming/decode.py \ + --epoch 99 \ + --avg 1 \ + --use-averaged-model False \ + --beam-size 4 \ + --exp-dir $exp_dir \ + --max-duration 600 \ + --decode-chunk-len 32 \ + --decoding-method modified_beam_search_lm_rescore \ + --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \ + --use-shallow-fusion 0 \ + --lm-type rnn \ + --lm-exp-dir $lm_dir \ + --lm-epoch 99 \ + --lm-scale $lm_scale \ + --lm-avg 1 \ + --rnn-lm-embedding-dim 2048 \ + --rnn-lm-hidden-dim 2048 \ + --rnn-lm-num-layers 3 \ + --lm-vocab-size 500 + +.. code-block:: text + + $ For test-clean, WER of different settings are: + $ beam_size_4 2.93 best for test-clean + $ For test-other, WER of different settings are: + $ beam_size_4 7.6 best for test-other + +Great! We made some improvements! Increasing the size of the n-best hypotheses will further boost the performance; +see the following table: + +.. list-table:: WERs of LM rescoring with different beam sizes + :widths: 25 25 25 + :header-rows: 1 + + * - Beam size + - test-clean + - test-other + * - 4 + - 2.93 + - 7.6 + * - 8 + - 2.67 + - 7.11 + * - 12 + - 2.59 + - 6.86 + +In fact, we can also apply LODR (see :ref:`LODR`) when doing LM rescoring. To do so, we need to +download the bi-gram required by LODR: + +.. code-block:: bash + + $ # download the bi-gram + $ git lfs install + $ git clone https://huggingface.co/marcoyang/librispeech_bigram + $ pushd data/lang_bpe_500 + $ ln -s ../../librispeech_bigram/2gram.arpa . + $ popd + +Then we can perform LM rescoring + LODR by changing the decoding method to ``modified_beam_search_lm_rescore_LODR``. + +.. note:: + + This decoding method requires `kenlm `_ as a dependency. You can install it + via this command: ``pip install https://github.com/kpu/kenlm/archive/master.zip``. + +.. 
code-block:: bash + + $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp + $ lm_dir=./icefall-librispeech-rnn-lm/exp + $ lm_scale=0.43 + $ ./pruned_transducer_stateless7_streaming/decode.py \ + --epoch 99 \ + --avg 1 \ + --use-averaged-model False \ + --beam-size 4 \ + --exp-dir $exp_dir \ + --max-duration 600 \ + --decode-chunk-len 32 \ + --decoding-method modified_beam_search_lm_rescore_LODR \ + --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \ + --use-shallow-fusion 0 \ + --lm-type rnn \ + --lm-exp-dir $lm_dir \ + --lm-epoch 99 \ + --lm-scale $lm_scale \ + --lm-avg 1 \ + --rnn-lm-embedding-dim 2048 \ + --rnn-lm-hidden-dim 2048 \ + --rnn-lm-num-layers 3 \ + --lm-vocab-size 500 + +You should see the following WERs after executing the commands above: + +.. code-block:: text + + $ For test-clean, WER of different settings are: + $ beam_size_4 2.9 best for test-clean + $ For test-other, WER of different settings are: + $ beam_size_4 7.57 best for test-other + +It's slightly better than LM rescoring alone. If we further increase the beam size, we will see +further improvements from LM rescoring + LODR: + +.. list-table:: WERs of LM rescoring + LODR with different beam sizes + :widths: 25 25 25 + :header-rows: 1 + + * - Beam size + - test-clean + - test-other + * - 4 + - 2.9 + - 7.57 + * - 8 + - 2.63 + - 7.04 + * - 12 + - 2.52 + - 6.73 + +As mentioned earlier, LM rescoring is usually faster than shallow-fusion based methods. +Here, we benchmark their WERs and decoding speed: + +.. list-table:: LM-rescoring-based methods vs shallow-fusion-based methods (The numbers in each field are the WER on test-clean, the WER on test-other, and the decoding time on test-clean) + :widths: 25 25 25 25 + :header-rows: 1 + + * - Decoding method + - beam=4 + - beam=8 + - beam=12 + * - ``modified_beam_search`` + - 3.11/7.93; 132s + - 3.1/7.95; 177s + - 3.1/7.96; 210s + * - ``modified_beam_search_lm_shallow_fusion`` + - 2.77/7.08; 262s + - 2.62/6.65; 352s + - 2.58/6.65; 488s + * - ``modified_beam_search_LODR`` + - 2.61/6.74; 400s + - 2.45/6.38; 610s + - 2.4/6.23; 870s + * - ``modified_beam_search_lm_rescore`` + - 2.93/7.6; 156s + - 2.67/7.11; 203s + - 2.59/6.86; 255s + * - ``modified_beam_search_lm_rescore_LODR`` + - 2.9/7.57; 160s + - 2.63/7.04; 203s + - 2.52/6.73; 263s + +.. note:: + + Decoding is performed with a single 32G V100; we set ``--max-duration`` to 600. + Decoding time here is only for reference and it may vary. \ No newline at end of file diff --git a/docs/source/decoding-with-langugage-models/shallow-fusion.rst b/docs/source/decoding-with-langugage-models/shallow-fusion.rst new file mode 100644 index 000000000..0d2837372 --- /dev/null +++ b/docs/source/decoding-with-langugage-models/shallow-fusion.rst @@ -0,0 +1,176 @@ +.. _shallow_fusion: + +Shallow fusion for Transducer +================================= + +External language models (LMs) are commonly used to improve WERs for E2E ASR models. +This tutorial shows you how to perform ``shallow fusion`` with an external LM +to improve the word error rate of a transducer model. + +.. note:: + + This tutorial is based on the recipe + `pruned_transducer_stateless7_streaming `_, + which is a streaming transducer model trained on `LibriSpeech`_. + However, you can easily apply shallow fusion to other recipes. + If you encounter any problems, please open an issue here `icefall `_. + +.. 
note:: + + For simplicity, the training and testing corpora in this tutorial are the same (`LibriSpeech`_). However, you can change the testing set + to any other domains (e.g. `GigaSpeech`_) and use an external LM trained on that domain. + +.. HINT:: + + We recommend using a GPU for decoding. + +For illustration purposes, we will use a pre-trained ASR model from this `link `__. +If you want to train your model from scratch, please have a look at :ref:`non_streaming_librispeech_pruned_transducer_stateless`. + +As the initial step, let's download the pre-trained model. + +.. code-block:: bash + + $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29 + $ pushd icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp + $ git lfs pull --include "pretrained.pt" + $ ln -s pretrained.pt epoch-99.pt # create a symbolic link so that the checkpoint can be loaded + +To test the model, let's have a look at the decoding results without using an LM. This can be done via the following command: + +.. code-block:: bash + + $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp/ + $ ./pruned_transducer_stateless7_streaming/decode.py \ + --epoch 99 \ + --avg 1 \ + --use-averaged-model False \ + --exp-dir $exp_dir \ + --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \ + --max-duration 600 \ + --decode-chunk-len 32 \ + --decoding-method modified_beam_search + +The following WERs are achieved on test-clean and test-other: + +.. code-block:: text + + $ For test-clean, WER of different settings are: + $ beam_size_4 3.11 best for test-clean + $ For test-other, WER of different settings are: + $ beam_size_4 7.93 best for test-other + +These are already good numbers! But we can further improve them by using shallow fusion with an external LM. +Training a language model usually takes a long time, so we can download a pre-trained LM from this `link `__. + +.. code-block:: bash + + $ # download the external LM + $ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/ezerhouni/icefall-librispeech-rnn-lm + $ # create a symbolic link so that the checkpoint can be loaded + $ pushd icefall-librispeech-rnn-lm/exp + $ git lfs pull --include "pretrained.pt" + $ ln -s pretrained.pt epoch-99.pt + $ popd + +.. note:: + + This is an RNN LM trained on the LibriSpeech text corpus. So it might not be ideal for other corpora. + You may also train an RNN LM from scratch. Please refer to this `script `__ + for training an RNN LM and this `script `__ to train a transformer LM. + +To use shallow fusion for decoding, we can execute the following command: + +.. 
code-block:: bash + + $ exp_dir=./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/exp + $ lm_dir=./icefall-librispeech-rnn-lm/exp + $ lm_scale=0.29 + $ ./pruned_transducer_stateless7_streaming/decode.py \ + --epoch 99 \ + --avg 1 \ + --use-averaged-model False \ + --beam-size 4 \ + --exp-dir $exp_dir \ + --max-duration 600 \ + --decode-chunk-len 32 \ + --decoding-method modified_beam_search_lm_shallow_fusion \ + --bpe-model ./icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/data/lang_bpe_500/bpe.model \ + --use-shallow-fusion 1 \ + --lm-type rnn \ + --lm-exp-dir $lm_dir \ + --lm-epoch 99 \ + --lm-scale $lm_scale \ + --lm-avg 1 \ + --rnn-lm-embedding-dim 2048 \ + --rnn-lm-hidden-dim 2048 \ + --rnn-lm-num-layers 3 \ + --lm-vocab-size 500 + +Note that we set ``--decoding-method modified_beam_search_lm_shallow_fusion`` and ``--use-shallow-fusion True`` +to use shallow fusion. ``--lm-type`` specifies the type of neural LM we are going to use; you can choose +between ``rnn`` and ``transformer``. The following three arguments are associated with the RNN LM: + +- ``--rnn-lm-embedding-dim`` + The embedding dimension of the RNN LM + +- ``--rnn-lm-hidden-dim`` + The hidden dimension of the RNN LM + +- ``--rnn-lm-num-layers`` + The number of RNN layers in the RNN LM. + + +The decoding results obtained with the above command are shown below. + +.. code-block:: text + + $ For test-clean, WER of different settings are: + $ beam_size_4 2.77 best for test-clean + $ For test-other, WER of different settings are: + $ beam_size_4 7.08 best for test-other + +The improvement of shallow fusion is very obvious! The relative WER reduction on test-other is around 10.5%. +A few parameters can be tuned to further boost the performance of shallow fusion: + +- ``--lm-scale`` + + Controls the scale of the LM. If too small, the external language model may not be fully utilized; if too large, + the LM score may dominate during decoding, leading to a bad WER. A typical value of this is around 0.3. + +- ``--beam-size`` + + The number of active paths in the search beam. It controls the trade-off between decoding efficiency and accuracy. + +Here, we also show how ``--beam-size`` affects the WER and decoding time: + +.. list-table:: WERs and decoding time (on test-clean) of shallow fusion with different beam sizes + :widths: 25 25 25 25 + :header-rows: 1 + + * - Beam size + - test-clean + - test-other + - Decoding time on test-clean (s) + * - 4 + - 2.77 + - 7.08 + - 262 + * - 8 + - 2.62 + - 6.65 + - 352 + * - 12 + - 2.58 + - 6.65 + - 488 + +As we see, a larger beam size during shallow fusion improves the WER, but is also slower. + + + + + + + + diff --git a/docs/source/index.rst b/docs/source/index.rst index 8d76eb68b..a7d365a15 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -34,3 +34,8 @@ speech recognition recipes using `k2 `_. contributing/index huggingface/index + +.. 
toctree:: + :maxdepth: 2 + + decoding-with-langugage-models/index \ No newline at end of file diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst index ea9f350cd..2e8d0893a 100644 --- a/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst +++ b/docs/source/recipes/Non-streaming-ASR/librispeech/distillation.rst @@ -1,7 +1,7 @@ Distillation with HuBERT ======================== -This tutorial shows you how to perform knowledge distillation in `icefall`_ +This tutorial shows you how to perform knowledge distillation in `icefall `_ with the `LibriSpeech`_ dataset. The distillation method used here is called "Multi Vector Quantization Knowledge Distillation" (MVQ-KD). Please have a look at our paper `Predicting Multi-Codebook Vector Quantization Indexes for Knowledge Distillation `_ @@ -13,7 +13,7 @@ for more details about MVQ-KD. `pruned_transducer_stateless4 `_. Currently, we only implement MVQ-KD in this recipe. However, MVQ-KD is theoretically applicable to all recipes with only minor changes needed. Feel free to try out MVQ-KD in different recipes. If you - encounter any problems, please open an issue here `icefall `_. + encounter any problems, please open an issue here `icefall `__. .. note:: @@ -217,7 +217,7 @@ the following command. --exp-dir $exp_dir \ --enable-distillation True -You should get similar results as `here `_. +You should get similar results as `here `__. That's all! Feel free to experiment with your own setups and report your results. -If you encounter any problems during training, please open up an issue `here `_. +If you encounter any problems during training, please open up an issue `here `__. diff --git a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst index 42fd3df77..1bc1dd984 100644 --- a/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst +++ b/docs/source/recipes/Non-streaming-ASR/librispeech/pruned_transducer_stateless.rst @@ -8,10 +8,10 @@ with the `LibriSpeech `_ dataset. .. Note:: - The tutorial is suitable for `pruned_transducer_stateless `_, - `pruned_transducer_stateless2 `_, - `pruned_transducer_stateless4 `_, - `pruned_transducer_stateless5 `_, + The tutorial is suitable for `pruned_transducer_stateless `__, + `pruned_transducer_stateless2 `__, + `pruned_transducer_stateless4 `__, + `pruned_transducer_stateless5 `__, We will take pruned_transducer_stateless4 as an example in this tutorial. .. HINT:: @@ -237,7 +237,7 @@ them, please modify ``./pruned_transducer_stateless4/train.py`` directly. .. NOTE:: - The options for `pruned_transducer_stateless5 `_ are a little different from + The options for `pruned_transducer_stateless5 `__ are a little different from other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from commandline, so that you can train models with different size with pruned_transducer_stateless5. 
@@ -529,13 +529,13 @@ Download pretrained models If you don't want to train from scratch, you can download the pretrained models by visiting the following links: - - `pruned_transducer_stateless `_ + - `pruned_transducer_stateless `__ - - `pruned_transducer_stateless2 `_ + - `pruned_transducer_stateless2 `__ - - `pruned_transducer_stateless4 `_ + - `pruned_transducer_stateless4 `__ - - `pruned_transducer_stateless5 `_ + - `pruned_transducer_stateless5 `__ See ``_ for the details of the above pretrained models diff --git a/docs/source/recipes/Streaming-ASR/introduction.rst b/docs/source/recipes/Streaming-ASR/introduction.rst index e1382e77d..ac77a51d1 100644 --- a/docs/source/recipes/Streaming-ASR/introduction.rst +++ b/docs/source/recipes/Streaming-ASR/introduction.rst @@ -45,9 +45,9 @@ the input features. We have three variants of Emformer models in ``icefall``. - - ``pruned_stateless_emformer_rnnt2`` using Emformer from torchaudio, see `LibriSpeech recipe `_. + - ``pruned_stateless_emformer_rnnt2`` using Emformer from torchaudio, see `LibriSpeech recipe `__. - ``conv_emformer_transducer_stateless`` using ConvEmformer implemented by ourself. Different from the Emformer in torchaudio, ConvEmformer has a convolution in each layer and uses the mechanisms in our reworked conformer model. - See `LibriSpeech recipe `_. + See `LibriSpeech recipe `__. - ``conv_emformer_transducer_stateless2`` using ConvEmformer implemented by ourself. The only difference from the above one is that it uses a simplified memory bank. See `LibriSpeech recipe `_. diff --git a/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst b/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst index de7102ba8..2ca70bcf3 100644 --- a/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst +++ b/docs/source/recipes/Streaming-ASR/librispeech/pruned_transducer_stateless.rst @@ -6,10 +6,10 @@ with the `LibriSpeech `_ dataset. .. Note:: - The tutorial is suitable for `pruned_transducer_stateless `_, - `pruned_transducer_stateless2 `_, - `pruned_transducer_stateless4 `_, - `pruned_transducer_stateless5 `_, + The tutorial is suitable for `pruned_transducer_stateless `__, + `pruned_transducer_stateless2 `__, + `pruned_transducer_stateless4 `__, + `pruned_transducer_stateless5 `__, We will take pruned_transducer_stateless4 as an example in this tutorial. .. HINT:: @@ -264,7 +264,7 @@ them, please modify ``./pruned_transducer_stateless4/train.py`` directly. .. NOTE:: - The options for `pruned_transducer_stateless5 `_ are a little different from + The options for `pruned_transducer_stateless5 `__ are a little different from other recipes. It allows you to configure ``--num-encoder-layers``, ``--dim-feedforward``, ``--nhead``, ``--encoder-dim``, ``--decoder-dim``, ``--joiner-dim`` from commandline, so that you can train models with different size with pruned_transducer_stateless5. diff --git a/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst b/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst index f0e8961d7..8b75473c6 100644 --- a/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst +++ b/docs/source/recipes/Streaming-ASR/librispeech/zipformer_transducer.rst @@ -6,7 +6,7 @@ with the `LibriSpeech `_ dataset. .. Note:: - The tutorial is suitable for `pruned_transducer_stateless7_streaming `_, + The tutorial is suitable for `pruned_transducer_stateless7_streaming `__, .. 
HINT:: @@ -642,7 +642,7 @@ Download pretrained models If you don't want to train from scratch, you can download the pretrained models by visiting the following links: - - `pruned_transducer_stateless7_streaming `_ + - `pruned_transducer_stateless7_streaming `__ See ``_ for the details of the above pretrained models diff --git a/egs/aishell/ASR/pruned_transducer_stateless7/train.py b/egs/aishell/ASR/pruned_transducer_stateless7/train.py index ef536c035..cbb7db086 100755 --- a/egs/aishell/ASR/pruned_transducer_stateless7/train.py +++ b/egs/aishell/ASR/pruned_transducer_stateless7/train.py @@ -240,7 +240,7 @@ def get_parser(): parser.add_argument( "--exp-dir", type=str, - default="pruned_transducer_stateless3/exp", + default="pruned_transducer_stateless7/exp", help="""The experiment dir. It specifies the directory where all training related files, e.g., checkpoints, log, etc, are saved diff --git a/egs/aishell/ASR/pruned_transducer_stateless7/train2.py b/egs/aishell/ASR/pruned_transducer_stateless7/train2.py index fb35a6c95..c30f6f960 100755 --- a/egs/aishell/ASR/pruned_transducer_stateless7/train2.py +++ b/egs/aishell/ASR/pruned_transducer_stateless7/train2.py @@ -243,7 +243,7 @@ def get_parser(): parser.add_argument( "--exp-dir", type=str, - default="pruned_transducer_stateless3/exp", + default="pruned_transducer_stateless7/exp", help="""The experiment dir. It specifies the directory where all training related files, e.g., checkpoints, log, etc, are saved diff --git a/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/decode.py b/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/decode.py index fcb0ebc4e..da9000164 100755 --- a/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/decode.py +++ b/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/decode.py @@ -397,7 +397,6 @@ def decode_one_batch( beam=params.beam, max_contexts=params.max_contexts, max_states=params.max_states, - subtract_ilme=True, ilme_scale=params.ilme_scale, ) for hyp in hyp_tokens: diff --git a/egs/ami/SURT/README.md b/egs/ami/SURT/README.md new file mode 100644 index 000000000..74a8ba014 --- /dev/null +++ b/egs/ami/SURT/README.md @@ -0,0 +1,156 @@ +# Introduction + +This is a multi-talker ASR recipe for the AMI and ICSI datasets. We train a Streaming +Unmixing and Recognition Transducer (SURT) model for the task. + +Please refer to the `egs/libricss/SURT` recipe README for details about the task and the +model. + +## Description of the recipe + +### Pre-requisites + +The recipes in this directory need the following packages to be installed: + +- [meeteval](https://github.com/fgnt/meeteval) +- [einops](https://github.com/arogozhnikov/einops) + +Additionally, we initialize the model with the pre-trained model from the LibriCSS recipe. +Please download this checkpoint (see below) or train the LibriCSS recipe first. + +### Training + +To train the model, run the following from within `egs/ami/SURT`: + +```bash +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +python dprnn_zipformer/train.py \ + --use-fp16 True \ + --exp-dir dprnn_zipformer/exp/surt_base \ + --world-size 4 \ + --max-duration 500 \ + --max-duration-valid 250 \ + --max-cuts 200 \ + --num-buckets 50 \ + --num-epochs 30 \ + --enable-spec-aug True \ + --enable-musan False \ + --ctc-loss-scale 0.2 \ + --heat-loss-scale 0.2 \ + --base-lr 0.004 \ + --model-init-ckpt exp/libricss_base.pt \ + --chunk-width-randomization True \ + --num-mask-encoder-layers 4 \ + --num-encoder-layers 2,2,2,2,2 +``` + +The above is for SURT-base (~26M). 
For SURT-large (~38M), use: + +```bash + --model-init-ckpt exp/libricss_large.pt \ + --num-mask-encoder-layers 6 \ + --num-encoder-layers 2,4,3,2,4 \ + --model-init-ckpt exp/zipformer_large.pt \ +``` + +**NOTE:** You may need to decrease the `--max-duration` for SURT-large to avoid OOM. + +### Adaptation + +The training step above only trains on simulated mixtures. For best results, we also +adapt the final model on the AMI+ICSI train set. For this, run the following from within +`egs/ami/SURT`: + +```bash +export CUDA_VISIBLE_DEVICES="0" + +python dprnn_zipformer/train_adapt.py \ + --use-fp16 True \ + --exp-dir dprnn_zipformer/exp/surt_base_adapt \ + --world-size 4 \ + --max-duration 500 \ + --max-duration-valid 250 \ + --max-cuts 200 \ + --num-buckets 50 \ + --num-epochs 8 \ + --lr-epochs 2 \ + --enable-spec-aug True \ + --enable-musan False \ + --ctc-loss-scale 0.2 \ + --base-lr 0.0004 \ + --model-init-ckpt dprnn_zipformer/exp/surt_base/epoch-30.pt \ + --chunk-width-randomization True \ + --num-mask-encoder-layers 4 \ + --num-encoder-layers 2,2,2,2,2 +``` + +For SURT-large, use the following config: + +```bash + --num-mask-encoder-layers 6 \ + --num-encoder-layers 2,4,3,2,4 \ + --model-init-ckpt dprnn_zipformer/exp/surt_large/epoch-30.pt \ + --num-epochs 15 \ + --lr-epochs 4 \ +``` + + +### Decoding + +To decode the model, run the following from within `egs/ami/SURT`: + +#### Greedy search + +```bash +export CUDA_VISIBLE_DEVICES="0" + +python dprnn_zipformer/decode.py \ + --epoch 20 --avg 1 --use-averaged-model False \ + --exp-dir dprnn_zipformer/exp/surt_base_adapt \ + --max-duration 250 \ + --decoding-method greedy_search +``` + +#### Beam search + +```bash +python dprnn_zipformer/decode.py \ + --epoch 20 --avg 1 --use-averaged-model False \ + --exp-dir dprnn_zipformer/exp/surt_base_adapt \ + --max-duration 250 \ + --decoding-method modified_beam_search \ + --beam-size 4 +``` + +## Results (using beam search) + +**AMI** + +| Model | IHM-Mix | SDM | MDM | +|------------|:-------:|:----:|:----:| +| SURT-base | 39.8 | 65.4 | 46.6 | +| + adapt | 37.4 | 46.9 | 43.7 | +| SURT-large | 36.8 | 62.5 | 44.4 | +| + adapt | **35.1** | **44.6** | **41.4** | + +**ICSI** + +| Model | IHM-Mix | SDM | +|------------|:-------:|:----:| +| SURT-base | 28.3 | 60.0 | +| + adapt | 26.3 | 33.9 | +| SURT-large | 27.8 | 59.7 | +| + adapt | **24.4** | **32.3** | + +## Pre-trained models and logs + +* LibriCSS pre-trained model (for initialization): [base](https://huggingface.co/desh2608/icefall-surt-libricss-dprnn-zipformer/tree/main/exp/surt_base) [large](https://huggingface.co/desh2608/icefall-surt-libricss-dprnn-zipformer/tree/main/exp/surt_large) + +* Pre-trained models: + +* Training logs: + - surt_base: + - surt_base_adapt: + - surt_large: + - surt_large_adapt: diff --git a/egs/ami/SURT/dprnn_zipformer/asr_datamodule.py b/egs/ami/SURT/dprnn_zipformer/asr_datamodule.py new file mode 100644 index 000000000..ec8106bc3 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/asr_datamodule.py @@ -0,0 +1,399 @@ +# Copyright 2021 Piotr Żelasko +# Copyright 2022 Xiaomi Corporation (Author: Mingshuang Luo) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import inspect +import logging +from functools import lru_cache +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional + +import torch +from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy +from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures + CutMix, + DynamicBucketingSampler, + K2SurtDataset, + PrecomputedFeatures, + SimpleCutSampler, + SpecAugment, +) +from lhotse.dataset.input_strategies import OnTheFlyFeatures +from lhotse.utils import fix_random_seed +from torch.utils.data import DataLoader + +from icefall.utils import str2bool + + +class _SeedWorkers: + def __init__(self, seed: int): + self.seed = seed + + def __call__(self, worker_id: int): + fix_random_seed(self.seed + worker_id) + + +class AmiAsrDataModule: + """ + DataModule for k2 SURT experiments. + It assumes there is always one train and valid dataloader, + but there can be multiple test dataloaders (e.g. LibriSpeech test-clean + and test-other). + + It contains all the common data pipeline modules used in ASR + experiments, e.g.: + - dynamic batch size, + - bucketing samplers, + - augmentation, + - on-the-fly feature extraction + + This class should be derived for specific corpora used in ASR tasks. + """ + + def __init__(self, args: argparse.Namespace): + self.args = args + + @classmethod + def add_arguments(cls, parser: argparse.ArgumentParser): + group = parser.add_argument_group( + title="ASR data related options", + description="These options are used for the preparation of " + "PyTorch DataLoaders from Lhotse CutSet's -- they control the " + "effective batch sizes, sampling strategies, applied data " + "augmentations, etc.", + ) + group.add_argument( + "--manifest-dir", + type=Path, + default=Path("data/manifests"), + help="Path to directory with train/valid/test cuts.", + ) + group.add_argument( + "--max-duration", + type=int, + default=200.0, + help="Maximum pooled recordings duration (seconds) in a " + "single batch. You can reduce it if it causes CUDA OOM.", + ) + group.add_argument( + "--max-duration-valid", + type=int, + default=200.0, + help="Maximum pooled recordings duration (seconds) in a " + "single batch. You can reduce it if it causes CUDA OOM.", + ) + group.add_argument( + "--max-cuts", + type=int, + default=100, + help="Maximum number of cuts in a single batch. You can " + "reduce it if it causes CUDA OOM.", + ) + group.add_argument( + "--bucketing-sampler", + type=str2bool, + default=True, + help="When enabled, the batches will come from buckets of " + "similar duration (saves padding frames).", + ) + group.add_argument( + "--num-buckets", + type=int, + default=30, + help="The number of buckets for the DynamicBucketingSampler" + "(you might want to increase it for larger datasets).", + ) + group.add_argument( + "--on-the-fly-feats", + type=str2bool, + default=False, + help=( + "When enabled, use on-the-fly cut mixing and feature " + "extraction. Will drop existing precomputed feature manifests " + "if available." 
+ ), + ) + group.add_argument( + "--shuffle", + type=str2bool, + default=True, + help="When enabled (=default), the examples will be " + "shuffled for each epoch.", + ) + group.add_argument( + "--drop-last", + type=str2bool, + default=True, + help="Whether to drop last batch. Used by sampler.", + ) + group.add_argument( + "--return-cuts", + type=str2bool, + default=True, + help="When enabled, each batch will have the " + "field: batch['supervisions']['cut'] with the cuts that " + "were used to construct it.", + ) + + group.add_argument( + "--num-workers", + type=int, + default=2, + help="The number of training dataloader workers that " + "collect the batches.", + ) + + group.add_argument( + "--enable-spec-aug", + type=str2bool, + default=True, + help="When enabled, use SpecAugment for training dataset.", + ) + + group.add_argument( + "--spec-aug-time-warp-factor", + type=int, + default=80, + help="Used only when --enable-spec-aug is True. " + "It specifies the factor for time warping in SpecAugment. " + "Larger values mean more warping. " + "A value less than 1 means to disable time warp.", + ) + + group.add_argument( + "--enable-musan", + type=str2bool, + default=True, + help="When enabled, select noise from MUSAN and mix it" + "with training dataset. ", + ) + + def train_dataloaders( + self, + cuts_train: CutSet, + sampler_state_dict: Optional[Dict[str, Any]] = None, + sources: bool = False, + ) -> DataLoader: + """ + Args: + cuts_train: + CutSet for training. + sampler_state_dict: + The state dict for the training sampler. + """ + transforms = [] + if self.args.enable_musan: + logging.info("Enable MUSAN") + logging.info("About to get Musan cuts") + cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz") + transforms.append( + CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True) + ) + else: + logging.info("Disable MUSAN") + + input_transforms = [] + if self.args.enable_spec_aug: + logging.info("Enable SpecAugment") + logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}") + # Set the value of num_frame_masks according to Lhotse's version. + # In different Lhotse's versions, the default of num_frame_masks is + # different. 
+ num_frame_masks = 10 + num_frame_masks_parameter = inspect.signature( + SpecAugment.__init__ + ).parameters["num_frame_masks"] + if num_frame_masks_parameter.default == 1: + num_frame_masks = 2 + logging.info(f"Num frame mask: {num_frame_masks}") + input_transforms.append( + SpecAugment( + time_warp_factor=self.args.spec_aug_time_warp_factor, + num_frame_masks=num_frame_masks, + features_mask_size=27, + num_feature_masks=2, + frames_mask_size=100, + ) + ) + else: + logging.info("Disable SpecAugment") + + logging.info("About to create train dataset") + train = K2SurtDataset( + input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) + if self.args.on_the_fly_feats + else PrecomputedFeatures(), + cut_transforms=transforms, + input_transforms=input_transforms, + return_cuts=self.args.return_cuts, + return_sources=sources, + strict=False, + ) + + if self.args.bucketing_sampler: + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( + cuts_train, + max_duration=self.args.max_duration, + quadratic_duration=30.0, + max_cuts=self.args.max_cuts, + shuffle=self.args.shuffle, + num_buckets=self.args.num_buckets, + drop_last=self.args.drop_last, + ) + else: + logging.info("Using SingleCutSampler.") + train_sampler = SimpleCutSampler( + cuts_train, + max_duration=self.args.max_duration, + max_cuts=self.args.max_cuts, + shuffle=self.args.shuffle, + ) + logging.info("About to create train dataloader") + + if sampler_state_dict is not None: + logging.info("Loading sampler state dict") + train_sampler.load_state_dict(sampler_state_dict) + + # 'seed' is derived from the current random state, which will have + # previously been set in the main process. + seed = torch.randint(0, 100000, ()).item() + worker_init_fn = _SeedWorkers(seed) + + train_dl = DataLoader( + train, + sampler=train_sampler, + batch_size=None, + num_workers=self.args.num_workers, + persistent_workers=False, + worker_init_fn=worker_init_fn, + ) + + return train_dl + + def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: + transforms = [] + + logging.info("About to create dev dataset") + validate = K2SurtDataset( + input_strategy=OnTheFlyFeatures( + OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) + ) + if self.args.on_the_fly_feats + else PrecomputedFeatures(), + cut_transforms=transforms, + return_cuts=self.args.return_cuts, + return_sources=False, + strict=False, + ) + valid_sampler = DynamicBucketingSampler( + cuts_valid, + max_duration=self.args.max_duration_valid, + quadratic_duration=30.0, + max_cuts=self.args.max_cuts, + shuffle=False, + ) + logging.info("About to create dev dataloader") + + # 'seed' is derived from the current random state, which will have + # previously been set in the main process. 
+ seed = torch.randint(0, 100000, ()).item() + worker_init_fn = _SeedWorkers(seed) + + valid_dl = DataLoader( + validate, + sampler=valid_sampler, + batch_size=None, + num_workers=self.args.num_workers, + persistent_workers=False, + worker_init_fn=worker_init_fn, + ) + + return valid_dl + + def test_dataloaders(self, cuts: CutSet) -> DataLoader: + logging.debug("About to create test dataset") + test = K2SurtDataset( + input_strategy=OnTheFlyFeatures( + OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) + ) + if self.args.on_the_fly_feats + else PrecomputedFeatures(), + return_cuts=self.args.return_cuts, + return_sources=False, + strict=False, + ) + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration_valid, + max_cuts=self.args.max_cuts, + shuffle=False, + ) + + # 'seed' is derived from the current random state, which will have + # previously been set in the main process. + seed = torch.randint(0, 100000, ()).item() + worker_init_fn = _SeedWorkers(seed) + + logging.debug("About to create test dataloader") + test_dl = DataLoader( + test, + batch_size=None, + sampler=sampler, + num_workers=self.args.num_workers, + persistent_workers=False, + worker_init_fn=worker_init_fn, + ) + return test_dl + + @lru_cache() + def aimix_train_cuts( + self, + rvb_affix: str = "clean", + sources: bool = True, + ) -> CutSet: + logging.info("About to get train cuts") + source_affix = "_sources" if sources else "" + cs = load_manifest_lazy( + self.args.manifest_dir / f"cuts_train_{rvb_affix}{source_affix}.jsonl.gz" + ) + cs = cs.filter(lambda c: c.duration >= 1.0 and c.duration <= 30.0) + return cs + + @lru_cache() + def train_cuts( + self, + ) -> CutSet: + logging.info("About to get train cuts") + return load_manifest_lazy( + self.args.manifest_dir / "cuts_train_ami_icsi.jsonl.gz" + ) + + @lru_cache() + def ami_cuts(self, split: str = "dev", type: str = "sdm") -> CutSet: + logging.info(f"About to get AMI {split} {type} cuts") + return load_manifest_lazy( + self.args.manifest_dir / f"cuts_ami-{type}_{split}.jsonl.gz" + ) + + @lru_cache() + def icsi_cuts(self, split: str = "dev", type: str = "sdm") -> CutSet: + logging.info(f"About to get ICSI {split} {type} cuts") + return load_manifest_lazy( + self.args.manifest_dir / f"cuts_icsi-{type}_{split}.jsonl.gz" + ) diff --git a/egs/ami/SURT/dprnn_zipformer/beam_search.py b/egs/ami/SURT/dprnn_zipformer/beam_search.py new file mode 120000 index 000000000..581b29833 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/beam_search.py @@ -0,0 +1 @@ +../../../libricss/SURT/dprnn_zipformer/beam_search.py \ No newline at end of file diff --git a/egs/ami/SURT/dprnn_zipformer/decode.py b/egs/ami/SURT/dprnn_zipformer/decode.py new file mode 100755 index 000000000..d1a1eddc9 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/decode.py @@ -0,0 +1,622 @@ +#!/usr/bin/env python3 +# +# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang, +# Zengwei Yao) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Usage: +(1) greedy search +./dprnn_zipformer/decode.py \ + --epoch 20 \ + --avg 1 \ + --use-averaged-model false \ + --exp-dir ./dprnn_zipformer/exp_adapt \ + --max-duration 600 \ + --decoding-method greedy_search + +(2) beam search (not recommended) +./dprnn_zipformer/decode.py \ + --epoch 20 \ + --avg 1 \ + --use-averaged-model false \ + --exp-dir ./dprnn_zipformer/exp_adapt \ + --max-duration 600 \ + --decoding-method beam_search \ + --beam-size 4 + +(3) modified beam search +./dprnn_zipformer/decode.py \ + --epoch 20 \ + --avg 1 \ + --use-averaged-model false \ + --exp-dir ./dprnn_zipformer/exp_adapt \ + --max-duration 600 \ + --decoding-method modified_beam_search \ + --beam-size 4 +""" + + +import argparse +import logging +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import k2 +import sentencepiece as spm +import torch +import torch.nn as nn +from asr_datamodule import AmiAsrDataModule +from beam_search import ( + beam_search, + greedy_search, + greedy_search_batch, + modified_beam_search, +) +from lhotse.utils import EPSILON +from train import add_model_arguments, get_params, get_surt_model + +from icefall import LmScorer, NgramLm +from icefall.checkpoint import ( + average_checkpoints, + average_checkpoints_with_averaged_model, + find_checkpoints, + load_checkpoint, +) +from icefall.lexicon import Lexicon +from icefall.utils import ( + AttributeDict, + setup_logger, + store_transcripts, + str2bool, + write_surt_error_stats, +) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--epoch", + type=int, + default=20, + help="""It specifies the checkpoint to use for decoding. + Note: Epoch counts from 1. + You can specify --avg to use more checkpoints for model averaging.""", + ) + + parser.add_argument( + "--iter", + type=int, + default=0, + help="""If positive, --epoch is ignored and it + will use the checkpoint exp_dir/checkpoint-iter.pt. + You can specify --avg to use more checkpoints for model averaging. + """, + ) + + parser.add_argument( + "--avg", + type=int, + default=1, + help="Number of checkpoints to average. Automatically select " + "consecutive checkpoints before the checkpoint specified by " + "'--epoch' and '--iter'", + ) + + parser.add_argument( + "--use-averaged-model", + type=str2bool, + default=True, + help="Whether to load averaged model. Currently it only supports " + "using --epoch. If True, it would decode with the averaged model " + "over the epoch range from `epoch-avg` (excluded) to `epoch`." + "Actually only the models with epoch number of `epoch-avg` and " + "`epoch` are loaded for averaging. ", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="dprnn_zipformer/exp", + help="The experiment dir", + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="""Possible values are: + - greedy_search + - beam_search + - modified_beam_search + """, + ) + + parser.add_argument( + "--beam-size", + type=int, + default=4, + help="""An integer indicating how many candidates we will keep for each + frame. 
Used only when --decoding-method is beam_search or + modified_beam_search.""", + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; 2 means tri-gram", + ) + parser.add_argument( + "--max-sym-per-frame", + type=int, + default=1, + help="""Maximum number of symbols per frame. + Used only when --decoding_method is greedy_search""", + ) + + add_model_arguments(parser) + + return parser + + +def decode_one_batch( + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, + batch: dict, +) -> Dict[str, List[List[str]]]: + """Decode one batch and return the result in a dict. The dict has the + following format: + + - key: It indicates the setting used for decoding. For example, + if greedy_search is used, it would be "greedy_search" + If beam search with a beam size of 7 is used, it would be + "beam_7" + - value: It contains the decoding result. `len(value)` equals to + batch size. `value[i]` is the decoding result for the i-th + utterance in the given batch. + Args: + params: + It's the return value of :func:`get_params`. + model: + The neural model. + sp: + The BPE model. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + Returns: + Return the decoding result. See above description for the format of + the returned dict. + """ + device = next(model.parameters()).device + feature = batch["inputs"] + assert feature.ndim == 3 + + feature = feature.to(device) + feature_lens = batch["input_lens"].to(device) + + # Apply the mask encoder + B, T, F = feature.shape + processed = model.mask_encoder(feature) # B,T,F*num_channels + masks = processed.view(B, T, F, params.num_channels).unbind(dim=-1) + x_masked = [feature * m for m in masks] + + # Recognition + # Stack the inputs along the batch axis + h = torch.cat(x_masked, dim=0) + h_lens = torch.cat([feature_lens for _ in range(params.num_channels)], dim=0) + encoder_out, encoder_out_lens = model.encoder(x=h, x_lens=h_lens) + + if model.joint_encoder_layer is not None: + encoder_out = model.joint_encoder_layer(encoder_out) + + def _group_channels(hyps: List[str]) -> List[List[str]]: + """ + Currently we have a batch of size M*B, where M is the number of + channels and B is the batch size. We need to group the hypotheses + into B groups, each of which contains M hypotheses. 
+ + Example: + hyps = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2'] + _group_channels(hyps) = [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']] + """ + assert len(hyps) == B * params.num_channels + out_hyps = [] + for i in range(B): + out_hyps.append(hyps[i::B]) + return out_hyps + + hyps = [] + if params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1: + hyp_tokens = greedy_search_batch( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp) + elif params.decoding_method == "modified_beam_search": + hyp_tokens = modified_beam_search( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam_size, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp) + else: + batch_size = encoder_out.size(0) + + for i in range(batch_size): + # fmt: off + encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]] + # fmt: on + if params.decoding_method == "greedy_search": + hyp = greedy_search( + model=model, + encoder_out=encoder_out_i, + max_sym_per_frame=params.max_sym_per_frame, + ) + elif params.decoding_method == "beam_search": + hyp = beam_search( + model=model, + encoder_out=encoder_out_i, + beam=params.beam_size, + ) + else: + raise ValueError( + f"Unsupported decoding method: {params.decoding_method}" + ) + hyps.append(sp.decode(hyp)) + + if params.decoding_method == "greedy_search": + return {"greedy_search": _group_channels(hyps)} + elif "fast_beam_search" in params.decoding_method: + key = f"beam_{params.beam}_" + key += f"max_contexts_{params.max_contexts}_" + key += f"max_states_{params.max_states}" + if "nbest" in params.decoding_method: + key += f"_num_paths_{params.num_paths}_" + key += f"nbest_scale_{params.nbest_scale}" + if "LG" in params.decoding_method: + key += f"_ngram_lm_scale_{params.ngram_lm_scale}" + + return {key: _group_channels(hyps)} + else: + return {f"beam_size_{params.beam_size}": _group_channels(hyps)} + + +def decode_dataset( + dl: torch.utils.data.DataLoader, + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, +) -> Dict[str, List[Tuple[str, List[str], List[str]]]]: + """Decode dataset. + + Args: + dl: + PyTorch's dataloader containing the dataset to decode. + params: + It is returned by :func:`get_params`. + model: + The neural model. + sp: + The BPE model. + Returns: + Return a dict, whose key may be "greedy_search" if greedy search + is used, or it may be "beam_7" if beam size of 7 is used. + Its value is a list of tuples. Each tuple contains two elements: + The first is the reference transcript, and the second is the + predicted result. + """ + num_cuts = 0 + + try: + num_batches = len(dl) + except TypeError: + num_batches = "?" + + if params.decoding_method == "greedy_search": + log_interval = 50 + else: + log_interval = 20 + + results = defaultdict(list) + for batch_idx, batch in enumerate(dl): + cut_ids = [cut.id for cut in batch["cuts"]] + cuts_batch = batch["cuts"] + + hyps_dict = decode_one_batch( + params=params, + model=model, + sp=sp, + ) + + for name, hyps in hyps_dict.items(): + this_batch = [] + for cut_id, hyp_words in zip(cut_ids, hyps): + # Reference is a list of supervision texts sorted by start time. 
+ ref_words = [ + s.text.strip() + for s in sorted( + cuts_batch[cut_id].supervisions, key=lambda s: s.start + ) + ] + this_batch.append((cut_id, ref_words, hyp_words)) + + results[name].extend(this_batch) + + num_cuts += len(cut_ids) + + if batch_idx % log_interval == 0: + batch_str = f"{batch_idx}/{num_batches}" + + logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}") + return results + + +def save_results( + params: AttributeDict, + test_set_name: str, + results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]], +): + test_set_wers = dict() + for key, results in results_dict.items(): + recog_path = ( + params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt" + ) + results = sorted(results) + store_transcripts(filename=recog_path, texts=results) + logging.info(f"The transcripts are stored in {recog_path}") + + # The following prints out WERs, per-word error statistics and aligned + # ref/hyp pairs. + errs_filename = ( + params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_filename, "w") as f: + wer = write_surt_error_stats( + f, + f"{test_set_name}-{key}", + results, + enable_log=True, + num_channels=params.num_channels, + ) + test_set_wers[key] = wer + + logging.info("Wrote detailed error stats to {}".format(errs_filename)) + + test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1]) + errs_info = ( + params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_info, "w") as f: + print("settings\tWER", file=f) + for key, val in test_set_wers: + print("{}\t{}".format(key, val), file=f) + + s = "\nFor {}, WER of different settings are:\n".format(test_set_name) + note = "\tbest for {}".format(test_set_name) + for key, val in test_set_wers: + s += "{}\t{}{}\n".format(key, val, note) + note = "" + logging.info(s) + + +@torch.no_grad() +def main(): + parser = get_parser() + LmScorer.add_arguments(parser) + AmiAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + args.lang_dir = Path(args.lang_dir) + + params = get_params() + params.update(vars(args)) + + assert params.decoding_method in ( + "greedy_search", + "beam_search", + "modified_beam_search", + ), f"Decoding method {params.decoding_method} is not supported." 
+ params.res_dir = params.exp_dir / params.decoding_method + + if params.iter > 0: + params.suffix = f"iter-{params.iter}-avg-{params.avg}" + else: + params.suffix = f"epoch-{params.epoch}-avg-{params.avg}" + + if "beam_search" in params.decoding_method: + params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}" + else: + params.suffix += f"-context-{params.context_size}" + params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}" + + if params.use_averaged_model: + params.suffix += "-use-averaged-model" + + setup_logger(f"{params.res_dir}/log-decode-{params.suffix}") + logging.info("Decoding started") + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + + logging.info(f"Device: {device}") + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # and are defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("") + params.unk_id = sp.piece_to_id("") + params.vocab_size = sp.get_piece_size() + + logging.info(params) + + logging.info("About to create model") + model = get_surt_model(params) + assert model.encoder.decode_chunk_size == params.decode_chunk_len // 2, ( + model.encoder.decode_chunk_size, + params.decode_chunk_len, + ) + + if not params.use_averaged_model: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + elif params.avg == 1: + load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model) + else: + start = params.epoch - params.avg + 1 + filenames = [] + for i in range(start, params.epoch + 1): + if i >= 1: + filenames.append(f"{params.exp_dir}/epoch-{i}.pt") + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + else: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + 1 + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg + 1: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + filename_start = filenames[-1] + filename_end = filenames[0] + logging.info( + "Calculating the averaged model over iteration checkpoints" + f" from {filename_start} (excluded) to {filename_end}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + else: + assert params.avg > 0, params.avg + start = params.epoch - params.avg + assert start >= 1, start + filename_start = f"{params.exp_dir}/epoch-{start}.pt" + filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt" + logging.info( + f"Calculating the averaged model over epoch range from " + f"{start} (excluded) to {params.epoch}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + + 
model.to(device) + model.eval() + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + # we need cut ids to display recognition results. + args.return_cuts = True + ami = AmiAsrDataModule(args) + + # NOTE(@desh2608): we filter segments longer than 120s to avoid OOM errors in decoding. + # However, 99.9% of the segments are shorter than 120s, so this should not + # substantially affect the results. In future, we will implement an overlapped + # inference method to avoid OOM errors. + + test_sets = {} + for split in ["dev", "test"]: + for type in ["ihm-mix", "sdm", "mdm8-bf"]: + test_sets[f"ami-{split}_{type}"] = ( + ami.ami_cuts(split=split, type=type) + .trim_to_supervision_groups(max_pause=0.0) + .filter(lambda c: 0.1 < c.duration < 120.0) + .to_eager() + ) + + for split in ["dev", "test"]: + for type in ["ihm-mix", "sdm"]: + test_sets[f"icsi-{split}_{type}"] = ( + ami.icsi_cuts(split=split, type=type) + .trim_to_supervision_groups(max_pause=0.0) + .filter(lambda c: 0.1 < c.duration < 120.0) + .to_eager() + ) + + for test_set, test_cuts in test_sets.items(): + test_dl = ami.test_dataloaders(test_cuts) + results_dict = decode_dataset( + dl=test_dl, + params=params, + model=model, + sp=sp, + ) + + save_results( + params=params, + test_set_name=test_set, + results_dict=results_dict, + ) + + logging.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/egs/ami/SURT/dprnn_zipformer/decoder.py b/egs/ami/SURT/dprnn_zipformer/decoder.py new file mode 120000 index 000000000..c34865c25 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/decoder.py @@ -0,0 +1 @@ +../../../libricss/SURT/dprnn_zipformer/decoder.py \ No newline at end of file diff --git a/egs/ami/SURT/dprnn_zipformer/dprnn.py b/egs/ami/SURT/dprnn_zipformer/dprnn.py new file mode 120000 index 000000000..8918beb32 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/dprnn.py @@ -0,0 +1 @@ +../../../libricss/SURT/dprnn_zipformer/dprnn.py \ No newline at end of file diff --git a/egs/ami/SURT/dprnn_zipformer/encoder_interface.py b/egs/ami/SURT/dprnn_zipformer/encoder_interface.py new file mode 120000 index 000000000..0ba945d0f --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/encoder_interface.py @@ -0,0 +1 @@ +../../../libricss/SURT/dprnn_zipformer/encoder_interface.py \ No newline at end of file diff --git a/egs/ami/SURT/dprnn_zipformer/export.py b/egs/ami/SURT/dprnn_zipformer/export.py new file mode 120000 index 000000000..3deae4471 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/export.py @@ -0,0 +1 @@ +../../../libricss/SURT/dprnn_zipformer/export.py \ No newline at end of file diff --git a/egs/ami/SURT/dprnn_zipformer/joiner.py b/egs/ami/SURT/dprnn_zipformer/joiner.py new file mode 120000 index 000000000..79fbe8769 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/joiner.py @@ -0,0 +1 @@ +../../../libricss/SURT/dprnn_zipformer/joiner.py \ No newline at end of file diff --git a/egs/ami/SURT/dprnn_zipformer/model.py b/egs/ami/SURT/dprnn_zipformer/model.py new file mode 120000 index 000000000..ae8c65c99 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/model.py @@ -0,0 +1 @@ +../../../libricss/SURT/dprnn_zipformer/model.py \ No newline at end of file diff --git a/egs/ami/SURT/dprnn_zipformer/optim.py b/egs/ami/SURT/dprnn_zipformer/optim.py new file mode 120000 index 000000000..366d0f7a2 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/optim.py @@ -0,0 +1 @@ +../../../libricss/SURT/dprnn_zipformer/optim.py \ No newline at end of file diff --git 
a/egs/ami/SURT/dprnn_zipformer/scaling.py b/egs/ami/SURT/dprnn_zipformer/scaling.py new file mode 120000 index 000000000..f11d49d77 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/scaling.py @@ -0,0 +1 @@ +../../../libricss/SURT/dprnn_zipformer/scaling.py \ No newline at end of file diff --git a/egs/ami/SURT/dprnn_zipformer/scaling_converter.py b/egs/ami/SURT/dprnn_zipformer/scaling_converter.py new file mode 120000 index 000000000..1533cbe0e --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/scaling_converter.py @@ -0,0 +1 @@ +../../../libricss/SURT/dprnn_zipformer/scaling_converter.py \ No newline at end of file diff --git a/egs/ami/SURT/dprnn_zipformer/test_model.py b/egs/ami/SURT/dprnn_zipformer/test_model.py new file mode 120000 index 000000000..1259849e0 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/test_model.py @@ -0,0 +1 @@ +../../../librispeech/ASR/pruned_transducer_stateless7_streaming/test_model.py \ No newline at end of file diff --git a/egs/ami/SURT/dprnn_zipformer/train.py b/egs/ami/SURT/dprnn_zipformer/train.py new file mode 100755 index 000000000..cd5fafc34 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/train.py @@ -0,0 +1,1420 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, +# Wei Kang, +# Mingshuang Luo,) +# Zengwei Yao) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Usage: + +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +cd egs/ami/SURT/ +./prepare.sh + +./dprnn_zipformer/train.py \ + --world-size 4 \ + --num-epochs 30 \ + --start-epoch 1 \ + --use-fp16 1 \ + --exp-dir dprnn_zipformer/exp \ + --max-duration 650 +""" + +import argparse +import copy +import logging +import warnings +from itertools import chain +from pathlib import Path +from shutil import copyfile +from typing import Any, Dict, Optional, Tuple, Union + +import k2 +import optim +import sentencepiece as spm +import torch +import torch.multiprocessing as mp +import torch.nn as nn +from asr_datamodule import AmiAsrDataModule +from decoder import Decoder +from dprnn import DPRNN +from einops.layers.torch import Rearrange +from joiner import Joiner +from lhotse.cut import Cut +from lhotse.dataset.sampling.base import CutSampler +from lhotse.utils import LOG_EPSILON, fix_random_seed +from model import SURT +from optim import Eden, ScaledAdam +from scaling import ScaledLinear, ScaledLSTM +from torch import Tensor +from torch.cuda.amp import GradScaler +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from zipformer import Zipformer + +from icefall import diagnostics +from icefall.checkpoint import load_checkpoint, remove_checkpoints +from icefall.checkpoint import save_checkpoint as save_checkpoint_impl +from icefall.checkpoint import ( + save_checkpoint_with_global_batch_idx, + update_averaged_model, +) +from icefall.dist import cleanup_dist, setup_dist +from icefall.env import get_env_info +from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool + +LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler] + + +def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None: + if isinstance(model, DDP): + # get underlying nn.Module + model = model.module + for module in model.modules(): + if hasattr(module, "batch_count"): + module.batch_count = batch_count + + +def add_model_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "--num-mask-encoder-layers", + type=int, + default=4, + help="Number of layers in the DPRNN based mask encoder.", + ) + + parser.add_argument( + "--mask-encoder-dim", + type=int, + default=256, + help="Hidden dimension of the LSTM blocks in DPRNN.", + ) + + parser.add_argument( + "--mask-encoder-segment-size", + type=int, + default=32, + help="Segment size of the SegLSTM in DPRNN. 
Ideally, this should be equal to the " + "decode-chunk-length of the zipformer encoder.", + ) + + parser.add_argument( + "--chunk-width-randomization", + type=bool, + default=False, + help="Whether to randomize the chunk width in DPRNN.", + ) + + # Zipformer config is based on: + # https://github.com/k2-fsa/icefall/pull/745#issuecomment-1405282740 + parser.add_argument( + "--num-encoder-layers", + type=str, + default="2,2,2,2,2", + help="Number of zipformer encoder layers, comma separated.", + ) + + parser.add_argument( + "--feedforward-dims", + type=str, + default="768,768,768,768,768", + help="Feedforward dimension of the zipformer encoder layers, comma separated.", + ) + + parser.add_argument( + "--nhead", + type=str, + default="8,8,8,8,8", + help="Number of attention heads in the zipformer encoder layers.", + ) + + parser.add_argument( + "--encoder-dims", + type=str, + default="256,256,256,256,256", + help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated", + ) + + parser.add_argument( + "--attention-dims", + type=str, + default="192,192,192,192,192", + help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated; + not the same as embedding dimension.""", + ) + + parser.add_argument( + "--encoder-unmasked-dims", + type=str, + default="192,192,192,192,192", + help="Unmasked dimensions in the encoders, relates to augmentation during training. " + "Must be <= each of encoder_dims. Empirically, less than 256 seems to make performance " + " worse.", + ) + + parser.add_argument( + "--zipformer-downsampling-factors", + type=str, + default="1,2,4,8,2", + help="Downsampling factor for each stack of encoder layers.", + ) + + parser.add_argument( + "--cnn-module-kernels", + type=str, + default="31,31,31,31,31", + help="Sizes of kernels in convolution modules", + ) + + parser.add_argument( + "--use-joint-encoder-layer", + type=str, + default="lstm", + choices=["linear", "lstm", "none"], + help="Whether to use a joint layer to combine all branches.", + ) + + parser.add_argument( + "--decoder-dim", + type=int, + default=512, + help="Embedding dimension in the decoder model.", + ) + + parser.add_argument( + "--joiner-dim", + type=int, + default=512, + help="""Dimension used in the joiner model. + Outputs from the encoder and decoder model are projected + to this dimension before adding. + """, + ) + + parser.add_argument( + "--short-chunk-size", + type=int, + default=50, + help="""Chunk length of dynamic training, the chunk size would be either + max sequence length of current batch or uniformly sampled from (1, short_chunk_size). 
+ """, + ) + + parser.add_argument( + "--num-left-chunks", + type=int, + default=4, + help="How many left context can be seen in chunks when calculating attention.", + ) + + parser.add_argument( + "--decode-chunk-len", + type=int, + default=32, + help="The chunk size for decoding (in frames before subsampling)", + ) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--world-size", + type=int, + default=1, + help="Number of GPUs for DDP training.", + ) + + parser.add_argument( + "--master-port", + type=int, + default=12354, + help="Master port to use for DDP training.", + ) + + parser.add_argument( + "--tensorboard", + type=str2bool, + default=True, + help="Should various information be logged in tensorboard.", + ) + + parser.add_argument( + "--num-epochs", + type=int, + default=30, + help="Number of epochs to train.", + ) + + parser.add_argument( + "--start-epoch", + type=int, + default=1, + help="""Resume training from this epoch. It should be positive. + If larger than 1, it will load checkpoint from + exp-dir/epoch-{start_epoch-1}.pt + """, + ) + + parser.add_argument( + "--start-batch", + type=int, + default=0, + help="""If positive, --start-epoch is ignored and + it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt + """, + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="conv_lstm_transducer_stateless_ctc/exp", + help="""The experiment dir. + It specifies the directory where all training related + files, e.g., checkpoints, log, etc, are saved + """, + ) + + parser.add_argument( + "--model-init-ckpt", + type=str, + default=None, + help="""The model checkpoint to initialize the model (either full or part). + If not specified, the model is randomly initialized. + """, + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--base-lr", type=float, default=0.004, help="The base learning rate." + ) + + parser.add_argument( + "--lr-batches", + type=float, + default=5000, + help="""Number of steps that affects how rapidly the learning rate + decreases. We suggest not to change this.""", + ) + + parser.add_argument( + "--lr-epochs", + type=float, + default=5, + help="""Number of epochs that affects how rapidly the learning rate decreases. + """, + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; 2 means tri-gram", + ) + + parser.add_argument( + "--prune-range", + type=int, + default=5, + help="The prune range for rnnt loss, it means how many symbols(context)" + "we are using to compute the loss", + ) + + parser.add_argument( + "--lm-scale", + type=float, + default=0.25, + help="The scale to smooth the loss with lm " + "(output of prediction network) part.", + ) + + parser.add_argument( + "--am-scale", + type=float, + default=0.0, + help="The scale to smooth the loss with am (output of encoder network) part.", + ) + + parser.add_argument( + "--simple-loss-scale", + type=float, + default=0.5, + help="To get pruning ranges, we will calculate a simple version" + "loss(joiner is just addition), this simple loss also uses for" + "training (as a regularization item). 
We will scale the simple loss" + "with this parameter before adding to the final loss.", + ) + + parser.add_argument( + "--ctc-loss-scale", + type=float, + default=0.2, + help="Scale for CTC loss.", + ) + + parser.add_argument( + "--heat-loss-scale", + type=float, + default=0.2, + help="Scale for HEAT loss on separated sources.", + ) + + parser.add_argument( + "--seed", + type=int, + default=42, + help="The seed for random generators intended for reproducibility", + ) + + parser.add_argument( + "--print-diagnostics", + type=str2bool, + default=False, + help="Accumulate stats on activations, print them and exit.", + ) + + parser.add_argument( + "--save-every-n", + type=int, + default=2000, + help="""Save checkpoint after processing this number of batches" + periodically. We save checkpoint to exp-dir/ whenever + params.batch_idx_train % save_every_n == 0. The checkpoint filename + has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt' + Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the + end of each epoch where `xxx` is the epoch number counting from 0. + """, + ) + + parser.add_argument( + "--keep-last-k", + type=int, + default=1, + help="""Only keep this number of checkpoints on disk. + For instance, if it is 3, there are only 3 checkpoints + in the exp-dir with filenames `checkpoint-xxx.pt`. + It does not affect checkpoints with name `epoch-xxx.pt`. + """, + ) + + parser.add_argument( + "--average-period", + type=int, + default=100, + help="""Update the averaged model, namely `model_avg`, after processing + this number of batches. `model_avg` is a separate version of model, + in which each floating-point parameter is the average of all the + parameters from the start of training. Each time we take the average, + we do: `model_avg = model * (average_period / batch_idx_train) + + model_avg * ((batch_idx_train - average_period) / batch_idx_train)`. + """, + ) + + parser.add_argument( + "--use-fp16", + type=str2bool, + default=False, + help="Whether to use half precision training.", + ) + + add_model_arguments(parser) + + return parser + + +def get_params() -> AttributeDict: + """Return a dict containing training parameters. + + All training related parameters that are not passed from the commandline + are saved in the variable `params`. + + Commandline options are merged into `params` after they are parsed, so + you can also access them via `params`. + + Explanation of options saved in `params`: + + - best_train_loss: Best training loss so far. It is used to select + the model that has the lowest training loss. It is + updated during the training. + + - best_valid_loss: Best validation loss so far. It is used to select + the model that has the lowest validation loss. It is + updated during the training. + + - best_train_epoch: It is the epoch that has the best training loss. + + - best_valid_epoch: It is the epoch that has the best validation loss. + + - batch_idx_train: Used to writing statistics to tensorboard. It + contains number of batches trained so far across + epochs. + + - log_interval: Print training loss if batch_idx % log_interval` is 0 + + - reset_interval: Reset statistics if batch_idx % reset_interval is 0 + + - valid_interval: Run validation if batch_idx % valid_interval is 0 + + - feature_dim: The model input dim. It has to match the one used + in computing features. + + - subsampling_factor: The subsampling factor for the model. + + - num_decoder_layers: Number of decoder layer of transformer decoder. 
+ + - warm_step: The warm_step for Noam optimizer. + """ + params = AttributeDict( + { + "best_train_loss": float("inf"), + "best_valid_loss": float("inf"), + "best_train_epoch": -1, + "best_valid_epoch": -1, + "batch_idx_train": 0, + "log_interval": 50, + "reset_interval": 200, + "valid_interval": 2000, + # parameters for SURT + "num_channels": 2, + "feature_dim": 80, + "subsampling_factor": 4, # not passed in, this is fixed + # parameters for Noam + "model_warm_step": 5000, # arg given to model, not for lrate + # parameters for ctc loss + "beam_size": 10, + "use_double_scores": True, + "env_info": get_env_info(), + } + ) + + return params + + +def get_mask_encoder_model(params: AttributeDict) -> nn.Module: + mask_encoder = DPRNN( + feature_dim=params.feature_dim, + input_size=params.mask_encoder_dim, + hidden_size=params.mask_encoder_dim, + output_size=params.feature_dim * params.num_channels, + segment_size=params.mask_encoder_segment_size, + num_blocks=params.num_mask_encoder_layers, + chunk_width_randomization=params.chunk_width_randomization, + ) + return mask_encoder + + +def get_encoder_model(params: AttributeDict) -> nn.Module: + # TODO: We can add an option to switch between Zipformer and Transformer + def to_int_tuple(s: str): + return tuple(map(int, s.split(","))) + + encoder = Zipformer( + num_features=params.feature_dim, + output_downsampling_factor=2, + zipformer_downsampling_factors=to_int_tuple( + params.zipformer_downsampling_factors + ), + encoder_dims=to_int_tuple(params.encoder_dims), + attention_dim=to_int_tuple(params.attention_dims), + encoder_unmasked_dims=to_int_tuple(params.encoder_unmasked_dims), + nhead=to_int_tuple(params.nhead), + feedforward_dim=to_int_tuple(params.feedforward_dims), + cnn_module_kernels=to_int_tuple(params.cnn_module_kernels), + num_encoder_layers=to_int_tuple(params.num_encoder_layers), + num_left_chunks=params.num_left_chunks, + short_chunk_size=params.short_chunk_size, + decode_chunk_size=params.decode_chunk_len // 2, + ) + return encoder + + +def get_joint_encoder_layer(params: AttributeDict) -> nn.Module: + class TakeFirst(nn.Module): + def forward(self, x): + return x[0] + + if params.use_joint_encoder_layer == "linear": + encoder_dim = int(params.encoder_dims.split(",")[-1]) + joint_layer = nn.Sequential( + Rearrange("(c b) t d -> b t (c d)", c=params.num_channels), + nn.Linear( + params.num_channels * encoder_dim, params.num_channels * encoder_dim + ), + nn.ReLU(), + Rearrange("b t (c d) -> (c b) t d", c=params.num_channels), + ) + elif params.use_joint_encoder_layer == "lstm": + encoder_dim = int(params.encoder_dims.split(",")[-1]) + joint_layer = nn.Sequential( + Rearrange("(c b) t d -> b t (c d)", c=params.num_channels), + ScaledLSTM( + input_size=params.num_channels * encoder_dim, + hidden_size=params.num_channels * encoder_dim, + num_layers=1, + bias=True, + batch_first=True, + dropout=0.0, + bidirectional=False, + ), + TakeFirst(), + nn.ReLU(), + Rearrange("b t (c d) -> (c b) t d", c=params.num_channels), + ) + elif params.use_joint_encoder_layer == "none": + joint_layer = None + else: + raise ValueError( + f"Unknown joint encoder layer type: {params.use_joint_encoder_layer}" + ) + return joint_layer + + +def get_decoder_model(params: AttributeDict) -> nn.Module: + decoder = Decoder( + vocab_size=params.vocab_size, + decoder_dim=params.decoder_dim, + blank_id=params.blank_id, + context_size=params.context_size, + ) + return decoder + + +def get_joiner_model(params: AttributeDict) -> nn.Module: + joiner = Joiner( + 
encoder_dim=int(params.encoder_dims.split(",")[-1]), + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return joiner + + +def get_surt_model( + params: AttributeDict, +) -> nn.Module: + mask_encoder = get_mask_encoder_model(params) + encoder = get_encoder_model(params) + joint_layer = get_joint_encoder_layer(params) + decoder = get_decoder_model(params) + joiner = get_joiner_model(params) + + model = SURT( + mask_encoder=mask_encoder, + encoder=encoder, + joint_encoder_layer=joint_layer, + decoder=decoder, + joiner=joiner, + num_channels=params.num_channels, + encoder_dim=int(params.encoder_dims.split(",")[-1]), + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return model + + +def load_checkpoint_if_available( + params: AttributeDict, + model: nn.Module, + model_avg: nn.Module = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, +) -> Optional[Dict[str, Any]]: + """Load checkpoint from file. + + If params.start_batch is positive, it will load the checkpoint from + `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if + params.start_epoch is larger than 1, it will load the checkpoint from + `params.start_epoch - 1`. + + Apart from loading state dict for `model` and `optimizer` it also updates + `best_train_epoch`, `best_train_loss`, `best_valid_epoch`, + and `best_valid_loss` in `params`. + + Args: + params: + The return value of :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer that we are using. + scheduler: + The scheduler that we are using. + Returns: + Return a dict containing previously saved training info. + """ + if params.start_batch > 0: + filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt" + elif params.start_epoch > 1: + filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" + else: + return None + + assert filename.is_file(), f"{filename} does not exist!" + + saved_params = load_checkpoint( + filename, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + ) + + keys = [ + "best_train_epoch", + "best_valid_epoch", + "batch_idx_train", + "best_train_loss", + "best_valid_loss", + ] + for k in keys: + params[k] = saved_params[k] + + if params.start_batch > 0: + if "cur_epoch" in saved_params: + params["start_epoch"] = saved_params["cur_epoch"] + + return saved_params + + +def save_checkpoint( + params: AttributeDict, + model: Union[nn.Module, DDP], + model_avg: Optional[nn.Module] = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, + sampler: Optional[CutSampler] = None, + scaler: Optional[GradScaler] = None, + rank: int = 0, +) -> None: + """Save model, optimizer, scheduler and training stats to file. + + Args: + params: + It is returned by :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer used in the training. + sampler: + The sampler for the training dataset. + scaler: + The scaler used for mix precision training. 
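+ rank: + The rank of the node in DDP training. Only rank 0 actually writes the checkpoint file.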
+ """ + if rank != 0: + return + filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt" + save_checkpoint_impl( + filename=filename, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=sampler, + scaler=scaler, + rank=rank, + ) + + if params.best_train_epoch == params.cur_epoch: + best_train_filename = params.exp_dir / "best-train-loss.pt" + copyfile(src=filename, dst=best_train_filename) + + if params.best_valid_epoch == params.cur_epoch: + best_valid_filename = params.exp_dir / "best-valid-loss.pt" + copyfile(src=filename, dst=best_valid_filename) + + +def compute_heat_loss(x_masked, batch, num_channels=2) -> Tensor: + """ + Compute HEAT loss for separated sources using the output of mask encoder. + Args: + x_masked: + The output of mask encoder. It is a tensor of shape (B, T, C). + batch: + A batch of data. See `lhotse.dataset.K2SurtDatasetWithSources()` + for the content in it. + num_channels: + The number of output branches in the SURT model. + """ + B, T, D = x_masked[0].shape + device = x_masked[0].device + + # Create training targets for each channel. + targets = [] + for i in range(num_channels): + target = torch.ones_like(x_masked[i]) * LOG_EPSILON + targets.append(target) + + source_feats = batch["source_feats"] + source_boundaries = batch["source_boundaries"] + input_lens = batch["input_lens"].to(device) + # Assign sources to channels based on the HEAT criteria + for b in range(B): + cut_source_feats = source_feats[b] + cut_source_boundaries = source_boundaries[b] + last_seg_end = [0 for _ in range(num_channels)] + for source_feat, (start, end) in zip(cut_source_feats, cut_source_boundaries): + assigned = False + end = min(end, T) + source_feat = source_feat[: end - start, :] + for i in range(num_channels): + if start >= last_seg_end[i]: + targets[i][b, start:end, :] += source_feat.to(device) + last_seg_end[i] = max(end, last_seg_end[i]) + assigned = True + break + if not assigned: + min_end_channel = last_seg_end.index(min(last_seg_end)) + targets[min_end_channel][b, start:end, :] += source_feat.to(device) + last_seg_end[min_end_channel] = max(end, last_seg_end[min_end_channel]) + + # Get padding mask based on input lengths + pad_mask = torch.arange(T, device=device).expand(B, T) > input_lens.unsqueeze(1) + pad_mask = pad_mask.unsqueeze(-1) + + # Compute masked loss for each channel + losses = torch.zeros((num_channels, B, T, D), device=device) + for i in range(num_channels): + loss = nn.functional.mse_loss(x_masked[i], targets[i], reduction="none") + # Apply padding mask to loss + loss.masked_fill_(pad_mask, 0) + losses[i] = loss + + # loss: C x B x T x D. pad_mask: B x T x 1 + # We want to compute loss for each item in the batch. Each item has loss given + # by the sum over C, and average over T and D. For T, we need to use the padding. + loss = losses.sum(0).mean(-1).sum(-1) / batch["input_lens"].to(device) + return loss + + +def compute_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + batch: dict, + is_training: bool, +) -> Tuple[Tensor, MetricsTracker]: + """ + Compute RNN-T loss given the model and its inputs. + + Args: + params: + Parameters for training. See :func:`get_params`. + model: + The model for training. It is an instance of Conformer in our case. + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + is_training: + True for training. False for validation. 
When it is True, this + function enables autograd during computation; when it is False, it + disables autograd. + """ + device = model.device if isinstance(model, DDP) else next(model.parameters()).device + feature = batch["inputs"].to(device) + feature_lens = batch["input_lens"].to(device) + + # at entry, feature is (N, T, C) + assert feature.ndim == 3 + + # The dataloader returns text as a list of cuts, each of which is a list of channel + # text. We flatten this to a list where all channels are together, i.e., it looks like + # [utt1_ch1, utt2_ch1, ..., uttN_ch1, utt1_ch2, ...., uttN,ch2]. + text = [val for tup in zip(*batch["text"]) for val in tup] + assert len(text) == len(feature) * params.num_channels + + # Convert all channel texts to token IDs and create a ragged tensor. + y = sp.encode(text, out_type=int) + y = k2.RaggedTensor(y).to(device) + + batch_idx_train = params.batch_idx_train + warm_step = params.model_warm_step + + with torch.set_grad_enabled(is_training): + (simple_loss, pruned_loss, ctc_loss, x_masked) = model( + x=feature, + x_lens=feature_lens, + y=y, + prune_range=params.prune_range, + am_scale=params.am_scale, + lm_scale=params.lm_scale, + reduction="none", + subsampling_factor=params.subsampling_factor, + ) + simple_loss_is_finite = torch.isfinite(simple_loss) + pruned_loss_is_finite = torch.isfinite(pruned_loss) + ctc_loss_is_finite = torch.isfinite(ctc_loss) + + # Compute HEAT loss + if is_training and params.heat_loss_scale > 0.0: + heat_loss = compute_heat_loss( + x_masked, batch, num_channels=params.num_channels + ) + else: + heat_loss = torch.tensor(0.0, device=device) + + heat_loss_is_finite = torch.isfinite(heat_loss) + is_finite = ( + simple_loss_is_finite + & pruned_loss_is_finite + & ctc_loss_is_finite + & heat_loss_is_finite + ) + if not torch.all(is_finite): + logging.info( + "Not all losses are finite!\n" + f"simple_losses: {simple_loss}\n" + f"pruned_losses: {pruned_loss}\n" + f"ctc_losses: {ctc_loss}\n" + f"heat_losses: {heat_loss}\n" + ) + display_and_save_batch(batch, params=params, sp=sp) + simple_loss = simple_loss[simple_loss_is_finite] + pruned_loss = pruned_loss[pruned_loss_is_finite] + ctc_loss = ctc_loss[ctc_loss_is_finite] + heat_loss = heat_loss[heat_loss_is_finite] + + # If either all simple_loss or pruned_loss is inf or nan, + # we stop the training process by raising an exception + if ( + torch.all(~simple_loss_is_finite) + or torch.all(~pruned_loss_is_finite) + or torch.all(~ctc_loss_is_finite) + or torch.all(~heat_loss_is_finite) + ): + raise ValueError( + "There are too many utterances in this batch " + "leading to inf or nan losses." + ) + + simple_loss_sum = simple_loss.sum() + pruned_loss_sum = pruned_loss.sum() + ctc_loss_sum = ctc_loss.sum() + heat_loss_sum = heat_loss.sum() + + s = params.simple_loss_scale + # take down the scale on the simple loss from 1.0 at the start + # to params.simple_loss scale by warm_step. 
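+ # For example, with simple_loss_scale=0.5 and model_warm_step=5000, the simple-loss scale
+ # decays linearly from 1.0 at batch 0 to 0.5 at batch 5000 (e.g. 0.75 at batch 2500) and then
+ # stays at 0.5, while the pruned-loss scale below ramps up from 0.1 to 1.0 over the same interval.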
+ simple_loss_scale = ( + s + if batch_idx_train >= warm_step + else 1.0 - (batch_idx_train / warm_step) * (1.0 - s) + ) + pruned_loss_scale = ( + 1.0 + if batch_idx_train >= warm_step + else 0.1 + 0.9 * (batch_idx_train / warm_step) + ) + loss = ( + simple_loss_scale * simple_loss_sum + + pruned_loss_scale * pruned_loss_sum + + params.ctc_loss_scale * ctc_loss_sum + + params.heat_loss_scale * heat_loss_sum + ) + + assert loss.requires_grad == is_training + + info = MetricsTracker() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # info["frames"] is an approximate number for two reasons: + # (1) The acutal subsampling factor is ((lens - 1) // 2 - 1) // 2 + # (2) If some utterances in the batch lead to inf/nan loss, they + # are filtered out. + info["frames"] = (feature_lens // params.subsampling_factor).sum().item() + + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances + info["utt_duration"] = feature_lens.sum().item() + # averaged padding proportion over utterances + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + + # Note: We use reduction=sum while computing the loss. + info["loss"] = loss.detach().cpu().item() + info["simple_loss"] = simple_loss_sum.detach().cpu().item() + info["pruned_loss"] = pruned_loss_sum.detach().cpu().item() + if params.ctc_loss_scale > 0.0: + info["ctc_loss"] = ctc_loss_sum.detach().cpu().item() + if params.heat_loss_scale > 0.0: + info["heat_loss"] = heat_loss_sum.detach().cpu().item() + + return loss, info + + +def compute_validation_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + valid_dl: torch.utils.data.DataLoader, + world_size: int = 1, +) -> MetricsTracker: + """Run the validation process.""" + model.eval() + + tot_loss = MetricsTracker() + + for batch_idx, batch in enumerate(valid_dl): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=False, + ) + assert loss.requires_grad is False + tot_loss = tot_loss + loss_info + + if world_size > 1: + tot_loss.reduce(loss.device) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + if loss_value < params.best_valid_loss: + params.best_valid_epoch = params.cur_epoch + params.best_valid_loss = loss_value + + return tot_loss + + +def train_one_epoch( + params: AttributeDict, + model: Union[nn.Module, DDP], + optimizer: torch.optim.Optimizer, + scheduler: LRSchedulerType, + sp: spm.SentencePieceProcessor, + train_dl: torch.utils.data.DataLoader, + valid_dl: torch.utils.data.DataLoader, + scaler: GradScaler, + model_avg: Optional[nn.Module] = None, + tb_writer: Optional[SummaryWriter] = None, + world_size: int = 1, + rank: int = 0, +) -> None: + """Train the model for one epoch. + + The training loss from the mean of all frames is saved in + `params.train_loss`. It runs the validation process every + `params.valid_interval` batches. + + Args: + params: + It is returned by :func:`get_params`. + model: + The model for training. + optimizer: + The optimizer we are using. + scheduler: + The learning rate scheduler, we call step() every step. + train_dl: + Dataloader for the training dataset. + valid_dl: + Dataloader for the validation dataset. + scaler: + The scaler used for mix precision training. + model_avg: + The stored model averaged from the start of training. 
+ tb_writer: + Writer to write log messages to tensorboard. + world_size: + Number of nodes in DDP training. If it is 1, DDP is disabled. + rank: + The rank of the node in DDP training. If no DDP is used, it should + be set to 0. + """ + torch.cuda.empty_cache() + model.train() + + tot_loss = MetricsTracker() + + cur_batch_idx = params.get("cur_batch_idx", 0) + + for batch_idx, batch in enumerate(train_dl): + if batch_idx < cur_batch_idx: + continue + cur_batch_idx = batch_idx + + params.batch_idx_train += 1 + batch_size = batch["inputs"].shape[0] + + try: + with torch.cuda.amp.autocast(enabled=params.use_fp16): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=True, + ) + # summary stats + tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info + + # NOTE: We use reduction==sum and loss is computed over utterances + # in the batch and there is no normalization to it so far. + scaler.scale(loss).backward() + set_batch_count(model, params.batch_idx_train) + scheduler.step_batch(params.batch_idx_train) + + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + except: # noqa + display_and_save_batch(batch, params=params, sp=sp) + raise + + if params.print_diagnostics and batch_idx == 5: + return + + if ( + rank == 0 + and params.batch_idx_train > 0 + and params.batch_idx_train % params.average_period == 0 + ): + update_averaged_model( + params=params, + model_cur=model, + model_avg=model_avg, + ) + + if ( + params.batch_idx_train > 0 + and params.batch_idx_train % params.save_every_n == 0 + ): + params.cur_batch_idx = batch_idx + save_checkpoint_with_global_batch_idx( + out_dir=params.exp_dir, + global_batch_idx=params.batch_idx_train, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + del params.cur_batch_idx + remove_checkpoints( + out_dir=params.exp_dir, + topk=params.keep_last_k, + rank=rank, + ) + + if batch_idx % 100 == 0 and params.use_fp16: + # If the grad scale was less than 1, try increasing it. The _growth_interval + # of the grad scaler is configurable, but we can't configure it to have different + # behavior depending on the current grad scale. 
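+ # Concretely: every 100 batches the scale is doubled if it is below 1.0, and every
+ # 400 batches if it is below 8.0; a warning is logged below 0.01 and training is
+ # aborted if the scale ever collapses below 1e-5.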
+ cur_grad_scale = scaler._scale.item() + if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0): + scaler.update(cur_grad_scale * 2.0) + if cur_grad_scale < 0.01: + logging.warning(f"Grad scale is small: {cur_grad_scale}") + if cur_grad_scale < 1.0e-05: + raise RuntimeError( + f"grad_scale is too small, exiting: {cur_grad_scale}" + ) + + if batch_idx % params.log_interval == 0: + cur_lr = scheduler.get_last_lr()[0] + cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0 + + logging.info( + f"Epoch {params.cur_epoch}, " + f"batch {batch_idx}, loss[{loss_info}], " + f"tot_loss[{tot_loss}], batch size: {batch_size}, " + f"lr: {cur_lr:.2e}, " + + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "") + ) + + if tb_writer is not None: + tb_writer.add_scalar( + "train/learning_rate", cur_lr, params.batch_idx_train + ) + + loss_info.write_summary( + tb_writer, "train/current_", params.batch_idx_train + ) + tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train) + if params.use_fp16: + tb_writer.add_scalar( + "train/grad_scale", cur_grad_scale, params.batch_idx_train + ) + + if batch_idx % params.valid_interval == 0 and not params.print_diagnostics: + logging.info("Computing validation loss") + valid_info = compute_validation_loss( + params=params, + model=model, + sp=sp, + valid_dl=valid_dl, + world_size=world_size, + ) + model.train() + logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}") + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) + if tb_writer is not None: + valid_info.write_summary( + tb_writer, "train/valid_", params.batch_idx_train + ) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + params.train_loss = loss_value + if params.train_loss < params.best_train_loss: + params.best_train_epoch = params.cur_epoch + params.best_train_loss = params.train_loss + + +def run(rank, world_size, args): + """ + Args: + rank: + It is a value between 0 and `world_size-1`, which is + passed automatically by `mp.spawn()` in :func:`main`. + The node with rank 0 is responsible for saving checkpoint. + world_size: + Number of GPUs for DDP training. 
+ args: + The return value of get_parser().parse_args() + """ + params = get_params() + params.update(vars(args)) + + fix_random_seed(params.seed) + if world_size > 1: + setup_dist(rank, world_size, params.master_port) + + setup_logger(f"{params.exp_dir}/log/log-train") + logging.info("Training started") + + if args.tensorboard and rank == 0: + tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard") + else: + tb_writer = None + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", rank) + logging.info(f"Device: {device}") + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # is defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("") + params.vocab_size = sp.get_piece_size() + + logging.info(params) + + logging.info("About to create model") + model = get_surt_model(params) + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + assert params.save_every_n >= params.average_period + model_avg: Optional[nn.Module] = None + if rank == 0: + # model_avg is only used with rank 0 + model_avg = copy.deepcopy(model) + + assert params.start_epoch > 0, params.start_epoch + checkpoints = load_checkpoint_if_available( + params=params, model=model, model_avg=model_avg + ) + + model.to(device) + + if checkpoints is None and params.model_init_ckpt is not None: + logging.info( + f"Initializing model with checkpoint from {params.model_init_ckpt}" + ) + init_ckpt = torch.load(params.model_init_ckpt, map_location=device) + model.load_state_dict(init_ckpt["model"], strict=False) + + if world_size > 1: + logging.info("Using DDP") + model = DDP(model, device_ids=[rank], find_unused_parameters=True) + + parameters_names = [] + parameters_names.append( + [name_param_pair[0] for name_param_pair in model.named_parameters()] + ) + optimizer = ScaledAdam( + model.parameters(), + lr=params.base_lr, + clipping_scale=2.0, + parameters_names=parameters_names, + ) + + scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs) + + if checkpoints and "optimizer" in checkpoints: + logging.info("Loading optimizer state dict") + optimizer.load_state_dict(checkpoints["optimizer"]) + + if ( + checkpoints + and "scheduler" in checkpoints + and checkpoints["scheduler"] is not None + ): + logging.info("Loading scheduler state dict") + scheduler.load_state_dict(checkpoints["scheduler"]) + + if params.print_diagnostics: + diagnostic = diagnostics.attach_diagnostics(model) + + ami = AmiAsrDataModule(args) + + train_cuts = ami.aimix_train_cuts(rvb_affix="comb", sources=True) + dev_cuts = ami.ami_cuts(split="dev", type="ihm-mix") + dev_cuts = dev_cuts.trim_to_supervision_groups(max_pause=0.0).filter( + lambda c: 0.2 <= c.duration <= 60.0 + ) + + if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: + # We only load the sampler's state dict when it loads a checkpoint + # saved in the middle of an epoch + sampler_state_dict = checkpoints["sampler"] + else: + sampler_state_dict = None + + train_dl = ami.train_dataloaders( + train_cuts, + sampler_state_dict=sampler_state_dict, + sources=True, + ) + valid_dl = ami.valid_dataloaders(dev_cuts) + + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) + if checkpoints and "grad_scaler" in checkpoints: + logging.info("Loading grad scaler state dict") + scaler.load_state_dict(checkpoints["grad_scaler"]) + + for epoch in range(params.start_epoch, params.num_epochs + 1): + scheduler.step_epoch(epoch - 1) + 
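+ # Re-seed with (seed + epoch - 1) and tell the sampler the epoch index so that the
+ # shuffling differs across epochs but remains reproducible if training is restarted.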
fix_random_seed(params.seed + epoch - 1) + train_dl.sampler.set_epoch(epoch - 1) + + if tb_writer is not None: + tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train) + + params.cur_epoch = epoch + + train_one_epoch( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sp=sp, + train_dl=train_dl, + valid_dl=valid_dl, + scaler=scaler, + tb_writer=tb_writer, + world_size=world_size, + rank=rank, + ) + + if params.print_diagnostics: + diagnostic.print_diagnostics() + break + + save_checkpoint( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + + logging.info("Done!") + + if world_size > 1: + torch.distributed.barrier() + cleanup_dist() + + +def display_and_save_batch( + batch: dict, + params: AttributeDict, + sp: spm.SentencePieceProcessor, +) -> None: + """Display the batch statistics and save the batch into disk. + + Args: + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + params: + Parameters for training. See :func:`get_params`. + sp: + The BPE model. + """ + from lhotse.utils import uuid4 + + filename = f"{params.exp_dir}/batch-{uuid4()}.pt" + logging.info(f"Saving batch to {filename}") + torch.save(batch, filename) + + features = batch["inputs"] + + logging.info(f"features shape: {features.shape}") + + y = [sp.encode(text_ch) for text_ch in batch["text"]] + num_tokens = [sum(len(yi) for yi in y_ch) for y_ch in y] + logging.info(f"num tokens: {num_tokens}") + + +def main(): + parser = get_parser() + AmiAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + world_size = args.world_size + assert world_size >= 1 + if world_size > 1: + mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True) + else: + run(rank=0, world_size=1, args=args) + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch.multiprocessing.set_sharing_strategy("file_system") + +if __name__ == "__main__": + main() diff --git a/egs/ami/SURT/dprnn_zipformer/train_adapt.py b/egs/ami/SURT/dprnn_zipformer/train_adapt.py new file mode 100755 index 000000000..9f3b4425f --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/train_adapt.py @@ -0,0 +1,1411 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, +# Wei Kang, +# Mingshuang Luo,) +# Zengwei Yao) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Usage: + +# ./dprnn_zipformer/train.py should be run before this script. 
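+# It is intended to adapt the model produced by train.py to the AMI training data
+# (see ami.train_cuts() below), starting from the checkpoint given via --model-init-ckpt
+# and using a much smaller base learning rate.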
+ +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +./dprnn_zipformer/train.py \ + --world-size 4 \ + --num-epochs 30 \ + --start-epoch 1 \ + --use-fp16 1 \ + --exp-dir dprnn_zipformer/exp_adapt \ + --model-init-ckpt dprnn_zipformer/exp/epoch-30.pt \ + --max-duration 550 +""" + +import argparse +import copy +import logging +import warnings +from itertools import chain +from pathlib import Path +from shutil import copyfile +from typing import Any, Dict, Optional, Tuple, Union + +import k2 +import optim +import sentencepiece as spm +import torch +import torch.multiprocessing as mp +import torch.nn as nn +from asr_datamodule import AmiAsrDataModule +from decoder import Decoder +from dprnn import DPRNN +from einops.layers.torch import Rearrange +from joiner import Joiner +from lhotse.cut import Cut +from lhotse.dataset.sampling.base import CutSampler +from lhotse.utils import LOG_EPSILON, fix_random_seed +from model import SURT +from optim import Eden, ScaledAdam +from scaling import ScaledLinear, ScaledLSTM +from torch import Tensor +from torch.cuda.amp import GradScaler +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from zipformer import Zipformer + +from icefall import diagnostics +from icefall.checkpoint import load_checkpoint, remove_checkpoints +from icefall.checkpoint import save_checkpoint as save_checkpoint_impl +from icefall.checkpoint import ( + save_checkpoint_with_global_batch_idx, + update_averaged_model, +) +from icefall.dist import cleanup_dist, setup_dist +from icefall.env import get_env_info +from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool + +LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler] + + +def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None: + if isinstance(model, DDP): + # get underlying nn.Module + model = model.module + for module in model.modules(): + if hasattr(module, "batch_count"): + module.batch_count = batch_count + + +def add_model_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "--num-mask-encoder-layers", + type=int, + default=4, + help="Number of layers in the DPRNN based mask encoder.", + ) + + parser.add_argument( + "--mask-encoder-dim", + type=int, + default=256, + help="Hidden dimension of the LSTM blocks in DPRNN.", + ) + + parser.add_argument( + "--mask-encoder-segment-size", + type=int, + default=32, + help="Segment size of the SegLSTM in DPRNN. 
Ideally, this should be equal to the " + "decode-chunk-length of the zipformer encoder.", + ) + + parser.add_argument( + "--chunk-width-randomization", + type=bool, + default=False, + help="Whether to randomize the chunk width in DPRNN.", + ) + + # Zipformer config is based on: + # https://github.com/k2-fsa/icefall/pull/745#issuecomment-1405282740 + parser.add_argument( + "--num-encoder-layers", + type=str, + default="2,2,2,2,2", + help="Number of zipformer encoder layers, comma separated.", + ) + + parser.add_argument( + "--feedforward-dims", + type=str, + default="768,768,768,768,768", + help="Feedforward dimension of the zipformer encoder layers, comma separated.", + ) + + parser.add_argument( + "--nhead", + type=str, + default="8,8,8,8,8", + help="Number of attention heads in the zipformer encoder layers.", + ) + + parser.add_argument( + "--encoder-dims", + type=str, + default="256,256,256,256,256", + help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated", + ) + + parser.add_argument( + "--attention-dims", + type=str, + default="192,192,192,192,192", + help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated; + not the same as embedding dimension.""", + ) + + parser.add_argument( + "--encoder-unmasked-dims", + type=str, + default="192,192,192,192,192", + help="Unmasked dimensions in the encoders, relates to augmentation during training. " + "Must be <= each of encoder_dims. Empirically, less than 256 seems to make performance " + " worse.", + ) + + parser.add_argument( + "--zipformer-downsampling-factors", + type=str, + default="1,2,4,8,2", + help="Downsampling factor for each stack of encoder layers.", + ) + + parser.add_argument( + "--cnn-module-kernels", + type=str, + default="31,31,31,31,31", + help="Sizes of kernels in convolution modules", + ) + + parser.add_argument( + "--use-joint-encoder-layer", + type=str, + default="linear", + choices=["linear", "lstm", "none"], + help="Whether to use a joint layer to combine all branches.", + ) + + parser.add_argument( + "--decoder-dim", + type=int, + default=512, + help="Embedding dimension in the decoder model.", + ) + + parser.add_argument( + "--joiner-dim", + type=int, + default=512, + help="""Dimension used in the joiner model. + Outputs from the encoder and decoder model are projected + to this dimension before adding. + """, + ) + + parser.add_argument( + "--short-chunk-size", + type=int, + default=50, + help="""Chunk length of dynamic training, the chunk size would be either + max sequence length of current batch or uniformly sampled from (1, short_chunk_size). 
+ """, + ) + + parser.add_argument( + "--num-left-chunks", + type=int, + default=4, + help="How many left context can be seen in chunks when calculating attention.", + ) + + parser.add_argument( + "--decode-chunk-len", + type=int, + default=32, + help="The chunk size for decoding (in frames before subsampling)", + ) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--world-size", + type=int, + default=1, + help="Number of GPUs for DDP training.", + ) + + parser.add_argument( + "--master-port", + type=int, + default=12354, + help="Master port to use for DDP training.", + ) + + parser.add_argument( + "--tensorboard", + type=str2bool, + default=True, + help="Should various information be logged in tensorboard.", + ) + + parser.add_argument( + "--num-epochs", + type=int, + default=20, + help="Number of epochs to train.", + ) + + parser.add_argument( + "--start-epoch", + type=int, + default=1, + help="""Resume training from this epoch. It should be positive. + If larger than 1, it will load checkpoint from + exp-dir/epoch-{start_epoch-1}.pt + """, + ) + + parser.add_argument( + "--start-batch", + type=int, + default=0, + help="""If positive, --start-epoch is ignored and + it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt + """, + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="conv_lstm_transducer_stateless_ctc/exp", + help="""The experiment dir. + It specifies the directory where all training related + files, e.g., checkpoints, log, etc, are saved + """, + ) + + parser.add_argument( + "--model-init-ckpt", + type=str, + default=None, + help="""The model checkpoint to initialize the model (either full or part). + If not specified, the model is randomly initialized. + """, + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--base-lr", type=float, default=0.0001, help="The base learning rate." + ) + + parser.add_argument( + "--lr-batches", + type=float, + default=5000, + help="""Number of steps that affects how rapidly the learning rate + decreases. We suggest not to change this.""", + ) + + parser.add_argument( + "--lr-epochs", + type=float, + default=2, + help="""Number of epochs that affects how rapidly the learning rate decreases. + """, + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; 2 means tri-gram", + ) + + parser.add_argument( + "--prune-range", + type=int, + default=5, + help="The prune range for rnnt loss, it means how many symbols(context)" + "we are using to compute the loss", + ) + + parser.add_argument( + "--lm-scale", + type=float, + default=0.25, + help="The scale to smooth the loss with lm " + "(output of prediction network) part.", + ) + + parser.add_argument( + "--am-scale", + type=float, + default=0.0, + help="The scale to smooth the loss with am (output of encoder network) part.", + ) + + parser.add_argument( + "--simple-loss-scale", + type=float, + default=0.5, + help="To get pruning ranges, we will calculate a simple version" + "loss(joiner is just addition), this simple loss also uses for" + "training (as a regularization item). 
We will scale the simple loss" + "with this parameter before adding to the final loss.", + ) + + parser.add_argument( + "--ctc-loss-scale", + type=float, + default=0.2, + help="Scale for CTC loss.", + ) + + parser.add_argument( + "--seed", + type=int, + default=42, + help="The seed for random generators intended for reproducibility", + ) + + parser.add_argument( + "--print-diagnostics", + type=str2bool, + default=False, + help="Accumulate stats on activations, print them and exit.", + ) + + parser.add_argument( + "--save-every-n", + type=int, + default=2000, + help="""Save checkpoint after processing this number of batches" + periodically. We save checkpoint to exp-dir/ whenever + params.batch_idx_train % save_every_n == 0. The checkpoint filename + has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt' + Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the + end of each epoch where `xxx` is the epoch number counting from 0. + """, + ) + + parser.add_argument( + "--keep-last-k", + type=int, + default=1, + help="""Only keep this number of checkpoints on disk. + For instance, if it is 3, there are only 3 checkpoints + in the exp-dir with filenames `checkpoint-xxx.pt`. + It does not affect checkpoints with name `epoch-xxx.pt`. + """, + ) + + parser.add_argument( + "--average-period", + type=int, + default=100, + help="""Update the averaged model, namely `model_avg`, after processing + this number of batches. `model_avg` is a separate version of model, + in which each floating-point parameter is the average of all the + parameters from the start of training. Each time we take the average, + we do: `model_avg = model * (average_period / batch_idx_train) + + model_avg * ((batch_idx_train - average_period) / batch_idx_train)`. + """, + ) + + parser.add_argument( + "--use-fp16", + type=str2bool, + default=False, + help="Whether to use half precision training.", + ) + + add_model_arguments(parser) + + return parser + + +def get_params() -> AttributeDict: + """Return a dict containing training parameters. + + All training related parameters that are not passed from the commandline + are saved in the variable `params`. + + Commandline options are merged into `params` after they are parsed, so + you can also access them via `params`. + + Explanation of options saved in `params`: + + - best_train_loss: Best training loss so far. It is used to select + the model that has the lowest training loss. It is + updated during the training. + + - best_valid_loss: Best validation loss so far. It is used to select + the model that has the lowest validation loss. It is + updated during the training. + + - best_train_epoch: It is the epoch that has the best training loss. + + - best_valid_epoch: It is the epoch that has the best validation loss. + + - batch_idx_train: Used to writing statistics to tensorboard. It + contains number of batches trained so far across + epochs. + + - log_interval: Print training loss if batch_idx % log_interval` is 0 + + - reset_interval: Reset statistics if batch_idx % reset_interval is 0 + + - valid_interval: Run validation if batch_idx % valid_interval is 0 + + - feature_dim: The model input dim. It has to match the one used + in computing features. + + - subsampling_factor: The subsampling factor for the model. + + - num_decoder_layers: Number of decoder layer of transformer decoder. + + - warm_step: The warm_step for Noam optimizer. 
+ """ + params = AttributeDict( + { + "best_train_loss": float("inf"), + "best_valid_loss": float("inf"), + "best_train_epoch": -1, + "best_valid_epoch": -1, + "batch_idx_train": 0, + "log_interval": 50, + "reset_interval": 200, + "valid_interval": 2000, + # parameters for SURT + "num_channels": 2, + "feature_dim": 80, + "subsampling_factor": 4, # not passed in, this is fixed + # parameters for Noam + "model_warm_step": 5000, # arg given to model, not for lrate + # parameters for ctc loss + "beam_size": 10, + "use_double_scores": True, + "env_info": get_env_info(), + } + ) + + return params + + +def get_mask_encoder_model(params: AttributeDict) -> nn.Module: + mask_encoder = DPRNN( + feature_dim=params.feature_dim, + input_size=params.mask_encoder_dim, + hidden_size=params.mask_encoder_dim, + output_size=params.feature_dim * params.num_channels, + segment_size=params.mask_encoder_segment_size, + num_blocks=params.num_mask_encoder_layers, + chunk_width_randomization=params.chunk_width_randomization, + ) + return mask_encoder + + +def get_encoder_model(params: AttributeDict) -> nn.Module: + # TODO: We can add an option to switch between Zipformer and Transformer + def to_int_tuple(s: str): + return tuple(map(int, s.split(","))) + + encoder = Zipformer( + num_features=params.feature_dim, + output_downsampling_factor=2, + zipformer_downsampling_factors=to_int_tuple( + params.zipformer_downsampling_factors + ), + encoder_dims=to_int_tuple(params.encoder_dims), + attention_dim=to_int_tuple(params.attention_dims), + encoder_unmasked_dims=to_int_tuple(params.encoder_unmasked_dims), + nhead=to_int_tuple(params.nhead), + feedforward_dim=to_int_tuple(params.feedforward_dims), + cnn_module_kernels=to_int_tuple(params.cnn_module_kernels), + num_encoder_layers=to_int_tuple(params.num_encoder_layers), + num_left_chunks=params.num_left_chunks, + short_chunk_size=params.short_chunk_size, + decode_chunk_size=params.decode_chunk_len // 2, + ) + return encoder + + +def get_joint_encoder_layer(params: AttributeDict) -> nn.Module: + class TakeFirst(nn.Module): + def forward(self, x): + return x[0] + + if params.use_joint_encoder_layer == "linear": + encoder_dim = int(params.encoder_dims.split(",")[-1]) + joint_layer = nn.Sequential( + Rearrange("(c b) t d -> b t (c d)", c=params.num_channels), + nn.Linear( + params.num_channels * encoder_dim, params.num_channels * encoder_dim + ), + nn.ReLU(), + Rearrange("b t (c d) -> (c b) t d", c=params.num_channels), + ) + elif params.use_joint_encoder_layer == "lstm": + encoder_dim = int(params.encoder_dims.split(",")[-1]) + joint_layer = nn.Sequential( + Rearrange("(c b) t d -> b t (c d)", c=params.num_channels), + ScaledLSTM( + input_size=params.num_channels * encoder_dim, + hidden_size=params.num_channels * encoder_dim, + num_layers=1, + bias=True, + batch_first=True, + dropout=0.0, + bidirectional=False, + ), + TakeFirst(), + nn.ReLU(), + Rearrange("b t (c d) -> (c b) t d", c=params.num_channels), + ) + elif params.use_joint_encoder_layer == "none": + joint_layer = None + else: + raise ValueError( + f"Unknown joint encoder layer type: {params.use_joint_encoder_layer}" + ) + return joint_layer + + +def get_decoder_model(params: AttributeDict) -> nn.Module: + decoder = Decoder( + vocab_size=params.vocab_size, + decoder_dim=params.decoder_dim, + blank_id=params.blank_id, + context_size=params.context_size, + ) + return decoder + + +def get_joiner_model(params: AttributeDict) -> nn.Module: + joiner = Joiner( + encoder_dim=int(params.encoder_dims.split(",")[-1]), + 
decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return joiner + + +def get_surt_model( + params: AttributeDict, +) -> nn.Module: + mask_encoder = get_mask_encoder_model(params) + encoder = get_encoder_model(params) + joint_layer = get_joint_encoder_layer(params) + decoder = get_decoder_model(params) + joiner = get_joiner_model(params) + + model = SURT( + mask_encoder=mask_encoder, + encoder=encoder, + joint_encoder_layer=joint_layer, + decoder=decoder, + joiner=joiner, + num_channels=params.num_channels, + encoder_dim=int(params.encoder_dims.split(",")[-1]), + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return model + + +def load_checkpoint_if_available( + params: AttributeDict, + model: nn.Module, + model_avg: nn.Module = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, +) -> Optional[Dict[str, Any]]: + """Load checkpoint from file. + + If params.start_batch is positive, it will load the checkpoint from + `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if + params.start_epoch is larger than 1, it will load the checkpoint from + `params.start_epoch - 1`. + + Apart from loading state dict for `model` and `optimizer` it also updates + `best_train_epoch`, `best_train_loss`, `best_valid_epoch`, + and `best_valid_loss` in `params`. + + Args: + params: + The return value of :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer that we are using. + scheduler: + The scheduler that we are using. + Returns: + Return a dict containing previously saved training info. + """ + if params.start_batch > 0: + filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt" + elif params.start_epoch > 1: + filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" + else: + return None + + assert filename.is_file(), f"{filename} does not exist!" + + saved_params = load_checkpoint( + filename, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + ) + + keys = [ + "best_train_epoch", + "best_valid_epoch", + "batch_idx_train", + "best_train_loss", + "best_valid_loss", + ] + for k in keys: + params[k] = saved_params[k] + + if params.start_batch > 0: + if "cur_epoch" in saved_params: + params["start_epoch"] = saved_params["cur_epoch"] + + return saved_params + + +def save_checkpoint( + params: AttributeDict, + model: Union[nn.Module, DDP], + model_avg: Optional[nn.Module] = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, + sampler: Optional[CutSampler] = None, + scaler: Optional[GradScaler] = None, + rank: int = 0, +) -> None: + """Save model, optimizer, scheduler and training stats to file. + + Args: + params: + It is returned by :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer used in the training. + sampler: + The sampler for the training dataset. + scaler: + The scaler used for mix precision training. 
+ """ + if rank != 0: + return + filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt" + save_checkpoint_impl( + filename=filename, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=sampler, + scaler=scaler, + rank=rank, + ) + + if params.best_train_epoch == params.cur_epoch: + best_train_filename = params.exp_dir / "best-train-loss.pt" + copyfile(src=filename, dst=best_train_filename) + + if params.best_valid_epoch == params.cur_epoch: + best_valid_filename = params.exp_dir / "best-valid-loss.pt" + copyfile(src=filename, dst=best_valid_filename) + + +def compute_heat_loss(x_masked, batch, num_channels=2) -> Tensor: + """ + Compute HEAT loss for separated sources using the output of mask encoder. + Args: + x_masked: + The output of mask encoder. It is a tensor of shape (B, T, C). + batch: + A batch of data. See `lhotse.dataset.K2SurtDatasetWithSources()` + for the content in it. + num_channels: + The number of output branches in the SURT model. + """ + B, T, D = x_masked[0].shape + device = x_masked[0].device + + # Create training targets for each channel. + targets = [] + for i in range(num_channels): + target = torch.ones_like(x_masked[i]) * LOG_EPSILON + targets.append(target) + + source_feats = batch["source_feats"] + source_boundaries = batch["source_boundaries"] + input_lens = batch["input_lens"].to(device) + # Assign sources to channels based on the HEAT criteria + for b in range(B): + cut_source_feats = source_feats[b] + cut_source_boundaries = source_boundaries[b] + last_seg_end = [0 for _ in range(num_channels)] + for source_feat, (start, end) in zip(cut_source_feats, cut_source_boundaries): + assigned = False + for i in range(num_channels): + if start >= last_seg_end[i]: + targets[i][b, start:end, :] += source_feat.to(device) + last_seg_end[i] = max(end, last_seg_end[i]) + assigned = True + break + if not assigned: + min_end_channel = last_seg_end.index(min(last_seg_end)) + targets[min_end_channel][b, start:end, :] += source_feat + last_seg_end[min_end_channel] = max(end, last_seg_end[min_end_channel]) + + # Get padding mask based on input lengths + pad_mask = torch.arange(T, device=device).expand(B, T) > input_lens.unsqueeze(1) + pad_mask = pad_mask.unsqueeze(-1) + + # Compute masked loss for each channel + losses = torch.zeros((num_channels, B, T, D), device=device) + for i in range(num_channels): + loss = nn.functional.mse_loss(x_masked[i], targets[i], reduction="none") + # Apply padding mask to loss + loss.masked_fill_(pad_mask, 0) + losses[i] = loss + + # loss: C x B x T x D. pad_mask: B x T x 1 + # We want to compute loss for each item in the batch. Each item has loss given + # by the sum over C, and average over T and D. For T, we need to use the padding. + loss = losses.sum(0).mean(-1).sum(-1) / batch["input_lens"].to(device) + return loss + + +def compute_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + batch: dict, + is_training: bool, +) -> Tuple[Tensor, MetricsTracker]: + """ + Compute RNN-T loss given the model and its inputs. + + Args: + params: + Parameters for training. See :func:`get_params`. + model: + The model for training. It is an instance of Conformer in our case. + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + is_training: + True for training. False for validation. When it is True, this + function enables autograd during computation; when it is False, it + disables autograd. 
+ """ + device = model.device if isinstance(model, DDP) else next(model.parameters()).device + feature = batch["inputs"].to(device) + feature_lens = batch["input_lens"].to(device) + + # at entry, feature is (N, T, C) + assert feature.ndim == 3 + + # The dataloader returns text as a list of cuts, each of which is a list of channel + # text. We flatten this to a list where all channels are together, i.e., it looks like + # [utt1_ch1, utt2_ch1, ..., uttN_ch1, utt1_ch2, ...., uttN,ch2]. + text = [val for tup in zip(*batch["text"]) for val in tup] + assert len(text) == len(feature) * params.num_channels + + # Convert all channel texts to token IDs and create a ragged tensor. + y = sp.encode(text, out_type=int) + y = k2.RaggedTensor(y).to(device) + + batch_idx_train = params.batch_idx_train + warm_step = params.model_warm_step + + with torch.set_grad_enabled(is_training): + (simple_loss, pruned_loss, ctc_loss, x_masked) = model( + x=feature, + x_lens=feature_lens, + y=y, + prune_range=params.prune_range, + am_scale=params.am_scale, + lm_scale=params.lm_scale, + reduction="none", + subsampling_factor=params.subsampling_factor, + ) + simple_loss_is_finite = torch.isfinite(simple_loss) + pruned_loss_is_finite = torch.isfinite(pruned_loss) + ctc_loss_is_finite = torch.isfinite(ctc_loss) + + # Compute HEAT loss + if is_training and params.heat_loss_scale > 0.0: + heat_loss = compute_heat_loss( + x_masked, batch, num_channels=params.num_channels + ) + else: + heat_loss = torch.tensor(0.0, device=device) + + heat_loss_is_finite = torch.isfinite(heat_loss) + is_finite = ( + simple_loss_is_finite + & pruned_loss_is_finite + & ctc_loss_is_finite + & heat_loss_is_finite + ) + if not torch.all(is_finite): + # logging.info( + # "Not all losses are finite!\n" + # f"simple_losses: {simple_loss}\n" + # f"pruned_losses: {pruned_loss}\n" + # f"ctc_losses: {ctc_loss}\n" + # f"heat_losses: {heat_loss}\n" + # ) + # display_and_save_batch(batch, params=params, sp=sp) + simple_loss = simple_loss[simple_loss_is_finite] + pruned_loss = pruned_loss[pruned_loss_is_finite] + ctc_loss = ctc_loss[ctc_loss_is_finite] + heat_loss = heat_loss[heat_loss_is_finite] + + # If either all simple_loss or pruned_loss is inf or nan, + # we stop the training process by raising an exception + if ( + torch.all(~simple_loss_is_finite) + or torch.all(~pruned_loss_is_finite) + or torch.all(~ctc_loss_is_finite) + or torch.all(~heat_loss_is_finite) + ): + raise ValueError( + "There are too many utterances in this batch " + "leading to inf or nan losses." + ) + + simple_loss_sum = simple_loss.sum() + pruned_loss_sum = pruned_loss.sum() + ctc_loss_sum = ctc_loss.sum() + heat_loss_sum = heat_loss.sum() + + s = params.simple_loss_scale + # take down the scale on the simple loss from 1.0 at the start + # to params.simple_loss scale by warm_step. 
+ simple_loss_scale = ( + s + if batch_idx_train >= warm_step + else 1.0 - (batch_idx_train / warm_step) * (1.0 - s) + ) + pruned_loss_scale = ( + 1.0 + if batch_idx_train >= warm_step + else 0.1 + 0.9 * (batch_idx_train / warm_step) + ) + loss = ( + simple_loss_scale * simple_loss_sum + + pruned_loss_scale * pruned_loss_sum + + params.ctc_loss_scale * ctc_loss_sum + + params.heat_loss_scale * heat_loss_sum + ) + + assert loss.requires_grad == is_training + + info = MetricsTracker() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # info["frames"] is an approximate number for two reasons: + # (1) The acutal subsampling factor is ((lens - 1) // 2 - 1) // 2 + # (2) If some utterances in the batch lead to inf/nan loss, they + # are filtered out. + info["frames"] = (feature_lens // params.subsampling_factor).sum().item() + + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances + info["utt_duration"] = feature_lens.sum().item() + # averaged padding proportion over utterances + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + + # Note: We use reduction=sum while computing the loss. + info["loss"] = loss.detach().cpu().item() + info["simple_loss"] = simple_loss_sum.detach().cpu().item() + info["pruned_loss"] = pruned_loss_sum.detach().cpu().item() + if params.ctc_loss_scale > 0.0: + info["ctc_loss"] = ctc_loss_sum.detach().cpu().item() + if params.heat_loss_scale > 0.0: + info["heat_loss"] = heat_loss_sum.detach().cpu().item() + + return loss, info + + +def compute_validation_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + valid_dl: torch.utils.data.DataLoader, + world_size: int = 1, +) -> MetricsTracker: + """Run the validation process.""" + model.eval() + + tot_loss = MetricsTracker() + + for batch_idx, batch in enumerate(valid_dl): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=False, + ) + assert loss.requires_grad is False + tot_loss = tot_loss + loss_info + + if world_size > 1: + tot_loss.reduce(loss.device) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + if loss_value < params.best_valid_loss: + params.best_valid_epoch = params.cur_epoch + params.best_valid_loss = loss_value + + return tot_loss + + +def train_one_epoch( + params: AttributeDict, + model: Union[nn.Module, DDP], + optimizer: torch.optim.Optimizer, + scheduler: LRSchedulerType, + sp: spm.SentencePieceProcessor, + train_dl: torch.utils.data.DataLoader, + valid_dl: torch.utils.data.DataLoader, + scaler: GradScaler, + model_avg: Optional[nn.Module] = None, + tb_writer: Optional[SummaryWriter] = None, + world_size: int = 1, + rank: int = 0, +) -> None: + """Train the model for one epoch. + + The training loss from the mean of all frames is saved in + `params.train_loss`. It runs the validation process every + `params.valid_interval` batches. + + Args: + params: + It is returned by :func:`get_params`. + model: + The model for training. + optimizer: + The optimizer we are using. + scheduler: + The learning rate scheduler, we call step() every step. + train_dl: + Dataloader for the training dataset. + valid_dl: + Dataloader for the validation dataset. + scaler: + The scaler used for mix precision training. + model_avg: + The stored model averaged from the start of training. 
+ tb_writer: + Writer to write log messages to tensorboard. + world_size: + Number of nodes in DDP training. If it is 1, DDP is disabled. + rank: + The rank of the node in DDP training. If no DDP is used, it should + be set to 0. + """ + torch.cuda.empty_cache() + model.train() + + tot_loss = MetricsTracker() + + cur_batch_idx = params.get("cur_batch_idx", 0) + + for batch_idx, batch in enumerate(train_dl): + if batch_idx < cur_batch_idx: + continue + cur_batch_idx = batch_idx + + params.batch_idx_train += 1 + batch_size = batch["inputs"].shape[0] + + try: + with torch.cuda.amp.autocast(enabled=params.use_fp16): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=True, + ) + # summary stats + tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info + + # NOTE: We use reduction==sum and loss is computed over utterances + # in the batch and there is no normalization to it so far. + scaler.scale(loss).backward() + set_batch_count(model, params.batch_idx_train) + scheduler.step_batch(params.batch_idx_train) + + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + except: # noqa + display_and_save_batch(batch, params=params, sp=sp) + raise + + if params.print_diagnostics and batch_idx == 5: + return + + if ( + rank == 0 + and params.batch_idx_train > 0 + and params.batch_idx_train % params.average_period == 0 + ): + update_averaged_model( + params=params, + model_cur=model, + model_avg=model_avg, + ) + + if ( + params.batch_idx_train > 0 + and params.batch_idx_train % params.save_every_n == 0 + ): + params.cur_batch_idx = batch_idx + save_checkpoint_with_global_batch_idx( + out_dir=params.exp_dir, + global_batch_idx=params.batch_idx_train, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + del params.cur_batch_idx + remove_checkpoints( + out_dir=params.exp_dir, + topk=params.keep_last_k, + rank=rank, + ) + + if batch_idx % 100 == 0 and params.use_fp16: + # If the grad scale was less than 1, try increasing it. The _growth_interval + # of the grad scaler is configurable, but we can't configure it to have different + # behavior depending on the current grad scale. 
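+ # scaler._scale is a private GradScaler attribute holding the current loss scale; it is
+ # available here because scaler.scale() has already been called for this batch.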
+ cur_grad_scale = scaler._scale.item() + if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0): + scaler.update(cur_grad_scale * 2.0) + if cur_grad_scale < 0.01: + logging.warning(f"Grad scale is small: {cur_grad_scale}") + if cur_grad_scale < 1.0e-05: + raise RuntimeError( + f"grad_scale is too small, exiting: {cur_grad_scale}" + ) + + if batch_idx % params.log_interval == 0: + cur_lr = scheduler.get_last_lr()[0] + cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0 + + logging.info( + f"Epoch {params.cur_epoch}, " + f"batch {batch_idx}, loss[{loss_info}], " + f"tot_loss[{tot_loss}], batch size: {batch_size}, " + f"lr: {cur_lr:.2e}, " + + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "") + ) + + if tb_writer is not None: + tb_writer.add_scalar( + "train/learning_rate", cur_lr, params.batch_idx_train + ) + + loss_info.write_summary( + tb_writer, "train/current_", params.batch_idx_train + ) + tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train) + if params.use_fp16: + tb_writer.add_scalar( + "train/grad_scale", cur_grad_scale, params.batch_idx_train + ) + + if batch_idx % params.valid_interval == 0 and not params.print_diagnostics: + logging.info("Computing validation loss") + valid_info = compute_validation_loss( + params=params, + model=model, + sp=sp, + valid_dl=valid_dl, + world_size=world_size, + ) + model.train() + logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}") + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) + if tb_writer is not None: + valid_info.write_summary( + tb_writer, "train/valid_", params.batch_idx_train + ) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + params.train_loss = loss_value + if params.train_loss < params.best_train_loss: + params.best_train_epoch = params.cur_epoch + params.best_train_loss = params.train_loss + + +def run(rank, world_size, args): + """ + Args: + rank: + It is a value between 0 and `world_size-1`, which is + passed automatically by `mp.spawn()` in :func:`main`. + The node with rank 0 is responsible for saving checkpoint. + world_size: + Number of GPUs for DDP training. 
+ args: + The return value of get_parser().parse_args() + """ + params = get_params() + params.update(vars(args)) + + fix_random_seed(params.seed) + if world_size > 1: + setup_dist(rank, world_size, params.master_port) + + setup_logger(f"{params.exp_dir}/log/log-train") + logging.info("Training started") + + if args.tensorboard and rank == 0: + tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard") + else: + tb_writer = None + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", rank) + logging.info(f"Device: {device}") + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # is defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("") + params.vocab_size = sp.get_piece_size() + + logging.info(params) + + logging.info("About to create model") + model = get_surt_model(params) + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + assert params.save_every_n >= params.average_period + model_avg: Optional[nn.Module] = None + if rank == 0: + # model_avg is only used with rank 0 + model_avg = copy.deepcopy(model) + + assert params.start_epoch > 0, params.start_epoch + checkpoints = load_checkpoint_if_available( + params=params, model=model, model_avg=model_avg + ) + + model.to(device) + + if checkpoints is None and params.model_init_ckpt is not None: + logging.info( + f"Initializing model with checkpoint from {params.model_init_ckpt}" + ) + init_ckpt = torch.load(params.model_init_ckpt, map_location=device) + model.load_state_dict(init_ckpt["model"], strict=False) + + if world_size > 1: + logging.info("Using DDP") + model = DDP(model, device_ids=[rank], find_unused_parameters=True) + + parameters_names = [] + parameters_names.append( + [name_param_pair[0] for name_param_pair in model.named_parameters()] + ) + optimizer = ScaledAdam( + model.parameters(), + lr=params.base_lr, + clipping_scale=2.0, + parameters_names=parameters_names, + ) + + scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs) + + if checkpoints and "optimizer" in checkpoints: + logging.info("Loading optimizer state dict") + optimizer.load_state_dict(checkpoints["optimizer"]) + + if ( + checkpoints + and "scheduler" in checkpoints + and checkpoints["scheduler"] is not None + ): + logging.info("Loading scheduler state dict") + scheduler.load_state_dict(checkpoints["scheduler"]) + + if params.print_diagnostics: + diagnostic = diagnostics.attach_diagnostics(model) + + ami = AmiAsrDataModule(args) + + train_cuts = ami.train_cuts() + train_cuts = train_cuts.filter(lambda c: 0.5 <= c.duration <= 35.0) + dev_cuts = ami.ami_cuts(split="dev", type="ihm-mix") + dev_cuts = dev_cuts.trim_to_supervision_groups(max_pause=0.0).filter( + lambda c: 0.2 <= c.duration <= 60.0 + ) + + if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: + # We only load the sampler's state dict when it loads a checkpoint + # saved in the middle of an epoch + sampler_state_dict = checkpoints["sampler"] + else: + sampler_state_dict = None + + train_dl = ami.train_dataloaders( + train_cuts, + sampler_state_dict=sampler_state_dict, + ) + valid_dl = ami.valid_dataloaders(dev_cuts) + + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) + if checkpoints and "grad_scaler" in checkpoints: + logging.info("Loading grad scaler state dict") + scaler.load_state_dict(checkpoints["grad_scaler"]) + + for epoch in range(params.start_epoch, params.num_epochs + 1): + scheduler.step_epoch(epoch 
- 1) + fix_random_seed(params.seed + epoch - 1) + train_dl.sampler.set_epoch(epoch - 1) + + if tb_writer is not None: + tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train) + + params.cur_epoch = epoch + + train_one_epoch( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sp=sp, + train_dl=train_dl, + valid_dl=valid_dl, + scaler=scaler, + tb_writer=tb_writer, + world_size=world_size, + rank=rank, + ) + + if params.print_diagnostics: + diagnostic.print_diagnostics() + break + + save_checkpoint( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + + logging.info("Done!") + + if world_size > 1: + torch.distributed.barrier() + cleanup_dist() + + +def display_and_save_batch( + batch: dict, + params: AttributeDict, + sp: spm.SentencePieceProcessor, +) -> None: + """Display the batch statistics and save the batch into disk. + + Args: + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + params: + Parameters for training. See :func:`get_params`. + sp: + The BPE model. + """ + from lhotse.utils import uuid4 + + filename = f"{params.exp_dir}/batch-{uuid4()}.pt" + logging.info(f"Saving batch to {filename}") + torch.save(batch, filename) + + features = batch["inputs"] + + logging.info(f"features shape: {features.shape}") + + y = [sp.encode(text_ch) for text_ch in batch["text"]] + num_tokens = [sum(len(yi) for yi in y_ch) for y_ch in y] + logging.info(f"num tokens: {num_tokens}") + + +def main(): + parser = get_parser() + AmiAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + world_size = args.world_size + assert world_size >= 1 + if world_size > 1: + mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True) + else: + run(rank=0, world_size=1, args=args) + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch.multiprocessing.set_sharing_strategy("file_system") + +if __name__ == "__main__": + main() diff --git a/egs/ami/SURT/dprnn_zipformer/zipformer.py b/egs/ami/SURT/dprnn_zipformer/zipformer.py new file mode 120000 index 000000000..59b772024 --- /dev/null +++ b/egs/ami/SURT/dprnn_zipformer/zipformer.py @@ -0,0 +1 @@ +../../../libricss/SURT/dprnn_zipformer/zipformer.py \ No newline at end of file diff --git a/egs/ami/SURT/local/add_source_feats.py b/egs/ami/SURT/local/add_source_feats.py new file mode 100755 index 000000000..0917b88a6 --- /dev/null +++ b/egs/ami/SURT/local/add_source_feats.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# Copyright 2022 Johns Hopkins University (authors: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file adds source features as temporal arrays to the mixture manifests. +It looks for manifests in the directory data/manifests. 
+""" +import logging +from pathlib import Path + +import numpy as np +from lhotse import CutSet, LilcomChunkyWriter, load_manifest, load_manifest_lazy +from tqdm import tqdm + + +def add_source_feats(): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + + logging.info("Reading mixed cuts") + mixed_cuts_clean = load_manifest_lazy(src_dir / "cuts_train_clean.jsonl.gz") + mixed_cuts_reverb = load_manifest_lazy(src_dir / "cuts_train_reverb.jsonl.gz") + + logging.info("Reading source cuts") + source_cuts = load_manifest(src_dir / "ihm_cuts_train_trimmed.jsonl.gz") + + logging.info("Adding source features to the mixed cuts") + pbar = tqdm(total=len(mixed_cuts_clean), desc="Adding source features") + with CutSet.open_writer( + src_dir / "cuts_train_clean_sources.jsonl.gz" + ) as cut_writer_clean, CutSet.open_writer( + src_dir / "cuts_train_reverb_sources.jsonl.gz" + ) as cut_writer_reverb, LilcomChunkyWriter( + output_dir / "feats_train_clean_sources" + ) as source_feat_writer: + for cut_clean, cut_reverb in zip(mixed_cuts_clean, mixed_cuts_reverb): + assert cut_reverb.id == cut_clean.id + "_rvb" + source_feats = [] + source_feat_offsets = [] + cur_offset = 0 + for sup in sorted( + cut_clean.supervisions, key=lambda s: (s.start, s.speaker) + ): + source_cut = source_cuts[sup.id] + source_feats.append(source_cut.load_features()) + source_feat_offsets.append(cur_offset) + cur_offset += source_cut.num_frames + cut_clean.source_feats = source_feat_writer.store_array( + cut_clean.id, np.concatenate(source_feats, axis=0) + ) + cut_clean.source_feat_offsets = source_feat_offsets + cut_writer_clean.write(cut_clean) + # Also write the reverb cut + cut_reverb.source_feats = cut_clean.source_feats + cut_reverb.source_feat_offsets = cut_clean.source_feat_offsets + cut_writer_reverb.write(cut_reverb) + pbar.update(1) + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + logging.basicConfig(format=formatter, level=logging.INFO) + add_source_feats() diff --git a/egs/ami/SURT/local/compute_fbank_aimix.py b/egs/ami/SURT/local/compute_fbank_aimix.py new file mode 100755 index 000000000..91b3a060b --- /dev/null +++ b/egs/ami/SURT/local/compute_fbank_aimix.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +# Copyright 2022 Johns Hopkins University (authors: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file computes fbank features of the synthetically mixed AMI and ICSI +train set. +It looks for manifests in the directory data/manifests. + +The generated fbank features are saved in data/fbank. 
+""" +import logging +import random +import warnings +from pathlib import Path + +import torch +import torch.multiprocessing +import torchaudio +from lhotse import ( + AudioSource, + LilcomChunkyWriter, + Recording, + load_manifest, + load_manifest_lazy, +) +from lhotse.audio import set_ffmpeg_torchaudio_info_enabled +from lhotse.cut import MixedCut, MixTrack, MultiCut +from lhotse.features.kaldifeat import ( + KaldifeatFbank, + KaldifeatFbankConfig, + KaldifeatFrameOptions, + KaldifeatMelOptions, +) +from lhotse.utils import fix_random_seed, uuid4 +from tqdm import tqdm + +# Torch's multithreaded behavior needs to be disabled or +# it wastes a lot of CPU and slow things down. +# Do this outside of main() in case it needs to take effect +# even when we are not invoking the main (e.g. when spawning subprocesses). +torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch.multiprocessing.set_sharing_strategy("file_system") +torchaudio.set_audio_backend("soundfile") +set_ffmpeg_torchaudio_info_enabled(False) + + +def compute_fbank_aimix(): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + + sampling_rate = 16000 + num_mel_bins = 80 + + extractor = KaldifeatFbank( + KaldifeatFbankConfig( + frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate), + mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins), + device="cuda", + ) + ) + + logging.info("Reading manifests") + train_cuts = load_manifest_lazy(src_dir / "ai-mix_cuts_clean_full.jsonl.gz") + + # only uses RIRs and noises from REVERB challenge + real_rirs = load_manifest(src_dir / "real-rir_recordings_all.jsonl.gz").filter( + lambda r: "RVB2014" in r.id + ) + noises = load_manifest(src_dir / "iso-noise_recordings_all.jsonl.gz").filter( + lambda r: "RVB2014" in r.id + ) + + # Apply perturbation to the training cuts + logging.info("Applying perturbation to the training cuts") + train_cuts_rvb = train_cuts.map( + lambda c: augment( + c, perturb_snr=True, rirs=real_rirs, noises=noises, perturb_loudness=True + ) + ) + + logging.info("Extracting fbank features for training cuts") + _ = train_cuts.compute_and_store_features_batch( + extractor=extractor, + storage_path=output_dir / "ai-mix_feats_clean", + manifest_path=src_dir / "cuts_train_clean.jsonl.gz", + batch_duration=5000, + num_workers=4, + storage_type=LilcomChunkyWriter, + overwrite=True, + ) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + _ = train_cuts_rvb.compute_and_store_features_batch( + extractor=extractor, + storage_path=output_dir / "ai-mix_feats_reverb", + manifest_path=src_dir / "cuts_train_reverb.jsonl.gz", + batch_duration=5000, + num_workers=4, + storage_type=LilcomChunkyWriter, + overwrite=True, + ) + + +def augment(cut, perturb_snr=False, rirs=None, noises=None, perturb_loudness=False): + """ + Given a mixed cut, this function optionally applies the following augmentations: + - Perturbing the SNRs of the tracks (in range [-5, 5] dB) + - Reverberation using a randomly selected RIR + - Adding noise + - Perturbing the loudness (in range [-20, -25] dB) + """ + out_cut = cut.drop_features() + + # Perturb the SNRs (optional) + if perturb_snr: + snrs = [random.uniform(-5, 5) for _ in range(len(cut.tracks))] + for i, (track, snr) in enumerate(zip(out_cut.tracks, snrs)): + if i == 0: + # Skip the first track since it is the reference + continue + track.snr = snr + + # Reverberate the cut (optional) + if rirs is not None: + # Select an RIR at random + rir = random.choice(rirs) + # Select a channel at random + rir_channel = 
random.choice(list(range(rir.num_channels))) + # Reverberate the cut + out_cut = out_cut.reverb_rir(rir_recording=rir, rir_channels=[rir_channel]) + + # Add noise (optional) + if noises is not None: + # Select a noise recording at random + noise = random.choice(noises).to_cut() + if isinstance(noise, MultiCut): + noise = noise.to_mono()[0] + # Select an SNR at random + snr = random.uniform(10, 30) + # Repeat the noise to match the duration of the cut + noise = repeat_cut(noise, out_cut.duration) + out_cut = MixedCut( + id=out_cut.id, + tracks=[ + MixTrack(cut=out_cut, type="MixedCut"), + MixTrack(cut=noise, type="DataCut", snr=snr), + ], + ) + + # Perturb the loudness (optional) + if perturb_loudness: + target_loudness = random.uniform(-20, -25) + out_cut = out_cut.normalize_loudness(target_loudness, mix_first=True) + return out_cut + + +def repeat_cut(cut, duration): + while cut.duration < duration: + cut = cut.mix(cut, offset_other_by=cut.duration) + return cut.truncate(duration=duration) + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + logging.basicConfig(format=formatter, level=logging.INFO) + + fix_random_seed(42) + compute_fbank_aimix() diff --git a/egs/ami/SURT/local/compute_fbank_ami.py b/egs/ami/SURT/local/compute_fbank_ami.py new file mode 100755 index 000000000..351b41765 --- /dev/null +++ b/egs/ami/SURT/local/compute_fbank_ami.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +# Copyright 2022 Johns Hopkins University (authors: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file computes fbank features of the AMI dataset. +We compute features for full recordings (i.e., without trimming to supervisions). +This way we can create arbitrary segmentations later. + +The generated fbank features are saved in data/fbank. +""" +import logging +import math +from pathlib import Path + +import torch +import torch.multiprocessing +from lhotse import CutSet, LilcomChunkyWriter +from lhotse.features.kaldifeat import ( + KaldifeatFbank, + KaldifeatFbankConfig, + KaldifeatFrameOptions, + KaldifeatMelOptions, +) +from lhotse.recipes.utils import read_manifests_if_cached + +# Torch's multithreaded behavior needs to be disabled or +# it wastes a lot of CPU and slow things down. +# Do this outside of main() in case it needs to take effect +# even when we are not invoking the main (e.g. when spawning subprocesses). 
+torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch.multiprocessing.set_sharing_strategy("file_system") + + +def compute_fbank_ami(): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + + sampling_rate = 16000 + num_mel_bins = 80 + + extractor = KaldifeatFbank( + KaldifeatFbankConfig( + frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate), + mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins), + device="cuda", + ) + ) + + logging.info("Reading manifests") + manifests = {} + for part in ["ihm-mix", "sdm", "mdm8-bf"]: + manifests[part] = read_manifests_if_cached( + dataset_parts=["train", "dev", "test"], + output_dir=src_dir, + prefix=f"ami-{part}", + suffix="jsonl.gz", + ) + + for part in ["ihm-mix", "sdm", "mdm8-bf"]: + for split in ["train", "dev", "test"]: + logging.info(f"Processing {part} {split}") + cuts = CutSet.from_manifests( + **manifests[part][split] + ).compute_and_store_features_batch( + extractor=extractor, + storage_path=output_dir / f"ami-{part}_{split}_feats", + manifest_path=src_dir / f"cuts_ami-{part}_{split}.jsonl.gz", + batch_duration=5000, + num_workers=4, + storage_type=LilcomChunkyWriter, + ) + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + logging.basicConfig(format=formatter, level=logging.INFO) + + compute_fbank_ami() diff --git a/egs/ami/SURT/local/compute_fbank_icsi.py b/egs/ami/SURT/local/compute_fbank_icsi.py new file mode 100755 index 000000000..4e2ff3f3b --- /dev/null +++ b/egs/ami/SURT/local/compute_fbank_icsi.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 +# Copyright 2022 Johns Hopkins University (authors: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file computes fbank features of the ICSI dataset. +We compute features for full recordings (i.e., without trimming to supervisions). +This way we can create arbitrary segmentations later. + +The generated fbank features are saved in data/fbank. +""" +import logging +import math +from pathlib import Path + +import torch +import torch.multiprocessing +from lhotse import CutSet, LilcomChunkyWriter +from lhotse.features.kaldifeat import ( + KaldifeatFbank, + KaldifeatFbankConfig, + KaldifeatFrameOptions, + KaldifeatMelOptions, +) +from lhotse.recipes.utils import read_manifests_if_cached + +# Torch's multithreaded behavior needs to be disabled or +# it wastes a lot of CPU and slow things down. +# Do this outside of main() in case it needs to take effect +# even when we are not invoking the main (e.g. when spawning subprocesses). 
+torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch.multiprocessing.set_sharing_strategy("file_system") + + +def compute_fbank_icsi(): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + + sampling_rate = 16000 + num_mel_bins = 80 + + extractor = KaldifeatFbank( + KaldifeatFbankConfig( + frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate), + mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins), + device="cuda", + ) + ) + + logging.info("Reading manifests") + manifests = {} + for part in ["ihm-mix", "sdm"]: + manifests[part] = read_manifests_if_cached( + dataset_parts=["train"], + output_dir=src_dir, + prefix=f"icsi-{part}", + suffix="jsonl.gz", + ) + + for part in ["ihm-mix", "sdm"]: + for split in ["train"]: + logging.info(f"Processing {part} {split}") + cuts = CutSet.from_manifests( + **manifests[part][split] + ).compute_and_store_features_batch( + extractor=extractor, + storage_path=output_dir / f"icsi-{part}_{split}_feats", + manifest_path=src_dir / f"cuts_icsi-{part}_{split}.jsonl.gz", + batch_duration=5000, + num_workers=4, + storage_type=LilcomChunkyWriter, + overwrite=True, + ) + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + logging.basicConfig(format=formatter, level=logging.INFO) + + compute_fbank_icsi() diff --git a/egs/ami/SURT/local/compute_fbank_ihm.py b/egs/ami/SURT/local/compute_fbank_ihm.py new file mode 100755 index 000000000..56f54aa21 --- /dev/null +++ b/egs/ami/SURT/local/compute_fbank_ihm.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# Copyright 2022 Johns Hopkins University (authors: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file computes fbank features of the trimmed sub-segments which will be +used for simulating the training mixtures. + +The generated fbank features are saved in data/fbank. +""" +import logging +import math +from pathlib import Path + +import torch +import torch.multiprocessing +import torchaudio +from lhotse import CutSet, LilcomChunkyWriter, load_manifest +from lhotse.audio import set_ffmpeg_torchaudio_info_enabled +from lhotse.features.kaldifeat import ( + KaldifeatFbank, + KaldifeatFbankConfig, + KaldifeatFrameOptions, + KaldifeatMelOptions, +) +from lhotse.recipes.utils import read_manifests_if_cached +from tqdm import tqdm + +# Torch's multithreaded behavior needs to be disabled or +# it wastes a lot of CPU and slow things down. +# Do this outside of main() in case it needs to take effect +# even when we are not invoking the main (e.g. when spawning subprocesses). 
+torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch.multiprocessing.set_sharing_strategy("file_system") +torchaudio.set_audio_backend("soundfile") +set_ffmpeg_torchaudio_info_enabled(False) + + +def compute_fbank_ihm(): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + + sampling_rate = 16000 + num_mel_bins = 80 + + extractor = KaldifeatFbank( + KaldifeatFbankConfig( + frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate), + mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins), + device="cuda", + ) + ) + + logging.info("Reading manifests") + manifests = {} + for data in ["ami", "icsi"]: + manifests[data] = read_manifests_if_cached( + dataset_parts=["train"], + output_dir=src_dir, + types=["recordings", "supervisions"], + prefix=f"{data}-ihm", + suffix="jsonl.gz", + ) + + logging.info("Computing features") + for data in ["ami", "icsi"]: + cs = CutSet.from_manifests(**manifests[data]["train"]) + cs = cs.trim_to_supervisions(keep_overlapping=False) + cs = cs.normalize_loudness(target=-23.0, affix_id=False) + cs = cs + cs.perturb_speed(0.9) + cs.perturb_speed(1.1) + _ = cs.compute_and_store_features_batch( + extractor=extractor, + storage_path=output_dir / f"{data}-ihm_train_feats", + manifest_path=src_dir / f"{data}-ihm_cuts_train.jsonl.gz", + batch_duration=5000, + num_workers=4, + storage_type=LilcomChunkyWriter, + overwrite=True, + ) + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + logging.basicConfig(format=formatter, level=logging.INFO) + + compute_fbank_ihm() diff --git a/egs/ami/SURT/local/prepare_ami_train_cuts.py b/egs/ami/SURT/local/prepare_ami_train_cuts.py new file mode 100755 index 000000000..72fced70d --- /dev/null +++ b/egs/ami/SURT/local/prepare_ami_train_cuts.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +# Copyright 2022 Johns Hopkins University (authors: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file creates AMI train segments. +""" +import logging +import math +from pathlib import Path + +import torch +import torch.multiprocessing +from lhotse import LilcomChunkyWriter, load_manifest_lazy +from lhotse.cut import Cut, CutSet +from lhotse.utils import EPSILON, add_durations +from tqdm import tqdm + + +def cut_into_windows(cuts: CutSet, duration: float): + """ + This function takes a CutSet and cuts each cut into windows of roughly + `duration` seconds. By roughly, we mean that we try to adjust for the last supervision + that exceeds the duration, or is shorter than the duration. 
+ """ + res = [] + with tqdm() as pbar: + for cut in cuts: + pbar.update(1) + sups = cut.index_supervisions()[cut.id] + sr = cut.sampling_rate + start = 0.0 + end = duration + num_tries = 0 + while start < cut.duration and num_tries < 2: + # Find the supervision that are cut by the window endpoint + hitlist = [iv for iv in sups.at(end) if iv.begin < end] + # If there are no supervisions, we are done + if not hitlist: + res.append( + cut.truncate( + offset=start, + duration=add_durations(end, -start, sampling_rate=sr), + keep_excessive_supervisions=False, + ) + ) + # Update the start and end for the next window + start = end + end = add_durations(end, duration, sampling_rate=sr) + else: + # find ratio of durations cut by the window endpoint + ratios = [ + add_durations(end, -iv.end, sampling_rate=sr) / iv.length() + for iv in hitlist + ] + # we retain the supervisions that have >50% of their duration + # in the window, and discard the others + retained = [] + discarded = [] + for iv, ratio in zip(hitlist, ratios): + if ratio > 0.5: + retained.append(iv) + else: + discarded.append(iv) + cur_end = max(iv.end for iv in retained) if retained else end + res.append( + cut.truncate( + offset=start, + duration=add_durations(cur_end, -start, sampling_rate=sr), + keep_excessive_supervisions=False, + ) + ) + # For the next window, we start at the earliest discarded supervision + next_start = min(iv.begin for iv in discarded) if discarded else end + next_end = add_durations(next_start, duration, sampling_rate=sr) + # It may happen that next_start is the same as start, in which case + # we will advance the window anyway + if next_start == start: + logging.warning( + f"Next start is the same as start: {next_start} == {start} for cut {cut.id}" + ) + start = end + EPSILON + end = add_durations(start, duration, sampling_rate=sr) + num_tries += 1 + else: + start = next_start + end = next_end + return CutSet.from_cuts(res) + + +def prepare_train_cuts(): + src_dir = Path("data/manifests") + + logging.info("Loading the manifests") + train_cuts_ihm = load_manifest_lazy( + src_dir / "cuts_ami-ihm-mix_train.jsonl.gz" + ).map(lambda c: c.with_id(f"{c.id}_ihm-mix")) + train_cuts_sdm = load_manifest_lazy(src_dir / "cuts_ami-sdm_train.jsonl.gz").map( + lambda c: c.with_id(f"{c.id}_sdm") + ) + train_cuts_mdm = load_manifest_lazy( + src_dir / "cuts_ami-mdm8-bf_train.jsonl.gz" + ).map(lambda c: c.with_id(f"{c.id}_mdm8-bf")) + + # Combine all cuts into one CutSet + train_cuts = train_cuts_ihm + train_cuts_sdm + train_cuts_mdm + + train_cuts_1 = train_cuts.trim_to_supervision_groups(max_pause=0.5) + train_cuts_2 = train_cuts.trim_to_supervision_groups(max_pause=0.0) + + # Combine the two segmentations + train_all = train_cuts_1 + train_cuts_2 + + # At this point, some of the cuts may be very long. We will cut them into windows of + # roughly 30 seconds. 
+ logging.info("Cutting the segments into windows of 30 seconds") + train_all_30 = cut_into_windows(train_all, duration=30.0) + logging.info(f"Number of cuts after cutting into windows: {len(train_all_30)}") + + # Show statistics + train_all.describe(full=True) + + # Save the cuts + logging.info("Saving the cuts") + train_all.to_file(src_dir / "cuts_train_ami.jsonl.gz") + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + logging.basicConfig(format=formatter, level=logging.INFO) + + prepare_train_cuts() diff --git a/egs/ami/SURT/local/prepare_icsi_train_cuts.py b/egs/ami/SURT/local/prepare_icsi_train_cuts.py new file mode 100755 index 000000000..818e26bfb --- /dev/null +++ b/egs/ami/SURT/local/prepare_icsi_train_cuts.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# Copyright 2022 Johns Hopkins University (authors: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file creates ICSI train segments. +""" +import logging +from pathlib import Path + +from lhotse import load_manifest_lazy +from prepare_ami_train_cuts import cut_into_windows + + +def prepare_train_cuts(): + src_dir = Path("data/manifests") + + logging.info("Loading the manifests") + train_cuts_ihm = load_manifest_lazy( + src_dir / "cuts_icsi-ihm-mix_train.jsonl.gz" + ).map(lambda c: c.with_id(f"{c.id}_ihm-mix")) + train_cuts_sdm = load_manifest_lazy(src_dir / "cuts_icsi-sdm_train.jsonl.gz").map( + lambda c: c.with_id(f"{c.id}_sdm") + ) + + # Combine all cuts into one CutSet + train_cuts = train_cuts_ihm + train_cuts_sdm + + train_cuts_1 = train_cuts.trim_to_supervision_groups(max_pause=0.5) + train_cuts_2 = train_cuts.trim_to_supervision_groups(max_pause=0.0) + + # Combine the two segmentations + train_all = train_cuts_1 + train_cuts_2 + + # At this point, some of the cuts may be very long. We will cut them into windows of + # roughly 30 seconds. 
+ logging.info("Cutting the segments into windows of 30 seconds") + train_all_30 = cut_into_windows(train_all, duration=30.0) + logging.info(f"Number of cuts after cutting into windows: {len(train_all_30)}") + + # Show statistics + train_all.describe(full=True) + + # Save the cuts + logging.info("Saving the cuts") + train_all.to_file(src_dir / "cuts_train_icsi.jsonl.gz") + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + logging.basicConfig(format=formatter, level=logging.INFO) + + prepare_train_cuts() diff --git a/egs/ami/SURT/local/prepare_lang_bpe.py b/egs/ami/SURT/local/prepare_lang_bpe.py new file mode 120000 index 000000000..36b40e7fc --- /dev/null +++ b/egs/ami/SURT/local/prepare_lang_bpe.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/prepare_lang_bpe.py \ No newline at end of file diff --git a/egs/ami/SURT/local/train_bpe_model.py b/egs/ami/SURT/local/train_bpe_model.py new file mode 120000 index 000000000..6fad36421 --- /dev/null +++ b/egs/ami/SURT/local/train_bpe_model.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/train_bpe_model.py \ No newline at end of file diff --git a/egs/ami/SURT/prepare.sh b/egs/ami/SURT/prepare.sh new file mode 100755 index 000000000..ea4e5baf2 --- /dev/null +++ b/egs/ami/SURT/prepare.sh @@ -0,0 +1,195 @@ +#!/usr/bin/env bash + +set -eou pipefail + +stage=-1 +stop_stage=100 + +# We assume dl_dir (download dir) contains the following +# directories and files. If not, they will be downloaded +# by this script automatically. +# +# - $dl_dir/ami +# You can find audio and transcripts for AMI in this path. +# +# - $dl_dir/icsi +# You can find audio and transcripts for ICSI in this path. +# +# - $dl_dir/rirs_noises +# This directory contains the RIRS_NOISES corpus downloaded from https://openslr.org/28/. +# +dl_dir=$PWD/download + +. shared/parse_options.sh || exit 1 + +# All files generated by this script are saved in "data". +# You can safely remove "data" and rerun this script to regenerate it. +mkdir -p data +vocab_size=500 + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "dl_dir: $dl_dir" + +if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then + log "Stage 0: Download data" + + # If you have pre-downloaded it to /path/to/amicorpus, + # you can create a symlink + # + # ln -sfv /path/to/amicorpus $dl_dir/amicorpus + # + if [ ! -d $dl_dir/amicorpus ]; then + for mic in ihm ihm-mix sdm mdm8-bf; do + lhotse download ami --mic $mic $dl_dir/amicorpus + done + fi + + # If you have pre-downloaded it to /path/to/icsi, + # you can create a symlink + # + # ln -sfv /path/to/icsi $dl_dir/icsi + # + if [ ! -d $dl_dir/icsi ]; then + lhotse download icsi $dl_dir/icsi + fi + + # If you have pre-downloaded it to /path/to/rirs_noises, + # you can create a symlink + # + # ln -sfv /path/to/rirs_noises $dl_dir/ + # + if [ ! -d $dl_dir/rirs_noises ]; then + lhotse download rirs_noises $dl_dir + fi +fi + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then + log "Stage 1: Prepare AMI manifests" + # We assume that you have downloaded the AMI corpus + # to $dl_dir/amicorpus. We perform text normalization for the transcripts. 
+ mkdir -p data/manifests + for mic in ihm ihm-mix sdm mdm8-bf; do + log "Preparing AMI manifest for $mic" + lhotse prepare ami --mic $mic --max-words-per-segment 30 --merge-consecutive $dl_dir/amicorpus data/manifests/ + done +fi + +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then + log "Stage 2: Prepare ICSI manifests" + # We assume that you have downloaded the ICSI corpus + # to $dl_dir/icsi. We perform text normalization for the transcripts. + mkdir -p data/manifests + log "Preparing ICSI manifest" + for mic in ihm ihm-mix sdm; do + lhotse prepare icsi --mic $mic $dl_dir/icsi data/manifests/ + done +fi + +if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then + log "Stage 3: Prepare RIRs" + # We assume that you have downloaded the RIRS_NOISES corpus + # to $dl_dir/rirs_noises + lhotse prepare rir-noise -p real_rir -p iso_noise $dl_dir/rirs_noises data/manifests +fi + +if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then + log "Stage 3: Extract features for AMI and ICSI recordings" + python local/compute_fbank_ami.py + python local/compute_fbank_icsi.py +fi + +if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then + log "Stage 5: Create sources for simulating mixtures" + # In the following script, we speed-perturb the IHM recordings and extract features. + python local/compute_fbank_ihm.py + lhotse combine data/manifests/ami-ihm_cuts_train.jsonl.gz \ + data/manifests/icsi-ihm_cuts_train.jsonl.gz - |\ + lhotse cut trim-to-alignments --type word --max-pause 0.5 - - |\ + lhotse filter 'duration<=12.0' - - |\ + shuf | gzip -c > data/manifests/ihm_cuts_train_trimmed.jsonl.gz +fi + +if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then + log "Stage 6: Create training mixtures" + lhotse workflows simulate-meetings \ + --method conversational \ + --same-spk-pause 0.5 \ + --diff-spk-pause 0.5 \ + --diff-spk-overlap 1.0 \ + --prob-diff-spk-overlap 0.8 \ + --num-meetings 200000 \ + --num-speakers-per-meeting 2,3 \ + --max-duration-per-speaker 15.0 \ + --max-utterances-per-speaker 3 \ + --seed 1234 \ + --num-jobs 2 \ + data/manifests/ihm_cuts_train_trimmed.jsonl.gz \ + data/manifests/ai-mix_cuts_clean.jsonl.gz + + python local/compute_fbank_aimix.py + + # Add source features to the manifest (will be used for masking loss) + # This may take ~2 hours. + python local/add_source_feats.py + + # Combine clean and reverb + cat <(gunzip -c data/manifests/cuts_train_clean_sources.jsonl.gz) \ + <(gunzip -c data/manifests/cuts_train_reverb_sources.jsonl.gz) |\ + shuf | gzip -c > data/manifests/cuts_train_comb_sources.jsonl.gz +fi + +if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then + log "Stage 7: Create training mixtures from real sessions" + python local/prepare_ami_train_cuts.py + python local/prepare_icsi_train_cuts.py + + # Combine AMI and ICSI + cat <(gunzip -c data/manifests/cuts_train_ami.jsonl.gz) \ + <(gunzip -c data/manifests/cuts_train_icsi.jsonl.gz) |\ + shuf | gzip -c > data/manifests/cuts_train_ami_icsi.jsonl.gz +fi + +if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then + log "Stage 8: Dump transcripts for BPE model training (using AMI and ICSI)." 
+ mkdir -p data/lm + cat <(gunzip -c data/manifests/ami-sdm_supervisions_train.jsonl.gz | jq '.text' | sed 's:"::g') \ + <(gunzip -c data/manifests/icsi-sdm_supervisions_train.jsonl.gz | jq '.text' | sed 's:"::g') \ + > data/lm/transcript_words.txt +fi + +if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then + log "Stage 9: Prepare BPE based lang (combining AMI and ICSI)" + + lang_dir=data/lang_bpe_${vocab_size} + mkdir -p $lang_dir + + # Add special words to words.txt + echo " 0" > $lang_dir/words.txt + echo "!SIL 1" >> $lang_dir/words.txt + echo " 2" >> $lang_dir/words.txt + + # Add regular words to words.txt + cat data/lm/transcript_words.txt | grep -o -E '\w+' | sort -u | awk '{print $0,NR+2}' >> $lang_dir/words.txt + + # Add remaining special word symbols expected by LM scripts. + num_words=$(cat $lang_dir/words.txt | wc -l) + echo " ${num_words}" >> $lang_dir/words.txt + num_words=$(cat $lang_dir/words.txt | wc -l) + echo " ${num_words}" >> $lang_dir/words.txt + num_words=$(cat $lang_dir/words.txt | wc -l) + echo "#0 ${num_words}" >> $lang_dir/words.txt + + ./local/train_bpe_model.py \ + --lang-dir $lang_dir \ + --vocab-size $vocab_size \ + --transcript data/lm/transcript_words.txt + + if [ ! -f $lang_dir/L_disambig.pt ]; then + ./local/prepare_lang_bpe.py --lang-dir $lang_dir + fi +fi diff --git a/egs/ami/SURT/shared b/egs/ami/SURT/shared new file mode 120000 index 000000000..4cbd91a7e --- /dev/null +++ b/egs/ami/SURT/shared @@ -0,0 +1 @@ +../../../icefall/shared \ No newline at end of file diff --git a/egs/commonvoice/ASR/local/preprocess_commonvoice.py b/egs/commonvoice/ASR/local/preprocess_commonvoice.py index c5ec14502..e60459765 100755 --- a/egs/commonvoice/ASR/local/preprocess_commonvoice.py +++ b/egs/commonvoice/ASR/local/preprocess_commonvoice.py @@ -45,7 +45,7 @@ def get_args(): def normalize_text(utt: str) -> str: utt = re.sub(r"[{0}]+".format("-"), " ", utt) - return re.sub(r"[^a-zA-Z\s]", "", utt).upper() + return re.sub(r"[^a-zA-Z\s']", "", utt).upper() def preprocess_commonvoice( diff --git a/egs/libricss/SURT/README.md b/egs/libricss/SURT/README.md new file mode 100644 index 000000000..10a1aaad1 --- /dev/null +++ b/egs/libricss/SURT/README.md @@ -0,0 +1,249 @@ +# Introduction + +This is a multi-talker ASR recipe for the LibriCSS dataset. We train a Streaming +Unmixing and Recognition Transducer (SURT) model for the task. In this README, +we will describe the task, the model, and the training process. We will also +provide links to pre-trained models and training logs. + +## Task + +LibriCSS is a multi-talker meeting corpus formed from mixing together LibriSpeech utterances +and replaying in a real meeting room. It consists of 10 1-hour sessions of audio, each +recorded on a 7-channel microphone. The sessions are recorded at a sampling rate of 16 kHz. +For more information, refer to the paper: +Z. Chen et al., "Continuous speech separation: dataset and analysis," +ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), +Barcelona, Spain, 2020 + +In this recipe, we perform the "continuous, streaming, multi-talker ASR" task on LibriCSS. + +* By "continuous", we mean that the model should be able to transcribe unsegmented audio +without the need of an external VAD. +* By "streaming", we mean that the model has limited right context. We use a right-context +of at most 32 frames (320 ms). +* By "multi-talker", we mean that the model should be able to transcribe overlapping speech +from multiple speakers. 
+ +For now, we do not care about speaker attribution, i.e., the transcription is speaker +agnostic. The evaluation depends on the particular model type. In this case, we use +the optimal reference combination WER (ORC-WER) metric as implemented in the +[meeteval](https://github.com/fgnt/meeteval) toolkit. + +## Model + +We use the Streaming Unmixing and Recognition Transducer (SURT) model for this task. +The model is based on the papers: + +- Lu, Liang et al. “Streaming End-to-End Multi-Talker Speech Recognition.” IEEE Signal Processing Letters 28 (2020): 803-807. +- Raj, Desh et al. “Continuous Streaming Multi-Talker ASR with Dual-Path Transducers.” ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2021): 7317-7321. + +The model is a combination of a speech separation model and a speech recognition model, +but trained end-to-end with a single loss function. The overall architecture is shown +in the figure below. Note that this architecture is slightly different from the one +in the above papers. A detailed description of the model can be found in the following +paper: [SURT 2.0: Advanced in transducer-based multi-talker ASR](https://arxiv.org/abs/2306.10559). + +

+*Figure: Streaming Unmixing and Recognition Transducer.*
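+
+At a high level, the wiring is: a masking network predicts one mask per output
+channel, each masked copy of the input features is passed through a shared
+recognition encoder, and each branch is trained against the references assigned
+to it. The toy module below only illustrates this wiring; the class name, layer
+choices, and dimensions are placeholders, not the actual DPRNN or Zipformer code
+used in this recipe.
+
+```python
+import torch
+import torch.nn as nn
+
+
+class ToySURT(nn.Module):
+    """Toy 2-branch SURT wiring: masks -> masked features -> shared encoder."""
+
+    def __init__(self, num_channels: int = 2, feat_dim: int = 80, hidden_dim: int = 256):
+        super().__init__()
+        # Stand-in for the DPRNN masking network: predicts one mask per channel.
+        self.mask_net = nn.Sequential(
+            nn.Linear(feat_dim, hidden_dim),
+            nn.ReLU(),
+            nn.Linear(hidden_dim, feat_dim * num_channels),
+            nn.Sigmoid(),
+        )
+        # Stand-in for the Zipformer recognition encoder (shared by all channels).
+        self.encoder = nn.LSTM(feat_dim, hidden_dim, batch_first=True)
+        self.num_channels = num_channels
+
+    def forward(self, feats: torch.Tensor):
+        # feats: (batch, time, feat_dim)
+        masks = self.mask_net(feats).chunk(self.num_channels, dim=-1)
+        # Each masked stream goes through the *same* encoder; during training each
+        # branch feeds its own transducer loss.
+        return [self.encoder(feats * m)[0] for m in masks]
+
+
+out = ToySURT()(torch.randn(1, 100, 80))
+print([o.shape for o in out])  # two streams of shape (1, 100, 256)
+```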
+
+In the [dprnn_zipformer](./dprnn_zipformer) recipe, for example, we use a DPRNN-based masking network
+and a Zipformer-based recognition network. But other combinations are possible as well.
+
+## Training objective
+
+We train the model using the pruned transducer loss, similar to other ASR recipes in
+icefall. However, an important consideration is how to assign references to the output
+channels (2 in this case). For this, we use the heuristic error assignment training (HEAT)
+strategy, which assigns references to the first available channel based on their start
+times. An illustrative example is shown in the figure below:
+

+*Figure: Illustration of HEAT-based reference assignment.*
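+
+To make the heuristic concrete, here is a minimal, self-contained sketch of the
+assignment rule (the function name and structure are made up for illustration and
+are not the code used in this recipe): each reference utterance, taken in order of
+start time, goes to the first output channel that has already finished its
+previously assigned utterance.
+
+```python
+from typing import List, Tuple
+
+
+def heat_assign(
+    utterances: List[Tuple[float, float, str]], num_channels: int = 2
+) -> List[List[str]]:
+    """Assign (start, end, text) references to output channels, HEAT-style."""
+    channel_free_at = [0.0] * num_channels  # time at which each channel becomes free
+    channels: List[List[str]] = [[] for _ in range(num_channels)]
+    for start, end, text in sorted(utterances):
+        for c in range(num_channels):
+            if channel_free_at[c] <= start:  # first available channel wins
+                channels[c].append(text)
+                channel_free_at[c] = end
+                break
+        # If no channel is free (more simultaneous speakers than channels),
+        # this toy version simply drops the utterance.
+    return channels
+
+
+# Two overlapping utterances end up on different channels:
+refs = [(0.0, 2.0, "hello there"), (1.5, 3.0, "good morning"), (3.5, 5.0, "bye")]
+print(heat_assign(refs))  # [['hello there', 'bye'], ['good morning']]
+```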
+ +## Description of the recipe + +### Pre-requisites + +The recipes in this directory need the following packages to be installed: + +- [meeteval](https://github.com/fgnt/meeteval) +- [einops](https://github.com/arogozhnikov/einops) + +Additionally, we initialize the "recognition" transducer with a pre-trained model, +trained on LibriSpeech. For this, please run the following from within `egs/librispeech/ASR`: + +```bash +./prepare.sh + +export CUDA_VISIBLE_DEVICES="0,1,2,3" +python pruned_transducer_stateless7_streaming/train.py \ + --use-fp16 True \ + --exp-dir pruned_transducer_stateless7_streaming/exp \ + --world-size 4 \ + --max-duration 800 \ + --num-epochs 10 \ + --keep-last-k 1 \ + --manifest-dir data/manifests \ + --enable-musan true \ + --master-port 54321 \ + --bpe-model data/lang_bpe_500/bpe.model \ + --num-encoder-layers 2,2,2,2,2 \ + --feedforward-dims 768,768,768,768,768 \ + --nhead 8,8,8,8,8 \ + --encoder-dims 256,256,256,256,256 \ + --attention-dims 192,192,192,192,192 \ + --encoder-unmasked-dims 192,192,192,192,192 \ + --zipformer-downsampling-factors 1,2,4,8,2 \ + --cnn-module-kernels 31,31,31,31,31 \ + --decoder-dim 512 \ + --joiner-dim 512 +``` + +The above is for SURT-base (~26M). For SURT-large (~38M), use `--num-encoder-layers 2,4,3,2,4`. + +Once the above model is trained for 10 epochs, copy it to `egs/libricss/SURT/exp`: + +```bash +cp -r pruned_transducer_stateless7_streaming/exp/epoch-10.pt exp/zipformer_base.pt +``` + +**NOTE:** We also provide this pre-trained checkpoint (see the section below), so you can skip +the above step if you want. + +### Training + +To train the model, run the following from within `egs/libricss/SURT`: + +```bash +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +python dprnn_zipformer/train.py \ + --use-fp16 True \ + --exp-dir dprnn_zipformer/exp/surt_base \ + --world-size 4 \ + --max-duration 500 \ + --max-duration-valid 250 \ + --max-cuts 200 \ + --num-buckets 50 \ + --num-epochs 30 \ + --enable-spec-aug True \ + --enable-musan False \ + --ctc-loss-scale 0.2 \ + --heat-loss-scale 0.2 \ + --base-lr 0.004 \ + --model-init-ckpt exp/zipformer_base.pt \ + --chunk-width-randomization True \ + --num-mask-encoder-layers 4 \ + --num-encoder-layers 2,2,2,2,2 +``` + +The above is for SURT-base (~26M). For SURT-large (~38M), use: + +```bash + --num-mask-encoder-layers 6 \ + --num-encoder-layers 2,4,3,2,4 \ + --model-init-ckpt exp/zipformer_large.pt \ +``` + +**NOTE:** You may need to decrease the `--max-duration` for SURT-large to avoid OOM. + +### Adaptation + +The training step above only trains on simulated mixtures. For best results, we also +adapt the final model on the LibriCSS dev set. 
For this, run the following from within +`egs/libricss/SURT`: + +```bash +export CUDA_VISIBLE_DEVICES="0" + +python dprnn_zipformer/train_adapt.py \ + --use-fp16 True \ + --exp-dir dprnn_zipformer/exp/surt_base_adapt \ + --world-size 1 \ + --max-duration 500 \ + --max-duration-valid 250 \ + --max-cuts 200 \ + --num-buckets 50 \ + --num-epochs 8 \ + --lr-epochs 2 \ + --enable-spec-aug True \ + --enable-musan False \ + --ctc-loss-scale 0.2 \ + --base-lr 0.0004 \ + --model-init-ckpt dprnn_zipformer/exp/surt_base/epoch-30.pt \ + --chunk-width-randomization True \ + --num-mask-encoder-layers 4 \ + --num-encoder-layers 2,2,2,2,2 +``` + +For SURT-large, use the following config: + +```bash + --num-mask-encoder-layers 6 \ + --num-encoder-layers 2,4,3,2,4 \ + --model-init-ckpt dprnn_zipformer/exp/surt_large/epoch-30.pt \ + --num-epochs 15 \ + --lr-epochs 4 \ +``` + + +### Decoding + +To decode the model, run the following from within `egs/libricss/SURT`: + +#### Greedy search + +```bash +export CUDA_VISIBLE_DEVICES="0" + +python dprnn_zipformer/decode.py \ + --epoch 8 --avg 1 --use-averaged-model False \ + --exp-dir dprnn_zipformer/exp/surt_base_adapt \ + --max-duration 250 \ + --decoding-method greedy_search +``` + +#### Beam search + +```bash +python dprnn_zipformer/decode.py \ + --epoch 8 --avg 1 --use-averaged-model False \ + --exp-dir dprnn_zipformer/exp/surt_base_adapt \ + --max-duration 250 \ + --decoding-method modified_beam_search \ + --beam-size 4 +``` + +## Results (using beam search) + +#### IHM-Mix + +| Model | # params | 0L | 0S | OV10 | OV20 | OV30 | OV40 | Avg. | +|------------|:-------:|:----:|:---:|----:|:----:|:----:|:----:|:----:| +| dprnn_zipformer (base) | 26.7 | 5.1 | 4.2 | 13.7 | 18.7 | 20.5 | 20.6 | 13.8 | +| dprnn_zipformer (large) | 37.9 | 4.6 | 3.8 | 12.7 | 14.3 | 16.7 | 21.2 | 12.2 | + +#### SDM + +| Model | # params | 0L | 0S | OV10 | OV20 | OV30 | OV40 | Avg. | +|------------|:-------:|:----:|:---:|----:|:----:|:----:|:----:|:----:| +| dprnn_zipformer (base) | 26.7 | 6.8 | 7.2 | 21.4 | 24.5 | 28.6 | 31.2 | 20.0 | +| dprnn_zipformer (large) | 37.9 | 6.4 | 6.9 | 17.9 | 19.7 | 25.2 | 25.5 | 16.9 | + +## Pre-trained models and logs + +* Pre-trained models: + +* Training logs: + - surt_base: + - surt_base_adapt: + - surt_large: + - surt_large_adapt: diff --git a/egs/libricss/SURT/dprnn_zipformer/asr_datamodule.py b/egs/libricss/SURT/dprnn_zipformer/asr_datamodule.py new file mode 100644 index 000000000..51df91598 --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/asr_datamodule.py @@ -0,0 +1,372 @@ +# Copyright 2021 Piotr Żelasko +# Copyright 2022 Xiaomi Corporation (Author: Mingshuang Luo) +# Copyright 2023 Johns Hopkins Univrtsity (Author: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import argparse +import inspect +import logging +from functools import lru_cache +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional + +import torch +from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy +from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures + CutMix, + DynamicBucketingSampler, + K2SurtDataset, + PrecomputedFeatures, + SimpleCutSampler, + SpecAugment, +) +from lhotse.dataset.input_strategies import OnTheFlyFeatures +from lhotse.utils import fix_random_seed +from torch.utils.data import DataLoader + +from icefall.utils import str2bool + + +class _SeedWorkers: + def __init__(self, seed: int): + self.seed = seed + + def __call__(self, worker_id: int): + fix_random_seed(self.seed + worker_id) + + +class LibriCssAsrDataModule: + """ + DataModule for k2 ASR experiments. + It assumes there is always one train and valid dataloader, + but there can be multiple test dataloaders (e.g. LibriSpeech test-clean + and test-other). + + It contains all the common data pipeline modules used in ASR + experiments, e.g.: + - dynamic batch size, + - bucketing samplers, + - augmentation, + - on-the-fly feature extraction + + This class should be derived for specific corpora used in ASR tasks. + """ + + def __init__(self, args: argparse.Namespace): + self.args = args + + @classmethod + def add_arguments(cls, parser: argparse.ArgumentParser): + group = parser.add_argument_group( + title="ASR data related options", + description="These options are used for the preparation of " + "PyTorch DataLoaders from Lhotse CutSet's -- they control the " + "effective batch sizes, sampling strategies, applied data " + "augmentations, etc.", + ) + group.add_argument( + "--manifest-dir", + type=Path, + default=Path("data/manifests"), + help="Path to directory with train/valid/test cuts.", + ) + group.add_argument( + "--max-duration", + type=int, + default=200.0, + help="Maximum pooled recordings duration (seconds) in a " + "single batch. You can reduce it if it causes CUDA OOM.", + ) + group.add_argument( + "--max-duration-valid", + type=int, + default=200.0, + help="Maximum pooled recordings duration (seconds) in a " + "single batch. You can reduce it if it causes CUDA OOM.", + ) + group.add_argument( + "--max-cuts", + type=int, + default=100, + help="Maximum number of cuts in a single batch. You can " + "reduce it if it causes CUDA OOM.", + ) + group.add_argument( + "--bucketing-sampler", + type=str2bool, + default=True, + help="When enabled, the batches will come from buckets of " + "similar duration (saves padding frames).", + ) + group.add_argument( + "--num-buckets", + type=int, + default=30, + help="The number of buckets for the DynamicBucketingSampler" + "(you might want to increase it for larger datasets).", + ) + group.add_argument( + "--on-the-fly-feats", + type=str2bool, + default=False, + help=( + "When enabled, use on-the-fly cut mixing and feature " + "extraction. Will drop existing precomputed feature manifests " + "if available." + ), + ) + group.add_argument( + "--shuffle", + type=str2bool, + default=True, + help="When enabled (=default), the examples will be " + "shuffled for each epoch.", + ) + group.add_argument( + "--drop-last", + type=str2bool, + default=True, + help="Whether to drop last batch. 
Used by sampler.", + ) + group.add_argument( + "--return-cuts", + type=str2bool, + default=True, + help="When enabled, each batch will have the " + "field: batch['supervisions']['cut'] with the cuts that " + "were used to construct it.", + ) + + group.add_argument( + "--num-workers", + type=int, + default=2, + help="The number of training dataloader workers that " + "collect the batches.", + ) + + group.add_argument( + "--enable-spec-aug", + type=str2bool, + default=True, + help="When enabled, use SpecAugment for training dataset.", + ) + + group.add_argument( + "--spec-aug-time-warp-factor", + type=int, + default=80, + help="Used only when --enable-spec-aug is True. " + "It specifies the factor for time warping in SpecAugment. " + "Larger values mean more warping. " + "A value less than 1 means to disable time warp.", + ) + + group.add_argument( + "--enable-musan", + type=str2bool, + default=True, + help="When enabled, select noise from MUSAN and mix it" + "with training dataset. ", + ) + + def train_dataloaders( + self, + cuts_train: CutSet, + sampler_state_dict: Optional[Dict[str, Any]] = None, + return_sources: bool = True, + strict: bool = True, + ) -> DataLoader: + """ + Args: + cuts_train: + CutSet for training. + sampler_state_dict: + The state dict for the training sampler. + """ + transforms = [] + if self.args.enable_musan: + logging.info("Enable MUSAN") + logging.info("About to get Musan cuts") + cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz") + transforms.append( + CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True) + ) + else: + logging.info("Disable MUSAN") + + input_transforms = [] + if self.args.enable_spec_aug: + logging.info("Enable SpecAugment") + logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}") + # Set the value of num_frame_masks according to Lhotse's version. + # In different Lhotse's versions, the default of num_frame_masks is + # different. 
+            num_frame_masks = 10
+            num_frame_masks_parameter = inspect.signature(
+                SpecAugment.__init__
+            ).parameters["num_frame_masks"]
+            if num_frame_masks_parameter.default == 1:
+                num_frame_masks = 2
+            logging.info(f"Num frame mask: {num_frame_masks}")
+            input_transforms.append(
+                SpecAugment(
+                    time_warp_factor=self.args.spec_aug_time_warp_factor,
+                    num_frame_masks=num_frame_masks,
+                    features_mask_size=27,
+                    num_feature_masks=2,
+                    frames_mask_size=100,
+                )
+            )
+        else:
+            logging.info("Disable SpecAugment")
+
+        logging.info("About to create train dataset")
+        train = K2SurtDataset(
+            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            if self.args.on_the_fly_feats
+            else PrecomputedFeatures(),
+            cut_transforms=transforms,
+            input_transforms=input_transforms,
+            return_cuts=self.args.return_cuts,
+            return_sources=return_sources,
+            strict=strict,
+        )
+
+        if self.args.bucketing_sampler:
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                quadratic_duration=30.0,
+                max_cuts=self.args.max_cuts,
+                shuffle=self.args.shuffle,
+                num_buckets=self.args.num_buckets,
+                drop_last=self.args.drop_last,
+            )
+        else:
+            logging.info("Using SimpleCutSampler.")
+            train_sampler = SimpleCutSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                max_cuts=self.args.max_cuts,
+                shuffle=self.args.shuffle,
+            )
+        logging.info("About to create train dataloader")
+
+        if sampler_state_dict is not None:
+            logging.info("Loading sampler state dict")
+            train_sampler.load_state_dict(sampler_state_dict)
+
+        # 'seed' is derived from the current random state, which will have
+        # previously been set in the main process.
+        seed = torch.randint(0, 100000, ()).item()
+        worker_init_fn = _SeedWorkers(seed)
+
+        train_dl = DataLoader(
+            train,
+            sampler=train_sampler,
+            batch_size=None,
+            num_workers=self.args.num_workers,
+            persistent_workers=False,
+            worker_init_fn=worker_init_fn,
+        )
+
+        return train_dl
+
+    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
+        transforms = []
+
+        logging.info("About to create dev dataset")
+        validate = K2SurtDataset(
+            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            if self.args.on_the_fly_feats
+            else PrecomputedFeatures(),
+            cut_transforms=transforms,
+            return_cuts=self.args.return_cuts,
+            return_sources=False,
+            strict=False,
+        )
+        valid_sampler = DynamicBucketingSampler(
+            cuts_valid,
+            max_duration=self.args.max_duration_valid,
+            max_cuts=self.args.max_cuts,
+            shuffle=False,
+        )
+        logging.info("About to create dev dataloader")
+        valid_dl = DataLoader(
+            validate,
+            sampler=valid_sampler,
+            batch_size=None,
+            num_workers=2,
+            persistent_workers=False,
+        )
+
+        return valid_dl
+
+    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
+        logging.debug("About to create test dataset")
+        test = K2SurtDataset(
+            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            if self.args.on_the_fly_feats
+            else PrecomputedFeatures(),
+            return_cuts=self.args.return_cuts,
+            return_sources=False,
+            strict=False,
+        )
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration_valid,
+            max_cuts=self.args.max_cuts,
+            shuffle=False,
+        )
+        logging.debug("About to create test dataloader")
+        test_dl = DataLoader(
+            test,
+            batch_size=None,
+            sampler=sampler,
+            num_workers=self.args.num_workers,
+        )
+        return test_dl
+
+    @lru_cache()
+    def lsmix_cuts(
+        self,
+        rvb_affix: str = "clean",
+
type_affix: str = "full", + sources: bool = True, + ) -> CutSet: + logging.info("About to get train cuts") + source_affix = "_sources" if sources else "" + cs = load_manifest_lazy( + self.args.manifest_dir + / f"cuts_train_{rvb_affix}_{type_affix}{source_affix}.jsonl.gz" + ) + cs = cs.filter(lambda c: c.duration >= 1.0 and c.duration <= 30.0) + return cs + + @lru_cache() + def libricss_cuts(self, split="dev", type="sdm") -> CutSet: + logging.info(f"About to get LibriCSS {split} {type} cuts") + cs = load_manifest_lazy( + self.args.manifest_dir / f"cuts_{split}_libricss-{type}.jsonl.gz" + ) + return cs diff --git a/egs/libricss/SURT/dprnn_zipformer/beam_search.py b/egs/libricss/SURT/dprnn_zipformer/beam_search.py new file mode 100644 index 000000000..c8e4643d0 --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/beam_search.py @@ -0,0 +1,730 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang +# Xiaoyu Yang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Tuple, Union + +import k2 +import torch +from model import SURT + +from icefall import NgramLmStateCost +from icefall.utils import DecodingResults + + +def greedy_search( + model: SURT, + encoder_out: torch.Tensor, + max_sym_per_frame: int, + return_timestamps: bool = False, +) -> Union[List[int], DecodingResults]: + """Greedy search for a single utterance. + Args: + model: + An instance of `SURT`. + encoder_out: + A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. + max_sym_per_frame: + Maximum number of symbols per frame. If it is set to 0, the WER + would be 100%. + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. + """ + assert encoder_out.ndim == 4 + + # support only batch_size == 1 for now + assert encoder_out.size(0) == 1, encoder_out.size(0) + + blank_id = model.decoder.blank_id + context_size = model.decoder.context_size + unk_id = getattr(model, "unk_id", blank_id) + + device = next(model.parameters()).device + + decoder_input = torch.tensor( + [-1] * (context_size - 1) + [blank_id], device=device, dtype=torch.int64 + ).reshape(1, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + + encoder_out = model.joiner.encoder_proj(encoder_out) + + T = encoder_out.size(1) + t = 0 + hyp = [blank_id] * context_size + + # timestamp[i] is the frame index after subsampling + # on which hyp[i] is decoded + timestamp = [] + + # Maximum symbols per utterance. 
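+    # (a safety cap: the loop below also stops once this many symbols have been
+    # emitted, so greedy search terminates even for degenerate models)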
+ max_sym_per_utt = 1000 + + # symbols per frame + sym_per_frame = 0 + + # symbols per utterance decoded so far + sym_per_utt = 0 + + while t < T and sym_per_utt < max_sym_per_utt: + if sym_per_frame >= max_sym_per_frame: + sym_per_frame = 0 + t += 1 + continue + + # fmt: off + current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2) + # fmt: on + logits = model.joiner( + current_encoder_out, decoder_out.unsqueeze(1), project_input=False + ) + # logits is (1, 1, 1, vocab_size) + + y = logits.argmax().item() + if y not in (blank_id, unk_id): + hyp.append(y) + timestamp.append(t) + decoder_input = torch.tensor([hyp[-context_size:]], device=device).reshape( + 1, context_size + ) + + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + + sym_per_utt += 1 + sym_per_frame += 1 + else: + sym_per_frame = 0 + t += 1 + hyp = hyp[context_size:] # remove blanks + + if not return_timestamps: + return hyp + else: + return DecodingResults( + hyps=[hyp], + timestamps=[timestamp], + ) + + +def greedy_search_batch( + model: SURT, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + return_timestamps: bool = False, +) -> Union[List[List[int]], DecodingResults]: + """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1. + Args: + model: + The SURT model. + encoder_out: + Output from the encoder. Its shape is (N, T, C), where N >= 1. + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. + """ + assert encoder_out.ndim == 3 + assert encoder_out.size(0) >= 1, encoder_out.size(0) + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) + + device = next(model.parameters()).device + + blank_id = model.decoder.blank_id + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + hyps = [[-1] * (context_size - 1) + [blank_id] for _ in range(N)] + + # timestamp[n][i] is the frame index after subsampling + # on which hyp[n][i] is decoded + timestamps = [[] for _ in range(N)] + + decoder_input = torch.tensor( + hyps, + device=device, + dtype=torch.int64, + ) # (N, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + # decoder_out: (N, 1, decoder_out_dim) + + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) + + offset = 0 + for (t, batch_size) in enumerate(batch_size_list): + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) + # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim) + offset = end + + decoder_out = decoder_out[:batch_size] + + logits = model.joiner( + current_encoder_out, decoder_out.unsqueeze(1), project_input=False + ) + # logits'shape (batch_size, 1, 1, vocab_size) + + logits = logits.squeeze(1).squeeze(1) # (batch_size, vocab_size) + assert logits.ndim == 2, logits.shape + y 
= logits.argmax(dim=1).tolist() + emitted = False + for i, v in enumerate(y): + if v not in (blank_id, unk_id): + hyps[i].append(v) + timestamps[i].append(t) + emitted = True + if emitted: + # update decoder output + decoder_input = [h[-context_size:] for h in hyps[:batch_size]] + decoder_input = torch.tensor( + decoder_input, + device=device, + dtype=torch.int64, + ) + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + + sorted_ans = [h[context_size:] for h in hyps] + ans = [] + ans_timestamps = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + ans_timestamps.append(timestamps[unsorted_indices[i]]) + + if not return_timestamps: + return ans + else: + return DecodingResults( + hyps=ans, + timestamps=ans_timestamps, + ) + + +def modified_beam_search( + model: SURT, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, + beam: int = 4, + temperature: float = 1.0, + return_timestamps: bool = False, +) -> Union[List[List[int]], DecodingResults]: + """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded. + + Args: + model: + The SURT model. + encoder_out: + Output from the encoder. Its shape is (N, T, C). + encoder_out_lens: + A 1-D tensor of shape (N,), containing number of valid frames in + encoder_out before padding. + beam: + Number of active paths during the beam search. + temperature: + Softmax temperature. + return_timestamps: + Whether to return timestamps. + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. + """ + assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) + + blank_id = model.decoder.blank_id + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + device = next(model.parameters()).device + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + B = [HypothesisList() for _ in range(N)] + for i in range(N): + B[i].add( + Hypothesis( + ys=[blank_id] * context_size, + log_prob=torch.zeros(1, dtype=torch.float32, device=device), + timestamp=[], + ) + ) + + encoder_out = model.joiner.encoder_proj(packed_encoder_out.data) + + offset = 0 + finalized_B = [] + for (t, batch_size) in enumerate(batch_size_list): + start = offset + end = offset + batch_size + current_encoder_out = encoder_out.data[start:end] + current_encoder_out = current_encoder_out.unsqueeze(1).unsqueeze(1) + # current_encoder_out's shape is (batch_size, 1, 1, encoder_out_dim) + offset = end + + finalized_B = B[batch_size:] + finalized_B + B = B[:batch_size] + + hyps_shape = get_hyps_shape(B).to(device) + + A = [list(b) for b in B] + B = [HypothesisList() for _ in range(batch_size)] + + ys_log_probs = torch.cat( + [hyp.log_prob.reshape(1, 1) for hyps in A for hyp in hyps] + ) # (num_hyps, 1) + + decoder_input = torch.tensor( + [hyp.ys[-context_size:] for hyps in A for hyp in hyps], + device=device, + dtype=torch.int64, + ) # (num_hyps, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False).unsqueeze(1) + 
decoder_out = model.joiner.decoder_proj(decoder_out) + # decoder_out is of shape (num_hyps, 1, 1, joiner_dim) + + # Note: For torch 1.7.1 and below, it requires a torch.int64 tensor + # as index, so we use `to(torch.int64)` below. + current_encoder_out = torch.index_select( + current_encoder_out, + dim=0, + index=hyps_shape.row_ids(1).to(torch.int64), + ) # (num_hyps, 1, 1, encoder_out_dim) + + logits = model.joiner( + current_encoder_out, + decoder_out, + project_input=False, + ) # (num_hyps, 1, 1, vocab_size) + + logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) + + log_probs = (logits / temperature).log_softmax(dim=-1) # (num_hyps, vocab_size) + + log_probs.add_(ys_log_probs) + + vocab_size = log_probs.size(-1) + + log_probs = log_probs.reshape(-1) + + row_splits = hyps_shape.row_splits(1) * vocab_size + log_probs_shape = k2.ragged.create_ragged_shape2( + row_splits=row_splits, cached_tot_size=log_probs.numel() + ) + ragged_log_probs = k2.RaggedTensor(shape=log_probs_shape, value=log_probs) + + for i in range(batch_size): + topk_log_probs, topk_indexes = ragged_log_probs[i].topk(beam) + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + topk_hyp_indexes = (topk_indexes // vocab_size).tolist() + topk_token_indexes = (topk_indexes % vocab_size).tolist() + + for k in range(len(topk_hyp_indexes)): + hyp_idx = topk_hyp_indexes[k] + hyp = A[i][hyp_idx] + + new_ys = hyp.ys[:] + new_token = topk_token_indexes[k] + new_timestamp = hyp.timestamp[:] + if new_token not in (blank_id, unk_id): + new_ys.append(new_token) + new_timestamp.append(t) + + new_log_prob = topk_log_probs[k] + new_hyp = Hypothesis( + ys=new_ys, log_prob=new_log_prob, timestamp=new_timestamp + ) + B[i].add(new_hyp) + + B = B + finalized_B + best_hyps = [b.get_most_probable(length_norm=True) for b in B] + + sorted_ans = [h.ys[context_size:] for h in best_hyps] + sorted_timestamps = [h.timestamp for h in best_hyps] + ans = [] + ans_timestamps = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + ans_timestamps.append(sorted_timestamps[unsorted_indices[i]]) + + if not return_timestamps: + return ans + else: + return DecodingResults( + hyps=ans, + timestamps=ans_timestamps, + ) + + +def beam_search( + model: SURT, + encoder_out: torch.Tensor, + beam: int = 4, + temperature: float = 1.0, + return_timestamps: bool = False, +) -> Union[List[int], DecodingResults]: + """ + It implements Algorithm 1 in https://arxiv.org/pdf/1211.3711.pdf + + espnet/nets/beam_search_SURT.py#L247 is used as a reference. + + Args: + model: + An instance of `SURT`. + encoder_out: + A tensor of shape (N, T, C) from the encoder. Support only N==1 for now. + beam: + Beam size. + temperature: + Softmax temperature. + return_timestamps: + Whether to return timestamps. + + Returns: + If return_timestamps is False, return the decoded result. + Else, return a DecodingResults object containing + decoded result and corresponding timestamps. 
+ """ + assert encoder_out.ndim == 3 + + # support only batch_size == 1 for now + assert encoder_out.size(0) == 1, encoder_out.size(0) + blank_id = model.decoder.blank_id + unk_id = getattr(model, "unk_id", blank_id) + context_size = model.decoder.context_size + + device = next(model.parameters()).device + + decoder_input = torch.tensor( + [blank_id] * context_size, + device=device, + dtype=torch.int64, + ).reshape(1, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + + encoder_out = model.joiner.encoder_proj(encoder_out) + + T = encoder_out.size(1) + t = 0 + + B = HypothesisList() + B.add(Hypothesis(ys=[blank_id] * context_size, log_prob=0.0, timestamp=[])) + + max_sym_per_utt = 20000 + + sym_per_utt = 0 + + decoder_cache: Dict[str, torch.Tensor] = {} + + while t < T and sym_per_utt < max_sym_per_utt: + # fmt: off + current_encoder_out = encoder_out[:, t:t+1, :].unsqueeze(2) + # fmt: on + A = B + B = HypothesisList() + + joint_cache: Dict[str, torch.Tensor] = {} + + # TODO(fangjun): Implement prefix search to update the `log_prob` + # of hypotheses in A + + while True: + y_star = A.get_most_probable() + A.remove(y_star) + + cached_key = y_star.key + + if cached_key not in decoder_cache: + decoder_input = torch.tensor( + [y_star.ys[-context_size:]], + device=device, + dtype=torch.int64, + ).reshape(1, context_size) + + decoder_out = model.decoder(decoder_input, need_pad=False) + decoder_out = model.joiner.decoder_proj(decoder_out) + decoder_cache[cached_key] = decoder_out + else: + decoder_out = decoder_cache[cached_key] + + cached_key += f"-t-{t}" + if cached_key not in joint_cache: + logits = model.joiner( + current_encoder_out, + decoder_out.unsqueeze(1), + project_input=False, + ) + + # TODO(fangjun): Scale the blank posterior + log_prob = (logits / temperature).log_softmax(dim=-1) + # log_prob is (1, 1, 1, vocab_size) + log_prob = log_prob.squeeze() + # Now log_prob is (vocab_size,) + joint_cache[cached_key] = log_prob + else: + log_prob = joint_cache[cached_key] + + # First, process the blank symbol + skip_log_prob = log_prob[blank_id] + new_y_star_log_prob = y_star.log_prob + skip_log_prob + + # ys[:] returns a copy of ys + B.add( + Hypothesis( + ys=y_star.ys[:], + log_prob=new_y_star_log_prob, + timestamp=y_star.timestamp[:], + ) + ) + + # Second, process other non-blank labels + values, indices = log_prob.topk(beam + 1) + for i, v in zip(indices.tolist(), values.tolist()): + if i in (blank_id, unk_id): + continue + new_ys = y_star.ys + [i] + new_log_prob = y_star.log_prob + v + new_timestamp = y_star.timestamp + [t] + A.add( + Hypothesis( + ys=new_ys, + log_prob=new_log_prob, + timestamp=new_timestamp, + ) + ) + + # Check whether B contains more than "beam" elements more probable + # than the most probable in A + A_most_probable = A.get_most_probable() + + kept_B = B.filter(A_most_probable.log_prob) + + if len(kept_B) >= beam: + B = kept_B.topk(beam) + break + + t += 1 + + best_hyp = B.get_most_probable(length_norm=True) + ys = best_hyp.ys[context_size:] # [context_size:] to remove blanks + + if not return_timestamps: + return ys + else: + return DecodingResults(hyps=[ys], timestamps=[best_hyp.timestamp]) + + +@dataclass +class Hypothesis: + # The predicted tokens so far. + # Newly predicted tokens are appended to `ys`. + ys: List[int] + + # The log prob of ys. + # It contains only one entry. 
+ log_prob: torch.Tensor + + # timestamp[i] is the frame index after subsampling + # on which ys[i] is decoded + timestamp: List[int] = field(default_factory=list) + + # the lm score for next token given the current ys + lm_score: Optional[torch.Tensor] = None + + # the RNNLM states (h and c in LSTM) + state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None + + # N-gram LM state + state_cost: Optional[NgramLmStateCost] = None + + @property + def key(self) -> str: + """Return a string representation of self.ys""" + return "_".join(map(str, self.ys)) + + +class HypothesisList(object): + def __init__(self, data: Optional[Dict[str, Hypothesis]] = None) -> None: + """ + Args: + data: + A dict of Hypotheses. Its key is its `value.key`. + """ + if data is None: + self._data = {} + else: + self._data = data + + @property + def data(self) -> Dict[str, Hypothesis]: + return self._data + + def add(self, hyp: Hypothesis) -> None: + """Add a Hypothesis to `self`. + + If `hyp` already exists in `self`, its probability is updated using + `log-sum-exp` with the existed one. + + Args: + hyp: + The hypothesis to be added. + """ + key = hyp.key + if key in self: + old_hyp = self._data[key] # shallow copy + torch.logaddexp(old_hyp.log_prob, hyp.log_prob, out=old_hyp.log_prob) + else: + self._data[key] = hyp + + def get_most_probable(self, length_norm: bool = False) -> Hypothesis: + """Get the most probable hypothesis, i.e., the one with + the largest `log_prob`. + + Args: + length_norm: + If True, the `log_prob` of a hypothesis is normalized by the + number of tokens in it. + Returns: + Return the hypothesis that has the largest `log_prob`. + """ + if length_norm: + return max(self._data.values(), key=lambda hyp: hyp.log_prob / len(hyp.ys)) + else: + return max(self._data.values(), key=lambda hyp: hyp.log_prob) + + def remove(self, hyp: Hypothesis) -> None: + """Remove a given hypothesis. + + Caution: + `self` is modified **in-place**. + + Args: + hyp: + The hypothesis to be removed from `self`. + Note: It must be contained in `self`. Otherwise, + an exception is raised. + """ + key = hyp.key + assert key in self, f"{key} does not exist" + del self._data[key] + + def filter(self, threshold: torch.Tensor) -> "HypothesisList": + """Remove all Hypotheses whose log_prob is less than threshold. + + Caution: + `self` is not modified. Instead, a new HypothesisList is returned. + + Returns: + Return a new HypothesisList containing all hypotheses from `self` + with `log_prob` being greater than the given `threshold`. + """ + ans = HypothesisList() + for _, hyp in self._data.items(): + if hyp.log_prob > threshold: + ans.add(hyp) # shallow copy + return ans + + def topk(self, k: int) -> "HypothesisList": + """Return the top-k hypothesis.""" + hyps = list(self._data.items()) + + hyps = sorted(hyps, key=lambda h: h[1].log_prob, reverse=True)[:k] + + ans = HypothesisList(dict(hyps)) + return ans + + def __contains__(self, key: str): + return key in self._data + + def __iter__(self): + return iter(self._data.values()) + + def __len__(self) -> int: + return len(self._data) + + def __str__(self) -> str: + s = [] + for key in self: + s.append(key) + return ", ".join(s) + + +def get_hyps_shape(hyps: List[HypothesisList]) -> k2.RaggedShape: + """Return a ragged shape with axes [utt][num_hyps]. + + Args: + hyps: + len(hyps) == batch_size. It contains the current hypothesis for + each utterance in the batch. + Returns: + Return a ragged shape with 2 axes [utt][num_hyps]. Note that + the shape is on CPU. 
+ """ + num_hyps = [len(h) for h in hyps] + + # torch.cumsum() is inclusive sum, so we put a 0 at the beginning + # to get exclusive sum later. + num_hyps.insert(0, 0) + + num_hyps = torch.tensor(num_hyps) + row_splits = torch.cumsum(num_hyps, dim=0, dtype=torch.int32) + ans = k2.ragged.create_ragged_shape2( + row_splits=row_splits, cached_tot_size=row_splits[-1].item() + ) + return ans diff --git a/egs/libricss/SURT/dprnn_zipformer/decode.py b/egs/libricss/SURT/dprnn_zipformer/decode.py new file mode 100755 index 000000000..6abbffe00 --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/decode.py @@ -0,0 +1,654 @@ +#!/usr/bin/env python3 +# +# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang, +# Zengwei Yao) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Usage: +(1) greedy search +./dprnn_zipformer/decode.py \ + --epoch 30 \ + --avg 9 \ + --use-averaged-model true \ + --exp-dir ./dprnn_zipformer/exp \ + --max-duration 600 \ + --decoding-method greedy_search + +(2) modified beam search +./dprnn_zipformer/decode.py \ + --epoch 30 \ + --avg 9 \ + --use-averaged-model true \ + --exp-dir ./dprnn_zipformer/exp \ + --max-duration 600 \ + --decoding-method modified_beam_search \ + --beam-size 4 +""" + + +import argparse +import logging +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import k2 +import sentencepiece as spm +import torch +import torch.nn as nn +from asr_datamodule import LibriCssAsrDataModule +from beam_search import ( + beam_search, + greedy_search, + greedy_search_batch, + modified_beam_search, +) +from lhotse.utils import EPSILON +from train import add_model_arguments, get_params, get_surt_model + +from icefall import LmScorer, NgramLm +from icefall.checkpoint import ( + average_checkpoints, + average_checkpoints_with_averaged_model, + find_checkpoints, + load_checkpoint, +) +from icefall.lexicon import Lexicon +from icefall.utils import ( + AttributeDict, + setup_logger, + store_transcripts, + str2bool, + write_surt_error_stats, +) + +OVERLAP_RATIOS = ["0L", "0S", "OV10", "OV20", "OV30", "OV40"] + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--epoch", + type=int, + default=30, + help="""It specifies the checkpoint to use for decoding. + Note: Epoch counts from 1. + You can specify --avg to use more checkpoints for model averaging.""", + ) + + parser.add_argument( + "--iter", + type=int, + default=0, + help="""If positive, --epoch is ignored and it + will use the checkpoint exp_dir/checkpoint-iter.pt. + You can specify --avg to use more checkpoints for model averaging. + """, + ) + + parser.add_argument( + "--avg", + type=int, + default=9, + help="Number of checkpoints to average. 
Automatically select " + "consecutive checkpoints before the checkpoint specified by " + "'--epoch' and '--iter'", + ) + + parser.add_argument( + "--use-averaged-model", + type=str2bool, + default=True, + help="Whether to load averaged model. Currently it only supports " + "using --epoch. If True, it would decode with the averaged model " + "over the epoch range from `epoch-avg` (excluded) to `epoch`." + "Actually only the models with epoch number of `epoch-avg` and " + "`epoch` are loaded for averaging. ", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="dprnn_zipformer/exp", + help="The experiment dir", + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--lang-dir", + type=Path, + default="data/lang_bpe_500", + help="The lang dir containing word table and LG graph", + ) + + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="""Possible values are: + - greedy_search + - beam_search + - modified_beam_search + """, + ) + + parser.add_argument( + "--beam-size", + type=int, + default=4, + help="""An integer indicating how many candidates we will keep for each + frame. Used only when --decoding-method is beam_search or + modified_beam_search.""", + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; 2 means tri-gram", + ) + parser.add_argument( + "--max-sym-per-frame", + type=int, + default=1, + help="""Maximum number of symbols per frame. + Used only when --decoding_method is greedy_search""", + ) + + parser.add_argument( + "--save-masks", + type=str2bool, + default=False, + help="""If true, save masks generated by unmixing module.""", + ) + + add_model_arguments(parser) + + return parser + + +def decode_one_batch( + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, + batch: dict, +) -> Dict[str, List[List[str]]]: + """Decode one batch and return the result in a dict. The dict has the + following format: + + - key: It indicates the setting used for decoding. For example, + if greedy_search is used, it would be "greedy_search" + If beam search with a beam size of 7 is used, it would be + "beam_7" + - value: It contains the decoding result. `len(value)` equals to + batch size. `value[i]` is the decoding result for the i-th + utterance in the given batch. + Args: + params: + It's the return value of :func:`get_params`. + model: + The neural model. + sp: + The BPE model. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + Returns: + Return the decoding result. See above description for the format of + the returned dict. + """ + device = next(model.parameters()).device + feature = batch["inputs"] + assert feature.ndim == 3 + + feature = feature.to(device) + feature_lens = batch["input_lens"].to(device) + + # Apply the mask encoder + B, T, F = feature.shape + processed = model.mask_encoder(feature) # B,T,F*num_channels + masks = processed.view(B, T, F, params.num_channels).unbind(dim=-1) + x_masked = [feature * m for m in masks] + + masks_dict = {} + if params.save_masks: + # To save the masks, we split them by batch and trim each mask to the length of + # the corresponding feature. We save them in a dict, where the key is the + # cut ID and the value is the mask. 
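+        # Each saved mask is a numpy array of shape
+        # (num_frames, num_features * num_channels), i.e. the per-channel masks
+        # concatenated along the feature axis.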
+ for i in range(B): + mask = torch.cat( + [x_masked[j][i, : feature_lens[i]] for j in range(params.num_channels)], + dim=-1, + ) + mask = mask.cpu().numpy() + masks_dict[batch["cuts"][i].id] = mask + + # Recognition + # Concatenate the inputs along the batch axis + h = torch.cat(x_masked, dim=0) + h_lens = feature_lens.repeat(params.num_channels) + encoder_out, encoder_out_lens = model.encoder(x=h, x_lens=h_lens) + + if model.joint_encoder_layer is not None: + encoder_out = model.joint_encoder_layer(encoder_out) + + def _group_channels(hyps: List[str]) -> List[List[str]]: + """ + Currently we have a batch of size M*B, where M is the number of + channels and B is the batch size. We need to group the hypotheses + into B groups, each of which contains M hypotheses. + + Example: + hyps = ['a1', 'b1', 'c1', 'a2', 'b2', 'c2'] + _group_channels(hyps) = [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2']] + """ + assert len(hyps) == B * params.num_channels + out_hyps = [] + for i in range(B): + out_hyps.append(hyps[i::B]) + return out_hyps + + hyps = [] + if params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1: + hyp_tokens = greedy_search_batch( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp) + elif params.decoding_method == "modified_beam_search": + hyp_tokens = modified_beam_search( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam_size, + ) + for hyp in sp.decode(hyp_tokens): + hyps.append(hyp) + else: + batch_size = encoder_out.size(0) + + for i in range(batch_size): + # fmt: off + encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]] + # fmt: on + if params.decoding_method == "greedy_search": + hyp = greedy_search( + model=model, + encoder_out=encoder_out_i, + max_sym_per_frame=params.max_sym_per_frame, + ) + elif params.decoding_method == "beam_search": + hyp = beam_search( + model=model, + encoder_out=encoder_out_i, + beam=params.beam_size, + ) + else: + raise ValueError( + f"Unsupported decoding method: {params.decoding_method}" + ) + hyps.append(sp.decode(hyp)) + + if params.decoding_method == "greedy_search": + return {"greedy_search": _group_channels(hyps)}, masks_dict + else: + return {f"beam_size_{params.beam_size}": _group_channels(hyps)}, masks_dict + + +def decode_dataset( + dl: torch.utils.data.DataLoader, + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, +) -> Dict[str, List[Tuple[str, List[str], List[str]]]]: + """Decode dataset. + + Args: + dl: + PyTorch's dataloader containing the dataset to decode. + params: + It is returned by :func:`get_params`. + model: + The neural model. + sp: + The BPE model. + Returns: + Return a dict, whose key may be "greedy_search" if greedy search + is used, or it may be "beam_7" if beam size of 7 is used. + Its value is a list of tuples. Each tuple contains two elements: + The first is the reference transcript, and the second is the + predicted result. + """ + num_cuts = 0 + + try: + num_batches = len(dl) + except TypeError: + num_batches = "?" 
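+    # (len(dl) raises TypeError when the underlying sampler has no __len__,
+    # e.g. for lazily-loaded cut sets, hence the "?" fallback above)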
+
+    if params.decoding_method == "greedy_search":
+        log_interval = 50
+    else:
+        log_interval = 20
+
+    results = defaultdict(list)
+    masks = {}
+    for batch_idx, batch in enumerate(dl):
+        cut_ids = [cut.id for cut in batch["cuts"]]
+        cuts_batch = batch["cuts"]
+
+        hyps_dict, masks_dict = decode_one_batch(
+            params=params,
+            model=model,
+            sp=sp,
+            batch=batch,
+        )
+        masks.update(masks_dict)
+
+        for name, hyps in hyps_dict.items():
+            this_batch = []
+            for cut_id, hyp_words in zip(cut_ids, hyps):
+                # Reference is a list of supervision texts sorted by start time.
+                ref_words = [
+                    s.text.strip()
+                    for s in sorted(
+                        cuts_batch[cut_id].supervisions, key=lambda s: s.start
+                    )
+                ]
+                this_batch.append((cut_id, ref_words, hyp_words))
+
+            results[name].extend(this_batch)
+
+        num_cuts += len(cut_ids)
+
+        if batch_idx % log_interval == 0:
+            batch_str = f"{batch_idx}/{num_batches}"
+
+            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")
+    return results, masks
+
+
+def save_results(
+    params: AttributeDict,
+    test_set_name: str,
+    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
+):
+    test_set_wers = dict()
+    for key, results in results_dict.items():
+        recog_path = (
+            params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        results = sorted(results)
+        store_transcripts(filename=recog_path, texts=results)
+        logging.info(f"The transcripts are stored in {recog_path}")
+
+        # The following prints out WERs, per-word error statistics and aligned
+        # ref/hyp pairs.
+        errs_filename = (
+            params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt"
+        )
+        with open(errs_filename, "w") as f:
+            wer = write_surt_error_stats(
+                f,
+                f"{test_set_name}-{key}",
+                results,
+                enable_log=True,
+                num_channels=params.num_channels,
+            )
+            test_set_wers[key] = wer
+
+        logging.info("Wrote detailed error stats to {}".format(errs_filename))
+
+    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
+    errs_info = (
+        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
+    )
+    with open(errs_info, "w") as f:
+        print("settings\tWER", file=f)
+        for key, val in test_set_wers:
+            print("{}\t{}".format(key, val), file=f)
+
+    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
+    note = "\tbest for {}".format(test_set_name)
+    for key, val in test_set_wers:
+        s += "{}\t{}{}\n".format(key, val, note)
+        note = ""
+    logging.info(s)
+
+
+def save_masks(
+    params: AttributeDict,
+    test_set_name: str,
+    masks: List[torch.Tensor],
+):
+    masks_path = params.res_dir / f"masks-{test_set_name}.txt"
+    torch.save(masks, masks_path)
+    logging.info(f"The masks are stored in {masks_path}")
+
+
+@torch.no_grad()
+def main():
+    parser = get_parser()
+    LmScorer.add_arguments(parser)
+    LibriCssAsrDataModule.add_arguments(parser)
+    args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+    args.lang_dir = Path(args.lang_dir)
+
+    params = get_params()
+    params.update(vars(args))
+
+    assert params.decoding_method in (
+        "greedy_search",
+        "beam_search",
+        "modified_beam_search",
+    ), f"Decoding method {params.decoding_method} is not supported."
+    params.res_dir = params.exp_dir / params.decoding_method
+
+    if params.iter > 0:
+        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
+    else:
+        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+
+    if "beam_search" in params.decoding_method:
+        params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
+    else:
+        params.suffix += f"-context-{params.context_size}"
+        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
+
+    if params.use_averaged_model:
+        params.suffix += "-use-averaged-model"
+
+    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
+    logging.info("Decoding started")
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"Device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> and <unk> are defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.piece_to_id("<unk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_surt_model(params)
+    assert model.encoder.decode_chunk_size == params.decode_chunk_len // 2, (
+        model.encoder.decode_chunk_size,
+        params.decode_chunk_len,
+    )
+
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+
+
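+    # At this point `model` holds either a single checkpoint or an average of
+    # several; move it to the decoding device and switch off training-time
+    # behaviour such as dropout before decoding.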
model.to(device) + model.eval() + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + # we need cut ids to display recognition results. + args.return_cuts = True + libricss = LibriCssAsrDataModule(args) + + dev_cuts = libricss.libricss_cuts(split="dev", type="ihm-mix").to_eager() + dev_cuts_grouped = [dev_cuts.filter(lambda x: ol in x.id) for ol in OVERLAP_RATIOS] + test_cuts = libricss.libricss_cuts(split="test", type="ihm-mix").to_eager() + test_cuts_grouped = [ + test_cuts.filter(lambda x: ol in x.id) for ol in OVERLAP_RATIOS + ] + + for dev_set, ol in zip(dev_cuts_grouped, OVERLAP_RATIOS): + dev_dl = libricss.test_dataloaders(dev_set) + results_dict, masks = decode_dataset( + dl=dev_dl, + params=params, + model=model, + sp=sp, + ) + + save_results( + params=params, + test_set_name=f"dev_{ol}", + results_dict=results_dict, + ) + + if params.save_masks: + save_masks( + params=params, + test_set_name=f"dev_{ol}", + masks=masks, + ) + + for test_set, ol in zip(test_cuts_grouped, OVERLAP_RATIOS): + test_dl = libricss.test_dataloaders(test_set) + results_dict, masks = decode_dataset( + dl=test_dl, + params=params, + model=model, + sp=sp, + ) + + save_results( + params=params, + test_set_name=f"test_{ol}", + results_dict=results_dict, + ) + + if params.save_masks: + save_masks( + params=params, + test_set_name=f"test_{ol}", + masks=masks, + ) + + logging.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/egs/libricss/SURT/dprnn_zipformer/decoder.py b/egs/libricss/SURT/dprnn_zipformer/decoder.py new file mode 120000 index 000000000..8283d8c5a --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/decoder.py @@ -0,0 +1 @@ +../../../librispeech/ASR/pruned_transducer_stateless7/decoder.py \ No newline at end of file diff --git a/egs/libricss/SURT/dprnn_zipformer/dprnn.py b/egs/libricss/SURT/dprnn_zipformer/dprnn.py new file mode 100644 index 000000000..440dea885 --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/dprnn.py @@ -0,0 +1,305 @@ +import random +from typing import Optional, Tuple + +import torch +import torch.nn as nn +from einops import rearrange +from scaling import ActivationBalancer, BasicNorm, DoubleSwish, ScaledLinear, ScaledLSTM +from torch.autograd import Variable + +EPS = torch.finfo(torch.get_default_dtype()).eps + + +def _pad_segment(input, segment_size): + # Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py#L342 + # input is the features: (B, N, T) + batch_size, dim, seq_len = input.shape + segment_stride = segment_size // 2 + + rest = segment_size - (segment_stride + seq_len % segment_size) % segment_size + if rest > 0: + pad = Variable(torch.zeros(batch_size, dim, rest)).type(input.type()) + input = torch.cat([input, pad], 2) + + pad_aux = Variable(torch.zeros(batch_size, dim, segment_stride)).type(input.type()) + input = torch.cat([pad_aux, input, pad_aux], 2) + + return input, rest + + +def split_feature(input, segment_size): + # Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py#L358 + # split the feature into chunks of segment size + # input is the features: (B, N, T) + + input, rest = _pad_segment(input, segment_size) + batch_size, dim, seq_len = input.shape + segment_stride = segment_size // 2 + + segments1 = ( + input[:, :, :-segment_stride] + .contiguous() + .view(batch_size, dim, -1, segment_size) + ) + segments2 = ( + input[:, :, segment_stride:] + .contiguous() + .view(batch_size, dim, -1, segment_size) + ) + 
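+    # Interleaving the two half-shifted views below yields segments that overlap
+    # by 50% (stride = segment_size // 2); the returned tensor has shape
+    # (batch, dim, segment_size, num_segments).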
segments = ( + torch.cat([segments1, segments2], 3) + .view(batch_size, dim, -1, segment_size) + .transpose(2, 3) + ) + + return segments.contiguous(), rest + + +def merge_feature(input, rest): + # Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py#L385 + # merge the splitted features into full utterance + # input is the features: (B, N, L, K) + + batch_size, dim, segment_size, _ = input.shape + segment_stride = segment_size // 2 + input = ( + input.transpose(2, 3).contiguous().view(batch_size, dim, -1, segment_size * 2) + ) # B, N, K, L + + input1 = ( + input[:, :, :, :segment_size] + .contiguous() + .view(batch_size, dim, -1)[:, :, segment_stride:] + ) + input2 = ( + input[:, :, :, segment_size:] + .contiguous() + .view(batch_size, dim, -1)[:, :, :-segment_stride] + ) + + output = input1 + input2 + if rest > 0: + output = output[:, :, :-rest] + + return output.contiguous() # B, N, T + + +class RNNEncoderLayer(nn.Module): + """ + RNNEncoderLayer is made up of lstm and feedforward networks. + Args: + input_size: + The number of expected features in the input (required). + hidden_size: + The hidden dimension of rnn layer. + dropout: + The dropout value (default=0.1). + layer_dropout: + The dropout value for model-level warmup (default=0.075). + """ + + def __init__( + self, + input_size: int, + hidden_size: int, + dropout: float = 0.1, + bidirectional: bool = False, + ) -> None: + super(RNNEncoderLayer, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + + assert hidden_size >= input_size, (hidden_size, input_size) + self.lstm = ScaledLSTM( + input_size=input_size, + hidden_size=hidden_size // 2 if bidirectional else hidden_size, + proj_size=0, + num_layers=1, + dropout=0.0, + batch_first=True, + bidirectional=bidirectional, + ) + self.norm_final = BasicNorm(input_size) + + # try to ensure the output is close to zero-mean (or at least, zero-median). # noqa + self.balancer = ActivationBalancer( + num_channels=input_size, + channel_dim=-1, + min_positive=0.45, + max_positive=0.55, + max_abs=6.0, + ) + self.dropout = nn.Dropout(dropout) + + def forward( + self, + src: torch.Tensor, + states: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + warmup: float = 1.0, + ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + """ + Pass the input through the encoder layer. + Args: + src: + The sequence to the encoder layer (required). + Its shape is (S, N, E), where S is the sequence length, + N is the batch size, and E is the feature number. + states: + A tuple of 2 tensors (optional). It is for streaming inference. + states[0] is the hidden states of all layers, + with shape of (1, N, input_size); + states[1] is the cell states of all layers, + with shape of (1, N, hidden_size). + """ + src_orig = src + + # alpha = 1.0 means fully use this encoder layer, 0.0 would mean + # completely bypass it. + alpha = warmup if self.training else 1.0 + + # lstm module + src_lstm, new_states = self.lstm(src, states) + src = self.dropout(src_lstm) + src + src = self.norm_final(self.balancer(src)) + + if alpha != 1.0: + src = alpha * src + (1 - alpha) * src_orig + + return src + + +# dual-path RNN +class DPRNN(nn.Module): + """Deep dual-path RNN. + Source: https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/dprnn.py + + args: + input_size: int, dimension of the input feature. The input should have shape + (batch, seq_len, input_size). + hidden_size: int, dimension of the hidden state. + output_size: int, dimension of the output size. 
+ dropout: float, dropout ratio. Default is 0. + num_blocks: int, number of stacked RNN layers. Default is 1. + """ + + def __init__( + self, + feature_dim, + input_size, + hidden_size, + output_size, + dropout=0.1, + num_blocks=1, + segment_size=50, + chunk_width_randomization=False, + ): + super().__init__() + + self.input_size = input_size + self.output_size = output_size + self.hidden_size = hidden_size + + self.segment_size = segment_size + self.chunk_width_randomization = chunk_width_randomization + + self.input_embed = nn.Sequential( + ScaledLinear(feature_dim, input_size), + BasicNorm(input_size), + ActivationBalancer( + num_channels=input_size, + channel_dim=-1, + min_positive=0.45, + max_positive=0.55, + ), + ) + + # dual-path RNN + self.row_rnn = nn.ModuleList([]) + self.col_rnn = nn.ModuleList([]) + for _ in range(num_blocks): + # intra-RNN is non-causal + self.row_rnn.append( + RNNEncoderLayer( + input_size, hidden_size, dropout=dropout, bidirectional=True + ) + ) + self.col_rnn.append( + RNNEncoderLayer( + input_size, hidden_size, dropout=dropout, bidirectional=False + ) + ) + + # output layer + self.out_embed = nn.Sequential( + ScaledLinear(input_size, output_size), + BasicNorm(output_size), + ActivationBalancer( + num_channels=output_size, + channel_dim=-1, + min_positive=0.45, + max_positive=0.55, + ), + ) + + def forward(self, input): + # input shape: B, T, F + input = self.input_embed(input) + B, T, D = input.shape + + if self.chunk_width_randomization and self.training: + segment_size = random.randint(self.segment_size // 2, self.segment_size) + else: + segment_size = self.segment_size + input, rest = split_feature(input.transpose(1, 2), segment_size) + # input shape: batch, N, dim1, dim2 + # apply RNN on dim1 first and then dim2 + # output shape: B, output_size, dim1, dim2 + # input = input.to(device) + batch_size, _, dim1, dim2 = input.shape + output = input + for i in range(len(self.row_rnn)): + row_input = ( + output.permute(0, 3, 2, 1) + .contiguous() + .view(batch_size * dim2, dim1, -1) + ) # B*dim2, dim1, N + output = self.row_rnn[i](row_input) # B*dim2, dim1, H + output = ( + output.view(batch_size, dim2, dim1, -1).permute(0, 3, 2, 1).contiguous() + ) # B, N, dim1, dim2 + + col_input = ( + output.permute(0, 2, 3, 1) + .contiguous() + .view(batch_size * dim1, dim2, -1) + ) # B*dim1, dim2, N + output = self.col_rnn[i](col_input) # B*dim1, dim2, H + output = ( + output.view(batch_size, dim1, dim2, -1).permute(0, 3, 1, 2).contiguous() + ) # B, N, dim1, dim2 + + output = merge_feature(output, rest) + output = output.transpose(1, 2) + output = self.out_embed(output) + + # Apply ReLU to the output + output = torch.relu(output) + + return output + + +if __name__ == "__main__": + + model = DPRNN( + 80, + 256, + 256, + 160, + dropout=0.1, + num_blocks=4, + segment_size=32, + chunk_width_randomization=True, + ) + input = torch.randn(2, 1002, 80) + print(sum(p.numel() for p in model.parameters())) + print(model(input).shape) diff --git a/egs/libricss/SURT/dprnn_zipformer/encoder_interface.py b/egs/libricss/SURT/dprnn_zipformer/encoder_interface.py new file mode 120000 index 000000000..0c2673d46 --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/encoder_interface.py @@ -0,0 +1 @@ +../../../librispeech/ASR/pruned_transducer_stateless7/encoder_interface.py \ No newline at end of file diff --git a/egs/libricss/SURT/dprnn_zipformer/export.py b/egs/libricss/SURT/dprnn_zipformer/export.py new file mode 100755 index 000000000..f51f2a7ab --- /dev/null +++ 
b/egs/libricss/SURT/dprnn_zipformer/export.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +# +# Copyright 2021 Xiaomi Corporation (Author: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script converts several saved checkpoints +# to a single one using model averaging. +""" + +Usage: + +(1) Export to torchscript model using torch.jit.script() + +./dprnn_zipformer/export.py \ + --exp-dir ./dprnn_zipformer/exp \ + --bpe-model data/lang_bpe_500/bpe.model \ + --epoch 30 \ + --avg 9 \ + --jit 1 + +It will generate a file `cpu_jit.pt` in the given `exp_dir`. You can later +load it by `torch.jit.load("cpu_jit.pt")`. + +Note `cpu` in the name `cpu_jit.pt` means the parameters when loaded into Python +are on CPU. You can use `to("cuda")` to move them to a CUDA device. + +Check +https://github.com/k2-fsa/sherpa +for how to use the exported models outside of icefall. + +(2) Export `model.state_dict()` + +./dprnn_zipformer/export.py \ + --exp-dir ./dprnn_zipformer/exp \ + --bpe-model data/lang_bpe_500/bpe.model \ + --epoch 30 \ + --avg 9 + +It will generate a file `pretrained.pt` in the given `exp_dir`. You can later +load it by `icefall.checkpoint.load_checkpoint()`. + +To use the generated file with `dprnn_zipformer/decode.py`, +you can do: + + cd /path/to/exp_dir + ln -s pretrained.pt epoch-9999.pt + + cd /path/to/egs/librispeech/ASR + ./dprnn_zipformer/decode.py \ + --exp-dir ./dprnn_zipformer/exp \ + --epoch 9999 \ + --avg 1 \ + --max-duration 600 \ + --decoding-method greedy_search \ + --bpe-model data/lang_bpe_500/bpe.model +""" + +import argparse +import logging +from pathlib import Path + +import sentencepiece as spm +import torch +import torch.nn as nn +from scaling_converter import convert_scaled_to_non_scaled +from train import add_model_arguments, get_params, get_surt_model + +from icefall.checkpoint import ( + average_checkpoints, + average_checkpoints_with_averaged_model, + find_checkpoints, + load_checkpoint, +) +from icefall.utils import str2bool + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--epoch", + type=int, + default=30, + help="""It specifies the checkpoint to use for decoding. + Note: Epoch counts from 1. + You can specify --avg to use more checkpoints for model averaging.""", + ) + + parser.add_argument( + "--iter", + type=int, + default=0, + help="""If positive, --epoch is ignored and it + will use the checkpoint exp_dir/checkpoint-iter.pt. + You can specify --avg to use more checkpoints for model averaging. + """, + ) + + parser.add_argument( + "--avg", + type=int, + default=9, + help="Number of checkpoints to average. Automatically select " + "consecutive checkpoints before the checkpoint specified by " + "'--epoch' and '--iter'", + ) + + parser.add_argument( + "--use-averaged-model", + type=str2bool, + default=True, + help="Whether to load averaged model. 
Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="dprnn_zipformer/exp",
+        help="""It specifies the directory where all training related
+        files, e.g., checkpoints, log, etc, are saved
+        """,
+    )
+
+    parser.add_argument(
+        "--bpe-model",
+        type=str,
+        default="data/lang_bpe_500/bpe.model",
+        help="Path to the BPE model",
+    )
+
+    parser.add_argument(
+        "--jit",
+        type=str2bool,
+        default=False,
+        help="""True to save a model after applying torch.jit.script.
+        It will generate a file named cpu_jit.pt
+
+        Check ./jit_pretrained.py for how to use it.
+        """,
+    )
+
+    parser.add_argument(
+        "--context-size",
+        type=int,
+        default=2,
+        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
+    )
+
+    add_model_arguments(parser)
+
+    return parser
+
+
+@torch.no_grad()
+def main():
+    args = get_parser().parse_args()
+    args.exp_dir = Path(args.exp_dir)
+
+    params = get_params()
+    params.update(vars(args))
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", 0)
+
+    logging.info(f"device: {device}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(params.bpe_model)
+
+    # <blk> is defined in local/train_bpe_model.py
+    params.blank_id = sp.piece_to_id("<blk>")
+    params.vocab_size = sp.get_piece_size()
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_surt_model(params)
+
+    model.to(device)
+
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+    else:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = 
f"{params.exp_dir}/epoch-{start}.pt" + filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt" + logging.info( + f"Calculating the averaged model over epoch range from " + f"{start} (excluded) to {params.epoch}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + + model.to("cpu") + model.eval() + + if params.jit is True: + convert_scaled_to_non_scaled(model, inplace=True) + # We won't use the forward() method of the model in C++, so just ignore + # it here. + # Otherwise, one of its arguments is a ragged tensor and is not + # torch scriptabe. + model.__class__.forward = torch.jit.ignore(model.__class__.forward) + logging.info("Using torch.jit.script") + model = torch.jit.script(model) + filename = params.exp_dir / "cpu_jit.pt" + model.save(str(filename)) + logging.info(f"Saved to {filename}") + else: + logging.info("Not using torchscript. Export model.state_dict()") + # Save it using a format so that it can be loaded + # by :func:`load_checkpoint` + filename = params.exp_dir / "pretrained.pt" + torch.save({"model": model.state_dict()}, str(filename)) + logging.info(f"Saved to {filename}") + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + + logging.basicConfig(format=formatter, level=logging.INFO) + main() diff --git a/egs/libricss/SURT/dprnn_zipformer/joiner.py b/egs/libricss/SURT/dprnn_zipformer/joiner.py new file mode 120000 index 000000000..0f0c3c90a --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/joiner.py @@ -0,0 +1 @@ +../../../librispeech/ASR/pruned_transducer_stateless7/joiner.py \ No newline at end of file diff --git a/egs/libricss/SURT/dprnn_zipformer/model.py b/egs/libricss/SURT/dprnn_zipformer/model.py new file mode 100644 index 000000000..688e1e78d --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/model.py @@ -0,0 +1,316 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, Wei Kang) +# Copyright 2023 Johns Hopkins University (author: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple + +import k2 +import torch +import torch.nn as nn +from encoder_interface import EncoderInterface + +from icefall.utils import add_sos + + +class SURT(nn.Module): + """It implements Streaming Unmixing and Recognition Transducer (SURT). + https://arxiv.org/abs/2011.13148 + """ + + def __init__( + self, + mask_encoder: nn.Module, + encoder: EncoderInterface, + joint_encoder_layer: Optional[nn.Module], + decoder: nn.Module, + joiner: nn.Module, + num_channels: int, + encoder_dim: int, + decoder_dim: int, + joiner_dim: int, + vocab_size: int, + ): + """ + Args: + mask_encoder: + It is the masking network. It generates a mask for each channel of the + encoder. These masks are applied to the input features, and then passed + to the transcription network. 
+ encoder: + It is the transcription network in the paper. Its accepts + two inputs: `x` of (N, T, encoder_dim) and `x_lens` of shape (N,). + It returns two tensors: `logits` of shape (N, T, encoder_dm) and + `logit_lens` of shape (N,). + decoder: + It is the prediction network in the paper. Its input shape + is (N, U) and its output shape is (N, U, decoder_dim). + It should contain one attribute: `blank_id`. + joiner: + It has two inputs with shapes: (N, T, encoder_dim) and (N, U, decoder_dim). + Its output shape is (N, T, U, vocab_size). Note that its output contains + unnormalized probs, i.e., not processed by log-softmax. + num_channels: + It is the number of channels that the input features will be split into. + In general, it should be equal to the maximum number of simultaneously + active speakers. For most real scenarios, using 2 channels is sufficient. + """ + super().__init__() + assert isinstance(encoder, EncoderInterface), type(encoder) + assert hasattr(decoder, "blank_id") + + self.mask_encoder = mask_encoder + self.encoder = encoder + self.joint_encoder_layer = joint_encoder_layer + self.decoder = decoder + self.joiner = joiner + self.num_channels = num_channels + + self.simple_am_proj = nn.Linear( + encoder_dim, + vocab_size, + ) + self.simple_lm_proj = nn.Linear(decoder_dim, vocab_size) + + self.ctc_output = nn.Sequential( + nn.Dropout(p=0.1), + nn.Linear(encoder_dim, vocab_size), + nn.LogSoftmax(dim=-1), + ) + + def forward_helper( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + y: k2.RaggedTensor, + prune_range: int = 5, + am_scale: float = 0.0, + lm_scale: float = 0.0, + reduction: str = "sum", + beam_size: int = 10, + use_double_scores: bool = False, + subsampling_factor: int = 1, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Compute transducer loss for one branch of the SURT model. + """ + encoder_out, x_lens = self.encoder(x, x_lens) + assert torch.all(x_lens > 0) + + if self.joint_encoder_layer is not None: + encoder_out = self.joint_encoder_layer(encoder_out) + + # compute ctc log-probs + ctc_output = self.ctc_output(encoder_out) + + # For the decoder, i.e., the prediction network + row_splits = y.shape.row_splits(1) + y_lens = row_splits[1:] - row_splits[:-1] + + blank_id = self.decoder.blank_id + sos_y = add_sos(y, sos_id=blank_id) + + # sos_y_padded: [B, S + 1], start with SOS. 
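+        # (sos_y is y with blank_id prepended to each sequence by add_sos();
+        # padding with blank_id below keeps the decoder input rectangular.)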
+ sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id) + + # decoder_out: [B, S + 1, decoder_dim] + decoder_out = self.decoder(sos_y_padded) + + # Note: y does not start with SOS + # y_padded : [B, S] + y_padded = y.pad(mode="constant", padding_value=0) + + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros((x.size(0), 4), dtype=torch.int64, device=x.device) + boundary[:, 2] = y_lens + boundary[:, 3] = x_lens + + lm = self.simple_lm_proj(decoder_out) + am = self.simple_am_proj(encoder_out) + + with torch.cuda.amp.autocast(enabled=False): + simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( + lm=lm.float(), + am=am.float(), + symbols=y_padded, + termination_symbol=blank_id, + lm_only_scale=lm_scale, + am_only_scale=am_scale, + boundary=boundary, + reduction=reduction, + return_grad=True, + ) + + # ranges : [B, T, prune_range] + ranges = k2.get_rnnt_prune_ranges( + px_grad=px_grad, + py_grad=py_grad, + boundary=boundary, + s_range=prune_range, + ) + + # am_pruned : [B, T, prune_range, encoder_dim] + # lm_pruned : [B, T, prune_range, decoder_dim] + am_pruned, lm_pruned = k2.do_rnnt_pruning( + am=self.joiner.encoder_proj(encoder_out), + lm=self.joiner.decoder_proj(decoder_out), + ranges=ranges, + ) + + # logits : [B, T, prune_range, vocab_size] + + # project_input=False since we applied the decoder's input projections + # prior to do_rnnt_pruning (this is an optimization for speed). + logits = self.joiner(am_pruned, lm_pruned, project_input=False) + + with torch.cuda.amp.autocast(enabled=False): + pruned_loss = k2.rnnt_loss_pruned( + logits=logits.float(), + symbols=y_padded, + ranges=ranges, + termination_symbol=blank_id, + boundary=boundary, + reduction=reduction, + ) + + # Compute ctc loss + supervision_segments = torch.stack( + ( + torch.arange(len(x_lens), device="cpu"), + torch.zeros_like(x_lens, device="cpu"), + torch.clone(x_lens).detach().cpu(), + ), + dim=1, + ).to(torch.int32) + # We need to sort supervision_segments in decreasing order of num_frames + indices = torch.argsort(supervision_segments[:, 2], descending=True) + supervision_segments = supervision_segments[indices] + + # Works with a BPE model + decoding_graph = k2.ctc_graph(y, modified=False, device=x.device) + dense_fsa_vec = k2.DenseFsaVec( + ctc_output, + supervision_segments, + allow_truncate=subsampling_factor - 1, + ) + ctc_loss = k2.ctc_loss( + decoding_graph=decoding_graph, + dense_fsa_vec=dense_fsa_vec, + output_beam=beam_size, + reduction="none", + use_double_scores=use_double_scores, + ) + + return (simple_loss, pruned_loss, ctc_loss) + + def forward( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + y: k2.RaggedTensor, + prune_range: int = 5, + am_scale: float = 0.0, + lm_scale: float = 0.0, + reduction: str = "sum", + beam_size: int = 10, + use_double_scores: bool = False, + subsampling_factor: int = 1, + return_masks: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Args: + x: + A 3-D tensor of shape (N, T, C). + x_lens: + A 1-D tensor of shape (N,). It contains the number of frames in `x` + before padding. + y: + A ragged tensor of shape (N*num_channels, S). It contains the labels + of the N utterances. The labels are in the range [0, vocab_size). All + the channels are concatenated together one after another. + prune_range: + The prune range for rnnt loss, it means how many symbols(context) + we are considering for each frame to compute the loss. 
+ am_scale: + The scale to smooth the loss with am (output of encoder network) + part + lm_scale: + The scale to smooth the loss with lm (output of predictor network) + part + reduction: + "sum" to sum the losses over all utterances in the batch. + "none" to return the loss in a 1-D tensor for each utterance + in the batch. + beam_size: + The beam size used in CTC decoding. + use_double_scores: + If True, use double precision for CTC decoding. + subsampling_factor: + The subsampling factor of the model. It is used to compute the + supervision segments for CTC loss. + return_masks: + If True, return the masks as well as masked features. + Returns: + Return the transducer loss. + + Note: + Regarding am_scale & lm_scale, it will make the loss-function one of + the form: + lm_scale * lm_probs + am_scale * am_probs + + (1-lm_scale-am_scale) * combined_probs + """ + assert x.ndim == 3, x.shape + assert x_lens.ndim == 1, x_lens.shape + assert y.num_axes == 2, y.num_axes + + assert x.size(0) == x_lens.size(0), (x.size(), x_lens.size()) + + # Apply the mask encoder + B, T, F = x.shape + processed = self.mask_encoder(x) # B,T,F*num_channels + masks = processed.view(B, T, F, self.num_channels).unbind(dim=-1) + x_masked = [x * m for m in masks] + + # Recognition + # Stack the inputs along the batch axis + h = torch.cat(x_masked, dim=0) + h_lens = torch.cat([x_lens for _ in range(self.num_channels)], dim=0) + + simple_loss, pruned_loss, ctc_loss = self.forward_helper( + h, + h_lens, + y, + prune_range, + am_scale, + lm_scale, + reduction=reduction, + beam_size=beam_size, + use_double_scores=use_double_scores, + subsampling_factor=subsampling_factor, + ) + + # Chunks the outputs into 2 parts along batch axis and then stack them along a new axis. + simple_loss = torch.stack( + torch.chunk(simple_loss, self.num_channels, dim=0), dim=0 + ) + pruned_loss = torch.stack( + torch.chunk(pruned_loss, self.num_channels, dim=0), dim=0 + ) + ctc_loss = torch.stack(torch.chunk(ctc_loss, self.num_channels, dim=0), dim=0) + + if return_masks: + return (simple_loss, pruned_loss, ctc_loss, x_masked, masks) + else: + return (simple_loss, pruned_loss, ctc_loss, x_masked) diff --git a/egs/libricss/SURT/dprnn_zipformer/optim.py b/egs/libricss/SURT/dprnn_zipformer/optim.py new file mode 120000 index 000000000..8a05abb5f --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/optim.py @@ -0,0 +1 @@ +../../../librispeech/ASR/pruned_transducer_stateless7/optim.py \ No newline at end of file diff --git a/egs/libricss/SURT/dprnn_zipformer/scaling.py b/egs/libricss/SURT/dprnn_zipformer/scaling.py new file mode 120000 index 000000000..5f9be9fe0 --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/scaling.py @@ -0,0 +1 @@ +../../../librispeech/ASR/pruned_transducer_stateless7/scaling.py \ No newline at end of file diff --git a/egs/libricss/SURT/dprnn_zipformer/scaling_converter.py b/egs/libricss/SURT/dprnn_zipformer/scaling_converter.py new file mode 120000 index 000000000..f9960e5c6 --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/scaling_converter.py @@ -0,0 +1 @@ +../../../librispeech/ASR/pruned_transducer_stateless7/scaling_converter.py \ No newline at end of file diff --git a/egs/libricss/SURT/dprnn_zipformer/train.py b/egs/libricss/SURT/dprnn_zipformer/train.py new file mode 100755 index 000000000..6598f8b5d --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/train.py @@ -0,0 +1,1452 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. 
(authors: Fangjun Kuang, +# Wei Kang, +# Mingshuang Luo,) +# Zengwei Yao) +# 2023 Johns Hopkins University (author: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Usage: + +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +cd egs/libricss/SURT +./prepare.sh + +./dprnn_zipformer/train.py \ + --world-size 4 \ + --num-epochs 30 \ + --start-epoch 1 \ + --exp-dir dprnn_zipformer/exp \ + --max-duration 300 + +# For mix precision training: + +./dprnn_zipformer/train.py \ + --world-size 4 \ + --num-epochs 30 \ + --start-epoch 1 \ + --use-fp16 1 \ + --exp-dir dprnn_zipformer/exp \ + --max-duration 550 +""" + +import argparse +import copy +import logging +import warnings +from pathlib import Path +from shutil import copyfile +from typing import Any, Dict, Optional, Tuple, Union + +import k2 +import optim +import sentencepiece as spm +import torch +import torch.multiprocessing as mp +import torch.nn as nn +from asr_datamodule import LibriCssAsrDataModule +from decoder import Decoder +from dprnn import DPRNN +from einops.layers.torch import Rearrange +from graph_pit.loss.optimized import optimized_graph_pit_mse_loss as gpit_mse +from joiner import Joiner +from lhotse.cut import Cut +from lhotse.dataset.sampling.base import CutSampler +from lhotse.utils import LOG_EPSILON, fix_random_seed +from model import SURT +from optim import Eden, ScaledAdam +from scaling import ScaledLSTM +from torch import Tensor +from torch.cuda.amp import GradScaler +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from zipformer import Zipformer + +from icefall import diagnostics +from icefall.checkpoint import load_checkpoint, remove_checkpoints +from icefall.checkpoint import save_checkpoint as save_checkpoint_impl +from icefall.checkpoint import ( + save_checkpoint_with_global_batch_idx, + update_averaged_model, +) +from icefall.dist import cleanup_dist, setup_dist +from icefall.env import get_env_info +from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool + +LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler] + + +def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None: + if isinstance(model, DDP): + # get underlying nn.Module + model = model.module + for module in model.modules(): + if hasattr(module, "batch_count"): + module.batch_count = batch_count + + +def add_model_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "--num-mask-encoder-layers", + type=int, + default=4, + help="Number of layers in the DPRNN based mask encoder.", + ) + + parser.add_argument( + "--mask-encoder-dim", + type=int, + default=256, + help="Hidden dimension of the LSTM blocks in DPRNN.", + ) + + parser.add_argument( + "--mask-encoder-segment-size", + type=int, + default=32, + help="Segment size of the SegLSTM in DPRNN. 
Ideally, this should be equal to the " + "decode-chunk-length of the zipformer encoder.", + ) + + parser.add_argument( + "--chunk-width-randomization", + type=bool, + default=False, + help="Whether to randomize the chunk width in DPRNN.", + ) + + # Zipformer config is based on: + # https://github.com/k2-fsa/icefall/pull/745#issuecomment-1405282740 + parser.add_argument( + "--num-encoder-layers", + type=str, + default="2,2,2,2,2", + help="Number of zipformer encoder layers, comma separated.", + ) + + parser.add_argument( + "--feedforward-dims", + type=str, + default="768,768,768,768,768", + help="Feedforward dimension of the zipformer encoder layers, comma separated.", + ) + + parser.add_argument( + "--nhead", + type=str, + default="8,8,8,8,8", + help="Number of attention heads in the zipformer encoder layers.", + ) + + parser.add_argument( + "--encoder-dims", + type=str, + default="256,256,256,256,256", + help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated", + ) + + parser.add_argument( + "--attention-dims", + type=str, + default="192,192,192,192,192", + help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated; + not the same as embedding dimension.""", + ) + + parser.add_argument( + "--encoder-unmasked-dims", + type=str, + default="192,192,192,192,192", + help="Unmasked dimensions in the encoders, relates to augmentation during training. " + "Must be <= each of encoder_dims. Empirically, less than 256 seems to make performance " + " worse.", + ) + + parser.add_argument( + "--zipformer-downsampling-factors", + type=str, + default="1,2,4,8,2", + help="Downsampling factor for each stack of encoder layers.", + ) + + parser.add_argument( + "--cnn-module-kernels", + type=str, + default="31,31,31,31,31", + help="Sizes of kernels in convolution modules", + ) + + parser.add_argument( + "--use-joint-encoder-layer", + type=str, + default="lstm", + choices=["linear", "lstm", "none"], + help="Whether to use a joint layer to combine all branches.", + ) + + parser.add_argument( + "--decoder-dim", + type=int, + default=512, + help="Embedding dimension in the decoder model.", + ) + + parser.add_argument( + "--joiner-dim", + type=int, + default=512, + help="""Dimension used in the joiner model. + Outputs from the encoder and decoder model are projected + to this dimension before adding. + """, + ) + + parser.add_argument( + "--short-chunk-size", + type=int, + default=50, + help="""Chunk length of dynamic training, the chunk size would be either + max sequence length of current batch or uniformly sampled from (1, short_chunk_size). 
+ """, + ) + + parser.add_argument( + "--num-left-chunks", + type=int, + default=4, + help="How many left context can be seen in chunks when calculating attention.", + ) + + parser.add_argument( + "--decode-chunk-len", + type=int, + default=32, + help="The chunk size for decoding (in frames before subsampling)", + ) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--world-size", + type=int, + default=1, + help="Number of GPUs for DDP training.", + ) + + parser.add_argument( + "--master-port", + type=int, + default=12354, + help="Master port to use for DDP training.", + ) + + parser.add_argument( + "--tensorboard", + type=str2bool, + default=True, + help="Should various information be logged in tensorboard.", + ) + + parser.add_argument( + "--num-epochs", + type=int, + default=30, + help="Number of epochs to train.", + ) + + parser.add_argument( + "--start-epoch", + type=int, + default=1, + help="""Resume training from this epoch. It should be positive. + If larger than 1, it will load checkpoint from + exp-dir/epoch-{start_epoch-1}.pt + """, + ) + + parser.add_argument( + "--start-batch", + type=int, + default=0, + help="""If positive, --start-epoch is ignored and + it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt + """, + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="conv_lstm_transducer_stateless_ctc/exp", + help="""The experiment dir. + It specifies the directory where all training related + files, e.g., checkpoints, log, etc, are saved + """, + ) + + parser.add_argument( + "--model-init-ckpt", + type=str, + default=None, + help="""The model checkpoint to initialize the model (either full or part). + If not specified, the model is randomly initialized. + """, + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--base-lr", type=float, default=0.004, help="The base learning rate." + ) + + parser.add_argument( + "--lr-batches", + type=float, + default=5000, + help="""Number of steps that affects how rapidly the learning rate + decreases. We suggest not to change this.""", + ) + + parser.add_argument( + "--lr-epochs", + type=float, + default=6, + help="""Number of epochs that affects how rapidly the learning rate decreases. + """, + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; 2 means tri-gram", + ) + + parser.add_argument( + "--prune-range", + type=int, + default=5, + help="The prune range for rnnt loss, it means how many symbols(context)" + "we are using to compute the loss", + ) + + parser.add_argument( + "--lm-scale", + type=float, + default=0.25, + help="The scale to smooth the loss with lm " + "(output of prediction network) part.", + ) + + parser.add_argument( + "--am-scale", + type=float, + default=0.0, + help="The scale to smooth the loss with am (output of encoder network) part.", + ) + + parser.add_argument( + "--simple-loss-scale", + type=float, + default=0.5, + help="To get pruning ranges, we will calculate a simple version" + "loss(joiner is just addition), this simple loss also uses for" + "training (as a regularization item). 
We will scale the simple loss" + "with this parameter before adding to the final loss.", + ) + + parser.add_argument( + "--ctc-loss-scale", + type=float, + default=0.2, + help="Scale for CTC loss.", + ) + + parser.add_argument( + "--heat-loss-scale", + type=float, + default=0.0, + help="Scale for HEAT loss on separated sources.", + ) + + parser.add_argument( + "--seed", + type=int, + default=42, + help="The seed for random generators intended for reproducibility", + ) + + parser.add_argument( + "--print-diagnostics", + type=str2bool, + default=False, + help="Accumulate stats on activations, print them and exit.", + ) + + parser.add_argument( + "--save-every-n", + type=int, + default=2000, + help="""Save checkpoint after processing this number of batches" + periodically. We save checkpoint to exp-dir/ whenever + params.batch_idx_train % save_every_n == 0. The checkpoint filename + has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt' + Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the + end of each epoch where `xxx` is the epoch number counting from 0. + """, + ) + + parser.add_argument( + "--keep-last-k", + type=int, + default=1, + help="""Only keep this number of checkpoints on disk. + For instance, if it is 3, there are only 3 checkpoints + in the exp-dir with filenames `checkpoint-xxx.pt`. + It does not affect checkpoints with name `epoch-xxx.pt`. + """, + ) + + parser.add_argument( + "--average-period", + type=int, + default=100, + help="""Update the averaged model, namely `model_avg`, after processing + this number of batches. `model_avg` is a separate version of model, + in which each floating-point parameter is the average of all the + parameters from the start of training. Each time we take the average, + we do: `model_avg = model * (average_period / batch_idx_train) + + model_avg * ((batch_idx_train - average_period) / batch_idx_train)`. + """, + ) + + parser.add_argument( + "--use-fp16", + type=str2bool, + default=False, + help="Whether to use half precision training.", + ) + + add_model_arguments(parser) + + return parser + + +def get_params() -> AttributeDict: + """Return a dict containing training parameters. + + All training related parameters that are not passed from the commandline + are saved in the variable `params`. + + Commandline options are merged into `params` after they are parsed, so + you can also access them via `params`. + + Explanation of options saved in `params`: + + - best_train_loss: Best training loss so far. It is used to select + the model that has the lowest training loss. It is + updated during the training. + + - best_valid_loss: Best validation loss so far. It is used to select + the model that has the lowest validation loss. It is + updated during the training. + + - best_train_epoch: It is the epoch that has the best training loss. + + - best_valid_epoch: It is the epoch that has the best validation loss. + + - batch_idx_train: Used to writing statistics to tensorboard. It + contains number of batches trained so far across + epochs. + + - log_interval: Print training loss if batch_idx % log_interval` is 0 + + - reset_interval: Reset statistics if batch_idx % reset_interval is 0 + + - valid_interval: Run validation if batch_idx % valid_interval is 0 + + - feature_dim: The model input dim. It has to match the one used + in computing features. + + - subsampling_factor: The subsampling factor for the model. + + - num_decoder_layers: Number of decoder layer of transformer decoder. 
+ + - warm_step: The warm_step for Noam optimizer. + """ + params = AttributeDict( + { + "best_train_loss": float("inf"), + "best_valid_loss": float("inf"), + "best_train_epoch": -1, + "best_valid_epoch": -1, + "batch_idx_train": 0, + "log_interval": 50, + "reset_interval": 200, + "valid_interval": 2000, + # parameters for SURT + "num_channels": 2, + "feature_dim": 80, + "subsampling_factor": 4, # not passed in, this is fixed + # parameters for Noam + "model_warm_step": 5000, # arg given to model, not for lrate + # parameters for ctc loss + "beam_size": 10, + "use_double_scores": True, + "env_info": get_env_info(), + } + ) + + return params + + +def get_mask_encoder_model(params: AttributeDict) -> nn.Module: + mask_encoder = DPRNN( + feature_dim=params.feature_dim, + input_size=params.mask_encoder_dim, + hidden_size=params.mask_encoder_dim, + output_size=params.feature_dim * params.num_channels, + segment_size=params.mask_encoder_segment_size, + num_blocks=params.num_mask_encoder_layers, + chunk_width_randomization=params.chunk_width_randomization, + ) + return mask_encoder + + +def get_encoder_model(params: AttributeDict) -> nn.Module: + # TODO: We can add an option to switch between Zipformer and Transformer + def to_int_tuple(s: str): + return tuple(map(int, s.split(","))) + + encoder = Zipformer( + num_features=params.feature_dim, + output_downsampling_factor=2, + zipformer_downsampling_factors=to_int_tuple( + params.zipformer_downsampling_factors + ), + encoder_dims=to_int_tuple(params.encoder_dims), + attention_dim=to_int_tuple(params.attention_dims), + encoder_unmasked_dims=to_int_tuple(params.encoder_unmasked_dims), + nhead=to_int_tuple(params.nhead), + feedforward_dim=to_int_tuple(params.feedforward_dims), + cnn_module_kernels=to_int_tuple(params.cnn_module_kernels), + num_encoder_layers=to_int_tuple(params.num_encoder_layers), + num_left_chunks=params.num_left_chunks, + short_chunk_size=params.short_chunk_size, + decode_chunk_size=params.decode_chunk_len // 2, + ) + return encoder + + +def get_joint_encoder_layer(params: AttributeDict) -> nn.Module: + class TakeFirst(nn.Module): + def forward(self, x): + return x[0] + + if params.use_joint_encoder_layer == "linear": + encoder_dim = int(params.encoder_dims.split(",")[-1]) + joint_layer = nn.Sequential( + Rearrange("(c b) t d -> b t (c d)", c=params.num_channels), + nn.Linear( + params.num_channels * encoder_dim, params.num_channels * encoder_dim + ), + nn.ReLU(), + Rearrange("b t (c d) -> (c b) t d", c=params.num_channels), + ) + elif params.use_joint_encoder_layer == "lstm": + encoder_dim = int(params.encoder_dims.split(",")[-1]) + joint_layer = nn.Sequential( + Rearrange("(c b) t d -> b t (c d)", c=params.num_channels), + ScaledLSTM( + input_size=params.num_channels * encoder_dim, + hidden_size=params.num_channels * encoder_dim, + num_layers=1, + bias=True, + batch_first=True, + dropout=0.0, + bidirectional=False, + ), + TakeFirst(), + nn.ReLU(), + Rearrange("b t (c d) -> (c b) t d", c=params.num_channels), + ) + elif params.use_joint_encoder_layer == "none": + joint_layer = None + else: + raise ValueError( + f"Unknown joint encoder layer type: {params.use_joint_encoder_layer}" + ) + return joint_layer + + +def get_decoder_model(params: AttributeDict) -> nn.Module: + decoder = Decoder( + vocab_size=params.vocab_size, + decoder_dim=params.decoder_dim, + blank_id=params.blank_id, + context_size=params.context_size, + ) + return decoder + + +def get_joiner_model(params: AttributeDict) -> nn.Module: + joiner = Joiner( + 
encoder_dim=int(params.encoder_dims.split(",")[-1]), + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return joiner + + +def get_surt_model( + params: AttributeDict, +) -> nn.Module: + mask_encoder = get_mask_encoder_model(params) + encoder = get_encoder_model(params) + joint_layer = get_joint_encoder_layer(params) + decoder = get_decoder_model(params) + joiner = get_joiner_model(params) + + model = SURT( + mask_encoder=mask_encoder, + encoder=encoder, + joint_encoder_layer=joint_layer, + decoder=decoder, + joiner=joiner, + num_channels=params.num_channels, + encoder_dim=int(params.encoder_dims.split(",")[-1]), + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return model + + +def load_checkpoint_if_available( + params: AttributeDict, + model: nn.Module, + model_avg: nn.Module = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, +) -> Optional[Dict[str, Any]]: + """Load checkpoint from file. + + If params.start_batch is positive, it will load the checkpoint from + `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if + params.start_epoch is larger than 1, it will load the checkpoint from + `params.start_epoch - 1`. + + Apart from loading state dict for `model` and `optimizer` it also updates + `best_train_epoch`, `best_train_loss`, `best_valid_epoch`, + and `best_valid_loss` in `params`. + + Args: + params: + The return value of :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer that we are using. + scheduler: + The scheduler that we are using. + Returns: + Return a dict containing previously saved training info. + """ + if params.start_batch > 0: + filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt" + elif params.start_epoch > 1: + filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" + else: + return None + + assert filename.is_file(), f"{filename} does not exist!" + + saved_params = load_checkpoint( + filename, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + ) + + keys = [ + "best_train_epoch", + "best_valid_epoch", + "batch_idx_train", + "best_train_loss", + "best_valid_loss", + ] + for k in keys: + params[k] = saved_params[k] + + if params.start_batch > 0: + if "cur_epoch" in saved_params: + params["start_epoch"] = saved_params["cur_epoch"] + + return saved_params + + +def save_checkpoint( + params: AttributeDict, + model: Union[nn.Module, DDP], + model_avg: Optional[nn.Module] = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, + sampler: Optional[CutSampler] = None, + scaler: Optional[GradScaler] = None, + rank: int = 0, +) -> None: + """Save model, optimizer, scheduler and training stats to file. + + Args: + params: + It is returned by :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer used in the training. + sampler: + The sampler for the training dataset. + scaler: + The scaler used for mix precision training. 
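+      scheduler:
+        The scheduler that we are using in the training.
+      rank:
+        The rank of the current node in DDP training. Only the node with
+        rank 0 actually saves a checkpoint.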
+ """ + if rank != 0: + return + filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt" + save_checkpoint_impl( + filename=filename, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=sampler, + scaler=scaler, + rank=rank, + ) + + if params.best_train_epoch == params.cur_epoch: + best_train_filename = params.exp_dir / "best-train-loss.pt" + copyfile(src=filename, dst=best_train_filename) + + if params.best_valid_epoch == params.cur_epoch: + best_valid_filename = params.exp_dir / "best-valid-loss.pt" + copyfile(src=filename, dst=best_valid_filename) + + +def compute_heat_loss(x_masked, batch, num_channels=2) -> Tensor: + """ + Compute HEAT loss for separated sources using the output of mask encoder. + Args: + x_masked: + The output of mask encoder. It is a tensor of shape (B, T, C). + batch: + A batch of data. See `lhotse.dataset.K2SurtDatasetWithSources()` + for the content in it. + num_channels: + The number of output branches in the SURT model. + """ + B, T, D = x_masked[0].shape + device = x_masked[0].device + + # Create training targets for each channel. + targets = [] + for i in range(num_channels): + target = torch.ones_like(x_masked[i]) * LOG_EPSILON + targets.append(target) + + source_feats = batch["source_feats"] + source_boundaries = batch["source_boundaries"] + input_lens = batch["input_lens"].to(device) + # Assign sources to channels based on the HEAT criteria + for b in range(B): + cut_source_feats = source_feats[b] + cut_source_boundaries = source_boundaries[b] + last_seg_end = [0 for _ in range(num_channels)] + for source_feat, (start, end) in zip(cut_source_feats, cut_source_boundaries): + assigned = False + for i in range(num_channels): + if start >= last_seg_end[i]: + targets[i][b, start:end, :] += source_feat.to(device) + last_seg_end[i] = max(end, last_seg_end[i]) + assigned = True + break + if not assigned: + min_end_channel = last_seg_end.index(min(last_seg_end)) + targets[min_end_channel][b, start:end, :] += source_feat + last_seg_end[min_end_channel] = max(end, last_seg_end[min_end_channel]) + + # Get padding mask based on input lengths + pad_mask = torch.arange(T, device=device).expand(B, T) > input_lens.unsqueeze(1) + pad_mask = pad_mask.unsqueeze(-1) + + # Compute masked loss for each channel + losses = torch.zeros((num_channels, B, T, D), device=device) + for i in range(num_channels): + loss = nn.functional.mse_loss(x_masked[i], targets[i], reduction="none") + # Apply padding mask to loss + loss.masked_fill_(pad_mask, 0) + losses[i] = loss + + # loss: C x B x T x D. pad_mask: B x T x 1 + # We want to compute loss for each item in the batch. Each item has loss given + # by the sum over C, and average over T and D. For T, we need to use the padding. + loss = losses.sum(0).mean(-1).sum(-1) / batch["input_lens"].to(device) + return loss + + +def compute_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + batch: dict, + is_training: bool, +) -> Tuple[Tensor, MetricsTracker]: + """ + Compute RNN-T loss given the model and its inputs. + + Args: + params: + Parameters for training. See :func:`get_params`. + model: + The model for training. It is an instance of Conformer in our case. + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + is_training: + True for training. False for validation. When it is True, this + function enables autograd during computation; when it is False, it + disables autograd. 
+ """ + device = model.device if isinstance(model, DDP) else next(model.parameters()).device + feature = batch["inputs"].to(device) + feature_lens = batch["input_lens"].to(device) + + # at entry, feature is (N, T, C) + assert feature.ndim == 3 + + # The dataloader returns text as a list of cuts, each of which is a list of channel + # text. We flatten this to a list where all channels are together, i.e., it looks like + # [utt1_ch1, utt2_ch1, ..., uttN_ch1, utt1_ch2, ...., uttN,ch2]. + text = [val for tup in zip(*batch["text"]) for val in tup] + assert len(text) == len(feature) * params.num_channels + + # Convert all channel texts to token IDs and create a ragged tensor. + y = sp.encode(text, out_type=int) + y = k2.RaggedTensor(y).to(device) + + batch_idx_train = params.batch_idx_train + warm_step = params.model_warm_step + + with torch.set_grad_enabled(is_training): + (simple_loss, pruned_loss, ctc_loss, x_masked) = model( + x=feature, + x_lens=feature_lens, + y=y, + prune_range=params.prune_range, + am_scale=params.am_scale, + lm_scale=params.lm_scale, + reduction="none", + subsampling_factor=params.subsampling_factor, + ) + simple_loss_is_finite = torch.isfinite(simple_loss) + pruned_loss_is_finite = torch.isfinite(pruned_loss) + ctc_loss_is_finite = torch.isfinite(ctc_loss) + + # Compute HEAT loss + if is_training and params.heat_loss_scale > 0.0: + heat_loss = compute_heat_loss( + x_masked, batch, num_channels=params.num_channels + ) + else: + heat_loss = torch.tensor(0.0, device=device) + + heat_loss_is_finite = torch.isfinite(heat_loss) + is_finite = ( + simple_loss_is_finite + & pruned_loss_is_finite + & ctc_loss_is_finite + & heat_loss_is_finite + ) + if not torch.all(is_finite): + logging.info( + "Not all losses are finite!\n" + f"simple_losses: {simple_loss}\n" + f"pruned_losses: {pruned_loss}\n" + f"ctc_losses: {ctc_loss}\n" + f"heat_losses: {heat_loss}\n" + ) + display_and_save_batch(batch, params=params, sp=sp) + simple_loss = simple_loss[simple_loss_is_finite] + pruned_loss = pruned_loss[pruned_loss_is_finite] + ctc_loss = ctc_loss[ctc_loss_is_finite] + heat_loss = heat_loss[heat_loss_is_finite] + + # If either all simple_loss or pruned_loss is inf or nan, + # we stop the training process by raising an exception + if ( + torch.all(~simple_loss_is_finite) + or torch.all(~pruned_loss_is_finite) + or torch.all(~ctc_loss_is_finite) + or torch.all(~heat_loss_is_finite) + ): + raise ValueError( + "There are too many utterances in this batch " + "leading to inf or nan losses." + ) + + simple_loss_sum = simple_loss.sum() + pruned_loss_sum = pruned_loss.sum() + ctc_loss_sum = ctc_loss.sum() + heat_loss_sum = heat_loss.sum() + + s = params.simple_loss_scale + # take down the scale on the simple loss from 1.0 at the start + # to params.simple_loss scale by warm_step. 
+ simple_loss_scale = ( + s + if batch_idx_train >= warm_step + else 1.0 - (batch_idx_train / warm_step) * (1.0 - s) + ) + pruned_loss_scale = ( + 1.0 + if batch_idx_train >= warm_step + else 0.1 + 0.9 * (batch_idx_train / warm_step) + ) + loss = ( + simple_loss_scale * simple_loss_sum + + pruned_loss_scale * pruned_loss_sum + + params.ctc_loss_scale * ctc_loss_sum + + params.heat_loss_scale * heat_loss_sum + ) + + assert loss.requires_grad == is_training + + info = MetricsTracker() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # info["frames"] is an approximate number for two reasons: + # (1) The acutal subsampling factor is ((lens - 1) // 2 - 1) // 2 + # (2) If some utterances in the batch lead to inf/nan loss, they + # are filtered out. + info["frames"] = (feature_lens // params.subsampling_factor).sum().item() + + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances + info["utt_duration"] = feature_lens.sum().item() + # averaged padding proportion over utterances + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + + # Note: We use reduction=sum while computing the loss. + info["loss"] = loss.detach().cpu().item() + info["simple_loss"] = simple_loss_sum.detach().cpu().item() + info["pruned_loss"] = pruned_loss_sum.detach().cpu().item() + if params.ctc_loss_scale > 0.0: + info["ctc_loss"] = ctc_loss_sum.detach().cpu().item() + if params.heat_loss_scale > 0.0: + info["heat_loss"] = heat_loss_sum.detach().cpu().item() + + return loss, info + + +def compute_validation_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + valid_dl: torch.utils.data.DataLoader, + world_size: int = 1, +) -> MetricsTracker: + """Run the validation process.""" + model.eval() + + tot_loss = MetricsTracker() + + for batch_idx, batch in enumerate(valid_dl): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=False, + ) + assert loss.requires_grad is False + tot_loss = tot_loss + loss_info + + if world_size > 1: + tot_loss.reduce(loss.device) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + if loss_value < params.best_valid_loss: + params.best_valid_epoch = params.cur_epoch + params.best_valid_loss = loss_value + + return tot_loss + + +def train_one_epoch( + params: AttributeDict, + model: Union[nn.Module, DDP], + optimizer: torch.optim.Optimizer, + scheduler: LRSchedulerType, + sp: spm.SentencePieceProcessor, + train_dl: torch.utils.data.DataLoader, + train_dl_warmup: Optional[torch.utils.data.DataLoader], + valid_dl: torch.utils.data.DataLoader, + scaler: GradScaler, + model_avg: Optional[nn.Module] = None, + tb_writer: Optional[SummaryWriter] = None, + world_size: int = 1, + rank: int = 0, +) -> None: + """Train the model for one epoch. + + The training loss from the mean of all frames is saved in + `params.train_loss`. It runs the validation process every + `params.valid_interval` batches. + + Args: + params: + It is returned by :func:`get_params`. + model: + The model for training. + optimizer: + The optimizer we are using. + scheduler: + The learning rate scheduler, we call step() every step. + train_dl: + Dataloader for the training dataset. + train_dl_warmup: + Dataloader for the training dataset with 2 speakers. This is used during the + warmup stage. + valid_dl: + Dataloader for the validation dataset. 
+ scaler: + The scaler used for mix precision training. + model_avg: + The stored model averaged from the start of training. + tb_writer: + Writer to write log messages to tensorboard. + world_size: + Number of nodes in DDP training. If it is 1, DDP is disabled. + rank: + The rank of the node in DDP training. If no DDP is used, it should + be set to 0. + """ + torch.cuda.empty_cache() + model.train() + + tot_loss = MetricsTracker() + + iter_train = iter(train_dl) + iter_train_warmup = iter(train_dl_warmup) if train_dl_warmup is not None else None + + batch_idx = 0 + + while True: + # We first sample a batch from the main dataset. This is because we want to + # make sure all epochs have the same number of batches. + try: + batch = next(iter_train) + except StopIteration: + break + + # If we are in warmup stage, get the batch from the warmup dataset. + if ( + params.batch_idx_train <= params.model_warm_step + and iter_train_warmup is not None + ): + try: + batch = next(iter_train_warmup) + except StopIteration: + iter_train_warmup = iter(train_dl_warmup) + batch = next(iter_train_warmup) + + batch_idx += 1 + + params.batch_idx_train += 1 + batch_size = batch["inputs"].shape[0] + + try: + with torch.cuda.amp.autocast(enabled=params.use_fp16): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=True, + ) + # summary stats + tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info + + # NOTE: We use reduction==sum and loss is computed over utterances + # in the batch and there is no normalization to it so far. + scaler.scale(loss).backward() + set_batch_count(model, params.batch_idx_train) + scheduler.step_batch(params.batch_idx_train) + + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + except: # noqa + display_and_save_batch(batch, params=params, sp=sp) + raise + + if params.print_diagnostics and batch_idx == 5: + return + + if ( + rank == 0 + and params.batch_idx_train > 0 + and params.batch_idx_train % params.average_period == 0 + ): + update_averaged_model( + params=params, + model_cur=model, + model_avg=model_avg, + ) + + if ( + params.batch_idx_train > 0 + and params.batch_idx_train % params.save_every_n == 0 + ): + params.cur_batch_idx = batch_idx + save_checkpoint_with_global_batch_idx( + out_dir=params.exp_dir, + global_batch_idx=params.batch_idx_train, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + del params.cur_batch_idx + remove_checkpoints( + out_dir=params.exp_dir, + topk=params.keep_last_k, + rank=rank, + ) + + if batch_idx % 100 == 0 and params.use_fp16: + # If the grad scale was less than 1, try increasing it. The _growth_interval + # of the grad scaler is configurable, but we can't configure it to have different + # behavior depending on the current grad scale. 
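+                # The rule below doubles the scale whenever it is below 1.0
+                # (this block runs every 100 batches) or below 8.0 (only every
+                # 400 batches), and aborts training if it collapses below 1e-5.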
+ cur_grad_scale = scaler._scale.item() + if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0): + scaler.update(cur_grad_scale * 2.0) + if cur_grad_scale < 0.01: + logging.warning(f"Grad scale is small: {cur_grad_scale}") + if cur_grad_scale < 1.0e-05: + raise RuntimeError( + f"grad_scale is too small, exiting: {cur_grad_scale}" + ) + + if batch_idx % params.log_interval == 0: + cur_lr = scheduler.get_last_lr()[0] + cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0 + + logging.info( + f"Epoch {params.cur_epoch}, " + f"batch {batch_idx}, loss[{loss_info}], " + f"tot_loss[{tot_loss}], batch size: {batch_size}, " + f"lr: {cur_lr:.2e}, " + + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "") + ) + + if tb_writer is not None: + tb_writer.add_scalar( + "train/learning_rate", cur_lr, params.batch_idx_train + ) + + loss_info.write_summary( + tb_writer, "train/current_", params.batch_idx_train + ) + tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train) + if params.use_fp16: + tb_writer.add_scalar( + "train/grad_scale", cur_grad_scale, params.batch_idx_train + ) + + if batch_idx % params.valid_interval == 0 and not params.print_diagnostics: + logging.info("Computing validation loss") + valid_info = compute_validation_loss( + params=params, + model=model, + sp=sp, + valid_dl=valid_dl, + world_size=world_size, + ) + model.train() + logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}") + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) + if tb_writer is not None: + valid_info.write_summary( + tb_writer, "train/valid_", params.batch_idx_train + ) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + params.train_loss = loss_value + if params.train_loss < params.best_train_loss: + params.best_train_epoch = params.cur_epoch + params.best_train_loss = params.train_loss + + +def run(rank, world_size, args): + """ + Args: + rank: + It is a value between 0 and `world_size-1`, which is + passed automatically by `mp.spawn()` in :func:`main`. + The node with rank 0 is responsible for saving checkpoint. + world_size: + Number of GPUs for DDP training. 
+ args: + The return value of get_parser().parse_args() + """ + params = get_params() + params.update(vars(args)) + + fix_random_seed(params.seed) + if world_size > 1: + setup_dist(rank, world_size, params.master_port) + + setup_logger(f"{params.exp_dir}/log/log-train") + logging.info("Training started") + + if args.tensorboard and rank == 0: + tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard") + else: + tb_writer = None + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", rank) + logging.info(f"Device: {device}") + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # is defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("") + params.vocab_size = sp.get_piece_size() + + logging.info(params) + + logging.info("About to create model") + model = get_surt_model(params) + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + assert params.save_every_n >= params.average_period + model_avg: Optional[nn.Module] = None + if rank == 0: + # model_avg is only used with rank 0 + model_avg = copy.deepcopy(model) + + assert params.start_epoch > 0, params.start_epoch + checkpoints = load_checkpoint_if_available( + params=params, model=model, model_avg=model_avg + ) + + model.to(device) + + if checkpoints is None and params.model_init_ckpt is not None: + logging.info( + f"Initializing model with checkpoint from {params.model_init_ckpt}" + ) + init_ckpt = torch.load(params.model_init_ckpt, map_location=device) + model.load_state_dict(init_ckpt["model"], strict=False) + + if world_size > 1: + logging.info("Using DDP") + model = DDP(model, device_ids=[rank], find_unused_parameters=True) + + parameters_names = [] + parameters_names.append( + [name_param_pair[0] for name_param_pair in model.named_parameters()] + ) + optimizer = ScaledAdam( + model.parameters(), + lr=params.base_lr, + clipping_scale=2.0, + parameters_names=parameters_names, + ) + + scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs) + + if checkpoints and "optimizer" in checkpoints: + logging.info("Loading optimizer state dict") + optimizer.load_state_dict(checkpoints["optimizer"]) + + if ( + checkpoints + and "scheduler" in checkpoints + and checkpoints["scheduler"] is not None + ): + logging.info("Loading scheduler state dict") + scheduler.load_state_dict(checkpoints["scheduler"]) + + if params.print_diagnostics: + diagnostic = diagnostics.attach_diagnostics(model) + + libricss = LibriCssAsrDataModule(args) + + train_cuts = libricss.lsmix_cuts(rvb_affix="comb", type_affix="full", sources=True) + train_cuts_ov40 = libricss.lsmix_cuts( + rvb_affix="comb", type_affix="ov40", sources=True + ) + dev_cuts = libricss.libricss_cuts(split="dev", type="sdm") + + if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: + # We only load the sampler's state dict when it loads a checkpoint + # saved in the middle of an epoch + sampler_state_dict = checkpoints["sampler"] + else: + sampler_state_dict = None + + train_dl = libricss.train_dataloaders( + train_cuts, + sampler_state_dict=sampler_state_dict, + ) + train_dl_ov40 = libricss.train_dataloaders(train_cuts_ov40) + valid_dl = libricss.valid_dataloaders(dev_cuts) + + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) + if checkpoints and "grad_scaler" in checkpoints: + logging.info("Loading grad scaler state dict") + scaler.load_state_dict(checkpoints["grad_scaler"]) + + for epoch in 
range(params.start_epoch, params.num_epochs + 1): + scheduler.step_epoch(epoch - 1) + fix_random_seed(params.seed + epoch - 1) + train_dl.sampler.set_epoch(epoch - 1) + + if tb_writer is not None: + tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train) + + params.cur_epoch = epoch + + train_one_epoch( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sp=sp, + train_dl=train_dl, + train_dl_warmup=train_dl_ov40, + valid_dl=valid_dl, + scaler=scaler, + tb_writer=tb_writer, + world_size=world_size, + rank=rank, + ) + + if params.print_diagnostics: + diagnostic.print_diagnostics() + break + + save_checkpoint( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + + logging.info("Done!") + + if world_size > 1: + torch.distributed.barrier() + cleanup_dist() + + +def display_and_save_batch( + batch: dict, + params: AttributeDict, + sp: spm.SentencePieceProcessor, +) -> None: + """Display the batch statistics and save the batch into disk. + + Args: + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + params: + Parameters for training. See :func:`get_params`. + sp: + The BPE model. + """ + from lhotse.utils import uuid4 + + filename = f"{params.exp_dir}/batch-{uuid4()}.pt" + logging.info(f"Saving batch to {filename}") + torch.save(batch, filename) + + features = batch["inputs"] + + logging.info(f"features shape: {features.shape}") + + y = [sp.encode(text_ch) for text_ch in batch["text"]] + num_tokens = [sum(len(yi) for yi in y_ch) for y_ch in y] + logging.info(f"num tokens: {num_tokens}") + + +def main(): + parser = get_parser() + LibriCssAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + world_size = args.world_size + assert world_size >= 1 + if world_size > 1: + mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True) + else: + run(rank=0, world_size=1, args=args) + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch.multiprocessing.set_sharing_strategy("file_system") + +if __name__ == "__main__": + main() diff --git a/egs/libricss/SURT/dprnn_zipformer/train_adapt.py b/egs/libricss/SURT/dprnn_zipformer/train_adapt.py new file mode 100755 index 000000000..1c1b0c28c --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/train_adapt.py @@ -0,0 +1,1343 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, +# Wei Kang, +# Mingshuang Luo,) +# Zengwei Yao) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
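+#
+# Note: this script mirrors ./dprnn_zipformer/train.py but is meant for
+# adapting an already trained SURT model (e.g., one passed via
+# --model-init-ckpt); its defaults are scaled down accordingly
+# (--base-lr 0.0004, --lr-epochs 2, --num-epochs 15).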
+""" +Usage: + +export CUDA_VISIBLE_DEVICES=0 + +./dprnn_zipformer/train.py \ + --world-size 1 \ + --num-epochs 15 \ + --start-epoch 1 \ + --exp-dir dprnn_zipformer/exp \ + --max-duration 300 + +# For mix precision training: + +./dprnn_zipformer/train.py \ + --world-size 1 \ + --num-epochs 30 \ + --start-epoch 1 \ + --use-fp16 1 \ + --exp-dir dprnn_zipformer/exp \ + --max-duration 550 +""" + +import argparse +import copy +import logging +import warnings +from itertools import chain +from pathlib import Path +from shutil import copyfile +from typing import Any, Dict, Optional, Tuple, Union + +import k2 +import optim +import sentencepiece as spm +import torch +import torch.multiprocessing as mp +import torch.nn as nn +from asr_datamodule import LibriCssAsrDataModule +from decoder import Decoder +from dprnn import DPRNN +from einops.layers.torch import Rearrange +from joiner import Joiner +from lhotse.cut import Cut +from lhotse.dataset.sampling.base import CutSampler +from lhotse.utils import LOG_EPSILON, fix_random_seed +from model import SURT +from optim import Eden, ScaledAdam +from scaling import ScaledLinear, ScaledLSTM +from torch import Tensor +from torch.cuda.amp import GradScaler +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from zipformer import Zipformer + +from icefall import diagnostics +from icefall.checkpoint import load_checkpoint, remove_checkpoints +from icefall.checkpoint import save_checkpoint as save_checkpoint_impl +from icefall.checkpoint import ( + save_checkpoint_with_global_batch_idx, + update_averaged_model, +) +from icefall.dist import cleanup_dist, setup_dist +from icefall.env import get_env_info +from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool + +LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler] + + +def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None: + if isinstance(model, DDP): + # get underlying nn.Module + model = model.module + for module in model.modules(): + if hasattr(module, "batch_count"): + module.batch_count = batch_count + + +def add_model_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "--num-mask-encoder-layers", + type=int, + default=4, + help="Number of layers in the DPRNN based mask encoder.", + ) + + parser.add_argument( + "--mask-encoder-dim", + type=int, + default=256, + help="Hidden dimension of the LSTM blocks in DPRNN.", + ) + + parser.add_argument( + "--mask-encoder-segment-size", + type=int, + default=32, + help="Segment size of the SegLSTM in DPRNN. 
Ideally, this should be equal to the " + "decode-chunk-length of the zipformer encoder.", + ) + + parser.add_argument( + "--chunk-width-randomization", + type=bool, + default=False, + help="Whether to randomize the chunk width in DPRNN.", + ) + + # Zipformer config is based on: + # https://github.com/k2-fsa/icefall/pull/745#issuecomment-1405282740 + parser.add_argument( + "--num-encoder-layers", + type=str, + default="2,2,2,2,2", + help="Number of zipformer encoder layers, comma separated.", + ) + + parser.add_argument( + "--feedforward-dims", + type=str, + default="768,768,768,768,768", + help="Feedforward dimension of the zipformer encoder layers, comma separated.", + ) + + parser.add_argument( + "--nhead", + type=str, + default="8,8,8,8,8", + help="Number of attention heads in the zipformer encoder layers.", + ) + + parser.add_argument( + "--encoder-dims", + type=str, + default="256,256,256,256,256", + help="Embedding dimension in the 2 blocks of zipformer encoder layers, comma separated", + ) + + parser.add_argument( + "--attention-dims", + type=str, + default="192,192,192,192,192", + help="""Attention dimension in the 2 blocks of zipformer encoder layers, comma separated; + not the same as embedding dimension.""", + ) + + parser.add_argument( + "--encoder-unmasked-dims", + type=str, + default="192,192,192,192,192", + help="Unmasked dimensions in the encoders, relates to augmentation during training. " + "Must be <= each of encoder_dims. Empirically, less than 256 seems to make performance " + " worse.", + ) + + parser.add_argument( + "--zipformer-downsampling-factors", + type=str, + default="1,2,4,8,2", + help="Downsampling factor for each stack of encoder layers.", + ) + + parser.add_argument( + "--cnn-module-kernels", + type=str, + default="31,31,31,31,31", + help="Sizes of kernels in convolution modules", + ) + + parser.add_argument( + "--use-joint-encoder-layer", + type=str, + default="lstm", + choices=["linear", "lstm", "none"], + help="Whether to use a joint layer to combine all branches.", + ) + + parser.add_argument( + "--decoder-dim", + type=int, + default=512, + help="Embedding dimension in the decoder model.", + ) + + parser.add_argument( + "--joiner-dim", + type=int, + default=512, + help="""Dimension used in the joiner model. + Outputs from the encoder and decoder model are projected + to this dimension before adding. + """, + ) + + parser.add_argument( + "--short-chunk-size", + type=int, + default=50, + help="""Chunk length of dynamic training, the chunk size would be either + max sequence length of current batch or uniformly sampled from (1, short_chunk_size). 
+ """, + ) + + parser.add_argument( + "--num-left-chunks", + type=int, + default=4, + help="How many left context can be seen in chunks when calculating attention.", + ) + + parser.add_argument( + "--decode-chunk-len", + type=int, + default=32, + help="The chunk size for decoding (in frames before subsampling)", + ) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--world-size", + type=int, + default=1, + help="Number of GPUs for DDP training.", + ) + + parser.add_argument( + "--master-port", + type=int, + default=12354, + help="Master port to use for DDP training.", + ) + + parser.add_argument( + "--tensorboard", + type=str2bool, + default=True, + help="Should various information be logged in tensorboard.", + ) + + parser.add_argument( + "--num-epochs", + type=int, + default=15, + help="Number of epochs to train.", + ) + + parser.add_argument( + "--start-epoch", + type=int, + default=1, + help="""Resume training from this epoch. It should be positive. + If larger than 1, it will load checkpoint from + exp-dir/epoch-{start_epoch-1}.pt + """, + ) + + parser.add_argument( + "--start-batch", + type=int, + default=0, + help="""If positive, --start-epoch is ignored and + it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt + """, + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="conv_lstm_transducer_stateless_ctc/exp", + help="""The experiment dir. + It specifies the directory where all training related + files, e.g., checkpoints, log, etc, are saved + """, + ) + + parser.add_argument( + "--model-init-ckpt", + type=str, + default=None, + help="""The model checkpoint to initialize the model (either full or part). + If not specified, the model is randomly initialized. + """, + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--base-lr", type=float, default=0.0004, help="The base learning rate." + ) + + parser.add_argument( + "--lr-batches", + type=float, + default=1000, + help="""Number of steps that affects how rapidly the learning rate + decreases. We suggest not to change this.""", + ) + + parser.add_argument( + "--lr-epochs", + type=float, + default=2, + help="""Number of epochs that affects how rapidly the learning rate decreases. + """, + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; 2 means tri-gram", + ) + + parser.add_argument( + "--prune-range", + type=int, + default=5, + help="The prune range for rnnt loss, it means how many symbols(context)" + "we are using to compute the loss", + ) + + parser.add_argument( + "--lm-scale", + type=float, + default=0.25, + help="The scale to smooth the loss with lm " + "(output of prediction network) part.", + ) + + parser.add_argument( + "--am-scale", + type=float, + default=0.0, + help="The scale to smooth the loss with am (output of encoder network) part.", + ) + + parser.add_argument( + "--simple-loss-scale", + type=float, + default=0.5, + help="To get pruning ranges, we will calculate a simple version" + "loss(joiner is just addition), this simple loss also uses for" + "training (as a regularization item). 
We will scale the simple loss" + "with this parameter before adding to the final loss.", + ) + + parser.add_argument( + "--ctc-loss-scale", + type=float, + default=0.2, + help="Scale for CTC loss.", + ) + + parser.add_argument( + "--seed", + type=int, + default=42, + help="The seed for random generators intended for reproducibility", + ) + + parser.add_argument( + "--print-diagnostics", + type=str2bool, + default=False, + help="Accumulate stats on activations, print them and exit.", + ) + + parser.add_argument( + "--save-every-n", + type=int, + default=1000, + help="""Save checkpoint after processing this number of batches" + periodically. We save checkpoint to exp-dir/ whenever + params.batch_idx_train % save_every_n == 0. The checkpoint filename + has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt' + Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the + end of each epoch where `xxx` is the epoch number counting from 0. + """, + ) + + parser.add_argument( + "--keep-last-k", + type=int, + default=5, + help="""Only keep this number of checkpoints on disk. + For instance, if it is 3, there are only 3 checkpoints + in the exp-dir with filenames `checkpoint-xxx.pt`. + It does not affect checkpoints with name `epoch-xxx.pt`. + """, + ) + + parser.add_argument( + "--average-period", + type=int, + default=100, + help="""Update the averaged model, namely `model_avg`, after processing + this number of batches. `model_avg` is a separate version of model, + in which each floating-point parameter is the average of all the + parameters from the start of training. Each time we take the average, + we do: `model_avg = model * (average_period / batch_idx_train) + + model_avg * ((batch_idx_train - average_period) / batch_idx_train)`. + """, + ) + + parser.add_argument( + "--use-fp16", + type=str2bool, + default=False, + help="Whether to use half precision training.", + ) + + add_model_arguments(parser) + + return parser + + +def get_params() -> AttributeDict: + """Return a dict containing training parameters. + + All training related parameters that are not passed from the commandline + are saved in the variable `params`. + + Commandline options are merged into `params` after they are parsed, so + you can also access them via `params`. + + Explanation of options saved in `params`: + + - best_train_loss: Best training loss so far. It is used to select + the model that has the lowest training loss. It is + updated during the training. + + - best_valid_loss: Best validation loss so far. It is used to select + the model that has the lowest validation loss. It is + updated during the training. + + - best_train_epoch: It is the epoch that has the best training loss. + + - best_valid_epoch: It is the epoch that has the best validation loss. + + - batch_idx_train: Used to writing statistics to tensorboard. It + contains number of batches trained so far across + epochs. + + - log_interval: Print training loss if batch_idx % log_interval` is 0 + + - reset_interval: Reset statistics if batch_idx % reset_interval is 0 + + - valid_interval: Run validation if batch_idx % valid_interval is 0 + + - feature_dim: The model input dim. It has to match the one used + in computing features. + + - subsampling_factor: The subsampling factor for the model. + + - num_decoder_layers: Number of decoder layer of transformer decoder. + + - warm_step: The warm_step for Noam optimizer. 
+ """ + params = AttributeDict( + { + "best_train_loss": float("inf"), + "best_valid_loss": float("inf"), + "best_train_epoch": -1, + "best_valid_epoch": -1, + "batch_idx_train": 0, + "log_interval": 10, + "reset_interval": 200, + "valid_interval": 100, + # parameters for SURT + "num_channels": 2, + "feature_dim": 80, + "subsampling_factor": 4, # not passed in, this is fixed + # parameters for Noam + "model_warm_step": 5000, # arg given to model, not for lrate + # parameters for ctc loss + "beam_size": 10, + "use_double_scores": True, + "env_info": get_env_info(), + } + ) + + return params + + +def get_mask_encoder_model(params: AttributeDict) -> nn.Module: + mask_encoder = DPRNN( + feature_dim=params.feature_dim, + input_size=params.mask_encoder_dim, + hidden_size=params.mask_encoder_dim, + output_size=params.feature_dim * params.num_channels, + segment_size=params.mask_encoder_segment_size, + num_blocks=params.num_mask_encoder_layers, + chunk_width_randomization=params.chunk_width_randomization, + ) + return mask_encoder + + +def get_encoder_model(params: AttributeDict) -> nn.Module: + # TODO: We can add an option to switch between Zipformer and Transformer + def to_int_tuple(s: str): + return tuple(map(int, s.split(","))) + + encoder = Zipformer( + num_features=params.feature_dim, + output_downsampling_factor=2, + zipformer_downsampling_factors=to_int_tuple( + params.zipformer_downsampling_factors + ), + encoder_dims=to_int_tuple(params.encoder_dims), + attention_dim=to_int_tuple(params.attention_dims), + encoder_unmasked_dims=to_int_tuple(params.encoder_unmasked_dims), + nhead=to_int_tuple(params.nhead), + feedforward_dim=to_int_tuple(params.feedforward_dims), + cnn_module_kernels=to_int_tuple(params.cnn_module_kernels), + num_encoder_layers=to_int_tuple(params.num_encoder_layers), + num_left_chunks=params.num_left_chunks, + short_chunk_size=params.short_chunk_size, + decode_chunk_size=params.decode_chunk_len // 2, + ) + return encoder + + +def get_joint_encoder_layer(params: AttributeDict) -> nn.Module: + class TakeFirst(nn.Module): + def forward(self, x): + return x[0] + + if params.use_joint_encoder_layer == "linear": + encoder_dim = int(params.encoder_dims.split(",")[-1]) + joint_layer = nn.Sequential( + Rearrange("(c b) t d -> b t (c d)", c=params.num_channels), + nn.Linear( + params.num_channels * encoder_dim, params.num_channels * encoder_dim + ), + nn.ReLU(), + Rearrange("b t (c d) -> (c b) t d", c=params.num_channels), + ) + elif params.use_joint_encoder_layer == "lstm": + encoder_dim = int(params.encoder_dims.split(",")[-1]) + joint_layer = nn.Sequential( + Rearrange("(c b) t d -> b t (c d)", c=params.num_channels), + ScaledLSTM( + input_size=params.num_channels * encoder_dim, + hidden_size=params.num_channels * encoder_dim, + num_layers=1, + bias=True, + batch_first=True, + dropout=0.0, + bidirectional=False, + ), + TakeFirst(), + nn.ReLU(), + Rearrange("b t (c d) -> (c b) t d", c=params.num_channels), + ) + elif params.use_joint_encoder_layer == "none": + joint_layer = None + else: + raise ValueError( + f"Unknown joint encoder layer type: {params.use_joint_encoder_layer}" + ) + return joint_layer + + +def get_decoder_model(params: AttributeDict) -> nn.Module: + decoder = Decoder( + vocab_size=params.vocab_size, + decoder_dim=params.decoder_dim, + blank_id=params.blank_id, + context_size=params.context_size, + ) + return decoder + + +def get_joiner_model(params: AttributeDict) -> nn.Module: + joiner = Joiner( + encoder_dim=int(params.encoder_dims.split(",")[-1]), + 
decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return joiner + + +def get_surt_model( + params: AttributeDict, +) -> nn.Module: + mask_encoder = get_mask_encoder_model(params) + encoder = get_encoder_model(params) + joint_layer = get_joint_encoder_layer(params) + decoder = get_decoder_model(params) + joiner = get_joiner_model(params) + + model = SURT( + mask_encoder=mask_encoder, + encoder=encoder, + joint_encoder_layer=joint_layer, + decoder=decoder, + joiner=joiner, + num_channels=params.num_channels, + encoder_dim=int(params.encoder_dims.split(",")[-1]), + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return model + + +def load_checkpoint_if_available( + params: AttributeDict, + model: nn.Module, + model_avg: nn.Module = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, +) -> Optional[Dict[str, Any]]: + """Load checkpoint from file. + + If params.start_batch is positive, it will load the checkpoint from + `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if + params.start_epoch is larger than 1, it will load the checkpoint from + `params.start_epoch - 1`. + + Apart from loading state dict for `model` and `optimizer` it also updates + `best_train_epoch`, `best_train_loss`, `best_valid_epoch`, + and `best_valid_loss` in `params`. + + Args: + params: + The return value of :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer that we are using. + scheduler: + The scheduler that we are using. + Returns: + Return a dict containing previously saved training info. + """ + if params.start_batch > 0: + filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt" + elif params.start_epoch > 1: + filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" + else: + return None + + assert filename.is_file(), f"{filename} does not exist!" + + saved_params = load_checkpoint( + filename, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + ) + + keys = [ + "best_train_epoch", + "best_valid_epoch", + "batch_idx_train", + "best_train_loss", + "best_valid_loss", + ] + for k in keys: + params[k] = saved_params[k] + + if params.start_batch > 0: + if "cur_epoch" in saved_params: + params["start_epoch"] = saved_params["cur_epoch"] + + return saved_params + + +def save_checkpoint( + params: AttributeDict, + model: Union[nn.Module, DDP], + model_avg: Optional[nn.Module] = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, + sampler: Optional[CutSampler] = None, + scaler: Optional[GradScaler] = None, + rank: int = 0, +) -> None: + """Save model, optimizer, scheduler and training stats to file. + + Args: + params: + It is returned by :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer used in the training. + sampler: + The sampler for the training dataset. + scaler: + The scaler used for mix precision training. 
+ """ + if rank != 0: + return + filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt" + save_checkpoint_impl( + filename=filename, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=sampler, + scaler=scaler, + rank=rank, + ) + + if params.best_train_epoch == params.cur_epoch: + best_train_filename = params.exp_dir / "best-train-loss.pt" + copyfile(src=filename, dst=best_train_filename) + + if params.best_valid_epoch == params.cur_epoch: + best_valid_filename = params.exp_dir / "best-valid-loss.pt" + copyfile(src=filename, dst=best_valid_filename) + + +def compute_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + batch: dict, + is_training: bool, +) -> Tuple[Tensor, MetricsTracker]: + """ + Compute RNN-T loss given the model and its inputs. + + Args: + params: + Parameters for training. See :func:`get_params`. + model: + The model for training. It is an instance of Conformer in our case. + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + is_training: + True for training. False for validation. When it is True, this + function enables autograd during computation; when it is False, it + disables autograd. + """ + device = model.device if isinstance(model, DDP) else next(model.parameters()).device + feature = batch["inputs"].to(device) + feature_lens = batch["input_lens"].to(device) + + # at entry, feature is (N, T, C) + assert feature.ndim == 3 + + # The dataloader returns text as a list of cuts, each of which is a list of channel + # text. We flatten this to a list where all channels are together, i.e., it looks like + # [utt1_ch1, utt2_ch1, ..., uttN_ch1, utt1_ch2, ...., uttN,ch2]. + text = [val for tup in zip(*batch["text"]) for val in tup] + assert len(text) == len(feature) * params.num_channels + + # Convert all channel texts to token IDs and create a ragged tensor. + y = sp.encode(text, out_type=int) + y = k2.RaggedTensor(y).to(device) + + batch_idx_train = params.batch_idx_train + warm_step = params.model_warm_step + + with torch.set_grad_enabled(is_training): + (simple_loss, pruned_loss, ctc_loss, x_masked) = model( + x=feature, + x_lens=feature_lens, + y=y, + prune_range=params.prune_range, + am_scale=params.am_scale, + lm_scale=params.lm_scale, + reduction="none", + subsampling_factor=params.subsampling_factor, + ) + simple_loss_is_finite = torch.isfinite(simple_loss) + pruned_loss_is_finite = torch.isfinite(pruned_loss) + ctc_loss_is_finite = torch.isfinite(ctc_loss) + + is_finite = simple_loss_is_finite & pruned_loss_is_finite & ctc_loss_is_finite + if not torch.all(is_finite): + logging.info( + "Not all losses are finite!\n" + f"simple_losses: {simple_loss}\n" + f"pruned_losses: {pruned_loss}\n" + f"ctc_losses: {ctc_loss}\n" + ) + display_and_save_batch(batch, params=params, sp=sp) + simple_loss = simple_loss[simple_loss_is_finite] + pruned_loss = pruned_loss[pruned_loss_is_finite] + ctc_loss = ctc_loss[ctc_loss_is_finite] + + # If either all simple_loss or pruned_loss is inf or nan, + # we stop the training process by raising an exception + if ( + torch.all(~simple_loss_is_finite) + or torch.all(~pruned_loss_is_finite) + or torch.all(~ctc_loss_is_finite) + ): + raise ValueError( + "There are too many utterances in this batch " + "leading to inf or nan losses." 
+ ) + + simple_loss_sum = simple_loss.sum() + pruned_loss_sum = pruned_loss.sum() + ctc_loss_sum = ctc_loss.sum() + + s = params.simple_loss_scale + # take down the scale on the simple loss from 1.0 at the start + # to params.simple_loss scale by warm_step. + simple_loss_scale = ( + s + if batch_idx_train >= warm_step + else 1.0 - (batch_idx_train / warm_step) * (1.0 - s) + ) + pruned_loss_scale = ( + 1.0 + if batch_idx_train >= warm_step + else 0.1 + 0.9 * (batch_idx_train / warm_step) + ) + loss = ( + simple_loss_scale * simple_loss_sum + + pruned_loss_scale * pruned_loss_sum + + params.ctc_loss_scale * ctc_loss_sum + ) + + assert loss.requires_grad == is_training + + info = MetricsTracker() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + # info["frames"] is an approximate number for two reasons: + # (1) The acutal subsampling factor is ((lens - 1) // 2 - 1) // 2 + # (2) If some utterances in the batch lead to inf/nan loss, they + # are filtered out. + info["frames"] = (feature_lens // params.subsampling_factor).sum().item() + + # `utt_duration` and `utt_pad_proportion` would be normalized by `utterances` # noqa + info["utterances"] = feature.size(0) + # averaged input duration in frames over utterances + info["utt_duration"] = feature_lens.sum().item() + # averaged padding proportion over utterances + info["utt_pad_proportion"] = ( + ((feature.size(1) - feature_lens) / feature.size(1)).sum().item() + ) + + # Note: We use reduction=sum while computing the loss. + info["loss"] = loss.detach().cpu().item() + info["simple_loss"] = simple_loss_sum.detach().cpu().item() + info["pruned_loss"] = pruned_loss_sum.detach().cpu().item() + if params.ctc_loss_scale > 0.0: + info["ctc_loss"] = ctc_loss_sum.detach().cpu().item() + + return loss, info + + +def compute_validation_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + valid_dl: torch.utils.data.DataLoader, + world_size: int = 1, +) -> MetricsTracker: + """Run the validation process.""" + model.eval() + + tot_loss = MetricsTracker() + + for batch_idx, batch in enumerate(valid_dl): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=False, + ) + assert loss.requires_grad is False + tot_loss = tot_loss + loss_info + + if world_size > 1: + tot_loss.reduce(loss.device) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + if loss_value < params.best_valid_loss: + params.best_valid_epoch = params.cur_epoch + params.best_valid_loss = loss_value + + return tot_loss + + +def train_one_epoch( + params: AttributeDict, + model: Union[nn.Module, DDP], + optimizer: torch.optim.Optimizer, + scheduler: LRSchedulerType, + sp: spm.SentencePieceProcessor, + train_dl: torch.utils.data.DataLoader, + valid_dl: torch.utils.data.DataLoader, + scaler: GradScaler, + model_avg: Optional[nn.Module] = None, + tb_writer: Optional[SummaryWriter] = None, + world_size: int = 1, + rank: int = 0, +) -> None: + """Train the model for one epoch. + + The training loss from the mean of all frames is saved in + `params.train_loss`. It runs the validation process every + `params.valid_interval` batches. + + Args: + params: + It is returned by :func:`get_params`. + model: + The model for training. + optimizer: + The optimizer we are using. + scheduler: + The learning rate scheduler, we call step() every step. + train_dl: + Dataloader for the training dataset. + train_dl_warmup: + Dataloader for the training dataset with 2 speakers. 
This is used during the + warmup stage. + valid_dl: + Dataloader for the validation dataset. + scaler: + The scaler used for mix precision training. + model_avg: + The stored model averaged from the start of training. + tb_writer: + Writer to write log messages to tensorboard. + world_size: + Number of nodes in DDP training. If it is 1, DDP is disabled. + rank: + The rank of the node in DDP training. If no DDP is used, it should + be set to 0. + """ + torch.cuda.empty_cache() + model.train() + + tot_loss = MetricsTracker() + + cur_batch_idx = params.get("cur_batch_idx", 0) + + for batch_idx, batch in enumerate(train_dl): + if batch_idx < cur_batch_idx: + continue + cur_batch_idx = batch_idx + + params.batch_idx_train += 1 + batch_size = batch["inputs"].shape[0] + + try: + with torch.cuda.amp.autocast(enabled=params.use_fp16): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=True, + ) + # summary stats + tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info + + # NOTE: We use reduction==sum and loss is computed over utterances + # in the batch and there is no normalization to it so far. + scaler.scale(loss).backward() + set_batch_count(model, params.batch_idx_train) + scheduler.step_batch(params.batch_idx_train) + + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + except: # noqa + display_and_save_batch(batch, params=params, sp=sp) + raise + + if params.print_diagnostics and batch_idx == 5: + return + + if ( + rank == 0 + and params.batch_idx_train > 0 + and params.batch_idx_train % params.average_period == 0 + ): + update_averaged_model( + params=params, + model_cur=model, + model_avg=model_avg, + ) + + if ( + params.batch_idx_train > 0 + and params.batch_idx_train % params.save_every_n == 0 + ): + params.cur_batch_idx = batch_idx + save_checkpoint_with_global_batch_idx( + out_dir=params.exp_dir, + global_batch_idx=params.batch_idx_train, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + del params.cur_batch_idx + remove_checkpoints( + out_dir=params.exp_dir, + topk=params.keep_last_k, + rank=rank, + ) + + if batch_idx % 100 == 0 and params.use_fp16: + # If the grad scale was less than 1, try increasing it. The _growth_interval + # of the grad scaler is configurable, but we can't configure it to have different + # behavior depending on the current grad scale. 
+ cur_grad_scale = scaler._scale.item() + if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0): + scaler.update(cur_grad_scale * 2.0) + if cur_grad_scale < 0.01: + logging.warning(f"Grad scale is small: {cur_grad_scale}") + if cur_grad_scale < 1.0e-05: + raise RuntimeError( + f"grad_scale is too small, exiting: {cur_grad_scale}" + ) + + if batch_idx % params.log_interval == 0: + cur_lr = scheduler.get_last_lr()[0] + cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0 + + logging.info( + f"Epoch {params.cur_epoch}, " + f"batch {batch_idx}, loss[{loss_info}], " + f"tot_loss[{tot_loss}], batch size: {batch_size}, " + f"lr: {cur_lr:.2e}, " + + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "") + ) + + if tb_writer is not None: + tb_writer.add_scalar( + "train/learning_rate", cur_lr, params.batch_idx_train + ) + + loss_info.write_summary( + tb_writer, "train/current_", params.batch_idx_train + ) + tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train) + if params.use_fp16: + tb_writer.add_scalar( + "train/grad_scale", cur_grad_scale, params.batch_idx_train + ) + + if batch_idx % params.valid_interval == 0 and not params.print_diagnostics: + logging.info("Computing validation loss") + valid_info = compute_validation_loss( + params=params, + model=model, + sp=sp, + valid_dl=valid_dl, + world_size=world_size, + ) + model.train() + logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}") + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) + if tb_writer is not None: + valid_info.write_summary( + tb_writer, "train/valid_", params.batch_idx_train + ) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + params.train_loss = loss_value + if params.train_loss < params.best_train_loss: + params.best_train_epoch = params.cur_epoch + params.best_train_loss = params.train_loss + + +def run(rank, world_size, args): + """ + Args: + rank: + It is a value between 0 and `world_size-1`, which is + passed automatically by `mp.spawn()` in :func:`main`. + The node with rank 0 is responsible for saving checkpoint. + world_size: + Number of GPUs for DDP training. 
+ args: + The return value of get_parser().parse_args() + """ + params = get_params() + params.update(vars(args)) + + fix_random_seed(params.seed) + if world_size > 1: + setup_dist(rank, world_size, params.master_port) + + setup_logger(f"{params.exp_dir}/log/log-train") + logging.info("Training started") + + if args.tensorboard and rank == 0: + tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard") + else: + tb_writer = None + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", rank) + logging.info(f"Device: {device}") + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # <blk> is defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("<blk>") + params.vocab_size = sp.get_piece_size() + + logging.info(params) + + logging.info("About to create model") + model = get_surt_model(params) + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + assert params.save_every_n >= params.average_period + model_avg: Optional[nn.Module] = None + if rank == 0: + # model_avg is only used with rank 0 + model_avg = copy.deepcopy(model) + + assert params.start_epoch > 0, params.start_epoch + checkpoints = load_checkpoint_if_available( + params=params, model=model, model_avg=model_avg + ) + + model.to(device) + + if checkpoints is None and params.model_init_ckpt is not None: + logging.info( + f"Initializing model with checkpoint from {params.model_init_ckpt}" + ) + init_ckpt = torch.load(params.model_init_ckpt, map_location=device) + model.load_state_dict(init_ckpt["model"], strict=True) + + if world_size > 1: + logging.info("Using DDP") + model = DDP(model, device_ids=[rank], find_unused_parameters=True) + + parameters_names = [] + parameters_names.append( + [name_param_pair[0] for name_param_pair in model.named_parameters()] + ) + optimizer = ScaledAdam( + model.parameters(), + lr=params.base_lr, + clipping_scale=2.0, + parameters_names=parameters_names, + ) + + scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs) + + if checkpoints and "optimizer" in checkpoints: + logging.info("Loading optimizer state dict") + optimizer.load_state_dict(checkpoints["optimizer"]) + + if ( + checkpoints + and "scheduler" in checkpoints + and checkpoints["scheduler"] is not None + ): + logging.info("Loading scheduler state dict") + scheduler.load_state_dict(checkpoints["scheduler"]) + + if params.print_diagnostics: + diagnostic = diagnostics.attach_diagnostics(model) + + libricss = LibriCssAsrDataModule(args) + + train_cuts_ihm = libricss.libricss_cuts(split="dev", type="ihm-mix") + train_cuts_sdm = libricss.libricss_cuts(split="dev", type="sdm") + train_cuts = train_cuts_ihm + train_cuts_sdm + + # This will create 2 copies of the sessions with different segmentation + train_cuts = train_cuts.trim_to_supervision_groups( + max_pause=0.1 + ) + train_cuts.trim_to_supervision_groups(max_pause=0.5) + dev_cuts = libricss.libricss_cuts(split="dev", type="sdm") + + if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: + # We only load the sampler's state dict when it loads a checkpoint + # saved in the middle of an epoch + sampler_state_dict = checkpoints["sampler"] + else: + sampler_state_dict = None + + train_dl = libricss.train_dataloaders( + train_cuts, + sampler_state_dict=sampler_state_dict, + return_sources=False, + strict=False, + ) + valid_dl = libricss.valid_dataloaders(dev_cuts) + + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) + if
checkpoints and "grad_scaler" in checkpoints: + logging.info("Loading grad scaler state dict") + scaler.load_state_dict(checkpoints["grad_scaler"]) + + for epoch in range(params.start_epoch, params.num_epochs + 1): + scheduler.step_epoch(epoch - 1) + fix_random_seed(params.seed + epoch - 1) + train_dl.sampler.set_epoch(epoch - 1) + + if tb_writer is not None: + tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train) + + params.cur_epoch = epoch + + train_one_epoch( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sp=sp, + train_dl=train_dl, + valid_dl=valid_dl, + scaler=scaler, + tb_writer=tb_writer, + world_size=world_size, + rank=rank, + ) + + if params.print_diagnostics: + diagnostic.print_diagnostics() + break + + save_checkpoint( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + + logging.info("Done!") + + if world_size > 1: + torch.distributed.barrier() + cleanup_dist() + + +def display_and_save_batch( + batch: dict, + params: AttributeDict, + sp: spm.SentencePieceProcessor, +) -> None: + """Display the batch statistics and save the batch into disk. + + Args: + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + params: + Parameters for training. See :func:`get_params`. + sp: + The BPE model. + """ + from lhotse.utils import uuid4 + + filename = f"{params.exp_dir}/batch-{uuid4()}.pt" + logging.info(f"Saving batch to {filename}") + torch.save(batch, filename) + + features = batch["inputs"] + + logging.info(f"features shape: {features.shape}") + + y = [sp.encode(text_ch) for text_ch in batch["text"]] + num_tokens = [sum(len(yi) for yi in y_ch) for y_ch in y] + logging.info(f"num tokens: {num_tokens}") + + +def main(): + parser = get_parser() + LibriCssAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + world_size = args.world_size + assert world_size >= 1 + if world_size > 1: + mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True) + else: + run(rank=0, world_size=1, args=args) + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch.multiprocessing.set_sharing_strategy("file_system") + +if __name__ == "__main__": + main() diff --git a/egs/libricss/SURT/dprnn_zipformer/zipformer.py b/egs/libricss/SURT/dprnn_zipformer/zipformer.py new file mode 120000 index 000000000..ec183baa7 --- /dev/null +++ b/egs/libricss/SURT/dprnn_zipformer/zipformer.py @@ -0,0 +1 @@ +../../../librispeech/ASR/pruned_transducer_stateless7_streaming/zipformer.py \ No newline at end of file diff --git a/egs/libricss/SURT/heat.png b/egs/libricss/SURT/heat.png new file mode 100644 index 000000000..ac7ecfff4 Binary files /dev/null and b/egs/libricss/SURT/heat.png differ diff --git a/egs/libricss/SURT/local/add_source_feats.py b/egs/libricss/SURT/local/add_source_feats.py new file mode 100755 index 000000000..c9775561f --- /dev/null +++ b/egs/libricss/SURT/local/add_source_feats.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# Copyright 2022 Johns Hopkins University (authors: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file adds source features as temporal arrays to the mixture manifests. +It looks for manifests in the directory data/manifests. +""" +import logging +from pathlib import Path + +import numpy as np +from lhotse import CutSet, LilcomChunkyWriter, load_manifest, load_manifest_lazy +from tqdm import tqdm + + +def add_source_feats(num_jobs=1): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + + for type_affix in ["full", "ov40"]: + logging.info(f"Adding source features for {type_affix}") + mixed_name_clean = f"train_clean_{type_affix}" + mixed_name_rvb = f"train_rvb_{type_affix}" + + logging.info("Reading mixed cuts") + mixed_cuts_clean = load_manifest_lazy( + src_dir / f"cuts_{mixed_name_clean}.jsonl.gz" + ) + mixed_cuts_rvb = load_manifest_lazy(src_dir / f"cuts_{mixed_name_rvb}.jsonl.gz") + + logging.info("Reading source cuts") + source_cuts = load_manifest(src_dir / "librispeech_cuts_train_trimmed.jsonl.gz") + + logging.info("Adding source features to the mixed cuts") + with tqdm() as pbar, CutSet.open_writer( + src_dir / f"cuts_{mixed_name_clean}_sources.jsonl.gz" + ) as cut_writer_clean, CutSet.open_writer( + src_dir / f"cuts_{mixed_name_rvb}_sources.jsonl.gz" + ) as cut_writer_rvb, LilcomChunkyWriter( + output_dir / f"feats_train_{type_affix}_sources" + ) as source_feat_writer: + for cut_clean, cut_rvb in zip(mixed_cuts_clean, mixed_cuts_rvb): + assert cut_rvb.id == cut_clean.id + "_rvb" + # Create source_feats and source_feat_offsets + # (See `lhotse.datasets.K2SurtDataset` for details) + source_feats = [] + source_feat_offsets = [] + cur_offset = 0 + for sup in sorted( + cut_clean.supervisions, key=lambda s: (s.start, s.speaker) + ): + source_cut = source_cuts[sup.id] + source_feats.append(source_cut.load_features()) + source_feat_offsets.append(cur_offset) + cur_offset += source_cut.num_frames + cut_clean.source_feats = source_feat_writer.store_array( + cut_clean.id, np.concatenate(source_feats, axis=0) + ) + cut_clean.source_feat_offsets = source_feat_offsets + cut_writer_clean.write(cut_clean) + cut_rvb.source_feats = cut_clean.source_feats + cut_rvb.source_feat_offsets = cut_clean.source_feat_offsets + cut_writer_rvb.write(cut_rvb) + pbar.update(1) + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + logging.basicConfig(format=formatter, level=logging.INFO) + add_source_feats() diff --git a/egs/libricss/SURT/local/compute_fbank_libricss.py b/egs/libricss/SURT/local/compute_fbank_libricss.py new file mode 100755 index 000000000..afd66899c --- /dev/null +++ b/egs/libricss/SURT/local/compute_fbank_libricss.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +# Copyright 2022 Johns Hopkins University (authors: Desh Raj) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file computes fbank features of the LibriCSS dataset. +It looks for manifests in the directory data/manifests. + +The generated fbank features are saved in data/fbank. +""" +import logging +from pathlib import Path + +import pyloudnorm as pyln +import torch +import torch.multiprocessing +from lhotse import LilcomChunkyWriter, load_manifest_lazy +from lhotse.features.kaldifeat import ( + KaldifeatFbank, + KaldifeatFbankConfig, + KaldifeatFrameOptions, + KaldifeatMelOptions, +) + +# Torch's multithreaded behavior needs to be disabled or +# it wastes a lot of CPU and slow things down. +# Do this outside of main() in case it needs to take effect +# even when we are not invoking the main (e.g. when spawning subprocesses). +torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch.multiprocessing.set_sharing_strategy("file_system") + + +def compute_fbank_libricss(): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + + sampling_rate = 16000 + num_mel_bins = 80 + + extractor = KaldifeatFbank( + KaldifeatFbankConfig( + frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate), + mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins), + device="cuda", + ) + ) + + logging.info("Reading manifests") + cuts_ihm_mix = load_manifest_lazy( + src_dir / "libricss-ihm-mix_segments_all.jsonl.gz" + ) + cuts_sdm = load_manifest_lazy(src_dir / "libricss-sdm_segments_all.jsonl.gz") + + for name, cuts in [("ihm-mix", cuts_ihm_mix), ("sdm", cuts_sdm)]: + dev_cuts = cuts.filter(lambda c: "session0" in c.id) + test_cuts = cuts.filter(lambda c: "session0" not in c.id) + + # If SDM cuts, apply loudness normalization + if name == "sdm": + dev_cuts = dev_cuts.normalize_loudness(target=-23.0) + test_cuts = test_cuts.normalize_loudness(target=-23.0) + + logging.info(f"Extracting fbank features for {name} dev cuts") + _ = dev_cuts.compute_and_store_features_batch( + extractor=extractor, + storage_path=output_dir / f"libricss-{name}_feats_dev", + manifest_path=src_dir / f"cuts_dev_libricss-{name}.jsonl.gz", + batch_duration=500, + num_workers=2, + storage_type=LilcomChunkyWriter, + overwrite=True, + ) + + logging.info(f"Extracting fbank features for {name} test cuts") + _ = test_cuts.compute_and_store_features_batch( + extractor=extractor, + storage_path=output_dir / f"libricss-{name}_feats_test", + manifest_path=src_dir / f"cuts_test_libricss-{name}.jsonl.gz", + batch_duration=2000, + num_workers=4, + storage_type=LilcomChunkyWriter, + overwrite=True, + ) + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + logging.basicConfig(format=formatter, level=logging.INFO) + + compute_fbank_libricss() diff --git a/egs/libricss/SURT/local/compute_fbank_librispeech.py b/egs/libricss/SURT/local/compute_fbank_librispeech.py new file mode 100755 index 000000000..5c8aece9c --- /dev/null +++ b/egs/libricss/SURT/local/compute_fbank_librispeech.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. 
(authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file computes fbank features of the LibriSpeech dataset. +It looks for manifests in the directory data/manifests. + +The generated fbank features are saved in data/fbank. +""" + +import logging +from pathlib import Path + +import torch +from lhotse import CutSet, LilcomChunkyWriter +from lhotse.features.kaldifeat import ( + KaldifeatFbank, + KaldifeatFbankConfig, + KaldifeatFrameOptions, + KaldifeatMelOptions, +) +from lhotse.recipes.utils import read_manifests_if_cached + +# Torch's multithreaded behavior needs to be disabled or +# it wastes a lot of CPU and slow things down. +# Do this outside of main() in case it needs to take effect +# even when we are not invoking the main (e.g. when spawning subprocesses). +torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch.multiprocessing.set_sharing_strategy("file_system") + + +def compute_fbank_librispeech(): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + num_mel_bins = 80 + + dataset_parts = ( + "train-clean-100", + "train-clean-360", + "train-other-500", + ) + prefix = "librispeech" + suffix = "jsonl.gz" + manifests = read_manifests_if_cached( + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, + ) + assert manifests is not None + + assert len(manifests) == len(dataset_parts), ( + len(manifests), + len(dataset_parts), + list(manifests.keys()), + dataset_parts, + ) + + extractor = KaldifeatFbank( + KaldifeatFbankConfig( + frame_opts=KaldifeatFrameOptions(sampling_rate=16000), + mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins), + device="cuda", + ) + ) + + for partition, m in manifests.items(): + cuts_filename = f"{prefix}_cuts_{partition}.{suffix}" + if (output_dir / cuts_filename).is_file(): + logging.info(f"{partition} already exists - skipping.") + continue + logging.info(f"Processing {partition}") + cut_set = CutSet.from_manifests( + recordings=m["recordings"], + supervisions=m["supervisions"], + ) + + cut_set = cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1) + + cut_set = cut_set.compute_and_store_features_batch( + extractor=extractor, + storage_path=f"{output_dir}/{prefix}_feats_{partition}", + manifest_path=f"{src_dir}/{cuts_filename}", + batch_duration=4000, + num_workers=2, + storage_type=LilcomChunkyWriter, + overwrite=True, + ) + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + + logging.basicConfig(format=formatter, level=logging.INFO) + compute_fbank_librispeech() diff --git a/egs/libricss/SURT/local/compute_fbank_lsmix.py b/egs/libricss/SURT/local/compute_fbank_lsmix.py new file mode 100755 index 000000000..da42f8ba1 --- /dev/null +++ b/egs/libricss/SURT/local/compute_fbank_lsmix.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 +# Copyright 2022 Johns Hopkins University (authors: Desh Raj) +# +# See ../../../../LICENSE for 
clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file computes fbank features of the synthetically mixed LibriSpeech +train and dev sets. +It looks for manifests in the directory data/manifests. + +The generated fbank features are saved in data/fbank. +""" +import logging +import random +import warnings +from pathlib import Path + +import torch +import torch.multiprocessing +from lhotse import LilcomChunkyWriter, load_manifest +from lhotse.cut import MixedCut, MixTrack, MultiCut +from lhotse.features.kaldifeat import ( + KaldifeatFbank, + KaldifeatFbankConfig, + KaldifeatFrameOptions, + KaldifeatMelOptions, +) +from lhotse.recipes.utils import read_manifests_if_cached +from lhotse.utils import fix_random_seed, uuid4 + +# Torch's multithreaded behavior needs to be disabled or +# it wastes a lot of CPU and slow things down. +# Do this outside of main() in case it needs to take effect +# even when we are not invoking the main (e.g. when spawning subprocesses). +torch.set_num_threads(1) +torch.set_num_interop_threads(1) +torch.multiprocessing.set_sharing_strategy("file_system") + + +def compute_fbank_lsmix(): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + + sampling_rate = 16000 + num_mel_bins = 80 + + extractor = KaldifeatFbank( + KaldifeatFbankConfig( + frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate), + mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins), + device="cuda", + ) + ) + + logging.info("Reading manifests") + manifests = read_manifests_if_cached( + dataset_parts=["train_clean_full", "train_clean_ov40"], + types=["cuts"], + output_dir=src_dir, + prefix="lsmix", + suffix="jsonl.gz", + lazy=True, + ) + + cs = {} + cs["clean_full"] = manifests["train_clean_full"]["cuts"] + cs["clean_ov40"] = manifests["train_clean_ov40"]["cuts"] + + # only uses RIRs and noises from REVERB challenge + real_rirs = load_manifest(src_dir / "real-rir_recordings_all.jsonl.gz").filter( + lambda r: "RVB2014" in r.id + ) + noises = load_manifest(src_dir / "iso-noise_recordings_all.jsonl.gz").filter( + lambda r: "RVB2014" in r.id + ) + + # Apply perturbation to the training cuts + logging.info("Applying perturbation to the training cuts") + cs["rvb_full"] = cs["clean_full"].map( + lambda c: augment( + c, perturb_snr=True, rirs=real_rirs, noises=noises, perturb_loudness=True + ) + ) + cs["rvb_ov40"] = cs["clean_ov40"].map( + lambda c: augment( + c, perturb_snr=True, rirs=real_rirs, noises=noises, perturb_loudness=True + ) + ) + + for type_affix in ["full", "ov40"]: + for rvb_affix in ["clean", "rvb"]: + logging.info( + f"Extracting fbank features for {type_affix} {rvb_affix} training cuts" + ) + cuts = cs[f"{rvb_affix}_{type_affix}"] + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + _ = cuts.compute_and_store_features_batch( + extractor=extractor, + storage_path=output_dir + / f"lsmix_feats_train_{rvb_affix}_{type_affix}", + manifest_path=src_dir + / f"cuts_train_{rvb_affix}_{type_affix}.jsonl.gz", + 
batch_duration=5000, + num_workers=4, + storage_type=LilcomChunkyWriter, + overwrite=True, + ) + + +def augment(cut, perturb_snr=False, rirs=None, noises=None, perturb_loudness=False): + """ + Given a mixed cut, this function optionally applies the following augmentations: + - Perturbing the SNRs of the tracks (in range [-5, 5] dB) + - Reverberation using a randomly selected RIR + - Adding noise + - Perturbing the loudness (in range [-20, -25] dB) + """ + out_cut = cut.drop_features() + + # Perturb the SNRs (optional) + if perturb_snr: + snrs = [random.uniform(-5, 5) for _ in range(len(cut.tracks))] + for i, (track, snr) in enumerate(zip(out_cut.tracks, snrs)): + if i == 0: + # Skip the first track since it is the reference + continue + track.snr = snr + + # Reverberate the cut (optional) + if rirs is not None: + # Select an RIR at random + rir = random.choice(rirs) + # Select a channel at random + rir_channel = random.choice(list(range(rir.num_channels))) + # Reverberate the cut + out_cut = out_cut.reverb_rir(rir_recording=rir, rir_channels=[rir_channel]) + + # Add noise (optional) + if noises is not None: + # Select a noise recording at random + noise = random.choice(noises).to_cut() + if isinstance(noise, MultiCut): + noise = noise.to_mono()[0] + # Select an SNR at random + snr = random.uniform(10, 30) + # Repeat the noise to match the duration of the cut + noise = repeat_cut(noise, out_cut.duration) + out_cut = MixedCut( + id=out_cut.id, + tracks=[ + MixTrack(cut=out_cut, type="MixedCut"), + MixTrack(cut=noise, type="DataCut", snr=snr), + ], + ) + + # Perturb the loudness (optional) + if perturb_loudness: + target_loudness = random.uniform(-20, -25) + out_cut = out_cut.normalize_loudness(target_loudness, mix_first=True) + return out_cut + + +def repeat_cut(cut, duration): + while cut.duration < duration: + cut = cut.mix(cut, offset_other_by=cut.duration) + return cut.truncate(duration=duration) + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + logging.basicConfig(format=formatter, level=logging.INFO) + + fix_random_seed(42) + compute_fbank_lsmix() diff --git a/egs/libricss/SURT/local/compute_fbank_musan.py b/egs/libricss/SURT/local/compute_fbank_musan.py new file mode 100755 index 000000000..1fcf951f9 --- /dev/null +++ b/egs/libricss/SURT/local/compute_fbank_musan.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file computes fbank features of the musan dataset. +It looks for manifests in the directory data/manifests. + +The generated fbank features are saved in data/fbank. 
+""" + +import logging +from pathlib import Path + +import torch +from lhotse import CutSet, LilcomChunkyWriter, combine +from lhotse.features.kaldifeat import ( + KaldifeatFbank, + KaldifeatFbankConfig, + KaldifeatFrameOptions, + KaldifeatMelOptions, +) +from lhotse.recipes.utils import read_manifests_if_cached + +# Torch's multithreaded behavior needs to be disabled or +# it wastes a lot of CPU and slow things down. +# Do this outside of main() in case it needs to take effect +# even when we are not invoking the main (e.g. when spawning subprocesses). +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + + +def compute_fbank_musan(): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + + sampling_rate = 16000 + num_mel_bins = 80 + + dataset_parts = ( + "music", + "speech", + "noise", + ) + prefix = "musan" + suffix = "jsonl.gz" + manifests = read_manifests_if_cached( + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, + ) + assert manifests is not None + + assert len(manifests) == len(dataset_parts), ( + len(manifests), + len(dataset_parts), + list(manifests.keys()), + dataset_parts, + ) + + musan_cuts_path = src_dir / "musan_cuts.jsonl.gz" + + if musan_cuts_path.is_file(): + logging.info(f"{musan_cuts_path} already exists - skipping") + return + + logging.info("Extracting features for Musan") + + extractor = KaldifeatFbank( + KaldifeatFbankConfig( + frame_opts=KaldifeatFrameOptions(sampling_rate=sampling_rate), + mel_opts=KaldifeatMelOptions(num_bins=num_mel_bins), + device="cuda", + ) + ) + + # create chunks of Musan with duration 5 - 10 seconds + _ = ( + CutSet.from_manifests( + recordings=combine(part["recordings"] for part in manifests.values()) + ) + .cut_into_windows(10.0) + .filter(lambda c: c.duration > 5) + .compute_and_store_features_batch( + extractor=extractor, + storage_path=output_dir / "musan_feats", + manifest_path=musan_cuts_path, + batch_duration=500, + num_workers=4, + storage_type=LilcomChunkyWriter, + ) + ) + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + + logging.basicConfig(format=formatter, level=logging.INFO) + compute_fbank_musan() diff --git a/egs/libricss/SURT/prepare.sh b/egs/libricss/SURT/prepare.sh new file mode 100755 index 000000000..028240e44 --- /dev/null +++ b/egs/libricss/SURT/prepare.sh @@ -0,0 +1,204 @@ +#!/usr/bin/env bash + +set -eou pipefail + +stage=-1 +stop_stage=100 + +# We assume dl_dir (download dir) contains the following +# directories and files. If not, they will be downloaded +# by this script automatically. +# +# - $dl_dir/librispeech +# You can find audio and transcripts for LibriSpeech in this path. +# +# - $dl_dir/libricss +# You can find audio and transcripts for LibriCSS in this path. +# +# - $dl_dir/musan +# This directory contains the following directories downloaded from +# http://www.openslr.org/17/ +# +# - music +# - noise +# - speech +# +# - $dl_dir/rirs_noises +# This directory contains the RIRS_NOISES corpus downloaded from https://openslr.org/28/. +# +dl_dir=$PWD/download + +. shared/parse_options.sh || exit 1 + +# All files generated by this script are saved in "data". +# You can safely remove "data" and rerun this script to regenerate it. 
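+#
+# The stage/stop_stage variables below can be overridden from the command line
+# (the sourced shared/parse_options.sh handles the parsing), so a subset of the
+# pipeline can be run on its own, for example:
+#
+#   ./prepare.sh --stage 4 --stop-stage 7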
+mkdir -p data +vocab_size=500 + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "dl_dir: $dl_dir" + +if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then + log "Stage 0: Download data" + + # If you have pre-downloaded it to /path/to/librispeech, + # you can create a symlink + # + # ln -sfv /path/to/librispeech $dl_dir/librispeech + # + if [ ! -d $dl_dir/librispeech ]; then + lhotse download librispeech $dl_dir/librispeech + fi + + # If you have pre-downloaded it to /path/to/libricss, + # you can create a symlink + # + # ln -sfv /path/to/libricss $dl_dir/libricss + # + if [ ! -d $dl_dir/libricss ]; then + lhotse download libricss $dl_dir/libricss + fi + + # If you have pre-downloaded it to /path/to/musan, + # you can create a symlink + # + # ln -sfv /path/to/musan $dl_dir/ + # + if [ ! -d $dl_dir/musan ]; then + lhotse download musan $dl_dir + fi + + # If you have pre-downloaded it to /path/to/rirs_noises, + # you can create a symlink + # + # ln -sfv /path/to/rirs_noises $dl_dir/ + # + if [ ! -d $dl_dir/rirs_noises ]; then + lhotse download rirs_noises $dl_dir + fi +fi + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then + log "Stage 1: Prepare LibriSpeech manifests" + # We assume that you have downloaded the LibriSpeech corpus + # to $dl_dir/librispeech. We perform text normalization for the transcripts. + # NOTE: Alignments are required for this recipe. + mkdir -p data/manifests + lhotse prepare librispeech -p train-clean-100 -p train-clean-360 -p train-other-500 -p dev-clean \ + -j 4 --alignments-dir $dl_dir/libri_alignments/LibriSpeech $dl_dir/librispeech data/manifests/ +fi + +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then + log "Stage 2: Prepare LibriCSS manifests" + # We assume that you have downloaded the LibriCSS corpus + # to $dl_dir/libricss. We perform text normalization for the transcripts. + mkdir -p data/manifests + for mic in sdm ihm-mix; do + lhotse prepare libricss --type $mic --segmented $dl_dir/libricss data/manifests/ + done +fi + +if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then + log "Stage 3: Prepare musan manifest and RIRs" + # We assume that you have downloaded the musan corpus + # to $dl_dir/musan + mkdir -p data/manifests + lhotse prepare musan $dl_dir/musan data/manifests + + # We assume that you have downloaded the RIRS_NOISES corpus + # to $dl_dir/rirs_noises + lhotse prepare rir-noise -p real_rir -p iso_noise $dl_dir/rirs_noises data/manifests +fi + +if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then + log "Stage 4: Extract features for LibriSpeech, trim to alignments, and shuffle the cuts" + python local/compute_fbank_librispeech.py + lhotse combine data/manifests/librispeech_cuts_train* - |\ + lhotse cut trim-to-alignments --type word --max-pause 0.2 - - |\ + shuf | gzip -c > data/manifests/librispeech_cuts_train_trimmed.jsonl.gz +fi + +if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then + log "Stage 5: Create simulated mixtures from LibriSpeech (train and dev). This may take a while." + # We create a high overlap set which will be used during the model warmup phase, and a + # full training set that will be used for the subsequent training. 
+ + gunzip -c data/manifests/libricss-sdm_supervisions_all.jsonl.gz |\ + grep -v "0L" | grep -v "OV10" |\ + gzip -c > data/manifests/libricss-sdm_supervisions_all_v1.jsonl.gz + + gunzip -c data/manifests/libricss-sdm_supervisions_all.jsonl.gz |\ + grep "OV40" |\ + gzip -c > data/manifests/libricss-sdm_supervisions_ov40.jsonl.gz + + # Warmup mixtures (100k) based on high overlap (OV40) + log "Generating 100k anechoic train mixtures for warmup" + lhotse workflows simulate-meetings \ + --method conversational \ + --fit-to-supervisions data/manifests/libricss-sdm_supervisions_ov40.jsonl.gz \ + --num-meetings 100000 \ + --num-speakers-per-meeting 2,3 \ + --max-duration-per-speaker 15.0 \ + --max-utterances-per-speaker 3 \ + --seed 1234 \ + --num-jobs 4 \ + data/manifests/librispeech_cuts_train_trimmed.jsonl.gz \ + data/manifests/lsmix_cuts_train_clean_ov40.jsonl.gz + + # Full training set (2,3 speakers) anechoic + log "Generating anechoic ${part} set (full)" + lhotse workflows simulate-meetings \ + --method conversational \ + --fit-to-supervisions data/manifests/libricss-sdm_supervisions_all_v1.jsonl.gz \ + --num-repeats 1 \ + --num-speakers-per-meeting 2,3 \ + --max-duration-per-speaker 15.0 \ + --max-utterances-per-speaker 3 \ + --seed 1234 \ + --num-jobs 4 \ + data/manifests/librispeech_cuts_train_trimmed.jsonl.gz \ + data/manifests/lsmix_cuts_train_clean_full.jsonl.gz +fi + +if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then + log "Stage 6: Compute fbank features for musan" + mkdir -p data/fbank + python local/compute_fbank_musan.py +fi + +if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then + log "Stage 7: Compute fbank features for simulated Libri-mix" + mkdir -p data/fbank + python local/compute_fbank_lsmix.py +fi + +if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then + log "Stage 8: Add source feats to mixtures (useful for auxiliary tasks)" + python local/add_source_feats.py + + log "Combining lsmix-clean and lsmix-rvb" + for type in full ov40; do + cat <(gunzip -c data/manifests/cuts_train_clean_${type}_sources.jsonl.gz) \ + <(gunzip -c data/manifests/cuts_train_rvb_${type}_sources.jsonl.gz) |\ + shuf | gzip -c > data/manifests/cuts_train_comb_${type}_sources.jsonl.gz + done +fi + +if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then + log "Stage 9: Compute fbank features for LibriCSS" + mkdir -p data/fbank + python local/compute_fbank_libricss.py +fi + +if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then + log "Stage 10: Download LibriSpeech BPE model from HuggingFace." 
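Before the Stage 10 commands, one remark on the Stage 8 pipeline above: it simply concatenates the clean and reverberant mixture manifests, shuffles the lines, and re-compresses them. A rough Python equivalent is sketched below; it only mirrors what the shell pipeline does and is not code from the recipe.

# Sketch of: cat <(gunzip -c a.jsonl.gz) <(gunzip -c b.jsonl.gz) | shuf | gzip -c > out.jsonl.gz
import gzip
import random


def combine_and_shuffle(in_paths, out_path, seed=None):
    lines = []
    for p in in_paths:
        with gzip.open(p, "rt") as f:
            lines.extend(f)  # one JSON object per line in a lhotse manifest
    random.Random(seed).shuffle(lines)
    with gzip.open(out_path, "wt") as f:
        f.writelines(lines)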
+ mkdir -p data/lang_bpe_500 + pushd data/lang_bpe_500 + wget https://huggingface.co/Zengwei/icefall-asr-librispeech-pruned-transducer-stateless7-streaming-2022-12-29/resolve/main/data/lang_bpe_500/bpe.model + popd +fi diff --git a/egs/libricss/SURT/shared b/egs/libricss/SURT/shared new file mode 120000 index 000000000..4cbd91a7e --- /dev/null +++ b/egs/libricss/SURT/shared @@ -0,0 +1 @@ +../../../icefall/shared \ No newline at end of file diff --git a/egs/libricss/SURT/surt.png b/egs/libricss/SURT/surt.png new file mode 100644 index 000000000..fcc8119d4 Binary files /dev/null and b/egs/libricss/SURT/surt.png differ diff --git a/egs/librispeech/ASR/conformer_ctc2/train.py b/egs/librispeech/ASR/conformer_ctc2/train.py index 3366af13e..c4a13b101 100755 --- a/egs/librispeech/ASR/conformer_ctc2/train.py +++ b/egs/librispeech/ASR/conformer_ctc2/train.py @@ -675,7 +675,6 @@ def train_one_epoch( for batch_idx, batch in enumerate(train_dl): params.batch_idx_train += 1 batch_size = len(batch["supervisions"]["text"]) - batch_name = batch["supervisions"]["uttid"] with torch.cuda.amp.autocast(enabled=params.use_fp16): loss, loss_info = compute_loss( @@ -698,10 +697,7 @@ def train_one_epoch( scaler.scale(loss).backward() except RuntimeError as e: if "CUDA out of memory" in str(e): - logging.error( - f"failing batch size:{batch_size} " - f"failing batch names {batch_name}" - ) + logging.error(f"failing batch size:{batch_size} ") raise scheduler.step_batch(params.batch_idx_train) @@ -756,10 +752,7 @@ def train_one_epoch( if loss_info["ctc_loss"] == float("inf") or loss_info["att_loss"] == float( "inf" ): - logging.error( - "Your loss contains inf, something goes wrong" - f"failing batch names {batch_name}" - ) + logging.error("Your loss contains inf, something goes wrong") if tb_writer is not None: tb_writer.add_scalar( "train/learning_rate", cur_lr, params.batch_idx_train diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py index 1bbad6946..fd59d4b7f 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py @@ -22,6 +22,7 @@ from typing import Dict, List, Optional, Tuple, Union import k2 import sentencepiece as spm import torch +from torch import nn from icefall import ContextGraph, ContextState, NgramLm, NgramLmStateCost from icefall.decode import Nbest, one_best_decoding @@ -35,7 +36,6 @@ from icefall.utils import ( get_texts, get_texts_with_timestamp, ) -from torch import nn def fast_beam_search_one_best( @@ -47,9 +47,10 @@ def fast_beam_search_one_best( max_states: int, max_contexts: int, temperature: float = 1.0, - subtract_ilme: bool = False, - ilme_scale: float = 0.1, + ilme_scale: float = 0.0, + blank_penalty: float = 0.0, return_timestamps: bool = False, + allow_partial: bool = False, ) -> Union[List[List[int]], DecodingResults]: """It limits the maximum number of symbols per frame to 1. 
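All of the blank_penalty additions in this file follow a single pattern: subtract a constant from the blank logit (index 0) before the log-softmax, which makes blank less probable and therefore encourages the decoder to emit more symbols. A minimal self-contained sketch of that pattern, using toy shapes rather than the real decoding state:

import torch


def penalized_log_probs(logits: torch.Tensor, blank_penalty: float, temperature: float = 1.0):
    # logits: (num_hyps, vocab_size); index 0 is the blank symbol
    if blank_penalty != 0:
        logits = logits.clone()
        logits[:, 0] -= blank_penalty
    return (logits / temperature).log_softmax(dim=-1)


logits = torch.randn(4, 500)
plain = penalized_log_probs(logits, blank_penalty=0.0)
penalized = penalized_log_probs(logits, blank_penalty=2.0)
assert torch.all(penalized[:, 0] < plain[:, 0])  # blank is strictly less likely now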
@@ -90,8 +91,9 @@ def fast_beam_search_one_best( max_states=max_states, max_contexts=max_contexts, temperature=temperature, - subtract_ilme=subtract_ilme, ilme_scale=ilme_scale, + allow_partial=allow_partial, + blank_penalty=blank_penalty, ) best_path = one_best_decoding(lattice) @@ -114,7 +116,10 @@ def fast_beam_search_nbest_LG( nbest_scale: float = 0.5, use_double_scores: bool = True, temperature: float = 1.0, + blank_penalty: float = 0.0, + ilme_scale: float = 0.0, return_timestamps: bool = False, + allow_partial: bool = False, ) -> Union[List[List[int]], DecodingResults]: """It limits the maximum number of symbols per frame to 1. @@ -168,6 +173,9 @@ def fast_beam_search_nbest_LG( max_states=max_states, max_contexts=max_contexts, temperature=temperature, + allow_partial=allow_partial, + blank_penalty=blank_penalty, + ilme_scale=ilme_scale, ) nbest = Nbest.from_lattice( @@ -240,7 +248,9 @@ def fast_beam_search_nbest( nbest_scale: float = 0.5, use_double_scores: bool = True, temperature: float = 1.0, + blank_penalty: float = 0.0, return_timestamps: bool = False, + allow_partial: bool = False, ) -> Union[List[List[int]], DecodingResults]: """It limits the maximum number of symbols per frame to 1. @@ -293,7 +303,9 @@ def fast_beam_search_nbest( beam=beam, max_states=max_states, max_contexts=max_contexts, + blank_penalty=blank_penalty, temperature=temperature, + allow_partial=allow_partial, ) nbest = Nbest.from_lattice( @@ -331,7 +343,9 @@ def fast_beam_search_nbest_oracle( use_double_scores: bool = True, nbest_scale: float = 0.5, temperature: float = 1.0, + blank_penalty: float = 0.0, return_timestamps: bool = False, + allow_partial: bool = False, ) -> Union[List[List[int]], DecodingResults]: """It limits the maximum number of symbols per frame to 1. @@ -389,6 +403,8 @@ def fast_beam_search_nbest_oracle( max_states=max_states, max_contexts=max_contexts, temperature=temperature, + allow_partial=allow_partial, + blank_penalty=blank_penalty, ) nbest = Nbest.from_lattice( @@ -434,6 +450,8 @@ def fast_beam_search( temperature: float = 1.0, subtract_ilme: bool = False, ilme_scale: float = 0.1, + allow_partial: bool = False, + blank_penalty: float = 0.0, ) -> k2.Fsa: """It limits the maximum number of symbols per frame to 1. @@ -503,8 +521,13 @@ def fast_beam_search( project_input=False, ) logits = logits.squeeze(1).squeeze(1) + + if blank_penalty != 0: + logits[:, 0] -= blank_penalty + log_probs = (logits / temperature).log_softmax(dim=-1) - if subtract_ilme: + + if ilme_scale != 0: ilme_logits = model.joiner( torch.zeros_like( current_encoder_out, device=current_encoder_out.device @@ -513,11 +536,16 @@ def fast_beam_search( project_input=False, ) ilme_logits = ilme_logits.squeeze(1).squeeze(1) + if blank_penalty != 0: + ilme_logits[:, 0] -= blank_penalty ilme_log_probs = (ilme_logits / temperature).log_softmax(dim=-1) log_probs -= ilme_scale * ilme_log_probs + decoding_streams.advance(log_probs) decoding_streams.terminate_and_flush_to_streams() - lattice = decoding_streams.format_output(encoder_out_lens.tolist()) + lattice = decoding_streams.format_output( + encoder_out_lens.tolist(), allow_partial=allow_partial + ) return lattice @@ -526,6 +554,7 @@ def greedy_search( model: nn.Module, encoder_out: torch.Tensor, max_sym_per_frame: int, + blank_penalty: float = 0.0, return_timestamps: bool = False, ) -> Union[List[int], DecodingResults]: """Greedy search for a single utterance. 
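The ilme_scale handling above queries the joiner a second time with a zeroed encoder output, treats the result as an estimate of the model's internal LM, and subtracts a scaled copy of it from the log-probabilities. Below is a sketch of that step, assuming a joiner callable with the (encoder_out, decoder_out, project_input) signature used in this file; the toy joiner at the end is only for shape checking.

import torch


def ilme_adjusted_log_probs(joiner, encoder_out, decoder_out, ilme_scale=0.0, temperature=1.0):
    logits = joiner(encoder_out, decoder_out, project_input=False)
    log_probs = (logits / temperature).log_softmax(dim=-1)
    if ilme_scale != 0:
        # Zeroing the acoustic input approximates the internal language model.
        ilme_logits = joiner(torch.zeros_like(encoder_out), decoder_out, project_input=False)
        ilme_log_probs = (ilme_logits / temperature).log_softmax(dim=-1)
        log_probs = log_probs - ilme_scale * ilme_log_probs
    return log_probs


toy_joiner = lambda enc, dec, project_input: enc + dec  # stand-in for model.joiner
out = ilme_adjusted_log_probs(toy_joiner, torch.randn(3, 500), torch.randn(3, 500), ilme_scale=0.1)
assert out.shape == (3, 500)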
@@ -595,6 +624,9 @@ def greedy_search( ) # logits is (1, 1, 1, vocab_size) + if blank_penalty != 0: + logits[:, :, :, 0] -= blank_penalty + y = logits.argmax().item() if y not in (blank_id, unk_id): hyp.append(y) @@ -626,6 +658,7 @@ def greedy_search_batch( model: nn.Module, encoder_out: torch.Tensor, encoder_out_lens: torch.Tensor, + blank_penalty: float = 0, return_timestamps: bool = False, ) -> Union[List[List[int]], DecodingResults]: """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1. @@ -703,6 +736,10 @@ def greedy_search_batch( logits = logits.squeeze(1).squeeze(1) # (batch_size, vocab_size) assert logits.ndim == 2, logits.shape + + if blank_penalty != 0: + logits[:, 0] -= blank_penalty + y = logits.argmax(dim=1).tolist() emitted = False for i, v in enumerate(y): @@ -923,6 +960,7 @@ def modified_beam_search( context_graph: Optional[ContextGraph] = None, beam: int = 4, temperature: float = 1.0, + blank_penalty: float = 0.0, return_timestamps: bool = False, ) -> Union[List[List[int]], DecodingResults]: """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded. @@ -1028,6 +1066,9 @@ def modified_beam_search( logits = logits.squeeze(1).squeeze(1) # (num_hyps, vocab_size) + if blank_penalty != 0: + logits[:, 0] -= blank_penalty + log_probs = (logits / temperature).log_softmax(dim=-1) # (num_hyps, vocab_size) log_probs.add_(ys_log_probs) @@ -1662,6 +1703,7 @@ def beam_search( encoder_out: torch.Tensor, beam: int = 4, temperature: float = 1.0, + blank_penalty: float = 0.0, return_timestamps: bool = False, ) -> Union[List[int], DecodingResults]: """ @@ -1758,6 +1800,9 @@ def beam_search( project_input=False, ) + if blank_penalty != 0: + logits[:, :, :, 0] -= blank_penalty + # TODO(fangjun): Scale the blank posterior log_prob = (logits / temperature).log_softmax(dim=-1) # log_prob is (1, 1, 1, vocab_size) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py index 9bac46004..bcd419fb7 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/conformer.py @@ -849,6 +849,8 @@ class RelPositionalEncoding(torch.nn.Module): torch.Tensor: Encoded tensor (batch, 2*time-1, `*`). 
""" + if isinstance(left_context, torch.Tensor): + left_context = left_context.item() self.extend_pe(x, left_context) x_size_1 = x.size(1) + left_context pos_emb = self.pe[ diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/test_onnx.py b/egs/librispeech/ASR/pruned_transducer_stateless3/test_onnx.py index 598fcf344..810da8da6 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/test_onnx.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/test_onnx.py @@ -113,7 +113,7 @@ def test_rel_pos(): torch.onnx.export( encoder_pos, - x, + (x, torch.zeros(1, dtype=torch.int64)), filename, verbose=False, opset_version=opset_version, @@ -139,7 +139,9 @@ def test_rel_pos(): assert input_nodes[0].name == "x" assert input_nodes[0].shape == ["N", "T", num_features] - inputs = {input_nodes[0].name: x.numpy()} + inputs = { + input_nodes[0].name: x.numpy(), + } onnx_y, onnx_pos_emb = session.run(["y", "pos_emb"], inputs) onnx_y = torch.from_numpy(onnx_y) onnx_pos_emb = torch.from_numpy(onnx_pos_emb) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/test_onnx.py b/egs/librispeech/ASR/pruned_transducer_stateless7/test_onnx.py index 2440d267c..1e9b67226 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/test_onnx.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/test_onnx.py @@ -265,7 +265,7 @@ def test_zipformer_encoder(): torch.onnx.export( encoder, - (x), + (x, torch.ones(1, dtype=torch.float32)), filename, verbose=False, opset_version=opset_version, @@ -289,6 +289,7 @@ def test_zipformer_encoder(): input_nodes = session.get_inputs() inputs = { input_nodes[0].name: x.numpy(), + input_nodes[1].name: torch.ones(1, dtype=torch.float32).numpy(), } onnx_y = session.run(["y"], inputs)[0] onnx_y = torch.from_numpy(onnx_y) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py index 0841f7cf1..c44cb1eaf 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc_bs/frame_reducer.py @@ -81,20 +81,20 @@ class FrameReducer(nn.Module): fake_limit_indexes = torch.topk( ctc_output[:, :, blank_id], max_limit_len ).indices - T = ( + T_arange = ( torch.arange(max_limit_len) .expand_as( fake_limit_indexes, ) .to(device=x.device) ) - T = torch.remainder(T, limit_lens.unsqueeze(1)) - limit_indexes = torch.gather(fake_limit_indexes, 1, T) + T_arange = torch.remainder(T_arange, limit_lens.unsqueeze(1)) + limit_indexes = torch.gather(fake_limit_indexes, 1, T_arange) limit_mask = torch.full_like( non_blank_mask, - False, + 0, device=x.device, - ).scatter_(1, limit_indexes, True) + ).scatter_(1, limit_indexes, 1) non_blank_mask = non_blank_mask | ~limit_mask @@ -108,9 +108,9 @@ class FrameReducer(nn.Module): ) - out_lens ) - max_pad_len = pad_lens_list.max() + max_pad_len = int(pad_lens_list.max()) - out = F.pad(x, (0, 0, 0, max_pad_len)) + out = F.pad(x, [0, 0, 0, max_pad_len]) valid_pad_mask = ~make_pad_mask(pad_lens_list) total_valid_mask = torch.concat([non_blank_mask, valid_pad_mask], dim=1) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/export.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/export.py index 5735ee692..c191b5bcc 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/export.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/export.py @@ -856,6 +856,10 @@ def main(): # Otherwise, one of its 
arguments is a ragged tensor and is not # torch scriptabe. model.__class__.forward = torch.jit.ignore(model.__class__.forward) + model.encoder.__class__.non_streaming_forward = model.encoder.__class__.forward + model.encoder.__class__.non_streaming_forward = torch.jit.export( + model.encoder.__class__.non_streaming_forward + ) model.encoder.__class__.forward = model.encoder.__class__.streaming_forward logging.info("Using torch.jit.script") model = torch.jit.script(model) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/jit_pretrained.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/jit_pretrained.py index 4fd5e1820..c8301b2da 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/jit_pretrained.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/jit_pretrained.py @@ -252,7 +252,7 @@ def main(): feature_lengths = torch.tensor(feature_lengths, device=device) - encoder_out, encoder_out_lens = model.encoder( + encoder_out, encoder_out_lens = model.encoder.non_streaming_forward( x=features, x_lens=feature_lengths, ) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/streaming_decode.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/streaming_decode.py index b76272e66..a0f54b6e1 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/streaming_decode.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/streaming_decode.py @@ -22,7 +22,7 @@ Usage: --avg 15 \ --decode-chunk-len 32 \ --exp-dir ./pruned_transducer_stateless7_streaming/exp \ - --decoding_method greedy_search \ + --decoding-method greedy_search \ --num-decode-streams 2000 """ diff --git a/egs/librispeech/ASR/zipformer/.gitignore b/egs/librispeech/ASR/zipformer/.gitignore new file mode 100644 index 000000000..e47ac1582 --- /dev/null +++ b/egs/librispeech/ASR/zipformer/.gitignore @@ -0,0 +1 @@ +swoosh.pdf diff --git a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py index 8cec09869..3eb06f68c 100755 --- a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py +++ b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# Copyright 2023 Xiaomi Corporation (Author: Fangjun Kuang) +# Copyright 2023 Xiaomi Corporation (Author: Fangjun Kuang, Wei Kang) # Copyright 2023 Danqing Fu (danqing.fu@gmail.com) """ @@ -19,7 +19,7 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url repo=$(basename $repo_url) pushd $repo -git lfs pull --include "data/lang_bpe_500/bpe.model" +git lfs pull --include "data/lang_bpe_500/tokens.txt" git lfs pull --include "exp/pretrained.pt" cd exp @@ -29,7 +29,7 @@ popd 2. Export the model to ONNX ./zipformer/export-onnx-streaming.py \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ --use-averaged-model 0 \ --epoch 99 \ --avg 1 \ @@ -57,9 +57,9 @@ whose value is "64,128,256,-1". It will generate the following 3 files inside $repo/exp: - - encoder-epoch-99-avg-1.onnx - - decoder-epoch-99-avg-1.onnx - - joiner-epoch-99-avg-1.onnx + - encoder-epoch-99-avg-1-chunk-16-left-64.onnx + - decoder-epoch-99-avg-1-chunk-16-left-64.onnx + - joiner-epoch-99-avg-1-chunk-16-left-64.onnx See ./onnx_pretrained-streaming.py for how to use the exported ONNX models. 
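About the export.py change above that adds non_streaming_forward: the offline forward is first saved under a new name and marked with torch.jit.export so that scripting keeps it, then forward itself is rebound to streaming_forward; jit_pretrained.py can then call the offline path explicitly. A toy sketch of the same trick follows; the class below is a stand-in, not the Zipformer encoder, and its behaviour is assumed to match how the patch uses the pattern.

import torch
import torch.nn as nn


class ToyEncoder(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:            # offline path
        return x * 2

    def streaming_forward(self, x: torch.Tensor) -> torch.Tensor:  # chunked path
        return x + 1


# Keep the offline forward reachable after scripting, make streaming the default.
ToyEncoder.non_streaming_forward = torch.jit.export(ToyEncoder.forward)
ToyEncoder.forward = ToyEncoder.streaming_forward

scripted = torch.jit.script(ToyEncoder())
x = torch.ones(3)
assert torch.equal(scripted(x), x + 1)                        # streaming by default
assert torch.equal(scripted.non_streaming_forward(x), x * 2)  # offline still available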
""" @@ -69,14 +69,15 @@ import logging from pathlib import Path from typing import Dict, List, Tuple +import k2 import onnx -import sentencepiece as spm import torch import torch.nn as nn from decoder import Decoder +from export import num_tokens from onnxruntime.quantization import QuantType, quantize_dynamic from scaling_converter import convert_scaled_to_non_scaled -from train import add_model_arguments, get_params, get_model +from train import add_model_arguments, get_model, get_params from zipformer import Zipformer2 from icefall.checkpoint import ( @@ -85,7 +86,7 @@ from icefall.checkpoint import ( find_checkpoints, load_checkpoint, ) -from icefall.utils import str2bool, make_pad_mask +from icefall.utils import str2bool def get_parser(): @@ -142,10 +143,10 @@ def get_parser(): ) parser.add_argument( - "--bpe-model", + "--tokens", type=str, - default="data/lang_bpe_500/bpe.model", - help="Path to the BPE model", + default="data/lang_bpe_500/tokens.txt", + help="Path to the tokens.txt", ) parser.add_argument( @@ -217,7 +218,7 @@ class OnnxEncoder(nn.Module): ) assert x.size(1) == self.chunk_size, (x.size(1), self.chunk_size) - src_key_padding_mask = make_pad_mask(x_lens) + src_key_padding_mask = torch.zeros(N, self.chunk_size, dtype=torch.bool) # processed_mask is used to mask out initial states processed_mask = torch.arange(left_context_len, device=x.device).expand( @@ -271,6 +272,7 @@ class OnnxEncoder(nn.Module): states = self.encoder.get_init_states(batch_size, device) embed_states = self.encoder_embed.get_init_states(batch_size, device) + states.append(embed_states) processed_lens = torch.zeros(batch_size, dtype=torch.int64, device=device) @@ -585,12 +587,9 @@ def main(): logging.info(f"device: {device}") - sp = spm.SentencePieceProcessor() - sp.load(params.bpe_model) - - # is defined in local/train_bpe_model.py - params.blank_id = sp.piece_to_id("") - params.vocab_size = sp.get_piece_size() + token_table = k2.SymbolTable.from_file(params.tokens) + params.blank_id = token_table[""] + params.vocab_size = num_tokens(token_table) + 1 logging.info(params) @@ -709,6 +708,8 @@ def main(): suffix = f"epoch-{params.epoch}" suffix += f"-avg-{params.avg}" + suffix += f"-chunk-{params.chunk_size}" + suffix += f"-left-{params.left_context_frames}" opset_version = 13 @@ -756,7 +757,7 @@ def main(): quantize_dynamic( model_input=decoder_filename, model_output=decoder_filename_int8, - op_types_to_quantize=["MatMul"], + op_types_to_quantize=["MatMul", "Gather"], weight_type=QuantType.QInt8, ) diff --git a/egs/librispeech/ASR/zipformer/export-onnx.py b/egs/librispeech/ASR/zipformer/export-onnx.py index f5b01ce71..724fdd2a6 100755 --- a/egs/librispeech/ASR/zipformer/export-onnx.py +++ b/egs/librispeech/ASR/zipformer/export-onnx.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -# Copyright 2023 Xiaomi Corporation (Author: Fangjun Kuang) +# Copyright 2023 Xiaomi Corporation (Author: Fangjun Kuang, Wei Kang) # Copyright 2023 Danqing Fu (danqing.fu@gmail.com) """ @@ -19,7 +19,7 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url repo=$(basename $repo_url) pushd $repo -git lfs pull --include "data/lang_bpe_500/bpe.model" +git lfs pull --include "data/lang_bpe_500/tokens.txt" git lfs pull --include "exp/pretrained.pt" cd exp @@ -29,12 +29,11 @@ popd 2. 
Export the model to ONNX ./zipformer/export-onnx.py \ - --bpe-model $repo/data/lang_bpe_500/bpe.model \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ --use-averaged-model 0 \ --epoch 99 \ --avg 1 \ --exp-dir $repo/exp \ - \ --num-encoder-layers "2,2,3,4,3,2" \ --downsampling-factor "1,2,4,8,4,2" \ --feedforward-dim "512,768,1024,1536,1024,768" \ @@ -67,14 +66,15 @@ import logging from pathlib import Path from typing import Dict, Tuple +import k2 import onnx -import sentencepiece as spm import torch import torch.nn as nn from decoder import Decoder +from export import num_tokens from onnxruntime.quantization import QuantType, quantize_dynamic from scaling_converter import convert_scaled_to_non_scaled -from train import add_model_arguments, get_params, get_model +from train import add_model_arguments, get_model, get_params from zipformer import Zipformer2 from icefall.checkpoint import ( @@ -83,7 +83,7 @@ from icefall.checkpoint import ( find_checkpoints, load_checkpoint, ) -from icefall.utils import str2bool, make_pad_mask +from icefall.utils import make_pad_mask, str2bool def get_parser(): @@ -140,10 +140,10 @@ def get_parser(): ) parser.add_argument( - "--bpe-model", + "--tokens", type=str, - default="data/lang_bpe_500/bpe.model", - help="Path to the BPE model", + default="data/lang_bpe_500/tokens.txt", + help="Path to the tokens.txt", ) parser.add_argument( @@ -434,12 +434,9 @@ def main(): logging.info(f"device: {device}") - sp = spm.SentencePieceProcessor() - sp.load(params.bpe_model) - - # is defined in local/train_bpe_model.py - params.blank_id = sp.piece_to_id("") - params.vocab_size = sp.get_piece_size() + token_table = k2.SymbolTable.from_file(params.tokens) + params.blank_id = token_table[""] + params.vocab_size = num_tokens(token_table) + 1 logging.info(params) @@ -605,7 +602,7 @@ def main(): quantize_dynamic( model_input=decoder_filename, model_output=decoder_filename_int8, - op_types_to_quantize=["MatMul"], + op_types_to_quantize=["MatMul", "Gather"], weight_type=QuantType.QInt8, ) diff --git a/egs/librispeech/ASR/zipformer/export.py b/egs/librispeech/ASR/zipformer/export.py index a100cbb8d..4a48d5bad 100755 --- a/egs/librispeech/ASR/zipformer/export.py +++ b/egs/librispeech/ASR/zipformer/export.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # -# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang, Zengwei Yao) +# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang, +# Zengwei Yao, +# Wei Kang) # # See ../../../../LICENSE for clarification regarding multiple authors # @@ -22,13 +24,16 @@ Usage: +Note: This is a example for librispeech dataset, if you are using different +dataset, you should change the argument values according to your dataset. + (1) Export to torchscript model using torch.jit.script() - For non-streaming model: ./zipformer/export.py \ --exp-dir ./zipformer/exp \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 \ --jit 1 @@ -48,7 +53,7 @@ for how to use the exported models outside of icefall. --causal 1 \ --chunk-size 16 \ --left-context-frames 128 \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 \ --jit 1 @@ -67,7 +72,7 @@ for how to use the exported models outside of icefall. ./zipformer/export.py \ --exp-dir ./zipformer/exp \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 @@ -76,7 +81,7 @@ for how to use the exported models outside of icefall. 
./zipformer/export.py \ --exp-dir ./zipformer/exp \ --causal 1 \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 @@ -155,13 +160,15 @@ with the following commands: import argparse import logging +import re from pathlib import Path from typing import List, Tuple -import sentencepiece as spm +import k2 import torch +from scaling_converter import convert_scaled_to_non_scaled from torch import Tensor, nn -from train import add_model_arguments, get_params, get_model +from train import add_model_arguments, get_model, get_params from icefall.checkpoint import ( average_checkpoints, @@ -170,7 +177,26 @@ from icefall.checkpoint import ( load_checkpoint, ) from icefall.utils import make_pad_mask, str2bool -from scaling_converter import convert_scaled_to_non_scaled + + +def num_tokens( + token_table: k2.SymbolTable, disambig_pattern: str = re.compile(r"^#\d+$") +) -> int: + """Return the number of tokens excluding those from + disambiguation symbols. + + Caution: + 0 is not a token ID so it is excluded from the return value. + """ + symbols = token_table.symbols + ans = [] + for s in symbols: + if not disambig_pattern.match(s): + ans.append(token_table[s]) + num_tokens = len(ans) + if 0 in ans: + num_tokens -= 1 + return num_tokens def get_parser(): @@ -227,10 +253,10 @@ def get_parser(): ) parser.add_argument( - "--bpe-model", + "--tokens", type=str, - default="data/lang_bpe_500/bpe.model", - help="Path to the BPE model", + default="data/lang_bpe_500/tokens.txt", + help="Path to the tokens.txt", ) parser.add_argument( @@ -238,7 +264,7 @@ def get_parser(): type=str2bool, default=False, help="""True to save a model after applying torch.jit.script. - It will generate a file named cpu_jit.pt. + It will generate a file named jit_script.pt. Check ./jit_pretrained.py for how to use it. """, ) @@ -257,6 +283,7 @@ def get_parser(): class EncoderModel(nn.Module): """A wrapper for encoder and encoder_embed""" + def __init__(self, encoder: nn.Module, encoder_embed: nn.Module) -> None: super().__init__() self.encoder = encoder @@ -275,9 +302,7 @@ class EncoderModel(nn.Module): src_key_padding_mask = make_pad_mask(x_lens) x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) - encoder_out, encoder_out_lens = self.encoder( - x, x_lens, src_key_padding_mask - ) + encoder_out, encoder_out_lens = self.encoder(x, x_lens, src_key_padding_mask) encoder_out = encoder_out.permute(1, 0, 2) # (T, N, C) ->(N, T, C) return encoder_out, encoder_out_lens @@ -398,12 +423,9 @@ def main(): logging.info(f"device: {device}") - sp = spm.SentencePieceProcessor() - sp.load(params.bpe_model) - - # is defined in local/train_bpe_model.py - params.blank_id = sp.piece_to_id("") - params.vocab_size = sp.get_piece_size() + token_table = k2.SymbolTable.from_file(params.tokens) + params.blank_id = token_table[""] + params.vocab_size = num_tokens(token_table) + 1 logging.info(params) diff --git a/egs/librispeech/ASR/zipformer/generate_averaged_model.py b/egs/librispeech/ASR/zipformer/generate_averaged_model.py index e0c7b52cb..68111fad7 100755 --- a/egs/librispeech/ASR/zipformer/generate_averaged_model.py +++ b/egs/librispeech/ASR/zipformer/generate_averaged_model.py @@ -40,16 +40,11 @@ You can later load it by `torch.load("iter-22000-avg-5.pt")`. 
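The num_tokens() helper added to export.py above counts the entries of tokens.txt while skipping disambiguation symbols (#0, #1, ...) and the ID-0 entry; callers then use num_tokens(token_table) + 1 as the vocabulary size. A small sanity check on a toy token table is sketched below; the file contents are hypothetical, and the snippet assumes k2 is installed and export.py is importable from the zipformer directory.

import tempfile

import k2
from export import num_tokens  # the helper defined above in zipformer/export.py

# Toy tokens.txt: "<symbol> <id>" per line, including two disambiguation symbols.
content = "<blk> 0\n<sos/eos> 1\n▁THE 2\nS 3\n#0 4\n#1 5\n"
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write(content)
    tokens_file = f.name

token_table = k2.SymbolTable.from_file(tokens_file)
# 6 symbols, minus the 2 disambiguation symbols, minus the ID-0 entry -> 3
assert num_tokens(token_table) == 3
vocab_size = num_tokens(token_table) + 1  # main() adds the blank back
assert vocab_size == 4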
import argparse from pathlib import Path -import sentencepiece as spm +import k2 import torch -from asr_datamodule import LibriSpeechAsrDataModule +from train import add_model_arguments, get_model, get_params -from train import add_model_arguments, get_params, get_model - -from icefall.checkpoint import ( - average_checkpoints_with_averaged_model, - find_checkpoints, -) +from icefall.checkpoint import average_checkpoints_with_averaged_model, find_checkpoints def get_parser(): @@ -93,10 +88,10 @@ def get_parser(): ) parser.add_argument( - "--bpe-model", + "--tokens", type=str, - default="data/lang_bpe_500/bpe.model", - help="Path to the BPE model", + default="data/lang_bpe_500/tokens.txt", + help="Path to the tokens.txt", ) parser.add_argument( @@ -114,7 +109,6 @@ def get_parser(): @torch.no_grad() def main(): parser = get_parser() - LibriSpeechAsrDataModule.add_arguments(parser) args = parser.parse_args() args.exp_dir = Path(args.exp_dir) @@ -131,13 +125,10 @@ def main(): device = torch.device("cpu") print(f"Device: {device}") - sp = spm.SentencePieceProcessor() - sp.load(params.bpe_model) - - # is defined in local/train_bpe_model.py - params.blank_id = sp.piece_to_id("") - params.unk_id = sp.piece_to_id("") - params.vocab_size = sp.get_piece_size() + symbol_table = k2.SymbolTable.from_file(params.tokens) + params.blank_id = symbol_table[""] + params.unk_id = symbol_table[""] + params.vocab_size = len(symbol_table) print("About to create model") model = get_model(params) diff --git a/egs/librispeech/ASR/zipformer/jit_pretrained.py b/egs/librispeech/ASR/zipformer/jit_pretrained.py index 87cd5102c..a41fbc1c9 100755 --- a/egs/librispeech/ASR/zipformer/jit_pretrained.py +++ b/egs/librispeech/ASR/zipformer/jit_pretrained.py @@ -21,7 +21,7 @@ You can use the following command to get the exported models: ./zipformer/export.py \ --exp-dir ./zipformer/exp \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 \ --jit 1 @@ -30,7 +30,7 @@ Usage of this script: ./zipformer/jit_pretrained.py \ --nn-model-filename ./zipformer/exp/cpu_jit.pt \ - --bpe-model ./data/lang_bpe_500/bpe.model \ + --tokens ./data/lang_bpe_500/tokens.txt \ /path/to/foo.wav \ /path/to/bar.wav """ @@ -40,8 +40,8 @@ import logging import math from typing import List +import k2 import kaldifeat -import sentencepiece as spm import torch import torchaudio from torch.nn.utils.rnn import pad_sequence @@ -60,9 +60,9 @@ def get_parser(): ) parser.add_argument( - "--bpe-model", + "--tokens", type=str, - help="""Path to bpe.model.""", + help="""Path to tokens.txt.""", ) parser.add_argument( @@ -128,7 +128,7 @@ def greedy_search( ) device = encoder_out.device - blank_id = 0 # hard-code to 0 + blank_id = model.decoder.blank_id batch_size_list = packed_encoder_out.batch_sizes.tolist() N = encoder_out.size(0) @@ -215,9 +215,6 @@ def main(): model.to(device) - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) - logging.info("Constructing Fbank computer") opts = kaldifeat.FbankOptions() opts.device = device @@ -256,10 +253,21 @@ def main(): encoder_out=encoder_out, encoder_out_lens=encoder_out_lens, ) + s = "\n" + + token_table = k2.SymbolTable.from_file(args.tokens) + + def token_ids_to_words(token_ids: List[int]) -> str: + text = "" + for i in token_ids: + text += token_table[i] + return text.replace("▁", " ").strip() + for filename, hyp in zip(args.sound_files, hyps): - words = sp.decode(hyp) - s += f"{filename}:\n{words}\n\n" + words = token_ids_to_words(hyp) + s += 
f"{filename}:\n{words}\n" + logging.info(s) logging.info("Decoding Done") diff --git a/egs/librispeech/ASR/zipformer/jit_pretrained_ctc.py b/egs/librispeech/ASR/zipformer/jit_pretrained_ctc.py index 1ec390d5b..904d8cd76 100755 --- a/egs/librispeech/ASR/zipformer/jit_pretrained_ctc.py +++ b/egs/librispeech/ASR/zipformer/jit_pretrained_ctc.py @@ -24,7 +24,7 @@ You can generate the checkpoint with the following command: ./zipformer/export.py \ --exp-dir ./zipformer/exp \ --use-ctc 1 \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 \ --jit 1 @@ -35,7 +35,7 @@ You can generate the checkpoint with the following command: --exp-dir ./zipformer/exp \ --use-ctc 1 \ --causal 1 \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 \ --jit 1 @@ -45,7 +45,7 @@ Usage of this script: (1) ctc-decoding ./zipformer/jit_pretrained_ctc.py \ --model-filename ./zipformer/exp/jit_script.pt \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --method ctc-decoding \ --sample-rate 16000 \ /path/to/foo.wav \ @@ -91,10 +91,10 @@ from typing import List import k2 import kaldifeat -import sentencepiece as spm import torch import torchaudio from ctc_decode import get_decoding_params +from export import num_tokens from torch.nn.utils.rnn import pad_sequence from train import get_params @@ -136,9 +136,9 @@ def get_parser(): ) parser.add_argument( - "--bpe-model", + "--tokens", type=str, - help="""Path to bpe.model. + help="""Path to tokens.txt. Used only when method is ctc-decoding. """, ) @@ -149,8 +149,8 @@ def get_parser(): default="1best", help="""Decoding method. Possible values are: - (0) ctc-decoding - Use CTC decoding. It uses a sentence - piece model, i.e., lang_dir/bpe.model, to convert + (0) ctc-decoding - Use CTC decoding. It uses a token table, + i.e., lang_dir/token.txt, to convert word pieces to words. It needs neither a lexicon nor an n-gram LM. (1) 1best - Use the best path as decoding output. 
Only @@ -263,10 +263,8 @@ def main(): params.update(get_decoding_params()) params.update(vars(args)) - sp = spm.SentencePieceProcessor() - sp.load(params.bpe_model) - - params.vocab_size = sp.get_piece_size() + token_table = k2.SymbolTable.from_file(params.tokens) + params.vocab_size = num_tokens(token_table) + 1 logging.info(f"{params}") @@ -340,8 +338,7 @@ def main(): lattice=lattice, use_double_scores=params.use_double_scores ) token_ids = get_texts(best_path) - hyps = sp.decode(token_ids) - hyps = [s.split() for s in hyps] + hyps = [[token_table[i] for i in ids] for ids in token_ids] elif params.method in [ "1best", "nbest-rescoring", @@ -415,6 +412,7 @@ def main(): s = "\n" for filename, hyp in zip(params.sound_files, hyps): words = " ".join(hyp) + words = words.replace("▁", " ").strip() s += f"{filename}:\n{words}\n\n" logging.info(s) diff --git a/egs/librispeech/ASR/zipformer/jit_pretrained_streaming.py b/egs/librispeech/ASR/zipformer/jit_pretrained_streaming.py index 58d736685..d4ceacefd 100755 --- a/egs/librispeech/ASR/zipformer/jit_pretrained_streaming.py +++ b/egs/librispeech/ASR/zipformer/jit_pretrained_streaming.py @@ -25,7 +25,7 @@ You can use the following command to get the exported models: --causal 1 \ --chunk-size 16 \ --left-context-frames 128 \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 \ --jit 1 @@ -34,7 +34,7 @@ Usage of this script: ./zipformer/jit_pretrained_streaming.py \ --nn-model-filename ./zipformer/exp-causal/jit_script_chunk_16_left_128.pt \ - --bpe-model ./data/lang_bpe_500/bpe.model \ + --tokens ./data/lang_bpe_500/tokens.txt \ /path/to/foo.wav \ """ @@ -43,8 +43,8 @@ import logging import math from typing import List, Optional +import k2 import kaldifeat -import sentencepiece as spm import torch import torchaudio from kaldifeat import FbankOptions, OnlineFbank, OnlineFeature @@ -60,13 +60,13 @@ def get_parser(): "--nn-model-filename", type=str, required=True, - help="Path to the torchscript model cpu_jit.pt", + help="Path to the torchscript model jit_script.pt", ) parser.add_argument( - "--bpe-model", + "--tokens", type=str, - help="""Path to bpe.model.""", + help="""Path to tokens.txt.""", ) parser.add_argument( @@ -120,8 +120,8 @@ def greedy_search( device: torch.device = torch.device("cpu"), ): assert encoder_out.ndim == 2 - context_size = 2 - blank_id = 0 + context_size = decoder.context_size + blank_id = decoder.blank_id if decoder_out is None: assert hyp is None, hyp @@ -190,8 +190,8 @@ def main(): decoder = model.decoder joiner = model.joiner - sp = spm.SentencePieceProcessor() - sp.load(args.bpe_model) + token_table = k2.SymbolTable.from_file(args.tokens) + context_size = decoder.context_size logging.info("Constructing Fbank computer") online_fbank = create_streaming_feature_extractor(args.sample_rate) @@ -250,9 +250,13 @@ def main(): decoder, joiner, encoder_out.squeeze(0), decoder_out, hyp, device=device ) - context_size = 2 + text = "" + for i in hyp[context_size:]: + text += token_table[i] + text = text.replace("▁", " ").strip() + logging.info(args.sound_file) - logging.info(sp.decode(hyp[context_size:])) + logging.info(text) logging.info("Decoding Done") diff --git a/egs/librispeech/ASR/zipformer/model.py b/egs/librispeech/ASR/zipformer/model.py index b541ee697..f2f86af47 100644 --- a/egs/librispeech/ASR/zipformer/model.py +++ b/egs/librispeech/ASR/zipformer/model.py @@ -320,7 +320,7 @@ class AsrModel(nn.Module): assert x_lens.ndim == 1, x_lens.shape assert y.num_axes == 2, 
y.num_axes - assert x.size(0) == x_lens.size(0) == y.dim0 + assert x.size(0) == x_lens.size(0) == y.dim0, (x.shape, x_lens.shape, y.dim0) # Compute encoder outputs encoder_out, encoder_out_lens = self.forward_encoder(x, x_lens) diff --git a/egs/librispeech/ASR/zipformer/onnx_check.py b/egs/librispeech/ASR/zipformer/onnx_check.py new file mode 100755 index 000000000..b38b875d0 --- /dev/null +++ b/egs/librispeech/ASR/zipformer/onnx_check.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +# +# Copyright 2022 Xiaomi Corporation (Author: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script checks that exported onnx models produce the same output +with the given torchscript model for the same input. + +We use the pre-trained model from +https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15 +as an example to show how to use this file. + +1. Download the pre-trained model + +cd egs/librispeech/ASR + +repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15 +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +repo=$(basename $repo_url) + +pushd $repo +git lfs pull --include "data/lang_bpe_500/tokens.txt" +git lfs pull --include "exp/pretrained.pt" + +cd exp +ln -s pretrained.pt epoch-99.pt +popd + +2. Export the model via torchscript (torch.jit.script()) + +./zipformer/export.py \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ + --use-averaged-model 0 \ + --epoch 99 \ + --avg 1 \ + --exp-dir $repo/exp/ \ + --jit 1 + +It will generate the following file in $repo/exp: + - jit_script.pt + +3. Export the model to ONNX + +./zipformer/export-onnx.py \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ + --use-averaged-model 0 \ + --epoch 99 \ + --avg 1 \ + --exp-dir $repo/exp/ + +It will generate the following 3 files inside $repo/exp: + + - encoder-epoch-99-avg-1.onnx + - decoder-epoch-99-avg-1.onnx + - joiner-epoch-99-avg-1.onnx + +4. 
Run this file + +./zipformer/onnx_check.py \ + --jit-filename $repo/exp/jit_script.pt \ + --onnx-encoder-filename $repo/exp/encoder-epoch-99-avg-1.onnx \ + --onnx-decoder-filename $repo/exp/decoder-epoch-99-avg-1.onnx \ + --onnx-joiner-filename $repo/exp/joiner-epoch-99-avg-1.onnx +""" + +import argparse +import logging + +import torch +from onnx_pretrained import OnnxModel + +from icefall import is_module_available + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--jit-filename", + required=True, + type=str, + help="Path to the torchscript model", + ) + + parser.add_argument( + "--onnx-encoder-filename", + required=True, + type=str, + help="Path to the onnx encoder model", + ) + + parser.add_argument( + "--onnx-decoder-filename", + required=True, + type=str, + help="Path to the onnx decoder model", + ) + + parser.add_argument( + "--onnx-joiner-filename", + required=True, + type=str, + help="Path to the onnx joiner model", + ) + + return parser + + +def test_encoder( + torch_model: torch.jit.ScriptModule, + onnx_model: OnnxModel, +): + C = 80 + for i in range(3): + N = torch.randint(low=1, high=20, size=(1,)).item() + T = torch.randint(low=30, high=50, size=(1,)).item() + logging.info(f"test_encoder: iter {i}, N={N}, T={T}") + + x = torch.rand(N, T, C) + x_lens = torch.randint(low=30, high=T + 1, size=(N,)) + x_lens[0] = T + + torch_encoder_out, torch_encoder_out_lens = torch_model.encoder(x, x_lens) + torch_encoder_out = torch_model.joiner.encoder_proj(torch_encoder_out) + + onnx_encoder_out, onnx_encoder_out_lens = onnx_model.run_encoder(x, x_lens) + + assert torch.allclose(torch_encoder_out, onnx_encoder_out, atol=1e-05), ( + (torch_encoder_out - onnx_encoder_out).abs().max() + ) + + +def test_decoder( + torch_model: torch.jit.ScriptModule, + onnx_model: OnnxModel, +): + context_size = onnx_model.context_size + vocab_size = onnx_model.vocab_size + for i in range(10): + N = torch.randint(1, 100, size=(1,)).item() + logging.info(f"test_decoder: iter {i}, N={N}") + x = torch.randint( + low=1, + high=vocab_size, + size=(N, context_size), + dtype=torch.int64, + ) + torch_decoder_out = torch_model.decoder(x, need_pad=torch.tensor([False])) + torch_decoder_out = torch_model.joiner.decoder_proj(torch_decoder_out) + torch_decoder_out = torch_decoder_out.squeeze(1) + + onnx_decoder_out = onnx_model.run_decoder(x) + assert torch.allclose(torch_decoder_out, onnx_decoder_out, atol=1e-4), ( + (torch_decoder_out - onnx_decoder_out).abs().max() + ) + + +def test_joiner( + torch_model: torch.jit.ScriptModule, + onnx_model: OnnxModel, +): + encoder_dim = torch_model.joiner.encoder_proj.weight.shape[1] + decoder_dim = torch_model.joiner.decoder_proj.weight.shape[1] + for i in range(10): + N = torch.randint(1, 100, size=(1,)).item() + logging.info(f"test_joiner: iter {i}, N={N}") + encoder_out = torch.rand(N, encoder_dim) + decoder_out = torch.rand(N, decoder_dim) + + projected_encoder_out = torch_model.joiner.encoder_proj(encoder_out) + projected_decoder_out = torch_model.joiner.decoder_proj(decoder_out) + + torch_joiner_out = torch_model.joiner(encoder_out, decoder_out) + onnx_joiner_out = onnx_model.run_joiner( + projected_encoder_out, projected_decoder_out + ) + + assert torch.allclose(torch_joiner_out, onnx_joiner_out, atol=1e-4), ( + (torch_joiner_out - onnx_joiner_out).abs().max() + ) + + +@torch.no_grad() +def main(): + args = get_parser().parse_args() + logging.info(vars(args)) + + torch_model = 
torch.jit.load(args.jit_filename) + + onnx_model = OnnxModel( + encoder_model_filename=args.onnx_encoder_filename, + decoder_model_filename=args.onnx_decoder_filename, + joiner_model_filename=args.onnx_joiner_filename, + ) + + logging.info("Test encoder") + test_encoder(torch_model, onnx_model) + + logging.info("Test decoder") + test_decoder(torch_model, onnx_model) + + logging.info("Test joiner") + test_joiner(torch_model, onnx_model) + logging.info("Finished checking ONNX models") + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + +# See https://github.com/pytorch/pytorch/issues/38342 +# and https://github.com/pytorch/pytorch/issues/33354 +# +# If we don't do this, the delay increases whenever there is +# a new request that changes the actual batch size. +# If you use `py-spy dump --pid --native`, you will +# see a lot of time is spent in re-compiling the torch script model. +torch._C._jit_set_profiling_executor(False) +torch._C._jit_set_profiling_mode(False) +torch._C._set_graph_executor_optimize(False) +if __name__ == "__main__": + torch.manual_seed(20220727) + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + + logging.basicConfig(format=formatter, level=logging.INFO) + main() diff --git a/egs/librispeech/ASR/zipformer/onnx_decode.py b/egs/librispeech/ASR/zipformer/onnx_decode.py new file mode 100755 index 000000000..2aca36ca9 --- /dev/null +++ b/egs/librispeech/ASR/zipformer/onnx_decode.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 +# +# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang, +# Zengwei Yao, +# Xiaoyu Yang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script loads ONNX exported models and uses them to decode the test sets. + +We use the pre-trained model from +https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15 +as an example to show how to use this file. + +1. Download the pre-trained model + +cd egs/librispeech/ASR + +repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15 +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +repo=$(basename $repo_url) + +pushd $repo +git lfs pull --include "data/lang_bpe_500/bpe.model" +git lfs pull --include "exp/pretrained.pt" + +cd exp +ln -s pretrained.pt epoch-99.pt +popd + +2. Export the model to ONNX + +./zipformer/export-onnx.py \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ + --use-averaged-model 0 \ + --epoch 99 \ + --avg 1 \ + --exp-dir $repo/exp \ + --causal False + +It will generate the following 3 files inside $repo/exp: + + - encoder-epoch-99-avg-1.onnx + - decoder-epoch-99-avg-1.onnx + - joiner-epoch-99-avg-1.onnx + +2. 
Run this file + +./zipformer/onnx_decode.py \ + --exp-dir $repo/exp \ + --max-duration 600 \ + --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \ + --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \ + --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ +""" + + +import argparse +import logging +import time +from pathlib import Path +from typing import List, Tuple + +import torch +import torch.nn as nn +from asr_datamodule import LibriSpeechAsrDataModule + +from onnx_pretrained import greedy_search, OnnxModel + +from icefall.utils import setup_logger, store_transcripts, write_error_stats +from k2 import SymbolTable + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--encoder-model-filename", + type=str, + required=True, + help="Path to the encoder onnx model. ", + ) + + parser.add_argument( + "--decoder-model-filename", + type=str, + required=True, + help="Path to the decoder onnx model. ", + ) + + parser.add_argument( + "--joiner-model-filename", + type=str, + required=True, + help="Path to the joiner onnx model. ", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="zipformer/exp", + help="The experiment dir", + ) + + parser.add_argument( + "--tokens", + type=str, + help="""Path to tokens.txt.""", + ) + + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="Valid values are greedy_search and modified_beam_search", + ) + + return parser + + +def decode_one_batch( + model: OnnxModel, token_table: SymbolTable, batch: dict +) -> List[List[str]]: + """Decode one batch and return the result. + Currently it only greedy_search is supported. + + Args: + model: + The neural model. + token_table: + The token table. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + + Returns: + Return the decoded results for each utterance. + """ + feature = batch["inputs"] + assert feature.ndim == 3 + # at entry, feature is (N, T, C) + + supervisions = batch["supervisions"] + feature_lens = supervisions["num_frames"].to(dtype=torch.int64) + + encoder_out, encoder_out_lens = model.run_encoder(x=feature, x_lens=feature_lens) + + hyps = greedy_search( + model=model, encoder_out=encoder_out, encoder_out_lens=encoder_out_lens + ) + + def token_ids_to_words(token_ids: List[int]) -> str: + text = "" + for i in token_ids: + text += token_table[i] + return text.replace("▁", " ").strip() + + hyps = [token_ids_to_words(h).split() for h in hyps] + return hyps + + +def decode_dataset( + dl: torch.utils.data.DataLoader, + model: nn.Module, + token_table: SymbolTable, +) -> Tuple[List[Tuple[str, List[str], List[str]]], float]: + """Decode dataset. + + Args: + dl: + PyTorch's dataloader containing the dataset to decode. + model: + The neural model. + token_table: + The token table. + + Returns: + - A list of tuples. Each tuple contains three elements: + - cut_id, + - reference transcript, + - predicted result. + - The total duration (in seconds) of the dataset. + """ + num_cuts = 0 + + try: + num_batches = len(dl) + except TypeError: + num_batches = "?" 
+ + log_interval = 10 + total_duration = 0 + + results = [] + for batch_idx, batch in enumerate(dl): + texts = batch["supervisions"]["text"] + cut_ids = [cut.id for cut in batch["supervisions"]["cut"]] + total_duration += sum([cut.duration for cut in batch["supervisions"]["cut"]]) + + hyps = decode_one_batch(model=model, token_table=token_table, batch=batch) + + this_batch = [] + assert len(hyps) == len(texts) + for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts): + ref_words = ref_text.split() + this_batch.append((cut_id, ref_words, hyp_words)) + + results.extend(this_batch) + + num_cuts += len(texts) + + if batch_idx % log_interval == 0: + batch_str = f"{batch_idx}/{num_batches}" + + logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}") + + return results, total_duration + + +def save_results( + res_dir: Path, + test_set_name: str, + results: List[Tuple[str, List[str], List[str]]], +): + recog_path = res_dir / f"recogs-{test_set_name}.txt" + results = sorted(results) + store_transcripts(filename=recog_path, texts=results) + logging.info(f"The transcripts are stored in {recog_path}") + + # The following prints out WERs, per-word error statistics and aligned + # ref/hyp pairs. + errs_filename = res_dir / f"errs-{test_set_name}.txt" + with open(errs_filename, "w") as f: + wer = write_error_stats(f, f"{test_set_name}", results, enable_log=True) + + logging.info("Wrote detailed error stats to {}".format(errs_filename)) + + errs_info = res_dir / f"wer-summary-{test_set_name}.txt" + with open(errs_info, "w") as f: + print("WER", file=f) + print(wer, file=f) + + s = "\nFor {}, WER is {}:\n".format(test_set_name, wer) + logging.info(s) + + +@torch.no_grad() +def main(): + parser = get_parser() + LibriSpeechAsrDataModule.add_arguments(parser) + args = parser.parse_args() + + assert ( + args.decoding_method == "greedy_search" + ), "Only supports greedy_search currently." + res_dir = Path(args.exp_dir) / f"onnx-{args.decoding_method}" + + setup_logger(f"{res_dir}/log-decode") + logging.info("Decoding started") + + device = torch.device("cpu") + logging.info(f"Device: {device}") + + token_table = SymbolTable.from_file(args.tokens) + + logging.info(vars(args)) + + logging.info("About to create model") + model = OnnxModel( + encoder_model_filename=args.encoder_model_filename, + decoder_model_filename=args.decoder_model_filename, + joiner_model_filename=args.joiner_model_filename, + ) + + # we need cut ids to display recognition results. 
+ args.return_cuts = True + librispeech = LibriSpeechAsrDataModule(args) + + test_clean_cuts = librispeech.test_clean_cuts() + test_other_cuts = librispeech.test_other_cuts() + + test_clean_dl = librispeech.test_dataloaders(test_clean_cuts) + test_other_dl = librispeech.test_dataloaders(test_other_cuts) + + test_sets = ["test-clean", "test-other"] + test_dl = [test_clean_dl, test_other_dl] + + for test_set, test_dl in zip(test_sets, test_dl): + start_time = time.time() + results, total_duration = decode_dataset(dl=test_dl, model=model, token_table=token_table) + end_time = time.time() + elapsed_seconds = end_time - start_time + rtf = elapsed_seconds / total_duration + + logging.info(f"Elapsed time: {elapsed_seconds:.3f} s") + logging.info(f"Wave duration: {total_duration:.3f} s") + logging.info( + f"Real time factor (RTF): {elapsed_seconds:.3f}/{total_duration:.3f} = {rtf:.3f}" + ) + + save_results(res_dir=res_dir, test_set_name=test_set, results=results) + + logging.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/ASR/zipformer/onnx_pretrained-streaming.py b/egs/librispeech/ASR/zipformer/onnx_pretrained-streaming.py index 273f883df..2ce4506a8 100755 --- a/egs/librispeech/ASR/zipformer/onnx_pretrained-streaming.py +++ b/egs/librispeech/ASR/zipformer/onnx_pretrained-streaming.py @@ -524,11 +524,11 @@ def main(): hyp, ) - symbol_table = k2.SymbolTable.from_file(args.tokens) + token_table = k2.SymbolTable.from_file(args.tokens) text = "" for i in hyp[context_size:]: - text += symbol_table[i] + text += token_table[i] text = text.replace("▁", " ").strip() logging.info(args.sound_file) diff --git a/egs/librispeech/ASR/zipformer/onnx_pretrained.py b/egs/librispeech/ASR/zipformer/onnx_pretrained.py deleted file mode 120000 index 0069288fe..000000000 --- a/egs/librispeech/ASR/zipformer/onnx_pretrained.py +++ /dev/null @@ -1 +0,0 @@ -../pruned_transducer_stateless7/onnx_pretrained.py \ No newline at end of file diff --git a/egs/librispeech/ASR/zipformer/onnx_pretrained.py b/egs/librispeech/ASR/zipformer/onnx_pretrained.py new file mode 100755 index 000000000..e8a521460 --- /dev/null +++ b/egs/librispeech/ASR/zipformer/onnx_pretrained.py @@ -0,0 +1,419 @@ +#!/usr/bin/env python3 +# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script loads ONNX models and uses them to decode waves. +You can use the following command to get the exported models: + +We use the pre-trained model from +https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15 +as an example to show how to use this file. + +1. 
Download the pre-trained model + +cd egs/librispeech/ASR + +repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15 +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +repo=$(basename $repo_url) + +pushd $repo +git lfs pull --include "data/lang_bpe_500/tokens.txt" +git lfs pull --include "exp/pretrained.pt" + +cd exp +ln -s pretrained.pt epoch-99.pt +popd + +2. Export the model to ONNX + +./zipformer/export-onnx.py \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ + --use-averaged-model 0 \ + --epoch 99 \ + --avg 1 \ + --exp-dir $repo/exp \ + --causal False + +It will generate the following 3 files inside $repo/exp: + + - encoder-epoch-99-avg-1.onnx + - decoder-epoch-99-avg-1.onnx + - joiner-epoch-99-avg-1.onnx + +3. Run this file + +./zipformer/onnx_pretrained.py \ + --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \ + --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \ + --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav +""" + +import argparse +import logging +import math +from typing import List, Tuple + +import k2 +import kaldifeat +import onnxruntime as ort +import torch +import torchaudio +from torch.nn.utils.rnn import pad_sequence + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--encoder-model-filename", + type=str, + required=True, + help="Path to the encoder onnx model. ", + ) + + parser.add_argument( + "--decoder-model-filename", + type=str, + required=True, + help="Path to the decoder onnx model. ", + ) + + parser.add_argument( + "--joiner-model-filename", + type=str, + required=True, + help="Path to the joiner onnx model. ", + ) + + parser.add_argument( + "--tokens", + type=str, + help="""Path to tokens.txt.""", + ) + + parser.add_argument( + "sound_files", + type=str, + nargs="+", + help="The input sound file(s) to transcribe. " + "Supported formats are those supported by torchaudio.load(). " + "For example, wav and flac are supported. 
" + "The sample rate has to be 16kHz.", + ) + + parser.add_argument( + "--sample-rate", + type=int, + default=16000, + help="The sample rate of the input sound file", + ) + + return parser + + +class OnnxModel: + def __init__( + self, + encoder_model_filename: str, + decoder_model_filename: str, + joiner_model_filename: str, + ): + session_opts = ort.SessionOptions() + session_opts.inter_op_num_threads = 1 + session_opts.intra_op_num_threads = 4 + + self.session_opts = session_opts + + self.init_encoder(encoder_model_filename) + self.init_decoder(decoder_model_filename) + self.init_joiner(joiner_model_filename) + + def init_encoder(self, encoder_model_filename: str): + self.encoder = ort.InferenceSession( + encoder_model_filename, + sess_options=self.session_opts, + ) + + def init_decoder(self, decoder_model_filename: str): + self.decoder = ort.InferenceSession( + decoder_model_filename, + sess_options=self.session_opts, + ) + + decoder_meta = self.decoder.get_modelmeta().custom_metadata_map + self.context_size = int(decoder_meta["context_size"]) + self.vocab_size = int(decoder_meta["vocab_size"]) + + logging.info(f"context_size: {self.context_size}") + logging.info(f"vocab_size: {self.vocab_size}") + + def init_joiner(self, joiner_model_filename: str): + self.joiner = ort.InferenceSession( + joiner_model_filename, + sess_options=self.session_opts, + ) + + joiner_meta = self.joiner.get_modelmeta().custom_metadata_map + self.joiner_dim = int(joiner_meta["joiner_dim"]) + + logging.info(f"joiner_dim: {self.joiner_dim}") + + def run_encoder( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x: + A 3-D tensor of shape (N, T, C) + x_lens: + A 2-D tensor of shape (N,). Its dtype is torch.int64 + Returns: + Return a tuple containing: + - encoder_out, its shape is (N, T', joiner_dim) + - encoder_out_lens, its shape is (N,) + """ + out = self.encoder.run( + [ + self.encoder.get_outputs()[0].name, + self.encoder.get_outputs()[1].name, + ], + { + self.encoder.get_inputs()[0].name: x.numpy(), + self.encoder.get_inputs()[1].name: x_lens.numpy(), + }, + ) + return torch.from_numpy(out[0]), torch.from_numpy(out[1]) + + def run_decoder(self, decoder_input: torch.Tensor) -> torch.Tensor: + """ + Args: + decoder_input: + A 2-D tensor of shape (N, context_size) + Returns: + Return a 2-D tensor of shape (N, joiner_dim) + """ + out = self.decoder.run( + [self.decoder.get_outputs()[0].name], + {self.decoder.get_inputs()[0].name: decoder_input.numpy()}, + )[0] + + return torch.from_numpy(out) + + def run_joiner( + self, encoder_out: torch.Tensor, decoder_out: torch.Tensor + ) -> torch.Tensor: + """ + Args: + encoder_out: + A 2-D tensor of shape (N, joiner_dim) + decoder_out: + A 2-D tensor of shape (N, joiner_dim) + Returns: + Return a 2-D tensor of shape (N, vocab_size) + """ + out = self.joiner.run( + [self.joiner.get_outputs()[0].name], + { + self.joiner.get_inputs()[0].name: encoder_out.numpy(), + self.joiner.get_inputs()[1].name: decoder_out.numpy(), + }, + )[0] + + return torch.from_numpy(out) + + +def read_sound_files( + filenames: List[str], expected_sample_rate: float +) -> List[torch.Tensor]: + """Read a list of sound files into a list 1-D float32 torch tensors. + Args: + filenames: + A list of sound filenames. + expected_sample_rate: + The expected sample rate of the sound files. + Returns: + Return a list of 1-D float32 torch tensors. 
+ """ + ans = [] + for f in filenames: + wave, sample_rate = torchaudio.load(f) + assert ( + sample_rate == expected_sample_rate + ), f"expected sample rate: {expected_sample_rate}. Given: {sample_rate}" + # We use only the first channel + ans.append(wave[0]) + return ans + + +def greedy_search( + model: OnnxModel, + encoder_out: torch.Tensor, + encoder_out_lens: torch.Tensor, +) -> List[List[int]]: + """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1. + Args: + model: + The transducer model. + encoder_out: + A 3-D tensor of shape (N, T, joiner_dim) + encoder_out_lens: + A 1-D tensor of shape (N,). + Returns: + Return the decoded results for each utterance. + """ + assert encoder_out.ndim == 3, encoder_out.shape + assert encoder_out.size(0) >= 1, encoder_out.size(0) + + packed_encoder_out = torch.nn.utils.rnn.pack_padded_sequence( + input=encoder_out, + lengths=encoder_out_lens.cpu(), + batch_first=True, + enforce_sorted=False, + ) + + blank_id = 0 # hard-code to 0 + + batch_size_list = packed_encoder_out.batch_sizes.tolist() + N = encoder_out.size(0) + + assert torch.all(encoder_out_lens > 0), encoder_out_lens + assert N == batch_size_list[0], (N, batch_size_list) + + context_size = model.context_size + hyps = [[blank_id] * context_size for _ in range(N)] + + decoder_input = torch.tensor( + hyps, + dtype=torch.int64, + ) # (N, context_size) + + decoder_out = model.run_decoder(decoder_input) + + offset = 0 + for batch_size in batch_size_list: + start = offset + end = offset + batch_size + current_encoder_out = packed_encoder_out.data[start:end] + # current_encoder_out's shape: (batch_size, joiner_dim) + offset = end + + decoder_out = decoder_out[:batch_size] + logits = model.run_joiner(current_encoder_out, decoder_out) + + # logits'shape (batch_size, vocab_size) + + assert logits.ndim == 2, logits.shape + y = logits.argmax(dim=1).tolist() + emitted = False + for i, v in enumerate(y): + if v != blank_id: + hyps[i].append(v) + emitted = True + if emitted: + # update decoder output + decoder_input = [h[-context_size:] for h in hyps[:batch_size]] + decoder_input = torch.tensor( + decoder_input, + dtype=torch.int64, + ) + decoder_out = model.run_decoder(decoder_input) + + sorted_ans = [h[context_size:] for h in hyps] + ans = [] + unsorted_indices = packed_encoder_out.unsorted_indices.tolist() + for i in range(N): + ans.append(sorted_ans[unsorted_indices[i]]) + + return ans + + +@torch.no_grad() +def main(): + parser = get_parser() + args = parser.parse_args() + logging.info(vars(args)) + model = OnnxModel( + encoder_model_filename=args.encoder_model_filename, + decoder_model_filename=args.decoder_model_filename, + joiner_model_filename=args.joiner_model_filename, + ) + + logging.info("Constructing Fbank computer") + opts = kaldifeat.FbankOptions() + opts.device = "cpu" + opts.frame_opts.dither = 0 + opts.frame_opts.snip_edges = False + opts.frame_opts.samp_freq = args.sample_rate + opts.mel_opts.num_bins = 80 + + fbank = kaldifeat.Fbank(opts) + + logging.info(f"Reading sound files: {args.sound_files}") + waves = read_sound_files( + filenames=args.sound_files, + expected_sample_rate=args.sample_rate, + ) + + logging.info("Decoding started") + features = fbank(waves) + feature_lengths = [f.size(0) for f in features] + + features = pad_sequence( + features, + batch_first=True, + padding_value=math.log(1e-10), + ) + + feature_lengths = torch.tensor(feature_lengths, dtype=torch.int64) + encoder_out, encoder_out_lens = model.run_encoder(features, feature_lengths) + + hyps = 
greedy_search( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + ) + s = "\n" + + token_table = k2.SymbolTable.from_file(args.tokens) + + def token_ids_to_words(token_ids: List[int]) -> str: + text = "" + for i in token_ids: + text += token_table[i] + return text.replace("▁", " ").strip() + + for filename, hyp in zip(args.sound_files, hyps): + words = token_ids_to_words(hyp) + s += f"{filename}:\n{words}\n" + logging.info(s) + + logging.info("Decoding Done") + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + + logging.basicConfig(format=formatter, level=logging.INFO) + main() diff --git a/egs/librispeech/ASR/zipformer/pretrained.py b/egs/librispeech/ASR/zipformer/pretrained.py index 2944f79e3..3104b6084 100755 --- a/egs/librispeech/ASR/zipformer/pretrained.py +++ b/egs/librispeech/ASR/zipformer/pretrained.py @@ -18,11 +18,14 @@ This script loads a checkpoint and uses it to decode waves. You can generate the checkpoint with the following command: +Note: This is a example for librispeech dataset, if you are using different +dataset, you should change the argument values according to your dataset. + - For non-streaming model: ./zipformer/export.py \ --exp-dir ./zipformer/exp \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 @@ -31,7 +34,7 @@ You can generate the checkpoint with the following command: ./zipformer/export.py \ --exp-dir ./zipformer/exp \ --causal 1 \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 @@ -42,7 +45,7 @@ Usage of this script: (1) greedy search ./zipformer/pretrained.py \ --checkpoint ./zipformer/exp/pretrained.pt \ - --bpe-model ./data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --method greedy_search \ /path/to/foo.wav \ /path/to/bar.wav @@ -50,7 +53,7 @@ Usage of this script: (2) modified beam search ./zipformer/pretrained.py \ --checkpoint ./zipformer/exp/pretrained.pt \ - --bpe-model ./data/lang_bpe_500/bpe.model \ + --tokens ./data/lang_bpe_500/tokens.txt \ --method modified_beam_search \ /path/to/foo.wav \ /path/to/bar.wav @@ -58,7 +61,7 @@ Usage of this script: (3) fast beam search ./zipformer/pretrained.py \ --checkpoint ./zipformer/exp/pretrained.pt \ - --bpe-model ./data/lang_bpe_500/bpe.model \ + --tokens ./data/lang_bpe_500/tokens.txt \ --method fast_beam_search \ /path/to/foo.wav \ /path/to/bar.wav @@ -71,7 +74,7 @@ Usage of this script: --causal 1 \ --chunk-size 16 \ --left-context-frames 128 \ - --bpe-model ./data/lang_bpe_500/bpe.model \ + --tokens ./data/lang_bpe_500/tokens.txt \ --method greedy_search \ /path/to/foo.wav \ /path/to/bar.wav @@ -82,7 +85,7 @@ Usage of this script: --causal 1 \ --chunk-size 16 \ --left-context-frames 128 \ - --bpe-model ./data/lang_bpe_500/bpe.model \ + --tokens ./data/lang_bpe_500/tokens.txt \ --method modified_beam_search \ /path/to/foo.wav \ /path/to/bar.wav @@ -93,7 +96,7 @@ Usage of this script: --causal 1 \ --chunk-size 16 \ --left-context-frames 128 \ - --bpe-model ./data/lang_bpe_500/bpe.model \ + --tokens ./data/lang_bpe_500/tokens.txt \ --method fast_beam_search \ /path/to/foo.wav \ /path/to/bar.wav @@ -112,7 +115,6 @@ from typing import List import k2 import kaldifeat -import sentencepiece as spm import torch import torchaudio from beam_search import ( @@ -120,8 +122,11 @@ from beam_search import ( greedy_search_batch, modified_beam_search, ) +from export import num_tokens from 
torch.nn.utils.rnn import pad_sequence -from train import add_model_arguments, get_params, get_model +from train import add_model_arguments, get_model, get_params + +from icefall.utils import make_pad_mask def get_parser(): @@ -139,9 +144,9 @@ def get_parser(): ) parser.add_argument( - "--bpe-model", + "--tokens", type=str, - help="""Path to bpe.model.""", + help="""Path to tokens.txt.""", ) parser.add_argument( @@ -258,13 +263,11 @@ def main(): params.update(vars(args)) - sp = spm.SentencePieceProcessor() - sp.load(params.bpe_model) + token_table = k2.SymbolTable.from_file(params.tokens) - # is defined in local/train_bpe_model.py - params.blank_id = sp.piece_to_id("") - params.unk_id = sp.piece_to_id("") - params.vocab_size = sp.get_piece_size() + params.blank_id = token_table[""] + params.unk_id = token_table[""] + params.vocab_size = num_tokens(token_table) + 1 logging.info(f"{params}") @@ -323,6 +326,12 @@ def main(): msg = f"Using {params.method}" logging.info(msg) + def token_ids_to_words(token_ids: List[int]) -> str: + text = "" + for i in token_ids: + text += token_table[i] + return text.replace("▁", " ").strip() + if params.method == "fast_beam_search": decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device) hyp_tokens = fast_beam_search_one_best( @@ -334,8 +343,8 @@ def main(): max_contexts=params.max_contexts, max_states=params.max_states, ) - for hyp in sp.decode(hyp_tokens): - hyps.append(hyp.split()) + for hyp in hyp_tokens: + hyps.append(token_ids_to_words(hyp)) elif params.method == "modified_beam_search": hyp_tokens = modified_beam_search( model=model, @@ -344,23 +353,22 @@ def main(): beam=params.beam_size, ) - for hyp in sp.decode(hyp_tokens): - hyps.append(hyp.split()) + for hyp in hyp_tokens: + hyps.append(token_ids_to_words(hyp)) elif params.method == "greedy_search" and params.max_sym_per_frame == 1: hyp_tokens = greedy_search_batch( model=model, encoder_out=encoder_out, encoder_out_lens=encoder_out_lens, ) - for hyp in sp.decode(hyp_tokens): - hyps.append(hyp.split()) + for hyp in hyp_tokens: + hyps.append(token_ids_to_words(hyp)) else: raise ValueError(f"Unsupported method: {params.method}") s = "\n" for filename, hyp in zip(params.sound_files, hyps): - words = " ".join(hyp) - s += f"{filename}:\n{words}\n\n" + s += f"{filename}:\n{hyp}\n\n" logging.info(s) logging.info("Decoding Done") diff --git a/egs/librispeech/ASR/zipformer/pretrained_ctc.py b/egs/librispeech/ASR/zipformer/pretrained_ctc.py index f10d95449..be239e9c3 100755 --- a/egs/librispeech/ASR/zipformer/pretrained_ctc.py +++ b/egs/librispeech/ASR/zipformer/pretrained_ctc.py @@ -24,7 +24,7 @@ You can generate the checkpoint with the following command: ./zipformer/export.py \ --exp-dir ./zipformer/exp \ --use-ctc 1 \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 @@ -34,7 +34,7 @@ You can generate the checkpoint with the following command: --exp-dir ./zipformer/exp \ --use-ctc 1 \ --causal 1 \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --epoch 30 \ --avg 9 @@ -43,7 +43,7 @@ Usage of this script: (1) ctc-decoding ./zipformer/pretrained_ctc.py \ --checkpoint ./zipformer/exp/pretrained.pt \ - --bpe-model data/lang_bpe_500/bpe.model \ + --tokens data/lang_bpe_500/tokens.txt \ --method ctc-decoding \ --sample-rate 16000 \ /path/to/foo.wav \ @@ -90,12 +90,12 @@ from typing import List import k2 import kaldifeat -import sentencepiece as spm import torch import torchaudio from ctc_decode import 
get_decoding_params +from export import num_tokens from torch.nn.utils.rnn import pad_sequence -from train import add_model_arguments, get_params, get_model +from train import add_model_arguments, get_model, get_params from icefall.decode import ( get_lattice, @@ -144,9 +144,9 @@ def get_parser(): ) parser.add_argument( - "--bpe-model", + "--tokens", type=str, - help="""Path to bpe.model. + help="""Path to tokens.txt. Used only when method is ctc-decoding. """, ) @@ -157,8 +157,8 @@ def get_parser(): default="1best", help="""Decoding method. Possible values are: - (0) ctc-decoding - Use CTC decoding. It uses a sentence - piece model, i.e., lang_dir/bpe.model, to convert + (0) ctc-decoding - Use CTC decoding. It uses a token table, + i.e., lang_dir/tokens.txt, to convert word pieces to words. It needs neither a lexicon nor an n-gram LM. (1) 1best - Use the best path as decoding output. Only @@ -273,11 +273,10 @@ def main(): params.update(get_decoding_params()) params.update(vars(args)) - sp = spm.SentencePieceProcessor() - sp.load(params.bpe_model) - - params.vocab_size = sp.get_piece_size() - params.blank_id = 0 + token_table = k2.SymbolTable.from_file(params.tokens) + params.vocab_size = num_tokens(token_table) + params.blank_id = token_table[""] + assert params.blank_id == 0 logging.info(f"{params}") @@ -358,8 +357,7 @@ def main(): lattice=lattice, use_double_scores=params.use_double_scores ) token_ids = get_texts(best_path) - hyps = sp.decode(token_ids) - hyps = [s.split() for s in hyps] + hyps = [[token_table[i] for i in ids] for ids in token_ids] elif params.method in [ "1best", "nbest-rescoring", @@ -433,6 +431,7 @@ def main(): s = "\n" for filename, hyp in zip(params.sound_files, hyps): words = " ".join(hyp) + words = words.replace("▁", " ").strip() s += f"{filename}:\n{words}\n\n" logging.info(s) diff --git a/egs/librispeech/ASR/zipformer/scaling.py b/egs/librispeech/ASR/zipformer/scaling.py index 9f23eeead..7c98ef045 100644 --- a/egs/librispeech/ASR/zipformer/scaling.py +++ b/egs/librispeech/ASR/zipformer/scaling.py @@ -25,6 +25,11 @@ import math import torch.nn as nn from torch import Tensor +def logaddexp_onnx(x: Tensor, y: Tensor) -> Tensor: + max_value = torch.max(x, y) + diff = torch.abs(x - y) + return max_value + torch.log1p(torch.exp(-diff)) + # RuntimeError: Exporting the operator logaddexp to ONNX opset version # 14 is not supported. Please feel free to request support or submit @@ -33,10 +38,22 @@ from torch import Tensor # The following function is to solve the above error when exporting # models to ONNX via torch.jit.trace() def logaddexp(x: Tensor, y: Tensor) -> Tensor: - if not torch.jit.is_tracing(): + # Caution(fangjun): Put torch.jit.is_scripting() before + # torch.onnx.is_in_onnx_export(); + # otherwise, it will cause errors for torch.jit.script(). + # + # torch.logaddexp() works for both torch.jit.script() and + # torch.jit.trace() but it causes errors for ONNX export. + # + if torch.jit.is_scripting(): + # Note: We cannot use torch.jit.is_tracing() here as it also + # matches torch.onnx.export(). 
return torch.logaddexp(x, y) + elif torch.onnx.is_in_onnx_export(): + return logaddexp_onnx(x, y) else: - return (x.exp() + y.exp()).log() + # for torch.jit.trace() + return torch.logaddexp(x, y) class PiecewiseLinear(object): """ @@ -108,7 +125,7 @@ class PiecewiseLinear(object): p: 'PiecewiseLinear', include_crossings: bool = False): """ - Returns (self_mod, p_mod) which are equivalent piecewise lienar + Returns (self_mod, p_mod) which are equivalent piecewise linear functions to self and p, but with the same x values. p: the other piecewise linear function @@ -149,7 +166,7 @@ class ScheduledFloat(torch.nn.Module): in, float(parent_module.whatever), and use it as something like a dropout prob. It is a floating point value whose value changes depending on the batch count of the - training loop. It is a piecewise linear function where you specifiy the (x,y) pairs + training loop. It is a piecewise linear function where you specify the (x,y) pairs in sorted order on x; x corresponds to the batch index. For batch-index values before the first x or after the last x, we just use the first or last y value. @@ -326,7 +343,7 @@ class MaxEigLimiterFunction(torch.autograd.Function): class BiasNormFunction(torch.autograd.Function): # This computes: # scales = (torch.mean((x - bias) ** 2, keepdim=True)) ** -0.5 * log_scale.exp() - # return (x - bias) * scales + # return x * scales # (after unsqueezing the bias), but it does it in a memory-efficient way so that # it can just store the returned value (chances are, this will also be needed for # some other reason, related to the next operation, so we can save memory). @@ -383,8 +400,8 @@ class BiasNorm(torch.nn.Module): Args: num_channels: the number of channels, e.g. 512. channel_dim: the axis/dimension corresponding to the channel, - interprted as an offset from the input's ndim if negative. - shis is NOT the num_channels; it should typically be one of + interpreted as an offset from the input's ndim if negative. + This is NOT the num_channels; it should typically be one of {-2, -1, 0, 1, 2, 3}. log_scale: the initial log-scale that we multiply the output by; this is learnable. @@ -1269,7 +1286,7 @@ class Dropout3(nn.Module): class SwooshLFunction(torch.autograd.Function): """ - swoosh(x) = log(1 + exp(x-4)) - 0.08*x - 0.035 + swoosh_l(x) = log(1 + exp(x-4)) - 0.08*x - 0.035 """ @staticmethod @@ -1334,10 +1351,17 @@ class SwooshL(torch.nn.Module): return k2.swoosh_l(x) # return SwooshLFunction.apply(x) +class SwooshLOnnx(torch.nn.Module): + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-L activation. + """ + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp_onnx(zero, x - 4.0) - 0.08 * x - 0.035 + class SwooshRFunction(torch.autograd.Function): """ - swoosh(x) = log(1 + exp(x-1)) - 0.08*x - 0.313261687 + swoosh_r(x) = log(1 + exp(x-1)) - 0.08*x - 0.313261687 derivatives are between -0.08 and 0.92. """ @@ -1400,6 +1424,13 @@ class SwooshR(torch.nn.Module): return k2.swoosh_r(x) # return SwooshRFunction.apply(x) +class SwooshROnnx(torch.nn.Module): + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-R activation. + """ + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp_onnx(zero, x - 1.) - 0.08 * x - 0.313261687 + # simple version of SwooshL that does not redefine the backprop, used in # ActivationDropoutAndLinearFunction. 
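The `logaddexp_onnx`, `SwooshLOnnx`, and `SwooshROnnx` helpers introduced in the scaling.py changes above replace operations that break ONNX export with numerically equivalent formulations. A quick way to confirm the equivalence is to compare them against `torch.logaddexp` and the closed-form Swoosh definitions quoted in the docstrings. The following is a minimal sketch (not part of the patch); it assumes an icefall environment and that it is run from `egs/librispeech/ASR/zipformer` so that the patched `scaling` module is importable:

```
#!/usr/bin/env python3
# Sanity check for the ONNX-friendly helpers added in scaling.py (a sketch,
# assuming the patched scaling.py above is on the import path).

import torch
from scaling import SwooshLOnnx, SwooshROnnx, logaddexp_onnx


def check_onnx_helpers():
    x = torch.linspace(-10.0, 10.0, steps=2001)
    y = torch.linspace(-3.0, 7.0, steps=2001)

    # logaddexp_onnx should agree with torch.logaddexp.
    assert torch.allclose(logaddexp_onnx(x, y), torch.logaddexp(x, y), atol=1e-5)

    # Closed-form references from the docstrings:
    #   swoosh_l(x) = log(1 + exp(x-4)) - 0.08*x - 0.035
    #   swoosh_r(x) = log(1 + exp(x-1)) - 0.08*x - 0.313261687
    swoosh_l_ref = torch.nn.functional.softplus(x - 4.0) - 0.08 * x - 0.035
    swoosh_r_ref = torch.nn.functional.softplus(x - 1.0) - 0.08 * x - 0.313261687
    assert torch.allclose(SwooshLOnnx()(x), swoosh_l_ref, atol=1e-5)
    assert torch.allclose(SwooshROnnx()(x), swoosh_r_ref, atol=1e-5)
    print("ONNX-friendly helpers match the reference formulas")


if __name__ == "__main__":
    check_onnx_helpers()
```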
diff --git a/egs/librispeech/ASR/zipformer/scaling_converter.py b/egs/librispeech/ASR/zipformer/scaling_converter.py index 54a5c2a6a..76622fa12 100644 --- a/egs/librispeech/ASR/zipformer/scaling_converter.py +++ b/egs/librispeech/ASR/zipformer/scaling_converter.py @@ -26,7 +26,16 @@ from typing import List, Tuple import torch import torch.nn as nn -from scaling import Balancer, Dropout3, ScaleGrad, Whiten +from scaling import ( + Balancer, + Dropout3, + ScaleGrad, + SwooshL, + SwooshLOnnx, + SwooshR, + SwooshROnnx, + Whiten, +) from zipformer import CompactRelPositionalEncoding @@ -75,6 +84,10 @@ def convert_scaled_to_non_scaled( for name, m in model.named_modules(): if isinstance(m, (Balancer, Dropout3, ScaleGrad, Whiten)): d[name] = nn.Identity() + elif is_onnx and isinstance(m, SwooshR): + d[name] = SwooshROnnx() + elif is_onnx and isinstance(m, SwooshL): + d[name] = SwooshLOnnx() elif is_onnx and isinstance(m, CompactRelPositionalEncoding): # We want to recreate the positional encoding vector when # the input changes, so we have to use torch.jit.script() diff --git a/egs/librispeech/ASR/zipformer/streaming_beam_search.py b/egs/librispeech/ASR/zipformer/streaming_beam_search.py index e6e0fb1c8..3c8565b33 100644 --- a/egs/librispeech/ASR/zipformer/streaming_beam_search.py +++ b/egs/librispeech/ASR/zipformer/streaming_beam_search.py @@ -31,6 +31,7 @@ def greedy_search( model: nn.Module, encoder_out: torch.Tensor, streams: List[DecodeStream], + blank_penalty: float = 0.0, ) -> None: """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1. @@ -71,6 +72,9 @@ def greedy_search( # logits'shape (batch_size, vocab_size) logits = logits.squeeze(1).squeeze(1) + if blank_penalty != 0.0: + logits[:, 0] -= blank_penalty + assert logits.ndim == 2, logits.shape y = logits.argmax(dim=1).tolist() emitted = False @@ -97,6 +101,7 @@ def modified_beam_search( encoder_out: torch.Tensor, streams: List[DecodeStream], num_active_paths: int = 4, + blank_penalty: float = 0.0, ) -> None: """Beam search in batch mode with --max-sym-per-frame=1 being hardcoded. @@ -158,6 +163,9 @@ def modified_beam_search( logits = logits.squeeze(1).squeeze(1) + if blank_penalty != 0.0: + logits[:, 0] -= blank_penalty + log_probs = logits.log_softmax(dim=-1) # (num_hyps, vocab_size) log_probs.add_(ys_log_probs) @@ -205,6 +213,7 @@ def fast_beam_search_one_best( beam: float, max_states: int, max_contexts: int, + blank_penalty: float = 0.0, ) -> None: """It limits the maximum number of symbols per frame to 1. 
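The `blank_penalty` argument threaded through `greedy_search`, `modified_beam_search`, and `fast_beam_search_one_best` is applied in the same way in each search function: a constant is subtracted from the blank logit (index 0) before the argmax / log-softmax, which biases streaming decoding towards emitting non-blank tokens. A tiny self-contained illustration of the effect (a sketch with made-up logit values, not part of the patch):

```
import torch

# Logits for 2 frames over a 5-token vocabulary; index 0 is the blank symbol.
logits = torch.tensor(
    [
        [2.0, 1.9, 0.1, -1.0, -2.0],  # blank wins by a narrow margin
        [5.0, 0.0, 0.0, 0.0, 0.0],    # blank wins by a large margin
    ]
)

blank_penalty = 0.5
penalized = logits.clone()
penalized[:, 0] -= blank_penalty  # same operation as in the search functions

print(logits.argmax(dim=1).tolist())     # [0, 0]: both frames stay blank
print(penalized.argmax(dim=1).tolist())  # [1, 0]: the first frame now emits token 1
```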
@@ -269,6 +278,10 @@ def fast_beam_search_one_best( project_input=False, ) logits = logits.squeeze(1).squeeze(1) + + if blank_penalty != 0.0: + logits[:, 0] -= blank_penalty + log_probs = logits.log_softmax(dim=-1) decoding_streams.advance(log_probs) diff --git a/egs/librispeech/ASR/zipformer/subsampling.py b/egs/librispeech/ASR/zipformer/subsampling.py index d6bf57db4..6532ddccb 100644 --- a/egs/librispeech/ASR/zipformer/subsampling.py +++ b/egs/librispeech/ASR/zipformer/subsampling.py @@ -138,9 +138,11 @@ class ConvNeXt(nn.Module): x = bypass + x x = self.out_balancer(x) - x = x.transpose(1, 3) # (N, W, H, C); need channel dim to be last - x = self.out_whiten(x) - x = x.transpose(1, 3) # (N, C, H, W) + + if x.requires_grad: + x = x.transpose(1, 3) # (N, W, H, C); need channel dim to be last + x = self.out_whiten(x) + x = x.transpose(1, 3) # (N, C, H, W) return x @@ -266,6 +268,7 @@ class Conv2dSubsampling(nn.Module): # just one convnext layer self.convnext = ConvNeXt(layer3_channels, kernel_size=(7, 7)) + # (in_channels-3)//4 self.out_width = (((in_channels - 1) // 2) - 1) // 2 self.layer3_channels = layer3_channels @@ -299,7 +302,7 @@ class Conv2dSubsampling(nn.Module): A tensor of shape (batch_size,) containing the number of frames in Returns: - - a tensor of shape (N, ((T-1)//2 - 1)//2, odim) + - a tensor of shape (N, (T-7)//2, odim) - output lengths, of shape (batch_size,) """ # On entry, x is (N, T, idim) @@ -310,14 +313,14 @@ class Conv2dSubsampling(nn.Module): x = self.conv(x) x = self.convnext(x) - # Now x is of shape (N, odim, ((T-3)//2 - 1)//2, ((idim-1)//2 - 1)//2) + # Now x is of shape (N, odim, (T-7)//2, (idim-3)//4) b, c, t, f = x.size() x = x.transpose(1, 2).reshape(b, t, c * f) - # now x: (N, ((T-1)//2 - 1))//2, out_width * layer3_channels)) + # now x: (N, (T-7)//2, out_width * layer3_channels)) x = self.out(x) - # Now x is of shape (N, ((T-1)//2 - 1))//2, odim) + # Now x is of shape (N, (T-7)//2, odim) x = self.out_whiten(x) x = self.out_norm(x) x = self.dropout(x) @@ -328,7 +331,7 @@ class Conv2dSubsampling(nn.Module): with warnings.catch_warnings(): warnings.simplefilter("ignore") x_lens = (x_lens - 7) // 2 - assert x.size(1) == x_lens.max().item() + assert x.size(1) == x_lens.max().item() , (x.size(1), x_lens.max()) return x, x_lens @@ -347,7 +350,7 @@ class Conv2dSubsampling(nn.Module): A tensor of shape (batch_size,) containing the number of frames in Returns: - - a tensor of shape (N, ((T-1)//2 - 1)//2, odim) + - a tensor of shape (N, (T-7)//2, odim) - output lengths, of shape (batch_size,) - updated cache """ @@ -383,7 +386,7 @@ class Conv2dSubsampling(nn.Module): assert self.convnext.padding[0] == 3 x_lens = (x_lens - 7) // 2 - 3 - assert x.size(1) == x_lens.max().item() + assert x.size(1) == x_lens.max().item(), (x.shape, x_lens.max()) return x, x_lens, cached_left_pad diff --git a/egs/librispeech/ASR/zipformer/test_scaling.py b/egs/librispeech/ASR/zipformer/test_scaling.py new file mode 100755 index 000000000..5c04291e7 --- /dev/null +++ b/egs/librispeech/ASR/zipformer/test_scaling.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +import matplotlib.pyplot as plt +import torch +from scaling import PiecewiseLinear, ScheduledFloat, SwooshL, SwooshR + + +def test_piecewise_linear(): + # An identity map in the range [0, 1]. 
+ # 1 - identity map in the range [1, 2] + # x1=0, y1=0 + # x2=1, y2=1 + # x3=2, y3=0 + pl = PiecewiseLinear((0, 0), (1, 1), (2, 0)) + assert pl(0.25) == 0.25, pl(0.25) + assert pl(0.625) == 0.625, pl(0.625) + assert pl(1.25) == 0.75, pl(1.25) + + assert pl(-10) == pl(0), pl(-10) # out of range + assert pl(10) == pl(2), pl(10) # out of range + + # multiplication + pl10 = pl * 10 + assert pl10(1) == 10 * pl(1) + assert pl10(0.5) == 10 * pl(0.5) + + +def test_scheduled_float(): + # Initial value is 0.2 and it decreases linearly towards 0 at 4000 + dropout = ScheduledFloat((0, 0.2), (4000, 0.0), default=0.0) + dropout.batch_count = 0 + assert float(dropout) == 0.2, (float(dropout), dropout.batch_count) + + dropout.batch_count = 1000 + assert abs(float(dropout) - 0.15) < 1e-5, (float(dropout), dropout.batch_count) + + dropout.batch_count = 2000 + assert float(dropout) == 0.1, (float(dropout), dropout.batch_count) + + dropout.batch_count = 3000 + assert abs(float(dropout) - 0.05) < 1e-5, (float(dropout), dropout.batch_count) + + dropout.batch_count = 4000 + assert float(dropout) == 0.0, (float(dropout), dropout.batch_count) + + dropout.batch_count = 5000 # out of range + assert float(dropout) == 0.0, (float(dropout), dropout.batch_count) + + +def test_swoosh(): + x1 = torch.linspace(start=-10, end=0, steps=100, dtype=torch.float32) + x2 = torch.linspace(start=0, end=10, steps=100, dtype=torch.float32) + x = torch.cat([x1, x2[1:]]) + + left = SwooshL()(x) + r = SwooshR()(x) + + relu = torch.nn.functional.relu(x) + print(left[x == 0], r[x == 0]) + plt.plot(x, left, "k") + plt.plot(x, r, "r") + plt.plot(x, relu, "b") + plt.axis([-10, 10, -1, 10]) # [xmin, xmax, ymin, ymax] + plt.legend( + [ + "SwooshL(x) = log(1 + exp(x-4)) - 0.08x - 0.035 ", + "SwooshR(x) = log(1 + exp(x-1)) - 0.08x - 0.313261687", + "ReLU(x) = max(0, x)", + ] + ) + plt.grid() + plt.savefig("swoosh.pdf") + + +def main(): + test_piecewise_linear() + test_scheduled_float() + test_swoosh() + + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/ASR/zipformer/test_subsampling.py b/egs/librispeech/ASR/zipformer/test_subsampling.py new file mode 100755 index 000000000..078227fb6 --- /dev/null +++ b/egs/librispeech/ASR/zipformer/test_subsampling.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 + +import torch +from scaling import ScheduledFloat +from subsampling import Conv2dSubsampling + + +def test_conv2d_subsampling(): + layer1_channels = 8 + layer2_channels = 32 + layer3_channels = 128 + + out_channels = 192 + encoder_embed = Conv2dSubsampling( + in_channels=80, + out_channels=out_channels, + layer1_channels=layer1_channels, + layer2_channels=layer2_channels, + layer3_channels=layer3_channels, + dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)), + ) + N = 2 + T = 200 + num_features = 80 + x = torch.rand(N, T, num_features) + x_copy = x.clone() + + x = x.unsqueeze(1) # (N, 1, T, num_features) + + x = encoder_embed.conv[0](x) # conv2d, in 1, out 8, kernel 3, padding (0,1) + assert x.shape == (N, layer1_channels, T - 2, num_features) + # (2, 8, 198, 80) + + x = encoder_embed.conv[1](x) # scale grad + x = encoder_embed.conv[2](x) # balancer + x = encoder_embed.conv[3](x) # swooshR + + x = encoder_embed.conv[4](x) # conv2d, in 8, out 32, kernel 3, stride 2 + assert x.shape == ( + N, + layer2_channels, + ((T - 2) - 3) // 2 + 1, + (num_features - 3) // 2 + 1, + ) + # (2, 32, 98, 39) + + x = encoder_embed.conv[5](x) # balancer + x = encoder_embed.conv[6](x) # swooshR + + # conv2d: + # in 32, out 128, kernel 3, stride (1, 2) + x = 
encoder_embed.conv[7](x) + assert x.shape == ( + N, + layer3_channels, + (((T - 2) - 3) // 2 + 1) - 2, + (((num_features - 3) // 2 + 1) - 3) // 2 + 1, + ) + # (2, 128, 96, 19) + + x = encoder_embed.conv[8](x) # balancer + x = encoder_embed.conv[9](x) # swooshR + + # (((T - 2) - 3) // 2 + 1) - 2 + # = (T - 2) - 3) // 2 + 1 - 2 + # = ((T - 2) - 3) // 2 - 1 + # = (T - 2 - 3) // 2 - 1 + # = (T - 5) // 2 - 1 + # = (T - 7) // 2 + assert x.shape[2] == (x_copy.shape[1] - 7) // 2 + + # (((num_features - 3) // 2 + 1) - 3) // 2 + 1, + # = ((num_features - 3) // 2 + 1 - 3) // 2 + 1, + # = ((num_features - 3) // 2 - 2) // 2 + 1, + # = (num_features - 3 - 4) // 2 // 2 + 1, + # = (num_features - 7) // 2 // 2 + 1, + # = (num_features - 7) // 4 + 1, + # = (num_features - 3) // 4 + assert x.shape[3] == (x_copy.shape[2] - 3) // 4 + + assert x.shape == (N, layer3_channels, (T - 7) // 2, (num_features - 3) // 4) + + # Input shape to convnext is + # + # (N, layer3_channels, (T-7)//2, (num_features - 3)//4) + + # conv2d: in layer3_channels, out layer3_channels, groups layer3_channels + # kernel_size 7, padding 3 + x = encoder_embed.convnext.depthwise_conv(x) + assert x.shape == (N, layer3_channels, (T - 7) // 2, (num_features - 3) // 4) + + # conv2d: in layer3_channels, out hidden_ratio * layer3_channels, kernel_size 1 + x = encoder_embed.convnext.pointwise_conv1(x) + assert x.shape == (N, layer3_channels * 3, (T - 7) // 2, (num_features - 3) // 4) + + x = encoder_embed.convnext.hidden_balancer(x) # balancer + x = encoder_embed.convnext.activation(x) # swooshL + + # conv2d: in hidden_ratio * layer3_channels, out layer3_channels, kernel 1 + x = encoder_embed.convnext.pointwise_conv2(x) + assert x.shape == (N, layer3_channels, (T - 7) // 2, (num_features - 3) // 4) + + # bypass and layer drop, omitted here. 
+ x = encoder_embed.convnext.out_balancer(x) + + # Note: the input and output shape of ConvNeXt are the same + + x = x.transpose(1, 2).reshape(N, (T - 7) // 2, -1) + assert x.shape == (N, (T - 7) // 2, layer3_channels * ((num_features - 3) // 4)) + + x = encoder_embed.out(x) + assert x.shape == (N, (T - 7) // 2, out_channels) + + x = encoder_embed.out_whiten(x) + x = encoder_embed.out_norm(x) + # final layer is dropout + + # test streaming forward + + subsampling_factor = 2 + cached_left_padding = encoder_embed.get_init_states(batch_size=N) + depthwise_conv_kernel_size = 7 + pad_size = (depthwise_conv_kernel_size - 1) // 2 + + assert cached_left_padding.shape == ( + N, + layer3_channels, + pad_size, + (num_features - 3) // 4, + ) + + chunk_size = 16 + right_padding = pad_size * subsampling_factor + T = chunk_size * subsampling_factor + 7 + right_padding + x = torch.rand(N, T, num_features) + x_lens = torch.tensor([T] * N) + y, y_lens, next_cached_left_padding = encoder_embed.streaming_forward( + x, x_lens, cached_left_padding + ) + + assert y.shape == (N, chunk_size, out_channels), y.shape + assert next_cached_left_padding.shape == cached_left_padding.shape + + assert y.shape[1] == y_lens[0] == y_lens[1] + + +def main(): + test_conv2d_subsampling() + + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/ASR/zipformer/zipformer.py b/egs/librispeech/ASR/zipformer/zipformer.py index 7d98dbeb1..b39af02b8 100644 --- a/egs/librispeech/ASR/zipformer/zipformer.py +++ b/egs/librispeech/ASR/zipformer/zipformer.py @@ -219,7 +219,7 @@ class Zipformer2(EncoderInterface): (num_frames0, batch_size, _encoder_dims0) = x.shape - assert self.encoder_dim[0] == _encoder_dims0 + assert self.encoder_dim[0] == _encoder_dims0, (self.encoder_dim[0], _encoder_dims0) feature_mask_dropout_prob = 0.125 @@ -334,7 +334,7 @@ class Zipformer2(EncoderInterface): x = self._get_full_dim_output(outputs) x = self.downsample_output(x) # class Downsample has this rounding behavior.. - assert self.output_downsampling_factor == 2 + assert self.output_downsampling_factor == 2, self.output_downsampling_factor if torch.jit.is_scripting() or torch.jit.is_tracing(): lengths = (x_lens + 1) // 2 else: diff --git a/egs/tedlium3/ASR/RESULTS.md b/egs/tedlium3/ASR/RESULTS.md index 38eaa8f44..bd8a5b43f 100644 --- a/egs/tedlium3/ASR/RESULTS.md +++ b/egs/tedlium3/ASR/RESULTS.md @@ -1,5 +1,111 @@ ## Results +### TedLium3 BPE training results (Zipformer) + +#### 2023-06-15 (Regular transducer) + +Using the codes from this PR https://github.com/k2-fsa/icefall/pull/1125. 
+ +Number of model parameters: 65549011, i.e., 65.5 M + +The WERs are + +| | dev | test | comment | +|------------------------------------|------------|------------|------------------------------------------| +| greedy search | 6.74 | 6.16 | --epoch 50, --avg 22, --max-duration 500 | +| beam search (beam size 4) | 6.56 | 5.95 | --epoch 50, --avg 22, --max-duration 500 | +| modified beam search (beam size 4) | 6.54 | 6.00 | --epoch 50, --avg 22, --max-duration 500 | +| fast beam search (set as default) | 6.91 | 6.28 | --epoch 50, --avg 22, --max-duration 500 | + +The training command for reproducing is given below: + +``` +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +./zipformer/train.py \ + --use-fp16 true \ + --world-size 4 \ + --num-epochs 50 \ + --start-epoch 0 \ + --exp-dir zipformer/exp \ + --max-duration 1000 +``` + +The tensorboard training log can be found at +https://tensorboard.dev/experiment/AKXbJha0S9aXyfmuvG4h5A/#scalars + +The decoding command is: +``` +epoch=50 +avg=22 + +## greedy search +./zipformer/decode.py \ + --epoch $epoch \ + --avg $avg \ + --exp-dir zipformer/exp \ + --bpe-model ./data/lang_bpe_500/bpe.model \ + --max-duration 500 + +## beam search +./zipformer/decode.py \ + --epoch $epoch \ + --avg $avg \ + --exp-dir zipformer/exp \ + --bpe-model ./data/lang_bpe_500/bpe.model \ + --max-duration 500 \ + --decoding-method beam_search \ + --beam-size 4 + +## modified beam search +./zipformer/decode.py \ + --epoch $epoch \ + --avg $avg \ + --exp-dir zipformer/exp \ + --bpe-model ./data/lang_bpe_500/bpe.model \ + --max-duration 500 \ + --decoding-method modified_beam_search \ + --beam-size 4 + +## fast beam search +./zipformer/decode.py \ + --epoch $epoch \ + --avg $avg \ + --exp-dir ./zipformer/exp \ + --bpe-model ./data/lang_bpe_500/bpe.model \ + --max-duration 1500 \ + --decoding-method fast_beam_search \ + --beam 4 \ + --max-contexts 4 \ + --max-states 8 +``` + +A pre-trained model and decoding logs can be found at + +#### 2023-06-26 (Modified transducer) + +``` +./zipformer/train.py \ + --use-fp16 true \ + --world-size 4 \ + --num-epochs 50 \ + --start-epoch 0 \ + --exp-dir zipformer/exp \ + --max-duration 1000 \ + --rnnt-type modified +``` + +The tensorboard training log can be found at +https://tensorboard.dev/experiment/3d4bYmbJTGiWQQaW88CVEQ/#scalars + +| | dev | test | comment | +|------------------------------------|------------|------------|------------------------------------------| +| greedy search | 6.32 | 5.83 | --epoch 50, --avg 22, --max-duration 500 | +| modified beam search (beam size 4) | 6.16 | 5.79 | --epoch 50, --avg 22, --max-duration 500 | +| fast beam search (set as default) | 6.30 | 5.89 | --epoch 50, --avg 22, --max-duration 500 | + +A pre-trained model and decoding logs can be found at . 
+ ### TedLium3 BPE training results (Conformer-CTC 2) #### [conformer_ctc2](./conformer_ctc2) diff --git a/egs/tedlium3/ASR/zipformer/__init__.py b/egs/tedlium3/ASR/zipformer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/egs/tedlium3/ASR/zipformer/asr_datamodule.py b/egs/tedlium3/ASR/zipformer/asr_datamodule.py new file mode 120000 index 000000000..49b2ee483 --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/asr_datamodule.py @@ -0,0 +1 @@ +../transducer_stateless/asr_datamodule.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/beam_search.py b/egs/tedlium3/ASR/zipformer/beam_search.py new file mode 120000 index 000000000..e24eca39f --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/beam_search.py @@ -0,0 +1 @@ +../../../librispeech/ASR/pruned_transducer_stateless2/beam_search.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/decode.py b/egs/tedlium3/ASR/zipformer/decode.py new file mode 100755 index 000000000..ea1cbba1b --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/decode.py @@ -0,0 +1,833 @@ +#!/usr/bin/env python3 +# +# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang, +# Zengwei Yao) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Usage: +(1) greedy search +./zipformer/decode.py \ + --epoch 30 \ + --avg 9 \ + --exp-dir ./zipformer/exp \ + --max-duration 600 \ + --decoding-method greedy_search + +(2) beam search (not recommended) +./zipformer/decode.py \ + --epoch 30 \ + --avg 9 \ + --exp-dir ./zipformer/exp \ + --max-duration 600 \ + --decoding-method beam_search \ + --beam-size 4 + +(3) modified beam search +./zipformer/decode.py \ + --epoch 30 \ + --avg 9 \ + --exp-dir ./zipformer/exp \ + --max-duration 600 \ + --decoding-method modified_beam_search \ + --beam-size 4 + +(4) fast beam search (one best) +./zipformer/decode.py \ + --epoch 30 \ + --avg 9 \ + --exp-dir ./zipformer/exp \ + --max-duration 600 \ + --decoding-method fast_beam_search \ + --beam 20.0 \ + --max-contexts 8 \ + --max-states 64 + +(5) fast beam search (nbest) +./zipformer/decode.py \ + --epoch 30 \ + --avg 9 \ + --exp-dir ./zipformer/exp \ + --max-duration 600 \ + --decoding-method fast_beam_search_nbest \ + --beam 20.0 \ + --max-contexts 8 \ + --max-states 64 \ + --num-paths 200 \ + --nbest-scale 0.5 + +(6) fast beam search (nbest oracle WER) +./zipformer/decode.py \ + --epoch 30 \ + --avg 9 \ + --exp-dir ./zipformer/exp \ + --max-duration 600 \ + --decoding-method fast_beam_search_nbest_oracle \ + --beam 20.0 \ + --max-contexts 8 \ + --max-states 64 \ + --num-paths 200 \ + --nbest-scale 0.5 + +(7) fast beam search (with LG) +./zipformer/decode.py \ + --epoch 30 \ + --avg 9 \ + --exp-dir ./zipformer/exp \ + --max-duration 600 \ + --decoding-method fast_beam_search_nbest_LG \ + --beam 20.0 \ + --max-contexts 8 \ + --max-states 64 +""" + + +import argparse +import logging +import math +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import k2 +import sentencepiece as spm +import torch +import torch.nn as nn +from asr_datamodule import TedLiumAsrDataModule +from beam_search import ( + beam_search, + fast_beam_search_nbest, + fast_beam_search_nbest_LG, + fast_beam_search_nbest_oracle, + fast_beam_search_one_best, + greedy_search, + greedy_search_batch, + modified_beam_search, +) +from train import add_model_arguments, get_params, get_transducer_model + +from icefall.checkpoint import ( + average_checkpoints, + average_checkpoints_with_averaged_model, + find_checkpoints, + load_checkpoint, +) +from icefall.lexicon import Lexicon +from icefall.utils import ( + AttributeDict, + make_pad_mask, + setup_logger, + store_transcripts, + str2bool, + write_error_stats, +) + +LOG_EPS = math.log(1e-10) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--epoch", + type=int, + default=30, + help="""It specifies the checkpoint to use for decoding. + Note: Epoch counts from 1. + You can specify --avg to use more checkpoints for model averaging.""", + ) + + parser.add_argument( + "--iter", + type=int, + default=0, + help="""If positive, --epoch is ignored and it + will use the checkpoint exp_dir/checkpoint-iter.pt. + You can specify --avg to use more checkpoints for model averaging. + """, + ) + + parser.add_argument( + "--avg", + type=int, + default=15, + help="Number of checkpoints to average. Automatically select " + "consecutive checkpoints before the checkpoint specified by " + "'--epoch' and '--iter'", + ) + + parser.add_argument( + "--use-averaged-model", + type=str2bool, + default=True, + help="Whether to load averaged model. Currently it only supports " + "using --epoch. 
If True, it would decode with the averaged model " + "over the epoch range from `epoch-avg` (excluded) to `epoch`." + "Actually only the models with epoch number of `epoch-avg` and " + "`epoch` are loaded for averaging. ", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="zipformer/exp", + help="The experiment dir", + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--lang-dir", + type=Path, + default="data/lang_bpe_500", + help="The lang dir containing word table and LG graph", + ) + + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="""Possible values are: + - greedy_search + - beam_search + - modified_beam_search + - fast_beam_search + - fast_beam_search_nbest + - fast_beam_search_nbest_oracle + - fast_beam_search_nbest_LG + If you use fast_beam_search_nbest_LG, you have to specify + `--lang-dir`, which should contain `LG.pt`. + """, + ) + + parser.add_argument( + "--beam-size", + type=int, + default=4, + help="""An integer indicating how many candidates we will keep for each + frame. Used only when --decoding-method is beam_search or + modified_beam_search.""", + ) + + parser.add_argument( + "--beam", + type=float, + default=20.0, + help="""A floating point value to calculate the cutoff score during beam + search (i.e., `cutoff = max-score - beam`), which is the same as the + `beam` in Kaldi. + Used only when --decoding-method is fast_beam_search, + fast_beam_search_nbest, fast_beam_search_nbest_LG, + and fast_beam_search_nbest_oracle + """, + ) + + parser.add_argument( + "--ngram-lm-scale", + type=float, + default=0.01, + help=""" + Used only when --decoding_method is fast_beam_search_nbest_LG. + It specifies the scale for n-gram LM scores. + """, + ) + + parser.add_argument( + "--max-contexts", + type=int, + default=8, + help="""Used only when --decoding-method is + fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG, + and fast_beam_search_nbest_oracle""", + ) + + parser.add_argument( + "--max-states", + type=int, + default=64, + help="""Used only when --decoding-method is + fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG, + and fast_beam_search_nbest_oracle""", + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; " "2 means tri-gram", + ) + parser.add_argument( + "--max-sym-per-frame", + type=int, + default=1, + help="""Maximum number of symbols per frame. + Used only when --decoding_method is greedy_search""", + ) + + parser.add_argument( + "--num-paths", + type=int, + default=200, + help="""Number of paths for nbest decoding. + Used only when the decoding method is fast_beam_search_nbest, + fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""", + ) + + parser.add_argument( + "--nbest-scale", + type=float, + default=0.5, + help="""Scale applied to lattice scores when computing nbest paths. + Used only when the decoding method is fast_beam_search_nbest, + fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""", + ) + + add_model_arguments(parser) + + return parser + + +def decode_one_batch( + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, + batch: dict, + word_table: Optional[k2.SymbolTable] = None, + decoding_graph: Optional[k2.Fsa] = None, +) -> Dict[str, List[List[str]]]: + """Decode one batch and return the result in a dict. 
The dict has the + following format: + + - key: It indicates the setting used for decoding. For example, + if greedy_search is used, it would be "greedy_search" + If beam search with a beam size of 7 is used, it would be + "beam_7" + - value: It contains the decoding result. `len(value)` equals to + batch size. `value[i]` is the decoding result for the i-th + utterance in the given batch. + Args: + params: + It's the return value of :func:`get_params`. + model: + The neural model. + sp: + The BPE model. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + word_table: + The word symbol table. + decoding_graph: + The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used + only when --decoding_method is fast_beam_search, fast_beam_search_nbest, + fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG. + Returns: + Return the decoding result. See above description for the format of + the returned dict. + """ + device = next(model.parameters()).device + feature = batch["inputs"] + assert feature.ndim == 3 + + feature = feature.to(device) + # at entry, feature is (N, T, C) + + supervisions = batch["supervisions"] + feature_lens = supervisions["num_frames"].to(device) + + if params.causal: + # this seems to cause insertions at the end of the utterance if used with zipformer. + pad_len = 30 + feature_lens += pad_len + feature = torch.nn.functional.pad( + feature, + pad=(0, 0, 0, pad_len), + value=LOG_EPS, + ) + + x, x_lens = model.encoder_embed(feature, feature_lens) + + src_key_padding_mask = make_pad_mask(x_lens) + x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) + + encoder_out, encoder_out_lens = model.encoder(x, x_lens, src_key_padding_mask) + encoder_out = encoder_out.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + + hyps = [] + unk = sp.decode(sp.unk_id()).strip() + + if params.decoding_method == "fast_beam_search": + hyp_tokens = fast_beam_search_one_best( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + allow_partial=True, + ) + for hyp in sp.decode(hyp_tokens): + hyp = [w for w in hyp.split() if w != unk] + hyps.append(hyp) + elif params.decoding_method == "fast_beam_search_nbest_LG": + hyp_tokens = fast_beam_search_nbest_LG( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + num_paths=params.num_paths, + nbest_scale=params.nbest_scale, + allow_partial=True, + ) + for hyp in hyp_tokens: + hyp = [word_table[i] for i in hyp if word_table[i] != unk] + hyps.append(hyp) + elif params.decoding_method == "fast_beam_search_nbest": + hyp_tokens = fast_beam_search_nbest( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + num_paths=params.num_paths, + nbest_scale=params.nbest_scale, + allow_partial=True, + ) + for hyp in sp.decode(hyp_tokens): + hyp = [w for w in hyp.split() if w != unk] + hyps.append(hyp) + elif params.decoding_method == "fast_beam_search_nbest_oracle": + hyp_tokens = fast_beam_search_nbest_oracle( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + 
beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + num_paths=params.num_paths, + ref_texts=sp.encode(supervisions["text"]), + nbest_scale=params.nbest_scale, + allow_partial=True, + ) + for hyp in sp.decode(hyp_tokens): + hyp = [w for w in hyp.split() if w != unk] + hyps.append(hyp) + elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1: + hyp_tokens = greedy_search_batch( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + ) + for hyp in sp.decode(hyp_tokens): + hyp = [w for w in hyp.split() if w != unk] + hyps.append(hyp) + elif params.decoding_method == "modified_beam_search": + hyp_tokens = modified_beam_search( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam_size, + ) + for hyp in sp.decode(hyp_tokens): + hyp = [w for w in hyp.split() if w != unk] + hyps.append(hyp) + else: + batch_size = encoder_out.size(0) + + for i in range(batch_size): + # fmt: off + encoder_out_i = encoder_out[i:i+1, :encoder_out_lens[i]] + # fmt: on + if params.decoding_method == "greedy_search": + hyp = greedy_search( + model=model, + encoder_out=encoder_out_i, + max_sym_per_frame=params.max_sym_per_frame, + ) + elif params.decoding_method == "beam_search": + hyp = beam_search( + model=model, + encoder_out=encoder_out_i, + beam=params.beam_size, + ) + else: + raise ValueError( + f"Unsupported decoding method: {params.decoding_method}" + ) + hyp = [w for w in sp.decode(hyp).split() if w != unk] + hyps.append(hyp) + + if params.decoding_method == "greedy_search": + return {"greedy_search": hyps} + elif "fast_beam_search" in params.decoding_method: + key = f"beam_{params.beam}_" + key += f"max_contexts_{params.max_contexts}_" + key += f"max_states_{params.max_states}" + if "nbest" in params.decoding_method: + key += f"_num_paths_{params.num_paths}_" + key += f"nbest_scale_{params.nbest_scale}" + if "LG" in params.decoding_method: + key += f"_ngram_lm_scale_{params.ngram_lm_scale}" + + return {key: hyps} + else: + return {f"beam_size_{params.beam_size}": hyps} + + +def decode_dataset( + dl: torch.utils.data.DataLoader, + params: AttributeDict, + model: nn.Module, + sp: spm.SentencePieceProcessor, + word_table: Optional[k2.SymbolTable] = None, + decoding_graph: Optional[k2.Fsa] = None, +) -> Dict[str, List[Tuple[str, List[str], List[str]]]]: + """Decode dataset. + + Args: + dl: + PyTorch's dataloader containing the dataset to decode. + params: + It is returned by :func:`get_params`. + model: + The neural model. + sp: + The BPE model. + word_table: + The word symbol table. + decoding_graph: + The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used + only when --decoding_method is fast_beam_search, fast_beam_search_nbest, + fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG. + Returns: + Return a dict, whose key may be "greedy_search" if greedy search + is used, or it may be "beam_7" if beam size of 7 is used. + Its value is a list of tuples. Each tuple contains two elements: + The first is the reference transcript, and the second is the + predicted result. + """ + num_cuts = 0 + + try: + num_batches = len(dl) + except TypeError: + num_batches = "?" 
+ + if params.decoding_method == "greedy_search": + log_interval = 50 + else: + log_interval = 20 + + results = defaultdict(list) + for batch_idx, batch in enumerate(dl): + texts = batch["supervisions"]["text"] + cut_ids = [cut.id for cut in batch["supervisions"]["cut"]] + + hyps_dict = decode_one_batch( + params=params, + model=model, + sp=sp, + decoding_graph=decoding_graph, + word_table=word_table, + batch=batch, + ) + + for name, hyps in hyps_dict.items(): + this_batch = [] + assert len(hyps) == len(texts) + for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts): + ref_words = ref_text.split() + this_batch.append((cut_id, ref_words, hyp_words)) + + results[name].extend(this_batch) + + num_cuts += len(texts) + + if batch_idx % log_interval == 0: + batch_str = f"{batch_idx}/{num_batches}" + + logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}") + return results + + +def save_results( + params: AttributeDict, + test_set_name: str, + results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]], +): + test_set_wers = dict() + for key, results in results_dict.items(): + recog_path = ( + params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt" + ) + results = sorted(results) + store_transcripts(filename=recog_path, texts=results) + logging.info(f"The transcripts are stored in {recog_path}") + + # The following prints out WERs, per-word error statistics and aligned + # ref/hyp pairs. + errs_filename = ( + params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_filename, "w") as f: + wer = write_error_stats( + f, f"{test_set_name}-{key}", results, enable_log=True + ) + test_set_wers[key] = wer + + logging.info("Wrote detailed error stats to {}".format(errs_filename)) + + test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1]) + errs_info = ( + params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_info, "w") as f: + print("settings\tWER", file=f) + for key, val in test_set_wers: + print("{}\t{}".format(key, val), file=f) + + s = "\nFor {}, WER of different settings are:\n".format(test_set_name) + note = "\tbest for {}".format(test_set_name) + for key, val in test_set_wers: + s += "{}\t{}{}\n".format(key, val, note) + note = "" + logging.info(s) + + +@torch.no_grad() +def main(): + parser = get_parser() + TedLiumAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + params = get_params() + params.update(vars(args)) + + assert params.decoding_method in ( + "greedy_search", + "beam_search", + "fast_beam_search", + "fast_beam_search_nbest", + "fast_beam_search_nbest_LG", + "fast_beam_search_nbest_oracle", + "modified_beam_search", + ) + params.res_dir = params.exp_dir / params.decoding_method + + if params.iter > 0: + params.suffix = f"iter-{params.iter}-avg-{params.avg}" + else: + params.suffix = f"epoch-{params.epoch}-avg-{params.avg}" + + if params.causal: + assert ( + "," not in params.chunk_size + ), "chunk_size should be one value in decoding." + assert ( + "," not in params.left_context_frames + ), "left_context_frames should be one value in decoding." 
+ params.suffix += f"-chunk-{params.chunk_size}" + params.suffix += f"-left-context-{params.left_context_frames}" + + if "fast_beam_search" in params.decoding_method: + params.suffix += f"-beam-{params.beam}" + params.suffix += f"-max-contexts-{params.max_contexts}" + params.suffix += f"-max-states-{params.max_states}" + if "nbest" in params.decoding_method: + params.suffix += f"-nbest-scale-{params.nbest_scale}" + params.suffix += f"-num-paths-{params.num_paths}" + if "LG" in params.decoding_method: + params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}" + elif "beam_search" in params.decoding_method: + params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}" + else: + params.suffix += f"-context-{params.context_size}" + params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}" + + if params.use_averaged_model: + params.suffix += "-use-averaged-model" + + setup_logger(f"{params.res_dir}/log-decode-{params.suffix}") + logging.info("Decoding started") + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + + logging.info(f"Device: {device}") + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # and are defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("") + params.unk_id = sp.piece_to_id("") + params.vocab_size = sp.get_piece_size() + + logging.info(params) + + logging.info("About to create model") + model = get_transducer_model(params) + + if not params.use_averaged_model: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + elif params.avg == 1: + load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model) + else: + start = params.epoch - params.avg + 1 + filenames = [] + for i in range(start, params.epoch + 1): + if i >= 1: + filenames.append(f"{params.exp_dir}/epoch-{i}.pt") + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + else: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + 1 + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg + 1: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + filename_start = filenames[-1] + filename_end = filenames[0] + logging.info( + "Calculating the averaged model over iteration checkpoints" + f" from {filename_start} (excluded) to {filename_end}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + else: + assert params.avg > 0, params.avg + start = params.epoch - params.avg + assert start >= 1, start + filename_start = f"{params.exp_dir}/epoch-{start}.pt" + filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt" + logging.info( + f"Calculating the averaged model over epoch range from " + f"{start} 
(excluded) to {params.epoch}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + + model.to(device) + model.eval() + + if "fast_beam_search" in params.decoding_method: + if params.decoding_method == "fast_beam_search_nbest_LG": + lexicon = Lexicon(params.lang_dir) + word_table = lexicon.word_table + lg_filename = params.lang_dir / "LG.pt" + logging.info(f"Loading {lg_filename}") + decoding_graph = k2.Fsa.from_dict( + torch.load(lg_filename, map_location=device) + ) + decoding_graph.scores *= params.ngram_lm_scale + else: + word_table = None + decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device) + else: + decoding_graph = None + word_table = None + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + # we need cut ids to display recognition results. + args.return_cuts = True + tedlium = TedLiumAsrDataModule(args) + + dev_cuts = tedlium.dev_cuts() + test_cuts = tedlium.test_cuts() + + dev_dl = tedlium.test_dataloaders(dev_cuts) + test_dl = tedlium.test_dataloaders(test_cuts) + + test_sets = ["dev", "test"] + test_dls = [dev_dl, test_dl] + + for name, dl in zip(test_sets, test_dls): + results_dict = decode_dataset( + dl=dl, + params=params, + model=model, + sp=sp, + word_table=word_table, + decoding_graph=decoding_graph, + ) + + save_results( + params=params, + test_set_name=name, + results_dict=results_dict, + ) + + logging.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/egs/tedlium3/ASR/zipformer/decoder.py b/egs/tedlium3/ASR/zipformer/decoder.py new file mode 120000 index 000000000..5a8018680 --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/decoder.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/decoder.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/encoder_interface.py b/egs/tedlium3/ASR/zipformer/encoder_interface.py new file mode 120000 index 000000000..653c5b09a --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/encoder_interface.py @@ -0,0 +1 @@ +../../../librispeech/ASR/transducer_stateless/encoder_interface.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/export.py b/egs/tedlium3/ASR/zipformer/export.py new file mode 120000 index 000000000..dfc1bec08 --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/export.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/export.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/joiner.py b/egs/tedlium3/ASR/zipformer/joiner.py new file mode 120000 index 000000000..5b8a36332 --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/joiner.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/joiner.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/model.py b/egs/tedlium3/ASR/zipformer/model.py new file mode 100644 index 000000000..90ec7e7aa --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/model.py @@ -0,0 +1,223 @@ +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, Wei Kang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import k2 +import torch +import torch.nn as nn +from encoder_interface import EncoderInterface + +from icefall.utils import add_sos, make_pad_mask +from scaling import ScaledLinear + + +class Transducer(nn.Module): + """It implements https://arxiv.org/pdf/1211.3711.pdf + "Sequence Transduction with Recurrent Neural Networks" + """ + + def __init__( + self, + encoder_embed: nn.Module, + encoder: EncoderInterface, + decoder: nn.Module, + joiner: nn.Module, + encoder_dim: int, + decoder_dim: int, + joiner_dim: int, + vocab_size: int, + ): + """ + Args: + encoder_embed: + It is a Convolutional 2D subsampling module. It converts + an input of shape (N, T, idim) to an output of of shape + (N, T', odim), where T' = (T-3)//2-2 = (T-7)//2. + encoder: + It is the transcription network in the paper. Its accepts + two inputs: `x` of (N, T, encoder_dim) and `x_lens` of shape (N,). + It returns two tensors: `logits` of shape (N, T, encoder_dim) and + `logit_lens` of shape (N,). + decoder: + It is the prediction network in the paper. Its input shape + is (N, U) and its output shape is (N, U, decoder_dim). + It should contain one attribute: `blank_id`. + joiner: + It has two inputs with shapes: (N, T, encoder_dim) and (N, U, decoder_dim). + Its output shape is (N, T, U, vocab_size). Note that its output contains + unnormalized probs, i.e., not processed by log-softmax. + """ + super().__init__() + assert isinstance(encoder, EncoderInterface), type(encoder) + assert hasattr(decoder, "blank_id") + + self.encoder_embed = encoder_embed + self.encoder = encoder + self.decoder = decoder + self.joiner = joiner + + self.simple_am_proj = ScaledLinear( + encoder_dim, + vocab_size, + initial_scale=0.25, + ) + self.simple_lm_proj = ScaledLinear( + decoder_dim, + vocab_size, + initial_scale=0.25, + ) + + def forward( + self, + x: torch.Tensor, + x_lens: torch.Tensor, + y: k2.RaggedTensor, + prune_range: int = 5, + am_scale: float = 0.0, + lm_scale: float = 0.0, + rnnt_type: str = "regular", + ) -> torch.Tensor: + """ + Args: + x: + A 3-D tensor of shape (N, T, C). + x_lens: + A 1-D tensor of shape (N,). It contains the number of frames in `x` + before padding. + y: + A ragged tensor with 2 axes [utt][label]. It contains labels of each + utterance. + prune_range: + The prune range for rnnt loss, it means how many symbols(context) + we are considering for each frame to compute the loss. + am_scale: + The scale to smooth the loss with am (output of encoder network) + part + lm_scale: + The scale to smooth the loss with lm (output of predictor network) + part + rnnt_type: + The type of label topology to use for the transducer loss. One of "regular", + "modified", or "constrained". + Returns: + Return the transducer loss. 
+ + Note: + Regarding am_scale & lm_scale, it will make the loss-function one of + the form: + lm_scale * lm_probs + am_scale * am_probs + + (1-lm_scale-am_scale) * combined_probs + """ + assert x.ndim == 3, x.shape + assert x_lens.ndim == 1, x_lens.shape + assert y.num_axes == 2, y.num_axes + + assert x.size(0) == x_lens.size(0) == y.dim0 + + # logging.info(f"Memory allocated at entry: {torch.cuda.memory_allocated() // 1000000}M") + x, x_lens = self.encoder_embed(x, x_lens) + # logging.info(f"Memory allocated after encoder_embed: {torch.cuda.memory_allocated() // 1000000}M") + + src_key_padding_mask = make_pad_mask(x_lens) + x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) + + encoder_out, x_lens = self.encoder(x, x_lens, src_key_padding_mask) + encoder_out = encoder_out.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + + assert torch.all(x_lens > 0) + + # Now for the decoder, i.e., the prediction network + row_splits = y.shape.row_splits(1) + y_lens = row_splits[1:] - row_splits[:-1] + + blank_id = self.decoder.blank_id + sos_y = add_sos(y, sos_id=blank_id) + + # sos_y_padded: [B, S + 1], start with SOS. + sos_y_padded = sos_y.pad(mode="constant", padding_value=blank_id) + + # decoder_out: [B, S + 1, decoder_dim] + decoder_out = self.decoder(sos_y_padded) + + # Note: y does not start with SOS + # y_padded : [B, S] + y_padded = y.pad(mode="constant", padding_value=0) + + y_padded = y_padded.to(torch.int64) + boundary = torch.zeros( + (encoder_out.size(0), 4), + dtype=torch.int64, + device=encoder_out.device, + ) + boundary[:, 2] = y_lens + boundary[:, 3] = x_lens + + lm = self.simple_lm_proj(decoder_out) + am = self.simple_am_proj(encoder_out) + + # if self.training and random.random() < 0.25: + # lm = penalize_abs_values_gt(lm, 100.0, 1.0e-04) + # if self.training and random.random() < 0.25: + # am = penalize_abs_values_gt(am, 30.0, 1.0e-04) + + with torch.cuda.amp.autocast(enabled=False): + simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed( + lm=lm.float(), + am=am.float(), + symbols=y_padded, + termination_symbol=blank_id, + lm_only_scale=lm_scale, + am_only_scale=am_scale, + boundary=boundary, + reduction="sum", + return_grad=True, + rnnt_type=rnnt_type, + ) + + # ranges : [B, T, prune_range] + ranges = k2.get_rnnt_prune_ranges( + px_grad=px_grad, + py_grad=py_grad, + boundary=boundary, + s_range=prune_range, + ) + + # am_pruned : [B, T, prune_range, encoder_dim] + # lm_pruned : [B, T, prune_range, decoder_dim] + am_pruned, lm_pruned = k2.do_rnnt_pruning( + am=self.joiner.encoder_proj(encoder_out), + lm=self.joiner.decoder_proj(decoder_out), + ranges=ranges, + ) + + # logits : [B, T, prune_range, vocab_size] + + # project_input=False since we applied the decoder's input projections + # prior to do_rnnt_pruning (this is an optimization for speed). 
+ logits = self.joiner(am_pruned, lm_pruned, project_input=False) + + with torch.cuda.amp.autocast(enabled=False): + pruned_loss = k2.rnnt_loss_pruned( + logits=logits.float(), + symbols=y_padded, + ranges=ranges, + termination_symbol=blank_id, + boundary=boundary, + reduction="sum", + rnnt_type=rnnt_type, + ) + + return (simple_loss, pruned_loss) diff --git a/egs/tedlium3/ASR/zipformer/optim.py b/egs/tedlium3/ASR/zipformer/optim.py new file mode 120000 index 000000000..5eaa3cffd --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/optim.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/optim.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/pretrained.py b/egs/tedlium3/ASR/zipformer/pretrained.py new file mode 120000 index 000000000..0bd71dde4 --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/pretrained.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/pretrained.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/profile.py b/egs/tedlium3/ASR/zipformer/profile.py new file mode 120000 index 000000000..c93adbd14 --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/profile.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/profile.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/scaling.py b/egs/tedlium3/ASR/zipformer/scaling.py new file mode 120000 index 000000000..6f398f431 --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/scaling.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/scaling.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/scaling_converter.py b/egs/tedlium3/ASR/zipformer/scaling_converter.py new file mode 120000 index 000000000..b0ecee05e --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/scaling_converter.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/scaling_converter.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/subsampling.py b/egs/tedlium3/ASR/zipformer/subsampling.py new file mode 120000 index 000000000..01ae9002c --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/subsampling.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/subsampling.py \ No newline at end of file diff --git a/egs/tedlium3/ASR/zipformer/train.py b/egs/tedlium3/ASR/zipformer/train.py new file mode 100755 index 000000000..9271c8438 --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/train.py @@ -0,0 +1,1308 @@ +#!/usr/bin/env python3 +# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang, +# Wei Kang, +# Mingshuang Luo, +# Zengwei Yao, +# Daniel Povey) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
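# A minimal, self-contained sketch (illustration only, not part of the recipe):
# the `boundary` tensor that model.py above passes to k2.rnnt_loss_smoothed and
# k2.rnnt_loss_pruned is an (N, 4) int64 tensor whose columns 2 and 3 hold each
# utterance's label count and frame count, while columns 0 and 1 are left at
# zero. The toy lengths below are made up purely for illustration.
import torch

x_lens = torch.tensor([6, 4])  # encoder output frames per utterance
y_lens = torch.tensor([3, 2])  # labels per utterance

boundary = torch.zeros((x_lens.size(0), 4), dtype=torch.int64)
boundary[:, 2] = y_lens
boundary[:, 3] = x_lens
print(boundary)
# tensor([[0, 0, 3, 6],
#         [0, 0, 2, 4]])
#
# k2.get_rnnt_prune_ranges then yields `ranges` of shape (N, T, prune_range),
# and the pruned pieces keep the shapes noted in model.py's comments:
#   am_pruned: (N, T, prune_range, encoder_dim)
#   lm_pruned: (N, T, prune_range, decoder_dim)
#   logits:    (N, T, prune_range, vocab_size)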
+""" +Usage: + +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +# For non-streaming model training: +./zipformer/train.py \ + --world-size 4 \ + --num-epochs 30 \ + --start-epoch 1 \ + --use-fp16 1 \ + --exp-dir zipformer/exp \ + --full-libri 1 \ + --max-duration 1000 + +# For streaming model training: +./zipformer/train.py \ + --world-size 4 \ + --num-epochs 30 \ + --start-epoch 1 \ + --use-fp16 1 \ + --exp-dir zipformer/exp \ + --causal 1 \ + --full-libri 1 \ + --max-duration 1000 + +""" + + +import argparse +import copy +import logging +import warnings +from pathlib import Path +from shutil import copyfile +from typing import Any, Dict, Optional, Tuple, Union + +import k2 +import optim +import sentencepiece as spm +import torch +import torch.multiprocessing as mp +import torch.nn as nn +from asr_datamodule import TedLiumAsrDataModule +from decoder import Decoder +from joiner import Joiner +from lhotse.cut import Cut +from lhotse.dataset.sampling.base import CutSampler +from lhotse.utils import fix_random_seed +from local.convert_transcript_words_to_bpe_ids import convert_texts_into_ids +from model import Transducer +from optim import Eden, ScaledAdam +from scaling import ScheduledFloat +from subsampling import Conv2dSubsampling +from torch import Tensor +from torch.cuda.amp import GradScaler +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from zipformer import Zipformer2 + +from icefall import diagnostics +from icefall.checkpoint import load_checkpoint, remove_checkpoints +from icefall.checkpoint import save_checkpoint as save_checkpoint_impl +from icefall.checkpoint import ( + save_checkpoint_with_global_batch_idx, + update_averaged_model, +) +from icefall.dist import cleanup_dist, setup_dist +from icefall.env import get_env_info +from icefall.hooks import register_inf_check_hooks +from icefall.utils import ( + AttributeDict, + MetricsTracker, + get_parameter_groups_with_lrs, + setup_logger, + str2bool, +) + +LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler] + + +def get_adjusted_batch_count(params: AttributeDict) -> float: + # returns the number of batches we would have used so far if we had used the reference + # duration. This is for purposes of set_batch_count(). 
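# Worked example for get_adjusted_batch_count (illustrative numbers taken from
# the usage string above): with --max-duration 1000 per GPU, --world-size 4 and
# the default --ref-duration 600, each real batch counts as
# 1000 * 4 / 600 ≈ 6.67 reference batches, e.g.
#
#   >>> 300 * (1000 * 4) / 600
#   2000.0
#
# so after 300 real batches, set_batch_count() is fed an adjusted count of 2000.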
+ return ( + params.batch_idx_train + * (params.max_duration * params.world_size) + / params.ref_duration + ) + + +def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None: + if isinstance(model, DDP): + # get underlying nn.Module + model = model.module + for name, module in model.named_modules(): + if hasattr(module, "batch_count"): + module.batch_count = batch_count + if hasattr(module, "name"): + module.name = name + + +def add_model_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "--num-encoder-layers", + type=str, + default="2,2,3,4,3,2", + help="Number of zipformer encoder layers per stack, comma separated.", + ) + + parser.add_argument( + "--downsampling-factor", + type=str, + default="1,2,4,8,4,2", + help="Downsampling factor for each stack of encoder layers.", + ) + + parser.add_argument( + "--feedforward-dim", + type=str, + default="512,768,1024,1536,1024,768", + help="Feedforward dimension of the zipformer encoder layers, per stack, comma separated.", + ) + + parser.add_argument( + "--num-heads", + type=str, + default="4,4,4,8,4,4", + help="Number of attention heads in the zipformer encoder layers: a single int or comma-separated list.", + ) + + parser.add_argument( + "--encoder-dim", + type=str, + default="192,256,384,512,384,256", + help="Embedding dimension in encoder stacks: a single int or comma-separated list.", + ) + + parser.add_argument( + "--query-head-dim", + type=str, + default="32", + help="Query/key dimension per head in encoder stacks: a single int or comma-separated list.", + ) + + parser.add_argument( + "--value-head-dim", + type=str, + default="12", + help="Value dimension per head in encoder stacks: a single int or comma-separated list.", + ) + + parser.add_argument( + "--pos-head-dim", + type=str, + default="4", + help="Positional-encoding dimension per head in encoder stacks: a single int or comma-separated list.", + ) + + parser.add_argument( + "--pos-dim", + type=int, + default="48", + help="Positional-encoding embedding dimension", + ) + + parser.add_argument( + "--encoder-unmasked-dim", + type=str, + default="192,192,256,256,256,192", + help="Unmasked dimensions in the encoders, relates to augmentation during training. " + "A single int or comma-separated list. Must be <= each corresponding encoder_dim.", + ) + + parser.add_argument( + "--cnn-module-kernel", + type=str, + default="31,31,15,15,15,31", + help="Sizes of convolutional kernels in convolution modules in each encoder stack: " + "a single int or comma-separated list.", + ) + + parser.add_argument( + "--decoder-dim", + type=int, + default=512, + help="Embedding dimension in the decoder model.", + ) + + parser.add_argument( + "--joiner-dim", + type=int, + default=512, + help="""Dimension used in the joiner model. + Outputs from the encoder and decoder model are projected + to this dimension before adding. + """, + ) + + parser.add_argument( + "--causal", + type=str2bool, + default=False, + help="If True, use causal version of model.", + ) + + parser.add_argument( + "--chunk-size", + type=str, + default="16,32,64,-1", + help="Chunk sizes (at 50Hz frame rate) will be chosen randomly from this list during training. " + " Must be just -1 if --causal=False", + ) + + parser.add_argument( + "--left-context-frames", + type=str, + default="64,128,256,-1", + help="Maximum left-contexts for causal training, measured in frames which will " + "be converted to a number of chunks. 
If splitting into chunks, " + "chunk left-context frames will be chosen randomly from this list; else not relevant.", + ) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--world-size", + type=int, + default=1, + help="Number of GPUs for DDP training.", + ) + + parser.add_argument( + "--master-port", + type=int, + default=12354, + help="Master port to use for DDP training.", + ) + + parser.add_argument( + "--tensorboard", + type=str2bool, + default=True, + help="Should various information be logged in tensorboard.", + ) + + parser.add_argument( + "--num-epochs", + type=int, + default=50, + help="Number of epochs to train.", + ) + + parser.add_argument( + "--start-epoch", + type=int, + default=1, + help="""Resume training from this epoch. It should be positive. + If larger than 1, it will load checkpoint from + exp-dir/epoch-{start_epoch-1}.pt + """, + ) + + parser.add_argument( + "--start-batch", + type=int, + default=0, + help="""If positive, --start-epoch is ignored and + it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt + """, + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="zipformer/exp", + help="""The experiment dir. + It specifies the directory where all training related + files, e.g., checkpoints, log, etc, are saved + """, + ) + + parser.add_argument( + "--bpe-model", + type=str, + default="data/lang_bpe_500/bpe.model", + help="Path to the BPE model", + ) + + parser.add_argument( + "--base-lr", type=float, default=0.04, help="The base learning rate." + ) + + parser.add_argument( + "--lr-batches", + type=float, + default=7500, + help="""Number of steps that affects how rapidly the learning rate + decreases. We suggest not to change this.""", + ) + + parser.add_argument( + "--lr-epochs", + type=float, + default=5, + help="""Number of epochs that affects how rapidly the learning rate decreases. + """, + ) + + parser.add_argument( + "--ref-duration", + type=float, + default=600, + help="Reference batch duration for purposes of adjusting batch counts for setting various " + "schedules inside the model", + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; " "2 means tri-gram", + ) + + parser.add_argument( + "--prune-range", + type=int, + default=5, + help="The prune range for rnnt loss, it means how many symbols(context)" + "we are using to compute the loss", + ) + + parser.add_argument( + "--rnnt-type", + type=str, + default="regular", + choices=["regular", "modified", "constrained"], + ) + + parser.add_argument( + "--lm-scale", + type=float, + default=0.25, + help="The scale to smooth the loss with lm " + "(output of prediction network) part.", + ) + + parser.add_argument( + "--am-scale", + type=float, + default=0.0, + help="The scale to smooth the loss with am (output of encoder network)" "part.", + ) + + parser.add_argument( + "--simple-loss-scale", + type=float, + default=0.5, + help="To get pruning ranges, we will calculate a simple version" + "loss(joiner is just addition), this simple loss also uses for" + "training (as a regularization item). 
We will scale the simple loss" + "with this parameter before adding to the final loss.", + ) + + parser.add_argument( + "--seed", + type=int, + default=42, + help="The seed for random generators intended for reproducibility", + ) + + parser.add_argument( + "--print-diagnostics", + type=str2bool, + default=False, + help="Accumulate stats on activations, print them and exit.", + ) + + parser.add_argument( + "--inf-check", + type=str2bool, + default=False, + help="Add hooks to check for infinite module outputs and gradients.", + ) + + parser.add_argument( + "--save-every-n", + type=int, + default=4000, + help="""Save checkpoint after processing this number of batches" + periodically. We save checkpoint to exp-dir/ whenever + params.batch_idx_train % save_every_n == 0. The checkpoint filename + has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt' + Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the + end of each epoch where `xxx` is the epoch number counting from 1. + """, + ) + + parser.add_argument( + "--keep-last-k", + type=int, + default=1, + help="""Only keep this number of checkpoints on disk. + For instance, if it is 3, there are only 3 checkpoints + in the exp-dir with filenames `checkpoint-xxx.pt`. + It does not affect checkpoints with name `epoch-xxx.pt`. + """, + ) + + parser.add_argument( + "--average-period", + type=int, + default=200, + help="""Update the averaged model, namely `model_avg`, after processing + this number of batches. `model_avg` is a separate version of model, + in which each floating-point parameter is the average of all the + parameters from the start of training. Each time we take the average, + we do: `model_avg = model * (average_period / batch_idx_train) + + model_avg * ((batch_idx_train - average_period) / batch_idx_train)`. + """, + ) + + parser.add_argument( + "--use-fp16", + type=str2bool, + default=False, + help="Whether to use half precision training.", + ) + + add_model_arguments(parser) + + return parser + + +def get_params() -> AttributeDict: + """Return a dict containing training parameters. + + All training related parameters that are not passed from the commandline + are saved in the variable `params`. + + Commandline options are merged into `params` after they are parsed, so + you can also access them via `params`. + + Explanation of options saved in `params`: + + - best_train_loss: Best training loss so far. It is used to select + the model that has the lowest training loss. It is + updated during the training. + + - best_valid_loss: Best validation loss so far. It is used to select + the model that has the lowest validation loss. It is + updated during the training. + + - best_train_epoch: It is the epoch that has the best training loss. + + - best_valid_epoch: It is the epoch that has the best validation loss. + + - batch_idx_train: Used to writing statistics to tensorboard. It + contains number of batches trained so far across + epochs. + + - log_interval: Print training loss if batch_idx % log_interval` is 0 + + - reset_interval: Reset statistics if batch_idx % reset_interval is 0 + + - valid_interval: Run validation if batch_idx % valid_interval is 0 + + - feature_dim: The model input dim. It has to match the one used + in computing features. + + - subsampling_factor: The subsampling factor for the model. + + - encoder_dim: Hidden dim for multi-head attention model. + + - num_decoder_layers: Number of decoder layer of transformer decoder. 
+ + - warm_step: The warmup period that dictates the decay of the + scale on "simple" (un-pruned) loss. + """ + params = AttributeDict( + { + "best_train_loss": float("inf"), + "best_valid_loss": float("inf"), + "best_train_epoch": -1, + "best_valid_epoch": -1, + "batch_idx_train": 0, + "log_interval": 50, + "reset_interval": 200, + "valid_interval": 3000, # For the 100h subset, use 800 + # parameters for zipformer + "feature_dim": 80, + "subsampling_factor": 4, # not passed in, this is fixed. + "warm_step": 2000, + "env_info": get_env_info(), + } + ) + + return params + + +def _to_int_tuple(s: str): + return tuple(map(int, s.split(","))) + + +def get_encoder_embed(params: AttributeDict) -> nn.Module: + # encoder_embed converts the input of shape (N, T, num_features) + # to the shape (N, (T - 7) // 2, encoder_dims). + # That is, it does two things simultaneously: + # (1) subsampling: T -> (T - 7) // 2 + # (2) embedding: num_features -> encoder_dims + # In the normal configuration, we will downsample once more at the end + # by a factor of 2, and most of the encoder stacks will run at a lower + # sampling rate. + encoder_embed = Conv2dSubsampling( + in_channels=params.feature_dim, + out_channels=_to_int_tuple(params.encoder_dim)[0], + dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)), + ) + return encoder_embed + + +def get_encoder_model(params: AttributeDict) -> nn.Module: + encoder = Zipformer2( + output_downsampling_factor=2, + downsampling_factor=_to_int_tuple(params.downsampling_factor), + num_encoder_layers=_to_int_tuple(params.num_encoder_layers), + encoder_dim=_to_int_tuple(params.encoder_dim), + encoder_unmasked_dim=_to_int_tuple(params.encoder_unmasked_dim), + query_head_dim=_to_int_tuple(params.query_head_dim), + pos_head_dim=_to_int_tuple(params.pos_head_dim), + value_head_dim=_to_int_tuple(params.value_head_dim), + pos_dim=params.pos_dim, + num_heads=_to_int_tuple(params.num_heads), + feedforward_dim=_to_int_tuple(params.feedforward_dim), + cnn_module_kernel=_to_int_tuple(params.cnn_module_kernel), + dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)), + warmup_batches=4000.0, + causal=params.causal, + chunk_size=_to_int_tuple(params.chunk_size), + left_context_frames=_to_int_tuple(params.left_context_frames), + ) + return encoder + + +def get_decoder_model(params: AttributeDict) -> nn.Module: + decoder = Decoder( + vocab_size=params.vocab_size, + decoder_dim=params.decoder_dim, + blank_id=params.blank_id, + context_size=params.context_size, + ) + return decoder + + +def get_joiner_model(params: AttributeDict) -> nn.Module: + joiner = Joiner( + encoder_dim=max(_to_int_tuple(params.encoder_dim)), + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return joiner + + +def get_transducer_model(params: AttributeDict) -> nn.Module: + encoder_embed = get_encoder_embed(params) + encoder = get_encoder_model(params) + decoder = get_decoder_model(params) + joiner = get_joiner_model(params) + + model = Transducer( + encoder_embed=encoder_embed, + encoder=encoder, + decoder=decoder, + joiner=joiner, + encoder_dim=int(max(params.encoder_dim.split(","))), + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return model + + +def load_checkpoint_if_available( + params: AttributeDict, + model: nn.Module, + model_avg: nn.Module = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, +) -> Optional[Dict[str, Any]]: + """Load checkpoint from 
file. + + If params.start_batch is positive, it will load the checkpoint from + `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if + params.start_epoch is larger than 1, it will load the checkpoint from + `params.start_epoch - 1`. + + Apart from loading state dict for `model` and `optimizer` it also updates + `best_train_epoch`, `best_train_loss`, `best_valid_epoch`, + and `best_valid_loss` in `params`. + + Args: + params: + The return value of :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer that we are using. + scheduler: + The scheduler that we are using. + Returns: + Return a dict containing previously saved training info. + """ + if params.start_batch > 0: + filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt" + elif params.start_epoch > 1: + filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" + else: + return None + + assert filename.is_file(), f"{filename} does not exist!" + + saved_params = load_checkpoint( + filename, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + ) + + keys = [ + "best_train_epoch", + "best_valid_epoch", + "batch_idx_train", + "best_train_loss", + "best_valid_loss", + ] + for k in keys: + params[k] = saved_params[k] + + if params.start_batch > 0: + if "cur_epoch" in saved_params: + params["start_epoch"] = saved_params["cur_epoch"] + + return saved_params + + +def save_checkpoint( + params: AttributeDict, + model: Union[nn.Module, DDP], + model_avg: Optional[nn.Module] = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, + sampler: Optional[CutSampler] = None, + scaler: Optional[GradScaler] = None, + rank: int = 0, +) -> None: + """Save model, optimizer, scheduler and training stats to file. + + Args: + params: + It is returned by :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer used in the training. + sampler: + The sampler for the training dataset. + scaler: + The scaler used for mix precision training. + """ + if rank != 0: + return + filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt" + save_checkpoint_impl( + filename=filename, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=sampler, + scaler=scaler, + rank=rank, + ) + + if params.best_train_epoch == params.cur_epoch: + best_train_filename = params.exp_dir / "best-train-loss.pt" + copyfile(src=filename, dst=best_train_filename) + + if params.best_valid_epoch == params.cur_epoch: + best_valid_filename = params.exp_dir / "best-valid-loss.pt" + copyfile(src=filename, dst=best_valid_filename) + + +def compute_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + batch: dict, + is_training: bool, +) -> Tuple[Tensor, MetricsTracker]: + """ + Compute RNNT loss given the model and its inputs. + + Args: + params: + Parameters for training. See :func:`get_params`. + model: + The model for training. It is an instance of Zipformer in our case. + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + is_training: + True for training. False for validation. When it is True, this + function enables autograd during computation; when it is False, it + disables autograd. 
+ warmup: a floating point value which increases throughout training; + values >= 1.0 are fully warmed up and have all modules present. + """ + device = model.device if isinstance(model, DDP) else next(model.parameters()).device + feature = batch["inputs"] + # at entry, feature is (N, T, C) + assert feature.ndim == 3 + feature = feature.to(device) + + supervisions = batch["supervisions"] + feature_lens = supervisions["num_frames"].to(device) + + batch_idx_train = params.batch_idx_train + warm_step = params.warm_step + + texts = batch["supervisions"]["text"] + y = convert_texts_into_ids(texts, sp) + y = k2.RaggedTensor(y).to(device) + + with torch.set_grad_enabled(is_training): + simple_loss, pruned_loss = model( + x=feature, + x_lens=feature_lens, + y=y, + prune_range=params.prune_range, + am_scale=params.am_scale, + lm_scale=params.lm_scale, + rnnt_type=params.rnnt_type, + ) + + s = params.simple_loss_scale + # take down the scale on the simple loss from 1.0 at the start + # to params.simple_loss scale by warm_step. + simple_loss_scale = ( + s + if batch_idx_train >= warm_step + else 1.0 - (batch_idx_train / warm_step) * (1.0 - s) + ) + pruned_loss_scale = ( + 1.0 + if batch_idx_train >= warm_step + else 0.1 + 0.9 * (batch_idx_train / warm_step) + ) + + loss = simple_loss_scale * simple_loss + pruned_loss_scale * pruned_loss + + assert loss.requires_grad == is_training + + info = MetricsTracker() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + info["frames"] = (feature_lens // params.subsampling_factor).sum().item() + + # Note: We use reduction=sum while computing the loss. + info["loss"] = loss.detach().cpu().item() + info["simple_loss"] = simple_loss.detach().cpu().item() + info["pruned_loss"] = pruned_loss.detach().cpu().item() + + return loss, info + + +def compute_validation_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + sp: spm.SentencePieceProcessor, + valid_dl: torch.utils.data.DataLoader, + world_size: int = 1, +) -> MetricsTracker: + """Run the validation process.""" + model.eval() + + tot_loss = MetricsTracker() + + for batch_idx, batch in enumerate(valid_dl): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=False, + ) + assert loss.requires_grad is False + tot_loss = tot_loss + loss_info + + if world_size > 1: + tot_loss.reduce(loss.device) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + if loss_value < params.best_valid_loss: + params.best_valid_epoch = params.cur_epoch + params.best_valid_loss = loss_value + + return tot_loss + + +def train_one_epoch( + params: AttributeDict, + model: Union[nn.Module, DDP], + optimizer: torch.optim.Optimizer, + scheduler: LRSchedulerType, + sp: spm.SentencePieceProcessor, + train_dl: torch.utils.data.DataLoader, + valid_dl: torch.utils.data.DataLoader, + scaler: GradScaler, + model_avg: Optional[nn.Module] = None, + tb_writer: Optional[SummaryWriter] = None, + world_size: int = 1, + rank: int = 0, +) -> None: + """Train the model for one epoch. + + The training loss from the mean of all frames is saved in + `params.train_loss`. It runs the validation process every + `params.valid_interval` batches. + + Args: + params: + It is returned by :func:`get_params`. + model: + The model for training. + optimizer: + The optimizer we are using. + scheduler: + The learning rate scheduler, we call step() every step. + train_dl: + Dataloader for the training dataset. + valid_dl: + Dataloader for the validation dataset. 
+ scaler: + The scaler used for mix precision training. + model_avg: + The stored model averaged from the start of training. + tb_writer: + Writer to write log messages to tensorboard. + world_size: + Number of nodes in DDP training. If it is 1, DDP is disabled. + rank: + The rank of the node in DDP training. If no DDP is used, it should + be set to 0. + """ + model.train() + + tot_loss = MetricsTracker() + + saved_bad_model = False + + def save_bad_model(suffix: str = ""): + save_checkpoint_impl( + filename=params.exp_dir / f"bad-model{suffix}-{rank}.pt", + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=0, + ) + + for batch_idx, batch in enumerate(train_dl): + if batch_idx % 10 == 0: + set_batch_count(model, get_adjusted_batch_count(params)) + + params.batch_idx_train += 1 + batch_size = len(batch["supervisions"]["text"]) + + try: + with torch.cuda.amp.autocast(enabled=params.use_fp16): + loss, loss_info = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=True, + ) + # summary stats + tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info + + # NOTE: We use reduction==sum and loss is computed over utterances + # in the batch and there is no normalization to it so far. + scaler.scale(loss).backward() + scheduler.step_batch(params.batch_idx_train) + + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + except: # noqa + save_bad_model() + display_and_save_batch(batch, params=params, sp=sp) + raise + + if params.print_diagnostics and batch_idx == 5: + return + + if ( + rank == 0 + and params.batch_idx_train > 0 + and params.batch_idx_train % params.average_period == 0 + ): + update_averaged_model( + params=params, + model_cur=model, + model_avg=model_avg, + ) + + if ( + params.batch_idx_train > 0 + and params.batch_idx_train % params.save_every_n == 0 + ): + save_checkpoint_with_global_batch_idx( + out_dir=params.exp_dir, + global_batch_idx=params.batch_idx_train, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + remove_checkpoints( + out_dir=params.exp_dir, + topk=params.keep_last_k, + rank=rank, + ) + + if batch_idx % 100 == 0 and params.use_fp16: + # If the grad scale was less than 1, try increasing it. The _growth_interval + # of the grad scaler is configurable, but we can't configure it to have different + # behavior depending on the current grad scale. 
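# Toy, standalone mirror of the growth rule implemented just below (for
# illustration only; the recipe itself calls scaler.update() in place): scales
# below 8 are doubled every 100 batches, scales below 32 only every 400
# batches, and the code further down aborts if the scale collapses below 1e-5.
def _toy_maybe_grow(cur_grad_scale: float, batch_idx: int) -> float:
    if batch_idx % 100 != 0:
        return cur_grad_scale
    if cur_grad_scale < 8.0 or (cur_grad_scale < 32.0 and batch_idx % 400 == 0):
        return cur_grad_scale * 2.0
    return cur_grad_scale

assert _toy_maybe_grow(4.0, 100) == 8.0    # small scale: doubled every 100 batches
assert _toy_maybe_grow(16.0, 100) == 16.0  # 8 <= scale < 32: untouched off the 400-batch grid
assert _toy_maybe_grow(16.0, 400) == 32.0  # ... but doubled on it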
+ cur_grad_scale = scaler._scale.item() + + if cur_grad_scale < 8.0 or (cur_grad_scale < 32.0 and batch_idx % 400 == 0): + scaler.update(cur_grad_scale * 2.0) + if cur_grad_scale < 0.01: + if not saved_bad_model: + save_bad_model(suffix="-first-warning") + saved_bad_model = True + logging.warning(f"Grad scale is small: {cur_grad_scale}") + if cur_grad_scale < 1.0e-05: + save_bad_model() + raise RuntimeError( + f"grad_scale is too small, exiting: {cur_grad_scale}" + ) + + if batch_idx % params.log_interval == 0: + cur_lr = max(scheduler.get_last_lr()) + cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0 + + logging.info( + f"Epoch {params.cur_epoch}, " + f"batch {batch_idx}, loss[{loss_info}], " + f"tot_loss[{tot_loss}], batch size: {batch_size}, " + f"lr: {cur_lr:.2e}, " + + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "") + ) + + if tb_writer is not None: + tb_writer.add_scalar( + "train/learning_rate", cur_lr, params.batch_idx_train + ) + + loss_info.write_summary( + tb_writer, "train/current_", params.batch_idx_train + ) + tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train) + if params.use_fp16: + tb_writer.add_scalar( + "train/grad_scale", cur_grad_scale, params.batch_idx_train + ) + + if batch_idx % params.valid_interval == 0 and not params.print_diagnostics: + logging.info("Computing validation loss") + valid_info = compute_validation_loss( + params=params, + model=model, + sp=sp, + valid_dl=valid_dl, + world_size=world_size, + ) + model.train() + logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}") + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) + if tb_writer is not None: + valid_info.write_summary( + tb_writer, "train/valid_", params.batch_idx_train + ) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + params.train_loss = loss_value + if params.train_loss < params.best_train_loss: + params.best_train_epoch = params.cur_epoch + params.best_train_loss = params.train_loss + + +def run(rank, world_size, args): + """ + Args: + rank: + It is a value between 0 and `world_size-1`, which is + passed automatically by `mp.spawn()` in :func:`main`. + The node with rank 0 is responsible for saving checkpoint. + world_size: + Number of GPUs for DDP training. 
+ args: + The return value of get_parser().parse_args() + """ + params = get_params() + params.update(vars(args)) + + fix_random_seed(params.seed) + if world_size > 1: + setup_dist(rank, world_size, params.master_port) + + setup_logger(f"{params.exp_dir}/log/log-train") + logging.info("Training started") + + if args.tensorboard and rank == 0: + tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard") + else: + tb_writer = None + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", rank) + logging.info(f"Device: {device}") + + sp = spm.SentencePieceProcessor() + sp.load(params.bpe_model) + + # <blk> is defined in local/train_bpe_model.py + params.blank_id = sp.piece_to_id("<blk>") + params.vocab_size = sp.get_piece_size() + + logging.info(params) + + logging.info("About to create model") + model = get_transducer_model(params) + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + assert params.save_every_n >= params.average_period + model_avg: Optional[nn.Module] = None + if rank == 0: + # model_avg is only used with rank 0 + model_avg = copy.deepcopy(model).to(torch.float64) + + assert params.start_epoch > 0, params.start_epoch + checkpoints = load_checkpoint_if_available( + params=params, model=model, model_avg=model_avg + ) + + model.to(device) + if world_size > 1: + logging.info("Using DDP") + model = DDP(model, device_ids=[rank], find_unused_parameters=True) + + optimizer = ScaledAdam( + get_parameter_groups_with_lrs(model, lr=params.base_lr, include_names=True), + lr=params.base_lr, # should have no effect + clipping_scale=2.0, + ) + + scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs) + + if checkpoints and "optimizer" in checkpoints: + logging.info("Loading optimizer state dict") + optimizer.load_state_dict(checkpoints["optimizer"]) + + if ( + checkpoints + and "scheduler" in checkpoints + and checkpoints["scheduler"] is not None + ): + logging.info("Loading scheduler state dict") + scheduler.load_state_dict(checkpoints["scheduler"]) + + if params.print_diagnostics: + opts = diagnostics.TensorDiagnosticOptions( + 2**22 + ) # allow 4 megabytes per sub-module + diagnostic = diagnostics.attach_diagnostics(model, opts) + + if params.inf_check: + register_inf_check_hooks(model) + + tedlium = TedLiumAsrDataModule(args) + + train_cuts = tedlium.train_cuts() + train_cuts = train_cuts.filter(lambda c: 1.0 <= c.duration <= 20.0) + + if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: + # We only load the sampler's state dict when it loads a checkpoint + # saved in the middle of an epoch + sampler_state_dict = checkpoints["sampler"] + else: + sampler_state_dict = None + + train_dl = tedlium.train_dataloaders( + train_cuts, sampler_state_dict=sampler_state_dict + ) + + valid_cuts = tedlium.dev_cuts() + valid_dl = tedlium.valid_dataloaders(valid_cuts) + + if not params.print_diagnostics: + scan_pessimistic_batches_for_oom( + model=model, + train_dl=train_dl, + optimizer=optimizer, + sp=sp, + params=params, + ) + + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) + if checkpoints and "grad_scaler" in checkpoints: + logging.info("Loading grad scaler state dict") + scaler.load_state_dict(checkpoints["grad_scaler"]) + + for epoch in range(params.start_epoch, params.num_epochs + 1): + scheduler.step_epoch(epoch - 1) + fix_random_seed(params.seed + epoch - 1) + train_dl.sampler.set_epoch(epoch - 1) + + if tb_writer is not None: +
tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train) + + params.cur_epoch = epoch + + train_one_epoch( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sp=sp, + train_dl=train_dl, + valid_dl=valid_dl, + scaler=scaler, + tb_writer=tb_writer, + world_size=world_size, + rank=rank, + ) + + if params.print_diagnostics: + diagnostic.print_diagnostics() + break + + save_checkpoint( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + + logging.info("Done!") + + if world_size > 1: + torch.distributed.barrier() + cleanup_dist() + + +def display_and_save_batch( + batch: dict, + params: AttributeDict, + sp: spm.SentencePieceProcessor, +) -> None: + """Display the batch statistics and save the batch into disk. + + Args: + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + params: + Parameters for training. See :func:`get_params`. + sp: + The BPE model. + """ + from lhotse.utils import uuid4 + + filename = f"{params.exp_dir}/batch-{uuid4()}.pt" + logging.info(f"Saving batch to {filename}") + torch.save(batch, filename) + + supervisions = batch["supervisions"] + features = batch["inputs"] + + logging.info(f"features shape: {features.shape}") + + y = sp.encode(supervisions["text"], out_type=int) + num_tokens = sum(len(i) for i in y) + logging.info(f"num tokens: {num_tokens}") + + +def scan_pessimistic_batches_for_oom( + model: Union[nn.Module, DDP], + train_dl: torch.utils.data.DataLoader, + optimizer: torch.optim.Optimizer, + sp: spm.SentencePieceProcessor, + params: AttributeDict, +): + from lhotse.dataset import find_pessimistic_batches + + logging.info( + "Sanity check -- see if any of the batches in epoch 1 would cause OOM." + ) + batches, crit_values = find_pessimistic_batches(train_dl.sampler) + for criterion, cuts in batches.items(): + batch = train_dl.dataset[cuts] + try: + with torch.cuda.amp.autocast(enabled=params.use_fp16): + loss, _ = compute_loss( + params=params, + model=model, + sp=sp, + batch=batch, + is_training=True, + ) + loss.backward() + optimizer.zero_grad() + except Exception as e: + if "CUDA out of memory" in str(e): + logging.error( + "Your GPU ran out of memory with the current " + "max_duration setting. We recommend decreasing " + "max_duration and trying again.\n" + f"Failing criterion: {criterion} " + f"(={crit_values[criterion]}) ..." 
+ ) + display_and_save_batch(batch, params=params, sp=sp) + raise + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) + + +def main(): + parser = get_parser() + TedLiumAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + world_size = args.world_size + assert world_size >= 1 + if world_size > 1: + mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True) + else: + run(rank=0, world_size=1, args=args) + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + +if __name__ == "__main__": + main() diff --git a/egs/tedlium3/ASR/zipformer/zipformer.py b/egs/tedlium3/ASR/zipformer/zipformer.py new file mode 120000 index 000000000..23011dda7 --- /dev/null +++ b/egs/tedlium3/ASR/zipformer/zipformer.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/zipformer.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/RESULTS.md b/egs/wenetspeech/ASR/RESULTS.md index 658ad4a9b..1a0e0681f 100644 --- a/egs/wenetspeech/ASR/RESULTS.md +++ b/egs/wenetspeech/ASR/RESULTS.md @@ -1,5 +1,90 @@ ## Results +### WenetSpeech char-based training results (Non-streaming and streaming) on zipformer model + +This is the [pull request](https://github.com/k2-fsa/icefall/pull/1130) in icefall. + +#### Non-streaming + +Best results (num of params : ~76M): + +Type | Greedy(dev & net & meeting) | Beam search(dev & net & meeting) |   +-- | -- | -- | -- +Non-streaming | 7.36 & 7.65 & 12.43 | 7.32 & 7.61 & 12.35 | --epoch=12 + +The training command: + +``` +./zipformer/train.py \ + --world-size 6 \ + --num-epochs 12 \ + --use-fp16 1 \ + --max-duration 450 \ + --training-subset L \ + --lr-epochs 1.5 \ + --context-size 2 \ + --exp-dir zipformer/exp_L_context_2 \ + --causal 0 \ + --num-workers 8 +``` + +Listed best results for each epoch below: + +Epoch | Greedy search(dev & net & meeting) | Modified beam search(dev & net & meeting) |   +-- | -- | -- | -- +4 | 7.83 & 8.86 &13.73 | 7.75 & 8.81 & 13.67 | avg=1;blank-penalty=2 +5 | 7.75 & 8.46 & 13.38 | 7.68 & 8.41 & 13.27 | avg=1;blank-penalty=2 +6 | 7.72 & 8.19 & 13.16 | 7.62 & 8.14 & 13.06 | avg=1;blank-penalty=2 +7 | 7.59 & 8.08 & 12.97 | 7.53 & 8.01 & 12.87 | avg=2;blank-penalty=2 +8 | 7.68 & 7.87 & 12.96 | 7.61 & 7.81 & 12.88 | avg=1;blank-penalty=2 +9 | 7.57 & 7.77 & 12.87 | 7.5 & 7.71 & 12.77 | avg=1;blank-penalty=2 +10 | 7.45 & 7.7 & 12.69 | 7.39 & 7.63 & 12.59 | avg=2;blank-penalty=2 +11 | 7.35 & 7.67 & 12.46 | 7.31 & 7.63 & 12.43 | avg=3;blank-penalty=2 +12 | 7.36 & 7.65 & 12.43 | 7.32 & 7.61 & 12.35 | avg=4;blank-penalty=2 + +The pre-trained model is available here : https://huggingface.co/pkufool/icefall-asr-zipformer-wenetspeech-20230615 + + +#### Streaming + +Best results (num of params : ~76M): + +Type | Greedy(dev & net & meeting) | Beam search(dev & net & meeting) |   +-- | -- | -- | -- +Streaming | 8.45 & 9.89 & 16.46 | 8.21 & 9.77 & 16.07 | --epoch=12; --chunk-size=16; --left-context-frames=256 +Streaming | 8.0 & 9.0 & 15.11 | 7.84 & 8.94 & 14.92 | --epoch=12; --chunk-size=32; --left-context-frames=256 + +The training command: + +``` +./zipformer/train.py \ + --world-size 8 \ + --num-epochs 12 \ + --use-fp16 1 \ + --max-duration 450 \ + --training-subset L \ + --lr-epochs 1.5 \ + --context-size 2 \ + --exp-dir zipformer/exp_L_causal_context_2 \ + --causal 1 \ + --num-workers 8 +``` + +Best results for each epoch (--chunk-size=16; --left-context-frames=128) + +Epoch | Greedy search(dev & net & meeting) | Modified beam search(dev & net 
& meeting) |   +-- | -- | -- | -- +6 | 9.14 & 10.75 & 18.15 | 8.79 & 10.54 & 17.64 | avg=1;blank-penalty=1.5 +7 | 9.11 & 10.61 & 17.86 | 8.8 & 10.42 & 17.29 | avg=1;blank-penalty=1.5 +8 | 8.89 & 10.32 & 17.44 | 8.59 & 10.09 & 16.9 | avg=1;blank-penalty=1.5 +9 | 8.86 & 10.11 & 17.35 | 8.55 & 9.87 & 16.76 | avg=1;blank-penalty=1.5 +10 | 8.66 & 10.0 & 16.94 | 8.39 & 9.83 & 16.47 | avg=2;blank-penalty=1.5 +11 | 8.58 & 9.92 & 16.67 | 8.32 & 9.77 & 16.27 | avg=3;blank-penalty=1.5 +12 | 8.45 & 9.89 & 16.46 | 8.21 & 9.77 & 16.07 | avg=4;blank-penalty=1.5 + +The pre-trained model is available here: https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615 + + ### WenetSpeech char-based training results (offline and streaming) (Pruned Transducer 5) #### 2022-07-22 diff --git a/egs/wenetspeech/ASR/local/prepare_char_lm_training_data.py b/egs/wenetspeech/ASR/local/prepare_char_lm_training_data.py new file mode 120000 index 000000000..2374cafdd --- /dev/null +++ b/egs/wenetspeech/ASR/local/prepare_char_lm_training_data.py @@ -0,0 +1 @@ +../../../aishell/ASR/local/prepare_char_lm_training_data.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/local/sort_lm_training_data.py b/egs/wenetspeech/ASR/local/sort_lm_training_data.py new file mode 120000 index 000000000..efef2c445 --- /dev/null +++ b/egs/wenetspeech/ASR/local/sort_lm_training_data.py @@ -0,0 +1 @@ +../../../aishell/ASR/local/sort_lm_training_data.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py index 7cb2e1048..746b212ff 100644 --- a/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -292,7 +292,7 @@ class WenetSpeechAsrDataModule: max_duration=self.args.max_duration, shuffle=self.args.shuffle, num_buckets=self.args.num_buckets, - buffer_size=30000, + buffer_size=300000, drop_last=True, ) else: diff --git a/egs/wenetspeech/ASR/pruned_transducer_stateless5/decode.py b/egs/wenetspeech/ASR/pruned_transducer_stateless5/decode.py index dc431578c..36b8a4b67 100755 --- a/egs/wenetspeech/ASR/pruned_transducer_stateless5/decode.py +++ b/egs/wenetspeech/ASR/pruned_transducer_stateless5/decode.py @@ -588,7 +588,7 @@ def decode_dataset( results = defaultdict(list) for batch_idx, batch in enumerate(dl): texts = batch["supervisions"]["text"] - texts = [list(str(text)) for text in texts] + texts = [list("".join(text.split())) for text in texts] cut_ids = [cut.id for cut in batch["supervisions"]["cut"]] hyps_dict = decode_one_batch( diff --git a/egs/wenetspeech/ASR/zipformer/__init__.py b/egs/wenetspeech/ASR/zipformer/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/egs/wenetspeech/ASR/zipformer/asr_datamodule.py b/egs/wenetspeech/ASR/zipformer/asr_datamodule.py new file mode 120000 index 000000000..a074d6085 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/asr_datamodule.py @@ -0,0 +1 @@ +../pruned_transducer_stateless2/asr_datamodule.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/beam_search.py b/egs/wenetspeech/ASR/zipformer/beam_search.py new file mode 120000 index 000000000..8554e44cc --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/beam_search.py @@ -0,0 +1 @@ +../pruned_transducer_stateless2/beam_search.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/decode.py b/egs/wenetspeech/ASR/zipformer/decode.py new file mode 
100755 index 000000000..0fbc8244b --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/decode.py @@ -0,0 +1,818 @@ +#!/usr/bin/env python3 +# +# Copyright 2021-2022 Xiaomi Corporation (Author: Fangjun Kuang, +# Zengwei Yao +# Mingshuang Luo) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Usage: +(1) greedy search +./zipformer/decode.py \ + --epoch 35 \ + --avg 15 \ + --exp-dir ./zipformer/exp \ + --lang-dir data/lang_char \ + --max-duration 600 \ + --decoding-method greedy_search + +(2) modified beam search +./zipformer/decode.py \ + --epoch 35 \ + --avg 15 \ + --exp-dir ./zipformer/exp \ + --lang-dir data/lang_char \ + --max-duration 600 \ + --decoding-method modified_beam_search \ + --beam-size 4 + +(3) fast beam search (trivial_graph) +./zipformer/decode.py \ + --epoch 35 \ + --avg 15 \ + --exp-dir ./zipformer/exp \ + --lang-dir data/lang_char \ + --max-duration 600 \ + --decoding-method fast_beam_search \ + --beam 20.0 \ + --max-contexts 8 \ + --max-states 64 + +(4) fast beam search (LG) +./zipformer/decode.py \ + --epoch 30 \ + --avg 15 \ + --exp-dir ./zipformer/exp \ + --lang-dir data/lang_char \ + --max-duration 600 \ + --decoding-method fast_beam_search_LG \ + --beam 20.0 \ + --max-contexts 8 \ + --max-states 64 + +(5) fast beam search (nbest oracle WER) +./zipformer/decode.py \ + --epoch 35 \ + --avg 15 \ + --exp-dir ./zipformer/exp \ + --lang-dir data/lang_char \ + --max-duration 600 \ + --decoding-method fast_beam_search_nbest_oracle \ + --beam 20.0 \ + --max-contexts 8 \ + --max-states 64 \ + --num-paths 200 \ + --nbest-scale 0.5 +""" + + +import argparse +import logging +import math +from collections import defaultdict +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import k2 +import torch +import torch.nn as nn +from asr_datamodule import WenetSpeechAsrDataModule +from beam_search import ( + beam_search, + fast_beam_search_nbest, + fast_beam_search_nbest_LG, + fast_beam_search_nbest_oracle, + fast_beam_search_one_best, + greedy_search, + greedy_search_batch, + modified_beam_search, +) +from lhotse.cut import Cut +from train import add_model_arguments, get_model, get_params + +from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler +from icefall.checkpoint import ( + average_checkpoints, + average_checkpoints_with_averaged_model, + find_checkpoints, + load_checkpoint, +) +from icefall.lexicon import Lexicon +from icefall.utils import ( + AttributeDict, + make_pad_mask, + setup_logger, + store_transcripts, + str2bool, + write_error_stats, +) + +LOG_EPS = math.log(1e-10) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--epoch", + type=int, + default=30, + help="""It specifies the checkpoint to use for decoding. + Note: Epoch counts from 1. 
+ You can specify --avg to use more checkpoints for model averaging.""", + ) + + parser.add_argument( + "--iter", + type=int, + default=0, + help="""If positive, --epoch is ignored and it + will use the checkpoint exp_dir/checkpoint-iter.pt. + You can specify --avg to use more checkpoints for model averaging. + """, + ) + + parser.add_argument( + "--avg", + type=int, + default=15, + help="Number of checkpoints to average. Automatically select " + "consecutive checkpoints before the checkpoint specified by " + "'--epoch' and '--iter'", + ) + + parser.add_argument( + "--use-averaged-model", + type=str2bool, + default=True, + help="Whether to load averaged model. Currently it only supports " + "using --epoch. If True, it would decode with the averaged model " + "over the epoch range from `epoch-avg` (excluded) to `epoch`." + "Actually only the models with epoch number of `epoch-avg` and " + "`epoch` are loaded for averaging. ", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="zipformer/exp", + help="The experiment dir", + ) + + parser.add_argument( + "--lang-dir", + type=Path, + default="data/lang_char", + help="The lang dir containing word table and LG graph", + ) + + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="""Possible values are: + - greedy_search + - modified_beam_search + - fast_beam_search + - fast_beam_search_LG + - fast_beam_search_nbest_oracle + If you use fast_beam_search_LG, you have to specify + `--lang-dir`, which should contain `LG.pt`. + """, + ) + + parser.add_argument( + "--beam-size", + type=int, + default=4, + help="""An integer indicating how many candidates we will keep for each + frame. Used only when --decoding-method is beam_search or + modified_beam_search.""", + ) + + parser.add_argument( + "--beam", + type=float, + default=20.0, + help="""A floating point value to calculate the cutoff score during beam + search (i.e., `cutoff = max-score - beam`), which is the same as the + `beam` in Kaldi. + Used only when --decoding-method is fast_beam_search, + fast_beam_search, fast_beam_search_LG, + and fast_beam_search_nbest_oracle + """, + ) + + parser.add_argument( + "--ngram-lm-scale", + type=float, + default=0.01, + help=""" + Used only when --decoding_method is fast_beam_search_LG. + It specifies the scale for n-gram LM scores. + """, + ) + + parser.add_argument( + "--ilme-scale", + type=float, + default=0.2, + help=""" + Used only when --decoding_method is fast_beam_search_LG. + It specifies the scale for the internal language model estimation. + """, + ) + + parser.add_argument( + "--max-contexts", + type=int, + default=8, + help="""Used only when --decoding-method is + fast_beam_search, fast_beam_search, fast_beam_search_LG, + and fast_beam_search_nbest_oracle""", + ) + + parser.add_argument( + "--max-states", + type=int, + default=64, + help="""Used only when --decoding-method is + fast_beam_search, fast_beam_search, fast_beam_search_LG, + and fast_beam_search_nbest_oracle""", + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; 2 means tri-gram", + ) + + parser.add_argument( + "--max-sym-per-frame", + type=int, + default=1, + help="""Maximum number of symbols per frame. + Used only when --decoding_method is greedy_search""", + ) + + parser.add_argument( + "--num-paths", + type=int, + default=200, + help="""Number of paths for nbest decoding. 
+ Used only when the decoding method is fast_beam_search_nbest_oracle""", + ) + + parser.add_argument( + "--nbest-scale", + type=float, + default=0.5, + help="""Scale applied to lattice scores when computing nbest paths. + Used only when the decoding method is and fast_beam_search_nbest_oracle""", + ) + + parser.add_argument( + "--blank-penalty", + type=float, + default=0.0, + help=""" + The penalty applied on blank symbol during decoding. + Note: It is a positive value that would be applied to logits like + this `logits[:, 0] -= blank_penalty` (suppose logits.shape is + [batch_size, vocab] and blank id is 0). + """, + ) + + add_model_arguments(parser) + + return parser + + +def decode_one_batch( + params: AttributeDict, + model: nn.Module, + lexicon: Lexicon, + graph_compiler: CharCtcTrainingGraphCompiler, + batch: dict, + decoding_graph: Optional[k2.Fsa] = None, +) -> Dict[str, List[List[str]]]: + """Decode one batch and return the result in a dict. The dict has the + following format: + + - key: It indicates the setting used for decoding. For example, + if greedy_search is used, it would be "greedy_search" + If beam search with a beam size of 7 is used, it would be + "beam_7" + - value: It contains the decoding result. `len(value)` equals to + batch size. `value[i]` is the decoding result for the i-th + utterance in the given batch. + Args: + params: + It's the return value of :func:`get_params`. + model: + The neural model. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + decoding_graph: + The decoding graph. Can be either a `k2.trivial_graph` or LG, Used + only when --decoding_method is fast_beam_search, fast_beam_search_nbest, + fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG. + Returns: + Return the decoding result. See above description for the format of + the returned dict. + """ + device = next(model.parameters()).device + feature = batch["inputs"] + assert feature.ndim == 3 + + feature = feature.to(device) + # at entry, feature is (N, T, C) + + supervisions = batch["supervisions"] + feature_lens = supervisions["num_frames"].to(device) + + if params.causal: + # this seems to cause insertions at the end of the utterance if used with zipformer. 
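+        # The tail padding below gives the causal (streaming-style) encoder some
+        # extra right context so the last real frames of the utterance are fully
+        # decoded.  Assuming the usual 10 ms frame shift, the 30 padded feature
+        # frames correspond to roughly 0.3 s; streaming_decode.py uses the same
+        # value via decode_stream.set_features(feature, tail_pad_len=30).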
+ pad_len = 30 + feature_lens += pad_len + feature = torch.nn.functional.pad( + feature, + pad=(0, 0, 0, pad_len), + value=LOG_EPS, + ) + + x, x_lens = model.encoder_embed(feature, feature_lens) + + src_key_padding_mask = make_pad_mask(x_lens) + x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) + + encoder_out, encoder_out_lens = model.encoder(x, x_lens, src_key_padding_mask) + encoder_out = encoder_out.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + + hyps = [] + + if params.decoding_method == "fast_beam_search": + hyp_tokens = fast_beam_search_one_best( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + blank_penalty=params.blank_penalty, + ) + for i in range(encoder_out.size(0)): + hyps.append([lexicon.token_table[idx] for idx in hyp_tokens[i]]) + elif params.decoding_method == "fast_beam_search_LG": + hyp_tokens = fast_beam_search_one_best( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + blank_penalty=params.blank_penalty, + ilme_scale=params.ilme_scale, + ) + for hyp in hyp_tokens: + sentence = "".join([lexicon.word_table[i] for i in hyp]) + hyps.append(list(sentence)) + elif params.decoding_method == "fast_beam_search_nbest_oracle": + hyp_tokens = fast_beam_search_nbest_oracle( + model=model, + decoding_graph=decoding_graph, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + beam=params.beam, + max_contexts=params.max_contexts, + max_states=params.max_states, + num_paths=params.num_paths, + ref_texts=graph_compiler.texts_to_ids(supervisions["text"]), + nbest_scale=params.nbest_scale, + blank_penalty=params.blank_penalty, + ) + for i in range(encoder_out.size(0)): + hyps.append([lexicon.token_table[idx] for idx in hyp_tokens[i]]) + elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1: + hyp_tokens = greedy_search_batch( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + blank_penalty=params.blank_penalty, + ) + for i in range(encoder_out.size(0)): + hyps.append([lexicon.token_table[idx] for idx in hyp_tokens[i]]) + elif params.decoding_method == "modified_beam_search": + hyp_tokens = modified_beam_search( + model=model, + encoder_out=encoder_out, + encoder_out_lens=encoder_out_lens, + blank_penalty=params.blank_penalty, + beam=params.beam_size, + ) + for i in range(encoder_out.size(0)): + hyps.append([lexicon.token_table[idx] for idx in hyp_tokens[i]]) + else: + batch_size = encoder_out.size(0) + + for i in range(batch_size): + # fmt: off + encoder_out_i = encoder_out[i:i + 1, :encoder_out_lens[i]] + # fmt: on + if params.decoding_method == "greedy_search": + hyp = greedy_search( + model=model, + encoder_out=encoder_out_i, + max_sym_per_frame=params.max_sym_per_frame, + blank_penalty=params.blank_penalty, + ) + elif params.decoding_method == "beam_search": + hyp = beam_search( + model=model, + encoder_out=encoder_out_i, + beam=params.beam_size, + blank_penalty=params.blank_penalty, + ) + else: + raise ValueError( + f"Unsupported decoding method: {params.decoding_method}" + ) + hyps.append([lexicon.token_table[idx] for idx in hyp]) + + key = f"blank_penalty_{params.blank_penalty}" + if params.decoding_method == "greedy_search": + return {"greedy_search_" + key: hyps} + elif "fast_beam_search" in params.decoding_method: + 
key += f"_beam_{params.beam}_" + key += f"max_contexts_{params.max_contexts}_" + key += f"max_states_{params.max_states}" + if "nbest" in params.decoding_method: + key += f"_num_paths_{params.num_paths}_" + key += f"nbest_scale_{params.nbest_scale}" + if "LG" in params.decoding_method: + key += f"_ilme_scale_{params.ilme_scale}" + key += f"_ngram_lm_scale_{params.ngram_lm_scale}" + + return {key: hyps} + else: + return {f"beam_size_{params.beam_size}_" + key: hyps} + + +def decode_dataset( + dl: torch.utils.data.DataLoader, + params: AttributeDict, + model: nn.Module, + lexicon: Lexicon, + graph_compiler: CharCtcTrainingGraphCompiler, + decoding_graph: Optional[k2.Fsa] = None, +) -> Dict[str, List[Tuple[List[str], List[str]]]]: + """Decode dataset. + + Args: + dl: + PyTorch's dataloader containing the dataset to decode. + params: + It is returned by :func:`get_params`. + model: + The neural model. + decoding_graph: + The decoding graph. Can be either a `k2.trivial_graph` or LG, Used + only when --decoding_method is fast_beam_search, fast_beam_search_nbest, + fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG. + Returns: + Return a dict, whose key may be "greedy_search" if greedy search + is used, or it may be "beam_7" if beam size of 7 is used. + Its value is a list of tuples. Each tuple contains two elements: + The first is the reference transcript, and the second is the + predicted result. + """ + num_cuts = 0 + + try: + num_batches = len(dl) + except TypeError: + num_batches = "?" + + if params.decoding_method == "greedy_search": + log_interval = 50 + else: + log_interval = 20 + + results = defaultdict(list) + for batch_idx, batch in enumerate(dl): + texts = batch["supervisions"]["text"] + texts = [list("".join(text.split())) for text in texts] + cut_ids = [cut.id for cut in batch["supervisions"]["cut"]] + + hyps_dict = decode_one_batch( + params=params, + model=model, + lexicon=lexicon, + graph_compiler=graph_compiler, + decoding_graph=decoding_graph, + batch=batch, + ) + + for name, hyps in hyps_dict.items(): + this_batch = [] + assert len(hyps) == len(texts) + for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts): + this_batch.append((cut_id, ref_text, hyp_words)) + + results[name].extend(this_batch) + + num_cuts += len(texts) + + if batch_idx % log_interval == 0: + batch_str = f"{batch_idx}/{num_batches}" + + logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}") + return results + + +def save_results( + params: AttributeDict, + test_set_name: str, + results_dict: Dict[str, List[Tuple[List[int], List[int]]]], +): + test_set_wers = dict() + for key, results in results_dict.items(): + recog_path = ( + params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt" + ) + results = sorted(results) + store_transcripts(filename=recog_path, texts=results) + logging.info(f"The transcripts are stored in {recog_path}") + + # The following prints out WERs, per-word error statistics and aligned + # ref/hyp pairs. 
+ errs_filename = ( + params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_filename, "w") as f: + wer = write_error_stats( + f, f"{test_set_name}-{key}", results, enable_log=True + ) + test_set_wers[key] = wer + + logging.info("Wrote detailed error stats to {}".format(errs_filename)) + + test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1]) + errs_info = ( + params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_info, "w") as f: + print("settings\tWER", file=f) + for key, val in test_set_wers: + print("{}\t{}".format(key, val), file=f) + + s = "\nFor {}, WER of different settings are:\n".format(test_set_name) + note = "\tbest for {}".format(test_set_name) + for key, val in test_set_wers: + s += "{}\t{}{}\n".format(key, val, note) + note = "" + logging.info(s) + + +@torch.no_grad() +def main(): + parser = get_parser() + WenetSpeechAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + params = get_params() + params.update(vars(args)) + + assert params.decoding_method in ( + "greedy_search", + "beam_search", + "modified_beam_search", + "fast_beam_search", + "fast_beam_search_LG", + "fast_beam_search_nbest_oracle", + ) + params.res_dir = params.exp_dir / params.decoding_method + + if params.iter > 0: + params.suffix = f"iter-{params.iter}-avg-{params.avg}" + else: + params.suffix = f"epoch-{params.epoch}-avg-{params.avg}" + + if params.causal: + assert ( + "," not in params.chunk_size + ), "chunk_size should be one value in decoding." + assert ( + "," not in params.left_context_frames + ), "left_context_frames should be one value in decoding." + params.suffix += f"-chunk-{params.chunk_size}" + params.suffix += f"-left-context-{params.left_context_frames}" + + if "fast_beam_search" in params.decoding_method: + params.suffix += f"-beam-{params.beam}" + params.suffix += f"-max-contexts-{params.max_contexts}" + params.suffix += f"-max-states-{params.max_states}" + if "nbest" in params.decoding_method: + params.suffix += f"-nbest-scale-{params.nbest_scale}" + params.suffix += f"-num-paths-{params.num_paths}" + if "LG" in params.decoding_method: + params.suffix += f"_ilme_scale_{params.ilme_scale}" + params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}" + elif "beam_search" in params.decoding_method: + params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}" + else: + params.suffix += f"-context-{params.context_size}" + params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}" + params.suffix += f"-blank-penalty-{params.blank_penalty}" + + if params.use_averaged_model: + params.suffix += "-use-averaged-model" + + setup_logger(f"{params.res_dir}/log-decode-{params.suffix}") + logging.info("Decoding started") + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + + logging.info(f"Device: {device}") + + lexicon = Lexicon(params.lang_dir) + params.blank_id = lexicon.token_table[""] + params.vocab_size = max(lexicon.tokens) + 1 + + graph_compiler = CharCtcTrainingGraphCompiler( + lexicon=lexicon, + device=device, + ) + + logging.info(params) + + logging.info("About to create model") + model = get_model(params) + + if not params.use_averaged_model: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif 
len(filenames) < params.avg: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + elif params.avg == 1: + load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model) + else: + start = params.epoch - params.avg + 1 + filenames = [] + for i in range(start, params.epoch + 1): + if i >= 1: + filenames.append(f"{params.exp_dir}/epoch-{i}.pt") + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + else: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + 1 + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg + 1: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + filename_start = filenames[-1] + filename_end = filenames[0] + logging.info( + "Calculating the averaged model over iteration checkpoints" + f" from {filename_start} (excluded) to {filename_end}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + else: + assert params.avg > 0, params.avg + start = params.epoch - params.avg + assert start >= 1, start + filename_start = f"{params.exp_dir}/epoch-{start}.pt" + filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt" + logging.info( + f"Calculating the averaged model over epoch range from " + f"{start} (excluded) to {params.epoch}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + + model.to(device) + model.eval() + + if "fast_beam_search" in params.decoding_method: + if "LG" in params.decoding_method: + lexicon = Lexicon(params.lang_dir) + lg_filename = params.lang_dir / "LG.pt" + logging.info(f"Loading {lg_filename}") + decoding_graph = k2.Fsa.from_dict( + torch.load(lg_filename, map_location=device) + ) + decoding_graph.scores *= params.ngram_lm_scale + else: + decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device) + else: + decoding_graph = None + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + # we need cut ids to display recognition results. + args.return_cuts = True + wenetspeech = WenetSpeechAsrDataModule(args) + + def remove_short_utt(c: Cut): + T = ((c.num_frames - 7) // 2 + 1) // 2 + if T <= 0: + logging.warning( + f"Exclude cut with ID {c.id} from decoding, num_frames : {c.num_frames}." 
+ ) + return T > 0 + + dev_cuts = wenetspeech.valid_cuts() + dev_cuts = dev_cuts.filter(remove_short_utt) + dev_dl = wenetspeech.valid_dataloaders(dev_cuts) + + test_net_cuts = wenetspeech.test_net_cuts() + test_net_cuts = test_net_cuts.filter(remove_short_utt) + test_net_dl = wenetspeech.test_dataloaders(test_net_cuts) + + test_meeting_cuts = wenetspeech.test_meeting_cuts() + test_meeting_cuts = test_meeting_cuts.filter(remove_short_utt) + test_meeting_dl = wenetspeech.test_dataloaders(test_meeting_cuts) + + test_sets = ["DEV", "TEST_NET", "TEST_MEETING"] + test_dls = [dev_dl, test_net_dl, test_meeting_dl] + + for test_set, test_dl in zip(test_sets, test_dls): + results_dict = decode_dataset( + dl=test_dl, + params=params, + model=model, + lexicon=lexicon, + graph_compiler=graph_compiler, + decoding_graph=decoding_graph, + ) + + save_results( + params=params, + test_set_name=test_set, + results_dict=results_dict, + ) + + logging.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/egs/wenetspeech/ASR/zipformer/decode_stream.py b/egs/wenetspeech/ASR/zipformer/decode_stream.py new file mode 120000 index 000000000..b8d8ddfc4 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/decode_stream.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/decode_stream.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/decoder.py b/egs/wenetspeech/ASR/zipformer/decoder.py new file mode 120000 index 000000000..5a8018680 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/decoder.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/decoder.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/encoder_interface.py b/egs/wenetspeech/ASR/zipformer/encoder_interface.py new file mode 120000 index 000000000..b9aa0ae08 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/encoder_interface.py @@ -0,0 +1 @@ +../pruned_transducer_stateless2/encoder_interface.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/export-onnx-streaming.py b/egs/wenetspeech/ASR/zipformer/export-onnx-streaming.py new file mode 120000 index 000000000..2962eb784 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/export-onnx-streaming.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/export-onnx-streaming.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/export-onnx.py b/egs/wenetspeech/ASR/zipformer/export-onnx.py new file mode 120000 index 000000000..70a15683c --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/export-onnx.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/export-onnx.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/export.py b/egs/wenetspeech/ASR/zipformer/export.py new file mode 120000 index 000000000..dfc1bec08 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/export.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/export.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/jit_pretrained.py b/egs/wenetspeech/ASR/zipformer/jit_pretrained.py new file mode 120000 index 000000000..25108391f --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/jit_pretrained.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/jit_pretrained.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/jit_pretrained_streaming.py b/egs/wenetspeech/ASR/zipformer/jit_pretrained_streaming.py new file mode 120000 index 000000000..1962351e9 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/jit_pretrained_streaming.py @@ -0,0 +1 @@ 
+../../../librispeech/ASR/zipformer/jit_pretrained_streaming.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/joiner.py b/egs/wenetspeech/ASR/zipformer/joiner.py new file mode 120000 index 000000000..5b8a36332 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/joiner.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/joiner.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/model.py b/egs/wenetspeech/ASR/zipformer/model.py new file mode 120000 index 000000000..cd7e07d72 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/model.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/model.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/onnx_check.py b/egs/wenetspeech/ASR/zipformer/onnx_check.py new file mode 120000 index 000000000..f3dd42004 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/onnx_check.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/onnx_check.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/onnx_decode.py b/egs/wenetspeech/ASR/zipformer/onnx_decode.py new file mode 100755 index 000000000..ed5f6db08 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/onnx_decode.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 +# +# Copyright 2021-2023 Xiaomi Corporation (Author: Fangjun Kuang, +# Zengwei Yao, +# Xiaoyu Yang, +# Wei Kang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script loads ONNX exported models and uses them to decode the test sets. + +We use the pre-trained model from +https://huggingface.co/pkufool/icefall-asr-zipformer-wenetspeech-20230615 +as an example to show how to use this file. + +1. Download the pre-trained model + +cd egs/wenetspeech/ASR + +repo_url=https://huggingface.co/pkufool/icefall-asr-zipformer-wenetspeech-20230615 +GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url +repo=$(basename $repo_url) + +pushd $repo +git lfs pull --include "data/lang_char/tokens.txt" +git lfs pull --include "exp/pretrained.pt" + +cd exp +ln -s pretrained.pt epoch-9999.pt +popd + +2. Export the model to ONNX + +./zipformer/export-onnx.py \ + --tokens $repo/data/lang_char/tokens.txt \ + --epoch 9999 \ + --avg 1 \ + --exp-dir $repo/exp/ + +It will generate the following 3 files inside $repo/exp: + + - encoder-epoch-9999-avg-1.onnx + - decoder-epoch-9999-avg-1.onnx + - joiner-epoch-9999-avg-1.onnx + +2. 
Run this file + +./zipformer/onnx_decode.py \ + --exp-dir ./zipformer/exp \ + --max-duration 600 \ + --encoder-model-filename $repo/exp/encoder-epoch-9999-avg-1.onnx \ + --decoder-model-filename $repo/exp/decoder-epoch-9999-avg-1.onnx \ + --joiner-model-filename $repo/exp/joiner-epoch-9999-avg-1.onnx \ +""" + + +import argparse +import logging +import time +from pathlib import Path +from typing import List, Tuple + +import k2 +import torch +import torch.nn as nn +from asr_datamodule import WenetSpeechAsrDataModule +from lhotse.cut import Cut +from onnx_pretrained import OnnxModel, greedy_search + +from icefall.utils import setup_logger, store_transcripts, write_error_stats + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--encoder-model-filename", + type=str, + required=True, + help="Path to the encoder onnx model. ", + ) + + parser.add_argument( + "--decoder-model-filename", + type=str, + required=True, + help="Path to the decoder onnx model. ", + ) + + parser.add_argument( + "--joiner-model-filename", + type=str, + required=True, + help="Path to the joiner onnx model. ", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="pruned_transducer_stateless7/exp", + help="The experiment dir", + ) + + parser.add_argument( + "--tokens", + type=str, + default="data/lang_char/tokens.txt", + help="Path to the tokens.txt", + ) + + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="Valid values are greedy_search and modified_beam_search", + ) + + return parser + + +def decode_one_batch( + model: OnnxModel, token_table: k2.SymbolTable, batch: dict +) -> List[List[str]]: + """Decode one batch and return the result. + Currently it only greedy_search is supported. + + Args: + model: + The neural model. + token_table: + Mapping ids to tokens. + batch: + It is the return value from iterating + `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation + for the format of the `batch`. + + Returns: + Return the decoded results for each utterance. + """ + feature = batch["inputs"] + assert feature.ndim == 3 + # at entry, feature is (N, T, C) + + supervisions = batch["supervisions"] + feature_lens = supervisions["num_frames"].to(dtype=torch.int64) + + encoder_out, encoder_out_lens = model.run_encoder(x=feature, x_lens=feature_lens) + + hyps = greedy_search( + model=model, encoder_out=encoder_out, encoder_out_lens=encoder_out_lens + ) + + hyps = [[token_table[h] for h in hyp] for hyp in hyps] + return hyps + + +def decode_dataset( + dl: torch.utils.data.DataLoader, + model: nn.Module, + token_table: k2.SymbolTable, +) -> Tuple[List[Tuple[str, List[str], List[str]]], float]: + """Decode dataset. + + Args: + dl: + PyTorch's dataloader containing the dataset to decode. + model: + The neural model. + token_table: + Mapping ids to tokens. + + Returns: + - A list of tuples. Each tuple contains three elements: + - cut_id, + - reference transcript, + - predicted result. + - The total duration (in seconds) of the dataset. + """ + num_cuts = 0 + + try: + num_batches = len(dl) + except TypeError: + num_batches = "?" 
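+    # total_duration below accumulates the audio length (in seconds) of every
+    # decoded cut; main() divides the wall-clock decoding time by it to report
+    # the real-time factor, e.g. an RTF of 0.1 means decoding runs roughly ten
+    # times faster than real time.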
+ + log_interval = 10 + total_duration = 0 + + results = [] + for batch_idx, batch in enumerate(dl): + texts = batch["supervisions"]["text"] + cut_ids = [cut.id for cut in batch["supervisions"]["cut"]] + total_duration += sum([cut.duration for cut in batch["supervisions"]["cut"]]) + + hyps = decode_one_batch(model=model, token_table=token_table, batch=batch) + + this_batch = [] + assert len(hyps) == len(texts) + for cut_id, hyp_words, ref_text in zip(cut_ids, hyps, texts): + ref_words = list(ref_text) + this_batch.append((cut_id, ref_words, hyp_words)) + + results.extend(this_batch) + + num_cuts += len(texts) + + if batch_idx % log_interval == 0: + batch_str = f"{batch_idx}/{num_batches}" + + logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}") + + return results, total_duration + + +def save_results( + res_dir: Path, + test_set_name: str, + results: List[Tuple[str, List[str], List[str]]], +): + recog_path = res_dir / f"recogs-{test_set_name}.txt" + results = sorted(results) + store_transcripts(filename=recog_path, texts=results) + logging.info(f"The transcripts are stored in {recog_path}") + + # The following prints out WERs, per-word error statistics and aligned + # ref/hyp pairs. + errs_filename = res_dir / f"errs-{test_set_name}.txt" + with open(errs_filename, "w") as f: + wer = write_error_stats(f, f"{test_set_name}", results, enable_log=True) + + logging.info("Wrote detailed error stats to {}".format(errs_filename)) + + errs_info = res_dir / f"wer-summary-{test_set_name}.txt" + with open(errs_info, "w") as f: + print("WER", file=f) + print(wer, file=f) + + s = "\nFor {}, WER is {}:\n".format(test_set_name, wer) + logging.info(s) + + +@torch.no_grad() +def main(): + parser = get_parser() + WenetSpeechAsrDataModule.add_arguments(parser) + args = parser.parse_args() + + assert ( + args.decoding_method == "greedy_search" + ), "Only supports greedy_search currently." + res_dir = Path(args.exp_dir) / f"onnx-{args.decoding_method}" + + setup_logger(f"{res_dir}/log-decode") + logging.info("Decoding started") + + device = torch.device("cpu") + logging.info(f"Device: {device}") + + token_table = k2.SymbolTable.from_file(args.tokens) + assert token_table[0] == "" + + logging.info(vars(args)) + + logging.info("About to create model") + model = OnnxModel( + encoder_model_filename=args.encoder_model_filename, + decoder_model_filename=args.decoder_model_filename, + joiner_model_filename=args.joiner_model_filename, + ) + + # we need cut ids to display recognition results. + args.return_cuts = True + + wenetspeech = WenetSpeechAsrDataModule(args) + + def remove_short_utt(c: Cut): + T = ((c.num_frames - 7) // 2 + 1) // 2 + if T <= 0: + logging.warning( + f"Exclude cut with ID {c.id} from decoding, num_frames : {c.num_frames}." 
+ ) + return T > 0 + + dev_cuts = wenetspeech.valid_cuts() + dev_cuts = dev_cuts.filter(remove_short_utt) + dev_dl = wenetspeech.valid_dataloaders(dev_cuts) + + test_net_cuts = wenetspeech.test_net_cuts() + test_net_cuts = test_net_cuts.filter(remove_short_utt) + test_net_dl = wenetspeech.test_dataloaders(test_net_cuts) + + test_meeting_cuts = wenetspeech.test_meeting_cuts() + test_meeting_cuts = test_meeting_cuts.filter(remove_short_utt) + test_meeting_dl = wenetspeech.test_dataloaders(test_meeting_cuts) + + test_sets = ["DEV", "TEST_NET", "TEST_MEETING"] + test_dl = [dev_dl, test_net_dl, test_meeting_dl] + + for test_set, test_dl in zip(test_sets, test_dl): + start_time = time.time() + results, total_duration = decode_dataset( + dl=test_dl, model=model, token_table=token_table + ) + end_time = time.time() + elapsed_seconds = end_time - start_time + rtf = elapsed_seconds / total_duration + + logging.info(f"Elapsed time: {elapsed_seconds:.3f} s") + logging.info(f"Wave duration: {total_duration:.3f} s") + logging.info( + f"Real time factor (RTF): {elapsed_seconds:.3f}/{total_duration:.3f} = {rtf:.3f}" + ) + + save_results(res_dir=res_dir, test_set_name=test_set, results=results) + + logging.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/egs/wenetspeech/ASR/zipformer/onnx_pretrained-streaming.py b/egs/wenetspeech/ASR/zipformer/onnx_pretrained-streaming.py new file mode 120000 index 000000000..cfea104c2 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/onnx_pretrained-streaming.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/onnx_pretrained-streaming.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/onnx_pretrained.py b/egs/wenetspeech/ASR/zipformer/onnx_pretrained.py new file mode 120000 index 000000000..8f32f4ee7 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/onnx_pretrained.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/onnx_pretrained.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/optim.py b/egs/wenetspeech/ASR/zipformer/optim.py new file mode 120000 index 000000000..5eaa3cffd --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/optim.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/optim.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/pretrained.py b/egs/wenetspeech/ASR/zipformer/pretrained.py new file mode 120000 index 000000000..0bd71dde4 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/pretrained.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/pretrained.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/scaling.py b/egs/wenetspeech/ASR/zipformer/scaling.py new file mode 120000 index 000000000..6f398f431 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/scaling.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/scaling.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/scaling_converter.py b/egs/wenetspeech/ASR/zipformer/scaling_converter.py new file mode 120000 index 000000000..b0ecee05e --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/scaling_converter.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/scaling_converter.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/streaming_beam_search.py b/egs/wenetspeech/ASR/zipformer/streaming_beam_search.py new file mode 120000 index 000000000..b1ed54557 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/streaming_beam_search.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/streaming_beam_search.py \ No newline at end of file diff --git 
a/egs/wenetspeech/ASR/zipformer/streaming_decode.py b/egs/wenetspeech/ASR/zipformer/streaming_decode.py new file mode 100755 index 000000000..94c5fae5f --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/streaming_decode.py @@ -0,0 +1,881 @@ +#!/usr/bin/env python3 +# Copyright 2022-2023 Xiaomi Corporation (Authors: Wei Kang, +# Fangjun Kuang, +# Zengwei Yao) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Usage: +./zipformer/streaming_decode.py \ + --epoch 28 \ + --avg 15 \ + --causal 1 \ + --chunk-size 16 \ + --left-context-frames 256 \ + --exp-dir ./zipformer/exp \ + --decoding-method greedy_search \ + --num-decode-streams 2000 +""" + +import argparse +import logging +import math +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import k2 +import numpy as np +import torch +from asr_datamodule import WenetSpeechAsrDataModule +from decode_stream import DecodeStream +from kaldifeat import Fbank, FbankOptions +from lhotse import CutSet +from streaming_beam_search import ( + fast_beam_search_one_best, + greedy_search, + modified_beam_search, +) +from torch import Tensor, nn +from torch.nn.utils.rnn import pad_sequence +from train import add_model_arguments, get_model, get_params + +from icefall.checkpoint import ( + average_checkpoints, + average_checkpoints_with_averaged_model, + find_checkpoints, + load_checkpoint, +) +from icefall.lexicon import Lexicon +from icefall.utils import ( + AttributeDict, + make_pad_mask, + setup_logger, + store_transcripts, + str2bool, + write_error_stats, +) + +LOG_EPS = math.log(1e-10) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--epoch", + type=int, + default=28, + help="""It specifies the checkpoint to use for decoding. + Note: Epoch counts from 1. + You can specify --avg to use more checkpoints for model averaging.""", + ) + + parser.add_argument( + "--iter", + type=int, + default=0, + help="""If positive, --epoch is ignored and it + will use the checkpoint exp_dir/checkpoint-iter.pt. + You can specify --avg to use more checkpoints for model averaging. + """, + ) + + parser.add_argument( + "--avg", + type=int, + default=15, + help="Number of checkpoints to average. Automatically select " + "consecutive checkpoints before the checkpoint specified by " + "'--epoch' and '--iter'", + ) + + parser.add_argument( + "--use-averaged-model", + type=str2bool, + default=True, + help="Whether to load averaged model. Currently it only supports " + "using --epoch. If True, it would decode with the averaged model " + "over the epoch range from `epoch-avg` (excluded) to `epoch`." + "Actually only the models with epoch number of `epoch-avg` and " + "`epoch` are loaded for averaging. 
", + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="zipformer/exp", + help="The experiment dir", + ) + + parser.add_argument( + "--lang-dir", + type=str, + default="data/lang_char", + help="Path to the lang dir(containing lexicon, tokens, etc.)", + ) + + parser.add_argument( + "--decoding-method", + type=str, + default="greedy_search", + help="""Supported decoding methods are: + greedy_search + modified_beam_search + fast_beam_search + """, + ) + + parser.add_argument( + "--num_active_paths", + type=int, + default=4, + help="""An interger indicating how many candidates we will keep for each + frame. Used only when --decoding-method is modified_beam_search.""", + ) + + parser.add_argument( + "--beam", + type=float, + default=4, + help="""A floating point value to calculate the cutoff score during beam + search (i.e., `cutoff = max-score - beam`), which is the same as the + `beam` in Kaldi. + Used only when --decoding-method is fast_beam_search""", + ) + + parser.add_argument( + "--max-contexts", + type=int, + default=4, + help="""Used only when --decoding-method is + fast_beam_search""", + ) + + parser.add_argument( + "--max-states", + type=int, + default=32, + help="""Used only when --decoding-method is + fast_beam_search""", + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="The context size in the decoder. 1 means bigram; 2 means tri-gram", + ) + + parser.add_argument( + "--blank-penalty", + type=float, + default=0.0, + help=""" + The penalty applied on blank symbol during decoding. + Note: It is a positive value that would be applied to logits like + this `logits[:, 0] -= blank_penalty` (suppose logits.shape is + [batch_size, vocab] and blank id is 0). + """, + ) + + parser.add_argument( + "--num-decode-streams", + type=int, + default=2000, + help="The number of streams that can be decoded parallel.", + ) + + add_model_arguments(parser) + + return parser + + +def get_init_states( + model: nn.Module, + batch_size: int = 1, + device: torch.device = torch.device("cpu"), +) -> List[torch.Tensor]: + """ + Returns a list of cached tensors of all encoder layers. For layer-i, states[i*6:(i+1)*6] + is (cached_key, cached_nonlin_attn, cached_val1, cached_val2, cached_conv1, cached_conv2). + states[-2] is the cached left padding for ConvNeXt module, + of shape (batch_size, num_channels, left_pad, num_freqs) + states[-1] is processed_lens of shape (batch,), which records the number + of processed frames (at 50hz frame rate, after encoder_embed) for each sample in batch. + """ + states = model.encoder.get_init_states(batch_size, device) + + embed_states = model.encoder_embed.get_init_states(batch_size, device) + states.append(embed_states) + + processed_lens = torch.zeros(batch_size, dtype=torch.int32, device=device) + states.append(processed_lens) + + return states + + +def stack_states(state_list: List[List[torch.Tensor]]) -> List[torch.Tensor]: + """Stack list of zipformer states that correspond to separate utterances + into a single emformer state, so that it can be used as an input for + zipformer when those utterances are formed into a batch. + + Args: + state_list: + Each element in state_list corresponding to the internal state + of the zipformer model for a single utterance. For element-n, + state_list[n] is a list of cached tensors of all encoder layers. For layer-i, + state_list[n][i*6:(i+1)*6] is (cached_key, cached_nonlin_attn, cached_val1, + cached_val2, cached_conv1, cached_conv2). 
+ state_list[n][-2] is the cached left padding for ConvNeXt module, + of shape (batch_size, num_channels, left_pad, num_freqs) + state_list[n][-1] is processed_lens of shape (batch,), which records the number + of processed frames (at 50hz frame rate, after encoder_embed) for each sample in batch. + + Note: + It is the inverse of :func:`unstack_states`. + """ + batch_size = len(state_list) + assert (len(state_list[0]) - 2) % 6 == 0, len(state_list[0]) + tot_num_layers = (len(state_list[0]) - 2) // 6 + + batch_states = [] + for layer in range(tot_num_layers): + layer_offset = layer * 6 + # cached_key: (left_context_len, batch_size, key_dim) + cached_key = torch.cat( + [state_list[i][layer_offset] for i in range(batch_size)], dim=1 + ) + # cached_nonlin_attn: (num_heads, batch_size, left_context_len, head_dim) + cached_nonlin_attn = torch.cat( + [state_list[i][layer_offset + 1] for i in range(batch_size)], dim=1 + ) + # cached_val1: (left_context_len, batch_size, value_dim) + cached_val1 = torch.cat( + [state_list[i][layer_offset + 2] for i in range(batch_size)], dim=1 + ) + # cached_val2: (left_context_len, batch_size, value_dim) + cached_val2 = torch.cat( + [state_list[i][layer_offset + 3] for i in range(batch_size)], dim=1 + ) + # cached_conv1: (#batch, channels, left_pad) + cached_conv1 = torch.cat( + [state_list[i][layer_offset + 4] for i in range(batch_size)], dim=0 + ) + # cached_conv2: (#batch, channels, left_pad) + cached_conv2 = torch.cat( + [state_list[i][layer_offset + 5] for i in range(batch_size)], dim=0 + ) + batch_states += [ + cached_key, + cached_nonlin_attn, + cached_val1, + cached_val2, + cached_conv1, + cached_conv2, + ] + + cached_embed_left_pad = torch.cat( + [state_list[i][-2] for i in range(batch_size)], dim=0 + ) + batch_states.append(cached_embed_left_pad) + + processed_lens = torch.cat([state_list[i][-1] for i in range(batch_size)], dim=0) + batch_states.append(processed_lens) + + return batch_states + + +def unstack_states(batch_states: List[Tensor]) -> List[List[Tensor]]: + """Unstack the zipformer state corresponding to a batch of utterances + into a list of states, where the i-th entry is the state from the i-th + utterance in the batch. + + Note: + It is the inverse of :func:`stack_states`. + + Args: + batch_states: A list of cached tensors of all encoder layers. For layer-i, + states[i*6:(i+1)*6] is (cached_key, cached_nonlin_attn, cached_val1, cached_val2, + cached_conv1, cached_conv2). + state_list[-2] is the cached left padding for ConvNeXt module, + of shape (batch_size, num_channels, left_pad, num_freqs) + states[-1] is processed_lens of shape (batch,), which records the number + of processed frames (at 50hz frame rate, after encoder_embed) for each sample in batch. + + Returns: + state_list: A list of list. Each element in state_list corresponding to the internal state + of the zipformer model for a single utterance. 
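+
+    A round-trip sketch (the variable names here are only illustrative):
+
+        batch_states = stack_states([states_a, states_b])  # two streams
+        states_a2, states_b2 = unstack_states(batch_states)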
+ """ + assert (len(batch_states) - 2) % 6 == 0, len(batch_states) + tot_num_layers = (len(batch_states) - 2) // 6 + + processed_lens = batch_states[-1] + batch_size = processed_lens.shape[0] + + state_list = [[] for _ in range(batch_size)] + + for layer in range(tot_num_layers): + layer_offset = layer * 6 + # cached_key: (left_context_len, batch_size, key_dim) + cached_key_list = batch_states[layer_offset].chunk(chunks=batch_size, dim=1) + # cached_nonlin_attn: (num_heads, batch_size, left_context_len, head_dim) + cached_nonlin_attn_list = batch_states[layer_offset + 1].chunk( + chunks=batch_size, dim=1 + ) + # cached_val1: (left_context_len, batch_size, value_dim) + cached_val1_list = batch_states[layer_offset + 2].chunk( + chunks=batch_size, dim=1 + ) + # cached_val2: (left_context_len, batch_size, value_dim) + cached_val2_list = batch_states[layer_offset + 3].chunk( + chunks=batch_size, dim=1 + ) + # cached_conv1: (#batch, channels, left_pad) + cached_conv1_list = batch_states[layer_offset + 4].chunk( + chunks=batch_size, dim=0 + ) + # cached_conv2: (#batch, channels, left_pad) + cached_conv2_list = batch_states[layer_offset + 5].chunk( + chunks=batch_size, dim=0 + ) + for i in range(batch_size): + state_list[i] += [ + cached_key_list[i], + cached_nonlin_attn_list[i], + cached_val1_list[i], + cached_val2_list[i], + cached_conv1_list[i], + cached_conv2_list[i], + ] + + cached_embed_left_pad_list = batch_states[-2].chunk(chunks=batch_size, dim=0) + for i in range(batch_size): + state_list[i].append(cached_embed_left_pad_list[i]) + + processed_lens_list = batch_states[-1].chunk(chunks=batch_size, dim=0) + for i in range(batch_size): + state_list[i].append(processed_lens_list[i]) + + return state_list + + +def streaming_forward( + features: Tensor, + feature_lens: Tensor, + model: nn.Module, + states: List[Tensor], + chunk_size: int, + left_context_len: int, +) -> Tuple[Tensor, Tensor, List[Tensor]]: + """ + Returns encoder outputs, output lengths, and updated states. 
+ """ + cached_embed_left_pad = states[-2] + (x, x_lens, new_cached_embed_left_pad,) = model.encoder_embed.streaming_forward( + x=features, + x_lens=feature_lens, + cached_left_pad=cached_embed_left_pad, + ) + assert x.size(1) == chunk_size, (x.size(1), chunk_size) + + src_key_padding_mask = make_pad_mask(x_lens) + + # processed_mask is used to mask out initial states + processed_mask = torch.arange(left_context_len, device=x.device).expand( + x.size(0), left_context_len + ) + processed_lens = states[-1] # (batch,) + # (batch, left_context_size) + processed_mask = (processed_lens.unsqueeze(1) <= processed_mask).flip(1) + # Update processed lengths + new_processed_lens = processed_lens + x_lens + + # (batch, left_context_size + chunk_size) + src_key_padding_mask = torch.cat([processed_mask, src_key_padding_mask], dim=1) + + x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) + encoder_states = states[:-2] + ( + encoder_out, + encoder_out_lens, + new_encoder_states, + ) = model.encoder.streaming_forward( + x=x, + x_lens=x_lens, + states=encoder_states, + src_key_padding_mask=src_key_padding_mask, + ) + encoder_out = encoder_out.permute(1, 0, 2) # (T, N, C) ->(N, T, C) + + new_states = new_encoder_states + [ + new_cached_embed_left_pad, + new_processed_lens, + ] + return encoder_out, encoder_out_lens, new_states + + +def decode_one_chunk( + params: AttributeDict, + model: nn.Module, + decode_streams: List[DecodeStream], +) -> List[int]: + """Decode one chunk frames of features for each decode_streams and + return the indexes of finished streams in a List. + + Args: + params: + It's the return value of :func:`get_params`. + model: + The neural model. + decode_streams: + A List of DecodeStream, each belonging to a utterance. + Returns: + Return a List containing which DecodeStreams are finished. + """ + device = model.device + chunk_size = int(params.chunk_size) + left_context_len = int(params.left_context_frames) + + features = [] + feature_lens = [] + states = [] + processed_lens = [] # Used in fast-beam-search + + for stream in decode_streams: + feat, feat_len = stream.get_feature_frames(chunk_size * 2) + features.append(feat) + feature_lens.append(feat_len) + states.append(stream.states) + processed_lens.append(stream.done_frames) + + feature_lens = torch.tensor(feature_lens, device=device) + features = pad_sequence(features, batch_first=True, padding_value=LOG_EPS) + + # Make sure the length after encoder_embed is at least 1. 
+ # The encoder_embed subsample features (T - 7) // 2 + # The ConvNeXt module needs (7 - 1) // 2 = 3 frames of right padding after subsampling + tail_length = chunk_size * 2 + 7 + 2 * 3 + if features.size(1) < tail_length: + pad_length = tail_length - features.size(1) + feature_lens += pad_length + features = torch.nn.functional.pad( + features, + (0, 0, 0, pad_length), + mode="constant", + value=LOG_EPS, + ) + + states = stack_states(states) + + encoder_out, encoder_out_lens, new_states = streaming_forward( + features=features, + feature_lens=feature_lens, + model=model, + states=states, + chunk_size=chunk_size, + left_context_len=left_context_len, + ) + + encoder_out = model.joiner.encoder_proj(encoder_out) + + if params.decoding_method == "greedy_search": + greedy_search( + model=model, + encoder_out=encoder_out, + streams=decode_streams, + blank_penalty=params.blank_penalty, + ) + elif params.decoding_method == "fast_beam_search": + processed_lens = torch.tensor(processed_lens, device=device) + processed_lens = processed_lens + encoder_out_lens + fast_beam_search_one_best( + model=model, + encoder_out=encoder_out, + processed_lens=processed_lens, + streams=decode_streams, + beam=params.beam, + max_states=params.max_states, + max_contexts=params.max_contexts, + blank_penalty=params.blank_penalty, + ) + elif params.decoding_method == "modified_beam_search": + modified_beam_search( + model=model, + streams=decode_streams, + encoder_out=encoder_out, + num_active_paths=params.num_active_paths, + blank_penalty=params.blank_penalty, + ) + else: + raise ValueError(f"Unsupported decoding method: {params.decoding_method}") + + states = unstack_states(new_states) + + finished_streams = [] + for i in range(len(decode_streams)): + decode_streams[i].states = states[i] + decode_streams[i].done_frames += encoder_out_lens[i] + if decode_streams[i].done: + finished_streams.append(i) + + return finished_streams + + +def decode_dataset( + cuts: CutSet, + params: AttributeDict, + model: nn.Module, + lexicon: Lexicon, + decoding_graph: Optional[k2.Fsa] = None, +) -> Dict[str, List[Tuple[List[str], List[str]]]]: + """Decode dataset. + + Args: + cuts: + Lhotse Cutset containing the dataset to decode. + params: + It is returned by :func:`get_params`. + model: + The neural model. + lexicon: + The Lexicon. + decoding_graph: + The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used + only when --decoding_method is fast_beam_search. + Returns: + Return a dict, whose key may be "greedy_search" if greedy search + is used, or it may be "beam_7" if beam size of 7 is used. + Its value is a list of tuples. Each tuple contains two elements: + The first is the reference transcript, and the second is the + predicted result. + """ + device = model.device + + opts = FbankOptions() + opts.device = device + opts.frame_opts.dither = 0 + opts.frame_opts.snip_edges = False + opts.frame_opts.samp_freq = 16000 + opts.mel_opts.num_bins = 80 + + log_interval = 100 + + decode_results = [] + # Contain decode streams currently running. + decode_streams = [] + for num, cut in enumerate(cuts): + # each utterance has a DecodeStream. 
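+        # Streaming is simulated: the cut's full fbank features are computed up
+        # front and attached to the stream, and decode_one_chunk() later consumes
+        # them chunk_size * 2 feature frames at a time.  Streams are collected
+        # until --num-decode-streams of them are active; they are then decoded
+        # chunk by chunk in batches, and finished streams are removed in the
+        # loop below.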
+ initial_states = get_init_states(model=model, batch_size=1, device=device) + decode_stream = DecodeStream( + params=params, + cut_id=cut.id, + initial_states=initial_states, + decoding_graph=decoding_graph, + device=device, + ) + + audio: np.ndarray = cut.load_audio() + # audio.shape: (1, num_samples) + assert len(audio.shape) == 2 + assert audio.shape[0] == 1, "Should be single channel" + assert audio.dtype == np.float32, audio.dtype + + # The trained model is using normalized samples + if audio.max() > 1: + logging.warning( + f"The audio should be normalized to [-1, 1], audio.max : {audio.max()}." + f"Clipping to [-1, 1]." + ) + audio = np.clip(audio, -1, 1) + + samples = torch.from_numpy(audio).squeeze(0) + + fbank = Fbank(opts) + feature = fbank(samples.to(device)) + decode_stream.set_features(feature, tail_pad_len=30) + decode_stream.ground_truth = cut.supervisions[0].text + + decode_streams.append(decode_stream) + + while len(decode_streams) >= params.num_decode_streams: + finished_streams = decode_one_chunk( + params=params, model=model, decode_streams=decode_streams + ) + for i in sorted(finished_streams, reverse=True): + decode_results.append( + ( + decode_streams[i].id, + list(decode_streams[i].ground_truth.strip()), + [ + lexicon.token_table[idx] + for idx in decode_streams[i].decoding_result() + ], + ) + ) + del decode_streams[i] + + if num % log_interval == 0: + logging.info(f"Cuts processed until now is {num}.") + + # decode final chunks of last sequences + while len(decode_streams): + finished_streams = decode_one_chunk( + params=params, model=model, decode_streams=decode_streams + ) + for i in sorted(finished_streams, reverse=True): + decode_results.append( + ( + decode_streams[i].id, + decode_streams[i].ground_truth.split(), + [ + lexicon.token_table[idx] + for idx in decode_streams[i].decoding_result() + ], + ) + ) + del decode_streams[i] + + key = f"blank_penalty_{params.blank_penalty}" + if params.decoding_method == "greedy_search": + key = f"greedy_search_{key}" + elif params.decoding_method == "fast_beam_search": + key = ( + f"beam_{params.beam}_" + f"max_contexts_{params.max_contexts}_" + f"max_states_{params.max_states}_{key}" + ) + elif params.decoding_method == "modified_beam_search": + key = f"num_active_paths_{params.num_active_paths}_{key}" + else: + raise ValueError(f"Unsupported decoding method: {params.decoding_method}") + return {key: decode_results} + + +def save_results( + params: AttributeDict, + test_set_name: str, + results_dict: Dict[str, List[Tuple[List[str], List[str]]]], +): + test_set_wers = dict() + for key, results in results_dict.items(): + recog_path = ( + params.res_dir / f"recogs-{test_set_name}-{key}-{params.suffix}.txt" + ) + results = sorted(results) + store_transcripts(filename=recog_path, texts=results) + logging.info(f"The transcripts are stored in {recog_path}") + + # The following prints out WERs, per-word error statistics and aligned + # ref/hyp pairs. 
+ errs_filename = ( + params.res_dir / f"errs-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_filename, "w") as f: + wer = write_error_stats( + f, f"{test_set_name}-{key}", results, enable_log=True + ) + test_set_wers[key] = wer + + logging.info("Wrote detailed error stats to {}".format(errs_filename)) + + test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1]) + errs_info = ( + params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt" + ) + with open(errs_info, "w") as f: + print("settings\tWER", file=f) + for key, val in test_set_wers: + print("{}\t{}".format(key, val), file=f) + + s = "\nFor {}, WER of different settings are:\n".format(test_set_name) + note = "\tbest for {}".format(test_set_name) + for key, val in test_set_wers: + s += "{}\t{}{}\n".format(key, val, note) + note = "" + logging.info(s) + + +@torch.no_grad() +def main(): + parser = get_parser() + WenetSpeechAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.exp_dir = Path(args.exp_dir) + + params = get_params() + params.update(vars(args)) + + params.res_dir = params.exp_dir / "streaming" / params.decoding_method + + if params.iter > 0: + params.suffix = f"iter-{params.iter}-avg-{params.avg}" + else: + params.suffix = f"epoch-{params.epoch}-avg-{params.avg}" + + assert params.causal, params.causal + assert "," not in params.chunk_size, "chunk_size should be one value in decoding." + assert ( + "," not in params.left_context_frames + ), "left_context_frames should be one value in decoding." + params.suffix += f"-chunk-{params.chunk_size}" + params.suffix += f"-left-context-{params.left_context_frames}" + params.suffix += f"-blank-penalty-{params.blank_penalty}" + + # for fast_beam_search + if params.decoding_method == "fast_beam_search": + params.suffix += f"-beam-{params.beam}" + params.suffix += f"-max-contexts-{params.max_contexts}" + params.suffix += f"-max-states-{params.max_states}" + + if params.use_averaged_model: + params.suffix += "-use-averaged-model" + + setup_logger(f"{params.res_dir}/log-decode-{params.suffix}") + logging.info("Decoding started") + + device = torch.device("cpu") + if torch.cuda.is_available(): + device = torch.device("cuda", 0) + + logging.info(f"Device: {device}") + + lexicon = Lexicon(params.lang_dir) + params.blank_id = lexicon.token_table[""] + params.vocab_size = max(lexicon.tokens) + 1 + + logging.info(params) + + logging.info("About to create model") + model = get_model(params) + + if not params.use_averaged_model: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[ + : params.avg + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + elif params.avg == 1: + load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model) + else: + start = params.epoch - params.avg + 1 + filenames = [] + for i in range(start, params.epoch + 1): + if start >= 0: + filenames.append(f"{params.exp_dir}/epoch-{i}.pt") + logging.info(f"averaging {filenames}") + model.to(device) + model.load_state_dict(average_checkpoints(filenames, device=device)) + else: + if params.iter > 0: + filenames = find_checkpoints(params.exp_dir, 
iteration=-params.iter)[ + : params.avg + 1 + ] + if len(filenames) == 0: + raise ValueError( + f"No checkpoints found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + elif len(filenames) < params.avg + 1: + raise ValueError( + f"Not enough checkpoints ({len(filenames)}) found for" + f" --iter {params.iter}, --avg {params.avg}" + ) + filename_start = filenames[-1] + filename_end = filenames[0] + logging.info( + "Calculating the averaged model over iteration checkpoints" + f" from {filename_start} (excluded) to {filename_end}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + else: + assert params.avg > 0, params.avg + start = params.epoch - params.avg + assert start >= 1, start + filename_start = f"{params.exp_dir}/epoch-{start}.pt" + filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt" + logging.info( + f"Calculating the averaged model over epoch range from " + f"{start} (excluded) to {params.epoch}" + ) + model.to(device) + model.load_state_dict( + average_checkpoints_with_averaged_model( + filename_start=filename_start, + filename_end=filename_end, + device=device, + ) + ) + + model.to(device) + model.eval() + model.device = device + + decoding_graph = None + if params.decoding_method == "fast_beam_search": + decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device) + + num_param = sum([p.numel() for p in model.parameters()]) + logging.info(f"Number of model parameters: {num_param}") + + wenetspeech = WenetSpeechAsrDataModule(args) + + dev_cuts = wenetspeech.valid_cuts() + test_net_cuts = wenetspeech.test_net_cuts() + test_meeting_cuts = wenetspeech.test_meeting_cuts() + + test_sets = ["DEV", "TEST_NET", "TEST_MEETING"] + test_cuts = [dev_cuts, test_net_cuts, test_meeting_cuts] + + for test_set, test_cut in zip(test_sets, test_cuts): + results_dict = decode_dataset( + cuts=test_cut, + params=params, + model=model, + lexicon=lexicon, + decoding_graph=decoding_graph, + ) + save_results( + params=params, + test_set_name=test_set, + results_dict=results_dict, + ) + + logging.info("Done!") + + +if __name__ == "__main__": + main() diff --git a/egs/wenetspeech/ASR/zipformer/subsampling.py b/egs/wenetspeech/ASR/zipformer/subsampling.py new file mode 120000 index 000000000..01ae9002c --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/subsampling.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/subsampling.py \ No newline at end of file diff --git a/egs/wenetspeech/ASR/zipformer/train.py b/egs/wenetspeech/ASR/zipformer/train.py new file mode 100755 index 000000000..83dbfa22f --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/train.py @@ -0,0 +1,1350 @@ +#!/usr/bin/env python3 +# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang, +# Wei Kang, +# Mingshuang Luo, +# Zengwei Yao, +# Daniel Povey) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
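A note on the checkpoint handling in streaming_decode.py above (the new train.py that follows is what writes the epoch-*.pt files and the stored model_avg it consumes): with --use-averaged-model, --epoch and --avg select an excluded starting checkpoint, and the averaged parameters are recovered from the running averages saved inside the two endpoint checkpoints rather than by summing state dicts. A small numeric sketch, with the epoch/avg values and the exp dir chosen only for illustration:

# Illustrative values; the arithmetic mirrors the assertions in main() above.
epoch, avg = 12, 4
start = epoch - avg                                  # 8, must be >= 1
filename_start = f"zipformer/exp/epoch-{start}.pt"   # excluded endpoint
filename_end = f"zipformer/exp/epoch-{epoch}.pt"     # included endpoint
# average_checkpoints_with_averaged_model() then reconstructs the mean of the
# parameters seen between these two checkpoints.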
+""" +Usage: + +export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" + +./zipformer/train.py \ + --world-size 8 \ + --num-epochs 12 \ + --start-epoch 1 \ + --exp-dir zipformer/exp \ + --training-subset L + --lr-epochs 1.5 \ + --max-duration 350 + +# For mix precision training: + +./zipformer/train.py \ + --world-size 8 \ + --num-epochs 12 \ + --start-epoch 1 \ + --use-fp16 1 \ + --exp-dir zipformer/exp \ + --training-subset L \ + --lr-epochs 1.5 \ + --max-duration 750 + +""" + + +import argparse +import copy +import logging +import warnings +from pathlib import Path +from shutil import copyfile +from typing import Any, Dict, Optional, Tuple, Union + +import k2 +import optim +import torch +import torch.multiprocessing as mp +import torch.nn as nn +from asr_datamodule import WenetSpeechAsrDataModule +from decoder import Decoder +from joiner import Joiner +from lhotse.cut import Cut +from lhotse.dataset.sampling.base import CutSampler +from lhotse.utils import fix_random_seed +from model import AsrModel +from optim import Eden, ScaledAdam +from scaling import ScheduledFloat +from subsampling import Conv2dSubsampling +from torch import Tensor +from torch.cuda.amp import GradScaler +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.tensorboard import SummaryWriter +from zipformer import Zipformer2 + +from icefall import diagnostics +from icefall.char_graph_compiler import CharCtcTrainingGraphCompiler +from icefall.checkpoint import load_checkpoint, remove_checkpoints +from icefall.checkpoint import save_checkpoint as save_checkpoint_impl +from icefall.checkpoint import ( + save_checkpoint_with_global_batch_idx, + update_averaged_model, +) +from icefall.dist import cleanup_dist, setup_dist +from icefall.env import get_env_info +from icefall.hooks import register_inf_check_hooks +from icefall.lexicon import Lexicon +from icefall.utils import ( + AttributeDict, + MetricsTracker, + get_parameter_groups_with_lrs, + setup_logger, + str2bool, +) + +LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler] + + +def get_adjusted_batch_count(params: AttributeDict) -> float: + # returns the number of batches we would have used so far if we had used the reference + # duration. This is for purposes of set_batch_count(). 
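To make the comment above concrete before the return expression that follows, here is a rough worked example; the numbers come from the usage string earlier (--max-duration 350, --world-size 8) and the default --ref-duration of 600:

# Illustrative arithmetic only.
batch_idx_train = 1000                         # real batches seen so far
adjusted = batch_idx_train * (350 * 8) / 600   # roughly 4667 reference-sized batches
# set_batch_count() then hands this value to submodules whose schedules
# (e.g. the ScheduledFloat dropout rates used below) are expressed in batches.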
+ return ( + params.batch_idx_train + * (params.max_duration * params.world_size) + / params.ref_duration + ) + + +def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None: + if isinstance(model, DDP): + # get underlying nn.Module + model = model.module + for name, module in model.named_modules(): + if hasattr(module, "batch_count"): + module.batch_count = batch_count + if hasattr(module, "name"): + module.name = name + + +def add_model_arguments(parser: argparse.ArgumentParser): + parser.add_argument( + "--num-encoder-layers", + type=str, + default="2,2,3,4,3,2", + help="Number of zipformer encoder layers per stack, comma separated.", + ) + + parser.add_argument( + "--downsampling-factor", + type=str, + default="1,2,4,8,4,2", + help="Downsampling factor for each stack of encoder layers.", + ) + + parser.add_argument( + "--feedforward-dim", + type=str, + default="512,768,1024,1536,1024,768", + help="""Feedforward dimension of the zipformer encoder layers, per stack, comma separated.""", + ) + + parser.add_argument( + "--num-heads", + type=str, + default="4,4,4,8,4,4", + help="""Number of attention heads in the zipformer encoder layers: a single int or comma-separated list.""", + ) + + parser.add_argument( + "--encoder-dim", + type=str, + default="192,256,384,512,384,256", + help="""Embedding dimension in encoder stacks: a single int or comma-separated list.""", + ) + + parser.add_argument( + "--query-head-dim", + type=str, + default="32", + help="""Query/key dimension per head in encoder stacks: a single int or comma-separated list.""", + ) + + parser.add_argument( + "--value-head-dim", + type=str, + default="12", + help="""Value dimension per head in encoder stacks: a single int or comma-separated list.""", + ) + + parser.add_argument( + "--pos-head-dim", + type=str, + default="4", + help="""Positional-encoding dimension per head in encoder stacks: a single int or comma-separated list.""", + ) + + parser.add_argument( + "--pos-dim", + type=int, + default="48", + help="Positional-encoding embedding dimension", + ) + + parser.add_argument( + "--encoder-unmasked-dim", + type=str, + default="192,192,256,256,256,192", + help="""Unmasked dimensions in the encoders, relates to augmentation during training. A single int or comma-separated list. Must be <= each corresponding encoder_dim.""", + ) + + parser.add_argument( + "--cnn-module-kernel", + type=str, + default="31,31,15,15,15,31", + help="""Sizes of convolutional kernels in convolution modules in each encoder stack: a single int or comma-separated list.""", + ) + + parser.add_argument( + "--decoder-dim", + type=int, + default=512, + help="Embedding dimension in the decoder model.", + ) + + parser.add_argument( + "--joiner-dim", + type=int, + default=512, + help="""Dimension used in the joiner model. + Outputs from the encoder and decoder model are projected + to this dimension before adding. + """, + ) + + parser.add_argument( + "--causal", + type=str2bool, + default=False, + help="If True, use causal version of model.", + ) + + parser.add_argument( + "--chunk-size", + type=str, + default="16,32,64,-1", + help="""Chunk sizes (at 50Hz frame rate) will be chosen randomly from this list during training. Must be just -1 if --causal=False""", + ) + + parser.add_argument( + "--left-context-frames", + type=str, + default="64,128,256,-1", + help="""Maximum left-contexts for causal training, measured in frames which will + be converted to a number of chunks. 
If splitting into chunks, + chunk left-context frames will be chosen randomly from this list; else not relevant.""", + ) + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--world-size", + type=int, + default=1, + help="Number of GPUs for DDP training.", + ) + + parser.add_argument( + "--master-port", + type=int, + default=12354, + help="Master port to use for DDP training.", + ) + + parser.add_argument( + "--tensorboard", + type=str2bool, + default=True, + help="Should various information be logged in tensorboard.", + ) + + parser.add_argument( + "--num-epochs", + type=int, + default=30, + help="Number of epochs to train.", + ) + + parser.add_argument( + "--start-epoch", + type=int, + default=1, + help="""Resume training from this epoch. It should be positive. + If larger than 1, it will load checkpoint from + exp-dir/epoch-{start_epoch-1}.pt + """, + ) + + parser.add_argument( + "--start-batch", + type=int, + default=0, + help="""If positive, --start-epoch is ignored and + it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt + """, + ) + + parser.add_argument( + "--exp-dir", + type=str, + default="zipformer/exp", + help="""The experiment dir. + It specifies the directory where all training related + files, e.g., checkpoints, log, etc, are saved + """, + ) + + parser.add_argument( + "--lang-dir", + type=str, + default="data/lang_char", + help="""The lang dir + It contains language related input files such as + "lexicon.txt" + """, + ) + + parser.add_argument( + "--base-lr", type=float, default=0.045, help="The base learning rate." + ) + + parser.add_argument( + "--lr-batches", + type=float, + default=7500, + help="""Number of steps that affects how rapidly the learning rate + decreases. We suggest not to change this.""", + ) + + parser.add_argument( + "--lr-epochs", + type=float, + default=3.5, + help="""Number of epochs that affects how rapidly the learning rate decreases. + """, + ) + + parser.add_argument( + "--ref-duration", + type=float, + default=600, + help="""Reference batch duration for purposes of adjusting batch counts for setting various schedules inside the model""", + ) + + parser.add_argument( + "--context-size", + type=int, + default=2, + help="""The context size in the decoder. 1 means bigram; 2 means tri-gram""", + ) + + parser.add_argument( + "--prune-range", + type=int, + default=5, + help="""The prune range for rnnt loss, it means how many symbols(context) + we are using to compute the loss""", + ) + + parser.add_argument( + "--lm-scale", + type=float, + default=0.25, + help="""The scale to smooth the loss with lm + (output of prediction network) part.""", + ) + + parser.add_argument( + "--am-scale", + type=float, + default=0.0, + help="""The scale to smooth the loss with am (output of encoder network) part.""", + ) + + parser.add_argument( + "--simple-loss-scale", + type=float, + default=0.5, + help="""To get pruning ranges, we will calculate a simple version + loss(joiner is just addition), this simple loss also uses for + training (as a regularization item). 
We will scale the simple loss + with this parameter before adding to the final loss.""", + ) + + parser.add_argument( + "--seed", + type=int, + default=42, + help="The seed for random generators intended for reproducibility", + ) + + parser.add_argument( + "--print-diagnostics", + type=str2bool, + default=False, + help="Accumulate stats on activations, print them and exit.", + ) + + parser.add_argument( + "--inf-check", + type=str2bool, + default=False, + help="Add hooks to check for infinite module outputs and gradients.", + ) + + parser.add_argument( + "--save-every-n", + type=int, + default=4000, + help="""Save checkpoint after processing this number of batches" + periodically. We save checkpoint to exp-dir/ whenever + params.batch_idx_train % save_every_n == 0. The checkpoint filename + has the form: f'exp-dir/checkpoint-{params.batch_idx_train}.pt' + Note: It also saves checkpoint to `exp-dir/epoch-xxx.pt` at the + end of each epoch where `xxx` is the epoch number counting from 0. + """, + ) + + parser.add_argument( + "--keep-last-k", + type=int, + default=30, + help="""Only keep this number of checkpoints on disk. + For instance, if it is 3, there are only 3 checkpoints + in the exp-dir with filenames `checkpoint-xxx.pt`. + It does not affect checkpoints with name `epoch-xxx.pt`. + """, + ) + + parser.add_argument( + "--average-period", + type=int, + default=200, + help="""Update the averaged model, namely `model_avg`, after processing + this number of batches. `model_avg` is a separate version of model, + in which each floating-point parameter is the average of all the + parameters from the start of training. Each time we take the average, + we do: `model_avg = model * (average_period / batch_idx_train) + + model_avg * ((batch_idx_train - average_period) / batch_idx_train)`. + """, + ) + + parser.add_argument( + "--use-fp16", + type=str2bool, + default=False, + help="Whether to use half precision training.", + ) + + add_model_arguments(parser) + + return parser + + +def get_params() -> AttributeDict: + """Return a dict containing training parameters. + + All training related parameters that are not passed from the commandline + are saved in the variable `params`. + + Commandline options are merged into `params` after they are parsed, so + you can also access them via `params`. + + Explanation of options saved in `params`: + + - best_train_loss: Best training loss so far. It is used to select + the model that has the lowest training loss. It is + updated during the training. + + - best_valid_loss: Best validation loss so far. It is used to select + the model that has the lowest validation loss. It is + updated during the training. + + - best_train_epoch: It is the epoch that has the best training loss. + + - best_valid_epoch: It is the epoch that has the best validation loss. + + - batch_idx_train: Used to writing statistics to tensorboard. It + contains number of batches trained so far across + epochs. + + - log_interval: Print training loss if batch_idx % log_interval` is 0 + + - reset_interval: Reset statistics if batch_idx % reset_interval is 0 + + - valid_interval: Run validation if batch_idx % valid_interval is 0 + + - feature_dim: The model input dim. It has to match the one used + in computing features. + + - subsampling_factor: The subsampling factor for the model. + + - encoder_dim: Hidden dim for multi-head attention model. + + - num_decoder_layers: Number of decoder layer of transformer decoder. 
+ + - warm_step: The warmup period that dictates the decay of the + scale on "simple" (un-pruned) loss. + """ + params = AttributeDict( + { + "best_train_loss": float("inf"), + "best_valid_loss": float("inf"), + "best_train_epoch": -1, + "best_valid_epoch": -1, + "batch_idx_train": 0, + "log_interval": 50, + "reset_interval": 200, + "valid_interval": 3000, + # parameters for zipformer + "feature_dim": 80, + "subsampling_factor": 4, # not passed in, this is fixed. + "warm_step": 2000, + "env_info": get_env_info(), + } + ) + + return params + + +def _to_int_tuple(s: str): + return tuple(map(int, s.split(","))) + + +def get_encoder_embed(params: AttributeDict) -> nn.Module: + # encoder_embed converts the input of shape (N, T, num_features) + # to the shape (N, (T - 7) // 2, encoder_dims). + # That is, it does two things simultaneously: + # (1) subsampling: T -> (T - 7) // 2 + # (2) embedding: num_features -> encoder_dims + # In the normal configuration, we will downsample once more at the end + # by a factor of 2, and most of the encoder stacks will run at a lower + # sampling rate. + encoder_embed = Conv2dSubsampling( + in_channels=params.feature_dim, + out_channels=_to_int_tuple(params.encoder_dim)[0], + dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)), + ) + return encoder_embed + + +def get_encoder_model(params: AttributeDict) -> nn.Module: + encoder = Zipformer2( + output_downsampling_factor=2, + downsampling_factor=_to_int_tuple(params.downsampling_factor), + num_encoder_layers=_to_int_tuple(params.num_encoder_layers), + encoder_dim=_to_int_tuple(params.encoder_dim), + encoder_unmasked_dim=_to_int_tuple(params.encoder_unmasked_dim), + query_head_dim=_to_int_tuple(params.query_head_dim), + pos_head_dim=_to_int_tuple(params.pos_head_dim), + value_head_dim=_to_int_tuple(params.value_head_dim), + pos_dim=params.pos_dim, + num_heads=_to_int_tuple(params.num_heads), + feedforward_dim=_to_int_tuple(params.feedforward_dim), + cnn_module_kernel=_to_int_tuple(params.cnn_module_kernel), + dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)), + warmup_batches=4000.0, + causal=params.causal, + chunk_size=_to_int_tuple(params.chunk_size), + left_context_frames=_to_int_tuple(params.left_context_frames), + ) + return encoder + + +def get_decoder_model(params: AttributeDict) -> nn.Module: + decoder = Decoder( + vocab_size=params.vocab_size, + decoder_dim=params.decoder_dim, + blank_id=params.blank_id, + context_size=params.context_size, + ) + return decoder + + +def get_joiner_model(params: AttributeDict) -> nn.Module: + joiner = Joiner( + encoder_dim=max(_to_int_tuple(params.encoder_dim)), + decoder_dim=params.decoder_dim, + joiner_dim=params.joiner_dim, + vocab_size=params.vocab_size, + ) + return joiner + + +def get_model(params: AttributeDict) -> nn.Module: + encoder_embed = get_encoder_embed(params) + encoder = get_encoder_model(params) + decoder = get_decoder_model(params) + joiner = get_joiner_model(params) + + model = AsrModel( + encoder_embed=encoder_embed, + encoder=encoder, + decoder=decoder, + joiner=joiner, + encoder_dim=int(max(params.encoder_dim.split(","))), + decoder_dim=params.decoder_dim, + vocab_size=params.vocab_size, + ) + return model + + +def load_checkpoint_if_available( + params: AttributeDict, + model: nn.Module, + model_avg: nn.Module = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, +) -> Optional[Dict[str, Any]]: + """Load checkpoint from file. 
+ + If params.start_batch is positive, it will load the checkpoint from + `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if + params.start_epoch is larger than 1, it will load the checkpoint from + `params.start_epoch - 1`. + + Apart from loading state dict for `model` and `optimizer` it also updates + `best_train_epoch`, `best_train_loss`, `best_valid_epoch`, + and `best_valid_loss` in `params`. + + Args: + params: + The return value of :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer that we are using. + scheduler: + The scheduler that we are using. + Returns: + Return a dict containing previously saved training info. + """ + if params.start_batch > 0: + filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt" + elif params.start_epoch > 1: + filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" + else: + return None + + assert filename.is_file(), f"{filename} does not exist!" + + saved_params = load_checkpoint( + filename, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + ) + + keys = [ + "best_train_epoch", + "best_valid_epoch", + "batch_idx_train", + "best_train_loss", + "best_valid_loss", + ] + for k in keys: + params[k] = saved_params[k] + + if params.start_batch > 0: + if "cur_epoch" in saved_params: + params["start_epoch"] = saved_params["cur_epoch"] + + if "cur_batch_idx" in saved_params: + params["cur_batch_idx"] = saved_params["cur_batch_idx"] + + return saved_params + + +def save_checkpoint( + params: AttributeDict, + model: Union[nn.Module, DDP], + model_avg: Optional[nn.Module] = None, + optimizer: Optional[torch.optim.Optimizer] = None, + scheduler: Optional[LRSchedulerType] = None, + sampler: Optional[CutSampler] = None, + scaler: Optional[GradScaler] = None, + rank: int = 0, +) -> None: + """Save model, optimizer, scheduler and training stats to file. + + Args: + params: + It is returned by :func:`get_params`. + model: + The training model. + model_avg: + The stored model averaged from the start of training. + optimizer: + The optimizer used in the training. + sampler: + The sampler for the training dataset. + scaler: + The scaler used for mix precision training. + """ + if rank != 0: + return + filename = params.exp_dir / f"epoch-{params.cur_epoch}.pt" + save_checkpoint_impl( + filename=filename, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=sampler, + scaler=scaler, + rank=rank, + ) + + if params.best_train_epoch == params.cur_epoch: + best_train_filename = params.exp_dir / "best-train-loss.pt" + copyfile(src=filename, dst=best_train_filename) + + if params.best_valid_epoch == params.cur_epoch: + best_valid_filename = params.exp_dir / "best-valid-loss.pt" + copyfile(src=filename, dst=best_valid_filename) + + +def compute_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + graph_compiler: CharCtcTrainingGraphCompiler, + batch: dict, + is_training: bool, +) -> Tuple[Tensor, MetricsTracker]: + """ + Compute CTC loss given the model and its inputs. + + Args: + params: + Parameters for training. See :func:`get_params`. + model: + The model for training. It is an instance of Zipformer in our case. + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + is_training: + True for training. False for validation. 
When it is True, this + function enables autograd during computation; when it is False, it + disables autograd. + warmup: a floating point value which increases throughout training; + values >= 1.0 are fully warmed up and have all modules present. + """ + device = model.device if isinstance(model, DDP) else next(model.parameters()).device + feature = batch["inputs"] + # at entry, feature is (N, T, C) + assert feature.ndim == 3 + feature = feature.to(device) + + supervisions = batch["supervisions"] + feature_lens = supervisions["num_frames"].to(device) + + batch_idx_train = params.batch_idx_train + warm_step = params.warm_step + + texts = batch["supervisions"]["text"] + y = graph_compiler.texts_to_ids(texts) + y = k2.RaggedTensor(y).to(device) + + with torch.set_grad_enabled(is_training): + simple_loss, pruned_loss, _ = model( + x=feature, + x_lens=feature_lens, + y=y, + prune_range=params.prune_range, + am_scale=params.am_scale, + lm_scale=params.lm_scale, + ) + + s = params.simple_loss_scale + # take down the scale on the simple loss from 1.0 at the start + # to params.simple_loss scale by warm_step. + simple_loss_scale = ( + s + if batch_idx_train >= warm_step + else 1.0 - (batch_idx_train / warm_step) * (1.0 - s) + ) + pruned_loss_scale = ( + 1.0 + if batch_idx_train >= warm_step + else 0.1 + 0.9 * (batch_idx_train / warm_step) + ) + + loss = simple_loss_scale * simple_loss + pruned_loss_scale * pruned_loss + + assert loss.requires_grad == is_training + + info = MetricsTracker() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + info["frames"] = (feature_lens // params.subsampling_factor).sum().item() + + # Note: We use reduction=sum while computing the loss. + info["loss"] = loss.detach().cpu().item() + info["simple_loss"] = simple_loss.detach().cpu().item() + info["pruned_loss"] = pruned_loss.detach().cpu().item() + + return loss, info + + +def compute_validation_loss( + params: AttributeDict, + model: Union[nn.Module, DDP], + graph_compiler: CharCtcTrainingGraphCompiler, + valid_dl: torch.utils.data.DataLoader, + world_size: int = 1, +) -> MetricsTracker: + """Run the validation process.""" + model.eval() + + tot_loss = MetricsTracker() + + for batch_idx, batch in enumerate(valid_dl): + loss, loss_info = compute_loss( + params=params, + model=model, + graph_compiler=graph_compiler, + batch=batch, + is_training=False, + ) + assert loss.requires_grad is False + tot_loss = tot_loss + loss_info + + if world_size > 1: + tot_loss.reduce(loss.device) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + if loss_value < params.best_valid_loss: + params.best_valid_epoch = params.cur_epoch + params.best_valid_loss = loss_value + + return tot_loss + + +def train_one_epoch( + params: AttributeDict, + model: Union[nn.Module, DDP], + optimizer: torch.optim.Optimizer, + scheduler: LRSchedulerType, + graph_compiler: CharCtcTrainingGraphCompiler, + train_dl: torch.utils.data.DataLoader, + valid_dl: torch.utils.data.DataLoader, + scaler: GradScaler, + model_avg: Optional[nn.Module] = None, + tb_writer: Optional[SummaryWriter] = None, + world_size: int = 1, + rank: int = 0, +) -> None: + """Train the model for one epoch. + + The training loss from the mean of all frames is saved in + `params.train_loss`. It runs the validation process every + `params.valid_interval` batches. + + Args: + params: + It is returned by :func:`get_params`. + model: + The model for training. + optimizer: + The optimizer we are using. + scheduler: + The learning rate scheduler, we call step() every step. 
+ train_dl: + Dataloader for the training dataset. + valid_dl: + Dataloader for the validation dataset. + scaler: + The scaler used for mix precision training. + model_avg: + The stored model averaged from the start of training. + tb_writer: + Writer to write log messages to tensorboard. + world_size: + Number of nodes in DDP training. If it is 1, DDP is disabled. + rank: + The rank of the node in DDP training. If no DDP is used, it should + be set to 0. + """ + model.train() + + tot_loss = MetricsTracker() + + cur_batch_idx = params.get("cur_batch_idx", 0) + + saved_bad_model = False + + def save_bad_model(suffix: str = ""): + save_checkpoint_impl( + filename=params.exp_dir / f"bad-model{suffix}-{rank}.pt", + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=0, + ) + + for batch_idx, batch in enumerate(train_dl): + if batch_idx % 10 == 0: + set_batch_count(model, get_adjusted_batch_count(params)) + if batch_idx < cur_batch_idx: + continue + cur_batch_idx = batch_idx + + params.batch_idx_train += 1 + batch_size = len(batch["supervisions"]["text"]) + + try: + with torch.cuda.amp.autocast(enabled=params.use_fp16): + loss, loss_info = compute_loss( + params=params, + model=model, + graph_compiler=graph_compiler, + batch=batch, + is_training=True, + ) + # summary stats + tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info + + # NOTE: We use reduction==sum and loss is computed over utterances + # in the batch and there is no normalization to it so far. + scaler.scale(loss).backward() + scheduler.step_batch(params.batch_idx_train) + + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + except: # noqa + save_bad_model() + display_and_save_batch(batch, params=params, graph_compiler=graph_compiler) + raise + + if params.print_diagnostics and batch_idx == 5: + return + + if ( + rank == 0 + and params.batch_idx_train > 0 + and params.batch_idx_train % params.average_period == 0 + ): + update_averaged_model( + params=params, + model_cur=model, + model_avg=model_avg, + ) + + if ( + params.batch_idx_train > 0 + and params.batch_idx_train % params.save_every_n == 0 + ): + params.cur_batch_idx = batch_idx + save_checkpoint_with_global_batch_idx( + out_dir=params.exp_dir, + global_batch_idx=params.batch_idx_train, + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + del params.cur_batch_idx + remove_checkpoints( + out_dir=params.exp_dir, + topk=params.keep_last_k, + rank=rank, + ) + + if batch_idx % 100 == 0 and params.use_fp16: + # If the grad scale was less than 1, try increasing it. The _growth_interval + # of the grad scaler is configurable, but we can't configure it to have different + # behavior depending on the current grad scale. 
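Informally, the fp16 branch that follows (entered every 100 batches) implements roughly this policy; a condensed, illustrative restatement, not the GradScaler API itself:

def adjust_grad_scale(cur_scale: float, batch_idx: int) -> float:
    # Sketch of the manual growth rule below; the real code additionally saves
    # a "bad-model" checkpoint and logs a warning once the scale drops below 0.01.
    if cur_scale < 8.0 or (cur_scale < 32.0 and batch_idx % 400 == 0):
        cur_scale *= 2.0  # grow small scales faster than GradScaler's default
    if cur_scale < 1.0e-05:
        raise RuntimeError(f"grad_scale is too small, exiting: {cur_scale}")
    return cur_scale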
+ cur_grad_scale = scaler._scale.item() + + if cur_grad_scale < 8.0 or (cur_grad_scale < 32.0 and batch_idx % 400 == 0): + scaler.update(cur_grad_scale * 2.0) + if cur_grad_scale < 0.01: + if not saved_bad_model: + save_bad_model(suffix="-first-warning") + saved_bad_model = True + logging.warning(f"Grad scale is small: {cur_grad_scale}") + if cur_grad_scale < 1.0e-05: + save_bad_model() + raise RuntimeError( + f"grad_scale is too small, exiting: {cur_grad_scale}" + ) + + if batch_idx % params.log_interval == 0: + cur_lr = max(scheduler.get_last_lr()) + cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0 + + logging.info( + f"Epoch {params.cur_epoch}, " + f"batch {batch_idx}, loss[{loss_info}], " + f"tot_loss[{tot_loss}], batch size: {batch_size}, " + f"lr: {cur_lr:.2e}, " + + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "") + ) + + if tb_writer is not None: + tb_writer.add_scalar( + "train/learning_rate", cur_lr, params.batch_idx_train + ) + + loss_info.write_summary( + tb_writer, "train/current_", params.batch_idx_train + ) + tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train) + if params.use_fp16: + tb_writer.add_scalar( + "train/grad_scale", cur_grad_scale, params.batch_idx_train + ) + + if batch_idx % params.valid_interval == 0 and not params.print_diagnostics: + logging.info("Computing validation loss") + valid_info = compute_validation_loss( + params=params, + model=model, + graph_compiler=graph_compiler, + valid_dl=valid_dl, + world_size=world_size, + ) + model.train() + logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}") + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) + if tb_writer is not None: + valid_info.write_summary( + tb_writer, "train/valid_", params.batch_idx_train + ) + + loss_value = tot_loss["loss"] / tot_loss["frames"] + params.train_loss = loss_value + if params.train_loss < params.best_train_loss: + params.best_train_epoch = params.cur_epoch + params.best_train_loss = params.train_loss + + +def run(rank, world_size, args): + """ + Args: + rank: + It is a value between 0 and `world_size-1`, which is + passed automatically by `mp.spawn()` in :func:`main`. + The node with rank 0 is responsible for saving checkpoint. + world_size: + Number of GPUs for DDP training. 
+      args:
+        The return value of get_parser().parse_args()
+    """
+    params = get_params()
+    params.update(vars(args))
+
+    fix_random_seed(params.seed)
+    if world_size > 1:
+        setup_dist(rank, world_size, params.master_port)
+
+    setup_logger(f"{params.exp_dir}/log/log-train")
+    logging.info("Training started")
+
+    if args.tensorboard and rank == 0:
+        tb_writer = SummaryWriter(log_dir=f"{params.exp_dir}/tensorboard")
+    else:
+        tb_writer = None
+
+    device = torch.device("cpu")
+    if torch.cuda.is_available():
+        device = torch.device("cuda", rank)
+    logging.info(f"Device: {device}")
+
+    lexicon = Lexicon(params.lang_dir)
+    graph_compiler = CharCtcTrainingGraphCompiler(
+        lexicon=lexicon,
+        device=device,
+    )
+
+    params.blank_id = lexicon.token_table["<blk>"]
+    params.vocab_size = max(lexicon.tokens) + 1
+
+    logging.info(params)
+
+    logging.info("About to create model")
+    model = get_model(params)
+
+    num_param = sum([p.numel() for p in model.parameters()])
+    logging.info(f"Number of model parameters: {num_param}")
+
+    assert params.save_every_n >= params.average_period
+    model_avg: Optional[nn.Module] = None
+    if rank == 0:
+        # model_avg is only used with rank 0
+        model_avg = copy.deepcopy(model).to(torch.float64)
+
+    assert params.start_epoch > 0, params.start_epoch
+    checkpoints = load_checkpoint_if_available(
+        params=params, model=model, model_avg=model_avg
+    )
+
+    model.to(device)
+    if world_size > 1:
+        logging.info("Using DDP")
+        model = DDP(model, device_ids=[rank], find_unused_parameters=True)
+
+    optimizer = ScaledAdam(
+        get_parameter_groups_with_lrs(model, lr=params.base_lr, include_names=True),
+        lr=params.base_lr,  # should have no effect
+        clipping_scale=2.0,
+    )
+
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
+
+    if checkpoints and "optimizer" in checkpoints:
+        logging.info("Loading optimizer state dict")
+        optimizer.load_state_dict(checkpoints["optimizer"])
+
+    if (
+        checkpoints
+        and "scheduler" in checkpoints
+        and checkpoints["scheduler"] is not None
+    ):
+        logging.info("Loading scheduler state dict")
+        scheduler.load_state_dict(checkpoints["scheduler"])
+
+    if params.print_diagnostics:
+        opts = diagnostics.TensorDiagnosticOptions(
+            2**22
+        )  # allow 4 megabytes per sub-module
+        diagnostic = diagnostics.attach_diagnostics(model, opts)
+
+    if params.inf_check:
+        register_inf_check_hooks(model)
+
+    wenetspeech = WenetSpeechAsrDataModule(args)
+
+    train_cuts = wenetspeech.train_cuts()
+    valid_cuts = wenetspeech.valid_cuts()
+
+    def remove_short_and_long_utt(c: Cut):
+        # Keep only utterances with duration between 1 second and 15 seconds
+        #
+        # Caution: There is a reason to select 15.0 here. Please see
+        # ../local/display_manifest_statistics.py
+        #
+        # You should use ../local/display_manifest_statistics.py to get
+        # an utterance duration distribution for your dataset to select
+        # the threshold
+        if c.duration < 1.0 or c.duration > 15.0:
+            # logging.warning(
+            #     f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
+            # )
+            return False
+
+        # In pruned RNN-T, we require that T >= S
+        # where T is the number of feature frames after subsampling
+        # and S is the number of tokens in the utterance
+
+        # In ./zipformer.py, the conv module uses the following expression
+        # for subsampling
+        T = ((c.num_frames - 7) // 2 + 1) // 2
+        tokens = graph_compiler.texts_to_ids([c.supervisions[0].text])[0]
+
+        if T < len(tokens):
+            logging.warning(
+                f"Exclude cut with ID {c.id} from training. "
+                f"Number of frames (before subsampling): {c.num_frames}. 
" + f"Number of frames (after subsampling): {T}. " + f"Text: {c.supervisions[0].text}. " + f"Tokens: {tokens}. " + f"Number of tokens: {len(tokens)}" + ) + return False + + return True + + train_cuts = train_cuts.filter(remove_short_and_long_utt) + + if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: + # We only load the sampler's state dict when it loads a checkpoint + # saved in the middle of an epoch + sampler_state_dict = checkpoints["sampler"] + else: + sampler_state_dict = None + + train_dl = wenetspeech.train_dataloaders( + train_cuts, sampler_state_dict=sampler_state_dict + ) + + valid_dl = wenetspeech.valid_dataloaders(valid_cuts) + + if False and not params.print_diagnostics: + scan_pessimistic_batches_for_oom( + model=model, + train_dl=train_dl, + optimizer=optimizer, + graph_compiler=graph_compiler, + params=params, + ) + + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) + if checkpoints and "grad_scaler" in checkpoints: + logging.info("Loading grad scaler state dict") + scaler.load_state_dict(checkpoints["grad_scaler"]) + + for epoch in range(params.start_epoch, params.num_epochs + 1): + scheduler.step_epoch(epoch - 1) + fix_random_seed(params.seed + epoch - 1) + train_dl.sampler.set_epoch(epoch - 1) + + if tb_writer is not None: + tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train) + + params.cur_epoch = epoch + + train_one_epoch( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + graph_compiler=graph_compiler, + train_dl=train_dl, + valid_dl=valid_dl, + scaler=scaler, + tb_writer=tb_writer, + world_size=world_size, + rank=rank, + ) + + if params.print_diagnostics: + diagnostic.print_diagnostics() + break + + save_checkpoint( + params=params, + model=model, + model_avg=model_avg, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=rank, + ) + + logging.info("Done!") + + if world_size > 1: + torch.distributed.barrier() + cleanup_dist() + + +def display_and_save_batch( + batch: dict, + params: AttributeDict, + graph_compiler: CharCtcTrainingGraphCompiler, +) -> None: + """Display the batch statistics and save the batch into disk. + + Args: + batch: + A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()` + for the content in it. + params: + Parameters for training. See :func:`get_params`. + graph_compiler: + The compiler to encode texts to ids. + """ + from lhotse.utils import uuid4 + + filename = f"{params.exp_dir}/batch-{uuid4()}.pt" + logging.info(f"Saving batch to {filename}") + torch.save(batch, filename) + + supervisions = batch["supervisions"] + features = batch["inputs"] + + logging.info(f"features shape: {features.shape}") + + texts = supervisions["text"] + y = graph_compiler.texts_to_ids(texts) + num_tokens = sum(len(i) for i in y) + logging.info(f"num tokens: {num_tokens}") + + +def scan_pessimistic_batches_for_oom( + model: Union[nn.Module, DDP], + train_dl: torch.utils.data.DataLoader, + optimizer: torch.optim.Optimizer, + graph_compiler: CharCtcTrainingGraphCompiler, + params: AttributeDict, +): + from lhotse.dataset import find_pessimistic_batches + + logging.info( + "Sanity check -- see if any of the batches in epoch 1 would cause OOM." 
+ ) + batches, crit_values = find_pessimistic_batches(train_dl.sampler) + for criterion, cuts in batches.items(): + batch = train_dl.dataset[cuts] + try: + with torch.cuda.amp.autocast(enabled=params.use_fp16): + loss, _ = compute_loss( + params=params, + model=model, + graph_compiler=graph_compiler, + batch=batch, + is_training=True, + ) + loss.backward() + optimizer.zero_grad() + except Exception as e: + if "CUDA out of memory" in str(e): + logging.error( + "Your GPU ran out of memory with the current " + "max_duration setting. We recommend decreasing " + "max_duration and trying again.\n" + f"Failing criterion: {criterion} " + f"(={crit_values[criterion]}) ..." + ) + display_and_save_batch(batch, params=params, graph_compiler=graph_compiler) + raise + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) + + +def main(): + parser = get_parser() + WenetSpeechAsrDataModule.add_arguments(parser) + args = parser.parse_args() + args.lang_dir = Path(args.lang_dir) + args.exp_dir = Path(args.exp_dir) + + world_size = args.world_size + assert world_size >= 1 + if world_size > 1: + mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True) + else: + run(rank=0, world_size=1, args=args) + + +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + +if __name__ == "__main__": + main() diff --git a/egs/wenetspeech/ASR/zipformer/zipformer.py b/egs/wenetspeech/ASR/zipformer/zipformer.py new file mode 120000 index 000000000..23011dda7 --- /dev/null +++ b/egs/wenetspeech/ASR/zipformer/zipformer.py @@ -0,0 +1 @@ +../../../librispeech/ASR/zipformer/zipformer.py \ No newline at end of file diff --git a/icefall/rnn_lm/train.py b/icefall/rnn_lm/train.py index 0f0887859..3d206d139 100755 --- a/icefall/rnn_lm/train.py +++ b/icefall/rnn_lm/train.py @@ -99,6 +99,15 @@ def get_parser(): """, ) + parser.add_argument( + "--start-batch", + type=int, + default=0, + help="""If positive, --start-epoch is ignored and + it loads the checkpoint from exp-dir/checkpoint-{start_batch}.pt + """, + ) + parser.add_argument( "--exp-dir", type=str, @@ -242,7 +251,9 @@ def load_checkpoint_if_available( ) -> None: """Load checkpoint from file. - If params.start_epoch is positive, it will load the checkpoint from + If params.start_batch is positive, it will load the checkpoint from + `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if + params.start_epoch is larger than 1, it will load the checkpoint from `params.start_epoch - 1`. Otherwise, this function does nothing. Apart from loading state dict for `model`, `optimizer` and `scheduler`, @@ -261,10 +272,14 @@ def load_checkpoint_if_available( Returns: Return None. 
""" - if params.start_epoch <= 0: - return - filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" + if params.start_batch > 0: + filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt" + elif params.start_epoch > 1: + filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt" + else: + return None + logging.info(f"Loading checkpoint: {filename}") saved_params = load_checkpoint( filename, @@ -283,6 +298,13 @@ def load_checkpoint_if_available( for k in keys: params[k] = saved_params[k] + if params.start_batch > 0: + if "cur_epoch" in saved_params: + params["start_epoch"] = saved_params["cur_epoch"] + + if "cur_batch_idx" in saved_params: + params["cur_batch_idx"] = saved_params["cur_batch_idx"] + return saved_params @@ -438,7 +460,14 @@ def train_one_epoch( tot_loss = MetricsTracker() + cur_batch_idx = params.get("cur_batch_idx", 0) + for batch_idx, batch in enumerate(train_dl): + + if batch_idx < cur_batch_idx: + continue + cur_batch_idx = batch_idx + params.batch_idx_train += 1 x, y, sentence_lengths = batch batch_size = x.size(0) @@ -463,6 +492,7 @@ def train_one_epoch( params.batch_idx_train > 0 and params.batch_idx_train % params.save_every_n == 0 ): + params.cur_batch_idx = batch_idx save_checkpoint_with_global_batch_idx( out_dir=params.exp_dir, global_batch_idx=params.batch_idx_train, @@ -471,6 +501,7 @@ def train_one_epoch( optimizer=optimizer, rank=rank, ) + del params.cur_batch_idx if batch_idx % params.log_interval == 0: # Note: "frames" here means "num_tokens" diff --git a/icefall/utils.py b/icefall/utils.py index dfe9a7b42..0feff9dc8 100644 --- a/icefall/utils.py +++ b/icefall/utils.py @@ -429,6 +429,8 @@ def store_transcripts( texts: An iterable of tuples. The first element is the cur_id, the second is the reference transcript and the third element is the predicted result. + If it is a multi-talker ASR system, the ref and hyp may also be lists of + strings. Returns: Return None. """ @@ -886,8 +888,167 @@ def write_error_stats_with_timestamps( hyp_count = corr + hyp_sub + ins print(f"{word} {corr} {tot_errs} {ref_count} {hyp_count}", file=f) + return float(tot_err_rate), float(mean_delay), float(var_delay) - return tot_err_rate, mean_delay, var_delay + +def write_surt_error_stats( + f: TextIO, + test_set_name: str, + results: List[Tuple[str, str]], + enable_log: bool = True, + num_channels: int = 2, +) -> float: + """Write statistics based on predicted results and reference transcripts for SURT + multi-talker ASR systems. The difference between this and the `write_error_stats` + is that this function finds the optimal speaker-agnostic WER using the ``meeteval`` + toolkit. + + Args: + f: File to write the statistics to. + test_set_name: Name of the test set. + results: List of tuples containing the utterance ID and the predicted + transcript. + enable_log: Whether to enable logging. + num_channels: Number of output channels/branches. Defaults to 2. + Returns: + Return None. 
+ """ + from meeteval.wer import wer + + subs: Dict[Tuple[str, str], int] = defaultdict(int) + ins: Dict[str, int] = defaultdict(int) + dels: Dict[str, int] = defaultdict(int) + ref_lens: List[int] = [] + + print( + "Search below for sections starting with PER-UTT DETAILS:, " + "SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS:", + file=f, + ) + + print("", file=f) + print("PER-UTT DETAILS: corr or (ref->hyp) ", file=f) + + # `words` stores counts per word, as follows: + # corr, ref_sub, hyp_sub, ins, dels + words: Dict[str, List[int]] = defaultdict(lambda: [0, 0, 0, 0, 0]) + num_corr = 0 + ERR = "*" + for cut_id, ref, hyp in results: + # First compute the optimal assignment of references to output channels + orc_wer = wer.orc_word_error_rate(ref, hyp) + assignment = orc_wer.assignment + refs = [[] for _ in range(num_channels)] + # Assign references to channels + for i, ref_text in zip(assignment, ref): + refs[i] += ref_text.split() + hyps = [hyp_text.split() for hyp_text in hyp] + # Now compute the WER for each channel + for ref_c, hyp_c in zip(refs, hyps): + ref_lens.append(len(ref_c)) + ali = kaldialign.align(ref_c, hyp_c, ERR) + for ref_word, hyp_word in ali: + if ref_word == ERR: + ins[hyp_word] += 1 + words[hyp_word][3] += 1 + elif hyp_word == ERR: + dels[ref_word] += 1 + words[ref_word][4] += 1 + elif hyp_word != ref_word: + subs[(ref_word, hyp_word)] += 1 + words[ref_word][1] += 1 + words[hyp_word][2] += 1 + else: + words[ref_word][0] += 1 + num_corr += 1 + combine_successive_errors = True + if combine_successive_errors: + ali = [[[x], [y]] for x, y in ali] + for i in range(len(ali) - 1): + if ali[i][0] != ali[i][1] and ali[i + 1][0] != ali[i + 1][1]: + ali[i + 1][0] = ali[i][0] + ali[i + 1][0] + ali[i + 1][1] = ali[i][1] + ali[i + 1][1] + ali[i] = [[], []] + ali = [ + [ + list(filter(lambda a: a != ERR, x)), + list(filter(lambda a: a != ERR, y)), + ] + for x, y in ali + ] + ali = list(filter(lambda x: x != [[], []], ali)) + ali = [ + [ + ERR if x == [] else " ".join(x), + ERR if y == [] else " ".join(y), + ] + for x, y in ali + ] + + print( + f"{cut_id}:\t" + + " ".join( + ( + ref_word + if ref_word == hyp_word + else f"({ref_word}->{hyp_word})" + for ref_word, hyp_word in ali + ) + ), + file=f, + ) + ref_len = sum(ref_lens) + sub_errs = sum(subs.values()) + ins_errs = sum(ins.values()) + del_errs = sum(dels.values()) + tot_errs = sub_errs + ins_errs + del_errs + tot_err_rate = "%.2f" % (100.0 * tot_errs / ref_len) + + if enable_log: + logging.info( + f"[{test_set_name}] %WER {tot_errs / ref_len:.2%} " + f"[{tot_errs} / {ref_len}, {ins_errs} ins, " + f"{del_errs} del, {sub_errs} sub ]" + ) + + print(f"%WER = {tot_err_rate}", file=f) + print( + f"Errors: {ins_errs} insertions, {del_errs} deletions, " + f"{sub_errs} substitutions, over {ref_len} reference " + f"words ({num_corr} correct)", + file=f, + ) + + print("", file=f) + print("SUBSTITUTIONS: count ref -> hyp", file=f) + + for count, (ref, hyp) in sorted([(v, k) for k, v in subs.items()], reverse=True): + print(f"{count} {ref} -> {hyp}", file=f) + + print("", file=f) + print("DELETIONS: count ref", file=f) + for count, ref in sorted([(v, k) for k, v in dels.items()], reverse=True): + print(f"{count} {ref}", file=f) + + print("", file=f) + print("INSERTIONS: count hyp", file=f) + for count, hyp in sorted([(v, k) for k, v in ins.items()], reverse=True): + print(f"{count} {hyp}", file=f) + + print("", file=f) + print("PER-WORD STATS: word corr tot_errs count_in_ref count_in_hyp", file=f) + for _, word, counts in sorted( + 
[(sum(v[1:]), k, v) for k, v in words.items()], reverse=True + ): + (corr, ref_sub, hyp_sub, ins, dels) = counts + tot_errs = ref_sub + hyp_sub + ins + dels + ref_count = corr + ref_sub + dels + hyp_count = corr + hyp_sub + ins + + print(f"{word} {corr} {tot_errs} {ref_count} {hyp_count}", file=f) + + print(f"%WER = {tot_err_rate}", file=f) + return float(tot_err_rate) class MetricsTracker(collections.defaultdict):
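To make the channel-assignment step in write_surt_error_stats above concrete, here is a small, hypothetical walk-through; it uses only the meeteval call and the assignment field already relied on in that function (the utterance texts are invented):

from meeteval.wer import wer

ref = ["hello there", "how are you"]   # one string per reference utterance
hyp = ["hello there how are you", ""]  # one string per SURT output channel
orc = wer.orc_word_error_rate(ref, hyp)
# orc.assignment gives, for each reference, the output channel it is scored
# against, e.g. (0, 0) here since channel 1 is empty; the per-channel
# kaldialign alignment and WER bookkeeping then proceed as in write_error_stats.
print(orc.assignment)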