Mirror of https://github.com/k2-fsa/icefall.git
Synced 2025-08-09 01:52:41 +00:00
Merge branch 'k2-fsa:master' into dev/speechllm
This commit is contained in: commit 70f13e54d8
.github/scripts/docker/Dockerfile (vendored, 4 changed lines)
@@ -55,9 +55,9 @@ RUN pip install --no-cache-dir \
   "numpy<2.0" \
   onnxoptimizer \
   onnxsim \
-  onnx \
+  onnx==1.17.0 \
   onnxmltools \
-  onnxruntime \
+  onnxruntime==1.17.1 \
   piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html \
   pypinyin==0.50.0 \
   pytest \
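The two pins above replace previously floating versions. A quick way to confirm the pins took effect inside the built image is a check like this (a hypothetical sanity check, not part of the commit):

# Hypothetical sanity check (not part of this commit): confirm the image
# really contains the pinned onnx / onnxruntime versions.
import onnx
import onnxruntime

assert onnx.__version__ == "1.17.0", onnx.__version__
assert onnxruntime.__version__ == "1.17.1", onnxruntime.__version__
print("onnx", onnx.__version__, "onnxruntime", onnxruntime.__version__)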
.github/scripts/docker/generate_build_matrix.py (vendored, 21 changed lines)
@@ -63,23 +63,24 @@ def get_torchaudio_version(torch_version):
 def get_matrix(min_torch_version, specified_torch_version, specified_python_version):
-    k2_version = "1.24.4.dev20241029"
-    kaldifeat_version = "1.25.5.dev20241029"
-    version = "20241218"
+    k2_version = "1.24.4.dev20250630"
+    kaldifeat_version = "1.25.5.dev20250630"
+    version = "20250630"

     # torchaudio 2.5.0 does not support python 3.13
-    python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
+    python_version = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
     torch_version = []
     torch_version += ["1.13.0", "1.13.1"]
     torch_version += ["2.0.0", "2.0.1"]
-    # torch_version += ["2.1.0", "2.1.1", "2.1.2"]
-    # torch_version += ["2.2.0", "2.2.1", "2.2.2"]
+    torch_version += ["2.1.0", "2.1.1", "2.1.2"]
+    torch_version += ["2.2.0", "2.2.1", "2.2.2"]
     # Test only torch >= 2.3.0
     torch_version += ["2.3.0", "2.3.1"]
     torch_version += ["2.4.0"]
     torch_version += ["2.4.1"]
     torch_version += ["2.5.0"]
     torch_version += ["2.5.1"]
+    torch_version += ["2.6.0", "2.7.0", "2.7.1"]

     if specified_torch_version:
         torch_version = [specified_torch_version]
@@ -109,12 +110,8 @@ def get_matrix(min_torch_version, specified_torch_version, specified_python_version):
             # torch>=2.5 requires python 3.10
             continue

-        if t == "2.5.1":
-            k2_version_2 = "1.24.4.dev20241122"
-            kaldifeat_version_2 = "1.25.5.dev20241126"
-        else:
-            k2_version_2 = k2_version
-            kaldifeat_version_2 = kaldifeat_version
+        k2_version_2 = k2_version
+        kaldifeat_version_2 = kaldifeat_version

         matrix.append(
             {
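For context, this script prints a JSON matrix that the workflow files below consume via fromJson(). A minimal sketch of the pattern (simplified; the real script also resolves matching torchaudio/k2/kaldifeat versions per entry and filters unsupported python/torch combinations):

#!/usr/bin/env python3
# Minimal sketch of the build-matrix pattern (simplified, not the real script).
import json


def get_matrix(specified_python_version=None):
    python_version = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
    torch_version = ["2.6.0", "2.7.0", "2.7.1"]
    if specified_python_version:
        python_version = [specified_python_version]

    matrix = []
    for p in python_version:
        for t in torch_version:
            matrix.append({"python-version": p, "torch-version": t})
    return matrix


if __name__ == "__main__":
    # The workflows capture this stdout and feed it to fromJson().
    print(json.dumps({"include": get_matrix("3.10")}))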
.github/scripts/generate-piper-phonemize-page.py (vendored, 67 changed lines)
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3


-def main():
+def get_v1_2_0_files():
     prefix = (
         "https://github.com/csukuangfj/piper-phonemize/releases/download/2023.12.5/"
     )
@@ -19,9 +19,70 @@ def main():
         "piper_phonemize-1.2.0-cp39-cp39-macosx_10_14_x86_64.whl",
         "piper_phonemize-1.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
     ]
+    ans = [prefix + f for f in files]
+    ans.sort()
+    return ans
+
+
+def get_v1_3_0_files():
+    prefix = (
+        "https://github.com/csukuangfj/piper-phonemize/releases/download/2025.06.23/"
+    )
+    files = [
+        "piper_phonemize-1.3.0-cp310-cp310-macosx_10_9_universal2.whl",
+        "piper_phonemize-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl",
+        "piper_phonemize-1.3.0-cp310-cp310-macosx_11_0_arm64.whl",
+        "piper_phonemize-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
+        "piper_phonemize-1.3.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl",
+        "piper_phonemize-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.3.0-cp310-cp310-win_amd64.whl",
+        "piper_phonemize-1.3.0-cp311-cp311-macosx_10_9_universal2.whl",
+        "piper_phonemize-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl",
+        "piper_phonemize-1.3.0-cp311-cp311-macosx_11_0_arm64.whl",
+        "piper_phonemize-1.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
+        "piper_phonemize-1.3.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl",
+        "piper_phonemize-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.3.0-cp311-cp311-win_amd64.whl",
+        "piper_phonemize-1.3.0-cp312-cp312-macosx_10_13_universal2.whl",
+        "piper_phonemize-1.3.0-cp312-cp312-macosx_10_13_x86_64.whl",
+        "piper_phonemize-1.3.0-cp312-cp312-macosx_11_0_arm64.whl",
+        "piper_phonemize-1.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
+        "piper_phonemize-1.3.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl",
+        "piper_phonemize-1.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.3.0-cp312-cp312-win_amd64.whl",
+        "piper_phonemize-1.3.0-cp313-cp313-macosx_10_13_universal2.whl",
+        "piper_phonemize-1.3.0-cp313-cp313-macosx_10_13_x86_64.whl",
+        "piper_phonemize-1.3.0-cp313-cp313-macosx_11_0_arm64.whl",
+        "piper_phonemize-1.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
+        "piper_phonemize-1.3.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl",
+        "piper_phonemize-1.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.3.0-cp313-cp313-win_amd64.whl",
+        "piper_phonemize-1.3.0-cp38-cp38-macosx_10_9_universal2.whl",
+        "piper_phonemize-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl",
+        "piper_phonemize-1.3.0-cp38-cp38-macosx_11_0_arm64.whl",
+        "piper_phonemize-1.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
+        "piper_phonemize-1.3.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl",
+        "piper_phonemize-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.3.0-cp38-cp38-win_amd64.whl",
+        "piper_phonemize-1.3.0-cp39-cp39-macosx_10_9_universal2.whl",
+        "piper_phonemize-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl",
+        "piper_phonemize-1.3.0-cp39-cp39-macosx_11_0_arm64.whl",
+        "piper_phonemize-1.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl",
+        "piper_phonemize-1.3.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl",
+        "piper_phonemize-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl",
+        "piper_phonemize-1.3.0-cp39-cp39-win_amd64.whl",
+    ]
+    ans = [prefix + f for f in files]
+    ans.sort()
+    return ans
+
+
+def main():
+    files = get_v1_3_0_files() + get_v1_2_0_files()
+
     with open("piper_phonemize.html", "w") as f:
-        for file in files:
-            url = prefix + file
+        for url in files:
+            file = url.split("/")[-1]
             f.write(f'<a href="{url}">{file}</a><br/>\n')
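The generated page is a plain HTML list of wheel links that pip consumes through its -f/--find-links option (as the Dockerfile above does). Since the helpers now return full URLs, a link check before publishing can catch typos in the wheel names (a hypothetical helper, not part of the commit):

# Hypothetical link check (not part of this commit): verify every wheel URL
# in the generated page actually resolves before the page is published.
import re
import urllib.request

with open("piper_phonemize.html") as f:
    urls = re.findall(r'href="([^"]+)"', f.read())

for url in urls:
    req = urllib.request.Request(url, method="HEAD")
    with urllib.request.urlopen(req) as resp:  # raises on 4xx/5xx
        print(resp.status, url)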
.github/scripts/multi-zh-hans.sh (vendored, 200 lines deleted)
@@ -1,200 +0,0 @@
#!/usr/bin/env bash

set -ex

git config --global user.name "k2-fsa"
git config --global user.email "csukuangfj@gmail.com"
git config --global lfs.allowincompletepush true

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "pwd: $PWD"

cd egs/multi_zh-hans/ASR

repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-2023-9-2
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)
pushd $repo
cd exp
git lfs pull --include pretrained.pt
ln -s pretrained.pt epoch-99.pt
cd ../data/lang_bpe_2000
ls -lh
git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
git lfs pull --include "*.model"
ls -lh
popd

log "--------------------------------------------"
log "Export non-streaming ONNX transducer models "
log "--------------------------------------------"
./zipformer/export-onnx.py \
  --tokens $repo/data/lang_bpe_2000/tokens.txt \
  --use-averaged-model 0 \
  --epoch 99 \
  --avg 1 \
  --exp-dir $repo/exp \
  --causal False

ls -lh $repo/exp

./zipformer/onnx_pretrained.py \
  --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
  --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
  --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
  --tokens $repo/data/lang_bpe_2000/tokens.txt \
  $repo/test_wavs/DEV_T0000000000.wav \
  $repo/test_wavs/DEV_T0000000001.wav \
  $repo/test_wavs/DEV_T0000000002.wav \
  $repo/test_wavs/TEST_MEETING_T0000000113.wav \
  $repo/test_wavs/TEST_MEETING_T0000000219.wav \
  $repo/test_wavs/TEST_MEETING_T0000000351.wav

rm -rf $repo

repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05
log "Downloading pre-trained model from $repo_url"
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
repo=$(basename $repo_url)

pushd $repo
cd exp/
git lfs pull --include pretrained.pt
rm -fv epoch-20.pt
rm -fv *.onnx
ln -s pretrained.pt epoch-20.pt
cd ../data/lang_bpe_2000
ls -lh
git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
git lfs pull --include "*.model"
ls -lh
popd

log "----------------------------------------"
log "Export streaming ONNX CTC models "
log "----------------------------------------"
./zipformer/export-onnx-streaming-ctc.py \
  --exp-dir $repo/exp \
  --tokens $repo/data/lang_bpe_2000/tokens.txt \
  --causal 1 \
  --avg 1 \
  --epoch 20 \
  --use-averaged-model 0 \
  --chunk-size 16 \
  --left-context-frames 128 \
  --use-ctc 1

ls -lh $repo/exp/

log "------------------------------------------------------------"
log "Test exported streaming ONNX CTC models (greedy search) "
log "------------------------------------------------------------"

test_wavs=(
  DEV_T0000000000.wav
  DEV_T0000000001.wav
  DEV_T0000000002.wav
  TEST_MEETING_T0000000113.wav
  TEST_MEETING_T0000000219.wav
  TEST_MEETING_T0000000351.wav
)

for w in ${test_wavs[@]}; do
  ./zipformer/onnx_pretrained-streaming-ctc.py \
    --model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    $repo/test_wavs/$w
done

log "Upload onnx CTC models to huggingface"
url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
GIT_LFS_SKIP_SMUDGE=1 git clone $url
dst=$(basename $url)
cp -v $repo/exp/ctc*.onnx $dst
cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
cp -v $repo/data/lang_bpe_2000/bpe.model $dst
mkdir -p $dst/test_wavs
cp -v $repo/test_wavs/*.wav $dst/test_wavs
cd $dst
git lfs track "*.onnx" "bpe.model"
ls -lh
file bpe.model
git status
git add .
git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true

log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
rm -rf .git
rm -fv .gitattributes
cd ..
tar cjfv $dst.tar.bz2 $dst
ls -lh *.tar.bz2
mv -v $dst.tar.bz2 ../../../

log "----------------------------------------"
log "Export streaming ONNX transducer models "
log "----------------------------------------"

./zipformer/export-onnx-streaming.py \
  --exp-dir $repo/exp \
  --tokens $repo/data/lang_bpe_2000/tokens.txt \
  --causal 1 \
  --avg 1 \
  --epoch 20 \
  --use-averaged-model 0 \
  --chunk-size 16 \
  --left-context-frames 128 \
  --use-ctc 0

ls -lh $repo/exp

log "------------------------------------------------------------"
log "Test exported streaming ONNX transducer models (Python code)"
log "------------------------------------------------------------"

log "test fp32"
./zipformer/onnx_pretrained-streaming.py \
  --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx \
  --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
  --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx \
  --tokens $repo/data/lang_bpe_2000/tokens.txt \
  $repo/test_wavs/DEV_T0000000000.wav

log "test int8"
./zipformer/onnx_pretrained-streaming.py \
  --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
  --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
  --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
  --tokens $repo/data/lang_bpe_2000/tokens.txt \
  $repo/test_wavs/DEV_T0000000000.wav

log "Upload onnx transducer models to huggingface"

url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12
GIT_LFS_SKIP_SMUDGE=1 git clone $url
dst=$(basename $url)
cp -v $repo/exp/encoder*.onnx $dst
cp -v $repo/exp/decoder*.onnx $dst
cp -v $repo/exp/joiner*.onnx $dst
cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
cp -v $repo/data/lang_bpe_2000/bpe.model $dst
mkdir -p $dst/test_wavs
cp -v $repo/test_wavs/*.wav $dst/test_wavs
cd $dst
git lfs track "*.onnx" bpe.model
git add .
git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true

log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
rm -rf .git
rm -fv .gitattributes
cd ..
tar cjfv $dst.tar.bz2 $dst
ls -lh *.tar.bz2
mv -v $dst.tar.bz2 ../../../
.github/scripts/multi_zh-hans/ASR/run.sh (vendored executable file, 756 lines added)
@@ -0,0 +1,756 @@
#!/usr/bin/env bash

set -ex

git config --global user.name "k2-fsa"
git config --global user.email "csukuangfj@gmail.com"
git config --global lfs.allowincompletepush true

python3 -m pip install onnxmltools==1.13.0 onnx==1.17.0 onnxruntime==1.17.1 sherpa-onnx

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

cd egs/multi_zh-hans/ASR

log "pwd: $PWD"

function run_2023_9_2() {
  repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-2023-9-2
  log "Downloading pre-trained model from $repo_url"
  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
  repo=$(basename $repo_url)
  pushd $repo
  cd exp
  git lfs pull --include pretrained.pt
  ln -s pretrained.pt epoch-99.pt
  cd ../data/lang_bpe_2000
  ls -lh
  git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
  git lfs pull --include "*.model"
  ls -lh
  popd

  log "--------------------------------------------"
  log "Export non-streaming ONNX transducer models "
  log "--------------------------------------------"
  ./zipformer/export-onnx.py \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    --use-averaged-model 0 \
    --epoch 99 \
    --avg 1 \
    --exp-dir $repo/exp \
    --causal False \
    --fp16 1

  ls -lh $repo/exp

  ./zipformer/onnx_pretrained.py \
    --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
    --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
    --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    $repo/test_wavs/DEV_T0000000000.wav \
    $repo/test_wavs/DEV_T0000000001.wav \
    $repo/test_wavs/DEV_T0000000002.wav \
    $repo/test_wavs/TEST_MEETING_T0000000113.wav \
    $repo/test_wavs/TEST_MEETING_T0000000219.wav \
    $repo/test_wavs/TEST_MEETING_T0000000351.wav

  ./zipformer/onnx_pretrained.py \
    --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.int8.onnx \
    --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
    --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.int8.onnx \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    $repo/test_wavs/DEV_T0000000000.wav \
    $repo/test_wavs/DEV_T0000000001.wav \
    $repo/test_wavs/DEV_T0000000002.wav \
    $repo/test_wavs/TEST_MEETING_T0000000113.wav \
    $repo/test_wavs/TEST_MEETING_T0000000219.wav \
    $repo/test_wavs/TEST_MEETING_T0000000351.wav

  ./zipformer/onnx_pretrained.py \
    --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.fp16.onnx \
    --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.fp16.onnx \
    --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.fp16.onnx \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    $repo/test_wavs/DEV_T0000000000.wav \
    $repo/test_wavs/DEV_T0000000001.wav \
    $repo/test_wavs/DEV_T0000000002.wav \
    $repo/test_wavs/TEST_MEETING_T0000000113.wav \
    $repo/test_wavs/TEST_MEETING_T0000000219.wav \
    $repo/test_wavs/TEST_MEETING_T0000000351.wav

  rm -rf $repo
}

function run_2023_11_05_streaming() {
  repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05
  log "Downloading pre-trained model from $repo_url"
  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
  repo=$(basename $repo_url)

  pushd $repo
  cd exp/
  git lfs pull --include pretrained.pt
  rm -fv epoch-20.pt
  rm -fv *.onnx
  ln -s pretrained.pt epoch-20.pt
  cd ../data/lang_bpe_2000
  ls -lh
  git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
  git lfs pull --include "*.model"
  ls -lh
  popd

  log "----------------------------------------"
  log "Export streaming ONNX CTC models "
  log "----------------------------------------"
  ./zipformer/export-onnx-streaming-ctc.py \
    --exp-dir $repo/exp \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    --causal 1 \
    --avg 1 \
    --epoch 20 \
    --use-averaged-model 0 \
    --chunk-size 16 \
    --left-context-frames 128 \
    --use-ctc 1 \
    --fp16 1

  ls -lh $repo/exp/

  log "------------------------------------------------------------"
  log "Test exported streaming ONNX CTC models (greedy search) "
  log "------------------------------------------------------------"

  test_wavs=(
    DEV_T0000000000.wav
    DEV_T0000000001.wav
    DEV_T0000000002.wav
    TEST_MEETING_T0000000113.wav
    TEST_MEETING_T0000000219.wav
    TEST_MEETING_T0000000351.wav
  )

  for w in ${test_wavs[@]}; do
    log "----fp32----"
    ./zipformer/onnx_pretrained-streaming-ctc.py \
      --model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.onnx \
      --tokens $repo/data/lang_bpe_2000/tokens.txt \
      $repo/test_wavs/$w

    log "----int8----"

    ./zipformer/onnx_pretrained-streaming-ctc.py \
      --model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
      --tokens $repo/data/lang_bpe_2000/tokens.txt \
      $repo/test_wavs/$w

    log "----fp16----"

    ./zipformer/onnx_pretrained-streaming-ctc.py \
      --model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
      --tokens $repo/data/lang_bpe_2000/tokens.txt \
      $repo/test_wavs/$w
  done

  log "Upload onnx CTC models to huggingface"
  name=(
    sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
    sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-int8-2023-12-13
    sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-fp16-2023-12-13
  )
  for n in ${name[@]}; do
    url=https://huggingface.co/k2-fsa/$n
    GIT_LFS_SKIP_SMUDGE=1 git clone $url
    dst=$(basename $url)
    if [[ $n == sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 ]]; then
      cp -v $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.onnx $dst
    elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-int8-2023-12-13 ]]; then
      cp -v $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx $dst
    elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-fp16-2023-12-13 ]]; then
      cp -v $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
    fi

    cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
    cp -v $repo/data/lang_bpe_2000/bpe.model $dst
    mkdir -p $dst/test_wavs
    cp -v $repo/test_wavs/*.wav $dst/test_wavs
    cd $dst
    git lfs track "*.onnx" "bpe.model" "*.wav"
    ls -lh
    file bpe.model
    git status
    git add .
    git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true

    log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
    rm -rf .git
    rm -fv .gitattributes
    cd ..
    tar cjfv $dst.tar.bz2 $dst
    ls -lh *.tar.bz2
    mv -v $dst.tar.bz2 ../../../
  done

  log "----------------------------------------"
  log "Export streaming ONNX transducer models "
  log "----------------------------------------"

  ./zipformer/export-onnx-streaming.py \
    --exp-dir $repo/exp \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    --causal 1 \
    --avg 1 \
    --epoch 20 \
    --use-averaged-model 0 \
    --chunk-size 16 \
    --left-context-frames 128 \
    --use-ctc 0 \
    --fp16 1

  ls -lh $repo/exp

  log "------------------------------------------------------------"
  log "Test exported streaming ONNX transducer models (Python code)"
  log "------------------------------------------------------------"

  log "test fp32"
  ./zipformer/onnx_pretrained-streaming.py \
    --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx \
    --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
    --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    $repo/test_wavs/DEV_T0000000000.wav

  log "test int8"
  ./zipformer/onnx_pretrained-streaming.py \
    --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
    --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
    --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    $repo/test_wavs/DEV_T0000000000.wav

  log "test fp16"
  ./zipformer/onnx_pretrained-streaming.py \
    --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
    --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
    --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    $repo/test_wavs/DEV_T0000000000.wav

  name=(
    sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-13
    sherpa-onnx-streaming-zipformer-multi-zh-hans-int8-2023-12-13
    sherpa-onnx-streaming-zipformer-multi-zh-hans-fp16-2023-12-13
  )

  for n in ${name[@]}; do
    url=https://huggingface.co/csukuangfj/$n
    GIT_LFS_SKIP_SMUDGE=1 git clone $url
    dst=$(basename $url)
    if [[ $n == sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-13 ]]; then
      cp -v $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx $dst
      cp -v $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx $dst
      cp -v $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx $dst
    elif [[ $n == sherpa-onnx-streaming-zipformer-multi-zh-hans-int8-2023-12-13 ]]; then
      cp -v $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx $dst
      cp -v $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx $dst
      cp -v $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx $dst
    elif [[ $n == sherpa-onnx-streaming-zipformer-multi-zh-hans-fp16-2023-12-13 ]]; then
      cp -v $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
      cp -v $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
      cp -v $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
    fi

    cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
    cp -v $repo/data/lang_bpe_2000/bpe.model $dst
    mkdir -p $dst/test_wavs
    cp -v $repo/test_wavs/*.wav $dst/test_wavs
    cd $dst
    git lfs track "*.onnx" "bpe.model" "*.wav"
    ls -lh
    file bpe.model
    git status
    git add .
    git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true

    log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
    rm -rf .git
    rm -fv .gitattributes
    cd ..
    tar cjfv $dst.tar.bz2 $dst
    ls -lh *.tar.bz2
    mv -v $dst.tar.bz2 ../../../
  done
}

function run_2023_12_12_streaming() {
  log "Upload onnx transducer models to huggingface"

  url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12
  GIT_LFS_SKIP_SMUDGE=1 git clone $url
  dst=$(basename $url)
  cp -v $repo/exp/encoder*.onnx $dst
  cp -v $repo/exp/decoder*.onnx $dst
  cp -v $repo/exp/joiner*.onnx $dst
  cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
  cp -v $repo/data/lang_bpe_2000/bpe.model $dst
  mkdir -p $dst/test_wavs
  cp -v $repo/test_wavs/*.wav $dst/test_wavs
  cd $dst
  git lfs track "*.onnx" bpe.model "*.wav"
  git add .
  git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true

  log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
  rm -rf .git
  rm -fv .gitattributes
  cd ..
  tar cjfv $dst.tar.bz2 $dst
  ls -lh *.tar.bz2
  mv -v $dst.tar.bz2 ../../../
}

function run_yuekai_large() {
  repo_url=https://csukuangfj:${HF_TOKEN}@huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-large
  log "Downloading pre-trained model from $repo_url"
  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
  repo=$(basename $repo_url)
  pushd $repo
  git lfs pull --include pretrained.pt
  mv pretrained.pt epoch-99.pt
  curl -SL -O https://huggingface.co/pingzxy/icefall-asr-multi-zh-hans-zipformer-large-onnx/resolve/main/tokens.txt
  popd

  log "----------------------------------------"
  log "Export streaming ONNX CTC models "
  log "----------------------------------------"
  ./zipformer/export-onnx-streaming-ctc.py \
    --exp-dir $repo/ \
    --tokens $repo/tokens.txt \
    --causal 1 \
    --avg 1 \
    --epoch 99 \
    --use-averaged-model 0 \
    --chunk-size 16 \
    --left-context-frames 128 \
    --use-ctc 1 \
    \
    --num-encoder-layers 2,2,4,5,4,2 \
    --feedforward-dim 768,1024,1536,2048,1536,768 \
    --encoder-dim 256,384,512,768,512,256 \
    --encoder-unmasked-dim 192,192,256,320,256,192 \
    \
    --fp16 1 \
    --use-whisper-features 1

  ls -lh $repo/
  pushd $repo

  cat >README.md <<EOF
# Introduction

This model is converted
from
https://huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-large

The training code can be found at
https://github.com/k2-fsa/icefall/blob/master/egs/multi_zh-hans/ASR/RESULTS.md#multi-chinese-datasets-char-based-training-results-streaming-on-zipformer-large-model
EOF

  mv -v ctc-epoch-99-avg-1-chunk-16-left-128.fp16.onnx model.fp16.onnx
  mv -v ctc-epoch-99-avg-1-chunk-16-left-128.int8.onnx model.int8.onnx
  mv -v ctc-epoch-99-avg-1-chunk-16-left-128.onnx model.onnx

  ls -lh *.onnx

  mkdir test_wavs
  cd test_wavs
  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/0.wav
  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/1.wav
  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/8k.wav
  popd

  for w in 0.wav 1.wav 8k.wav; do
    log "---fp32---"
    sherpa-onnx \
      --zipformer2-ctc-model=$repo/model.onnx \
      --tokens=$repo/tokens.txt \
      $repo/test_wavs/$w

    log "---int8---"

    sherpa-onnx \
      --zipformer2-ctc-model=$repo/model.int8.onnx \
      --tokens=$repo/tokens.txt \
      $repo/test_wavs/$w

    log "---fp16---"

    sherpa-onnx \
      --zipformer2-ctc-model=$repo/model.fp16.onnx \
      --tokens=$repo/tokens.txt \
      $repo/test_wavs/$w
  done

  name=(
    sherpa-onnx-streaming-zipformer-ctc-zh-2025-06-30
    sherpa-onnx-streaming-zipformer-ctc-zh-int8-2025-06-30
    sherpa-onnx-streaming-zipformer-ctc-zh-fp16-2025-06-30
  )
  for n in ${name[@]}; do
    url=https://huggingface.co/csukuangfj/$n
    GIT_LFS_SKIP_SMUDGE=1 git clone $url
    dst=$(basename $url)
    if [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-2025-06-30 ]]; then
      cp -v $repo/model.onnx $dst
    elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-int8-2025-06-30 ]]; then
      cp -v $repo/model.int8.onnx $dst
    elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-fp16-2025-06-30 ]]; then
      cp -v $repo/model.fp16.onnx $dst
    fi

    cp -v $repo/tokens.txt $dst
    cp -v $repo/README.md $dst
    mkdir -p $dst/test_wavs
    cp -v $repo/test_wavs/*.wav $dst/test_wavs
    cd $dst
    git lfs track "*.onnx" "*.wav"
    ls -lh
    git status
    git add .
    git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true

    log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
    rm -rf .git
    rm -fv .gitattributes
    cd ..
    tar cjfv $dst.tar.bz2 $dst
    ls -lh *.tar.bz2
    mv -v $dst.tar.bz2 ../../../
  done

  rm $repo/*.onnx

  log "----------------------------------------"
  log "Export streaming ONNX transducer models "
  log "----------------------------------------"

  ./zipformer/export-onnx-streaming.py \
    --exp-dir $repo \
    --tokens $repo/tokens.txt \
    --causal 1 \
    --avg 1 \
    --epoch 99 \
    --use-averaged-model 0 \
    --chunk-size 16 \
    --left-context-frames 128 \
    --use-ctc 0 \
    \
    --num-encoder-layers 2,2,4,5,4,2 \
    --feedforward-dim 768,1024,1536,2048,1536,768 \
    --encoder-dim 256,384,512,768,512,256 \
    --encoder-unmasked-dim 192,192,256,320,256,192 \
    \
    --fp16 1 \
    --use-whisper-features 1

  ls -lh $repo
  pushd $repo
  for m in encoder decoder joiner; do
    mv -v $m-epoch-99-avg-1-chunk-16-left-128.onnx $m.onnx
    mv -v $m-epoch-99-avg-1-chunk-16-left-128.fp16.onnx $m.fp16.onnx
    mv -v $m-epoch-99-avg-1-chunk-16-left-128.int8.onnx $m.int8.onnx
  done
  ls -lh *.onnx
  popd

  for w in 0.wav 1.wav 8k.wav; do
    log "---fp32---"
    sherpa-onnx \
      --encoder=$repo/encoder.onnx \
      --decoder=$repo/decoder.onnx \
      --joiner=$repo/joiner.onnx \
      --tokens=$repo/tokens.txt \
      $repo/test_wavs/$w

    log "---int8---"

    sherpa-onnx \
      --encoder=$repo/encoder.int8.onnx \
      --decoder=$repo/decoder.onnx \
      --joiner=$repo/joiner.int8.onnx \
      --tokens=$repo/tokens.txt \
      $repo/test_wavs/$w

    log "---fp16---"

    sherpa-onnx \
      --encoder=$repo/encoder.fp16.onnx \
      --decoder=$repo/decoder.fp16.onnx \
      --joiner=$repo/joiner.fp16.onnx \
      --tokens=$repo/tokens.txt \
      $repo/test_wavs/$w
  done

  name=(
    sherpa-onnx-streaming-zipformer-zh-2025-06-30
    sherpa-onnx-streaming-zipformer-zh-int8-2025-06-30
    sherpa-onnx-streaming-zipformer-zh-fp16-2025-06-30
  )
  for n in ${name[@]}; do
    url=https://huggingface.co/csukuangfj/$n
    GIT_LFS_SKIP_SMUDGE=1 git clone $url
    dst=$(basename $url)
    if [[ $n == sherpa-onnx-streaming-zipformer-zh-2025-06-30 ]]; then
      cp -v $repo/encoder.onnx $dst
      cp -v $repo/decoder.onnx $dst
      cp -v $repo/joiner.onnx $dst
    elif [[ $n == sherpa-onnx-streaming-zipformer-zh-int8-2025-06-30 ]]; then
      cp -v $repo/encoder.int8.onnx $dst
      cp -v $repo/decoder.onnx $dst
      cp -v $repo/joiner.int8.onnx $dst
    elif [[ $n == sherpa-onnx-streaming-zipformer-zh-fp16-2025-06-30 ]]; then
      cp -v $repo/encoder.fp16.onnx $dst
      cp -v $repo/decoder.fp16.onnx $dst
      cp -v $repo/joiner.fp16.onnx $dst
    fi

    cp -v $repo/tokens.txt $dst
    cp -v $repo/README.md $dst
    mkdir -p $dst/test_wavs
    cp -v $repo/test_wavs/*.wav $dst/test_wavs
    cd $dst
    git lfs track "*.onnx" "*.wav"
    ls -lh
    git status
    git add .
    git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true

    log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
    rm -rf .git
    rm -fv .gitattributes
    cd ..
    tar cjfv $dst.tar.bz2 $dst
    ls -lh *.tar.bz2
    mv -v $dst.tar.bz2 ../../../
  done
}

function run_yuekai_xl() {
  repo_url=https://csukuangfj:${HF_TOKEN}@huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-xl
  log "Downloading pre-trained model from $repo_url"
  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
  repo=$(basename $repo_url)

  pushd $repo
  git lfs pull --include pretrained.pt
  git lfs pull --include data/lang_bpe_2000/bpe.model
  mv pretrained.pt epoch-99.pt
  ls -lh *.pt
  popd

  log "----------------------------------------"
  log "Export streaming ONNX CTC models "
  log "----------------------------------------"
  ./zipformer/export-onnx-streaming-ctc.py \
    --exp-dir $repo/ \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    --causal 1 \
    --avg 1 \
    --epoch 99 \
    --use-averaged-model 0 \
    --chunk-size 16 \
    --left-context-frames 128 \
    --use-ctc 1 \
    \
    --num-encoder-layers 2,3,5,6,5,3 \
    --feedforward-dim 1536,2048,3072,4096,3072,1536 \
    --encoder-dim 512,768,1024,1536,1024,512 \
    --encoder-unmasked-dim 192,192,256,320,256,192 \
    --decoder-dim 768 --joiner-dim 768 \
    --value-head-dim 18 \
    --query-head-dim 48 \
    --num-heads 4,4,4,8,4,4 \
    \
    --fp16 1 \
    --use-whisper-features 1 \
    --use-external-data 1

  mv -v ctc-epoch-99-avg-1-chunk-16-left-128.int8.onnx model.int8.onnx
  mv -v ctc-epoch-99-avg-1-chunk-16-left-128.fp16.onnx model.fp16.onnx

  ls -lh *.onnx

  mkdir test_wavs
  pushd test_wavs
  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/0.wav
  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/1.wav
  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/8k.wav
  popd

  for w in 0.wav 1.wav 8k.wav; do
    log "---int8---"

    sherpa-onnx \
      --zipformer2-ctc-model=./model.int8.onnx \
      --tokens=$repo/data/lang_bpe_2000/tokens.txt \
      test_wavs/$w

    log "---fp16---"

    sherpa-onnx \
      --zipformer2-ctc-model=./model.fp16.onnx \
      --tokens=$repo/data/lang_bpe_2000/tokens.txt \
      test_wavs/$w
  done

  pushd $repo
  cat >README.md <<EOF
# Introduction

This model is converted
from
https://huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-xl

The training code can be found at
https://github.com/k2-fsa/icefall/blob/master/egs/multi_zh-hans/ASR/RESULTS.md#multi-chinese-datasets-char-based-training-results-streaming-on-zipformer-xl-model
EOF
  popd

  name=(
    sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-int8-2025-06-30
    sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-fp16-2025-06-30
  )

  for n in ${name[@]}; do
    url=https://huggingface.co/csukuangfj/$n
    GIT_LFS_SKIP_SMUDGE=1 git clone $url
    dst=$(basename $url)
    if [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-fp16-2025-06-30 ]]; then
      cp -v model.fp16.onnx $dst
    elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-int8-2025-06-30 ]]; then
      cp -v model.int8.onnx $dst
    fi

    cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
    cp -v $repo/data/lang_bpe_2000/bpe.model $dst
    cp -v $repo/README.md $dst
    mkdir -p $dst/test_wavs
    cp -v ./test_wavs/*.wav $dst/test_wavs
    cd $dst
    git lfs track "*.onnx" "*.wav" "bpe.model"
    ls -lh
    git status
    git add .
    git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true

    log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
    rm -rf .git
    rm -fv .gitattributes
    cd ..

    ls -lh $dst
    tar cjfv $dst.tar.bz2 $dst
    ls -lh *.tar.bz2
    mv -v $dst.tar.bz2 ../../../
  done

  rm -fv *.onnx *.weights

  log "----------------------------------------"
  log "Export streaming ONNX transducer models "
  log "----------------------------------------"

  ./zipformer/export-onnx-streaming.py \
    --exp-dir $repo/ \
    --tokens $repo/data/lang_bpe_2000/tokens.txt \
    --causal 1 \
    --avg 1 \
    --epoch 99 \
    --use-averaged-model 0 \
    --chunk-size 16 \
    --left-context-frames 128 \
    --use-ctc 0 \
    \
    --num-encoder-layers 2,3,5,6,5,3 \
    --feedforward-dim 1536,2048,3072,4096,3072,1536 \
    --encoder-dim 512,768,1024,1536,1024,512 \
    --encoder-unmasked-dim 192,192,256,320,256,192 \
    --decoder-dim 768 --joiner-dim 768 \
    --value-head-dim 18 \
    --query-head-dim 48 \
    --num-heads 4,4,4,8,4,4 \
    \
    --fp16 1 \
    --use-whisper-features 1 \
    --use-external-data 1

  ls -lh *.onnx
  ls -lh *.weights

  mv encoder-epoch-99-avg-1-chunk-16-left-128.fp16.onnx encoder.fp16.onnx
  mv encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx encoder.int8.onnx

  mv $repo/decoder-epoch-99-avg-1-chunk-16-left-128.onnx decoder.onnx
  mv $repo/decoder-epoch-99-avg-1-chunk-16-left-128.fp16.onnx decoder.fp16.onnx

  mv $repo/joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx joiner.int8.onnx
  mv $repo/joiner-epoch-99-avg-1-chunk-16-left-128.fp16.onnx joiner.fp16.onnx

  name=(
    sherpa-onnx-streaming-zipformer-zh-xlarge-int8-2025-06-30
    sherpa-onnx-streaming-zipformer-zh-xlarge-fp16-2025-06-30
  )

  for n in ${name[@]}; do
    url=https://huggingface.co/csukuangfj/$n
    GIT_LFS_SKIP_SMUDGE=1 git clone $url
    dst=$(basename $url)
    if [[ $n == sherpa-onnx-streaming-zipformer-zh-xlarge-fp16-2025-06-30 ]]; then
      cp -v encoder.fp16.onnx $dst
      cp -v decoder.fp16.onnx $dst
      cp -v joiner.fp16.onnx $dst
    elif [[ $n == sherpa-onnx-streaming-zipformer-zh-xlarge-int8-2025-06-30 ]]; then
      cp -v encoder.int8.onnx $dst
      cp -v decoder.onnx $dst
      cp -v joiner.int8.onnx $dst
    fi

    cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
    cp -v $repo/data/lang_bpe_2000/bpe.model $dst
    cp -v $repo/README.md $dst
    mkdir -p $dst/test_wavs
    cp -v ./test_wavs/*.wav $dst/test_wavs
    cd $dst
    git lfs track "*.onnx" "*.wav" "bpe.model"
    ls -lh
    git status
    git add .
    git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true

    log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
    rm -rf .git
    rm -fv .gitattributes
    cd ..

    ls -lh $dst
    tar cjfv $dst.tar.bz2 $dst
    ls -lh *.tar.bz2
    mv -v $dst.tar.bz2 ../../../
  done

  rm -fv *.onnx *.weights
}

# run_yuekai_large
# run_yuekai_xl
# run_2023_9_2
run_2023_11_05_streaming
# run_2023_12_12_streaming
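The test loops above shell out to the recipe's ONNX decoding scripts once per wav and per precision. The same loop written in Python, using only flags that appear in the script (a sketch; it assumes the model repo has been cloned into the current directory as run.sh does):

# Sketch of the bash test loop in Python (same flags as in run.sh above).
import subprocess

repo = "icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05"
wavs = [
    "DEV_T0000000000.wav",
    "DEV_T0000000001.wav",
    "DEV_T0000000002.wav",
]

for w in wavs:
    subprocess.run(
        [
            "./zipformer/onnx_pretrained-streaming-ctc.py",
            "--model-filename",
            f"{repo}/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx",
            "--tokens",
            f"{repo}/data/lang_bpe_2000/tokens.txt",
            f"{repo}/test_wavs/{w}",
        ],
        check=True,  # mirrors `set -e` in the bash script
    )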
.github/workflows/aishell.yml (vendored, 6 changed lines)
@@ -17,7 +17,7 @@ concurrency:

 jobs:
   generate_build_matrix:
-    if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event.label.name == 'ready' || github.event_name == 'push' || github.event_name == 'aishell')
+    if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'

     # see https://github.com/pytorch/pytorch/pull/50633
     runs-on: ubuntu-latest
@@ -31,8 +31,8 @@ jobs:
       id: set-matrix
       run: |
         # outputting for debugging purposes
-        python ./.github/scripts/docker/generate_build_matrix.py
-        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+        python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
         echo "::set-output name=matrix::${MATRIX}"
   aishell:
     needs: generate_build_matrix
.github/workflows/audioset.yml (vendored, 10 changed lines)
@@ -30,8 +30,8 @@ jobs:
       id: set-matrix
       run: |
         # outputting for debugging purposes
-        python ./.github/scripts/docker/generate_build_matrix.py
-        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+        python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
         echo "::set-output name=matrix::${MATRIX}"

   audioset:
@@ -83,7 +83,7 @@ jobs:
         ls -lh ./model-onnx/*

       - name: Upload model to huggingface
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         uses: nick-fields/retry@v3
@@ -116,7 +116,7 @@ jobs:
         rm -rf huggingface

       - name: Prepare for release
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
         shell: bash
         run: |
           d=sherpa-onnx-zipformer-audio-tagging-2024-04-09
@@ -125,7 +125,7 @@ jobs:
           ls -lh

       - name: Release exported onnx models
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
         uses: svenstaro/upload-release-action@v2
         with:
           file_glob: true
.github/workflows/baker_zh.yml (vendored, 20 changed lines)
@@ -31,8 +31,8 @@ jobs:
       id: set-matrix
       run: |
         # outputting for debugging purposes
-        python ./.github/scripts/docker/generate_build_matrix.py --min-torch-version "2.3"
-        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --min-torch-version "2.3")
+        python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
         echo "::set-output name=matrix::${MATRIX}"

   baker_zh:
@@ -84,43 +84,43 @@ jobs:
           ls -lh

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
           path: ./*.wav

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: step-2
           path: ./model-steps-2.onnx

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: step-3
           path: ./model-steps-3.onnx

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: step-4
           path: ./model-steps-4.onnx

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: step-5
           path: ./model-steps-5.onnx

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: step-6
           path: ./model-steps-6.onnx

       - name: Upload models to huggingface
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
         shell: bash
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -141,7 +141,7 @@ jobs:
           popd

       - name: Release exported onnx models
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
         uses: svenstaro/upload-release-action@v2
         with:
           file_glob: true
.github/workflows/build-doc.yml (vendored, 4 changed lines)
@@ -34,7 +34,7 @@ concurrency:

 jobs:
   build-doc:
-    if: github.event.label.name == 'doc' || github.event_name == 'push'
+    # if: github.event.label.name == 'doc' || github.event_name == 'push'
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
@@ -43,7 +43,7 @@ jobs:
         python-version: ["3.8"]
     steps:
       # refer to https://github.com/actions/checkout
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
.github/workflows/librispeech.yml (vendored, 5 changed lines)
@@ -29,8 +29,9 @@ jobs:
       id: set-matrix
       run: |
         # outputting for debugging purposes
-        python ./.github/scripts/docker/generate_build_matrix.py
-        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+        python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+        # MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
+        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10" --min-torch-version "2.6.0")
         echo "::set-output name=matrix::${MATRIX}"
   librispeech:
     needs: generate_build_matrix
.github/workflows/ljspeech.yml (vendored, 22 changed lines)
@@ -30,8 +30,8 @@ jobs:
       id: set-matrix
       run: |
         # outputting for debugging purposes
-        python ./.github/scripts/docker/generate_build_matrix.py --min-torch-version "2.3"
-        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --min-torch-version "2.3")
+        python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
         echo "::set-output name=matrix::${MATRIX}"

   ljspeech:
@@ -83,13 +83,13 @@ jobs:
           ls -lh

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
           path: ./*.wav

       - name: Release exported onnx models
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
         uses: svenstaro/upload-release-action@v2
         with:
           file_glob: true
@@ -100,37 +100,37 @@ jobs:
           tag: tts-models

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: step-2
           path: ./model-steps-2.onnx

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: step-3
           path: ./model-steps-3.onnx

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: step-4
           path: ./model-steps-4.onnx

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: step-5
           path: ./model-steps-5.onnx

       - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         with:
           name: step-6
           path: ./model-steps-6.onnx

       - name: Upload models to huggingface
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         shell: bash
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -155,7 +155,7 @@ jobs:
           popd

       - name: Release exported onnx models
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
         uses: svenstaro/upload-release-action@v2
         with:
           file_glob: true
.github/workflows/multi-zh-hans.yml (vendored, 85 changed lines)
@ -1,4 +1,4 @@
name: run-multi-zh-hans
name: multi-zh-hans

on:
  push:
@ -8,65 +8,72 @@ on:
  workflow_dispatch:

concurrency:
  group: run-multi-zh-hans-${{ github.ref }}
  group: multi-zh-hans-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: write

jobs:
  generate_build_matrix:
    if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
    # see https://github.com/pytorch/pytorch/pull/50633
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Generating build matrix
        id: set-matrix
        run: |
          # outputting for debugging purposes
          python ./.github/scripts/docker/generate_build_matrix.py --torch-version "2.7.0" --python-version "3.11"
          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --torch-version "2.7.0" --python-version "3.11")
          echo "::set-output name=matrix::${MATRIX}"

  multi-zh-hans:
    runs-on: ${{ matrix.os }}
    needs: generate_build_matrix
    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: [3.8]
        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          cache-dependency-path: '**/requirements-ci.txt'

      - name: Install Python dependencies
        run: |
          grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
          pip uninstall -y protobuf
          pip install --no-binary protobuf protobuf==3.20.*

      - name: Cache kaldifeat
        id: my-cache
        uses: actions/cache@v2
        with:
          path: |
            ~/tmp/kaldifeat
          key: cache-tmp-${{ matrix.python-version }}-2023-05-22

      - name: Install kaldifeat
        if: steps.my-cache.outputs.cache-hit != 'true'
      - name: Free space
        shell: bash
        run: |
          .github/scripts/install-kaldifeat.sh
          df -h
          rm -rf /opt/hostedtoolcache
          df -h
          echo "pwd: $PWD"
          echo "github.workspace ${{ github.workspace }}"

      - name: export-model
      - name: Test with multi_zh-hans
        uses: addnab/docker-run-action@v3
        with:
          image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
          options: |
            --volume ${{ github.workspace }}/:/icefall
          shell: bash
          run: |
            export PYTHONPATH=/icefall:$PYTHONPATH
            export HF_TOKEN=${{ secrets.HF_TOKEN }}
            cd /icefall
            git config --global --add safe.directory /icefall

            .github/scripts/multi_zh-hans/ASR/run.sh

      - name: Show models
        shell: bash
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          sudo apt-get -qq install git-lfs tree
          export PYTHONPATH=$PWD:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH

          .github/scripts/multi-zh-hans.sh
          ls -lh
          ls -lh *.tar.bz2

      - name: upload model to https://github.com/k2-fsa/sherpa-onnx
        uses: svenstaro/upload-release-action@v2
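Note: the matrix job above relies on generate_build_matrix.py printing a single
JSON object that `${{ fromJson(...) }}` expands into matrix entries. A minimal
sketch of that contract, with the entry fields inferred from how the workflow
consumes them (python-version, torch-version, version); the values below are
placeholders, not output of the real script:

    # build_matrix_sketch.py -- hypothetical illustration of the JSON contract.
    import json

    matrix = {
        "include": [
            # one entry per (python, torch) combination to test
            {"python-version": "3.11", "torch-version": "2.7.0", "version": "20250101"},
        ]
    }
    # The workflow captures this single stdout line into $MATRIX and forwards it
    # via `echo "::set-output name=matrix::${MATRIX}"`.
    print(json.dumps(matrix))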
4
.github/workflows/test.yml
vendored
@ -30,8 +30,8 @@ jobs:
      id: set-matrix
      run: |
        # outputting for debugging purposes
        python ./.github/scripts/docker/generate_build_matrix.py
        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
        python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
        echo "::set-output name=matrix::${MATRIX}"
  test:
    needs: generate_build_matrix
5
.github/workflows/yesno.yml
vendored
@ -30,8 +30,9 @@ jobs:
      id: set-matrix
      run: |
        # outputting for debugging purposes
        python ./.github/scripts/docker/generate_build_matrix.py
        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
        python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
        MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
        # MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10" --min-torch-version "2.5.0")
        echo "::set-output name=matrix::${MATRIX}"
  yesno:
    needs: generate_build_matrix
@ -79,7 +79,13 @@ from icefall.checkpoint import save_checkpoint_with_global_batch_idx
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -638,7 +644,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -912,7 +918,7 @@ def scan_pessimistic_batches_for_oom(
        # warmup = 0.0 is so that the derivs for the pruned loss stay zero
        # (i.e. are not remembered by the decaying-average in adam), because
        # we want to avoid these params being subject to shrinkage in adam.
        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, _ = compute_loss(
                params=params,
                model=model,
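The `torch.cuda.amp.autocast(...)` -> `torch_autocast(...)` substitution repeated
throughout these training scripts tracks PyTorch's deprecation of the CUDA-specific
autocast entry point in favor of the device-generic `torch.amp.autocast`. The real
helper is imported from icefall.utils; the following is only a hedged sketch of
what such a compatibility shim can look like, not the actual implementation:

    # Hypothetical compatibility shim (illustrative only).
    from contextlib import contextmanager

    import torch

    @contextmanager
    def torch_autocast(device_type: str = "cuda", **kwargs):
        """Use torch.amp.autocast where available; otherwise fall back
        to the older torch.cuda.amp.autocast API."""
        if hasattr(torch, "amp") and hasattr(torch.amp, "autocast"):
            with torch.amp.autocast(device_type=device_type, **kwargs):
                yield
        else:
            with torch.cuda.amp.autocast(**kwargs):
                yield

With a shim of this shape, call sites stay one-line drop-in replacements, e.g.
`with torch_autocast(enabled=params.use_fp16): ...`.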
@ -72,7 +72,13 @@ from icefall.checkpoint import save_checkpoint_with_global_batch_idx
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -688,7 +694,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -989,7 +995,7 @@ def scan_pessimistic_batches_for_oom(
        # warmup = 0.0 is so that the derivs for the pruned loss stay zero
        # (i.e. are not remembered by the decaying-average in adam), because
        # we want to avoid these params being subject to shrinkage in adam.
        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, _ = compute_loss(
                params=params,
                model=model,
@ -23,7 +23,7 @@ import torch.nn as nn
from encoder_interface import EncoderInterface
from scaling import ScaledLinear

from icefall.utils import add_sos
from icefall.utils import add_sos, torch_autocast


class Transducer(nn.Module):
@ -184,7 +184,7 @@ class Transducer(nn.Module):
        lm = simple_lm_proj(decoder_out)
        am = simple_am_proj(encoder_out)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -219,7 +219,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = joiner(am_pruned, lm_pruned, project_input=False)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
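Note that the model-level hunks keep autocast disabled around the k2 transducer
losses: activations are cast back with `.float()` and the loss runs in fp32 even
when the surrounding step uses fp16. A minimal runnable illustration of this
fp32-island pattern, with plain torch ops standing in for k2.rnnt_loss_smoothed
and k2.rnnt_loss_pruned:

    # Numerically sensitive reductions stay in float32 under an fp16 region.
    import torch

    if torch.cuda.is_available():
        x = torch.randn(4, 8, device="cuda")
        with torch.amp.autocast("cuda", enabled=True):
            h = x @ x.T  # computed in fp16 under autocast
            with torch.amp.autocast("cuda", enabled=False):
                loss = (h.float() ** 2).mean()  # fp32 island
        print(h.dtype, loss.dtype)  # torch.float16 torch.float32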
@ -94,7 +94,13 @@ from icefall.checkpoint import (
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -797,7 +803,7 @@ def train_one_epoch(
        aishell = is_aishell(batch["supervisions"]["cut"][0])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1202,7 +1208,7 @@ def scan_pessimistic_batches_for_oom(
        # warmup = 0.0 is so that the derivs for the pruned loss stay zero
        # (i.e. are not remembered by the decaying-average in adam), because
        # we want to avoid these params being subject to shrinkage in adam.
        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, _ = compute_loss(
                params=params,
                model=model,
@ -94,6 +94,7 @@ from icefall.utils import (
    filter_uneven_sized_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -809,7 +810,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1206,7 +1207,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -87,6 +87,7 @@ from icefall.utils import (
    setup_logger,
    str2bool,
    tokenize_by_CJK_char,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -802,7 +803,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1202,7 +1203,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -81,7 +81,13 @@ from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -812,7 +818,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1202,7 +1208,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -92,7 +92,7 @@ class AishellAsrDataModule:
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            default=15,
            help="The number of buckets for the DynamicBucketingSampler"
            "(you might want to increase it for larger datasets).",
        )
@ -275,8 +275,7 @@ class AishellAsrDataModule:
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                buffer_size=self.args.num_buckets * 2000,
                shuffle_buffer_size=self.args.num_buckets * 5000,
                buffer_size=self.args.num_buckets * 5000,
                drop_last=self.args.drop_last,
            )
        else:
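These datamodule hunks halve the default bucket count (30 -> 15) and replace the
separate buffer_size / shuffle_buffer_size pair with a single, larger buffer_size.
A hedged sketch of where those knobs land in lhotse's API; the manifest path and
max_duration below are placeholders, not values taken from this recipe:

    # Illustrative only: the tuned sampler knobs in isolation.
    from lhotse import load_manifest_lazy
    from lhotse.dataset import DynamicBucketingSampler

    cuts_train = load_manifest_lazy("data/fbank/cuts_train.jsonl.gz")  # placeholder
    num_buckets = 15  # new default (was 30)

    sampler = DynamicBucketingSampler(
        cuts_train,
        max_duration=200.0,              # seconds of audio per batch
        shuffle=True,
        num_buckets=num_buckets,
        buffer_size=num_buckets * 5000,  # consolidated buffer (was * 2000 plus a shuffle buffer)
        drop_last=True,
    )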
@ -81,6 +81,7 @@ from icefall.utils import (
    filter_uneven_sized_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -514,7 +515,7 @@ def compute_validation_loss(
    tot_loss = MetricsTracker()

    for batch_idx, batch in enumerate(valid_dl):
        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                tokenizer=tokenizer,
@ -608,7 +609,7 @@ def train_one_epoch(
        )

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    tokenizer=tokenizer,
@ -95,6 +95,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -910,7 +911,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1302,7 +1303,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -92,6 +92,7 @@ from icefall.utils import (
    setup_logger,
    str2bool,
    tokenize_by_CJK_char,
    torch_autocast,
)

@ -495,7 +496,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -895,7 +896,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -104,7 +104,7 @@ class AiShell2AsrDataModule:
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            default=15,
            help="The number of buckets for the DynamicBucketingSampler"
            "(you might want to increase it for larger datasets).",
        )
@ -296,8 +296,7 @@ class AiShell2AsrDataModule:
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                buffer_size=self.args.num_buckets * 2000,
                shuffle_buffer_size=self.args.num_buckets * 5000,
                buffer_size=self.args.num_buckets * 5000,
                drop_last=self.args.drop_last,
            )
        else:
@ -90,7 +90,13 @@ from icefall.checkpoint import (
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -734,7 +740,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1062,7 +1068,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -83,7 +83,13 @@ from icefall.checkpoint import (
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -727,7 +733,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])
        # print(batch["supervisions"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1034,7 +1040,7 @@ def scan_pessimistic_batches_for_oom(
        # warmup = 0.0 is so that the derivs for the pruned loss stay zero
        # (i.e. are not remembered by the decaying-average in adam), because
        # we want to avoid these params being subject to shrinkage in adam.
        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, _ = compute_loss(
                params=params,
                model=model,
@ -79,7 +79,13 @@ from icefall.checkpoint import save_checkpoint_with_global_batch_idx
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -638,7 +644,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -912,7 +918,7 @@ def scan_pessimistic_batches_for_oom(
        # warmup = 0.0 is so that the derivs for the pruned loss stay zero
        # (i.e. are not remembered by the decaying-average in adam), because
        # we want to avoid these params being subject to shrinkage in adam.
        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, _ = compute_loss(
                params=params,
                model=model,
@ -73,7 +73,13 @@ from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -782,7 +788,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1127,7 +1133,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -71,7 +71,13 @@ from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -773,7 +779,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1134,7 +1140,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -76,7 +76,13 @@ from icefall.checkpoint import (
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -1067,7 +1073,7 @@ def train_one_epoch(
        batch_size = batch["inputs"].shape[0]

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -76,7 +76,13 @@ from icefall.checkpoint import (
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -1058,7 +1064,7 @@ def train_one_epoch(
        batch_size = batch["inputs"].shape[0]

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -74,6 +74,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -799,7 +800,7 @@ def train_one_epoch(
        num_samples += batch_size

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1148,7 +1149,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -73,6 +73,8 @@ def compute_fbank_baker_zh(num_jobs: int):
        f_min=0,
        f_max=8000,
    )
    if not torch.cuda.is_available():
        config.device = "cpu"

    prefix = "baker_zh"
    suffix = "jsonl.gz"
@ -88,6 +88,7 @@ from icefall.utils import (
    filter_uneven_sized_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -825,7 +826,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1220,7 +1221,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -90,6 +90,7 @@ from icefall.utils import (
    filter_uneven_sized_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -895,7 +896,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1293,7 +1294,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -81,7 +81,13 @@ from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -840,7 +846,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1237,7 +1243,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -97,6 +97,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -969,7 +970,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1365,7 +1366,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -97,6 +97,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -604,7 +605,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -784,7 +785,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -83,7 +83,13 @@ from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
LOG_EPS = math.log(1e-10)
@ -838,7 +844,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1245,7 +1251,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -101,7 +101,7 @@ class GigaSpeechAsrDataModule:
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            default=15,
            help="The number of buckets for the DynamicBucketingSampler"
            "(you might want to increase it for larger datasets).",
        )
@ -294,8 +294,7 @@ class GigaSpeechAsrDataModule:
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                buffer_size=self.args.num_buckets * 2000,
                shuffle_buffer_size=self.args.num_buckets * 5000,
                buffer_size=self.args.num_buckets * 5000,
                drop_last=True,
            )
        else:
@ -77,7 +77,13 @@ from icefall.checkpoint import (
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -675,7 +681,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -944,7 +950,7 @@ def scan_pessimistic_batches_for_oom(
        # warmup = 0.0 is so that the derivs for the pruned loss stay zero
        # (i.e. are not remembered by the decaying-average in adam), because
        # we want to avoid these params being subject to shrinkage in adam.
        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, _ = compute_loss(
                params=params,
                model=model,
@ -105,7 +105,7 @@ class GigaSpeechAsrDataModule:
        group.add_argument(
            "--num-buckets",
            type=int,
            default=100,
            default=15,
            help="The number of buckets for the DynamicBucketingSampler"
            "(you might want to increase it for larger datasets).",
        )
@ -311,8 +311,7 @@ class GigaSpeechAsrDataModule:
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                buffer_size=self.args.num_buckets * 2000,
                shuffle_buffer_size=self.args.num_buckets * 5000,
                buffer_size=self.args.num_buckets * 5000,
                drop_last=self.args.drop_last,
            )
        else:
@ -369,7 +368,7 @@ class GigaSpeechAsrDataModule:
            cuts_valid,
            max_duration=self.args.max_duration,
            num_buckets=self.args.num_buckets,
            buffer_size=self.args.num_buckets * 2000,
            buffer_size=self.args.num_buckets * 5000,
            shuffle=False,
        )
        logging.info("About to create dev dataloader")
@ -97,6 +97,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -958,7 +959,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1317,7 +1318,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -1,477 +0,0 @@
# Copyright 2021 Piotr Żelasko
# Copyright 2024 Xiaomi Corporation (Author: Wei Kang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import glob
import inspect
import logging
import re
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Optional

import lhotse
import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
from lhotse.dataset import (
    CutConcatenate,
    CutMix,
    DynamicBucketingSampler,
    K2SpeechRecognitionDataset,
    PrecomputedFeatures,
    SimpleCutSampler,
    SpecAugment,
)
from lhotse.dataset.input_strategies import AudioSamples, OnTheFlyFeatures
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader

from icefall.utils import str2bool


class _SeedWorkers:
    def __init__(self, seed: int):
        self.seed = seed

    def __call__(self, worker_id: int):
        fix_random_seed(self.seed + worker_id)


class GigaSpeechAsrDataModule:
    """
    DataModule for k2 ASR experiments.
    It assumes there is always one train and valid dataloader,
    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
    and test-other).

    It contains all the common data pipeline modules used in ASR
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - augmentation,
    - on-the-fly feature extraction

    This class should be derived for specific corpora used in ASR tasks.
    """

    def __init__(self, args: argparse.Namespace):
        self.args = args

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        group = parser.add_argument_group(
            title="ASR data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )
        group.add_argument(
            "--manifest-dir",
            type=Path,
            default=Path("data/fbank"),
            help="Path to directory with train/valid/test cuts.",
        )
        group.add_argument(
            "--max-duration",
            type=int,
            default=200.0,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )
        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="The number of buckets for the DynamicBucketingSampler"
            "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
            "--concatenate-cuts",
            type=str2bool,
            default=False,
            help="When enabled, utterances (cuts) will be concatenated "
            "to minimize the amount of padding.",
        )
        group.add_argument(
            "--duration-factor",
            type=float,
            default=1.0,
            help="Determines the maximum duration of a concatenated cut "
            "relative to the duration of the longest cut in a batch.",
        )
        group.add_argument(
            "--gap",
            type=float,
            default=1.0,
            help="The amount of padding (in seconds) inserted between "
            "concatenated cuts. This padding is filled with noise when "
            "noise augmentation is used.",
        )
        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )
        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
        )
        group.add_argument(
            "--drop-last",
            type=str2bool,
            default=True,
            help="Whether to drop last batch. Used by sampler.",
        )
        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=True,
            help="When enabled, each batch will have the "
            "field: batch['supervisions']['cut'] with the cuts that "
            "were used to construct it.",
        )

        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="The number of training dataloader workers that "
            "collect the batches.",
        )

        group.add_argument(
            "--enable-spec-aug",
            type=str2bool,
            default=True,
            help="When enabled, use SpecAugment for training dataset.",
        )

        group.add_argument(
            "--spec-aug-time-warp-factor",
            type=int,
            default=80,
            help="Used only when --enable-spec-aug is True. "
            "It specifies the factor for time warping in SpecAugment. "
            "Larger values mean more warping. "
            "A value less than 1 means to disable time warp.",
        )

        group.add_argument(
            "--enable-musan",
            type=str2bool,
            default=True,
            help="When enabled, select noise from MUSAN and mix it"
            "with training dataset. ",
        )

        group.add_argument(
            "--input-strategy",
            type=str,
            default="PrecomputedFeatures",
            help="AudioSamples or PrecomputedFeatures",
        )

        # GigaSpeech specific arguments
        group.add_argument(
            "--subset",
            type=str,
            default="XL",
            help="Select the GigaSpeech subset (XS|S|M|L|XL)",
        )
        group.add_argument(
            "--small-dev",
            type=str2bool,
            default=False,
            help="Should we use only 1000 utterances for dev (speeds up training)",
        )

    def train_dataloaders(
        self,
        cuts_train: CutSet,
        sampler_state_dict: Optional[Dict[str, Any]] = None,
    ) -> DataLoader:
        """
        Args:
          cuts_train:
            CutSet for training.
          sampler_state_dict:
            The state dict for the training sampler.
        """
        transforms = []
        if self.args.enable_musan:
            logging.info("Enable MUSAN")
            logging.info("About to get Musan cuts")
            cuts_musan = load_manifest(self.args.manifest_dir / "musan_cuts.jsonl.gz")
            transforms.append(
                CutMix(cuts=cuts_musan, p=0.5, snr=(10, 20), preserve_id=True)
            )
        else:
            logging.info("Disable MUSAN")

        if self.args.concatenate_cuts:
            logging.info(
                f"Using cut concatenation with duration factor "
                f"{self.args.duration_factor} and gap {self.args.gap}."
            )
            # Cut concatenation should be the first transform in the list,
            # so that if we e.g. mix noise in, it will fill the gaps between
            # different utterances.
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms

        input_transforms = []
        if self.args.enable_spec_aug:
            logging.info("Enable SpecAugment")
            logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}")
            # Set the value of num_frame_masks according to Lhotse's version.
            # In different Lhotse's versions, the default of num_frame_masks is
            # different.
            num_frame_masks = 10
            num_frame_masks_parameter = inspect.signature(
                SpecAugment.__init__
            ).parameters["num_frame_masks"]
            if num_frame_masks_parameter.default == 1:
                num_frame_masks = 2
            logging.info(f"Num frame mask: {num_frame_masks}")
            input_transforms.append(
                SpecAugment(
                    time_warp_factor=self.args.spec_aug_time_warp_factor,
                    num_frame_masks=num_frame_masks,
                    features_mask_size=27,
                    num_feature_masks=2,
                    frames_mask_size=100,
                )
            )
        else:
            logging.info("Disable SpecAugment")

        logging.info("About to create train dataset")
        train = K2SpeechRecognitionDataset(
            input_strategy=eval(self.args.input_strategy)(),
            cut_transforms=transforms,
            input_transforms=input_transforms,
            return_cuts=self.args.return_cuts,
        )

        if self.args.on_the_fly_feats:
            # NOTE: the PerturbSpeed transform should be added only if we
            # remove it from data prep stage.
            # Add on-the-fly speed perturbation; since originally it would
            # have increased epoch size by 3, we will apply prob 2/3 and use
            # 3x more epochs.
            # Speed perturbation probably should come first before
            # concatenation, but in principle the transforms order doesn't have
            # to be strict (e.g. could be randomized)
            # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa
            # Drop feats to be on the safe side.
            train = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
                input_transforms=input_transforms,
                return_cuts=self.args.return_cuts,
            )

        if self.args.bucketing_sampler:
            logging.info("Using DynamicBucketingSampler.")
            train_sampler = DynamicBucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                drop_last=self.args.drop_last,
                buffer_size=self.args.num_buckets * 2000,
                shuffle_buffer_size=self.args.num_buckets * 5000,
            )
        else:
            logging.info("Using SimpleCutSampler.")
            train_sampler = SimpleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
            )
        logging.info("About to create train dataloader")

        if sampler_state_dict is not None:
            logging.info("Loading sampler state dict")
            train_sampler.load_state_dict(sampler_state_dict)

        # 'seed' is derived from the current random state, which will have
        # previously been set in the main process.
        seed = torch.randint(0, 100000, ()).item()
        worker_init_fn = _SeedWorkers(seed)

        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
            worker_init_fn=worker_init_fn,
        )

        return train_dl

    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
        transforms = []
        if self.args.concatenate_cuts:
            transforms = [
                CutConcatenate(
                    duration_factor=self.args.duration_factor, gap=self.args.gap
                )
            ] + transforms

        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))),
                return_cuts=self.args.return_cuts,
            )
        else:
            validate = K2SpeechRecognitionDataset(
                cut_transforms=transforms,
                return_cuts=self.args.return_cuts,
            )
        valid_sampler = DynamicBucketingSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            num_buckets=self.args.num_buckets,
            buffer_size=self.args.num_buckets * 2000,
            shuffle=False,
        )
        logging.info("About to create dev dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )

        return valid_dl

    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        logging.debug("About to create test dataset")
        test = K2SpeechRecognitionDataset(
            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
            if self.args.on_the_fly_feats
            else eval(self.args.input_strategy)(),
            return_cuts=self.args.return_cuts,
        )
        sampler = DynamicBucketingSampler(
            cuts,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        logging.debug("About to create test dataloader")
        test_dl = DataLoader(
            test,
            batch_size=None,
            sampler=sampler,
            num_workers=self.args.num_workers,
        )
        return test_dl

    @lru_cache()
    def train_cuts(self) -> CutSet:
        logging.info(f"About to get train {self.args.subset} cuts")
        if self.args.subset == "XL":
            filenames = glob.glob(
                f"{self.args.manifest_dir}/XL_split/gigaspeech_cuts_XL.*.jsonl.gz"
            )
            pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz")
            idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames)
            idx_filenames = sorted(idx_filenames, key=lambda x: x[0])
            sorted_filenames = [f[1] for f in idx_filenames]
            logging.info(
                f"Loading GigaSpeech {len(sorted_filenames)} splits in lazy mode"
            )

            cuts_train = lhotse.combine(
                lhotse.load_manifest_lazy(p) for p in sorted_filenames
            )
        else:
            path = (
                self.args.manifest_dir / f"gigaspeech_cuts_{self.args.subset}.jsonl.gz"
            )
            cuts_train = CutSet.from_jsonl_lazy(path)
        return cuts_train

    @lru_cache()
    def dev_cuts(self) -> CutSet:
        logging.info("About to get dev cuts")
        cuts_valid = load_manifest_lazy(
            self.args.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz"
        )
        if self.args.small_dev:
            return cuts_valid.subset(first=1000)
        else:
            return cuts_valid

    @lru_cache()
    def test_cuts(self) -> CutSet:
        logging.info("About to get test cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz"
        )

    @lru_cache()
    def fsc_train_cuts(self) -> CutSet:
        logging.info("About to get fluent speech commands train cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "fluent_speech_commands_cuts_train.jsonl.gz"
        )

    @lru_cache()
    def fsc_valid_cuts(self) -> CutSet:
        logging.info("About to get fluent speech commands valid cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "fluent_speech_commands_cuts_valid.jsonl.gz"
        )

    @lru_cache()
    def fsc_test_small_cuts(self) -> CutSet:
        logging.info("About to get fluent speech commands small test cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "fluent_speech_commands_cuts_small.jsonl.gz"
        )

    @lru_cache()
    def fsc_test_large_cuts(self) -> CutSet:
        logging.info("About to get fluent speech commands large test cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "fluent_speech_commands_cuts_large.jsonl.gz"
        )
1
egs/gigaspeech/KWS/zipformer/asr_datamodule.py
Symbolic link
@ -0,0 +1 @@
../../ASR/zipformer/asr_datamodule.py
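The 477-line local copy above is deleted in favor of a one-line symlink to the
shared zipformer datamodule, so call sites keep working unchanged. A sketch of
the typical usage pattern (the argument values below are illustrative, not
taken from any specific recipe):

    # Illustrative driver code; the import resolves via the new symlink.
    import argparse

    from asr_datamodule import GigaSpeechAsrDataModule

    parser = argparse.ArgumentParser()
    GigaSpeechAsrDataModule.add_arguments(parser)
    args = parser.parse_args(["--subset", "XS", "--max-duration", "200"])

    datamodule = GigaSpeechAsrDataModule(args)
    train_dl = datamodule.train_dataloaders(datamodule.train_cuts())
    valid_dl = datamodule.valid_dataloaders(datamodule.dev_cuts())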
@ -97,6 +97,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -961,7 +962,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1320,7 +1321,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -77,7 +77,13 @@ from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -805,7 +811,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1196,7 +1202,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -92,6 +92,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -942,7 +943,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1333,7 +1334,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -667,7 +667,9 @@ def main():
        H = None
        bpe_model = None
        HLG = k2.Fsa.from_dict(
            torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
            torch.load(
                f"{params.lang_dir}/HLG.pt", map_location=device, weights_only=False
            )
        )
        assert HLG.requires_grad is False

@ -707,7 +709,9 @@ def main():
            torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
        else:
            logging.info("Loading pre-compiled G_4_gram.pt")
            d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device)
            d = torch.load(
                params.lm_dir / "G_4_gram.pt", map_location=device, weights_only=False
            )
            G = k2.Fsa.from_dict(d)

        if params.method in [
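The `weights_only=False` additions in these decoding scripts are required because
PyTorch 2.6 changed the default of torch.load's weights_only argument to True,
which restricts unpickling to plain tensor payloads; the HLG/G checkpoints here
store a dict of k2 FSA fields that the weights-only unpickler rejects. A hedged
sketch of the failure mode and the fix ("HLG.pt" is a placeholder path, and
weights_only=False should only be used on checkpoints you trust, since it runs
the full pickle machinery):

    import torch

    try:
        # PyTorch >= 2.6 default is weights_only=True
        d = torch.load("HLG.pt", map_location="cpu")
    except Exception as e:
        # Typically an unpickling error on non-tensor objects in the dict.
        print(f"weights-only load failed: {e}")
        d = torch.load("HLG.pt", map_location="cpu", weights_only=False)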
@ -271,7 +271,7 @@ def main():
        use_feat_batchnorm=params.use_feat_batchnorm,
    )

    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
@ -351,7 +351,9 @@ def main():
        "attention-decoder",
    ]:
        logging.info(f"Loading HLG from {params.HLG}")
        HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu"))
        HLG = k2.Fsa.from_dict(
            torch.load(params.HLG, map_location="cpu", weights_only=False)
        )
        HLG = HLG.to(device)
        if not hasattr(HLG, "lm_scores"):
            # For whole-lattice-rescoring and attention-decoder
@ -362,7 +364,9 @@ def main():
        "attention-decoder",
    ]:
        logging.info(f"Loading G from {params.G}")
        G = k2.Fsa.from_dict(torch.load(params.G, map_location="cpu"))
        G = k2.Fsa.from_dict(
            torch.load(params.G, map_location="cpu", weights_only=False)
        )
        # Add epsilon self-loops to G as we will compose
        # it with the whole lattice later
        G = G.to(device)
@ -774,7 +774,9 @@ def main():
        H = None
        bpe_model = None
        HLG = k2.Fsa.from_dict(
            torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
            torch.load(
                f"{params.lang_dir}/HLG.pt", map_location=device, weights_only=False
            )
        )
        assert HLG.requires_grad is False

@ -814,7 +816,9 @@ def main():
            torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
        else:
            logging.info("Loading pre-compiled G_4_gram.pt")
            d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device)
            d = torch.load(
                params.lm_dir / "G_4_gram.pt", map_location=device, weights_only=False
            )
            G = k2.Fsa.from_dict(d)

        if params.method in [
@ -65,7 +65,6 @@ from lhotse.dataset.sampling.base import CutSampler
from lhotse.utils import fix_random_seed
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -84,9 +83,11 @@ from icefall.lexicon import Lexicon
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    encode_supervisions,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -420,7 +421,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -629,7 +630,7 @@ def train_one_epoch(
    scheduler: LRSchedulerType,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -676,7 +677,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -965,7 +966,7 @@ def run(rank, world_size, args):
            params=params,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1036,7 +1037,7 @@ def scan_pessimistic_batches_for_oom(
        # warmup = 0.0 is so that the derivs for the pruned loss stay zero
        # (i.e. are not remembered by the decaying-average in adam), because
        # we want to avoid these params being subject to shrinkage in adam.
        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, _ = compute_loss(
                params=params,
                model=model,
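The training scripts in this diff all route mixed-precision calls through two indirection points in `icefall.utils`: `torch_autocast` and `create_grad_scaler`. Newer PyTorch deprecates `torch.cuda.amp.autocast(...)` and `torch.cuda.amp.GradScaler(...)` in favor of `torch.amp.autocast("cuda", ...)` and `torch.amp.GradScaler("cuda", ...)`, so funneling every call site through one wrapper keeps the recipes working across versions. A plausible sketch of such shims, assuming version-based dispatch (the real definitions live in `icefall/utils.py` and may differ in detail):

```python
import torch
from packaging import version

# Plausible sketches of the compat shims this diff switches to; the actual
# implementations in icefall/utils.py may differ.
_NEW_AMP_API = version.parse(torch.__version__.split("+")[0]) >= version.parse("2.3.0")


def torch_autocast(device_type: str = "cuda", **kwargs):
    # Newer PyTorch deprecates torch.cuda.amp.autocast(...) in favor of
    # torch.amp.autocast("cuda", ...); dispatch on the installed version.
    if _NEW_AMP_API:
        return torch.amp.autocast(device_type=device_type, **kwargs)
    return torch.cuda.amp.autocast(**kwargs)


def create_grad_scaler(device: str = "cuda", **kwargs):
    # Same idea for GradScaler: torch.amp.GradScaler("cuda", ...) replaces
    # the deprecated torch.cuda.amp.GradScaler(...); kwargs such as
    # enabled= and init_scale= pass straight through.
    if _NEW_AMP_API:
        return torch.amp.GradScaler(device, **kwargs)
    return torch.cuda.amp.GradScaler(**kwargs)
```

The related annotation change from `scaler: Optional[GradScaler]` to `scaler: Optional["GradScaler"]` turns the type into a string literal, so the deprecated `from torch.cuda.amp import GradScaler` import can be dropped without breaking the signatures.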
@ -868,7 +868,9 @@ def main():
        H = None
        bpe_model = None
        HLG = k2.Fsa.from_dict(
            torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
            torch.load(
                f"{params.lang_dir}/HLG.pt", map_location=device, weights_only=False
            )
        )
        assert HLG.requires_grad is False

@ -907,7 +909,9 @@ def main():
        torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
    else:
        logging.info("Loading pre-compiled G_4_gram.pt")
        d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device)
        d = torch.load(
            params.lm_dir / "G_4_gram.pt", map_location=device, weights_only=False
        )
        G = k2.Fsa.from_dict(d)

    if params.decoding_method == "whole-lattice-rescoring":
@ -334,7 +334,9 @@ def main():
        "whole-lattice-rescoring",
    ]:
        logging.info(f"Loading HLG from {params.HLG}")
        HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu"))
        HLG = k2.Fsa.from_dict(
            torch.load(params.HLG, map_location="cpu", weights_only=False)
        )
        HLG = HLG.to(device)
        if not hasattr(HLG, "lm_scores"):
            # For whole-lattice-rescoring and attention-decoder
@ -345,7 +347,9 @@ def main():
        "whole-lattice-rescoring",
    ]:
        logging.info(f"Loading G from {params.G}")
        G = k2.Fsa.from_dict(torch.load(params.G, map_location="cpu"))
        G = k2.Fsa.from_dict(
            torch.load(params.G, map_location="cpu", weights_only=False)
        )
        G = G.to(device)
        if params.method == "whole-lattice-rescoring":
            # Add epsilon self-loops to G as we will compose
@ -290,7 +290,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
@ -386,7 +386,9 @@ def main():
        "whole-lattice-rescoring",
    ]:
        logging.info(f"Loading HLG from {params.HLG}")
        HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu"))
        HLG = k2.Fsa.from_dict(
            torch.load(params.HLG, map_location="cpu", weights_only=False)
        )
        HLG = HLG.to(device)
        if not hasattr(HLG, "lm_scores"):
            # For whole-lattice-rescoring and attention-decoder
@ -397,7 +399,9 @@ def main():
        "whole-lattice-rescoring",
    ]:
        logging.info(f"Loading G from {params.G}")
        G = k2.Fsa.from_dict(torch.load(params.G, map_location="cpu"))
        G = k2.Fsa.from_dict(
            torch.load(params.G, map_location="cpu", weights_only=False)
        )
        G = G.to(device)
        if params.method == "whole-lattice-rescoring":
            # Add epsilon self-loops to G as we will compose
@ -76,7 +76,6 @@ from lhotse.utils import fix_random_seed
from model import CTCModel
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -95,9 +94,11 @@ from icefall.lexicon import Lexicon
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    encode_supervisions,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -493,7 +494,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -694,7 +695,7 @@ def train_one_epoch(
    graph_compiler: Union[BpeCtcTrainingGraphCompiler, CtcTrainingGraphCompiler],
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -743,7 +744,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1004,7 +1005,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1073,7 +1074,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -574,7 +574,9 @@ def main():
        H = None
        bpe_model = None
        HLG = k2.Fsa.from_dict(
            torch.load(f"{params.lang_dir}/HLG.pt", map_location="cpu")
            torch.load(
                f"{params.lang_dir}/HLG.pt", map_location="cpu", weights_only=False
            )
        )
        HLG = HLG.to(device)
        assert HLG.requires_grad is False
@ -609,7 +611,9 @@ def main():
        torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
    else:
        logging.info("Loading pre-compiled G_4_gram.pt")
        d = torch.load(params.lm_dir / "G_4_gram.pt", map_location="cpu")
        d = torch.load(
            params.lm_dir / "G_4_gram.pt", map_location="cpu", weights_only=False
        )
        G = k2.Fsa.from_dict(d).to(device)

    if params.method in ["whole-lattice-rescoring", "attention-decoder"]:
@ -80,7 +80,6 @@ from lhotse.utils import fix_random_seed
from model import Transducer
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -93,7 +92,14 @@ from icefall.checkpoint import (
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -560,7 +566,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -727,7 +733,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -772,7 +778,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1002,7 +1008,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1071,7 +1077,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -80,7 +80,6 @@ from lhotse.utils import fix_random_seed
from model import Transducer
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -93,7 +92,14 @@ from icefall.checkpoint import (
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -560,7 +566,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -727,7 +733,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -772,7 +778,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1001,7 +1007,7 @@ def run(rank, world_size, args):
            params=params,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1072,7 +1078,7 @@ def scan_pessimistic_batches_for_oom(
        # warmup = 0.0 is so that the derivs for the pruned loss stay zero
        # (i.e. are not remembered by the decaying-average in adam), because
        # we want to avoid these params being subject to shrinkage in adam.
        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, _ = compute_loss(
                params=params,
                model=model,
@ -72,11 +72,11 @@ def compile_HLG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
    max_token_id = max(lexicon.tokens)
    logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
    H = k2.ctc_topo(max_token_id)
    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt", weights_only=False))

    if Path(f"data/lm/{lm}.pt").is_file():
        logging.info(f"Loading pre-compiled {lm}")
        d = torch.load(f"data/lm/{lm}.pt")
        d = torch.load(f"data/lm/{lm}.pt", weights_only=False)
        G = k2.Fsa.from_dict(d)
    else:
        logging.info(f"Loading {lm}.fst.txt")
@ -66,11 +66,11 @@ def compile_LG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
        An FSA representing LG.
    """
    lexicon = Lexicon(lang_dir)
    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt", weights_only=False))

    if Path(f"data/lm/{lm}.pt").is_file():
        logging.info(f"Loading pre-compiled {lm}")
        d = torch.load(f"data/lm/{lm}.pt")
        d = torch.load(f"data/lm/{lm}.pt", weights_only=False)
        G = k2.Fsa.from_dict(d)
    else:
        logging.info(f"Loading {lm}.fst.txt")
@ -750,7 +750,7 @@ def main():
        lg_filename = params.lang_dir / "LG.pt"
        logging.info(f"Loading {lg_filename}")
        decoding_graph = k2.Fsa.from_dict(
            torch.load(lg_filename, map_location=device)
            torch.load(lg_filename, map_location=device, weights_only=False)
        )
        decoding_graph.scores *= params.ngram_lm_scale
    else:
@ -23,7 +23,7 @@ import torch.nn as nn
from encoder_interface import EncoderInterface
from scaling import ScaledLinear

from icefall.utils import add_sos
from icefall.utils import add_sos, torch_autocast


class Transducer(nn.Module):
@ -156,7 +156,7 @@ class Transducer(nn.Module):
        lm = self.simple_lm_proj(decoder_out)
        am = self.simple_am_proj(encoder_out)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -192,7 +192,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = self.joiner(am_pruned, lm_pruned, project_input=False)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
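Note that in the model files the two k2 RNN-T loss calls are wrapped in `torch_autocast(enabled=False)` rather than the fp16 context used elsewhere: the loss is numerically sensitive, so autocast is disabled and the inputs are explicitly cast with `.float()` so the loss is computed in float32 even when the surrounding forward pass runs under mixed precision. An illustrative, generic version of the pattern (not icefall code):

```python
import torch
import torch.nn.functional as F


def stable_loss_under_amp(logits: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
    # Inside an fp16 autocast region, temporarily disable autocast and cast
    # inputs to float32 before a numerically sensitive reduction, mirroring
    # how the diff wraps k2.rnnt_loss_smoothed / k2.rnnt_loss_pruned.
    with torch.amp.autocast("cuda", enabled=False):
        return F.cross_entropy(logits.float(), targets)
```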
@ -238,7 +238,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
@ -66,7 +66,6 @@ from lstm import RNN
from model import Transducer
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -82,9 +81,11 @@ from icefall.env import get_env_info
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -521,7 +522,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -717,7 +718,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -763,7 +764,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1023,7 +1024,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1092,7 +1093,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -935,7 +935,7 @@ def main():
        lg_filename = params.lang_dir / "LG.pt"
        logging.info(f"Loading {lg_filename}")
        decoding_graph = k2.Fsa.from_dict(
            torch.load(lg_filename, map_location=device)
            torch.load(lg_filename, map_location=device, weights_only=False)
        )
        decoding_graph.scores *= params.ngram_lm_scale
    else:
@ -23,7 +23,7 @@ import torch.nn as nn
from encoder_interface import EncoderInterface
from scaling import ScaledLinear

from icefall.utils import add_sos
from icefall.utils import add_sos, torch_autocast


class Transducer(nn.Module):
@ -195,7 +195,7 @@ class Transducer(nn.Module):
        lm = simple_lm_proj(decoder_out)
        am = simple_am_proj(encoder_out)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -231,7 +231,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = joiner(am_pruned, lm_pruned, project_input=False)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
@ -241,7 +241,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
@ -74,7 +74,6 @@ from lstm import RNN
from model import Transducer
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -90,9 +89,11 @@ from icefall.env import get_env_info
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -560,7 +561,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -772,7 +773,7 @@ def train_one_epoch(
    giga_train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    rng: random.Random,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -848,7 +849,7 @@ def train_one_epoch(
        libri = is_libri(batch["supervisions"]["cut"][0])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1176,7 +1177,7 @@ def run(rank, world_size, args):
    else:
        logging.info("Skip scan_pessimistic_batches_for_oom")

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1247,7 +1248,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -815,7 +815,7 @@ def main():
        lg_filename = params.lang_dir / "LG.pt"
        logging.info(f"Loading {lg_filename}")
        decoding_graph = k2.Fsa.from_dict(
            torch.load(lg_filename, map_location=device)
            torch.load(lg_filename, map_location=device, weights_only=False)
        )
        decoding_graph.scores *= params.ngram_lm_scale
    else:
@ -239,7 +239,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
@ -66,7 +66,6 @@ from lstm import RNN
from model import Transducer
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -82,9 +81,11 @@ from icefall.env import get_env_info
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -551,7 +552,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -747,7 +748,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -793,7 +794,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1067,7 +1068,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1136,7 +1137,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -21,7 +21,7 @@ import torch.nn as nn
from encoder_interface import EncoderInterface
from scaling import ScaledLinear

from icefall.utils import add_sos
from icefall.utils import add_sos, torch_autocast


class Transducer(nn.Module):
@ -141,7 +141,7 @@ class Transducer(nn.Module):
        lm = self.simple_lm_proj(decoder_out)
        am = self.simple_am_proj(encoder_out)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -176,7 +176,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = self.joiner(am_pruned, lm_pruned, project_input=False)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
@ -10,9 +10,11 @@ from typing import Optional, Tuple
import torch
from scaling import ScaledLinear
from torch import Tensor, nn
from torch.cuda.amp import GradScaler, custom_bwd, custom_fwd
from torch.cuda.amp import custom_bwd, custom_fwd
from torch_scheduled_sampling import sample_combined

from icefall.utils import create_grad_scaler, torch_autocast

# The main exports of this file are the module KnowledgeBaseLookup and the
# function create_knowledge_base.

@ -330,14 +332,14 @@ def _test_knowledge_base_lookup_autocast():
    optimizer = Eve(m.parameters(), lr=0.005, eps=1.0e-04)
    m = m.to(device)

    scaler = GradScaler(enabled=True)
    scaler = create_grad_scaler(enabled=True)

    start = timeit.default_timer()

    for epoch in range(150):
        for n, (x, y) in enumerate(train_pairs):
            y_out = m(x)
            with torch.cuda.amp.autocast(enabled=True):
            with torch_autocast(enabled=True):
                loss = ((y_out - y) ** 2).mean() * 100.0
            if n % 10 == 0 and epoch % 10 == 0:
                print(f"Epoch {epoch}, batch {n}, loss {loss.item()}")
@ -66,7 +66,6 @@ from lhotse.utils import fix_random_seed
from model import Transducer
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -76,7 +75,14 @@ from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
from icefall.checkpoint import save_checkpoint_with_global_batch_idx
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    create_grad_scaler,
    AttributeDict,
    MetricsTracker,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -453,7 +459,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -608,7 +614,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
    rank: int = 0,
@ -650,7 +656,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -868,7 +874,7 @@ def run(rank, world_size, args):
            params=params,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -937,7 +943,7 @@ def scan_pessimistic_batches_for_oom(
        # warmup = 0.0 is so that the derivs for the pruned loss stay zero
        # (i.e. are not remembered by the decaying-average in adam), because
        # we want to avoid these params being subject to shrinkage in adam.
        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, _ = compute_loss(
                params=params,
                model=model,
@ -55,7 +55,6 @@ from lhotse.utils import fix_random_seed
from model import Transducer
from noam import Noam
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -68,7 +67,14 @@ from icefall.checkpoint import (
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    setup_logger,
    str2bool,
    torch_autocast,
)


def add_model_arguments(parser: argparse.ArgumentParser):
@ -496,7 +502,7 @@ def save_checkpoint(
    model_avg: Optional[nn.Module] = None,
    optimizer: Optional[torch.optim.Optimizer] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, and training stats to file.
@ -650,7 +656,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -693,7 +699,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -939,7 +945,7 @@ def run(rank, world_size, args):
            params=params,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1004,7 +1010,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -741,7 +741,7 @@ def main():
        lg_filename = params.lang_dir / "LG.pt"
        logging.info(f"Loading {lg_filename}")
        decoding_graph = k2.Fsa.from_dict(
            torch.load(lg_filename, map_location=device)
            torch.load(lg_filename, map_location=device, weights_only=False)
        )
        decoding_graph.scores *= params.ngram_lm_scale
    else:
@ -1347,7 +1347,10 @@ def modified_beam_search(
                    (
                        context_score,
                        new_context_state,
                    ) = context_graph.forward_one_step(hyp.context_state, new_token)
                        _,
                    ) = context_graph.forward_one_step(
                        hyp.context_state, new_token, strict_mode=False
                    )

                new_log_prob = topk_log_probs[k] + context_score

@ -2853,7 +2856,10 @@ def modified_beam_search_LODR(
                    (
                        context_score,
                        new_context_state,
                    ) = context_graph.forward_one_step(hyp.context_state, new_token)
                        _,
                    ) = context_graph.forward_one_step(
                        hyp.context_state, new_token, strict_mode=False
                    )

                ys.append(new_token)
                state_cost = hyp.state_cost.forward_one_step(new_token)
@ -754,7 +754,7 @@ def main():
        lg_filename = params.lang_dir / "LG.pt"
        logging.info(f"Loading {lg_filename}")
        decoding_graph = k2.Fsa.from_dict(
            torch.load(lg_filename, map_location=device)
            torch.load(lg_filename, map_location=device, weights_only=False)
        )
        decoding_graph.scores *= params.ngram_lm_scale
    else:
@ -23,7 +23,7 @@ import torch.nn as nn
from encoder_interface import EncoderInterface
from scaling import ScaledLinear

from icefall.utils import add_sos
from icefall.utils import add_sos, torch_autocast


class Transducer(nn.Module):
@ -157,7 +157,7 @@ class Transducer(nn.Module):
        lm = self.simple_lm_proj(decoder_out)
        am = self.simple_am_proj(encoder_out)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -193,7 +193,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = self.joiner(am_pruned, lm_pruned, project_input=False)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
@ -265,7 +265,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
@ -78,7 +78,6 @@ from lhotse.utils import fix_random_seed
from model import Transducer
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -91,9 +90,11 @@ from icefall.env import get_env_info
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -523,7 +524,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -716,7 +717,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
    rank: int = 0,
@ -759,7 +760,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1000,7 +1001,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 0 else 1.0,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1067,7 +1068,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -921,7 +921,7 @@ def load_ngram_LM(

    if pt_file.is_file():
        logging.info(f"Loading pre-compiled {pt_file}")
        d = torch.load(pt_file, map_location=device)
        d = torch.load(pt_file, map_location=device, weights_only=False)
        G = k2.Fsa.from_dict(d)
        G = k2.add_epsilon_self_loops(G)
        G = k2.arc_sort(G)
@ -1101,7 +1101,7 @@ def main():
        lg_filename = params.lang_dir / "LG.pt"
        logging.info(f"Loading {lg_filename}")
        decoding_graph = k2.Fsa.from_dict(
            torch.load(lg_filename, map_location=device)
            torch.load(lg_filename, map_location=device, weights_only=False)
        )
        decoding_graph.scores *= params.ngram_lm_scale
    elif params.decoding_method in [
@ -23,7 +23,7 @@ import torch.nn as nn
from encoder_interface import EncoderInterface
from scaling import ScaledLinear

from icefall.utils import add_sos
from icefall.utils import add_sos, torch_autocast


class Transducer(nn.Module):
@ -195,7 +195,7 @@ class Transducer(nn.Module):
        lm = simple_lm_proj(decoder_out)
        am = simple_am_proj(encoder_out)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -231,7 +231,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = joiner(am_pruned, lm_pruned, project_input=False)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
@ -274,7 +274,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
@ -74,7 +74,6 @@ from librispeech import LibriSpeech
from model import Transducer
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -87,9 +86,11 @@ from icefall.env import get_env_info
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -546,7 +547,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -755,7 +756,7 @@ def train_one_epoch(
    giga_train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    rng: random.Random,
    scaler: GradScaler,
    scaler: "GradScaler",
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
    rank: int = 0,
@ -827,7 +828,7 @@ def train_one_epoch(

        libri = is_libri(batch["supervisions"]["cut"][0])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1126,7 +1127,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 0 else 1.0,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1195,7 +1196,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -913,7 +913,7 @@ def main():
        lg_filename = params.lang_dir / "LG.pt"
        logging.info(f"Loading {lg_filename}")
        decoding_graph = k2.Fsa.from_dict(
            torch.load(lg_filename, map_location=device)
            torch.load(lg_filename, map_location=device, weights_only=False)
        )
        decoding_graph.scores *= params.ngram_lm_scale
    else:
@ -80,7 +80,6 @@ from lhotse.utils import fix_random_seed
from model import Transducer
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -96,9 +95,11 @@ from icefall.env import get_env_info
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -548,7 +549,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -744,7 +745,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -789,7 +790,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1047,7 +1048,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1116,7 +1117,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -972,7 +972,7 @@ def main():
        lg_filename = params.lang_dir / "LG.pt"
        logging.info(f"Loading {lg_filename}")
        decoding_graph = k2.Fsa.from_dict(
            torch.load(lg_filename, map_location=device)
            torch.load(lg_filename, map_location=device, weights_only=False)
        )
        decoding_graph.scores *= params.ngram_lm_scale
    else:
@ -238,7 +238,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
@ -68,7 +68,6 @@ from lhotse.utils import fix_random_seed
from model import Transducer
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -84,9 +83,11 @@ from icefall.env import get_env_info
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -571,7 +572,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -768,7 +769,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -814,7 +815,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1078,7 +1079,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1147,7 +1148,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -23,7 +23,7 @@ import torch.nn as nn
from encoder_interface import EncoderInterface
from scaling import ScaledLinear

from icefall.utils import add_sos
from icefall.utils import add_sos, torch_autocast


class Transducer(nn.Module):
@ -185,7 +185,7 @@ class Transducer(nn.Module):
        lm = self.simple_lm_proj(decoder_out)
        am = self.simple_am_proj(encoder_out)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -220,7 +220,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = self.joiner(am_pruned, lm_pruned, project_input=False)

        with torch.cuda.amp.autocast(enabled=False):
        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
@ -80,7 +80,6 @@ from lhotse.utils import fix_random_seed
from model import Transducer
from optim import Eden, Eve
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter

@ -96,9 +95,11 @@ from icefall.env import get_env_info
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -519,7 +520,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -736,7 +737,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -781,7 +782,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

        with torch.cuda.amp.autocast(enabled=params.use_fp16):
        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1039,7 +1040,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

    scaler = GradScaler(enabled=params.use_fp16)
    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1108,7 +1109,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
@ -348,7 +348,9 @@ class CodebookIndexExtractor:
            num_codebooks=self.params.num_codebooks,
            codebook_size=256,
        )
        quantizer.load_state_dict(torch.load(self.quantizer_file_path))
        quantizer.load_state_dict(
            torch.load(self.quantizer_file_path, weights_only=False)
        )
        quantizer.to(self.params.device)
        return quantizer
@ -289,7 +289,7 @@ def main():
    logging.info("About to create model")
    model = get_transducer_model(params)

    checkpoint = torch.load(args.checkpoint, map_location="cpu")
    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
@ -910,7 +910,7 @@ def main():
        lg_filename = params.lang_dir / "LG.pt"
        logging.info(f"Loading {lg_filename}")
        decoding_graph = k2.Fsa.from_dict(
            torch.load(lg_filename, map_location=device)
            torch.load(lg_filename, map_location=device, weights_only=False)
        )
        decoding_graph.scores *= params.ngram_lm_scale
    else:
@ -813,7 +813,7 @@ def main():
        lg_filename = params.lang_dir / "LG.pt"
        logging.info(f"Loading {lg_filename}")
        decoding_graph = k2.Fsa.from_dict(
            torch.load(lg_filename, map_location=device)
            torch.load(lg_filename, map_location=device, weights_only=False)
        )
        decoding_graph.scores *= params.ngram_lm_scale
    else:
@ -66,7 +66,6 @@ from lhotse.utils import fix_random_seed
from model import Transducer
from optim import Eden, ScaledAdam
from torch import Tensor
from torch.cuda.amp import GradScaler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.tensorboard import SummaryWriter
from zipformer import Zipformer
@ -85,9 +84,11 @@ from icefall.hooks import register_inf_check_hooks
from icefall.utils import (
    AttributeDict,
    MetricsTracker,
    create_grad_scaler,
    filter_uneven_sized_batch,
    setup_logger,
    str2bool,
    torch_autocast,
)

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -635,7 +636,7 @@ def load_model_params(

    """
    logging.info(f"Loading checkpoint from {ckpt}")
    checkpoint = torch.load(ckpt, map_location="cpu")
    checkpoint = torch.load(ckpt, map_location="cpu", weights_only=False)

    # if module list is empty, load the whole model from ckpt
    if not init_modules:
@ -678,7 +679,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
    scaler: Optional[GradScaler] = None,
    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -857,7 +858,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -903,7 +904,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1219,7 +1220,7 @@ def run(rank, world_size, args):
            params=params,
        )

    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
    scaler = create_grad_scaler(enabled=params.use_fp16, init_scale=1.0)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1319,7 +1320,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
            with torch.cuda.amp.autocast(enabled=params.use_fp16):
            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
Some files were not shown because too many files have changed in this diff.