Merge branch 'k2-fsa:master' into cr-ctc-aishell

2025-12-10 22:45:27 +00:00 · 2025-07-04 16:03:44 +08:00 · 2025-07-04 16:03:44 +08:00 · 94e828e9ab
commit 94e828e9ab
parent bbc163901a fba5e67d5e
185 changed files with 1898 additions and 790 deletions
--- a/.github/scripts/docker/Dockerfile
+++ b/.github/scripts/docker/Dockerfile
@ -55,9 +55,9 @@ RUN pip install --no-cache-dir \
      "numpy<2.0" \
      onnxoptimizer \
      onnxsim \
-      onnx \
+      onnx==1.17.0 \
      onnxmltools \
-      onnxruntime \
+      onnxruntime==1.17.1 \
      piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html \
      pypinyin==0.50.0 \
      pytest \
--- a/.github/scripts/docker/generate_build_matrix.py
+++ b/.github/scripts/docker/generate_build_matrix.py
@ -63,23 +63,24 @@ def get_torchaudio_version(torch_version):


 def get_matrix(min_torch_version, specified_torch_version, specified_python_version):
-    k2_version = "1.24.4.dev20241029"
-    kaldifeat_version = "1.25.5.dev20241029"
-    version = "20241218"
+    k2_version = "1.24.4.dev20250630"
+    kaldifeat_version = "1.25.5.dev20250630"
+    version = "20250630"

    # torchaudio 2.5.0 does not support python 3.13
-    python_version = ["3.8", "3.9", "3.10", "3.11", "3.12"]
+    python_version = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
    torch_version = []
    torch_version += ["1.13.0", "1.13.1"]
    torch_version += ["2.0.0", "2.0.1"]
-    #  torch_version += ["2.1.0", "2.1.1", "2.1.2"]
-    #  torch_version += ["2.2.0", "2.2.1", "2.2.2"]
+    torch_version += ["2.1.0", "2.1.1", "2.1.2"]
+    torch_version += ["2.2.0", "2.2.1", "2.2.2"]
    # Test only torch >= 2.3.0
    torch_version += ["2.3.0", "2.3.1"]
    torch_version += ["2.4.0"]
    torch_version += ["2.4.1"]
    torch_version += ["2.5.0"]
    torch_version += ["2.5.1"]
+    torch_version += ["2.6.0", "2.7.0", "2.7.1"]

    if specified_torch_version:
        torch_version = [specified_torch_version]
@ -109,10 +110,6 @@ def get_matrix(min_torch_version, specified_torch_version, specified_python_vers
                # torch>=2.5 requires python 3.10
                continue

-            if t == "2.5.1":
-                k2_version_2 = "1.24.4.dev20241122"
-                kaldifeat_version_2 = "1.25.5.dev20241126"
-            else:
            k2_version_2 = k2_version
            kaldifeat_version_2 = kaldifeat_version

--- a/.github/scripts/multi-zh-hans.sh
+++ b/.github/scripts/multi-zh-hans.sh
@ -1,200 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-git config --global user.name "k2-fsa"
-git config --global user.email "csukuangfj@gmail.com"
-git config --global lfs.allowincompletepush true
-
-log() {
-  # This function is from espnet
-  local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
-}
-
-log "pwd: $PWD"
-
-cd egs/multi_zh-hans/ASR
-
-repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-2023-9-2
-log "Downloading pre-trained model from $repo_url"
-GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
-repo=$(basename $repo_url)
-pushd $repo
-cd exp
-git lfs pull --include pretrained.pt
-ln -s pretrained.pt epoch-99.pt
-cd ../data/lang_bpe_2000
-ls -lh
-git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
-git lfs pull --include "*.model"
-ls -lh
-popd
-
-log "--------------------------------------------"
-log "Export non-streaming ONNX transducer models "
-log "--------------------------------------------"
-./zipformer/export-onnx.py \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  --use-averaged-model 0 \
-  --epoch 99 \
-  --avg 1 \
-  --exp-dir $repo/exp \
-  --causal False
-
-ls -lh $repo/exp
-
-./zipformer/onnx_pretrained.py \
-  --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
-  --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
-  --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  $repo/test_wavs/DEV_T0000000000.wav \
-  $repo/test_wavs/DEV_T0000000001.wav \
-  $repo/test_wavs/DEV_T0000000002.wav \
-  $repo/test_wavs/TEST_MEETING_T0000000113.wav \
-  $repo/test_wavs/TEST_MEETING_T0000000219.wav \
-  $repo/test_wavs/TEST_MEETING_T0000000351.wav
-
-rm -rf $repo
-
-repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05
-log "Downloading pre-trained model from $repo_url"
-GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
-repo=$(basename $repo_url)
-
-pushd $repo
-cd exp/
-git lfs pull --include pretrained.pt
-rm -fv epoch-20.pt
-rm -fv *.onnx
-ln -s pretrained.pt epoch-20.pt
-cd ../data/lang_bpe_2000
-ls -lh
-git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
-git lfs pull --include "*.model"
-ls -lh
-popd
-
-log "----------------------------------------"
-log "Export streaming ONNX CTC models "
-log "----------------------------------------"
-./zipformer/export-onnx-streaming-ctc.py \
-  --exp-dir $repo/exp \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  --causal 1 \
-  --avg 1 \
-  --epoch 20 \
-  --use-averaged-model 0 \
-  --chunk-size 16 \
-  --left-context-frames 128 \
-  --use-ctc 1
-
-ls -lh $repo/exp/
-
-log "------------------------------------------------------------"
-log "Test exported streaming ONNX CTC models (greedy search)     "
-log "------------------------------------------------------------"
-
-test_wavs=(
-DEV_T0000000000.wav
-DEV_T0000000001.wav
-DEV_T0000000002.wav
-TEST_MEETING_T0000000113.wav
-TEST_MEETING_T0000000219.wav
-TEST_MEETING_T0000000351.wav
-)
-
-for w in ${test_wavs[@]}; do
-  ./zipformer/onnx_pretrained-streaming-ctc.py \
-    --model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
-    --tokens $repo/data/lang_bpe_2000/tokens.txt \
-    $repo/test_wavs/$w
-done
-
-log "Upload onnx CTC models to huggingface"
-url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
-GIT_LFS_SKIP_SMUDGE=1 git clone $url
-dst=$(basename $url)
-cp -v $repo/exp/ctc*.onnx $dst
-cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
-cp -v $repo/data/lang_bpe_2000/bpe.model $dst
-mkdir -p $dst/test_wavs
-cp -v $repo/test_wavs/*.wav $dst/test_wavs
-cd $dst
-git lfs track "*.onnx" "bpe.model"
-ls -lh
-file bpe.model
-git status
-git add .
-git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
-
-log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
-rm -rf .git
-rm -fv .gitattributes
-cd ..
-tar cjfv $dst.tar.bz2 $dst
-ls -lh *.tar.bz2
-mv -v $dst.tar.bz2 ../../../
-
-log "----------------------------------------"
-log "Export streaming ONNX transducer models "
-log "----------------------------------------"
-
-./zipformer/export-onnx-streaming.py \
-  --exp-dir $repo/exp \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  --causal 1 \
-  --avg 1 \
-  --epoch 20 \
-  --use-averaged-model 0 \
-  --chunk-size 16 \
-  --left-context-frames 128 \
-  --use-ctc 0
-
-ls -lh $repo/exp
-
-log "------------------------------------------------------------"
-log "Test exported streaming ONNX transducer models (Python code)"
-log "------------------------------------------------------------"
-
-log "test fp32"
-./zipformer/onnx_pretrained-streaming.py \
-  --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx \
-  --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
-  --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  $repo/test_wavs/DEV_T0000000000.wav
-
-log "test int8"
-./zipformer/onnx_pretrained-streaming.py \
-  --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
-  --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
-  --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  $repo/test_wavs/DEV_T0000000000.wav
-
-log "Upload onnx transducer models to huggingface"
-
-url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12
-GIT_LFS_SKIP_SMUDGE=1 git clone $url
-dst=$(basename $url)
-cp -v $repo/exp/encoder*.onnx $dst
-cp -v $repo/exp/decoder*.onnx $dst
-cp -v $repo/exp/joiner*.onnx $dst
-cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
-cp -v $repo/data/lang_bpe_2000/bpe.model $dst
-mkdir -p $dst/test_wavs
-cp -v $repo/test_wavs/*.wav $dst/test_wavs
-cd $dst
-git lfs track "*.onnx" bpe.model
-git add .
-git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
-
-log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
-rm -rf .git
-rm -fv .gitattributes
-cd ..
-tar cjfv $dst.tar.bz2 $dst
-ls -lh *.tar.bz2
-mv -v $dst.tar.bz2 ../../../
--- a/.github/scripts/multi_zh-hans/ASR/run.sh
+++ b/.github/scripts/multi_zh-hans/ASR/run.sh
@ -0,0 +1,756 @@
+#!/usr/bin/env bash
+
+set -ex
+
+git config --global user.name "k2-fsa"
+git config --global user.email "csukuangfj@gmail.com"
+git config --global lfs.allowincompletepush true
+
+python3 -m pip install onnxmltools==1.13.0 onnx==1.17.0 onnxruntime==1.17.1 sherpa-onnx
+
+log() {
+  # Timestamped logger (borrowed from espnet): prefixes the message with the
+  # caller's source file name, the line of the call and the calling function.
+  local caller_file="${BASH_SOURCE[1]##*/}"
+  local caller_line="${BASH_LINENO[0]}"
+  local caller_func="${FUNCNAME[1]}"
+  local now
+  now="$(date '+%Y-%m-%d %H:%M:%S')"
+  echo -e "${now} (${caller_file}:${caller_line}:${caller_func}) $*"
+}
+
+cd egs/multi_zh-hans/ASR
+
+log "pwd: $PWD"
+
+function run_2023_9_2() {
+  # Export the non-streaming zipformer transducer (2023-9-2 checkpoint) to ONNX
+  # and decode the repo's test waves with the fp32, int8 and fp16 models.
+  # Sets the globals $repo_url and $repo; the cloned repo is deleted at the end.
+  repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-2023-9-2
+  log "Downloading pre-trained model from $repo_url"
+  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+  repo=$(basename $repo_url)
+
+  # The clone skipped LFS smudging, so pull only the files that are needed.
+  pushd $repo
+  cd exp
+  git lfs pull --include pretrained.pt
+  # Expose the checkpoint under the name matching "--epoch 99 --avg 1" below
+  # (presumably how export-onnx.py looks it up -- confirm in that script).
+  ln -s pretrained.pt epoch-99.pt
+  cd ../data/lang_bpe_2000
+  ls -lh
+  git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
+  git lfs pull --include "*.model"
+  ls -lh
+  popd
+
+  log "--------------------------------------------"
+  log "Export non-streaming ONNX transducer models "
+  log "--------------------------------------------"
+  ./zipformer/export-onnx.py \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    --use-averaged-model 0 \
+    --epoch 99 \
+    --avg 1 \
+    --exp-dir $repo/exp \
+    --causal False \
+    --fp16 1
+
+  ls -lh $repo/exp
+
+  # Test waves decoded by every exported variant.
+  local wavs=(
+    $repo/test_wavs/DEV_T0000000000.wav
+    $repo/test_wavs/DEV_T0000000001.wav
+    $repo/test_wavs/DEV_T0000000002.wav
+    $repo/test_wavs/TEST_MEETING_T0000000113.wav
+    $repo/test_wavs/TEST_MEETING_T0000000219.wav
+    $repo/test_wavs/TEST_MEETING_T0000000351.wav
+  )
+
+  # File-name suffixes per run, in order: fp32, int8, fp16.
+  # Encoder and joiner always share a suffix; the int8 run keeps the fp32 decoder.
+  local enc_joiner_suffix=("" ".int8" ".fp16")
+  local decoder_suffix=("" "" ".fp16")
+  local i
+  for i in 0 1 2; do
+    ./zipformer/onnx_pretrained.py \
+      --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1${enc_joiner_suffix[$i]}.onnx \
+      --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1${decoder_suffix[$i]}.onnx \
+      --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1${enc_joiner_suffix[$i]}.onnx \
+      --tokens $repo/data/lang_bpe_2000/tokens.txt \
+      "${wavs[@]}"
+  done
+
+  rm -rf $repo
+}
+
+function run_2023_11_05_streaming() {
+  repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05
+  log "Downloading pre-trained model from $repo_url"
+  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+  repo=$(basename $repo_url)
+
+  pushd $repo
+  cd exp/
+  git lfs pull --include pretrained.pt
+  rm -fv epoch-20.pt
+  rm -fv *.onnx
+  ln -s pretrained.pt epoch-20.pt
+  cd ../data/lang_bpe_2000
+  ls -lh
+  git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
+  git lfs pull --include "*.model"
+  ls -lh
+  popd
+
+  log "----------------------------------------"
+  log "Export streaming ONNX CTC models "
+  log "----------------------------------------"
+  ./zipformer/export-onnx-streaming-ctc.py \
+    --exp-dir $repo/exp \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    --causal 1 \
+    --avg 1 \
+    --epoch 20 \
+    --use-averaged-model 0 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --use-ctc 1 \
+    --fp16 1
+
+  ls -lh $repo/exp/
+
+  log "------------------------------------------------------------"
+  log "Test exported streaming ONNX CTC models (greedy search)     "
+  log "------------------------------------------------------------"
+
+  test_wavs=(
+    DEV_T0000000000.wav
+    DEV_T0000000001.wav
+    DEV_T0000000002.wav
+    TEST_MEETING_T0000000113.wav
+    TEST_MEETING_T0000000219.wav
+    TEST_MEETING_T0000000351.wav
+  )
+
+  for w in ${test_wavs[@]}; do
+    log "----fp32----"
+    ./zipformer/onnx_pretrained-streaming-ctc.py \
+      --model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.onnx \
+      --tokens $repo/data/lang_bpe_2000/tokens.txt \
+      $repo/test_wavs/$w
+
+    log "----int8----"
+
+    ./zipformer/onnx_pretrained-streaming-ctc.py \
+      --model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
+      --tokens $repo/data/lang_bpe_2000/tokens.txt \
+      $repo/test_wavs/$w
+
+    log "----fp16----"
+
+    ./zipformer/onnx_pretrained-streaming-ctc.py \
+      --model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
+      --tokens $repo/data/lang_bpe_2000/tokens.txt \
+      $repo/test_wavs/$w
+  done
+
+  log "Upload onnx CTC models to huggingface"
+  name=(
+    sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
+    sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-int8-2023-12-13
+    sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-fp16-2023-12-13
+    )
+  for n in ${name[@]}; do
+      url=https://huggingface.co/k2-fsa/$n
+      GIT_LFS_SKIP_SMUDGE=1 git clone $url
+      dst=$(basename $url)
+      if [[ $n == sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13 ]]; then
+        cp -v $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.onnx $dst
+      elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-int8-2023-12-13 ]]; then
+        cp -v $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx $dst
+      elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-fp16-2023-12-13 ]]; then
+        cp -v $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
+      fi
+
+      cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
+      cp -v $repo/data/lang_bpe_2000/bpe.model $dst
+      mkdir -p $dst/test_wavs
+      cp -v $repo/test_wavs/*.wav $dst/test_wavs
+      cd $dst
+      git lfs track "*.onnx" "bpe.model" "*.wav"
+      ls -lh
+      file bpe.model
+      git status
+      git add .
+      git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
+
+      log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+      rm -rf .git
+      rm -fv .gitattributes
+      cd ..
+      tar cjfv $dst.tar.bz2 $dst
+      ls -lh *.tar.bz2
+      mv -v $dst.tar.bz2 ../../../
+  done
+
+  log "----------------------------------------"
+  log "Export streaming ONNX transducer models "
+  log "----------------------------------------"
+
+  ./zipformer/export-onnx-streaming.py \
+    --exp-dir $repo/exp \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    --causal 1 \
+    --avg 1 \
+    --epoch 20 \
+    --use-averaged-model 0 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --use-ctc 0 \
+    --fp16 1
+
+  ls -lh $repo/exp
+
+  log "------------------------------------------------------------"
+  log "Test exported streaming ONNX transducer models (Python code)"
+  log "------------------------------------------------------------"
+
+  log "test fp32"
+  ./zipformer/onnx_pretrained-streaming.py \
+    --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx \
+    --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
+    --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    $repo/test_wavs/DEV_T0000000000.wav
+
+  log "test int8"
+  ./zipformer/onnx_pretrained-streaming.py \
+    --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
+    --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
+    --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    $repo/test_wavs/DEV_T0000000000.wav
+
+  log "test fp16"
+  ./zipformer/onnx_pretrained-streaming.py \
+    --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
+    --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
+    --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.fp16.onnx \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    $repo/test_wavs/DEV_T0000000000.wav
+
+  name=(
+    sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-13
+    sherpa-onnx-streaming-zipformer-multi-zh-hans-int8-2023-12-13
+    sherpa-onnx-streaming-zipformer-multi-zh-hans-fp16-2023-12-13
+  )
+
+  for n in ${name[@]}; do
+      url=https://huggingface.co/csukuangfj/$n
+      GIT_LFS_SKIP_SMUDGE=1 git clone $url
+      dst=$(basename $url)
+      if [[ $n == sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-13 ]]; then
+        cp -v $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx $dst
+        cp -v $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx $dst
+        cp -v $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx $dst
+      elif [[ $n == sherpa-onnx-streaming-zipformer-multi-zh-hans-int8-2023-12-13 ]]; then
+        cp -v $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx $dst
+        cp -v $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx $dst
+        cp -v $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx $dst
+      elif [[ $n == sherpa-onnx-streaming-zipformer-multi-zh-hans-fp16-2023-12-13 ]]; then
+        cp -v $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
+        cp -v $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
+        cp -v $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.fp16.onnx $dst
+      fi
+
+      cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
+      cp -v $repo/data/lang_bpe_2000/bpe.model $dst
+      mkdir -p $dst/test_wavs
+      cp -v $repo/test_wavs/*.wav $dst/test_wavs
+      cd $dst
+      git lfs track "*.onnx" "bpe.model" "*.wav"
+      ls -lh
+      file bpe.model
+      git status
+      git add .
+      git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true
+
+      log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+      rm -rf .git
+      rm -fv .gitattributes
+      cd ..
+      tar cjfv $dst.tar.bz2 $dst
+      ls -lh *.tar.bz2
+      mv -v $dst.tar.bz2 ../../../
+  done
+}
+
+function run_2023_12_12_streaming() {
+  # Upload the streaming ONNX transducer models to the huggingface repo
+  # k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12 and pack
+  # the same files into a .tar.bz2 for sherpa-onnx.
+  #
+  # This function downloads and exports nothing itself: it copies
+  # encoder/decoder/joiner *.onnx, tokens.txt, bpe.model and the test waves
+  # out of $repo, a global that an earlier step must have set
+  # (run_2023_11_05_streaming leaves its exported models in $repo/exp).
+  # HF_TOKEN must be set in the environment for the push.
+  #
+  # Fail fast with a clear message instead of letting "cp /exp/encoder*.onnx"
+  # fail after the huggingface repo has already been cloned.
+  if [[ -z "${repo:-}" || ! -d "$repo/exp" ]]; then
+    log "\$repo/exp does not exist (repo='${repo:-}'); run run_2023_11_05_streaming first"
+    return 1
+  fi
+
+  log "Upload onnx transducer models to huggingface"
+
+  url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12
+  GIT_LFS_SKIP_SMUDGE=1 git clone $url
+  dst=$(basename $url)
+  cp -v $repo/exp/encoder*.onnx $dst
+  cp -v $repo/exp/decoder*.onnx $dst
+  cp -v $repo/exp/joiner*.onnx $dst
+  cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
+  cp -v $repo/data/lang_bpe_2000/bpe.model $dst
+  mkdir -p $dst/test_wavs
+  cp -v $repo/test_wavs/*.wav $dst/test_wavs
+  cd $dst
+  git lfs track "*.onnx" bpe.model "*.wav"
+  git add .
+  # Best effort: "|| true" keeps a failed commit (e.g. nothing changed) or a
+  # failed push from aborting the script, which runs under "set -e".
+  git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
+
+  log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+  # Drop the git metadata so the tarball contains only the model files.
+  rm -rf .git
+  rm -fv .gitattributes
+  cd ..
+  tar cjfv $dst.tar.bz2 $dst
+  ls -lh *.tar.bz2
+  # Move the archive three directories up, out of egs/multi_zh-hans/ASR.
+  mv -v $dst.tar.bz2 ../../../
+}
+
+function run_yuekai_large() {
+  repo_url=https://csukuangfj:${HF_TOKEN}@huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-large
+  log "Downloading pre-trained model from $repo_url"
+  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+  repo=$(basename $repo_url)
+  pushd $repo
+  git lfs pull --include pretrained.pt
+  mv pretrained.pt epoch-99.pt
+  curl -SL -O https://huggingface.co/pingzxy/icefall-asr-multi-zh-hans-zipformer-large-onnx/resolve/main/tokens.txt
+  popd
+
+  log "----------------------------------------"
+  log "Export streaming ONNX CTC models "
+  log "----------------------------------------"
+  ./zipformer/export-onnx-streaming-ctc.py \
+    --exp-dir $repo/ \
+    --tokens $repo/tokens.txt \
+    --causal 1 \
+    --avg 1 \
+    --epoch 99 \
+    --use-averaged-model 0 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --use-ctc 1 \
+    \
+    --num-encoder-layers 2,2,4,5,4,2 \
+    --feedforward-dim 768,1024,1536,2048,1536,768 \
+    --encoder-dim 256,384,512,768,512,256 \
+    --encoder-unmasked-dim 192,192,256,320,256,192 \
+    \
+    --fp16 1 \
+    --use-whisper-features 1
+
+
+  ls -lh $repo/
+  pushd $repo
+
+cat >README.md <<EOF
+# Introduction
+
+This model is converted
+from
+https://huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-large
+
+The training code can be found at
+https://github.com/k2-fsa/icefall/blob/master/egs/multi_zh-hans/ASR/RESULTS.md#multi-chinese-datasets-char-based-training-results-streaming-on-zipformer-large-model
+EOF
+
+  mv -v ctc-epoch-99-avg-1-chunk-16-left-128.fp16.onnx model.fp16.onnx
+  mv -v ctc-epoch-99-avg-1-chunk-16-left-128.int8.onnx model.int8.onnx
+  mv -v ctc-epoch-99-avg-1-chunk-16-left-128.onnx model.onnx
+
+  ls -lh *.onnx
+
+  mkdir test_wavs
+  cd test_wavs
+  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/0.wav
+  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/1.wav
+  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/8k.wav
+  popd
+
+  for w in 0.wav 1.wav 8k.wav; do
+    log "---fp32---"
+    sherpa-onnx \
+      --zipformer2-ctc-model=$repo/model.onnx \
+      --tokens=$repo/tokens.txt \
+      $repo/test_wavs/$w
+
+    log "---int8---"
+
+    sherpa-onnx \
+      --zipformer2-ctc-model=$repo/model.int8.onnx \
+      --tokens=$repo/tokens.txt \
+      $repo/test_wavs/$w
+
+    log "---fp16---"
+
+    sherpa-onnx \
+      --zipformer2-ctc-model=$repo/model.fp16.onnx \
+      --tokens=$repo/tokens.txt \
+      $repo/test_wavs/$w
+  done
+
+  name=(
+    sherpa-onnx-streaming-zipformer-ctc-zh-2025-06-30
+    sherpa-onnx-streaming-zipformer-ctc-zh-int8-2025-06-30
+    sherpa-onnx-streaming-zipformer-ctc-zh-fp16-2025-06-30
+  )
+  for n in ${name[@]}; do
+      url=https://huggingface.co/csukuangfj/$n
+      GIT_LFS_SKIP_SMUDGE=1 git clone $url
+      dst=$(basename $url)
+      if [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-2025-06-30 ]]; then
+        cp -v $repo/model.onnx $dst
+      elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-int8-2025-06-30 ]]; then
+        cp -v $repo/model.int8.onnx $dst
+      elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-fp16-2025-06-30 ]]; then
+        cp -v $repo/model.fp16.onnx $dst
+      fi
+
+      cp -v $repo/tokens.txt $dst
+      cp -v $repo/README.md $dst
+      mkdir -p $dst/test_wavs
+      cp -v $repo/test_wavs/*.wav $dst/test_wavs
+      cd $dst
+      git lfs track "*.onnx" "*.wav"
+      ls -lh
+      git status
+      git add .
+      git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true
+
+      log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+      rm -rf .git
+      rm -fv .gitattributes
+      cd ..
+      tar cjfv $dst.tar.bz2 $dst
+      ls -lh *.tar.bz2
+      mv -v $dst.tar.bz2 ../../../
+  done
+
+  rm $repo/*.onnx
+
+  log "----------------------------------------"
+  log "Export streaming ONNX transducer models "
+  log "----------------------------------------"
+
+  ./zipformer/export-onnx-streaming.py \
+    --exp-dir $repo \
+    --tokens $repo/tokens.txt \
+    --causal 1 \
+    --avg 1 \
+    --epoch 99 \
+    --use-averaged-model 0 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --use-ctc 0 \
+    \
+    --num-encoder-layers 2,2,4,5,4,2 \
+    --feedforward-dim 768,1024,1536,2048,1536,768 \
+    --encoder-dim 256,384,512,768,512,256 \
+    --encoder-unmasked-dim 192,192,256,320,256,192 \
+    \
+    --fp16 1 \
+    --use-whisper-features 1
+
+  ls -lh $repo
+  pushd $repo
+  for m in encoder decoder joiner; do
+    mv -v $m-epoch-99-avg-1-chunk-16-left-128.onnx $m.onnx
+    mv -v $m-epoch-99-avg-1-chunk-16-left-128.fp16.onnx $m.fp16.onnx
+    mv -v $m-epoch-99-avg-1-chunk-16-left-128.int8.onnx $m.int8.onnx
+  done
+  ls -lh *.onnx
+  popd
+
+  for w in 0.wav 1.wav 8k.wav; do
+    log "---fp32---"
+      sherpa-onnx \
+        --encoder=$repo/encoder.onnx \
+        --decoder=$repo/decoder.onnx \
+        --joiner=$repo/joiner.onnx \
+        --tokens=$repo/tokens.txt \
+        $repo/test_wavs/$w
+
+    log "---int8---"
+
+      sherpa-onnx \
+        --encoder=$repo/encoder.int8.onnx \
+        --decoder=$repo/decoder.onnx \
+        --joiner=$repo/joiner.int8.onnx \
+        --tokens=$repo/tokens.txt \
+        $repo/test_wavs/$w
+
+    log "---fp16---"
+
+      sherpa-onnx \
+        --encoder=$repo/encoder.fp16.onnx \
+        --decoder=$repo/decoder.fp16.onnx \
+        --joiner=$repo/joiner.fp16.onnx \
+        --tokens=$repo/tokens.txt \
+        $repo/test_wavs/$w
+  done
+
+  name=(
+    sherpa-onnx-streaming-zipformer-zh-2025-06-30
+    sherpa-onnx-streaming-zipformer-zh-int8-2025-06-30
+    sherpa-onnx-streaming-zipformer-zh-fp16-2025-06-30
+  )
+  for n in ${name[@]}; do
+      url=https://huggingface.co/csukuangfj/$n
+      GIT_LFS_SKIP_SMUDGE=1 git clone $url
+      dst=$(basename $url)
+      if [[ $n == sherpa-onnx-streaming-zipformer-zh-2025-06-30 ]]; then
+        cp -v $repo/encoder.onnx $dst
+        cp -v $repo/decoder.onnx $dst
+        cp -v $repo/joiner.onnx $dst
+      elif [[ $n == sherpa-onnx-streaming-zipformer-zh-int8-2025-06-30 ]]; then
+        cp -v $repo/encoder.int8.onnx $dst
+        cp -v $repo/decoder.onnx $dst
+        cp -v $repo/joiner.int8.onnx $dst
+      elif [[ $n == sherpa-onnx-streaming-zipformer-zh-fp16-2025-06-30 ]]; then
+        cp -v $repo/encoder.fp16.onnx $dst
+        cp -v $repo/decoder.fp16.onnx $dst
+        cp -v $repo/joiner.fp16.onnx $dst
+      fi
+
+      cp -v $repo/tokens.txt $dst
+      cp -v $repo/README.md $dst
+      mkdir -p $dst/test_wavs
+      cp -v $repo/test_wavs/*.wav $dst/test_wavs
+      cd $dst
+      git lfs track "*.onnx" "*.wav"
+      ls -lh
+      git status
+      git add .
+      git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true
+
+      log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+      rm -rf .git
+      rm -fv .gitattributes
+      cd ..
+      tar cjfv $dst.tar.bz2 $dst
+      ls -lh *.tar.bz2
+      mv -v $dst.tar.bz2 ../../../
+  done
+}
+
+function run_yuekai_xl() {
+  repo_url=https://csukuangfj:${HF_TOKEN}@huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-xl
+  log "Downloading pre-trained model from $repo_url"
+  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+  repo=$(basename $repo_url)
+
+  pushd $repo
+  git lfs pull --include pretrained.pt
+  git lfs pull --include data/lang_bpe_2000/bpe.model
+  mv pretrained.pt epoch-99.pt
+  ls -lh *.pt
+  popd
+
+  log "----------------------------------------"
+  log "Export streaming ONNX CTC models "
+  log "----------------------------------------"
+  ./zipformer/export-onnx-streaming-ctc.py \
+    --exp-dir $repo/ \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    --causal 1 \
+    --avg 1 \
+    --epoch 99 \
+    --use-averaged-model 0 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --use-ctc 1 \
+    \
+    --num-encoder-layers 2,3,5,6,5,3 \
+    --feedforward-dim 1536,2048,3072,4096,3072,1536 \
+    --encoder-dim 512,768,1024,1536,1024,512 \
+    --encoder-unmasked-dim 192,192,256,320,256,192 \
+    --decoder-dim 768 --joiner-dim 768 \
+    --value-head-dim 18 \
+    --query-head-dim 48 \
+    --num-heads 4,4,4,8,4,4 \
+    \
+    --fp16 1 \
+    --use-whisper-features 1 \
+    --use-external-data 1
+
+  mv -v ctc-epoch-99-avg-1-chunk-16-left-128.int8.onnx model.int8.onnx
+  mv -v ctc-epoch-99-avg-1-chunk-16-left-128.fp16.onnx model.fp16.onnx
+
+  ls -lh *.onnx
+
+  mkdir test_wavs
+  pushd test_wavs
+  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/0.wav
+  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/1.wav
+  curl -SL -O https://huggingface.co/csukuangfj/sherpa-onnx-streaming-zipformer-small-ctc-zh-int8-2025-04-01/resolve/main/test_wavs/8k.wav
+  popd
+
+  for w in 0.wav 1.wav 8k.wav; do
+    log "---int8---"
+
+    sherpa-onnx \
+      --zipformer2-ctc-model=./model.int8.onnx \
+      --tokens=$repo/data/lang_bpe_2000/tokens.txt \
+      test_wavs/$w
+
+    log "---fp16---"
+
+    sherpa-onnx \
+      --zipformer2-ctc-model=./model.fp16.onnx \
+      --tokens=$repo/data/lang_bpe_2000/tokens.txt \
+      test_wavs/$w
+  done
+
+  pushd $repo
+cat >README.md <<EOF
+# Introduction
+
+This model is converted
+from
+https://huggingface.co/yuekai/icefall-asr-multi-zh-hans-zipformer-xl
+
+The training code can be found at
+https://github.com/k2-fsa/icefall/blob/master/egs/multi_zh-hans/ASR/RESULTS.md#multi-chinese-datasets-char-based-training-results-streaming-on-zipformer-xl-model
+EOF
+  popd
+
+  name=(
+    sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-int8-2025-06-30
+    sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-fp16-2025-06-30
+  )
+
+  for n in ${name[@]}; do
+      url=https://huggingface.co/csukuangfj/$n
+      GIT_LFS_SKIP_SMUDGE=1 git clone $url
+      dst=$(basename $url)
+      if [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-fp16-2025-06-30 ]]; then
+        cp -v model.fp16.onnx $dst
+      elif [[ $n == sherpa-onnx-streaming-zipformer-ctc-zh-xlarge-int8-2025-06-30 ]]; then
+        cp -v model.int8.onnx $dst
+      fi
+
+      cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
+      cp -v $repo/data/lang_bpe_2000/bpe.model $dst
+      cp -v $repo/README.md $dst
+      mkdir -p $dst/test_wavs
+      cp -v ./test_wavs/*.wav $dst/test_wavs
+      cd $dst
+      git lfs track "*.onnx" "*.wav" "bpe.model"
+      ls -lh
+      git status
+      git add .
+      git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true
+
+      log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+      rm -rf .git
+      rm -fv .gitattributes
+      cd ..
+
+      ls -lh $dst
+      tar cjfv $dst.tar.bz2 $dst
+      ls -lh *.tar.bz2
+      mv -v $dst.tar.bz2 ../../../
+  done
+
+  rm -fv *.onnx *.weights
+
+  log "----------------------------------------"
+  log "Export streaming ONNX transducer models "
+  log "----------------------------------------"
+
+  ./zipformer/export-onnx-streaming.py \
+    --exp-dir $repo/ \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    --causal 1 \
+    --avg 1 \
+    --epoch 99 \
+    --use-averaged-model 0 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --use-ctc 0 \
+    \
+    --num-encoder-layers 2,3,5,6,5,3 \
+    --feedforward-dim 1536,2048,3072,4096,3072,1536 \
+    --encoder-dim 512,768,1024,1536,1024,512 \
+    --encoder-unmasked-dim 192,192,256,320,256,192 \
+    --decoder-dim 768 --joiner-dim 768 \
+    --value-head-dim 18 \
+    --query-head-dim 48 \
+    --num-heads 4,4,4,8,4,4 \
+    \
+    --fp16 1 \
+    --use-whisper-features 1 \
+    --use-external-data 1
+
+    ls -lh *.onnx
+    ls -lh *.weights
+
+    mv encoder-epoch-99-avg-1-chunk-16-left-128.fp16.onnx encoder.fp16.onnx
+    mv encoder-epoch-99-avg-1-chunk-16-left-128.int8.onnx encoder.int8.onnx
+
+    mv $repo/decoder-epoch-99-avg-1-chunk-16-left-128.onnx decoder.onnx
+    mv $repo/decoder-epoch-99-avg-1-chunk-16-left-128.fp16.onnx decoder.fp16.onnx
+
+    mv $repo/joiner-epoch-99-avg-1-chunk-16-left-128.int8.onnx joiner.int8.onnx
+    mv $repo/joiner-epoch-99-avg-1-chunk-16-left-128.fp16.onnx joiner.fp16.onnx
+
+  name=(
+    sherpa-onnx-streaming-zipformer-zh-xlarge-int8-2025-06-30
+    sherpa-onnx-streaming-zipformer-zh-xlarge-fp16-2025-06-30
+  )
+
+  for n in ${name[@]}; do
+      url=https://huggingface.co/csukuangfj/$n
+      GIT_LFS_SKIP_SMUDGE=1 git clone $url
+      dst=$(basename $url)
+      if [[ $n == sherpa-onnx-streaming-zipformer-zh-xlarge-fp16-2025-06-30 ]]; then
+        cp -v encoder.fp16.onnx $dst
+        cp -v decoder.fp16.onnx $dst
+        cp -v joiner.fp16.onnx $dst
+      elif [[ $n == sherpa-onnx-streaming-zipformer-zh-xlarge-int8-2025-06-30 ]]; then
+        cp -v encoder.int8.onnx $dst
+        cp -v decoder.onnx $dst
+        cp -v joiner.int8.onnx $dst
+      fi
+
+      cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
+      cp -v $repo/data/lang_bpe_2000/bpe.model $dst
+      cp -v $repo/README.md $dst
+      mkdir -p $dst/test_wavs
+      cp -v ./test_wavs/*.wav $dst/test_wavs
+      cd $dst
+      git lfs track "*.onnx" "*.wav" "bpe.model"
+      ls -lh
+      git status
+      git add .
+      git commit -m "upload model" && git push https://csukuangfj:${HF_TOKEN}@huggingface.co/csukuangfj/$dst main || true
+
+      log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+      rm -rf .git
+      rm -fv .gitattributes
+      cd ..
+
+      ls -lh $dst
+      tar cjfv $dst.tar.bz2 $dst
+      ls -lh *.tar.bz2
+      mv -v $dst.tar.bz2 ../../../
+  done
+
+  rm -fv *.onnx *.weights
+}
+
+# run_yuekai_large
+# run_yuekai_xl
+# run_2023_9_2
+run_2023_11_05_streaming
+# run_2023_12_12_streaming
--- a/.github/workflows/aishell.yml
+++ b/.github/workflows/aishell.yml
@ -17,7 +17,7 @@ concurrency:

 jobs:
  generate_build_matrix:
-    if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event.label.name == 'ready' || github.event_name == 'push' || github.event_name == 'aishell')
+    if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'

    # see https://github.com/pytorch/pytorch/pull/50633
    runs-on: ubuntu-latest
@ -31,8 +31,8 @@ jobs:
        id: set-matrix
        run: |
          # outputting for debugging purposes
-          python ./.github/scripts/docker/generate_build_matrix.py
-          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+          python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
          echo "::set-output name=matrix::${MATRIX}"
  aishell:
    needs: generate_build_matrix
--- a/.github/workflows/audioset.yml
+++ b/.github/workflows/audioset.yml
@ -30,8 +30,8 @@ jobs:
        id: set-matrix
        run: |
          # outputting for debugging purposes
-          python ./.github/scripts/docker/generate_build_matrix.py
-          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+          python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
          echo "::set-output name=matrix::${MATRIX}"

  audioset:
@ -83,7 +83,7 @@ jobs:
          ls -lh ./model-onnx/*

      - name: Upload model to huggingface
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
@ -116,7 +116,7 @@ jobs:
            rm -rf huggingface

      - name: Prepare for release
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
        shell: bash
        run: |
          d=sherpa-onnx-zipformer-audio-tagging-2024-04-09
@ -125,7 +125,7 @@ jobs:
          ls -lh

      - name: Release exported onnx models
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
--- a/.github/workflows/baker_zh.yml
+++ b/.github/workflows/baker_zh.yml
@ -31,8 +31,8 @@ jobs:
        id: set-matrix
        run: |
          # outputting for debugging purposes
-          python ./.github/scripts/docker/generate_build_matrix.py --min-torch-version "2.3"
-          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --min-torch-version "2.3")
+          python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
          echo "::set-output name=matrix::${MATRIX}"

  baker_zh:
@ -84,43 +84,43 @@ jobs:
          ls -lh

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
          path: ./*.wav

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: step-2
          path: ./model-steps-2.onnx

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: step-3
          path: ./model-steps-3.onnx

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: step-4
          path: ./model-steps-4.onnx

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: step-5
          path: ./model-steps-5.onnx

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: step-6
          path: ./model-steps-6.onnx

      - name: Upload models to huggingface
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
        shell: bash
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
@ -141,7 +141,7 @@ jobs:
          popd

      - name: Release exported onnx models
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
--- a/.github/workflows/librispeech.yml
+++ b/.github/workflows/librispeech.yml
@ -29,8 +29,9 @@ jobs:
        id: set-matrix
        run: |
          # outputting for debugging purposes
-          python ./.github/scripts/docker/generate_build_matrix.py
-          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+          python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+          # MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10" --min-torch-version "2.6.0")
          echo "::set-output name=matrix::${MATRIX}"
  librispeech:
    needs: generate_build_matrix
--- a/.github/workflows/ljspeech.yml
+++ b/.github/workflows/ljspeech.yml
@ -30,8 +30,8 @@ jobs:
        id: set-matrix
        run: |
          # outputting for debugging purposes
-          python ./.github/scripts/docker/generate_build_matrix.py --min-torch-version "2.3"
-          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --min-torch-version "2.3")
+          python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
          echo "::set-output name=matrix::${MATRIX}"

  ljspeech:
@ -83,13 +83,13 @@ jobs:
          ls -lh

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
          path: ./*.wav

      - name: Release exported onnx models
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0' && github.event_name == 'push'
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
@ -100,37 +100,37 @@ jobs:
          tag: tts-models

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: step-2
          path: ./model-steps-2.onnx

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: step-3
          path: ./model-steps-3.onnx

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: step-4
          path: ./model-steps-4.onnx

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: step-5
          path: ./model-steps-5.onnx

      - uses: actions/upload-artifact@v4
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        with:
          name: step-6
          path: ./model-steps-6.onnx

      - name: Upload models to huggingface
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        shell: bash
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
@ -155,7 +155,7 @@ jobs:
          popd

      - name: Release exported onnx models
-        if: matrix.python-version == '3.9' && matrix.torch-version == '2.3.0'
+        if: matrix.python-version == '3.10' && matrix.torch-version == '2.3.0'
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
--- a/.github/workflows/multi-zh-hans.yml
+++ b/.github/workflows/multi-zh-hans.yml
@ -1,4 +1,4 @@
-name: run-multi-zh-hans
+name: multi-zh-hans

 on:
  push:
@ -8,65 +8,72 @@ on:
  workflow_dispatch:

 concurrency:
-  group: run-multi-zh-hans-${{ github.ref }}
+  group: multi-zh-hans-${{ github.ref }}
  cancel-in-progress: true

 permissions:
  contents: write

 jobs:
+  generate_build_matrix:
+    if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # outputting for debugging purposes
+          python ./.github/scripts/docker/generate_build_matrix.py --torch-version "2.7.0" --python-version "3.11"
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --torch-version "2.7.0" --python-version "3.11")
+          echo "::set-output name=matrix::${MATRIX}"
  multi-zh-hans:
-    runs-on: ${{ matrix.os }}
+    needs: generate_build_matrix
+    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
+    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
-        os: [ubuntu-latest]
-        python-version: [3.8]
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-          cache: 'pip'
-          cache-dependency-path: '**/requirements-ci.txt'
-
-      - name: Install Python dependencies
-        run: |
-          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
-          pip uninstall -y protobuf
-          pip install --no-binary protobuf protobuf==3.20.*
-
-      - name: Cache kaldifeat
-        id: my-cache
-        uses: actions/cache@v2
-        with:
-          path: |
-            ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}-2023-05-22
-
-      - name: Install kaldifeat
-        if: steps.my-cache.outputs.cache-hit != 'true'
+      - name: Free space
        shell: bash
        run: |
-          .github/scripts/install-kaldifeat.sh
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+          echo "pwd: $PWD"
+          echo "github.workspace ${{ github.workspace }}"

-      - name: export-model
+      - name: Test with multi_zh-hans
+        uses: addnab/docker-run-action@v3
+        with:
+            image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
+            options: |
+              --volume ${{ github.workspace }}/:/icefall
            shell: bash
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
            run: |
-          sudo apt-get -qq install git-lfs tree
-          export PYTHONPATH=$PWD:$PYTHONPATH
-          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
-          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
+              export PYTHONPATH=/icefall:$PYTHONPATH
+              export HF_TOKEN=${{ secrets.HF_TOKEN }}
+              cd /icefall
+              git config --global --add safe.directory /icefall

-          .github/scripts/multi-zh-hans.sh
-          ls -lh
+              .github/scripts/multi_zh-hans/ASR/run.sh
+
+      - name: Show models
+        shell: bash
+        run: |
+          ls -lh *.tar.bz2

      - name: upload model to https://github.com/k2-fsa/sherpa-onnx
        uses: svenstaro/upload-release-action@v2
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -30,8 +30,8 @@ jobs:
        id: set-matrix
        run: |
          # outputting for debugging purposes
-          python ./.github/scripts/docker/generate_build_matrix.py
-          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+          python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
          echo "::set-output name=matrix::${MATRIX}"
  test:
    needs: generate_build_matrix
--- a/.github/workflows/yesno.yml
+++ b/.github/workflows/yesno.yml
@ -30,8 +30,9 @@ jobs:
        id: set-matrix
        run: |
          # outputting for debugging purposes
-          python ./.github/scripts/docker/generate_build_matrix.py
-          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+          python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10"
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10")
+          # MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --python-version "3.10" --min-torch-version "2.5.0")
          echo "::set-output name=matrix::${MATRIX}"
  yesno:
    needs: generate_build_matrix
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/train.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/train.py
@ -79,7 +79,13 @@ from icefall.checkpoint import save_checkpoint_with_global_batch_idx
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -638,7 +644,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -912,7 +918,7 @@ def scan_pessimistic_batches_for_oom(
            # warmup = 0.0 is so that the derivs for the pruned loss stay zero
            # (i.e. are not remembered by the decaying-average in adam), because
            # we want to avoid these params being subject to shrinkage in adam.
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/aishell/ASR/pruned_transducer_stateless2/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless2/train.py
@ -72,7 +72,13 @@ from icefall.checkpoint import save_checkpoint_with_global_batch_idx
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -688,7 +694,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -989,7 +995,7 @@ def scan_pessimistic_batches_for_oom(
            # warmup = 0.0 is so that the derivs for the pruned loss stay zero
            # (i.e. are not remembered by the decaying-average in adam), because
            # we want to avoid these params being subject to shrinkage in adam.
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/aishell/ASR/pruned_transducer_stateless3/model.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless3/model.py
@ -23,7 +23,7 @@ import torch.nn as nn
 from encoder_interface import EncoderInterface
 from scaling import ScaledLinear

-from icefall.utils import add_sos
+from icefall.utils import add_sos, torch_autocast


 class Transducer(nn.Module):
@ -184,7 +184,7 @@ class Transducer(nn.Module):
        lm = simple_lm_proj(decoder_out)
        am = simple_am_proj(encoder_out)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -219,7 +219,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = joiner(am_pruned, lm_pruned, project_input=False)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
--- a/egs/aishell/ASR/pruned_transducer_stateless3/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless3/train.py
@ -94,7 +94,13 @@ from icefall.checkpoint import (
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -797,7 +803,7 @@ def train_one_epoch(
        aishell = is_aishell(batch["supervisions"]["cut"][0])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1202,7 +1208,7 @@ def scan_pessimistic_batches_for_oom(
            # warmup = 0.0 is so that the derivs for the pruned loss stay zero
            # (i.e. are not remembered by the decaying-average in adam), because
            # we want to avoid these params being subject to shrinkage in adam.
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/aishell/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7/train.py
@ -94,6 +94,7 @@ from icefall.utils import (
    filter_uneven_sized_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -809,7 +810,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1206,7 +1207,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_bbpe/train.py
@ -87,6 +87,7 @@ from icefall.utils import (
    setup_logger,
    str2bool,
    tokenize_by_CJK_char,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -802,7 +803,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1202,7 +1203,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/aishell/ASR/pruned_transducer_stateless7_streaming/train.py
+++ b/egs/aishell/ASR/pruned_transducer_stateless7_streaming/train.py
@ -81,7 +81,13 @@ from icefall.env import get_env_info
 from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -812,7 +818,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1202,7 +1208,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/aishell/ASR/whisper/train.py
+++ b/egs/aishell/ASR/whisper/train.py
@ -81,6 +81,7 @@ from icefall.utils import (
    filter_uneven_sized_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -514,7 +515,7 @@ def compute_validation_loss(
    tot_loss = MetricsTracker()

    for batch_idx, batch in enumerate(valid_dl):
-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                tokenizer=tokenizer,
@ -608,7 +609,7 @@ def train_one_epoch(
                )

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    tokenizer=tokenizer,
--- a/egs/aishell/ASR/zipformer/train.py
+++ b/egs/aishell/ASR/zipformer/train.py
@ -96,6 +96,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -1014,7 +1015,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1419,7 +1420,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/aishell/ASR/zipformer/train_bbpe.py
+++ b/egs/aishell/ASR/zipformer/train_bbpe.py
@ -92,6 +92,7 @@ from icefall.utils import (
    setup_logger,
    str2bool,
    tokenize_by_CJK_char,
+    torch_autocast,
 )


@ -495,7 +496,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -895,7 +896,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/aishell2/ASR/pruned_transducer_stateless5/train.py
+++ b/egs/aishell2/ASR/pruned_transducer_stateless5/train.py
@ -90,7 +90,13 @@ from icefall.checkpoint import (
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -734,7 +740,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1062,7 +1068,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/aishell4/ASR/pruned_transducer_stateless5/train.py
+++ b/egs/aishell4/ASR/pruned_transducer_stateless5/train.py
@ -83,7 +83,13 @@ from icefall.checkpoint import (
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -727,7 +733,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])
        # print(batch["supervisions"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1034,7 +1040,7 @@ def scan_pessimistic_batches_for_oom(
            # warmup = 0.0 is so that the derivs for the pruned loss stay zero
            # (i.e. are not remembered by the decaying-average in adam), because
            # we want to avoid these params being subject to shrinkage in adam.
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/alimeeting/ASR/pruned_transducer_stateless2/train.py
+++ b/egs/alimeeting/ASR/pruned_transducer_stateless2/train.py
@ -79,7 +79,13 @@ from icefall.checkpoint import save_checkpoint_with_global_batch_idx
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -638,7 +644,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -912,7 +918,7 @@ def scan_pessimistic_batches_for_oom(
            # warmup = 0.0 is so that the derivs for the pruned loss stay zero
            # (i.e. are not remembered by the decaying-average in adam), because
            # we want to avoid these params being subject to shrinkage in adam.
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/train.py
+++ b/egs/alimeeting/ASR_v2/pruned_transducer_stateless7/train.py
@ -73,7 +73,13 @@ from icefall.env import get_env_info
 from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
 from icefall.lexicon import Lexicon
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -782,7 +788,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1127,7 +1133,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/ami/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/ami/ASR/pruned_transducer_stateless7/train.py
@ -71,7 +71,13 @@ from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -773,7 +779,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1134,7 +1140,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/ami/SURT/dprnn_zipformer/train.py
+++ b/egs/ami/SURT/dprnn_zipformer/train.py
@ -76,7 +76,13 @@ from icefall.checkpoint import (
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.err import raise_grad_scale_is_too_small_error
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -1067,7 +1073,7 @@ def train_one_epoch(
        batch_size = batch["inputs"].shape[0]

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
--- a/egs/ami/SURT/dprnn_zipformer/train_adapt.py
+++ b/egs/ami/SURT/dprnn_zipformer/train_adapt.py
@ -76,7 +76,13 @@ from icefall.checkpoint import (
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.err import raise_grad_scale_is_too_small_error
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -1058,7 +1064,7 @@ def train_one_epoch(
        batch_size = batch["inputs"].shape[0]

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
--- a/egs/audioset/AT/zipformer/train.py
+++ b/egs/audioset/AT/zipformer/train.py
@ -74,6 +74,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -799,7 +800,7 @@ def train_one_epoch(
        num_samples += batch_size

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1148,7 +1149,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/baker_zh/TTS/local/compute_fbank_baker_zh.py
+++ b/egs/baker_zh/TTS/local/compute_fbank_baker_zh.py
@ -73,6 +73,8 @@ def compute_fbank_baker_zh(num_jobs: int):
        f_min=0,
        f_max=8000,
    )
+    if not torch.cuda.is_available():
+        config.device = "cpu"

    prefix = "baker_zh"
    suffix = "jsonl.gz"
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7/train.py
@ -88,6 +88,7 @@ from icefall.utils import (
    filter_uneven_sized_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -825,7 +826,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1220,7 +1221,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/finetune.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/finetune.py
@ -90,6 +90,7 @@ from icefall.utils import (
    filter_uneven_sized_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -895,7 +896,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1293,7 +1294,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/train.py
+++ b/egs/commonvoice/ASR/pruned_transducer_stateless7_streaming/train.py
@ -81,7 +81,13 @@ from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -840,7 +846,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1237,7 +1243,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/commonvoice/ASR/zipformer/train.py
+++ b/egs/commonvoice/ASR/zipformer/train.py
@ -97,6 +97,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -969,7 +970,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1365,7 +1366,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/commonvoice/ASR/zipformer/train_char.py
+++ b/egs/commonvoice/ASR/zipformer/train_char.py
@ -97,6 +97,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -604,7 +605,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -784,7 +785,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/csj/ASR/pruned_transducer_stateless7_streaming/train.py
+++ b/egs/csj/ASR/pruned_transducer_stateless7_streaming/train.py
@ -83,7 +83,13 @@ from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
 LOG_EPS = math.log(1e-10)
@ -838,7 +844,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1245,7 +1251,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/train.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/train.py
@ -77,7 +77,13 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -675,7 +681,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -944,7 +950,7 @@ def scan_pessimistic_batches_for_oom(
            # warmup = 0.0 is so that the derivs for the pruned loss stay zero
            # (i.e. are not remembered by the decaying-average in adam), because
            # we want to avoid these params being subject to shrinkage in adam.
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/gigaspeech/ASR/zipformer/train.py
+++ b/egs/gigaspeech/ASR/zipformer/train.py
@ -97,6 +97,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -958,7 +959,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1317,7 +1318,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/gigaspeech/KWS/zipformer/train.py
+++ b/egs/gigaspeech/KWS/zipformer/train.py
@ -97,6 +97,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -961,7 +962,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1320,7 +1321,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/train.py
+++ b/egs/ksponspeech/ASR/pruned_transducer_stateless7_streaming/train.py
@ -77,7 +77,13 @@ from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.err import raise_grad_scale_is_too_small_error
 from icefall.hooks import register_inf_check_hooks
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -805,7 +811,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1196,7 +1202,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/ksponspeech/ASR/zipformer/train.py
+++ b/egs/ksponspeech/ASR/zipformer/train.py
@ -92,6 +92,7 @@ from icefall.utils import (
    get_parameter_groups_with_lrs,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -942,7 +943,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1333,7 +1334,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/conformer_ctc/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc/decode.py
@ -667,7 +667,9 @@ def main():
        H = None
        bpe_model = None
        HLG = k2.Fsa.from_dict(
-            torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
+            torch.load(
+                f"{params.lang_dir}/HLG.pt", map_location=device, weights_only=False
+            )
        )
        assert HLG.requires_grad is False

@ -707,7 +709,9 @@ def main():
                torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
        else:
            logging.info("Loading pre-compiled G_4_gram.pt")
-            d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device)
+            d = torch.load(
+                params.lm_dir / "G_4_gram.pt", map_location=device, weights_only=False
+            )
            G = k2.Fsa.from_dict(d)

        if params.method in [
--- a/egs/librispeech/ASR/conformer_ctc/pretrained.py
+++ b/egs/librispeech/ASR/conformer_ctc/pretrained.py
@ -271,7 +271,7 @@ def main():
        use_feat_batchnorm=params.use_feat_batchnorm,
    )

-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
@ -351,7 +351,9 @@ def main():
        "attention-decoder",
    ]:
        logging.info(f"Loading HLG from {params.HLG}")
-        HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu"))
+        HLG = k2.Fsa.from_dict(
+            torch.load(params.HLG, map_location="cpu", weights_only=False)
+        )
        HLG = HLG.to(device)
        if not hasattr(HLG, "lm_scores"):
            # For whole-lattice-rescoring and attention-decoder
@ -362,7 +364,9 @@ def main():
            "attention-decoder",
        ]:
            logging.info(f"Loading G from {params.G}")
-            G = k2.Fsa.from_dict(torch.load(params.G, map_location="cpu"))
+            G = k2.Fsa.from_dict(
+                torch.load(params.G, map_location="cpu", weights_only=False)
+            )
            # Add epsilon self-loops to G as we will compose
            # it with the whole lattice later
            G = G.to(device)
--- a/egs/librispeech/ASR/conformer_ctc2/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc2/decode.py
@ -774,7 +774,9 @@ def main():
        H = None
        bpe_model = None
        HLG = k2.Fsa.from_dict(
-            torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
+            torch.load(
+                f"{params.lang_dir}/HLG.pt", map_location=device, weights_only=False
+            )
        )
        assert HLG.requires_grad is False

@ -814,7 +816,9 @@ def main():
                torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
        else:
            logging.info("Loading pre-compiled G_4_gram.pt")
-            d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device)
+            d = torch.load(
+                params.lm_dir / "G_4_gram.pt", map_location=device, weights_only=False
+            )
            G = k2.Fsa.from_dict(d)

        if params.method in [
--- a/egs/librispeech/ASR/conformer_ctc2/train.py
+++ b/egs/librispeech/ASR/conformer_ctc2/train.py
@ -65,7 +65,6 @@ from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -84,9 +83,11 @@ from icefall.lexicon import Lexicon
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    encode_supervisions,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -420,7 +421,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -629,7 +630,7 @@ def train_one_epoch(
    scheduler: LRSchedulerType,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -676,7 +677,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -965,7 +966,7 @@ def run(rank, world_size, args):
            params=params,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1036,7 +1037,7 @@ def scan_pessimistic_batches_for_oom(
            # warmup = 0.0 is so that the derivs for the pruned loss stay zero
            # (i.e. are not remembered by the decaying-average in adam), because
            # we want to avoid these params being subject to shrinkage in adam.
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/conformer_ctc3/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc3/decode.py
@ -868,7 +868,9 @@ def main():
        H = None
        bpe_model = None
        HLG = k2.Fsa.from_dict(
-            torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
+            torch.load(
+                f"{params.lang_dir}/HLG.pt", map_location=device, weights_only=False
+            )
        )
        assert HLG.requires_grad is False

@ -907,7 +909,9 @@ def main():
                torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
        else:
            logging.info("Loading pre-compiled G_4_gram.pt")
-            d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device)
+            d = torch.load(
+                params.lm_dir / "G_4_gram.pt", map_location=device, weights_only=False
+            )
            G = k2.Fsa.from_dict(d)

        if params.decoding_method == "whole-lattice-rescoring":
--- a/egs/librispeech/ASR/conformer_ctc3/jit_pretrained.py
+++ b/egs/librispeech/ASR/conformer_ctc3/jit_pretrained.py
@ -334,7 +334,9 @@ def main():
        "whole-lattice-rescoring",
    ]:
        logging.info(f"Loading HLG from {params.HLG}")
-        HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu"))
+        HLG = k2.Fsa.from_dict(
+            torch.load(params.HLG, map_location="cpu", weights_only=False)
+        )
        HLG = HLG.to(device)
        if not hasattr(HLG, "lm_scores"):
            # For whole-lattice-rescoring and attention-decoder
@ -345,7 +347,9 @@ def main():
            "whole-lattice-rescoring",
        ]:
            logging.info(f"Loading G from {params.G}")
-            G = k2.Fsa.from_dict(torch.load(params.G, map_location="cpu"))
+            G = k2.Fsa.from_dict(
+                torch.load(params.G, map_location="cpu", weights_only=False)
+            )
            G = G.to(device)
            if params.method == "whole-lattice-rescoring":
                # Add epsilon self-loops to G as we will compose
--- a/egs/librispeech/ASR/conformer_ctc3/pretrained.py
+++ b/egs/librispeech/ASR/conformer_ctc3/pretrained.py
@ -290,7 +290,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
@ -386,7 +386,9 @@ def main():
        "whole-lattice-rescoring",
    ]:
        logging.info(f"Loading HLG from {params.HLG}")
-        HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu"))
+        HLG = k2.Fsa.from_dict(
+            torch.load(params.HLG, map_location="cpu", weights_only=False)
+        )
        HLG = HLG.to(device)
        if not hasattr(HLG, "lm_scores"):
            # For whole-lattice-rescoring and attention-decoder
@ -397,7 +399,9 @@ def main():
            "whole-lattice-rescoring",
        ]:
            logging.info(f"Loading G from {params.G}")
-            G = k2.Fsa.from_dict(torch.load(params.G, map_location="cpu"))
+            G = k2.Fsa.from_dict(
+                torch.load(params.G, map_location="cpu", weights_only=False)
+            )
            G = G.to(device)
            if params.method == "whole-lattice-rescoring":
                # Add epsilon self-loops to G as we will compose
--- a/egs/librispeech/ASR/conformer_ctc3/train.py
+++ b/egs/librispeech/ASR/conformer_ctc3/train.py
@ -76,7 +76,6 @@ from lhotse.utils import fix_random_seed
 from model import CTCModel
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -95,9 +94,11 @@ from icefall.lexicon import Lexicon
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    encode_supervisions,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -493,7 +494,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -694,7 +695,7 @@ def train_one_epoch(
    graph_compiler: Union[BpeCtcTrainingGraphCompiler, CtcTrainingGraphCompiler],
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -743,7 +744,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1004,7 +1005,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1073,7 +1074,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/conformer_mmi/decode.py
+++ b/egs/librispeech/ASR/conformer_mmi/decode.py
@ -574,7 +574,9 @@ def main():
        H = None
        bpe_model = None
        HLG = k2.Fsa.from_dict(
-            torch.load(f"{params.lang_dir}/HLG.pt", map_location="cpu")
+            torch.load(
+                f"{params.lang_dir}/HLG.pt", map_location="cpu", weights_only=False
+            )
        )
        HLG = HLG.to(device)
        assert HLG.requires_grad is False
@ -609,7 +611,9 @@ def main():
                torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
        else:
            logging.info("Loading pre-compiled G_4_gram.pt")
-            d = torch.load(params.lm_dir / "G_4_gram.pt", map_location="cpu")
+            d = torch.load(
+                params.lm_dir / "G_4_gram.pt", map_location="cpu", weights_only=False
+            )
            G = k2.Fsa.from_dict(d).to(device)

        if params.method in ["whole-lattice-rescoring", "attention-decoder"]:
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/train.py
@ -80,7 +80,6 @@ from lhotse.utils import fix_random_seed
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -93,7 +92,14 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    create_grad_scaler,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -560,7 +566,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -727,7 +733,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -772,7 +778,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1002,7 +1008,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1071,7 +1077,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py
+++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless2/train.py
@ -80,7 +80,6 @@ from lhotse.utils import fix_random_seed
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -93,7 +92,14 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    create_grad_scaler,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -560,7 +566,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -727,7 +733,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -772,7 +778,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1001,7 +1007,7 @@ def run(rank, world_size, args):
            params=params,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1072,7 +1078,7 @@ def scan_pessimistic_batches_for_oom(
            # warmup = 0.0 is so that the derivs for the pruned loss stay zero
            # (i.e. are not remembered by the decaying-average in adam), because
            # we want to avoid these params being subject to shrinkage in adam.
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/local/compile_hlg.py
+++ b/egs/librispeech/ASR/local/compile_hlg.py
@ -72,11 +72,11 @@ def compile_HLG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
    max_token_id = max(lexicon.tokens)
    logging.info(f"Building ctc_topo. max_token_id: {max_token_id}")
    H = k2.ctc_topo(max_token_id)
-    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
+    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt", weights_only=False))

    if Path(f"data/lm/{lm}.pt").is_file():
        logging.info(f"Loading pre-compiled {lm}")
-        d = torch.load(f"data/lm/{lm}.pt")
+        d = torch.load(f"data/lm/{lm}.pt", weights_only=False)
        G = k2.Fsa.from_dict(d)
    else:
        logging.info(f"Loading {lm}.fst.txt")
--- a/egs/librispeech/ASR/local/compile_lg.py
+++ b/egs/librispeech/ASR/local/compile_lg.py
@ -66,11 +66,11 @@ def compile_LG(lang_dir: str, lm: str = "G_3_gram") -> k2.Fsa:
      An FSA representing LG.
    """
    lexicon = Lexicon(lang_dir)
-    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt"))
+    L = k2.Fsa.from_dict(torch.load(f"{lang_dir}/L_disambig.pt", weights_only=False))

    if Path(f"data/lm/{lm}.pt").is_file():
        logging.info(f"Loading pre-compiled {lm}")
-        d = torch.load(f"data/lm/{lm}.pt")
+        d = torch.load(f"data/lm/{lm}.pt", weights_only=False)
        G = k2.Fsa.from_dict(d)
    else:
        logging.info(f"Loading {lm}.fst.txt")
--- a/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/decode.py
@ -750,7 +750,7 @@ def main():
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
+                torch.load(lg_filename, map_location=device, weights_only=False)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
--- a/egs/librispeech/ASR/lstm_transducer_stateless/model.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/model.py
@ -23,7 +23,7 @@ import torch.nn as nn
 from encoder_interface import EncoderInterface
 from scaling import ScaledLinear

-from icefall.utils import add_sos
+from icefall.utils import add_sos, torch_autocast


 class Transducer(nn.Module):
@ -156,7 +156,7 @@ class Transducer(nn.Module):
        lm = self.simple_lm_proj(decoder_out)
        am = self.simple_am_proj(encoder_out)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -192,7 +192,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = self.joiner(am_pruned, lm_pruned, project_input=False)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
--- a/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/pretrained.py
@ -238,7 +238,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
--- a/egs/librispeech/ASR/lstm_transducer_stateless/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless/train.py
@ -66,7 +66,6 @@ from lstm import RNN
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -82,9 +81,11 @@ from icefall.env import get_env_info
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -521,7 +522,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -717,7 +718,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -763,7 +764,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1023,7 +1024,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1092,7 +1093,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless2/decode.py
@ -935,7 +935,7 @@ def main():
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
+                torch.load(lg_filename, map_location=device, weights_only=False)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
--- a/egs/librispeech/ASR/lstm_transducer_stateless2/model.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless2/model.py
@ -23,7 +23,7 @@ import torch.nn as nn
 from encoder_interface import EncoderInterface
 from scaling import ScaledLinear

-from icefall.utils import add_sos
+from icefall.utils import add_sos, torch_autocast


 class Transducer(nn.Module):
@ -195,7 +195,7 @@ class Transducer(nn.Module):
        lm = simple_lm_proj(decoder_out)
        am = simple_am_proj(encoder_out)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -231,7 +231,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = joiner(am_pruned, lm_pruned, project_input=False)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
--- a/egs/librispeech/ASR/lstm_transducer_stateless2/pretrained.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless2/pretrained.py
@ -241,7 +241,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
--- a/egs/librispeech/ASR/lstm_transducer_stateless2/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless2/train.py
@ -74,7 +74,6 @@ from lstm import RNN
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -90,9 +89,11 @@ from icefall.env import get_env_info
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -560,7 +561,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -772,7 +773,7 @@ def train_one_epoch(
    giga_train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    rng: random.Random,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -848,7 +849,7 @@ def train_one_epoch(
        libri = is_libri(batch["supervisions"]["cut"][0])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1176,7 +1177,7 @@ def run(rank, world_size, args):
        else:
            logging.info("Skip scan_pessimistic_batches_for_oom")

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1247,7 +1248,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/lstm_transducer_stateless3/decode.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless3/decode.py
@ -815,7 +815,7 @@ def main():
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
+                torch.load(lg_filename, map_location=device, weights_only=False)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
--- a/egs/librispeech/ASR/lstm_transducer_stateless3/pretrained.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless3/pretrained.py
@ -239,7 +239,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
--- a/egs/librispeech/ASR/lstm_transducer_stateless3/train.py
+++ b/egs/librispeech/ASR/lstm_transducer_stateless3/train.py
@ -66,7 +66,6 @@ from lstm import RNN
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -82,9 +81,11 @@ from icefall.env import get_env_info
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -551,7 +552,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -747,7 +748,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -793,7 +794,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1067,7 +1068,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1136,7 +1137,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/pruned2_knowledge/model.py
+++ b/egs/librispeech/ASR/pruned2_knowledge/model.py
@ -21,7 +21,7 @@ import torch.nn as nn
 from encoder_interface import EncoderInterface
 from scaling import ScaledLinear

-from icefall.utils import add_sos
+from icefall.utils import add_sos, torch_autocast


 class Transducer(nn.Module):
@ -141,7 +141,7 @@ class Transducer(nn.Module):
        lm = self.simple_lm_proj(decoder_out)
        am = self.simple_am_proj(encoder_out)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -176,7 +176,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = self.joiner(am_pruned, lm_pruned, project_input=False)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
--- a/egs/librispeech/ASR/pruned2_knowledge/sampling.py
+++ b/egs/librispeech/ASR/pruned2_knowledge/sampling.py
@ -10,9 +10,11 @@ from typing import Optional, Tuple
 import torch
 from scaling import ScaledLinear
 from torch import Tensor, nn
-from torch.cuda.amp import GradScaler, custom_bwd, custom_fwd
+from torch.cuda.amp import custom_bwd, custom_fwd
 from torch_scheduled_sampling import sample_combined

+from icefall.utils import create_grad_scaler, torch_autocast
+
 # The main exports of this file are the module KnowledgeBaseLookup and the
 # function create_knowledge_base.

@ -330,14 +332,14 @@ def _test_knowledge_base_lookup_autocast():
    optimizer = Eve(m.parameters(), lr=0.005, eps=1.0e-04)
    m = m.to(device)

-    scaler = GradScaler(enabled=True)
+    scaler = create_grad_scaler(enabled=True)

    start = timeit.default_timer()

    for epoch in range(150):
        for n, (x, y) in enumerate(train_pairs):
            y_out = m(x)
-            with torch.cuda.amp.autocast(enabled=True):
+            with torch_autocast(enabled=True):
                loss = ((y_out - y) ** 2).mean() * 100.0
            if n % 10 == 0 and epoch % 10 == 0:
                print(f"Epoch {epoch}, batch {n}, loss {loss.item()}")
--- a/egs/librispeech/ASR/pruned2_knowledge/train.py
+++ b/egs/librispeech/ASR/pruned2_knowledge/train.py
@ -66,7 +66,6 @@ from lhotse.utils import fix_random_seed
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -76,7 +75,14 @@ from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
 from icefall.checkpoint import save_checkpoint_with_global_batch_idx
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    create_grad_scaler,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -453,7 +459,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -608,7 +614,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
    rank: int = 0,
@ -650,7 +656,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -868,7 +874,7 @@ def run(rank, world_size, args):
            params=params,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -937,7 +943,7 @@ def scan_pessimistic_batches_for_oom(
            # warmup = 0.0 is so that the derivs for the pruned loss stay zero
            # (i.e. are not remembered by the decaying-average in adam), because
            # we want to avoid these params being subject to shrinkage in adam.
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py
+++ b/egs/librispeech/ASR/pruned_stateless_emformer_rnnt2/train.py
@ -55,7 +55,6 @@ from lhotse.utils import fix_random_seed
 from model import Transducer
 from noam import Noam
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -68,7 +67,14 @@ from icefall.checkpoint import (
 )
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    create_grad_scaler,
+    setup_logger,
+    str2bool,
+    torch_autocast,
+)


 def add_model_arguments(parser: argparse.ArgumentParser):
@ -496,7 +502,7 @@ def save_checkpoint(
    model_avg: Optional[nn.Module] = None,
    optimizer: Optional[torch.optim.Optimizer] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, and training stats to file.
@ -650,7 +656,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -693,7 +699,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -939,7 +945,7 @@ def run(rank, world_size, args):
            params=params,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1004,7 +1010,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
@ -741,7 +741,7 @@ def main():
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
+                torch.load(lg_filename, map_location=device, weights_only=False)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/decode.py
@ -754,7 +754,7 @@ def main():
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
+                torch.load(lg_filename, map_location=device, weights_only=False)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/model.py
@ -23,7 +23,7 @@ import torch.nn as nn
 from encoder_interface import EncoderInterface
 from scaling import ScaledLinear

-from icefall.utils import add_sos
+from icefall.utils import add_sos, torch_autocast


 class Transducer(nn.Module):
@ -157,7 +157,7 @@ class Transducer(nn.Module):
        lm = self.simple_lm_proj(decoder_out)
        am = self.simple_am_proj(encoder_out)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -193,7 +193,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = self.joiner(am_pruned, lm_pruned, project_input=False)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/pretrained.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/pretrained.py
@ -265,7 +265,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py
@ -78,7 +78,6 @@ from lhotse.utils import fix_random_seed
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -91,9 +90,11 @@ from icefall.env import get_env_info
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -523,7 +524,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -716,7 +717,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
    rank: int = 0,
@ -759,7 +760,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1000,7 +1001,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 0 else 1.0,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1067,7 +1068,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/decode.py
@ -921,7 +921,7 @@ def load_ngram_LM(

    if pt_file.is_file():
        logging.info(f"Loading pre-compiled {pt_file}")
-        d = torch.load(pt_file, map_location=device)
+        d = torch.load(pt_file, map_location=device, weights_only=False)
        G = k2.Fsa.from_dict(d)
        G = k2.add_epsilon_self_loops(G)
        G = k2.arc_sort(G)
@ -1101,7 +1101,7 @@ def main():
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
+                torch.load(lg_filename, map_location=device, weights_only=False)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        elif params.decoding_method in [
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/model.py
@ -23,7 +23,7 @@ import torch.nn as nn
 from encoder_interface import EncoderInterface
 from scaling import ScaledLinear

-from icefall.utils import add_sos
+from icefall.utils import add_sos, torch_autocast


 class Transducer(nn.Module):
@ -195,7 +195,7 @@ class Transducer(nn.Module):
        lm = simple_lm_proj(decoder_out)
        am = simple_am_proj(encoder_out)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -231,7 +231,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = joiner(am_pruned, lm_pruned, project_input=False)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/pretrained.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/pretrained.py
@ -274,7 +274,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py
@ -74,7 +74,6 @@ from librispeech import LibriSpeech
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -87,9 +86,11 @@ from icefall.env import get_env_info
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -546,7 +547,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -755,7 +756,7 @@ def train_one_epoch(
    giga_train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    rng: random.Random,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
    rank: int = 0,
@ -827,7 +828,7 @@ def train_one_epoch(

        libri = is_libri(batch["supervisions"]["cut"][0])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1126,7 +1127,7 @@ def run(rank, world_size, args):
                warmup=0.0 if params.start_epoch == 0 else 1.0,
            )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1195,7 +1196,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless4/decode.py
@ -913,7 +913,7 @@ def main():
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
+                torch.load(lg_filename, map_location=device, weights_only=False)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
--- a/egs/librispeech/ASR/pruned_transducer_stateless4/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless4/train.py
@ -80,7 +80,6 @@ from lhotse.utils import fix_random_seed
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -96,9 +95,11 @@ from icefall.env import get_env_info
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -548,7 +549,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -744,7 +745,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -789,7 +790,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1047,7 +1048,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1116,7 +1117,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/decode.py
@ -972,7 +972,7 @@ def main():
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
+                torch.load(lg_filename, map_location=device, weights_only=False)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/pretrained.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/pretrained.py
@ -238,7 +238,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
--- a/egs/librispeech/ASR/pruned_transducer_stateless5/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless5/train.py
@ -68,7 +68,6 @@ from lhotse.utils import fix_random_seed
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -84,9 +83,11 @@ from icefall.env import get_env_info
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -571,7 +572,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -768,7 +769,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -814,7 +815,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1078,7 +1079,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1147,7 +1148,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/pruned_transducer_stateless6/model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless6/model.py
@ -23,7 +23,7 @@ import torch.nn as nn
 from encoder_interface import EncoderInterface
 from scaling import ScaledLinear

-from icefall.utils import add_sos
+from icefall.utils import add_sos, torch_autocast


 class Transducer(nn.Module):
@ -185,7 +185,7 @@ class Transducer(nn.Module):
        lm = self.simple_lm_proj(decoder_out)
        am = self.simple_am_proj(encoder_out)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -220,7 +220,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = self.joiner(am_pruned, lm_pruned, project_input=False)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
--- a/egs/librispeech/ASR/pruned_transducer_stateless6/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless6/train.py
@ -80,7 +80,6 @@ from lhotse.utils import fix_random_seed
 from model import Transducer
 from optim import Eden, Eve
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter

@ -96,9 +95,11 @@ from icefall.env import get_env_info
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    display_and_save_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -519,7 +520,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -736,7 +737,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -781,7 +782,7 @@ def train_one_epoch(
        params.batch_idx_train += 1
        batch_size = len(batch["supervisions"]["text"])

-        with torch.cuda.amp.autocast(enabled=params.use_fp16):
+        with torch_autocast(enabled=params.use_fp16):
            loss, loss_info = compute_loss(
                params=params,
                model=model,
@ -1039,7 +1040,7 @@ def run(rank, world_size, args):
            warmup=0.0 if params.start_epoch == 1 else 1.0,
        )

-    scaler = GradScaler(enabled=params.use_fp16)
+    scaler = create_grad_scaler(enabled=params.use_fp16)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1108,7 +1109,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless6/vq_utils.py
@ -348,7 +348,9 @@ class CodebookIndexExtractor:
            num_codebooks=self.params.num_codebooks,
            codebook_size=256,
        )
-        quantizer.load_state_dict(torch.load(self.quantizer_file_path))
+        quantizer.load_state_dict(
+            torch.load(self.quantizer_file_path, weights_only=False)
+        )
        quantizer.to(self.params.device)
        return quantizer

--- a/egs/librispeech/ASR/pruned_transducer_stateless7/compute_ali.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/compute_ali.py
@ -289,7 +289,7 @@ def main():
    logging.info("About to create model")
    model = get_transducer_model(params)

-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/decode.py
@ -910,7 +910,7 @@ def main():
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
+                torch.load(lg_filename, map_location=device, weights_only=False)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/decode_gigaspeech.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/decode_gigaspeech.py
@ -813,7 +813,7 @@ def main():
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
+                torch.load(lg_filename, map_location=device, weights_only=False)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/finetune.py
@ -66,7 +66,6 @@ from lhotse.utils import fix_random_seed
 from model import Transducer
 from optim import Eden, ScaledAdam
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter
 from zipformer import Zipformer
@ -85,9 +84,11 @@ from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    filter_uneven_sized_batch,
    setup_logger,
    str2bool,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -635,7 +636,7 @@ def load_model_params(

    """
    logging.info(f"Loading checkpoint from {ckpt}")
-    checkpoint = torch.load(ckpt, map_location="cpu")
+    checkpoint = torch.load(ckpt, map_location="cpu", weights_only=False)

    # if module list is empty, load the whole model from ckpt
    if not init_modules:
@ -678,7 +679,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -857,7 +858,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -903,7 +904,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1219,7 +1220,7 @@ def run(rank, world_size, args):
            params=params,
        )

-    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
+    scaler = create_grad_scaler(enabled=params.use_fp16, init_scale=1.0)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1319,7 +1320,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/model.py
@ -23,7 +23,7 @@ import torch.nn as nn
 from encoder_interface import EncoderInterface
 from scaling import penalize_abs_values_gt

-from icefall.utils import add_sos
+from icefall.utils import add_sos, torch_autocast


 class Transducer(nn.Module):
@ -150,7 +150,7 @@ class Transducer(nn.Module):
        # if self.training and random.random() < 0.25:
        #    am = penalize_abs_values_gt(am, 30.0, 1.0e-04)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -185,7 +185,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = self.joiner(am_pruned, lm_pruned, project_input=False)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/pretrained.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/pretrained.py
@ -247,7 +247,7 @@ def main():
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

-    checkpoint = torch.load(args.checkpoint, map_location="cpu")
+    checkpoint = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    model.load_state_dict(checkpoint["model"], strict=False)
    model.to(device)
    model.eval()
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
@ -28,6 +28,8 @@ import torch.nn.functional as F
 from torch import Tensor
 from torch.nn import Embedding as ScaledEmbedding

+from icefall.utils import torch_autocast
+

 class ActivationBalancerFunction(torch.autograd.Function):
    @staticmethod
@ -289,7 +291,7 @@ class SoftmaxFunction(torch.autograd.Function):
    @staticmethod
    def backward(ctx, ans_grad: Tensor):
        (ans,) = ctx.saved_tensors
-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            ans_grad = ans_grad.to(torch.float32)
            ans = ans.to(torch.float32)
            x_grad = ans_grad * ans
@ -669,7 +671,7 @@ class WhiteningPenaltyFunction(torch.autograd.Function):
    def backward(ctx, x_grad: Tensor):
        (x_orig,) = ctx.saved_tensors
        with torch.enable_grad():
-            with torch.cuda.amp.autocast(enabled=False):
+            with torch_autocast(enabled=False):
                x_detached = x_orig.to(torch.float32).detach()
                x_detached.requires_grad = True

@ -867,7 +869,7 @@ class MaxEig(torch.nn.Module):
        ):
            return _no_op(x)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            eps = 1.0e-20
            orig_x = x
            x = x.to(torch.float32)
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
@ -67,7 +67,6 @@ from lhotse.utils import fix_random_seed
 from model import Transducer
 from optim import Eden, ScaledAdam
 from torch import Tensor
-from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter
 from zipformer import Zipformer
@ -86,10 +85,12 @@ from icefall.hooks import register_inf_check_hooks
 from icefall.utils import (
    AttributeDict,
    MetricsTracker,
+    create_grad_scaler,
    filter_uneven_sized_batch,
    setup_logger,
    str2bool,
    symlink_or_copy,
+    torch_autocast,
 )

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@ -581,7 +582,7 @@ def save_checkpoint(
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
-    scaler: Optional[GradScaler] = None,
+    scaler: Optional["GradScaler"] = None,
    rank: int = 0,
 ) -> None:
    """Save model, optimizer, scheduler and training stats to file.
@ -763,7 +764,7 @@ def train_one_epoch(
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
-    scaler: GradScaler,
+    scaler: "GradScaler",
    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
@ -809,7 +810,7 @@ def train_one_epoch(
        batch_size = len(batch["supervisions"]["text"])

        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, loss_info = compute_loss(
                    params=params,
                    model=model,
@ -1106,7 +1107,7 @@ def run(rank, world_size, args):
            params=params,
        )

-    scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0)
+    scaler = create_grad_scaler(enabled=params.use_fp16, init_scale=1.0)
    if checkpoints and "grad_scaler" in checkpoints:
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])
@ -1206,7 +1207,7 @@ def scan_pessimistic_batches_for_oom(
    for criterion, cuts in batches.items():
        batch = train_dl.dataset[cuts]
        try:
-            with torch.cuda.amp.autocast(enabled=params.use_fp16):
+            with torch_autocast(enabled=params.use_fp16):
                loss, _ = compute_loss(
                    params=params,
                    model=model,
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@ -44,7 +44,7 @@ from scaling import (
 from torch import Tensor, nn

 from icefall.dist import get_rank
-from icefall.utils import is_jit_tracing, make_pad_mask
+from icefall.utils import is_jit_tracing, make_pad_mask, torch_autocast


 class Zipformer(EncoderInterface):
@ -1421,7 +1421,7 @@ class RelPositionMultiheadAttention(nn.Module):
        bsz = n // num_heads

        with torch.no_grad():
-            with torch.cuda.amp.autocast(enabled=False):
+            with torch_autocast(enabled=False):
                attn_weights = attn_weights.to(torch.float32)
                attn_output = attn_output.to(torch.float32)
                attn_weights_entropy = (
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/ctc_decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/ctc_decode.py
@ -633,7 +633,9 @@ def main():
        H = None
        bpe_model = None
        HLG = k2.Fsa.from_dict(
-            torch.load(f"{params.lang_dir}/HLG.pt", map_location=device)
+            torch.load(
+                f"{params.lang_dir}/HLG.pt", map_location=device, weights_only=False
+            )
        )
        assert HLG.requires_grad is False

@ -672,7 +674,9 @@ def main():
                torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
        else:
            logging.info("Loading pre-compiled G_4_gram.pt")
-            d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device)
+            d = torch.load(
+                params.lm_dir / "G_4_gram.pt", map_location=device, weights_only=False
+            )
            G = k2.Fsa.from_dict(d)

        if params.decoding_method == "whole-lattice-rescoring":
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/decode.py
@ -786,7 +786,7 @@ def main():
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
-                torch.load(lg_filename, map_location=device)
+                torch.load(lg_filename, map_location=device, weights_only=False)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/jit_pretrained_ctc.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/jit_pretrained_ctc.py
@ -347,7 +347,9 @@ def main():
        "whole-lattice-rescoring",
    ]:
        logging.info(f"Loading HLG from {params.HLG}")
-        HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu"))
+        HLG = k2.Fsa.from_dict(
+            torch.load(params.HLG, map_location="cpu", weights_only=False)
+        )
        HLG = HLG.to(device)
        if not hasattr(HLG, "lm_scores"):
            # For whole-lattice-rescoring and attention-decoder
@ -358,7 +360,9 @@ def main():
            "whole-lattice-rescoring",
        ]:
            logging.info(f"Loading G from {params.G}")
-            G = k2.Fsa.from_dict(torch.load(params.G, map_location="cpu"))
+            G = k2.Fsa.from_dict(
+                torch.load(params.G, map_location="cpu", weights_only=False)
+            )
            G = G.to(device)
            if params.method == "whole-lattice-rescoring":
                # Add epsilon self-loops to G as we will compose
--- a/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/model.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7_ctc/model.py
@ -22,7 +22,7 @@ import torch
 import torch.nn as nn
 from encoder_interface import EncoderInterface

-from icefall.utils import add_sos
+from icefall.utils import add_sos, torch_autocast


 class Transducer(nn.Module):
@ -150,7 +150,7 @@ class Transducer(nn.Module):
        lm = self.simple_lm_proj(decoder_out)
        am = self.simple_am_proj(encoder_out)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            simple_loss, (px_grad, py_grad) = k2.rnnt_loss_smoothed(
                lm=lm.float(),
                am=am.float(),
@ -185,7 +185,7 @@ class Transducer(nn.Module):
        # prior to do_rnnt_pruning (this is an optimization for speed).
        logits = self.joiner(am_pruned, lm_pruned, project_input=False)

-        with torch.cuda.amp.autocast(enabled=False):
+        with torch_autocast(enabled=False):
            pruned_loss = k2.rnnt_loss_pruned(
                logits=logits.float(),
                symbols=y_padded,
--- a/Show More
+++ b/Show More