CI for streaming zipformer CTC + HLG decoding

This commit is contained in:
Fangjun Kuang 2024-03-18 19:44:17 +08:00
parent 557bf292a2
commit d1410c52e7
2 changed files with 51 additions and 7 deletions

View File

@ -64,6 +64,46 @@ function run_diagnostics() {
--print-diagnostics 1
}
# CI test: export a streaming zipformer CTC model to ONNX and decode
# test waves with HLG. Downloads the pre-trained checkpoint from
# Hugging Face, re-exports it, runs the streaming HLG decoder on each
# sample wav, and cleans up afterwards.
# Requires: git-lfs, and the `log` helper defined earlier in this file.
function test_streaming_zipformer_ctc_hlg() {
  repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-streaming-zipformer-small-2024-03-18

  log "Downloading pre-trained model from $repo_url"
  git lfs install
  git clone "$repo_url"
  repo=$(basename "$repo_url")

  # Delete the ONNX models shipped with the repo so the export step
  # below is genuinely exercised (we must not accidentally test
  # pre-built artifacts).
  rm "$repo"/exp-ctc-rnnt-small/*.onnx
  ls -lh "$repo"/exp-ctc-rnnt-small

  # Export the model to ONNX.
  # NOTE(review): the model-architecture flags below must match the
  # checkpoint's training configuration — keep them in sync with the
  # Hugging Face repo if the checkpoint is ever updated.
  ./zipformer/export-onnx-streaming-ctc.py \
    --tokens "$repo"/data/lang_bpe_500/tokens.txt \
    --epoch 30 \
    --avg 3 \
    --exp-dir "$repo"/exp-ctc-rnnt-small \
    --causal 1 \
    --use-ctc 1 \
    --chunk-size 16 \
    --left-context-frames 128 \
    --num-encoder-layers 2,2,2,2,2,2 \
    --feedforward-dim 512,768,768,768,768,768 \
    --encoder-dim 192,256,256,256,256,256 \
    --encoder-unmasked-dim 192,192,192,192,192,192

  ls -lh "$repo"/exp-ctc-rnnt-small

  # Decode each test wave with the int8 ONNX model + HLG.
  # 8k.wav checks that the decoder resamples non-16kHz input.
  for wav in 0.wav 1.wav 8k.wav; do
    python3 ./zipformer/onnx_pretrained_ctc_HLG_streaming.py \
      --nn-model "$repo"/exp-ctc-rnnt-small/ctc-epoch-30-avg-3-chunk-16-left-128.int8.onnx \
      --words "$repo"/data/lang_bpe_500/words.txt \
      --HLG "$repo"/data/lang_bpe_500/HLG.fst \
      "$repo"/test_wavs/$wav
  done

  rm -rf "$repo"
}
function test_pruned_transducer_stateless_2022_03_12() {
  repo_url=https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless-2022-03-12
@ -1577,6 +1617,7 @@ function test_transducer_bpe_500_2021_12_23() {
prepare_data
run_diagnostics
test_streaming_zipformer_ctc_hlg
test_pruned_transducer_stateless_2022_03_12
test_pruned_transducer_stateless2_2022_04_29
test_pruned_transducer_stateless3_2022_04_29

View File

@ -27,10 +27,10 @@ popd
2. Export the model to ONNX 2. Export the model to ONNX
./zipformer/export-onnx-streaming-ctc.py \ ./zipformer/export-onnx-streaming-ctc.py \
--tokens ./data/lang_bpe_500/tokens.txt \ --tokens $repo/data/lang_bpe_500/tokens.txt \
--epoch 30 \ --epoch 30 \
--avg 3 \ --avg 3 \
--exp-dir zipformer/exp-ctc-rnnt-small \ --exp-dir $repo/exp-ctc-rnnt-small \
--causal 1 \ --causal 1 \
--use-ctc 1 \ --use-ctc 1 \
--chunk-size 16 \ --chunk-size 16 \
@ -107,8 +107,7 @@ def get_parser():
type=str, type=str,
help="The input sound file to transcribe. " help="The input sound file to transcribe. "
"Supported formats are those supported by torchaudio.load(). " "Supported formats are those supported by torchaudio.load(). "
"For example, wav and flac are supported. " "For example, wav and flac are supported. ",
"The sample rate has to be 16kHz.",
) )
return parser return parser
@ -311,9 +310,13 @@ def read_sound_files(
ans = [] ans = []
for f in filenames: for f in filenames:
wave, sample_rate = torchaudio.load(f) wave, sample_rate = torchaudio.load(f)
assert ( if sample_rate != expected_sample_rate:
sample_rate == expected_sample_rate logging.info(f"Resample {sample_rate} to {expected_sample_rate}")
), f"expected sample rate: {expected_sample_rate}. Given: {sample_rate}" wave = torchaudio.functional.resample(
wave,
orig_freq=sample_rate,
new_freq=expected_sample_rate,
)
# We use only the first channel # We use only the first channel
ans.append(wave[0].contiguous()) ans.append(wave[0].contiguous())
return ans return ans