Export multi_zh-hans models to onnx

2025-06-30 10:58:18 +08:00 · 2025-06-30 10:58:18 +08:00 · 9d4b0dfcd4
commit 9d4b0dfcd4
parent abd9437e6d
5 changed files with 283 additions and 227 deletions
--- a/.github/scripts/multi-zh-hans.sh
+++ b/.github/scripts/multi-zh-hans.sh
@ -1,200 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-git config --global user.name "k2-fsa"
-git config --global user.email "csukuangfj@gmail.com"
-git config --global lfs.allowincompletepush true
-
-log() {
-  # This function is from espnet
-  local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
-}
-
-log "pwd: $PWD"
-
-cd egs/multi_zh-hans/ASR
-
-repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-2023-9-2
-log "Downloading pre-trained model from $repo_url"
-GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
-repo=$(basename $repo_url)
-pushd $repo
-cd exp
-git lfs pull --include pretrained.pt
-ln -s pretrained.pt epoch-99.pt
-cd ../data/lang_bpe_2000
-ls -lh
-git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
-git lfs pull --include "*.model"
-ls -lh
-popd
-
-log "--------------------------------------------"
-log "Export non-streaming ONNX transducer models "
-log "--------------------------------------------"
-./zipformer/export-onnx.py \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  --use-averaged-model 0 \
-  --epoch 99 \
-  --avg 1 \
-  --exp-dir $repo/exp \
-  --causal False
-
-ls -lh $repo/exp
-
-./zipformer/onnx_pretrained.py \
-  --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
-  --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
-  --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  $repo/test_wavs/DEV_T0000000000.wav \
-  $repo/test_wavs/DEV_T0000000001.wav \
-  $repo/test_wavs/DEV_T0000000002.wav \
-  $repo/test_wavs/TEST_MEETING_T0000000113.wav \
-  $repo/test_wavs/TEST_MEETING_T0000000219.wav \
-  $repo/test_wavs/TEST_MEETING_T0000000351.wav
-
-rm -rf $repo
-
-repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05
-log "Downloading pre-trained model from $repo_url"
-GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
-repo=$(basename $repo_url)
-
-pushd $repo
-cd exp/
-git lfs pull --include pretrained.pt
-rm -fv epoch-20.pt
-rm -fv *.onnx
-ln -s pretrained.pt epoch-20.pt
-cd ../data/lang_bpe_2000
-ls -lh
-git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
-git lfs pull --include "*.model"
-ls -lh
-popd
-
-log "----------------------------------------"
-log "Export streaming ONNX CTC models "
-log "----------------------------------------"
-./zipformer/export-onnx-streaming-ctc.py \
-  --exp-dir $repo/exp \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  --causal 1 \
-  --avg 1 \
-  --epoch 20 \
-  --use-averaged-model 0 \
-  --chunk-size 16 \
-  --left-context-frames 128 \
-  --use-ctc 1
-
-ls -lh $repo/exp/
-
-log "------------------------------------------------------------"
-log "Test exported streaming ONNX CTC models (greedy search)     "
-log "------------------------------------------------------------"
-
-test_wavs=(
-DEV_T0000000000.wav
-DEV_T0000000001.wav
-DEV_T0000000002.wav
-TEST_MEETING_T0000000113.wav
-TEST_MEETING_T0000000219.wav
-TEST_MEETING_T0000000351.wav
-)
-
-for w in ${test_wavs[@]}; do
-  ./zipformer/onnx_pretrained-streaming-ctc.py \
-    --model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
-    --tokens $repo/data/lang_bpe_2000/tokens.txt \
-    $repo/test_wavs/$w
-done
-
-log "Upload onnx CTC models to huggingface"
-url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
-GIT_LFS_SKIP_SMUDGE=1 git clone $url
-dst=$(basename $url)
-cp -v $repo/exp/ctc*.onnx $dst
-cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
-cp -v $repo/data/lang_bpe_2000/bpe.model $dst
-mkdir -p $dst/test_wavs
-cp -v $repo/test_wavs/*.wav $dst/test_wavs
-cd $dst
-git lfs track "*.onnx" "bpe.model"
-ls -lh
-file bpe.model
-git status
-git add .
-git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
-
-log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
-rm -rf .git
-rm -fv .gitattributes
-cd ..
-tar cjfv $dst.tar.bz2 $dst
-ls -lh *.tar.bz2
-mv -v $dst.tar.bz2 ../../../
-
-log "----------------------------------------"
-log "Export streaming ONNX transducer models "
-log "----------------------------------------"
-
-./zipformer/export-onnx-streaming.py \
-  --exp-dir $repo/exp \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  --causal 1 \
-  --avg 1 \
-  --epoch 20 \
-  --use-averaged-model 0 \
-  --chunk-size 16 \
-  --left-context-frames 128 \
-  --use-ctc 0
-
-ls -lh $repo/exp
-
-log "------------------------------------------------------------"
-log "Test exported streaming ONNX transducer models (Python code)"
-log "------------------------------------------------------------"
-
-log "test fp32"
-./zipformer/onnx_pretrained-streaming.py \
-  --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx \
-  --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
-  --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  $repo/test_wavs/DEV_T0000000000.wav
-
-log "test int8"
-./zipformer/onnx_pretrained-streaming.py \
-  --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
-  --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
-  --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
-  --tokens $repo/data/lang_bpe_2000/tokens.txt \
-  $repo/test_wavs/DEV_T0000000000.wav
-
-log "Upload onnx transducer models to huggingface"
-
-url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12
-GIT_LFS_SKIP_SMUDGE=1 git clone $url
-dst=$(basename $url)
-cp -v $repo/exp/encoder*.onnx $dst
-cp -v $repo/exp/decoder*.onnx $dst
-cp -v $repo/exp/joiner*.onnx $dst
-cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
-cp -v $repo/data/lang_bpe_2000/bpe.model $dst
-mkdir -p $dst/test_wavs
-cp -v $repo/test_wavs/*.wav $dst/test_wavs
-cd $dst
-git lfs track "*.onnx" bpe.model
-git add .
-git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true
-
-log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
-rm -rf .git
-rm -fv .gitattributes
-cd ..
-tar cjfv $dst.tar.bz2 $dst
-ls -lh *.tar.bz2
-mv -v $dst.tar.bz2 ../../../
--- a/.github/scripts/multi_zh-hans/ASR/run.sh
+++ b/.github/scripts/multi_zh-hans/ASR/run.sh
@ -0,0 +1,210 @@
+#!/usr/bin/env bash
+
+set -ex
+
+git config --global user.name "k2-fsa"
+git config --global user.email "csukuangfj@gmail.com"
+git config --global lfs.allowincompletepush true
+
+log() {
+  # From espnet: print "<timestamp> (<caller file>:<call line>:<caller function>) <message>"
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/multi_zh-hans/ASR
+
+log "pwd: $PWD"
+
+function run_2023_9_2() {  # Export the 2023-9-2 non-streaming model to ONNX and decode the test wavs
+  repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-2023-9-2
+  log "Downloading pre-trained model from $repo_url"
+  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+  repo=$(basename $repo_url)
+  pushd $repo
+  cd exp
+  git lfs pull --include pretrained.pt
+  ln -s pretrained.pt epoch-99.pt  # presumably so that --epoch 99 --avg 1 below resolves to this checkpoint
+  cd ../data/lang_bpe_2000
+  ls -lh
+  git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
+  git lfs pull --include "*.model"
+  ls -lh
+  popd
+
+  log "--------------------------------------------"
+  log "Export non-streaming ONNX transducer models "
+  log "--------------------------------------------"
+  ./zipformer/export-onnx.py \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    --use-averaged-model 0 \
+    --epoch 99 \
+    --avg 1 \
+    --exp-dir $repo/exp \
+    --causal False
+
+  ls -lh $repo/exp
+
+  ./zipformer/onnx_pretrained.py \
+    --encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
+    --decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
+    --joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    $repo/test_wavs/DEV_T0000000000.wav \
+    $repo/test_wavs/DEV_T0000000001.wav \
+    $repo/test_wavs/DEV_T0000000002.wav \
+    $repo/test_wavs/TEST_MEETING_T0000000113.wav \
+    $repo/test_wavs/TEST_MEETING_T0000000219.wav \
+    $repo/test_wavs/TEST_MEETING_T0000000351.wav
+
+  rm -rf $repo  # this function uploads nothing, so the clone is removed once the test decode is done
+}
+
+function run_2023_11_05_streaming() {  # Export/test streaming CTC + transducer ONNX models; upload the CTC one
+  repo_url=https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05
+  log "Downloading pre-trained model from $repo_url"
+  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+  repo=$(basename $repo_url)  # not local: run_2023_12_12_streaming reads $repo afterwards
+
+  pushd $repo
+  cd exp/
+  git lfs pull --include pretrained.pt
+  rm -fv epoch-20.pt
+  rm -fv *.onnx
+  ln -s pretrained.pt epoch-20.pt  # presumably so that --epoch 20 --avg 1 below resolves to this checkpoint
+  cd ../data/lang_bpe_2000
+  ls -lh
+  git lfs pull --include L.pt L_disambig.pt Linv.pt bpe.model
+  git lfs pull --include "*.model"
+  ls -lh
+  popd
+
+  log "----------------------------------------"
+  log "Export streaming ONNX CTC models "
+  log "----------------------------------------"
+  ./zipformer/export-onnx-streaming-ctc.py \
+    --exp-dir $repo/exp \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    --causal 1 \
+    --avg 1 \
+    --epoch 20 \
+    --use-averaged-model 0 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --use-ctc 1
+
+  ls -lh $repo/exp/
+
+  log "------------------------------------------------------------"
+  log "Test exported streaming ONNX CTC models (greedy search)     "
+  log "------------------------------------------------------------"
+
+  test_wavs=(
+    DEV_T0000000000.wav
+    DEV_T0000000001.wav
+    DEV_T0000000002.wav
+    TEST_MEETING_T0000000113.wav
+    TEST_MEETING_T0000000219.wav
+    TEST_MEETING_T0000000351.wav
+  )
+
+  for w in "${test_wavs[@]}"; do
+    ./zipformer/onnx_pretrained-streaming-ctc.py \
+      --model-filename $repo/exp/ctc-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
+      --tokens $repo/data/lang_bpe_2000/tokens.txt \
+      $repo/test_wavs/$w
+  done
+
+  log "Upload onnx CTC models to huggingface"
+  url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-ctc-multi-zh-hans-2023-12-13
+  GIT_LFS_SKIP_SMUDGE=1 git clone $url
+  dst=$(basename $url)
+  cp -v $repo/exp/ctc*.onnx $dst
+  cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
+  cp -v $repo/data/lang_bpe_2000/bpe.model $dst
+  mkdir -p $dst/test_wavs
+  cp -v $repo/test_wavs/*.wav $dst/test_wavs
+  cd $dst
+  git lfs track "*.onnx" "bpe.model"
+  ls -lh
+  file bpe.model
+  git status
+  git add .
+  git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true  # best effort: a failed commit or push must not abort under set -e
+
+  log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+  rm -rf .git  # strip git metadata before the directory is archived below
+  rm -fv .gitattributes
+  cd ..
+  tar cjfv $dst.tar.bz2 $dst
+  ls -lh *.tar.bz2
+  mv -v $dst.tar.bz2 ../../../  # three levels up from egs/multi_zh-hans/ASR
+
+  log "----------------------------------------"
+  log "Export streaming ONNX transducer models "
+  log "----------------------------------------"
+
+  ./zipformer/export-onnx-streaming.py \
+    --exp-dir $repo/exp \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    --causal 1 \
+    --avg 1 \
+    --epoch 20 \
+    --use-averaged-model 0 \
+    --chunk-size 16 \
+    --left-context-frames 128 \
+    --use-ctc 0
+
+  ls -lh $repo/exp
+
+  log "------------------------------------------------------------"
+  log "Test exported streaming ONNX transducer models (Python code)"
+  log "------------------------------------------------------------"
+
+  log "test fp32"
+  ./zipformer/onnx_pretrained-streaming.py \
+    --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.onnx \
+    --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
+    --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.onnx \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    $repo/test_wavs/DEV_T0000000000.wav
+
+  log "test int8"
+  ./zipformer/onnx_pretrained-streaming.py \
+    --encoder-model-filename $repo/exp/encoder-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
+    --decoder-model-filename $repo/exp/decoder-epoch-20-avg-1-chunk-16-left-128.onnx \
+    --joiner-model-filename $repo/exp/joiner-epoch-20-avg-1-chunk-16-left-128.int8.onnx \
+    --tokens $repo/data/lang_bpe_2000/tokens.txt \
+    $repo/test_wavs/DEV_T0000000000.wav
+}
+
+function run_2023_12_12_streaming() {  # Upload streaming transducer ONNX files; call after run_2023_11_05_streaming
+  log "Upload onnx transducer models to huggingface"
+
+  url=https://huggingface.co/k2-fsa/sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12
+  GIT_LFS_SKIP_SMUDGE=1 git clone $url
+  dst=$(basename $url)
+  cp -v $repo/exp/encoder*.onnx $dst  # $repo is not set here; it is the global assigned in run_2023_11_05_streaming
+  cp -v $repo/exp/decoder*.onnx $dst
+  cp -v $repo/exp/joiner*.onnx $dst
+  cp -v $repo/data/lang_bpe_2000/tokens.txt $dst
+  cp -v $repo/data/lang_bpe_2000/bpe.model $dst
+  mkdir -p $dst/test_wavs
+  cp -v $repo/test_wavs/*.wav $dst/test_wavs
+  cd $dst
+  git lfs track "*.onnx" bpe.model
+  git add .
+  git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true  # best effort: a failed commit or push must not abort under set -e
+
+  log "Upload models to https://github.com/k2-fsa/sherpa-onnx"
+  rm -rf .git  # strip git metadata before the directory is archived below
+  rm -fv .gitattributes
+  cd ..
+  tar cjfv $dst.tar.bz2 $dst
+  ls -lh *.tar.bz2
+  mv -v $dst.tar.bz2 ../../../  # three levels up from egs/multi_zh-hans/ASR
+}
+
+run_2023_9_2
+run_2023_11_05_streaming
+run_2023_12_12_streaming
--- a/.github/workflows/multi-zh-hans.yml
+++ b/.github/workflows/multi-zh-hans.yml
@ -15,47 +15,68 @@ permissions:
  contents: write

 jobs:
+  generate_build_matrix:
+    if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # Print the matrix once for debugging, then publish it (assumed single-line) via $GITHUB_OUTPUT; ::set-output is deprecated.
+          python ./.github/scripts/docker/generate_build_matrix.py --torch-version "2.0.0" --python-version "3.10"
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --torch-version "2.0.0" --python-version "3.10")
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"
  multi-zh-hans:
-    runs-on: ${{ matrix.os }}
+    needs: generate_build_matrix
+    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
+    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
-        os: [ubuntu-latest]
-        python-version: [3.8]
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-          cache: 'pip'
-          cache-dependency-path: '**/requirements-ci.txt'
-
-      - name: Install Python dependencies
-        run: |
-          grep -v '^#' ./requirements-ci.txt  | xargs -n 1 -L 1 pip install
-          pip uninstall -y protobuf
-          pip install --no-binary protobuf protobuf==3.20.*
-
-      - name: Cache kaldifeat
-        id: my-cache
-        uses: actions/cache@v2
-        with:
-          path: |
-            ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}-2023-05-22
-
-      - name: Install kaldifeat
-        if: steps.my-cache.outputs.cache-hit != 'true'
+      - name: Free space
        shell: bash
        run: |
-          .github/scripts/install-kaldifeat.sh
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+          echo "pwd: $PWD"
+          echo "github.workspace ${{ github.workspace }}"
+
+      - name: Test with multi_zh-hans
+        uses: addnab/docker-run-action@v3
+        with:
+            image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
+            options: |
+              --volume ${{ github.workspace }}/:/icefall
+            shell: bash
+            run: |
+              export PYTHONPATH=/icefall:$PYTHONPATH
+              export HF_TOKEN=${{ secrets.HF_TOKEN }}
+              cd /icefall
+              git config --global --add safe.directory /icefall
+
+              .github/scripts/multi_zh-hans/ASR/run.sh
+
+      - name: Show models
+        shell: bash
+        run: |
+          ls -lh *.tar.bz2

      - name: export-model
+        if: false
        shell: bash
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
--- a/egs/librispeech/ASR/zipformer/export-onnx-streaming-ctc.py
+++ b/egs/librispeech/ASR/zipformer/export-onnx-streaming-ctc.py
@ -136,6 +136,13 @@ def get_parser():
        help="The context size in the decoder. 1 means bigram; 2 means tri-gram",
    )

+    parser.add_argument(
+        "--use-whisper-features",
+        type=str2bool,
+        default=False,
+        help="True to use whisper features. Must match the one used in training",
+    )
+
    add_model_arguments(parser)

    return parser
@ -270,6 +277,7 @@ def export_streaming_ctc_model_onnx(
    model: OnnxModel,
    encoder_filename: str,
    opset_version: int = 11,
+    use_whisper_features: bool = False,
 ) -> None:
    model.encoder.__class__.forward = model.encoder.__class__.streaming_forward

@ -367,6 +375,10 @@ def export_streaming_ctc_model_onnx(
        "value_head_dims": value_head_dims,
        "num_heads": num_heads,
    }
+
+    if use_whisper_features:
+        meta_data["feature"] = "whisper"
+
    logging.info(f"meta_data: {meta_data}")

    for i in range(len(init_state[:-2]) // 6):
@ -547,6 +559,7 @@ def main():
        model,
        model_filename,
        opset_version=opset_version,
+        use_whisper_features=params.use_whisper_features,
    )
    logging.info(f"Exported model to {model_filename}")

--- a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
+++ b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py
@ -162,6 +162,13 @@ def get_parser():
        help="Whether to export models in fp16",
    )

+    parser.add_argument(
+        "--use-whisper-features",
+        type=str2bool,
+        default=False,
+        help="True to use whisper features. Must match the one used in training",
+    )
+
    add_model_arguments(parser)

    return parser
@ -342,6 +349,7 @@ def export_encoder_model_onnx(
    encoder_filename: str,
    opset_version: int = 11,
    feature_dim: int = 80,
+    use_whisper_features: bool = False,
 ) -> None:
    encoder_model.encoder.__class__.forward = (
        encoder_model.encoder.__class__.streaming_forward
@ -441,6 +449,9 @@ def export_encoder_model_onnx(
        "value_head_dims": value_head_dims,
        "num_heads": num_heads,
    }
+    if use_whisper_features:
+        meta_data["feature"] = "whisper"
+
    logging.info(f"meta_data: {meta_data}")

    for i in range(len(init_state[:-2]) // 6):
@ -734,6 +745,7 @@ def main():
        encoder_filename,
        opset_version=opset_version,
        feature_dim=params.feature_dim,
+        use_whisper_features=params.use_whisper_features,
    )
    logging.info(f"Exported encoder to {encoder_filename}")