From e22bc78f9827ce4059cd4598c19ad08415802c0a Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 11 Jul 2025 13:24:01 +0800 Subject: [PATCH] Export streaming zipformer2 to RKNN (#1977) --- .github/scripts/librispeech/ASR/run_rknn.sh | 139 +++++-- .github/scripts/multi_zh-hans/ASR/run_rknn.sh | 73 ++++ .github/scripts/wenetspeech/ASR/run_rknn.sh | 196 ++++++++++ .github/workflows/rknn.yml | 106 ++--- .../test_rknn_on_cpu_simulator.py | 2 +- .../zipformer/export-onnx-streaming-ctc.py | 27 +- .../ASR/zipformer/export-onnx-streaming.py | 89 +++-- .../zipformer/export_rknn_ctc_streaming.py | 74 ++++ .../export_rknn_transducer_streaming.py | 139 +++++++ ...est_rknn_on_cpu_simulator_ctc_streaming.py | 362 ++++++++++++++++++ .../zipformer/export_rknn_ctc_streaming.py | 1 + .../export_rknn_transducer_streaming.py | 1 + ...est_rknn_on_cpu_simulator_ctc_streaming.py | 1 + .../export_rknn_transducer_streaming.py | 1 + .../test_rknn_on_cpu_simulator_ctc.py | 1 + ...est_rknn_on_cpu_simulator_ctc_streaming.py | 1 + 16 files changed, 1069 insertions(+), 144 deletions(-) create mode 100755 .github/scripts/multi_zh-hans/ASR/run_rknn.sh create mode 100755 .github/scripts/wenetspeech/ASR/run_rknn.sh create mode 100755 egs/librispeech/ASR/zipformer/export_rknn_ctc_streaming.py create mode 100755 egs/librispeech/ASR/zipformer/export_rknn_transducer_streaming.py create mode 100755 egs/librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py create mode 120000 egs/multi_zh-hans/ASR/zipformer/export_rknn_ctc_streaming.py create mode 120000 egs/multi_zh-hans/ASR/zipformer/export_rknn_transducer_streaming.py create mode 120000 egs/multi_zh-hans/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py create mode 120000 egs/wenetspeech/ASR/zipformer/export_rknn_transducer_streaming.py create mode 120000 egs/wenetspeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc.py create mode 120000 egs/wenetspeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py diff --git a/.github/scripts/librispeech/ASR/run_rknn.sh b/.github/scripts/librispeech/ASR/run_rknn.sh index 304471724..bc7b00f0c 100755 --- a/.github/scripts/librispeech/ASR/run_rknn.sh +++ b/.github/scripts/librispeech/ASR/run_rknn.sh @@ -12,11 +12,10 @@ log() { cd egs/librispeech/ASR - # https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed # sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 -function export_bilingual_zh_en() { - d=exp_zh_en +function export_2023_02_20() { + d=exp_2023_02_20 mkdir $d pushd $d @@ -70,21 +69,20 @@ function export_bilingual_zh_en() { --tokens $d/tokens.txt \ $d/1.wav - mkdir -p /icefall/rknn-models - for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do - mkdir -p $platform + dst=sherpa-onnx-$platform-streaming-zipformer-bilingual-zh-en-2023-02-20 + mkdir -p $dst ./pruned_transducer_stateless7_streaming/export_rknn.py \ --in-encoder $d/encoder-epoch-99-avg-1.onnx \ --in-decoder $d/decoder-epoch-99-avg-1.onnx \ --in-joiner $d/joiner-epoch-99-avg-1.onnx \ - --out-encoder $platform/encoder.rknn \ - --out-decoder $platform/decoder.rknn \ - --out-joiner $platform/joiner.rknn \ + --out-encoder $dst/encoder.rknn \ + --out-decoder $dst/decoder.rknn \ + --out-joiner $dst/joiner.rknn \ --target-platform $platform 2>/dev/null - ls -lh $platform/ + ls -lh $dst/ ./pruned_transducer_stateless7_streaming/test_rknn_on_cpu_simulator.py \ --encoder $d/encoder-epoch-99-avg-1.onnx \ @@ -93,19 +91,24 @@ function export_bilingual_zh_en() { --tokens $d/tokens.txt \ --wav $d/0.wav - cp $d/tokens.txt 
$platform
-    cp $d/*.wav $platform
+    cp $d/tokens.txt $dst
+    mkdir $dst/test_wavs
+    cp $d/*.wav $dst/test_wavs
 
-    cp -av $platform /icefall/rknn-models
+    tar cjvf $dst.tar.bz2 $dst
+    ls -lh $dst.tar.bz2
+    mv $dst.tar.bz2 /icefall/
+    ls -lh $dst/
+    echo "---"
+
+    rm -rf $dst
   done
-
-  ls -lh /icefall/rknn-models
 }
 
 # https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t
 # sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16
-function export_bilingual_zh_en_small() {
-  d=exp_zh_en_small
+function export_2023_02_16() {
+  d=exp_2023_02_16
 
   mkdir $d
   pushd $d
@@ -124,7 +127,6 @@ function export_bilingual_zh_en_small() {
 
   popd
 
-
   ./pruned_transducer_stateless7_streaming/export-onnx-zh.py \
     --dynamic-batch 0 \
     --enable-int8-quantization 0 \
@@ -163,21 +165,20 @@
     --tokens $d/tokens.txt \
     $d/1.wav
 
-  mkdir -p /icefall/rknn-models-small
-
   for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
-    mkdir -p $platform
+    dst=sherpa-onnx-$platform-streaming-zipformer-small-bilingual-zh-en-2023-02-16
+    mkdir -p $dst
 
     ./pruned_transducer_stateless7_streaming/export_rknn.py \
       --in-encoder $d/encoder-epoch-99-avg-1.onnx \
       --in-decoder $d/decoder-epoch-99-avg-1.onnx \
       --in-joiner $d/joiner-epoch-99-avg-1.onnx \
-      --out-encoder $platform/encoder.rknn \
-      --out-decoder $platform/decoder.rknn \
-      --out-joiner $platform/joiner.rknn \
+      --out-encoder $dst/encoder.rknn \
+      --out-decoder $dst/decoder.rknn \
+      --out-joiner $dst/joiner.rknn \
       --target-platform $platform 2>/dev/null
 
-    ls -lh $platform/
+    ls -lh $dst/
 
     ./pruned_transducer_stateless7_streaming/test_rknn_on_cpu_simulator.py \
       --encoder $d/encoder-epoch-99-avg-1.onnx \
@@ -186,15 +187,89 @@ function export_bilingual_zh_en_small() {
       --tokens $d/tokens.txt \
       --wav $d/0.wav
 
-    cp $d/tokens.txt $platform
-    cp $d/*.wav $platform
+    cp $d/tokens.txt $dst
+    mkdir $dst/test_wavs
+    cp $d/*.wav $dst/test_wavs
 
-    cp -av $platform /icefall/rknn-models-small
+    tar cjvf $dst.tar.bz2 $dst
+    ls -lh $dst.tar.bz2
+    mv $dst.tar.bz2 /icefall/
+    ls -lh $dst/
+    echo "---"
+
+    rm -rf $dst
   done
-
-  ls -lh /icefall/rknn-models-small
 }
 
-export_bilingual_zh_en_small
+# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-en-2023-06-26-english
+function export_2023_06_26() {
+  d=exp_2023_06_26
 
-export_bilingual_zh_en
+  mkdir $d
+  pushd $d
+
+  curl -SL -O https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/exp/pretrained.pt
+  mv pretrained.pt epoch-99.pt
+
+  curl -SL -O https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/data/lang_bpe_500/tokens.txt
+
+  curl -SL -o 0.wav https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/test_wavs/1089-134686-0001.wav
+  curl -SL -o 1.wav https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/test_wavs/1221-135766-0001.wav
+  curl -SL -o 2.wav https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/test_wavs/1221-135766-0002.wav
+
+  ls -lh
+
+  popd
+
+  ./zipformer/export-onnx-streaming.py \
+    --dynamic-batch 0 \
+    --enable-int8-quantization 0 \
+    --tokens $d/tokens.txt \
+    --use-averaged-model 0 \
+    --epoch 99 \
+    --avg 1 \
+    --exp-dir $d \
+    --use-ctc 0 \
+    --use-transducer 1 \
+    \
+    --chunk-size 32 \
+    --left-context-frames 128 \
+    --causal 1
+
+  ls -lh $d/
+
+  for platform in rk3562 
rk3566 rk3568 rk3576 rk3588; do + dst=sherpa-onnx-$platform-streaming-zipformer-en-2023-06-26 + mkdir -p $dst + + ./zipformer/export_rknn_transducer_streaming.py \ + --in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \ + --in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \ + --in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \ + --out-encoder $dst/encoder.rknn \ + --out-decoder $dst/decoder.rknn \ + --out-joiner $dst/joiner.rknn \ + --target-platform $platform + + ls -lh $dst/ + + cp $d/tokens.txt $dst + mkdir $dst/test_wavs + cp $d/*.wav $dst/test_wavs + + tar cjvf $dst.tar.bz2 $dst + ls -lh $dst.tar.bz2 + mv $dst.tar.bz2 /icefall/ + ls -lh $dst/ + echo "---" + + rm -rf $dst + done +} + +if [[ $rknn_toolkit2_version == "2.1.0" ]]; then + export_2023_02_16 + export_2023_02_20 +else + export_2023_06_26 +fi diff --git a/.github/scripts/multi_zh-hans/ASR/run_rknn.sh b/.github/scripts/multi_zh-hans/ASR/run_rknn.sh new file mode 100755 index 000000000..ac03eac79 --- /dev/null +++ b/.github/scripts/multi_zh-hans/ASR/run_rknn.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +set -ex + +python3 -m pip install kaldi-native-fbank soundfile librosa + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +cd egs/multi_zh-hans/ASR + + + +# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-chinese +function export_2023_11_05() { + d=exp + mkdir $d + pushd $d + curl -SL -O https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/data/lang_bpe_2000/tokens.txt + curl -SL -O https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/exp/pretrained.pt + mv pretrained.pt epoch-99.pt + + curl -SL -o 0.wav https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/test_wavs/DEV_T0000000000.wav + curl -SL -o 1.wav https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/test_wavs/DEV_T0000000001.wav + curl -SL -o 2.wav https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/test_wavs/DEV_T0000000002.wav + ls -lh + popd + + ./zipformer/export-onnx-streaming.py \ + --dynamic-batch 0 \ + --enable-int8-quantization 0 \ + --tokens $d/tokens.txt \ + --use-averaged-model 0 \ + --epoch 99 \ + --avg 1 \ + --exp-dir $d \ + --use-ctc 0 \ + --use-transducer 1 \ + --chunk-size 32 \ + --left-context-frames 128 \ + --causal 1 + + for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do + dst=sherpa-onnx-$platform-streaming-zipformer-multi-zh-hans-2023-12-12 + mkdir -p $dst + + ./zipformer/export_rknn_transducer_streaming.py \ + --in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \ + --in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \ + --in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \ + --out-encoder $dst/encoder.rknn \ + --out-decoder $dst/decoder.rknn \ + --out-joiner $dst/joiner.rknn \ + --target-platform $platform + + cp $d/tokens.txt $dst + mkdir $dst/test_wavs + cp $d/*.wav $dst/test_wavs + + tar cjvf $dst.tar.bz2 $dst + ls -lh $dst.tar.bz2 + mv $dst.tar.bz2 /icefall/ + ls -lh $dst/ + echo "---" + + rm -rf $dst + done +} + +export_2023_11_05 diff --git a/.github/scripts/wenetspeech/ASR/run_rknn.sh 
b/.github/scripts/wenetspeech/ASR/run_rknn.sh new file mode 100755 index 000000000..7ecd816e0 --- /dev/null +++ b/.github/scripts/wenetspeech/ASR/run_rknn.sh @@ -0,0 +1,196 @@ +#!/usr/bin/env bash + +set -ex + +python3 -m pip install kaldi-native-fbank soundfile librosa + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +cd egs/wenetspeech/ASR + +#https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#k2-fsa-icefall-asr-zipformer-wenetspeech-streaming-small-chinese +function export_2025_03_02() { + d=exp_2025_03_02 + mkdir $d + pushd $d + curl -SL -O https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/data/lang_char/tokens.txt + curl -SL -O https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/exp/pretrained.pt + mv pretrained.pt epoch-99.pt + + curl -SL -o 0.wav https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/test_wavs/DEV_T0000000000.wav + curl -SL -o 1.wav https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/test_wavs/DEV_T0000000001.wav + curl -SL -o 2.wav https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/test_wavs/DEV_T0000000002.wav + ls -lh + popd + + ./zipformer/export-onnx-streaming.py \ + --dynamic-batch 0 \ + --enable-int8-quantization 0 \ + --tokens $d/tokens.txt \ + --use-averaged-model 0 \ + --epoch 99 \ + --avg 1 \ + --exp-dir $d \ + --use-ctc 0 \ + --use-transducer 1 \ + \ + --num-encoder-layers 2,2,2,2,2,2 \ + --feedforward-dim 512,768,768,768,768,768 \ + --encoder-dim 192,256,256,256,256,256 \ + --encoder-unmasked-dim 192,192,192,192,192,192 \ + \ + --chunk-size 32 \ + --left-context-frames 128 \ + --causal 1 + + for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do + dst=sherpa-onnx-$platform-streaming-zipformer-small-zh-2025-03-02 + mkdir -p $dst + + ./zipformer/export_rknn_transducer_streaming.py \ + --in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \ + --in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \ + --in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \ + --out-encoder $dst/encoder.rknn \ + --out-decoder $dst/decoder.rknn \ + --out-joiner $dst/joiner.rknn \ + --target-platform $platform + + cp $d/tokens.txt $dst + mkdir $dst/test_wavs + cp $d/*.wav $dst/test_wavs + + tar cjvf $dst.tar.bz2 $dst + ls -lh $dst.tar.bz2 + mv $dst.tar.bz2 /icefall/ + ls -lh $dst/ + echo "---" + + rm -rf $dst + done + rm -rf $d +} + +# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#k2-fsa-icefall-asr-zipformer-wenetspeech-streaming-large-chinese +function export_2025_03_03() { + d=exp_2025_03_03 + mkdir $d + pushd $d + curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/data/lang_char/tokens.txt + curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/pretrained.pt + mv pretrained.pt epoch-99.pt + + curl -SL -o 0.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000000.wav + curl -SL -o 1.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000001.wav + curl -SL -o 2.wav 
https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000002.wav
+  ls -lh
+  popd
+
+  ./zipformer/export-onnx-streaming.py \
+    --dynamic-batch 0 \
+    --enable-int8-quantization 0 \
+    --tokens $d/tokens.txt \
+    --use-averaged-model 0 \
+    --epoch 99 \
+    --avg 1 \
+    --exp-dir $d \
+    --use-ctc 0 \
+    --use-transducer 1 \
+    \
+    --chunk-size 32 \
+    --left-context-frames 128 \
+    --causal 1
+
+  for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
+    dst=sherpa-onnx-$platform-streaming-zipformer-zh-2025-03-03
+    mkdir -p $dst
+
+    ./zipformer/export_rknn_transducer_streaming.py \
+      --in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
+      --in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
+      --in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
+      --out-encoder $dst/encoder.rknn \
+      --out-decoder $dst/decoder.rknn \
+      --out-joiner $dst/joiner.rknn \
+      --target-platform $platform
+
+    cp $d/tokens.txt $dst
+    mkdir $dst/test_wavs
+    cp $d/*.wav $dst/test_wavs
+
+    tar cjvf $dst.tar.bz2 $dst
+    ls -lh $dst.tar.bz2
+    mv $dst.tar.bz2 /icefall/
+    ls -lh $dst/
+    echo "---"
+
+    rm -rf $dst
+  done
+  rm -rf $d
+}
+
+function export_2023_06_15() {
+  d=exp_2023_06_15
+  mkdir $d
+  pushd $d
+  curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/data/lang_char/tokens.txt
+  curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/pretrained.pt
+  mv pretrained.pt epoch-99.pt
+
+  curl -SL -o 0.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000000.wav
+  curl -SL -o 1.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000001.wav
+  curl -SL -o 2.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000002.wav
+  ls -lh
+  popd
+
+  ./zipformer/export-onnx-streaming.py \
+    --dynamic-batch 0 \
+    --enable-int8-quantization 0 \
+    --tokens $d/tokens.txt \
+    --use-averaged-model 0 \
+    --epoch 99 \
+    --avg 1 \
+    --exp-dir $d \
+    --use-ctc 0 \
+    --use-transducer 1 \
+    \
+    --chunk-size 32 \
+    --left-context-frames 128 \
+    --causal 1
+
+  for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
+    dst=sherpa-onnx-$platform-streaming-zipformer-zh-2023-06-15
+    mkdir -p $dst
+
+    ./zipformer/export_rknn_transducer_streaming.py \
+      --in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
+      --in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
+      --in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
+      --out-encoder $dst/encoder.rknn \
+      --out-decoder $dst/decoder.rknn \
+      --out-joiner $dst/joiner.rknn \
+      --target-platform $platform
+
+    cp $d/tokens.txt $dst
+    mkdir $dst/test_wavs
+    cp $d/*.wav $dst/test_wavs
+
+    tar cjvf $dst.tar.bz2 $dst
+    ls -lh $dst.tar.bz2
+    mv $dst.tar.bz2 /icefall/
+    ls -lh $dst/
+    echo "---"
+
+    rm -rf $dst
+  done
+}
+
+export_2025_03_02
+export_2025_03_03
+export_2023_06_15
diff --git a/.github/workflows/rknn.yml b/.github/workflows/rknn.yml
index 51aa4eb9b..ce37397f9 100644
--- a/.github/workflows/rknn.yml
+++ b/.github/workflows/rknn.yml
@@ -4,7 +4,7 @@ on:
   push:
     branches:
       - master
-      - ci-rknn-2
+      - rknn-zipformer2
 
   pull_request:
     branches:
@@ -17,44 +17,29 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  generate_build_matrix:
-    if: 
github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa' - # see https://github.com/pytorch/pytorch/pull/50633 - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - name: Generating build matrix - id: set-matrix - run: | - # outputting for debugging purposes - python ./.github/scripts/docker/generate_build_matrix.py --torch-version=2.4.0 --python-version=3.10 - MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --torch-version=2.4.0 --python-version=3.10) - echo "::set-output name=matrix::${MATRIX}" rknn: - needs: generate_build_matrix - name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }} + name: RKNN ${{ matrix.recipe }} ${{ matrix.rknn_toolkit2_version }} runs-on: ubuntu-latest strategy: fail-fast: false matrix: - ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }} + python-version: ["3.10"] + k2-version: ["1.24.4.dev20241029"] + kaldifeat-version: ["1.25.5.dev20241029"] + torch-version: ["2.0.0"] + torchaudio-version: ["2.0.1"] + version: ["20241218"] + # recipe: ["librispeech", "wenetspeech", "multi_zh-hans"] + recipe: ["librispeech"] + rknn_toolkit2_version: ["2.2.0", "2.1.0"] + steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - - name: Setup Python - if: false - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Export ONNX model + - name: Export RKNN model uses: addnab/docker-run-action@v3 with: image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }} @@ -73,65 +58,35 @@ jobs: python3 -m torch.utils.collect_env python3 -m k2.version pip list + export rknn_toolkit2_version=${{ matrix.rknn_toolkit2_version }} + if [[ $rknn_toolkit2_version == "2.1.0" ]]; then + # for the folder pruned_transducer_stateless7_streaming + curl -SL -O https://huggingface.co/csukuangfj/rknn-toolkit2/resolve/main/rknn_toolkit2-2.1.0%2B708089d1-cp310-cp310-linux_x86_64.whl + else + # for the folder zipformer/ + curl -SL -O https://huggingface.co/csukuangfj/rknn-toolkit2/resolve/main/rknn_toolkit2-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl + fi # Install rknn - curl -SL -O https://huggingface.co/csukuangfj/rknn-toolkit2/resolve/main/rknn_toolkit2-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl pip install ./*.whl "numpy<=1.26.4" pip list | grep rknn echo "---" pip list echo "---" - .github/scripts/librispeech/ASR/run_rknn.sh + recipe=${{ matrix.recipe }} + .github/scripts/$recipe/ASR/run_rknn.sh > log-$recipe.txt 2>&1 || true - - name: Display rknn models - shell: bash - run: | - ls -lh - - ls -lh rknn-models/* - echo "----" - ls -lh rknn-models-small/* - - - name: Collect results (small) - shell: bash - run: | - for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do - dst=sherpa-onnx-$platform-streaming-zipformer-small-bilingual-zh-en-2023-02-16 - mkdir $dst - mkdir $dst/test_wavs - src=rknn-models-small/$platform - - cp -v $src/*.rknn $dst/ - cp -v $src/tokens.txt $dst/ - cp -v $src/*.wav $dst/test_wavs/ - ls -lh $dst - tar cjfv $dst.tar.bz2 $dst - rm -rf $dst - done - - - name: Collect results - shell: bash - run: | - for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do - dst=sherpa-onnx-$platform-streaming-zipformer-bilingual-zh-en-2023-02-20 - mkdir $dst - mkdir $dst/test_wavs - src=rknn-models/$platform - - cp -v $src/*.rknn $dst/ - cp -v 
$src/tokens.txt $dst/ - cp -v $src/*.wav $dst/test_wavs/ - ls -lh $dst - tar cjfv $dst.tar.bz2 $dst - rm -rf $dst - done + - uses: actions/upload-artifact@v4 + with: + name: log-${{ matrix.recipe }}-${{ matrix.rknn_toolkit2_version }} + path: ./log-*.txt - name: Display results shell: bash run: | - ls -lh *rk*.tar.bz2 + ls -lh *rk*.tar.bz2 || true - name: Release to GitHub uses: svenstaro/upload-release-action@v2 @@ -144,7 +99,7 @@ jobs: tag: asr-models - name: Upload model to huggingface - if: github.event_name == 'push' + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} uses: nick-fields/retry@v3 @@ -167,8 +122,7 @@ jobs: git merge -m "merge remote" --ff origin main dst=streaming-asr mkdir -p $dst - rm -fv $dst/* - cp ../*rk*.tar.bz2 $dst/ + cp ../*rk*.tar.bz2 $dst/ || true ls -lh $dst git add . diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/test_rknn_on_cpu_simulator.py b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/test_rknn_on_cpu_simulator.py index a543c6083..f860aba5d 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/test_rknn_on_cpu_simulator.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7_streaming/test_rknn_on_cpu_simulator.py @@ -72,7 +72,7 @@ def compute_features(filename: str, dim: int = 80) -> np.ndarray: filename: Path to an audio file. Returns: - Return a 1-D float32 tensor of shape (1, 80, 3000) containing the features. + Return a 2-D float32 tensor of shape (T, dim) containing the features. """ wave, sample_rate = load_audio(filename) if sample_rate != 16000: diff --git a/egs/librispeech/ASR/zipformer/export-onnx-streaming-ctc.py b/egs/librispeech/ASR/zipformer/export-onnx-streaming-ctc.py index 003b4bf2c..9a715eefd 100755 --- a/egs/librispeech/ASR/zipformer/export-onnx-streaming-ctc.py +++ b/egs/librispeech/ASR/zipformer/export-onnx-streaming-ctc.py @@ -74,6 +74,20 @@ def get_parser(): formatter_class=argparse.ArgumentDefaultsHelpFormatter ) + parser.add_argument( + "--dynamic-batch", + type=int, + default=1, + help="1 to support dynamic batch size. 
0 to support only batch size == 1", + ) + + parser.add_argument( + "--enable-int8-quantization", + type=int, + default=1, + help="1 to also export int8 onnx models.", + ) + parser.add_argument( "--epoch", type=int, @@ -326,6 +340,7 @@ def export_streaming_ctc_model_onnx( model: OnnxModel, encoder_filename: str, opset_version: int = 11, + dynamic_batch: bool = True, use_whisper_features: bool = False, use_external_data: bool = False, ) -> None: @@ -470,7 +485,9 @@ def export_streaming_ctc_model_onnx( "log_probs": {0: "N"}, **inputs, **outputs, - }, + } + if dynamic_batch + else {}, ) add_meta_data( @@ -618,15 +635,17 @@ def main(): model, str(model_filename), opset_version=opset_version, + dynamic_batch=params.dynamic_batch == 1, use_whisper_features=params.use_whisper_features, use_external_data=params.use_external_data, ) logging.info(f"Exported model to {model_filename}") - # Generate int8 quantization models - # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection + if params.enable_int8_quantization: + # Generate int8 quantization models + # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection - logging.info("Generate int8 quantization models") + logging.info("Generate int8 quantization models") if params.use_external_data: model_filename_int8 = f"ctc-{suffix}.int8.onnx" diff --git a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py index 29541570b..daeb86f6a 100755 --- a/egs/librispeech/ASR/zipformer/export-onnx-streaming.py +++ b/egs/librispeech/ASR/zipformer/export-onnx-streaming.py @@ -93,6 +93,20 @@ def get_parser(): formatter_class=argparse.ArgumentDefaultsHelpFormatter ) + parser.add_argument( + "--dynamic-batch", + type=int, + default=1, + help="1 to support dynamic batch size. 0 to support only batch size == 1", + ) + + parser.add_argument( + "--enable-int8-quantization", + type=int, + default=1, + help="1 to also export int8 onnx models.", + ) + parser.add_argument( "--epoch", type=int, @@ -389,6 +403,7 @@ def export_encoder_model_onnx( encoder_filename: str, opset_version: int = 11, feature_dim: int = 80, + dynamic_batch: bool = True, use_whisper_features: bool = False, use_external_data: bool = False, ) -> None: @@ -534,7 +549,9 @@ def export_encoder_model_onnx( "encoder_out": {0: "N"}, **inputs, **outputs, - }, + } + if dynamic_batch + else {}, ) add_meta_data( @@ -548,6 +565,7 @@ def export_decoder_model_onnx( decoder_model: OnnxDecoder, decoder_filename: str, opset_version: int = 11, + dynamic_batch: bool = True, ) -> None: """Export the decoder model to ONNX format. @@ -570,7 +588,7 @@ def export_decoder_model_onnx( context_size = decoder_model.decoder.context_size vocab_size = decoder_model.decoder.vocab_size - y = torch.zeros(10, context_size, dtype=torch.int64) + y = torch.zeros(1, context_size, dtype=torch.int64) decoder_model = torch.jit.script(decoder_model) torch.onnx.export( decoder_model, @@ -583,7 +601,9 @@ def export_decoder_model_onnx( dynamic_axes={ "y": {0: "N"}, "decoder_out": {0: "N"}, - }, + } + if dynamic_batch + else {}, ) meta_data = { @@ -597,6 +617,7 @@ def export_joiner_model_onnx( joiner_model: nn.Module, joiner_filename: str, opset_version: int = 11, + dynamic_batch: bool = True, ) -> None: """Export the joiner model to ONNX format. 
The exported joiner model has two inputs: @@ -611,8 +632,8 @@ def export_joiner_model_onnx( joiner_dim = joiner_model.output_linear.weight.shape[1] logging.info(f"joiner dim: {joiner_dim}") - projected_encoder_out = torch.rand(11, joiner_dim, dtype=torch.float32) - projected_decoder_out = torch.rand(11, joiner_dim, dtype=torch.float32) + projected_encoder_out = torch.rand(1, joiner_dim, dtype=torch.float32) + projected_decoder_out = torch.rand(1, joiner_dim, dtype=torch.float32) torch.onnx.export( joiner_model, @@ -629,7 +650,9 @@ def export_joiner_model_onnx( "encoder_out": {0: "N"}, "decoder_out": {0: "N"}, "logit": {0: "N"}, - }, + } + if dynamic_batch + else {}, ) meta_data = { "joiner_dim": str(joiner_dim), @@ -793,6 +816,7 @@ def main(): str(encoder_filename), opset_version=opset_version, feature_dim=params.feature_dim, + dynamic_batch=params.dynamic_batch == 1, use_whisper_features=params.use_whisper_features, use_external_data=params.use_external_data, ) @@ -804,6 +828,7 @@ def main(): decoder, decoder_filename, opset_version=opset_version, + dynamic_batch=params.dynamic_batch == 1, ) logging.info(f"Exported decoder to {decoder_filename}") @@ -813,6 +838,7 @@ def main(): joiner, joiner_filename, opset_version=opset_version, + dynamic_batch=params.dynamic_batch == 1, ) logging.info(f"Exported joiner to {joiner_filename}") @@ -835,35 +861,36 @@ def main(): # Generate int8 quantization models # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection - logging.info("Generate int8 quantization models") + if params.enable_int8_quantization: + logging.info("Generate int8 quantization models") - if params.use_external_data: - encoder_filename_int8 = f"encoder-{suffix}.int8.onnx" - else: - encoder_filename_int8 = params.exp_dir / f"encoder-{suffix}.int8.onnx" + if params.use_external_data: + encoder_filename_int8 = f"encoder-{suffix}.int8.onnx" + else: + encoder_filename_int8 = params.exp_dir / f"encoder-{suffix}.int8.onnx" - quantize_dynamic( - model_input=encoder_filename, - model_output=encoder_filename_int8, - op_types_to_quantize=["MatMul"], - weight_type=QuantType.QInt8, - ) + quantize_dynamic( + model_input=encoder_filename, + model_output=encoder_filename_int8, + op_types_to_quantize=["MatMul"], + weight_type=QuantType.QInt8, + ) - decoder_filename_int8 = params.exp_dir / f"decoder-{suffix}.int8.onnx" - quantize_dynamic( - model_input=decoder_filename, - model_output=decoder_filename_int8, - op_types_to_quantize=["MatMul", "Gather"], - weight_type=QuantType.QInt8, - ) + decoder_filename_int8 = params.exp_dir / f"decoder-{suffix}.int8.onnx" + quantize_dynamic( + model_input=decoder_filename, + model_output=decoder_filename_int8, + op_types_to_quantize=["MatMul", "Gather"], + weight_type=QuantType.QInt8, + ) - joiner_filename_int8 = params.exp_dir / f"joiner-{suffix}.int8.onnx" - quantize_dynamic( - model_input=joiner_filename, - model_output=joiner_filename_int8, - op_types_to_quantize=["MatMul"], - weight_type=QuantType.QInt8, - ) + joiner_filename_int8 = params.exp_dir / f"joiner-{suffix}.int8.onnx" + quantize_dynamic( + model_input=joiner_filename, + model_output=joiner_filename_int8, + op_types_to_quantize=["MatMul"], + weight_type=QuantType.QInt8, + ) if __name__ == "__main__": diff --git a/egs/librispeech/ASR/zipformer/export_rknn_ctc_streaming.py b/egs/librispeech/ASR/zipformer/export_rknn_ctc_streaming.py new file mode 100755 index 000000000..4de0a598a --- /dev/null +++ b/egs/librispeech/ASR/zipformer/export_rknn_ctc_streaming.py 
@@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +import argparse +import logging +from pathlib import Path +from typing import List + +from rknn.api import RKNN +from test_rknn_on_cpu_simulator_ctc_streaming import RKNNModel + +logging.basicConfig(level=logging.WARNING) + +g_platforms = [ + # "rv1103", + # "rv1103b", + # "rv1106", + # "rk2118", + "rk3562", + "rk3566", + "rk3568", + "rk3576", + "rk3588", +] + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--target-platform", + type=str, + required=True, + help=f"Supported values are: {','.join(g_platforms)}", + ) + + parser.add_argument( + "--in-model", + type=str, + required=True, + help="Path to the onnx model", + ) + + parser.add_argument( + "--out-model", + type=str, + required=True, + help="Path to the rknn model", + ) + + return parser + + +def main(): + args = get_parser().parse_args() + print(vars(args)) + + model = RKNNModel( + model=args.in_model, + target_platform=args.target_platform, + ) + print(model.meta) + + model.export_rknn( + model=args.out_model, + ) + + model.release() + + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/ASR/zipformer/export_rknn_transducer_streaming.py b/egs/librispeech/ASR/zipformer/export_rknn_transducer_streaming.py new file mode 100755 index 000000000..27ff81b91 --- /dev/null +++ b/egs/librispeech/ASR/zipformer/export_rknn_transducer_streaming.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +import argparse +import logging +from pathlib import Path +from typing import List + +from rknn.api import RKNN +from test_rknn_on_cpu_simulator_ctc_streaming import ( + MetaData, + get_meta_data, + init_model, + export_rknn, +) + +logging.basicConfig(level=logging.WARNING) + +g_platforms = [ + # "rv1103", + # "rv1103b", + # "rv1106", + # "rk2118", + "rk3562", + "rk3566", + "rk3568", + "rk3576", + "rk3588", +] + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--target-platform", + type=str, + required=True, + help=f"Supported values are: {','.join(g_platforms)}", + ) + + parser.add_argument( + "--in-encoder", + type=str, + required=True, + help="Path to the encoder onnx model", + ) + + parser.add_argument( + "--in-decoder", + type=str, + required=True, + help="Path to the decoder onnx model", + ) + + parser.add_argument( + "--in-joiner", + type=str, + required=True, + help="Path to the joiner onnx model", + ) + + parser.add_argument( + "--out-encoder", + type=str, + required=True, + help="Path to the encoder rknn model", + ) + + parser.add_argument( + "--out-decoder", + type=str, + required=True, + help="Path to the decoder rknn model", + ) + + parser.add_argument( + "--out-joiner", + type=str, + required=True, + help="Path to the joiner rknn model", + ) + + return parser + + +class RKNNModel: + def __init__( + self, + encoder: str, + decoder: str, + joiner: str, + target_platform: str, + ): + self.meta = get_meta_data(encoder) + self.encoder = init_model( + encoder, + custom_string=self.meta.to_str(), + target_platform=target_platform, + ) + self.decoder = init_model(decoder, target_platform=target_platform) + self.joiner = init_model(joiner, target_platform=target_platform) + + def export_rknn(self, encoder, decoder, joiner): + export_rknn(self.encoder, encoder) + 
export_rknn(self.decoder, decoder) + export_rknn(self.joiner, joiner) + + def release(self): + self.encoder.release() + self.decoder.release() + self.joiner.release() + + +def main(): + args = get_parser().parse_args() + print(vars(args)) + + model = RKNNModel( + encoder=args.in_encoder, + decoder=args.in_decoder, + joiner=args.in_joiner, + target_platform=args.target_platform, + ) + print(model.meta) + + model.export_rknn( + encoder=args.out_encoder, + decoder=args.out_decoder, + joiner=args.out_joiner, + ) + + model.release() + + +if __name__ == "__main__": + main() diff --git a/egs/librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py b/egs/librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py new file mode 100755 index 000000000..458508b89 --- /dev/null +++ b/egs/librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +import argparse +from pathlib import Path +from typing import List, Tuple + +import kaldi_native_fbank as knf +import numpy as np +import soundfile as sf +from rknn.api import RKNN + + +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--model", + type=str, + required=True, + help="Path to the onnx model", + ) + + parser.add_argument( + "--tokens", + type=str, + required=True, + help="Path to the tokens.txt", + ) + + parser.add_argument( + "--wav", + type=str, + required=True, + help="Path to test wave", + ) + + return parser + + +def load_audio(filename: str) -> Tuple[np.ndarray, int]: + data, sample_rate = sf.read( + filename, + always_2d=True, + dtype="float32", + ) + data = data[:, 0] # use only the first channel + + samples = np.ascontiguousarray(data) + return samples, sample_rate + + +def compute_features(filename: str, dim: int = 80) -> np.ndarray: + """ + Args: + filename: + Path to an audio file. + Returns: + Return a 2-D float32 tensor of shape (T, dim) containing the features. 
+    """
+    wave, sample_rate = load_audio(filename)
+    if sample_rate != 16000:
+        import librosa
+
+        wave = librosa.resample(wave, orig_sr=sample_rate, target_sr=16000)
+        sample_rate = 16000
+
+    features = []
+    opts = knf.FbankOptions()
+    opts.frame_opts.dither = 0
+    opts.mel_opts.num_bins = dim
+    opts.frame_opts.snip_edges = False
+    fbank = knf.OnlineFbank(opts)
+
+    fbank.accept_waveform(16000, wave)
+    tail_paddings = np.zeros(int(0.5 * 16000), dtype=np.float32)
+    fbank.accept_waveform(16000, tail_paddings)
+    fbank.input_finished()
+    for i in range(fbank.num_frames_ready):
+        f = fbank.get_frame(i)
+        features.append(f)
+
+    features = np.stack(features, axis=0)
+
+    return features
+
+
+def load_tokens(filename):
+    tokens = dict()
+    with open(filename, "r") as f:
+        for line in f:
+            t, i = line.split()
+            tokens[int(i)] = t
+    return tokens
+
+
+def init_model(filename, target_platform="rk3588", custom_string=None):
+    rknn = RKNN(verbose=False)
+
+    rknn.config(target_platform=target_platform, custom_string=custom_string)
+    if not Path(filename).is_file():
+        exit(f"{filename} does not exist")
+
+    ret = rknn.load_onnx(model=filename)
+    if ret != 0:
+        exit(f"Load model {filename} failed!")
+
+    ret = rknn.build(do_quantization=False)
+    if ret != 0:
+        exit(f"Build model {filename} failed!")
+
+    ret = rknn.init_runtime()
+    if ret != 0:
+        exit(f"Failed to init rknn runtime for {filename}")
+    return rknn
+
+
+class MetaData:
+    def __init__(
+        self,
+        model_type: str,
+        decode_chunk_len: int,
+        T: int,
+        num_encoder_layers: List[int],
+        encoder_dims: List[int],
+        cnn_module_kernels: List[int],
+        left_context_len: List[int],
+        query_head_dims: List[int],
+        value_head_dims: List[int],
+        num_heads: List[int],
+    ):
+        self.model_type = model_type
+        self.decode_chunk_len = decode_chunk_len
+        self.T = T
+        self.num_encoder_layers = num_encoder_layers
+        self.encoder_dims = encoder_dims
+        self.cnn_module_kernels = cnn_module_kernels
+        self.left_context_len = left_context_len
+        self.query_head_dims = query_head_dims
+        self.value_head_dims = value_head_dims
+        self.num_heads = num_heads
+
+    def __str__(self) -> str:
+        return self.to_str()
+
+    def to_str(self) -> str:
+        def to_s(ll):
+            return ",".join(list(map(str, ll)))
+
+        s = f"model_type={self.model_type}"
+        s += ";decode_chunk_len=" + str(self.decode_chunk_len)
+        s += ";T=" + str(self.T)
+        s += ";num_encoder_layers=" + to_s(self.num_encoder_layers)
+        s += ";encoder_dims=" + to_s(self.encoder_dims)
+        s += ";cnn_module_kernels=" + to_s(self.cnn_module_kernels)
+        s += ";left_context_len=" + to_s(self.left_context_len)
+        s += ";query_head_dims=" + to_s(self.query_head_dims)
+        s += ";value_head_dims=" + to_s(self.value_head_dims)
+        s += ";num_heads=" + to_s(self.num_heads)
+
+        assert len(s) < 1024, (s, len(s))
+
+        return s
+
+
+def get_meta_data(model: str):
+    import onnxruntime
+
+    session_opts = onnxruntime.SessionOptions()
+    session_opts.inter_op_num_threads = 1
+    session_opts.intra_op_num_threads = 1
+
+    m = onnxruntime.InferenceSession(
+        model,
+        sess_options=session_opts,
+        providers=["CPUExecutionProvider"],
+    )
+
+    for i in m.get_inputs():
+        print(i)
+
+    print("-----")
+
+    for i in m.get_outputs():
+        print(i)
+
+    meta = m.get_modelmeta().custom_metadata_map
+    print(meta)
+    """
+    {'num_heads': '4,4,4,8,4,4', 'query_head_dims': '32,32,32,32,32,32',
+    'cnn_module_kernels': '31,31,15,15,15,31',
+    'num_encoder_layers': '2,2,3,4,3,2', 'version': '1',
+    'comment': 'streaming ctc zipformer2',
+    'model_type': 'zipformer2',
+    'encoder_dims': '192,256,384,512,384,256', 
'model_author': 'k2-fsa', 'T': '77',
+    'value_head_dims': '12,12,12,12,12,12',
+    'left_context_len': '128,64,32,16,32,64',
+    'decode_chunk_len': '64'}
+    """
+
+    def to_int_list(s):
+        return list(map(int, s.split(",")))
+
+    model_type = meta["model_type"]
+    decode_chunk_len = int(meta["decode_chunk_len"])
+    T = int(meta["T"])
+    num_encoder_layers = to_int_list(meta["num_encoder_layers"])
+    encoder_dims = to_int_list(meta["encoder_dims"])
+    cnn_module_kernels = to_int_list(meta["cnn_module_kernels"])
+    left_context_len = to_int_list(meta["left_context_len"])
+    query_head_dims = to_int_list(meta["query_head_dims"])
+    value_head_dims = to_int_list(meta["value_head_dims"])
+    num_heads = to_int_list(meta["num_heads"])
+
+    return MetaData(
+        model_type=model_type,
+        decode_chunk_len=decode_chunk_len,
+        T=T,
+        num_encoder_layers=num_encoder_layers,
+        encoder_dims=encoder_dims,
+        cnn_module_kernels=cnn_module_kernels,
+        left_context_len=left_context_len,
+        query_head_dims=query_head_dims,
+        value_head_dims=value_head_dims,
+        num_heads=num_heads,
+    )
+
+
+def export_rknn(rknn, filename):
+    ret = rknn.export_rknn(filename)
+    if ret != 0:
+        exit(f"Export rknn model to {filename} failed!")
+
+
+class RKNNModel:
+    def __init__(self, model: str, target_platform="rk3588"):
+        self.meta = get_meta_data(model)
+        self.model = init_model(model, custom_string=self.meta.to_str(), target_platform=target_platform)
+
+    def export_rknn(self, model: str):
+        export_rknn(self.model, model)
+
+    def release(self):
+        self.model.release()
+
+    def get_init_states(
+        self,
+    ) -> List[np.ndarray]:
+        states = []
+
+        num_encoder_layers = self.meta.num_encoder_layers
+        encoder_dims = self.meta.encoder_dims
+        left_context_len = self.meta.left_context_len
+        cnn_module_kernels = self.meta.cnn_module_kernels
+        query_head_dims = self.meta.query_head_dims
+        value_head_dims = self.meta.value_head_dims
+        num_heads = self.meta.num_heads
+
+        num_encoders = len(num_encoder_layers)
+        N = 1
+
+        for i in range(num_encoders):
+            num_layers = num_encoder_layers[i]
+            key_dim = query_head_dims[i] * num_heads[i]
+            embed_dim = encoder_dims[i]
+            nonlin_attn_head_dim = 3 * embed_dim // 4
+            value_dim = value_head_dims[i] * num_heads[i]
+            conv_left_pad = cnn_module_kernels[i] // 2
+
+            for layer in range(num_layers):
+                cached_key = np.zeros(
+                    (left_context_len[i], N, key_dim), dtype=np.float32
+                )
+                cached_nonlin_attn = np.zeros(
+                    (1, N, left_context_len[i], nonlin_attn_head_dim),
+                    dtype=np.float32,
+                )
+                cached_val1 = np.zeros(
+                    (left_context_len[i], N, value_dim),
+                    dtype=np.float32,
+                )
+                cached_val2 = np.zeros(
+                    (left_context_len[i], N, value_dim),
+                    dtype=np.float32,
+                )
+                cached_conv1 = np.zeros((N, embed_dim, conv_left_pad), dtype=np.float32)
+                cached_conv2 = np.zeros((N, embed_dim, conv_left_pad), dtype=np.float32)
+                states += [
+                    cached_key,
+                    cached_nonlin_attn,
+                    cached_val1,
+                    cached_val2,
+                    cached_conv1,
+                    cached_conv2,
+                ]
+        embed_states = np.zeros((N, 128, 3, 19), dtype=np.float32)
+        states.append(embed_states)
+        processed_lens = np.zeros((N,), dtype=np.int64)
+        states.append(processed_lens)
+
+        return states
+
+    def run_model(self, x: np.ndarray, states: List[np.ndarray]):
+        """
+        Args:
+          x: (T, C), np.float32
+          states: A list of states
+        """
+        x = np.expand_dims(x, axis=0)
+
+        out = self.model.inference(inputs=[x] + states, data_format="nchw")
+        # out[0]: log_probs, (N, T, C)
+        return out[0], out[1:]
+
+
+def main():
+    args = get_parser().parse_args()
+    print(vars(args))
+
+    id2token = load_tokens(args.tokens)
+    features = compute_features(args.wav)
+    model = 
RKNNModel(
+        model=args.model,
+    )
+    print(model.meta)
+
+    states = model.get_init_states()
+
+    segment = model.meta.T
+    offset = model.meta.decode_chunk_len
+
+    ans = []
+    blank = 0
+    prev = -1
+    i = 0
+    while True:
+        if i + segment > features.shape[0]:
+            break
+        x = features[i : i + segment]
+        i += offset
+        log_probs, states = model.run_model(x, states)
+        log_probs = log_probs[0]  # (N, T, C) -> (T, C)
+        ids = log_probs.argmax(axis=1)
+        for k in ids:
+            if k != blank and k != prev:
+                ans.append(k)
+            prev = k
+    tokens = [id2token[i] for i in ans]
+    underline = "▁"
+    # underline = b"\xe2\x96\x81".decode()
+    text = "".join(tokens).replace(underline, " ").strip()
+
+    print(ans)
+    print(args.wav)
+    print(text)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/multi_zh-hans/ASR/zipformer/export_rknn_ctc_streaming.py b/egs/multi_zh-hans/ASR/zipformer/export_rknn_ctc_streaming.py
new file mode 120000
index 000000000..761e399ba
--- /dev/null
+++ b/egs/multi_zh-hans/ASR/zipformer/export_rknn_ctc_streaming.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/export_rknn_ctc_streaming.py
\ No newline at end of file
diff --git a/egs/multi_zh-hans/ASR/zipformer/export_rknn_transducer_streaming.py b/egs/multi_zh-hans/ASR/zipformer/export_rknn_transducer_streaming.py
new file mode 120000
index 000000000..8be19ef3d
--- /dev/null
+++ b/egs/multi_zh-hans/ASR/zipformer/export_rknn_transducer_streaming.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/export_rknn_transducer_streaming.py
\ No newline at end of file
diff --git a/egs/multi_zh-hans/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py b/egs/multi_zh-hans/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py
new file mode 120000
index 000000000..6417f470f
--- /dev/null
+++ b/egs/multi_zh-hans/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py
\ No newline at end of file
diff --git a/egs/wenetspeech/ASR/zipformer/export_rknn_transducer_streaming.py b/egs/wenetspeech/ASR/zipformer/export_rknn_transducer_streaming.py
new file mode 120000
index 000000000..8be19ef3d
--- /dev/null
+++ b/egs/wenetspeech/ASR/zipformer/export_rknn_transducer_streaming.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/export_rknn_transducer_streaming.py
\ No newline at end of file
diff --git a/egs/wenetspeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc.py b/egs/wenetspeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc.py
new file mode 120000
index 000000000..8c203406b
--- /dev/null
+++ b/egs/wenetspeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc.py
\ No newline at end of file
diff --git a/egs/wenetspeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py b/egs/wenetspeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py
new file mode 120000
index 000000000..6417f470f
--- /dev/null
+++ b/egs/wenetspeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py
\ No newline at end of file