mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 01:22:22 +00:00
Export streaming zipformer2 to RKNN (#1977)
This commit is contained in:
parent
da87e7fc99
commit
e22bc78f98
139
.github/scripts/librispeech/ASR/run_rknn.sh
vendored
139
.github/scripts/librispeech/ASR/run_rknn.sh
vendored
@ -12,11 +12,10 @@ log() {
|
||||
|
||||
cd egs/librispeech/ASR
|
||||
|
||||
|
||||
# https://huggingface.co/csukuangfj/k2fsa-zipformer-chinese-english-mixed
|
||||
# sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20
|
||||
function export_bilingual_zh_en() {
|
||||
d=exp_zh_en
|
||||
function export_2023_02_20() {
|
||||
d=exp_2023_02_20
|
||||
|
||||
mkdir $d
|
||||
pushd $d
|
||||
@ -70,21 +69,20 @@ function export_bilingual_zh_en() {
|
||||
--tokens $d/tokens.txt \
|
||||
$d/1.wav
|
||||
|
||||
mkdir -p /icefall/rknn-models
|
||||
|
||||
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
|
||||
mkdir -p $platform
|
||||
dst=sherpa-onnx-$platform-streaming-zipformer-bilingual-zh-en-2023-02-20
|
||||
mkdir -p $dst
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export_rknn.py \
|
||||
--in-encoder $d/encoder-epoch-99-avg-1.onnx \
|
||||
--in-decoder $d/decoder-epoch-99-avg-1.onnx \
|
||||
--in-joiner $d/joiner-epoch-99-avg-1.onnx \
|
||||
--out-encoder $platform/encoder.rknn \
|
||||
--out-decoder $platform/decoder.rknn \
|
||||
--out-joiner $platform/joiner.rknn \
|
||||
--out-encoder $dst/encoder.rknn \
|
||||
--out-decoder $dst/decoder.rknn \
|
||||
--out-joiner $dst/joiner.rknn \
|
||||
--target-platform $platform 2>/dev/null
|
||||
|
||||
ls -lh $platform/
|
||||
ls -lh $dst/
|
||||
|
||||
./pruned_transducer_stateless7_streaming/test_rknn_on_cpu_simulator.py \
|
||||
--encoder $d/encoder-epoch-99-avg-1.onnx \
|
||||
@ -93,19 +91,24 @@ function export_bilingual_zh_en() {
|
||||
--tokens $d/tokens.txt \
|
||||
--wav $d/0.wav
|
||||
|
||||
cp $d/tokens.txt $platform
|
||||
cp $d/*.wav $platform
|
||||
cp $d/tokens.txt $dst
|
||||
mkdir $dst/test_wavs
|
||||
cp $d/*.wav $dst/test_wavs
|
||||
|
||||
cp -av $platform /icefall/rknn-models
|
||||
tar cjvf $dst.tar.bz2 $dst
|
||||
ls -lh $dst.tar.bz2
|
||||
mv $dst.tar.bz2 /icefall/
|
||||
ls -lh $dst/
|
||||
echo "---"
|
||||
|
||||
rm -rf $dst
|
||||
done
|
||||
|
||||
ls -lh /icefall/rknn-models
|
||||
}
|
||||
|
||||
# https://huggingface.co/csukuangfj/k2fsa-zipformer-bilingual-zh-en-t
|
||||
# sherpa-onnx-streaming-zipformer-small-bilingual-zh-en-2023-02-16
|
||||
function export_bilingual_zh_en_small() {
|
||||
d=exp_zh_en_small
|
||||
function export_2023_02_16() {
|
||||
d=exp_2023_02_16
|
||||
|
||||
mkdir $d
|
||||
pushd $d
|
||||
@ -124,7 +127,6 @@ function export_bilingual_zh_en_small() {
|
||||
|
||||
popd
|
||||
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export-onnx-zh.py \
|
||||
--dynamic-batch 0 \
|
||||
--enable-int8-quantization 0 \
|
||||
@ -163,21 +165,20 @@ function export_bilingual_zh_en_small() {
|
||||
--tokens $d/tokens.txt \
|
||||
$d/1.wav
|
||||
|
||||
mkdir -p /icefall/rknn-models-small
|
||||
|
||||
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
|
||||
mkdir -p $platform
|
||||
dst=sherpa-onnx-$platform-streaming-zipformer-small-bilingual-zh-en-2023-02-16
|
||||
mkdir -p $dst
|
||||
|
||||
./pruned_transducer_stateless7_streaming/export_rknn.py \
|
||||
--in-encoder $d/encoder-epoch-99-avg-1.onnx \
|
||||
--in-decoder $d/decoder-epoch-99-avg-1.onnx \
|
||||
--in-joiner $d/joiner-epoch-99-avg-1.onnx \
|
||||
--out-encoder $platform/encoder.rknn \
|
||||
--out-decoder $platform/decoder.rknn \
|
||||
--out-joiner $platform/joiner.rknn \
|
||||
--out-encoder $dst/encoder.rknn \
|
||||
--out-decoder $dst/decoder.rknn \
|
||||
--out-joiner $dst/joiner.rknn \
|
||||
--target-platform $platform 2>/dev/null
|
||||
|
||||
ls -lh $platform/
|
||||
ls -lh $dst/
|
||||
|
||||
./pruned_transducer_stateless7_streaming/test_rknn_on_cpu_simulator.py \
|
||||
--encoder $d/encoder-epoch-99-avg-1.onnx \
|
||||
@ -186,15 +187,89 @@ function export_bilingual_zh_en_small() {
|
||||
--tokens $d/tokens.txt \
|
||||
--wav $d/0.wav
|
||||
|
||||
cp $d/tokens.txt $platform
|
||||
cp $d/*.wav $platform
|
||||
cp $d/tokens.txt $dst
|
||||
mkdir $dst/test_wavs
|
||||
cp $d/*.wav $dst/test_wavs
|
||||
|
||||
cp -av $platform /icefall/rknn-models-small
|
||||
tar cjvf $dst.tar.bz2 $dst
|
||||
ls -lh $dst.tar.bz2
|
||||
mv $dst.tar.bz2 /icefall/
|
||||
ls -lh $dst/
|
||||
echo "---"
|
||||
|
||||
rm -rf $dst
|
||||
done
|
||||
|
||||
ls -lh /icefall/rknn-models-small
|
||||
}
|
||||
|
||||
export_bilingual_zh_en_small
|
||||
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#csukuangfj-sherpa-onnx-streaming-zipformer-en-2023-06-26-english
|
||||
function export_2023_06_26() {
|
||||
d=exp_2023_06_26
|
||||
|
||||
export_bilingual_zh_en
|
||||
mkdir $d
|
||||
pushd $d
|
||||
|
||||
curl -SL -O https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/exp/pretrained.pt
|
||||
mv pretrained.pt epoch-99.pt
|
||||
|
||||
curl -SL -O https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/data/lang_bpe_500/tokens.txt
|
||||
|
||||
curl -SL -o 0.wav https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/data/lang_bpe_500/tokens.txt
|
||||
curl -SL -o 1.wav https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/test_wavs/1221-135766-0001.wav
|
||||
curl -SL -o 2.wav https://huggingface.co/Zengwei/icefall-asr-librispeech-streaming-zipformer-2023-05-17/resolve/main/test_wavs/1221-135766-0002.wav
|
||||
|
||||
ls -lh
|
||||
|
||||
popd
|
||||
|
||||
./zipformer/export-onnx-streaming.py \
|
||||
--dynamic-batch 0 \
|
||||
--enable-int8-quantization 0 \
|
||||
--tokens $d/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $d \
|
||||
--use-ctc 0 \
|
||||
--use-transducer 1 \
|
||||
\
|
||||
--chunk-size 32 \
|
||||
--left-context-frames 128 \
|
||||
--causal 1
|
||||
|
||||
ls -lh $d/
|
||||
|
||||
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
|
||||
dst=sherpa-onnx-$platform-streaming-zipformer-en-2023-06-26
|
||||
mkdir -p $dst
|
||||
|
||||
./zipformer/export_rknn_transducer_streaming.py \
|
||||
--in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--out-encoder $dst/encoder.rknn \
|
||||
--out-decoder $dst/decoder.rknn \
|
||||
--out-joiner $dst/joiner.rknn \
|
||||
--target-platform $platform
|
||||
|
||||
ls -lh $dst/
|
||||
|
||||
cp $d/tokens.txt $dst
|
||||
mkdir $dst/test_wavs
|
||||
cp $d/*.wav $dst/test_wavs
|
||||
|
||||
tar cjvf $dst.tar.bz2 $dst
|
||||
ls -lh $dst.tar.bz2
|
||||
mv $dst.tar.bz2 /icefall/
|
||||
ls -lh $dst/
|
||||
echo "---"
|
||||
|
||||
rm -rf $dst
|
||||
done
|
||||
}
|
||||
|
||||
if [[ $rknn_toolkit2_version == "2.1.0" ]]; then
|
||||
export_2023_02_16
|
||||
export_2023_02_20
|
||||
else
|
||||
export_2023_06_26
|
||||
fi
|
||||
|
73
.github/scripts/multi_zh-hans/ASR/run_rknn.sh
vendored
Executable file
73
.github/scripts/multi_zh-hans/ASR/run_rknn.sh
vendored
Executable file
@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -ex
|
||||
|
||||
python3 -m pip install kaldi-native-fbank soundfile librosa
|
||||
|
||||
log() {
|
||||
# This function is from espnet
|
||||
local fname=${BASH_SOURCE[1]##*/}
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
cd egs/multi_zh-hans/ASR
|
||||
|
||||
|
||||
|
||||
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#sherpa-onnx-streaming-zipformer-multi-zh-hans-2023-12-12-chinese
|
||||
function export_2023_11_05() {
|
||||
d=exp
|
||||
mkdir $d
|
||||
pushd $d
|
||||
curl -SL -O https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/data/lang_bpe_2000/tokens.txt
|
||||
curl -SL -O https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/exp/pretrained.pt
|
||||
mv pretrained.pt epoch-99.pt
|
||||
|
||||
curl -SL -o 0.wav https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/test_wavs/DEV_T0000000000.wav
|
||||
curl -SL -o 1.wav https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/test_wavs/DEV_T0000000001.wav
|
||||
curl -SL -o 2.wav https://huggingface.co/zrjin/icefall-asr-multi-zh-hans-zipformer-ctc-streaming-2023-11-05/resolve/main/test_wavs/DEV_T0000000002.wav
|
||||
ls -lh
|
||||
popd
|
||||
|
||||
./zipformer/export-onnx-streaming.py \
|
||||
--dynamic-batch 0 \
|
||||
--enable-int8-quantization 0 \
|
||||
--tokens $d/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $d \
|
||||
--use-ctc 0 \
|
||||
--use-transducer 1 \
|
||||
--chunk-size 32 \
|
||||
--left-context-frames 128 \
|
||||
--causal 1
|
||||
|
||||
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
|
||||
dst=sherpa-onnx-$platform-streaming-zipformer-multi-zh-hans-2023-12-12
|
||||
mkdir -p $dst
|
||||
|
||||
./zipformer/export_rknn_transducer_streaming.py \
|
||||
--in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--out-encoder $dst/encoder.rknn \
|
||||
--out-decoder $dst/decoder.rknn \
|
||||
--out-joiner $dst/joiner.rknn \
|
||||
--target-platform $platform
|
||||
|
||||
cp $d/tokens.txt $dst
|
||||
mkdir $dst/test_wavs
|
||||
cp $d/*.wav $dst/test_wavs
|
||||
|
||||
tar cjvf $dst.tar.bz2 $dst
|
||||
ls -lh $dst.tar.bz2
|
||||
mv $dst.tar.bz2 /icefall/
|
||||
ls -lh $dst/
|
||||
echo "---"
|
||||
|
||||
rm -rf $dst
|
||||
done
|
||||
}
|
||||
|
||||
export_2023_11_05
|
196
.github/scripts/wenetspeech/ASR/run_rknn.sh
vendored
Executable file
196
.github/scripts/wenetspeech/ASR/run_rknn.sh
vendored
Executable file
@ -0,0 +1,196 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
set -ex
|
||||
|
||||
python3 -m pip install kaldi-native-fbank soundfile librosa
|
||||
|
||||
log() {
|
||||
# This function is from espnet
|
||||
local fname=${BASH_SOURCE[1]##*/}
|
||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
|
||||
cd egs/wenetspeech/ASR
|
||||
|
||||
#https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#k2-fsa-icefall-asr-zipformer-wenetspeech-streaming-small-chinese
|
||||
function export_2025_03_02() {
|
||||
d=exp_2025_03_02
|
||||
mkdir $d
|
||||
pushd $d
|
||||
curl -SL -O https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/data/lang_char/tokens.txt
|
||||
curl -SL -O https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/exp/pretrained.pt
|
||||
mv pretrained.pt epoch-99.pt
|
||||
|
||||
curl -SL -o 0.wav https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/test_wavs/DEV_T0000000000.wav
|
||||
curl -SL -o 1.wav https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/test_wavs/DEV_T0000000001.wav
|
||||
curl -SL -o 2.wav https://huggingface.co/k2-fsa/icefall-asr-zipformer-wenetspeech-streaming-small/resolve/main/test_wavs/DEV_T0000000002.wav
|
||||
ls -lh
|
||||
popd
|
||||
|
||||
./zipformer/export-onnx-streaming.py \
|
||||
--dynamic-batch 0 \
|
||||
--enable-int8-quantization 0 \
|
||||
--tokens $d/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $d \
|
||||
--use-ctc 0 \
|
||||
--use-transducer 1 \
|
||||
\
|
||||
--num-encoder-layers 2,2,2,2,2,2 \
|
||||
--feedforward-dim 512,768,768,768,768,768 \
|
||||
--encoder-dim 192,256,256,256,256,256 \
|
||||
--encoder-unmasked-dim 192,192,192,192,192,192 \
|
||||
\
|
||||
--chunk-size 32 \
|
||||
--left-context-frames 128 \
|
||||
--causal 1
|
||||
|
||||
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
|
||||
dst=sherpa-onnx-$platform-streaming-zipformer-small-zh-2025-03-02
|
||||
mkdir -p $dst
|
||||
|
||||
./zipformer/export_rknn_transducer_streaming.py \
|
||||
--in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--out-encoder $dst/encoder.rknn \
|
||||
--out-decoder $dst/decoder.rknn \
|
||||
--out-joiner $dst/joiner.rknn \
|
||||
--target-platform $platform
|
||||
|
||||
cp $d/tokens.txt $dst
|
||||
mkdir $dst/test_wavs
|
||||
cp $d/*.wav $dst/test_wavs
|
||||
|
||||
tar cjvf $dst.tar.bz2 $dst
|
||||
ls -lh $dst.tar.bz2
|
||||
mv $dst.tar.bz2 /icefall/
|
||||
ls -lh $dst/
|
||||
echo "---"
|
||||
|
||||
rm -rf $dst
|
||||
done
|
||||
rm -rf $d
|
||||
}
|
||||
|
||||
# https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html#k2-fsa-icefall-asr-zipformer-wenetspeech-streaming-large-chinese
|
||||
function export_2025_03_03() {
|
||||
d=exp_2025_03_03
|
||||
mkdir $d
|
||||
pushd $d
|
||||
curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/data/lang_char/tokens.txt
|
||||
curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/pretrained.pt
|
||||
mv pretrained.pt epoch-99.pt
|
||||
|
||||
curl -SL -o 0.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000000.wav
|
||||
curl -SL -o 1.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000001.wav
|
||||
curl -SL -o 2.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000002.wav
|
||||
ls -lh
|
||||
popd
|
||||
|
||||
./zipformer/export-onnx-streaming.py \
|
||||
--dynamic-batch 0 \
|
||||
--enable-int8-quantization 0 \
|
||||
--tokens $d/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $d \
|
||||
--use-ctc 0 \
|
||||
--use-transducer 1 \
|
||||
\
|
||||
--chunk-size 32 \
|
||||
--left-context-frames 128 \
|
||||
--causal 1
|
||||
|
||||
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
|
||||
dst=sherpa-onnx-$platform-streaming-zipformer-zh-2025-03-03
|
||||
mkdir -p $dst
|
||||
|
||||
./zipformer/export_rknn_transducer_streaming.py \
|
||||
--in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--out-encoder $dst/encoder.rknn \
|
||||
--out-decoder $dst/decoder.rknn \
|
||||
--out-joiner $dst/joiner.rknn \
|
||||
--target-platform $platform
|
||||
|
||||
cp $d/tokens.txt $dst
|
||||
mkdir $dst/test_wavs
|
||||
cp $d/*.wav $dst/test_wavs
|
||||
|
||||
tar cjvf $dst.tar.bz2 $dst
|
||||
ls -lh $dst.tar.bz2
|
||||
mv $dst.tar.bz2 /icefall/
|
||||
ls -lh $dst/
|
||||
echo "---"
|
||||
ls -lh $dst.tar.bz2
|
||||
|
||||
rm -rf $dst
|
||||
done
|
||||
rm -rf $d
|
||||
}
|
||||
|
||||
function export_2023_06_15() {
|
||||
d=exp_2023_06_15
|
||||
mkdir $d
|
||||
pushd $d
|
||||
curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/data/lang_char/tokens.txt
|
||||
curl -SL -O https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/exp/pretrained.pt
|
||||
mv pretrained.pt epoch-99.pt
|
||||
|
||||
curl -SL -o 0.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000000.wav
|
||||
curl -SL -o 1.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000001.wav
|
||||
curl -SL -o 2.wav https://huggingface.co/pkufool/icefall-asr-zipformer-streaming-wenetspeech-20230615/resolve/main/test_wavs/DEV_T0000000002.wav
|
||||
ls -lh
|
||||
popd
|
||||
|
||||
./zipformer/export-onnx-streaming.py \
|
||||
--dynamic-batch 0 \
|
||||
--enable-int8-quantization 0 \
|
||||
--tokens $d/tokens.txt \
|
||||
--use-averaged-model 0 \
|
||||
--epoch 99 \
|
||||
--avg 1 \
|
||||
--exp-dir $d \
|
||||
--use-ctc 0 \
|
||||
--use-transducer 1 \
|
||||
\
|
||||
--chunk-size 32 \
|
||||
--left-context-frames 128 \
|
||||
--causal 1
|
||||
|
||||
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
|
||||
dst=sherpa-onnx-$platform-streaming-zipformer-zh-2023-06-15
|
||||
mkdir -p $dst
|
||||
|
||||
./zipformer/export_rknn_transducer_streaming.py \
|
||||
--in-encoder $d/encoder-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--in-decoder $d/decoder-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--in-joiner $d/joiner-epoch-99-avg-1-chunk-32-left-128.onnx \
|
||||
--out-encoder $dst/encoder.rknn \
|
||||
--out-decoder $dst/decoder.rknn \
|
||||
--out-joiner $dst/joiner.rknn \
|
||||
--target-platform $platform
|
||||
|
||||
cp $d/tokens.txt $dst
|
||||
mkdir $dst/test_wavs
|
||||
cp $d/*.wav $dst/test_wavs
|
||||
|
||||
tar cjvf $dst.tar.bz2 $dst
|
||||
ls -lh $dst.tar.bz2
|
||||
mv $dst.tar.bz2 /icefall/
|
||||
ls -lh $dst/
|
||||
echo "---"
|
||||
ls -lh $dst.tar.bz2
|
||||
|
||||
rm -rf $dst
|
||||
done
|
||||
}
|
||||
|
||||
export_2025_03_02
|
||||
export_2025_03_03
|
||||
export_2023_06_15
|
106
.github/workflows/rknn.yml
vendored
106
.github/workflows/rknn.yml
vendored
@ -4,7 +4,7 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- ci-rknn-2
|
||||
- rknn-zipformer2
|
||||
|
||||
pull_request:
|
||||
branches:
|
||||
@ -17,44 +17,29 @@ concurrency:
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
generate_build_matrix:
|
||||
if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
|
||||
# see https://github.com/pytorch/pytorch/pull/50633
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Generating build matrix
|
||||
id: set-matrix
|
||||
run: |
|
||||
# outputting for debugging purposes
|
||||
python ./.github/scripts/docker/generate_build_matrix.py --torch-version=2.4.0 --python-version=3.10
|
||||
MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py --torch-version=2.4.0 --python-version=3.10)
|
||||
echo "::set-output name=matrix::${MATRIX}"
|
||||
rknn:
|
||||
needs: generate_build_matrix
|
||||
name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
|
||||
name: RKNN ${{ matrix.recipe }} ${{ matrix.rknn_toolkit2_version }}
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
|
||||
python-version: ["3.10"]
|
||||
k2-version: ["1.24.4.dev20241029"]
|
||||
kaldifeat-version: ["1.25.5.dev20241029"]
|
||||
torch-version: ["2.0.0"]
|
||||
torchaudio-version: ["2.0.1"]
|
||||
version: ["20241218"]
|
||||
# recipe: ["librispeech", "wenetspeech", "multi_zh-hans"]
|
||||
recipe: ["librispeech"]
|
||||
rknn_toolkit2_version: ["2.2.0", "2.1.0"]
|
||||
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup Python
|
||||
if: false
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Export ONNX model
|
||||
- name: Export RKNN model
|
||||
uses: addnab/docker-run-action@v3
|
||||
with:
|
||||
image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
|
||||
@ -73,65 +58,35 @@ jobs:
|
||||
python3 -m torch.utils.collect_env
|
||||
python3 -m k2.version
|
||||
pip list
|
||||
export rknn_toolkit2_version=${{ matrix.rknn_toolkit2_version }}
|
||||
|
||||
if [[ $rknn_toolkit2_version == "2.1.0" ]]; then
|
||||
# for the folder pruned_transducer_stateless7_streaming
|
||||
curl -SL -O https://huggingface.co/csukuangfj/rknn-toolkit2/resolve/main/rknn_toolkit2-2.1.0%2B708089d1-cp310-cp310-linux_x86_64.whl
|
||||
else
|
||||
# for the folder zipformer/
|
||||
curl -SL -O https://huggingface.co/csukuangfj/rknn-toolkit2/resolve/main/rknn_toolkit2-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
|
||||
fi
|
||||
|
||||
# Install rknn
|
||||
curl -SL -O https://huggingface.co/csukuangfj/rknn-toolkit2/resolve/main/rknn_toolkit2-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
|
||||
pip install ./*.whl "numpy<=1.26.4"
|
||||
pip list | grep rknn
|
||||
echo "---"
|
||||
pip list
|
||||
echo "---"
|
||||
|
||||
.github/scripts/librispeech/ASR/run_rknn.sh
|
||||
recipe=${{ matrix.recipe }}
|
||||
.github/scripts/$recipe/ASR/run_rknn.sh > log-$recipe.txt 2>&1 || true
|
||||
|
||||
- name: Display rknn models
|
||||
shell: bash
|
||||
run: |
|
||||
ls -lh
|
||||
|
||||
ls -lh rknn-models/*
|
||||
echo "----"
|
||||
ls -lh rknn-models-small/*
|
||||
|
||||
- name: Collect results (small)
|
||||
shell: bash
|
||||
run: |
|
||||
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
|
||||
dst=sherpa-onnx-$platform-streaming-zipformer-small-bilingual-zh-en-2023-02-16
|
||||
mkdir $dst
|
||||
mkdir $dst/test_wavs
|
||||
src=rknn-models-small/$platform
|
||||
|
||||
cp -v $src/*.rknn $dst/
|
||||
cp -v $src/tokens.txt $dst/
|
||||
cp -v $src/*.wav $dst/test_wavs/
|
||||
ls -lh $dst
|
||||
tar cjfv $dst.tar.bz2 $dst
|
||||
rm -rf $dst
|
||||
done
|
||||
|
||||
- name: Collect results
|
||||
shell: bash
|
||||
run: |
|
||||
for platform in rk3562 rk3566 rk3568 rk3576 rk3588; do
|
||||
dst=sherpa-onnx-$platform-streaming-zipformer-bilingual-zh-en-2023-02-20
|
||||
mkdir $dst
|
||||
mkdir $dst/test_wavs
|
||||
src=rknn-models/$platform
|
||||
|
||||
cp -v $src/*.rknn $dst/
|
||||
cp -v $src/tokens.txt $dst/
|
||||
cp -v $src/*.wav $dst/test_wavs/
|
||||
ls -lh $dst
|
||||
tar cjfv $dst.tar.bz2 $dst
|
||||
rm -rf $dst
|
||||
done
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: log-${{ matrix.recipe }}-${{ matrix.rknn_toolkit2_version }}
|
||||
path: ./log-*.txt
|
||||
|
||||
- name: Display results
|
||||
shell: bash
|
||||
run: |
|
||||
ls -lh *rk*.tar.bz2
|
||||
ls -lh *rk*.tar.bz2 || true
|
||||
|
||||
- name: Release to GitHub
|
||||
uses: svenstaro/upload-release-action@v2
|
||||
@ -144,7 +99,7 @@ jobs:
|
||||
tag: asr-models
|
||||
|
||||
- name: Upload model to huggingface
|
||||
if: github.event_name == 'push'
|
||||
if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
uses: nick-fields/retry@v3
|
||||
@ -167,8 +122,7 @@ jobs:
|
||||
git merge -m "merge remote" --ff origin main
|
||||
dst=streaming-asr
|
||||
mkdir -p $dst
|
||||
rm -fv $dst/*
|
||||
cp ../*rk*.tar.bz2 $dst/
|
||||
cp ../*rk*.tar.bz2 $dst/ || true
|
||||
|
||||
ls -lh $dst
|
||||
git add .
|
||||
|
@ -72,7 +72,7 @@ def compute_features(filename: str, dim: int = 80) -> np.ndarray:
|
||||
filename:
|
||||
Path to an audio file.
|
||||
Returns:
|
||||
Return a 1-D float32 tensor of shape (1, 80, 3000) containing the features.
|
||||
Return a 2-D float32 tensor of shape (T, dim) containing the features.
|
||||
"""
|
||||
wave, sample_rate = load_audio(filename)
|
||||
if sample_rate != 16000:
|
||||
|
@ -74,6 +74,20 @@ def get_parser():
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--dynamic-batch",
|
||||
type=int,
|
||||
default=1,
|
||||
help="1 to support dynamic batch size. 0 to support only batch size == 1",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--enable-int8-quantization",
|
||||
type=int,
|
||||
default=1,
|
||||
help="1 to also export int8 onnx models.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--epoch",
|
||||
type=int,
|
||||
@ -326,6 +340,7 @@ def export_streaming_ctc_model_onnx(
|
||||
model: OnnxModel,
|
||||
encoder_filename: str,
|
||||
opset_version: int = 11,
|
||||
dynamic_batch: bool = True,
|
||||
use_whisper_features: bool = False,
|
||||
use_external_data: bool = False,
|
||||
) -> None:
|
||||
@ -470,7 +485,9 @@ def export_streaming_ctc_model_onnx(
|
||||
"log_probs": {0: "N"},
|
||||
**inputs,
|
||||
**outputs,
|
||||
},
|
||||
}
|
||||
if dynamic_batch
|
||||
else {},
|
||||
)
|
||||
|
||||
add_meta_data(
|
||||
@ -618,15 +635,17 @@ def main():
|
||||
model,
|
||||
str(model_filename),
|
||||
opset_version=opset_version,
|
||||
dynamic_batch=params.dynamic_batch == 1,
|
||||
use_whisper_features=params.use_whisper_features,
|
||||
use_external_data=params.use_external_data,
|
||||
)
|
||||
logging.info(f"Exported model to {model_filename}")
|
||||
|
||||
# Generate int8 quantization models
|
||||
# See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
|
||||
if params.enable_int8_quantization:
|
||||
# Generate int8 quantization models
|
||||
# See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
|
||||
|
||||
logging.info("Generate int8 quantization models")
|
||||
logging.info("Generate int8 quantization models")
|
||||
|
||||
if params.use_external_data:
|
||||
model_filename_int8 = f"ctc-{suffix}.int8.onnx"
|
||||
|
@ -93,6 +93,20 @@ def get_parser():
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--dynamic-batch",
|
||||
type=int,
|
||||
default=1,
|
||||
help="1 to support dynamic batch size. 0 to support only batch size == 1",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--enable-int8-quantization",
|
||||
type=int,
|
||||
default=1,
|
||||
help="1 to also export int8 onnx models.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--epoch",
|
||||
type=int,
|
||||
@ -389,6 +403,7 @@ def export_encoder_model_onnx(
|
||||
encoder_filename: str,
|
||||
opset_version: int = 11,
|
||||
feature_dim: int = 80,
|
||||
dynamic_batch: bool = True,
|
||||
use_whisper_features: bool = False,
|
||||
use_external_data: bool = False,
|
||||
) -> None:
|
||||
@ -534,7 +549,9 @@ def export_encoder_model_onnx(
|
||||
"encoder_out": {0: "N"},
|
||||
**inputs,
|
||||
**outputs,
|
||||
},
|
||||
}
|
||||
if dynamic_batch
|
||||
else {},
|
||||
)
|
||||
|
||||
add_meta_data(
|
||||
@ -548,6 +565,7 @@ def export_decoder_model_onnx(
|
||||
decoder_model: OnnxDecoder,
|
||||
decoder_filename: str,
|
||||
opset_version: int = 11,
|
||||
dynamic_batch: bool = True,
|
||||
) -> None:
|
||||
"""Export the decoder model to ONNX format.
|
||||
|
||||
@ -570,7 +588,7 @@ def export_decoder_model_onnx(
|
||||
context_size = decoder_model.decoder.context_size
|
||||
vocab_size = decoder_model.decoder.vocab_size
|
||||
|
||||
y = torch.zeros(10, context_size, dtype=torch.int64)
|
||||
y = torch.zeros(1, context_size, dtype=torch.int64)
|
||||
decoder_model = torch.jit.script(decoder_model)
|
||||
torch.onnx.export(
|
||||
decoder_model,
|
||||
@ -583,7 +601,9 @@ def export_decoder_model_onnx(
|
||||
dynamic_axes={
|
||||
"y": {0: "N"},
|
||||
"decoder_out": {0: "N"},
|
||||
},
|
||||
}
|
||||
if dynamic_batch
|
||||
else {},
|
||||
)
|
||||
|
||||
meta_data = {
|
||||
@ -597,6 +617,7 @@ def export_joiner_model_onnx(
|
||||
joiner_model: nn.Module,
|
||||
joiner_filename: str,
|
||||
opset_version: int = 11,
|
||||
dynamic_batch: bool = True,
|
||||
) -> None:
|
||||
"""Export the joiner model to ONNX format.
|
||||
The exported joiner model has two inputs:
|
||||
@ -611,8 +632,8 @@ def export_joiner_model_onnx(
|
||||
joiner_dim = joiner_model.output_linear.weight.shape[1]
|
||||
logging.info(f"joiner dim: {joiner_dim}")
|
||||
|
||||
projected_encoder_out = torch.rand(11, joiner_dim, dtype=torch.float32)
|
||||
projected_decoder_out = torch.rand(11, joiner_dim, dtype=torch.float32)
|
||||
projected_encoder_out = torch.rand(1, joiner_dim, dtype=torch.float32)
|
||||
projected_decoder_out = torch.rand(1, joiner_dim, dtype=torch.float32)
|
||||
|
||||
torch.onnx.export(
|
||||
joiner_model,
|
||||
@ -629,7 +650,9 @@ def export_joiner_model_onnx(
|
||||
"encoder_out": {0: "N"},
|
||||
"decoder_out": {0: "N"},
|
||||
"logit": {0: "N"},
|
||||
},
|
||||
}
|
||||
if dynamic_batch
|
||||
else {},
|
||||
)
|
||||
meta_data = {
|
||||
"joiner_dim": str(joiner_dim),
|
||||
@ -793,6 +816,7 @@ def main():
|
||||
str(encoder_filename),
|
||||
opset_version=opset_version,
|
||||
feature_dim=params.feature_dim,
|
||||
dynamic_batch=params.dynamic_batch == 1,
|
||||
use_whisper_features=params.use_whisper_features,
|
||||
use_external_data=params.use_external_data,
|
||||
)
|
||||
@ -804,6 +828,7 @@ def main():
|
||||
decoder,
|
||||
decoder_filename,
|
||||
opset_version=opset_version,
|
||||
dynamic_batch=params.dynamic_batch == 1,
|
||||
)
|
||||
logging.info(f"Exported decoder to {decoder_filename}")
|
||||
|
||||
@ -813,6 +838,7 @@ def main():
|
||||
joiner,
|
||||
joiner_filename,
|
||||
opset_version=opset_version,
|
||||
dynamic_batch=params.dynamic_batch == 1,
|
||||
)
|
||||
logging.info(f"Exported joiner to {joiner_filename}")
|
||||
|
||||
@ -835,35 +861,36 @@ def main():
|
||||
# Generate int8 quantization models
|
||||
# See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection
|
||||
|
||||
logging.info("Generate int8 quantization models")
|
||||
if params.enable_int8_quantization:
|
||||
logging.info("Generate int8 quantization models")
|
||||
|
||||
if params.use_external_data:
|
||||
encoder_filename_int8 = f"encoder-{suffix}.int8.onnx"
|
||||
else:
|
||||
encoder_filename_int8 = params.exp_dir / f"encoder-{suffix}.int8.onnx"
|
||||
if params.use_external_data:
|
||||
encoder_filename_int8 = f"encoder-{suffix}.int8.onnx"
|
||||
else:
|
||||
encoder_filename_int8 = params.exp_dir / f"encoder-{suffix}.int8.onnx"
|
||||
|
||||
quantize_dynamic(
|
||||
model_input=encoder_filename,
|
||||
model_output=encoder_filename_int8,
|
||||
op_types_to_quantize=["MatMul"],
|
||||
weight_type=QuantType.QInt8,
|
||||
)
|
||||
quantize_dynamic(
|
||||
model_input=encoder_filename,
|
||||
model_output=encoder_filename_int8,
|
||||
op_types_to_quantize=["MatMul"],
|
||||
weight_type=QuantType.QInt8,
|
||||
)
|
||||
|
||||
decoder_filename_int8 = params.exp_dir / f"decoder-{suffix}.int8.onnx"
|
||||
quantize_dynamic(
|
||||
model_input=decoder_filename,
|
||||
model_output=decoder_filename_int8,
|
||||
op_types_to_quantize=["MatMul", "Gather"],
|
||||
weight_type=QuantType.QInt8,
|
||||
)
|
||||
decoder_filename_int8 = params.exp_dir / f"decoder-{suffix}.int8.onnx"
|
||||
quantize_dynamic(
|
||||
model_input=decoder_filename,
|
||||
model_output=decoder_filename_int8,
|
||||
op_types_to_quantize=["MatMul", "Gather"],
|
||||
weight_type=QuantType.QInt8,
|
||||
)
|
||||
|
||||
joiner_filename_int8 = params.exp_dir / f"joiner-{suffix}.int8.onnx"
|
||||
quantize_dynamic(
|
||||
model_input=joiner_filename,
|
||||
model_output=joiner_filename_int8,
|
||||
op_types_to_quantize=["MatMul"],
|
||||
weight_type=QuantType.QInt8,
|
||||
)
|
||||
joiner_filename_int8 = params.exp_dir / f"joiner-{suffix}.int8.onnx"
|
||||
quantize_dynamic(
|
||||
model_input=joiner_filename,
|
||||
model_output=joiner_filename_int8,
|
||||
op_types_to_quantize=["MatMul"],
|
||||
weight_type=QuantType.QInt8,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
74
egs/librispeech/ASR/zipformer/export_rknn_ctc_streaming.py
Executable file
74
egs/librispeech/ASR/zipformer/export_rknn_ctc_streaming.py
Executable file
@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from rknn.api import RKNN
|
||||
from test_rknn_on_cpu_simulator_ctc_streaming import RKNNModel
|
||||
|
||||
logging.basicConfig(level=logging.WARNING)

g_platforms = [
    # "rv1103",
    # "rv1103b",
    # "rv1106",
    # "rk2118",
    "rk3562",
    "rk3566",
    "rk3568",
    "rk3576",
    "rk3588",
]


def get_parser():
    """Build the command-line parser for the onnx-to-rknn conversion."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--target-platform",
        type=str,
        required=True,
        help=f"Supported values are: {','.join(g_platforms)}",
    )

    # Input onnx model and output rknn model paths.
    for flag, desc in (
        ("--in-model", "Path to the onnx model"),
        ("--out-model", "Path to the rknn model"),
    ):
        parser.add_argument(flag, type=str, required=True, help=desc)

    return parser
|
||||
|
||||
|
||||
def main():
    """Convert a streaming zipformer2 CTC onnx model to rknn format."""
    parser = get_parser()
    args = parser.parse_args()
    print(vars(args))

    converter = RKNNModel(
        model=args.in_model,
        target_platform=args.target_platform,
    )
    print(converter.meta)

    converter.export_rknn(model=args.out_model)
    converter.release()


if __name__ == "__main__":
    main()
|
139
egs/librispeech/ASR/zipformer/export_rknn_transducer_streaming.py
Executable file
139
egs/librispeech/ASR/zipformer/export_rknn_transducer_streaming.py
Executable file
@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from rknn.api import RKNN
|
||||
from test_rknn_on_cpu_simulator_ctc_streaming import (
|
||||
MetaData,
|
||||
get_meta_data,
|
||||
init_model,
|
||||
export_rknn,
|
||||
)
|
||||
|
||||
logging.basicConfig(level=logging.WARNING)

g_platforms = [
    # "rv1103",
    # "rv1103b",
    # "rv1106",
    # "rk2118",
    "rk3562",
    "rk3566",
    "rk3568",
    "rk3576",
    "rk3588",
]


def get_parser():
    """Build the command-line parser for the transducer onnx-to-rknn
    conversion (encoder, decoder, and joiner)."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--target-platform",
        type=str,
        required=True,
        help=f"Supported values are: {','.join(g_platforms)}",
    )

    # One input (onnx) and one output (rknn) path per sub-model.
    for direction in ("in", "out"):
        fmt = "onnx" if direction == "in" else "rknn"
        for part in ("encoder", "decoder", "joiner"):
            parser.add_argument(
                f"--{direction}-{part}",
                type=str,
                required=True,
                help=f"Path to the {part} {fmt} model",
            )

    return parser
|
||||
|
||||
|
||||
class RKNNModel:
    """Bundle the encoder/decoder/joiner rknn conversions of one transducer."""

    def __init__(
        self,
        encoder: str,
        decoder: str,
        joiner: str,
        target_platform: str,
    ):
        # The encoder metadata (chunk size, layer dims, ...) is attached to
        # the converted encoder via RKNN's custom_string so that runtimes
        # can recover it from the rknn file.
        self.meta = get_meta_data(encoder)
        self.encoder = init_model(
            encoder,
            custom_string=self.meta.to_str(),
            target_platform=target_platform,
        )
        self.decoder = init_model(decoder, target_platform=target_platform)
        self.joiner = init_model(joiner, target_platform=target_platform)

    def export_rknn(self, encoder, decoder, joiner):
        """Write the three converted models to the given output paths."""
        for rknn, path in (
            (self.encoder, encoder),
            (self.decoder, decoder),
            (self.joiner, joiner),
        ):
            export_rknn(rknn, path)

    def release(self):
        """Free the native resources held by the three RKNN instances."""
        for rknn in (self.encoder, self.decoder, self.joiner):
            rknn.release()
|
||||
|
||||
|
||||
def main():
    """Convert streaming zipformer2 transducer onnx models to rknn format."""
    args = get_parser().parse_args()
    print(vars(args))

    converter = RKNNModel(
        encoder=args.in_encoder,
        decoder=args.in_decoder,
        joiner=args.in_joiner,
        target_platform=args.target_platform,
    )
    print(converter.meta)

    converter.export_rknn(
        encoder=args.out_encoder,
        decoder=args.out_decoder,
        joiner=args.out_joiner,
    )
    converter.release()


if __name__ == "__main__":
    main()
|
362
egs/librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py
Executable file
362
egs/librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py
Executable file
@ -0,0 +1,362 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
import kaldi_native_fbank as knf
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from rknn.api import RKNN
|
||||
|
||||
|
||||
def get_parser():
    """Build the command-line parser for the rknn CPU-simulator test."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    for flag, desc in (
        ("--model", "Path to the onnx model"),
        ("--tokens", "Path to the tokens.txt"),
        ("--wav", "Path to test wave"),
    ):
        parser.add_argument(flag, type=str, required=True, help=desc)

    return parser
|
||||
|
||||
|
||||
def load_audio(filename: str) -> Tuple[np.ndarray, int]:
    """Read an audio file and return ``(samples, sample_rate)``.

    Only the first channel is kept; samples are contiguous float32.
    """
    audio, rate = sf.read(
        filename,
        always_2d=True,
        dtype="float32",
    )
    mono = np.ascontiguousarray(audio[:, 0])  # keep only channel 0
    return mono, rate
|
||||
|
||||
|
||||
def compute_features(filename: str, dim: int = 80) -> np.ndarray:
    """Compute online fbank features for an audio file.

    Args:
      filename:
        Path to an audio file.
      dim:
        Number of mel bins.
    Returns:
      Return a 2-D float32 tensor of shape (T, dim) containing the features.
    """
    wave, sample_rate = load_audio(filename)
    if sample_rate != 16000:
        import librosa

        wave = librosa.resample(wave, orig_sr=sample_rate, target_sr=16000)
        sample_rate = 16000

    opts = knf.FbankOptions()
    opts.frame_opts.dither = 0
    opts.mel_opts.num_bins = dim
    opts.frame_opts.snip_edges = False
    fbank = knf.OnlineFbank(opts)

    fbank.accept_waveform(16000, wave)
    # Feed 0.5 s of trailing silence so the final frames are flushed out.
    tail_paddings = np.zeros(int(0.5 * 16000), dtype=np.float32)
    fbank.accept_waveform(16000, tail_paddings)
    fbank.input_finished()

    frames = [fbank.get_frame(i) for i in range(fbank.num_frames_ready)]

    return np.stack(frames, axis=0)
|
||||
|
||||
|
||||
def load_tokens(filename):
    """Read tokens.txt and return a dict mapping token id -> token string.

    Each line of the file is expected to be ``<token> <id>``.
    """
    id2token = dict()
    with open(filename, "r") as f:
        for line in f:
            token, idx = line.split()
            id2token[int(idx)] = token
    return id2token
|
||||
|
||||
|
||||
def init_model(filename, target_platform="rk3588", custom_string=None):
    """Load an onnx model, build it for the target platform, and initialize
    the RKNN runtime (the CPU simulator when no board is attached).

    Args:
      filename:
        Path to the onnx model.
      target_platform:
        A supported rockchip platform, e.g. rk3588.
      custom_string:
        Optional metadata string to embed into the rknn model.
    Returns:
      An initialized RKNN object.
    """
    rknn = RKNN(verbose=False)

    rknn.config(target_platform=target_platform, custom_string=custom_string)
    if not Path(filename).is_file():
        # Fix: the original f-strings had no placeholders, so every error
        # message printed the literal "(unknown)" instead of the model path.
        exit(f"{filename} does not exist")

    ret = rknn.load_onnx(model=filename)
    if ret != 0:
        exit(f"Load model {filename} failed!")

    ret = rknn.build(do_quantization=False)
    if ret != 0:
        exit(f"Build model {filename} failed!")

    ret = rknn.init_runtime()
    if ret != 0:
        exit(f"Failed to init rknn runtime for {filename}")
    return rknn
|
||||
|
||||
|
||||
class MetaData:
    """Metadata of a streaming zipformer2 model, carried from the onnx
    model into the rknn model via RKNN's custom_string."""

    def __init__(
        self,
        model_type: str,
        decode_chunk_len: int,
        T: int,
        num_encoder_layers: List[int],
        encoder_dims: List[int],
        cnn_module_kernels: List[int],
        left_context_len: List[int],
        query_head_dims: List[int],
        value_head_dims: List[int],
        num_heads: List[int],
    ):
        self.model_type = model_type
        self.decode_chunk_len = decode_chunk_len
        self.T = T
        self.num_encoder_layers = num_encoder_layers
        self.encoder_dims = encoder_dims
        self.cnn_module_kernels = cnn_module_kernels
        self.left_context_len = left_context_len
        self.query_head_dims = query_head_dims
        self.value_head_dims = value_head_dims
        self.num_heads = num_heads

    def __str__(self) -> str:
        return self.to_str()

    def to_str(self) -> str:
        """Serialize to ``key=val;key=v1,v2,...`` form.

        The result must stay below 1024 characters -- presumably the
        custom_string size limit; confirm against the RKNN toolkit docs.
        """
        fields = [
            f"model_type={self.model_type}",
            f"decode_chunk_len={self.decode_chunk_len}",
            f"T={self.T}",
        ]
        for name in (
            "num_encoder_layers",
            "encoder_dims",
            "cnn_module_kernels",
            "left_context_len",
            "query_head_dims",
            "value_head_dims",
            "num_heads",
        ):
            values = getattr(self, name)
            fields.append(name + "=" + ",".join(str(v) for v in values))

        s = ";".join(fields)

        assert len(s) < 1024, (s, len(s))

        return s
|
||||
|
||||
|
||||
def get_meta_data(model: str):
    """Read the custom metadata map of an onnx model and parse it into a
    MetaData instance; also prints the model's inputs/outputs for debugging.

    Args:
      model:
        Path to the onnx model.
    """
    import onnxruntime

    session_opts = onnxruntime.SessionOptions()
    session_opts.inter_op_num_threads = 1
    session_opts.intra_op_num_threads = 1

    m = onnxruntime.InferenceSession(
        model,
        sess_options=session_opts,
        providers=["CPUExecutionProvider"],
    )

    for i in m.get_inputs():
        print(i)

    print("-----")

    for i in m.get_outputs():
        print(i)

    meta = m.get_modelmeta().custom_metadata_map
    print(meta)
    # Example metadata map:
    # {'num_heads': '4,4,4,8,4,4', 'query_head_dims': '32,32,32,32,32,32',
    #  'cnn_module_kernels': '31,31,15,15,15,31',
    #  'num_encoder_layers': '2,2,3,4,3,2', ' version': '1',
    #  'comment': 'streaming ctc zipformer2',
    #  'model_type': 'zipformer2',
    #  'encoder_dims': '192,256,384,512,384,256',
    #  'model_author': 'k2-fsa', 'T': '77',
    #  'value_head_dims': '12,12,12,12,12,12',
    #  'left_context_len': '128,64,32,16,32,64',
    #  'decode_chunk_len': '64'}

    def to_int_list(s):
        # "2,2,3" -> [2, 2, 3]
        return list(map(int, s.split(",")))

    return MetaData(
        model_type=meta["model_type"],
        decode_chunk_len=int(meta["decode_chunk_len"]),
        T=int(meta["T"]),
        num_encoder_layers=to_int_list(meta["num_encoder_layers"]),
        encoder_dims=to_int_list(meta["encoder_dims"]),
        cnn_module_kernels=to_int_list(meta["cnn_module_kernels"]),
        left_context_len=to_int_list(meta["left_context_len"]),
        query_head_dims=to_int_list(meta["query_head_dims"]),
        value_head_dims=to_int_list(meta["value_head_dims"]),
        num_heads=to_int_list(meta["num_heads"]),
    )
|
||||
|
||||
|
||||
def export_rknn(rknn, filename):
    """Export a built RKNN object to ``filename``.

    Exits the process with an error message on failure.
    """
    ret = rknn.export_rknn(filename)
    if ret != 0:
        # Fix: the original message printed the literal "(unknown)" instead
        # of naming the output path.
        exit(f"Export rknn model to {filename} failed!")
|
||||
|
||||
|
||||
class RKNNModel:
    """A streaming zipformer2 CTC model converted from onnx to rknn,
    runnable on the RKNN CPU simulator."""

    def __init__(self, model: str, target_platform="rk3588"):
        self.meta = get_meta_data(model)
        self.model = init_model(model, custom_string=self.meta.to_str())

    def export_rknn(self, model: str):
        """Write the converted model to ``model``."""
        export_rknn(self.model, model)

    def release(self):
        """Free the native resources held by the RKNN instance."""
        self.model.release()

    def get_init_states(
        self,
    ) -> List[np.ndarray]:
        """Create the zeroed initial streaming states the model expects.

        Per encoder stack and per layer, six tensors: cached_key,
        cached_nonlin_attn, cached_val1, cached_val2, cached_conv1,
        cached_conv2; followed by the embed state and processed_lens.
        """
        meta = self.meta
        batch = 1  # the exported model is batch-1
        states: List[np.ndarray] = []

        for i, num_layers in enumerate(meta.num_encoder_layers):
            embed_dim = meta.encoder_dims[i]
            ctx = meta.left_context_len[i]
            key_dim = meta.query_head_dims[i] * meta.num_heads[i]
            value_dim = meta.value_head_dims[i] * meta.num_heads[i]
            nonlin_attn_head_dim = 3 * embed_dim // 4
            conv_left_pad = meta.cnn_module_kernels[i] // 2

            for _ in range(num_layers):
                states.extend(
                    [
                        np.zeros((ctx, batch, key_dim), dtype=np.float32),
                        np.zeros(
                            (1, batch, ctx, nonlin_attn_head_dim),
                            dtype=np.float32,
                        ),
                        np.zeros((ctx, batch, value_dim), dtype=np.float32),
                        np.zeros((ctx, batch, value_dim), dtype=np.float32),
                        np.zeros(
                            (batch, embed_dim, conv_left_pad),
                            dtype=np.float32,
                        ),
                        np.zeros(
                            (batch, embed_dim, conv_left_pad),
                            dtype=np.float32,
                        ),
                    ]
                )

        # Embed-module state; shape hard-coded to match the exported model
        # (assumed fixed by the exporter -- TODO confirm).
        states.append(np.zeros((batch, 128, 3, 19), dtype=np.float32))
        states.append(np.zeros((batch,), dtype=np.int64))

        return states

    def run_model(self, x: np.ndarray, states: List[np.ndarray]):
        """
        Args:
          x: (T, C), np.float32
          states: A list of states
        Returns:
          A tuple (log_probs, next_states); log_probs has shape (N, T, C).
        """
        batched = np.expand_dims(x, axis=0)

        out = self.model.inference(inputs=[batched] + states, data_format="nchw")
        # out[0]: log_probs, (N, T, C)
        return out[0], out[1:]
|
||||
|
||||
|
||||
def main():
    """Decode one wave file with the rknn CPU simulator and print the text."""
    args = get_parser().parse_args()
    print(vars(args))

    id2token = load_tokens(args.tokens)
    features = compute_features(args.wav)
    model = RKNNModel(
        model=args.model,
    )
    print(model.meta)

    states = model.get_init_states()

    segment = model.meta.T  # frames consumed per chunk
    offset = model.meta.decode_chunk_len  # frames advanced per chunk

    ans = []
    blank = 0
    prev = -1
    i = 0
    while True:
        if i + segment > features.shape[0]:
            break
        x = features[i : i + segment]
        i += offset
        log_probs, states = model.run_model(x, states)
        log_probs = log_probs[0]  # (N, T, C) -> (T, C)
        ids = log_probs.argmax(axis=1)
        # Greedy CTC decoding: drop blanks and collapse repeats.
        # Bug fix: the original loop compared/appended the chunk offset ``i``
        # instead of the emitted token id ``k``, and only updated ``prev``
        # inside the condition, so repeats were not collapsed correctly.
        for k in ids:
            if k != blank and k != prev:
                ans.append(int(k))
            prev = k
    tokens = [id2token[i] for i in ans]
    underline = "▁"
    # underline = b"\xe2\x96\x81".decode()
    text = "".join(tokens).replace(underline, " ").strip()

    print(ans)
    print(args.wav)
    print(text)


if __name__ == "__main__":
    main()
|
1
egs/multi_zh-hans/ASR/zipformer/export_rknn_ctc_streaming.py
Symbolic link
1
egs/multi_zh-hans/ASR/zipformer/export_rknn_ctc_streaming.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/zipformer/export_rknn_ctc_streaming.py
|
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/zipformer/export_rknn_transducer_streaming.py
|
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py
|
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/zipformer/export_rknn_transducer_streaming.py
|
1
egs/wenetspeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc.py
Symbolic link
1
egs/wenetspeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc.py
|
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/zipformer/test_rknn_on_cpu_simulator_ctc_streaming.py
|
Loading…
x
Reference in New Issue
Block a user