diff --git a/egs/audioset/AT/zipformer/export-onnx.py b/egs/audioset/AT/zipformer/export-onnx.py
index af83c0e9c..a8b96fde9 100755
--- a/egs/audioset/AT/zipformer/export-onnx.py
+++ b/egs/audioset/AT/zipformer/export-onnx.py
@@ -6,56 +6,28 @@
 """
-This script exports a transducer model from PyTorch to ONNX.
+This script exports an audio tagging model from PyTorch to ONNX.
 
-We use the pre-trained model from
-https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
-as an example to show how to use this file.
+Usage of this script:
 
-1. Download the pre-trained model
+  repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12
+  repo=$(basename $repo_url)
+  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+  pushd $repo/exp
+  git lfs pull --include pretrained.pt
+  ln -s pretrained.pt epoch-99.pt
+  popd
 
-cd egs/librispeech/ASR
+  python3 zipformer/export-onnx.py \
+    --exp-dir $repo/exp \
+    --epoch 99 \
+    --avg 1 \
+    --use-averaged-model 0
 
-repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12#/
-GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
-repo=$(basename $repo_url)
+  pushd $repo/exp
+  mv model-epoch-99-avg-1.onnx model.onnx
+  mv model-epoch-99-avg-1.int8.onnx model.int8.onnx
+  popd
 
-pushd $repo
-git lfs pull --include "exp/pretrained.pt"
-
-cd exp
-ln -s pretrained.pt epoch-99.pt
-popd
-
-2. Export the model to ONNX
-
-./zipformer/export-onnx.py \
-  --use-averaged-model 0 \
-  --epoch 99 \
-  --avg 1 \
-  --exp-dir $repo/exp \
-  --num-encoder-layers "2,2,3,4,3,2" \
-  --downsampling-factor "1,2,4,8,4,2" \
-  --feedforward-dim "512,768,1024,1536,1024,768" \
-  --num-heads "4,4,4,8,4,4" \
-  --encoder-dim "192,256,384,512,384,256" \
-  --query-head-dim 32 \
-  --value-head-dim 12 \
-  --pos-head-dim 4 \
-  --pos-dim 48 \
-  --encoder-unmasked-dim "192,192,256,256,256,192" \
-  --cnn-module-kernel "31,31,15,15,15,31" \
-  --decoder-dim 512 \
-  --joiner-dim 512 \
-  --causal False \
-  --chunk-size "16,32,64,-1" \
-  --left-context-frames "64,128,256,-1"
-
-It will generate the following 3 files inside $repo/exp:
-
-  - encoder-epoch-99-avg-1.onnx
-  - decoder-epoch-99-avg-1.onnx
-  - joiner-epoch-99-avg-1.onnx
-
-See ./onnx_pretrained.py and ./onnx_check.py for how to
+See ./onnx_pretrained.py for how to
 use the exported ONNX models.
 """
@@ -261,6 +233,18 @@ def export_audio_tagging_model_onnx(
     add_meta_data(filename=filename, meta_data=meta_data)
 
 
+def optimize_model(filename):
+    # see https://github.com/microsoft/onnxruntime/issues/1899#issuecomment-534806537
+    # Note: onnx.optimizer was removed from the onnx package in onnx >= 1.9;
+    # the passes now live in the standalone onnxoptimizer package.
+    import onnxoptimizer
+
+    passes = ["extract_constant_to_initializer", "eliminate_unused_initializer"]
+    onnx_model = onnx.load(filename)
+    optimized_model = onnxoptimizer.optimize(onnx_model, passes)
+    onnx.save(optimized_model, filename)
+
+
 @torch.no_grad()
 def main():
     args = get_parser().parse_args()
@@ -389,6 +371,7 @@ def main():
         model_filename,
         opset_version=opset_version,
     )
+    optimize_model(model_filename)
     logging.info(f"Exported audio tagging model to {model_filename}")
 
     # Generate int8 quantization models
@@ -403,6 +386,7 @@ def main():
         op_types_to_quantize=["MatMul"],
         weight_type=QuantType.QInt8,
     )
+    optimize_model(model_filename_int8)
 
 
 if __name__ == "__main__":
diff --git a/egs/audioset/AT/zipformer/export.py b/egs/audioset/AT/zipformer/export.py
index 38bf6c8c5..6ceeca8de 100755
--- a/egs/audioset/AT/zipformer/export.py
+++ b/egs/audioset/AT/zipformer/export.py
@@ -42,6 +42,7 @@ load it by `torch.jit.load("jit_script.pt")`.
 Check ./jit_pretrained.py for its usage.
 
 Check https://github.com/k2-fsa/sherpa
+and https://github.com/k2-fsa/sherpa-onnx
 for how to use the exported models outside of icefall.
 
 (2) Export `model.state_dict()`
@@ -55,13 +56,13 @@
 It will generate a file `pretrained.pt` in the given `exp_dir`. You can
 later load it by `icefall.checkpoint.load_checkpoint()`.
 
-To use the generated file with `zipformer/decode.py`,
+To use the generated file with `zipformer/evaluate.py`,
 you can do:
 
     cd /path/to/exp_dir
     ln -s pretrained.pt epoch-9999.pt
 
-    cd /path/to/egs/librispeech/ASR
+    cd /path/to/egs/audioset/AT
     ./zipformer/evaluate.py \
         --exp-dir ./zipformer/exp \
         --use-averaged-model False \
diff --git a/egs/audioset/AT/zipformer/jit_pretrained.py b/egs/audioset/AT/zipformer/jit_pretrained.py
index 8e3afcb6f..b46d3921b 100755
--- a/egs/audioset/AT/zipformer/jit_pretrained.py
+++ b/egs/audioset/AT/zipformer/jit_pretrained.py
@@ -28,10 +28,20 @@ You can use the following command to get the exported models:
 
 Usage of this script:
 
-./zipformer/jit_pretrained.py \
-  --nn-model-filename ./zipformer/exp/cpu_jit.pt \
-  /path/to/foo.wav \
-  /path/to/bar.wav
+  repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12
+  repo=$(basename $repo_url)
+  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+  pushd $repo/exp
+  git lfs pull --include pretrained.pt
+  popd
+
+  python3 zipformer/jit_pretrained.py \
+    --nn-model-filename $repo/exp/jit_script.pt \
+    --label-dict $repo/data/class_labels_indices.csv \
+    $repo/test_wavs/1.wav \
+    $repo/test_wavs/2.wav \
+    $repo/test_wavs/3.wav \
+    $repo/test_wavs/4.wav
 """
 
 import argparse
@@ -168,7 +178,8 @@ def main():
         topk_prob, topk_index = logit.sigmoid().topk(5)
         topk_labels = [label_dict[index.item()] for index in topk_index]
         logging.info(
-            f"{filename}: Top 5 predicted labels are {topk_labels} with probability of {topk_prob.tolist()}"
+            f"{filename}: Top 5 predicted labels are {topk_labels} with "
+            f"probability of {topk_prob.tolist()}"
         )
 
     logging.info("Done")
diff --git a/egs/audioset/AT/zipformer/onnx_pretrained.py b/egs/audioset/AT/zipformer/onnx_pretrained.py
index c7753715a..1d3093d99 100755
--- a/egs/audioset/AT/zipformer/onnx_pretrained.py
+++ b/egs/audioset/AT/zipformer/onnx_pretrained.py
@@ -17,48 +17,25 @@
 # limitations under the License.
 """
 This script loads ONNX models and uses them to decode waves.
-You can use the following command to get the exported models:
 
-We use the pre-trained model from
-https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12#/
-as an example to show how to use this file.
+Usage of this script:
 
-1. Download the pre-trained model
+  repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12
+  repo=$(basename $repo_url)
+  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
+  pushd $repo/exp
+  git lfs pull --include "*.onnx"
+  popd
 
-cd egs/librispeech/ASR
-
-repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12#/
-GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
-repo=$(basename $repo_url)
-
-pushd $repo
-git lfs pull --include "exp/pretrained.pt"
-
-cd exp
-ln -s pretrained.pt epoch-99.pt
-popd
-
-2. Export the model to ONNX
-
-./zipformer/export-onnx.py \
-  --use-averaged-model 0 \
-  --epoch 99 \
-  --avg 1 \
-  --exp-dir $repo/exp \
-  --causal False
-
-It will generate the following 3 files inside $repo/exp:
-
-  - model-epoch-99-avg-1.onnx
-
-3. Run this file
-
-./zipformer/onnx_pretrained.py \
-  --model-filename $repo/exp/model-epoch-99-avg-1.onnx \
-  --tokens $repo/data/lang_bpe_500/tokens.txt \
-  $repo/test_wavs/1089-134686-0001.wav \
-  $repo/test_wavs/1221-135766-0001.wav \
-  $repo/test_wavs/1221-135766-0002.wav
+  for m in model.onnx model.int8.onnx; do
+    python3 zipformer/onnx_pretrained.py \
+      --model-filename $repo/exp/$m \
+      --label-dict $repo/data/class_labels_indices.csv \
+      $repo/test_wavs/1.wav \
+      $repo/test_wavs/2.wav \
+      $repo/test_wavs/3.wav \
+      $repo/test_wavs/4.wav
+  done
 """
 
 import argparse
diff --git a/egs/audioset/AT/zipformer/pretrained.py b/egs/audioset/AT/zipformer/pretrained.py
index 460e8b2ed..bdbd799fa 100755
--- a/egs/audioset/AT/zipformer/pretrained.py
+++ b/egs/audioset/AT/zipformer/pretrained.py
@@ -33,7 +33,10 @@ Usage of this script:
     python3 zipformer/pretrained.py \
       --checkpoint $repo/exp/pretrained.pt \
       --label-dict $repo/data/class_labels_indices.csv \
-      $repo/test_wavs/1.wav
+      $repo/test_wavs/1.wav \
+      $repo/test_wavs/2.wav \
+      $repo/test_wavs/3.wav \
+      $repo/test_wavs/4.wav
 """
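
As a quick sanity check after the export and rename steps above, a short script along these lines can confirm that both renamed ONNX files are well-formed and loadable by onnxruntime, which is what the passes in optimize_model() are meant to guarantee. This is a minimal sketch, not part of the patch: it assumes it is run inside $repo/exp, and the file names are the ones produced by the mv commands above.

    # sanity_check.py: verify the exported/optimized ONNX files load cleanly
    import onnx
    import onnxruntime as ort

    for filename in ["model.onnx", "model.int8.onnx"]:
        # structural validation of the graph
        onnx.checker.check_model(onnx.load(filename))
        # creating a session exercises the onnxruntime loader, which is the
        # failure mode the optimizer passes work around
        ort.InferenceSession(filename, providers=["CPUExecutionProvider"])
        print(f"{filename}: OK")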