Add CI test for the AudioSet recipe. (#1585)

This commit is contained in:
Fangjun Kuang 2024-04-09 17:45:00 +08:00 committed by GitHub
parent f5d7818733
commit fa5d861af0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 360 additions and 114 deletions

94
.github/scripts/audioset/AT/run.sh vendored Executable file
View File

@ -0,0 +1,94 @@
#!/usr/bin/env bash
# CI smoke test for the AudioSet audio-tagging recipe (egs/audioset/AT).
# -e: stop at the first failing command; -x: echo each command into the log.
set -ex
# zipformer/export-onnx.py imports both onnxoptimizer and onnxsim.
python3 -m pip install onnxoptimizer onnxsim
log() {
  # Print a timestamped message tagged with the caller's file name, line
  # number and function name. (This function is from espnet.)
  local caller_file="${BASH_SOURCE[1]##*/}"
  local caller_line="${BASH_LINENO[0]}"
  local caller_func="${FUNCNAME[1]}"
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${caller_line}:${caller_func}) $*"
}
# The zipformer/*.py scripts below are invoked relative to the recipe directory.
cd egs/audioset/AT
# Download the pre-trained AudioSet audio-tagging model and exercise it
# end to end:
#   1. run inference with the PyTorch checkpoint,
#   2. export to TorchScript and run inference with the exported model,
#   3. export to ONNX (float32 and int8) and run inference with BOTH models,
#   4. stage the ONNX models, label files and test waves in /icefall/model-onnx
#      so that later workflow steps can publish them.
function test_pretrained() {
  repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12
  repo=$(basename $repo_url)

  # Clone without LFS payloads, then fetch only the checkpoint we need.
  GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
  pushd $repo/exp
  git lfs pull --include pretrained.pt
  # The export scripts are run with --epoch 99 --avg 1, so expose the
  # checkpoint under the name epoch-99.pt.
  ln -s pretrained.pt epoch-99.pt
  ls -lh
  popd

  log "test pretrained.pt"

  python3 zipformer/pretrained.py \
    --checkpoint $repo/exp/pretrained.pt \
    --label-dict $repo/data/class_labels_indices.csv \
    $repo/test_wavs/1.wav \
    $repo/test_wavs/2.wav \
    $repo/test_wavs/3.wav \
    $repo/test_wavs/4.wav

  log "test jit export"
  ls -lh $repo/exp/
  python3 zipformer/export.py \
    --exp-dir $repo/exp \
    --epoch 99 \
    --avg 1 \
    --use-averaged-model 0 \
    --jit 1
  ls -lh $repo/exp/

  log "test jit models"
  python3 zipformer/jit_pretrained.py \
    --nn-model-filename $repo/exp/jit_script.pt \
    --label-dict $repo/data/class_labels_indices.csv \
    $repo/test_wavs/1.wav \
    $repo/test_wavs/2.wav \
    $repo/test_wavs/3.wav \
    $repo/test_wavs/4.wav

  log "test onnx export"
  ls -lh $repo/exp/
  python3 zipformer/export-onnx.py \
    --exp-dir $repo/exp \
    --epoch 99 \
    --avg 1 \
    --use-averaged-model 0
  ls -lh $repo/exp/

  # Give the exported models their release names.
  pushd $repo/exp/
  mv model-epoch-99-avg-1.onnx model.onnx
  mv model-epoch-99-avg-1.int8.onnx model.int8.onnx
  popd

  ls -lh $repo/exp/

  log "test onnx models"
  for m in model.onnx model.int8.onnx; do
    log "$m"
    # Use the loop variable so that the int8 model is tested as well;
    # previously model.onnx was hard-coded and simply ran twice.
    python3 zipformer/onnx_pretrained.py \
      --model-filename $repo/exp/$m \
      --label-dict $repo/data/class_labels_indices.csv \
      $repo/test_wavs/1.wav \
      $repo/test_wavs/2.wav \
      $repo/test_wavs/3.wav \
      $repo/test_wavs/4.wav
  done

  log "prepare data for uploading to huggingface"
  # /icefall is the repository checkout mounted into the CI container, so
  # this directory is visible to the workflow as ./model-onnx.
  dst=/icefall/model-onnx
  mkdir -p $dst
  cp -v $repo/exp/*.onnx $dst/
  cp -v $repo/data/* $dst/
  cp -av $repo/test_wavs $dst

  ls -lh $dst
  ls -lh $dst/test_wavs
}
# Entry point: run the end-to-end check defined above.
test_pretrained

View File

@ -49,6 +49,8 @@ RUN pip install --no-cache-dir \
multi_quantization \ multi_quantization \
numba \ numba \
numpy \ numpy \
onnxoptimizer \
onnxsim \
onnx \ onnx \
onnxmltools \ onnxmltools \
onnxruntime \ onnxruntime \

137
.github/workflows/audioset.yml vendored Normal file
View File

@ -0,0 +1,137 @@
# CI workflow for the AudioSet audio-tagging recipe.
#
# It runs .github/scripts/audioset/AT/run.sh inside the icefall CPU docker
# images (one job per python/torch combination from the build matrix) and,
# for a single matrix entry on pushes only, publishes the exported ONNX
# models to huggingface and to a GitHub release of k2-fsa/sherpa-onnx.
name: audioset

on:
  push:
    branches:
      - master

  pull_request:
    branches:
      - master

  workflow_dispatch:

# A newer run for the same ref cancels the one still in progress.
concurrency:
  group: audioset-${{ github.ref }}
  cancel-in-progress: true

jobs:
  generate_build_matrix:
    if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
    # see https://github.com/pytorch/pytorch/pull/50633
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Generating build matrix
        id: set-matrix
        run: |
          # outputting for debugging purposes
          python ./.github/scripts/docker/generate_build_matrix.py
          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
          # The `::set-output` workflow command is deprecated; step outputs
          # are written to the file named by $GITHUB_OUTPUT instead.
          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"

  audioset:
    needs: generate_build_matrix
    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Free space
        shell: bash
        run: |
          ls -lh
          df -h
          rm -rf /opt/hostedtoolcache
          df -h
          echo "pwd: $PWD"
          echo "github.workspace ${{ github.workspace }}"

      - name: Run tests
        uses: addnab/docker-run-action@v3
        with:
          image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
          # The checkout is mounted at /icefall, so files the script writes
          # to /icefall/model-onnx show up in the workspace as ./model-onnx.
          options: |
            --volume ${{ github.workspace }}/:/icefall
          shell: bash
          run: |
            export PYTHONPATH=/icefall:$PYTHONPATH
            cd /icefall
            git config --global --add safe.directory /icefall

            .github/scripts/audioset/AT/run.sh

      - name: Show model files
        shell: bash
        run: |
          # NOTE(review): presumably the files were created by a different
          # user inside the container, hence the chown - confirm.
          sudo chown -R runner ./model-onnx
          ls -lh ./model-onnx
          chmod -x ./model-onnx/class_labels_indices.csv

          echo "----------"
          ls -lh ./model-onnx/*

      - name: Upload model to huggingface
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        uses: nick-fields/retry@v3
        with:
          max_attempts: 20
          timeout_seconds: 200
          shell: bash
          command: |
            git config --global user.email "csukuangfj@gmail.com"
            git config --global user.name "Fangjun Kuang"

            rm -rf huggingface
            export GIT_LFS_SKIP_SMUDGE=1

            git clone https://huggingface.co/k2-fsa/sherpa-onnx-zipformer-audio-tagging-2024-04-09 huggingface
            cd huggingface
            git fetch
            git pull
            git merge -m "merge remote" --ff origin main
            cp ../model-onnx/*.onnx ./
            cp ../model-onnx/*.csv ./
            cp -a ../model-onnx/test_wavs ./
            ls -lh
            git add .
            git status
            git commit -m "update models"
            git status

            # NOTE(review): `|| true` hides a failed push from the retry
            # wrapper, so a push failure is never retried - confirm intended.
            git push https://csukuangfj:$HF_TOKEN@huggingface.co/k2-fsa/sherpa-onnx-zipformer-audio-tagging-2024-04-09 main || true
            rm -rf huggingface

      - name: Prepare for release
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
        shell: bash
        run: |
          d=sherpa-onnx-zipformer-audio-tagging-2024-04-09
          mv ./model-onnx $d
          tar cjvf ${d}.tar.bz2 $d
          ls -lh

      - name: Release exported onnx models
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0' && github.event_name == 'push'
        uses: svenstaro/upload-release-action@v2
        with:
          file_glob: true
          overwrite: true
          file: sherpa-onnx-*.tar.bz2
          repo_name: k2-fsa/sherpa-onnx
          repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }}
          tag: audio-tagging-models

View File

@ -55,6 +55,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -55,6 +55,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -69,6 +69,8 @@ RUN pip uninstall -y tqdm && \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -56,6 +56,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -56,6 +56,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -56,6 +56,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -56,6 +56,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -56,6 +56,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -56,6 +56,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -56,6 +56,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -56,6 +56,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -56,6 +56,8 @@ RUN pip install --no-cache-dir \
onnx \ onnx \
onnxruntime \ onnxruntime \
onnxmltools \ onnxmltools \
onnxoptimizer \
onnxsim \
multi_quantization \ multi_quantization \
typeguard \ typeguard \
numpy \ numpy \

View File

@ -6,56 +6,28 @@
""" """
This script exports an audio tagging model from PyTorch to ONNX. This script exports an audio tagging model from PyTorch to ONNX.
We use the pre-trained model from Usage of this script:
https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
as an example to show how to use this file.
1. Download the pre-trained model repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12
repo=$(basename $repo_url)
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
pushd $repo/exp
git lfs pull --include pretrained.pt
ln -s pretrained.pt epoch-99.pt
popd
cd egs/librispeech/ASR python3 zipformer/export-onnx.py \
--exp-dir $repo/exp \
--epoch 99 \
--avg 1 \
--use-averaged-model 0
repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12#/ pushd $repo/exp
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url mv model-epoch-99-avg-1.onnx model.onnx
repo=$(basename $repo_url) mv model-epoch-99-avg-1.int8.onnx model.int8.onnx
popd
pushd $repo See ./onnx_pretrained.py
git lfs pull --include "exp/pretrained.pt"
cd exp
ln -s pretrained.pt epoch-99.pt
popd
2. Export the model to ONNX
./zipformer/export-onnx.py \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $repo/exp \
--num-encoder-layers "2,2,3,4,3,2" \
--downsampling-factor "1,2,4,8,4,2" \
--feedforward-dim "512,768,1024,1536,1024,768" \
--num-heads "4,4,4,8,4,4" \
--encoder-dim "192,256,384,512,384,256" \
--query-head-dim 32 \
--value-head-dim 12 \
--pos-head-dim 4 \
--pos-dim 48 \
--encoder-unmasked-dim "192,192,256,256,256,192" \
--cnn-module-kernel "31,31,15,15,15,31" \
--decoder-dim 512 \
--joiner-dim 512 \
--causal False \
--chunk-size "16,32,64,-1" \
--left-context-frames "64,128,256,-1"
It will generate the following 3 files inside $repo/exp:
- encoder-epoch-99-avg-1.onnx
- decoder-epoch-99-avg-1.onnx
- joiner-epoch-99-avg-1.onnx
See ./onnx_pretrained.py and ./onnx_check.py for how to
use the exported ONNX models. use the exported ONNX models.
""" """
@ -66,9 +38,11 @@ from typing import Dict
import k2 import k2
import onnx import onnx
import onnxoptimizer
import torch import torch
import torch.nn as nn import torch.nn as nn
from onnxruntime.quantization import QuantType, quantize_dynamic from onnxruntime.quantization import QuantType, quantize_dynamic
from onnxsim import simplify
from scaling_converter import convert_scaled_to_non_scaled from scaling_converter import convert_scaled_to_non_scaled
from train import add_model_arguments, get_model, get_params from train import add_model_arguments, get_model, get_params
from zipformer import Zipformer2 from zipformer import Zipformer2
@ -261,6 +235,29 @@ def export_audio_tagging_model_onnx(
add_meta_data(filename=filename, meta_data=meta_data) add_meta_data(filename=filename, meta_data=meta_data)
def optimize_model(filename):
    """Optimize the ONNX model stored in ``filename`` and overwrite the file.

    The model is loaded, run through the onnxoptimizer pass
    ``eliminate_unused_initializer`` and then handed to ``onnxsim.simplify``.
    The simplified graph is kept only when ``simplify`` reports a successful
    check; otherwise the optimized (but not simplified) graph is saved.

    Args:
      filename:
        Path of the ONNX model file. It is read and then rewritten in place.
    """
    # Background on why these steps are applied:
    #   https://github.com/microsoft/onnxruntime/issues/1899#issuecomment-534806537
    #   https://github.com/onnx/onnx/issues/582#issuecomment-937788108
    #   https://github.com/onnx/optimizer/issues/110
    #   https://qiita.com/Yossy_Hal/items/34f3b2aef2199baf7f5f
    model = onnxoptimizer.optimize(
        onnx.load(filename),
        ["eliminate_unused_initializer"],
    )

    simplified, ok = simplify(model)
    if not ok:
        logging.info("Failed to simplify the model!")
    else:
        logging.info("Simplified the model!")
        model = simplified

    onnx.save(model, filename)
@torch.no_grad() @torch.no_grad()
def main(): def main():
args = get_parser().parse_args() args = get_parser().parse_args()
@ -389,6 +386,7 @@ def main():
model_filename, model_filename,
opset_version=opset_version, opset_version=opset_version,
) )
optimize_model(model_filename)
logging.info(f"Exported audio tagging model to {model_filename}") logging.info(f"Exported audio tagging model to {model_filename}")
# Generate int8 quantization models # Generate int8 quantization models
@ -403,6 +401,7 @@ def main():
op_types_to_quantize=["MatMul"], op_types_to_quantize=["MatMul"],
weight_type=QuantType.QInt8, weight_type=QuantType.QInt8,
) )
optimize_model(model_filename_int8)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -25,7 +25,7 @@
Usage: Usage:
Note: This is a example for librispeech dataset, if you are using different Note: This is an example for AudioSet dataset, if you are using different
dataset, you should change the argument values according to your dataset. dataset, you should change the argument values according to your dataset.
(1) Export to torchscript model using torch.jit.script() (1) Export to torchscript model using torch.jit.script()
@ -42,6 +42,7 @@ load it by `torch.jit.load("jit_script.pt")`.
Check ./jit_pretrained.py for its usage. Check ./jit_pretrained.py for its usage.
Check https://github.com/k2-fsa/sherpa Check https://github.com/k2-fsa/sherpa
and https://github.com/k2-fsa/sherpa-onnx
for how to use the exported models outside of icefall. for how to use the exported models outside of icefall.
(2) Export `model.state_dict()` (2) Export `model.state_dict()`
@ -55,13 +56,13 @@ for how to use the exported models outside of icefall.
It will generate a file `pretrained.pt` in the given `exp_dir`. You can later It will generate a file `pretrained.pt` in the given `exp_dir`. You can later
load it by `icefall.checkpoint.load_checkpoint()`. load it by `icefall.checkpoint.load_checkpoint()`.
To use the generated file with `zipformer/decode.py`, To use the generated file with `zipformer/evaluate.py`,
you can do: you can do:
cd /path/to/exp_dir cd /path/to/exp_dir
ln -s pretrained.pt epoch-9999.pt ln -s pretrained.pt epoch-9999.pt
cd /path/to/egs/librispeech/ASR cd /path/to/egs/audioset/AT
./zipformer/evaluate.py \ ./zipformer/evaluate.py \
--exp-dir ./zipformer/exp \ --exp-dir ./zipformer/exp \
--use-averaged-model False \ --use-averaged-model False \

View File

@ -28,10 +28,20 @@ You can use the following command to get the exported models:
Usage of this script: Usage of this script:
./zipformer/jit_pretrained.py \ repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12
--nn-model-filename ./zipformer/exp/cpu_jit.pt \ repo=$(basename $repo_url)
/path/to/foo.wav \ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
/path/to/bar.wav pushd $repo/exp
git lfs pull --include jit_script.pt
popd
python3 zipformer/jit_pretrained.py \
--nn-model-filename $repo/exp/jit_script.pt \
--label-dict $repo/data/class_labels_indices.csv \
$repo/test_wavs/1.wav \
$repo/test_wavs/2.wav \
$repo/test_wavs/3.wav \
$repo/test_wavs/4.wav
""" """
import argparse import argparse
@ -168,7 +178,8 @@ def main():
topk_prob, topk_index = logit.sigmoid().topk(5) topk_prob, topk_index = logit.sigmoid().topk(5)
topk_labels = [label_dict[index.item()] for index in topk_index] topk_labels = [label_dict[index.item()] for index in topk_index]
logging.info( logging.info(
f"{filename}: Top 5 predicted labels are {topk_labels} with probability of {topk_prob.tolist()}" f"{filename}: Top 5 predicted labels are {topk_labels} with "
f"probability of {topk_prob.tolist()}"
) )
logging.info("Done") logging.info("Done")

View File

@ -17,48 +17,25 @@
# limitations under the License. # limitations under the License.
""" """
This script loads ONNX models and uses them to decode waves. This script loads ONNX models and uses them to decode waves.
You can use the following command to get the exported models:
We use the pre-trained model from Usage of this script:
https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12#/
as an example to show how to use this file.
1. Download the pre-trained model repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12
repo=$(basename $repo_url)
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
pushd $repo/exp
git lfs pull --include "*.onnx"
popd
cd egs/librispeech/ASR for m in model.onnx model.int8.onnx; do
python3 zipformer/onnx_pretrained.py \
repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12#/ --model-filename $repo/exp/$m \
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url --label-dict $repo/data/class_labels_indices.csv \
repo=$(basename $repo_url) $repo/test_wavs/1.wav \
$repo/test_wavs/2.wav \
pushd $repo $repo/test_wavs/3.wav \
git lfs pull --include "exp/pretrained.pt" $repo/test_wavs/4.wav
done
cd exp
ln -s pretrained.pt epoch-99.pt
popd
2. Export the model to ONNX
./zipformer/export-onnx.py \
--use-averaged-model 0 \
--epoch 99 \
--avg 1 \
--exp-dir $repo/exp \
--causal False
It will generate the following 3 files inside $repo/exp:
- model-epoch-99-avg-1.onnx
3. Run this file
./zipformer/onnx_pretrained.py \
--model-filename $repo/exp/model-epoch-99-avg-1.onnx \
--tokens $repo/data/lang_bpe_500/tokens.txt \
$repo/test_wavs/1089-134686-0001.wav \
$repo/test_wavs/1221-135766-0001.wav \
$repo/test_wavs/1221-135766-0002.wav
""" """
import argparse import argparse

View File

@ -18,27 +18,25 @@
This script loads a checkpoint and uses it to decode waves. This script loads a checkpoint and uses it to decode waves.
You can generate the checkpoint with the following command: You can generate the checkpoint with the following command:
Note: This is a example for librispeech dataset, if you are using different Note: This is an example for the AudioSet dataset, if you are using different
dataset, you should change the argument values according to your dataset. dataset, you should change the argument values according to your dataset.
./zipformer/export.py \
--exp-dir ./zipformer/exp \
--tokens data/lang_bpe_500/tokens.txt \
--epoch 30 \
--avg 9
Usage of this script: Usage of this script:
./zipformer/pretrained.py \ repo_url=https://huggingface.co/marcoyang/icefall-audio-tagging-audioset-zipformer-2024-03-12
--checkpoint ./zipformer/exp/pretrained.pt \ repo=$(basename $repo_url)
/path/to/foo.wav \ GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
/path/to/bar.wav pushd $repo/exp
git lfs pull --include pretrained.pt
popd
python3 zipformer/pretrained.py \
You can also use `./zipformer/exp/epoch-xx.pt`. --checkpoint $repo/exp/pretrained.pt \
--label-dict $repo/data/class_labels_indices.csv \
Note: ./zipformer/exp/pretrained.pt is generated by ./zipformer/export.py $repo/test_wavs/1.wav \
$repo/test_wavs/2.wav \
$repo/test_wavs/3.wav \
$repo/test_wavs/4.wav
""" """
@ -189,7 +187,8 @@ def main():
topk_prob, topk_index = logit.sigmoid().topk(5) topk_prob, topk_index = logit.sigmoid().topk(5)
topk_labels = [label_dict[index.item()] for index in topk_index] topk_labels = [label_dict[index.item()] for index in topk_index]
logging.info( logging.info(
f"{filename}: Top 5 predicted labels are {topk_labels} with probability of {topk_prob.tolist()}" f"{filename}: Top 5 predicted labels are {topk_labels} with "
f"probability of {topk_prob.tolist()}"
) )
logging.info("Done") logging.info("Done")
@ -199,4 +198,5 @@ if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO) logging.basicConfig(format=formatter, level=logging.INFO)
main() main()

View File

@ -8,13 +8,14 @@ pypinyin==0.50.0
tensorboard tensorboard
typeguard typeguard
dill dill
onnx==1.15.0 onnx>=1.15.0
onnxruntime==1.16.3 onnxruntime>=1.16.3
onnxoptimizer
# style check session: # style check session:
black==22.3.0 black==22.3.0
isort==5.10.1 isort==5.10.1
flake8==5.0.4 flake8==5.0.4
# cantonese word segment support # cantonese word segment support
pycantonese==3.4.0 pycantonese==3.4.0