From 9e9fe7954d13c5ee8f10e990b358cf8c752a24e6 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 12 Dec 2023 18:57:04 +0800 Subject: [PATCH] Upload gigaspeech zipformer models in CI (#1412) --- .github/scripts/multi-zh-hans.sh | 3 +- .../run-gigaspeech-zipformer-2023-10-17.sh | 74 +++++++++++++++++-- .github/workflows/multi-zh-hans.yml | 5 -- .../run-gigaspeech-zipformer-2023-10-17.yml | 14 ++++ 4 files changed, 85 insertions(+), 11 deletions(-) diff --git a/.github/scripts/multi-zh-hans.sh b/.github/scripts/multi-zh-hans.sh index 4ede7f43e..2dd1bce42 100755 --- a/.github/scripts/multi-zh-hans.sh +++ b/.github/scripts/multi-zh-hans.sh @@ -45,7 +45,7 @@ log "----------------------------------------" ls -lh $repo/exp log "------------------------------------------------------------" -log "Test export streaming ONNX transducer models (Python code) " +log "Test exported streaming ONNX transducer models (Python code)" log "------------------------------------------------------------" log "test fp32" @@ -73,6 +73,7 @@ GIT_LFS_SKIP_SMUDGE=1 git clone $url dst=$(basename $url) cp -v $repo/exp/*.onnx $dst cp -v $repo/data/lang_bpe_2000/tokens.txt $dst +cp -v $repo/data/lang_bpe_2000/bpe.model $dst mkdir -p $dst/test_wavs cp -v $repo/test_wavs/*.wav $dst/test_wavs cd $dst diff --git a/.github/scripts/run-gigaspeech-zipformer-2023-10-17.sh b/.github/scripts/run-gigaspeech-zipformer-2023-10-17.sh index 6bb0b9ebc..329896ef6 100755 --- a/.github/scripts/run-gigaspeech-zipformer-2023-10-17.sh +++ b/.github/scripts/run-gigaspeech-zipformer-2023-10-17.sh @@ -26,16 +26,80 @@ git lfs pull --include "data/lang_bpe_500/bpe.model" git lfs pull --include "data/lang_bpe_500/tokens.txt" git lfs pull --include "exp/jit_script.pt" git lfs pull --include "exp/pretrained.pt" -ln -s pretrained.pt epoch-99.pt -ls -lh *.pt +rm epoch-30.pt +ln -s pretrained.pt epoch-30.pt +rm *.onnx +ls -lh popd +log "----------------------------------------" +log "Export ONNX transducer models " +log "----------------------------------------" + +./zipformer/export-onnx.py \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ + --use-averaged-model 0 \ + --epoch 30 \ + --avg 1 \ + --exp-dir $repo/exp + +ls -lh $repo/exp + +log "------------------------------------------------------------" +log "Test exported ONNX transducer models (Python code) " +log "------------------------------------------------------------" + +log "test fp32" +./zipformer/onnx_pretrained.py \ + --encoder-model-filename $repo/exp/encoder-epoch-30-avg-1.onnx \ + --decoder-model-filename $repo/exp/decoder-epoch-30-avg-1.onnx \ + --joiner-model-filename $repo/exp/joiner-epoch-30-avg-1.onnx \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +log "test int8" +./zipformer/onnx_pretrained.py \ + --encoder-model-filename $repo/exp/encoder-epoch-30-avg-1.int8.onnx \ + --decoder-model-filename $repo/exp/decoder-epoch-30-avg-1.onnx \ + --joiner-model-filename $repo/exp/joiner-epoch-30-avg-1.int8.onnx \ + --tokens $repo/data/lang_bpe_500/tokens.txt \ + $repo/test_wavs/1089-134686-0001.wav \ + $repo/test_wavs/1221-135766-0001.wav \ + $repo/test_wavs/1221-135766-0002.wav + +log "Upload models to huggingface" +git config --global user.name "k2-fsa" +git config --global user.email "xxx@gmail.com" + +url=https://huggingface.co/k2-fsa/sherpa-onnx-zipformer-gigaspeech-2023-12-12 +GIT_LFS_SKIP_SMUDGE=1 git clone $url +dst=$(basename $url) +cp -v $repo/exp/*.onnx $dst +cp -v $repo/data/lang_bpe_500/tokens.txt $dst +cp -v $repo/data/lang_bpe_500/bpe.model $dst +mkdir -p $dst/test_wavs +cp -v $repo/test_wavs/*.wav $dst/test_wavs +cd $dst +git lfs track "*.onnx" +git add . +git commit -m "upload model" && git push https://k2-fsa:${HF_TOKEN}@huggingface.co/k2-fsa/$dst main || true + +log "Upload models to https://github.com/k2-fsa/sherpa-onnx" +rm -rf .git +rm -fv .gitattributes +cd .. +tar cjfv $dst.tar.bz2 $dst +ls -lh +mv -v $dst.tar.bz2 ../../../ + log "Export to torchscript model" ./zipformer/export.py \ --exp-dir $repo/exp \ --use-averaged-model false \ --tokens $repo/data/lang_bpe_500/tokens.txt \ - --epoch 99 \ + --epoch 30 \ --avg 1 \ --jit 1 @@ -67,7 +131,7 @@ echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}" echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}" if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then mkdir -p zipformer/exp - ln -s $PWD/$repo/exp/pretrained.pt zipformer/exp/epoch-999.pt + ln -s $PWD/$repo/exp/pretrained.pt zipformer/exp/epoch-30.pt ln -s $PWD/$repo/data/lang_bpe_500 data/ ls -lh data @@ -83,7 +147,7 @@ if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == ./zipformer/decode.py \ --decoding-method $method \ - --epoch 999 \ + --epoch 30 \ --avg 1 \ --use-averaged-model 0 \ --max-duration $max_duration \ diff --git a/.github/workflows/multi-zh-hans.yml b/.github/workflows/multi-zh-hans.yml index 439300b5f..9081047de 100644 --- a/.github/workflows/multi-zh-hans.yml +++ b/.github/workflows/multi-zh-hans.yml @@ -2,11 +2,6 @@ name: run-multi-zh-hans on: push: - branches: - - master - - upload-ctc-model - - pull_request: branches: - master diff --git a/.github/workflows/run-gigaspeech-zipformer-2023-10-17.yml b/.github/workflows/run-gigaspeech-zipformer-2023-10-17.yml index 7572f4b5f..87090e310 100644 --- a/.github/workflows/run-gigaspeech-zipformer-2023-10-17.yml +++ b/.github/workflows/run-gigaspeech-zipformer-2023-10-17.yml @@ -21,6 +21,7 @@ on: push: branches: - master + pull_request: types: [labeled] @@ -33,6 +34,8 @@ on: # nightly build at 15:50 UTC time every day - cron: "50 15 * * *" + workflow_dispatch: + concurrency: group: run_gigaspeech_2023_10_17_zipformer-${{ github.ref }} cancel-in-progress: true @@ -85,6 +88,7 @@ jobs: env: GITHUB_EVENT_NAME: ${{ github.event_name }} GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | mkdir -p egs/gigaspeech/ASR/data ln -sfv ~/tmp/fbank-libri egs/gigaspeech/ASR/data/fbank @@ -97,6 +101,16 @@ jobs: .github/scripts/run-gigaspeech-zipformer-2023-10-17.sh + - name: upload model to https://github.com/k2-fsa/sherpa-onnx + uses: svenstaro/upload-release-action@v2 + with: + file_glob: true + file: ./*.tar.bz2 + overwrite: true + repo_name: k2-fsa/sherpa-onnx + repo_token: ${{ secrets.UPLOAD_GH_SHERPA_ONNX_TOKEN }} + tag: asr-models + - name: Display decoding results for gigaspeech zipformer if: github.event_name == 'schedule' || github.event.label.name == 'run-decode' shell: bash