From c54af5381355e7ffb0b4e5ef208bdb18bfe1c025 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Tue, 26 Dec 2023 19:57:38 +0800
Subject: [PATCH] Refactor CI for aishell

---
Review notes (this section is ignored by `git am`):
  * run.sh: fix the "pre-commputed" typo, reuse $fbank_url for the clone,
    quote expansions, keep function variables local, guard `rm -rf`.
  * aishell.yml: replace the deprecated `::set-output` command with
    $GITHUB_OUTPUT and let the declared workflow_dispatch trigger run.

 .github/scripts/aishell/ASR/run.sh            | 115 +++++++++++++++
 ...pruned-transducer-stateless3-2022-06-20.sh |  87 -------------
 .github/workflows/aishell.yml                 |  88 ++++++++++++
 .github/workflows/run-aishell-2022-06-20.yml  | 123 ------------------
 .github/workflows/run-yesno-recipe.yml        |   1 -
 5 files changed, 203 insertions(+), 211 deletions(-)
 create mode 100755 .github/scripts/aishell/ASR/run.sh
 delete mode 100755 .github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
 create mode 100644 .github/workflows/aishell.yml
 delete mode 100644 .github/workflows/run-aishell-2022-06-20.yml

diff --git a/.github/scripts/aishell/ASR/run.sh b/.github/scripts/aishell/ASR/run.sh
new file mode 100755
index 000000000..777f5b447
--- /dev/null
+++ b/.github/scripts/aishell/ASR/run.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+#
+# CI smoke test for the aishell ASR recipe. Run it from the repository root.
+# It downloads the test/dev manifests and a pre-trained
+# pruned_transducer_stateless3 model, then exercises pretrained.py and,
+# for scheduled or "run-decode" labelled runs, decode.py.
+
+set -ex
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+cd egs/aishell/ASR
+
+# Clone the test/dev manifests repository and link its data/ directory into
+# the current (recipe) directory.
+function download_test_dev_manifests() {
+  git lfs install
+
+  local fbank_url=https://huggingface.co/csukuangfj/aishell-test-dev-manifests
+  log "Downloading pre-computed fbank from $fbank_url"
+
+  git clone "$fbank_url"
+  ln -s "$PWD/aishell-test-dev-manifests/data" .
+}
+
+# Clone the pre-trained model, run pretrained.py with every search method on
+# three test wavs, optionally run decode.py, then delete the clone.
+# Reads GITHUB_EVENT_NAME and GITHUB_EVENT_LABEL_NAME (both may be unset).
+function test_transducer_stateless3_2022_06_20() {
+  local repo_url=https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
+  log "Downloading pre-trained model from $repo_url"
+  git clone "$repo_url"
+  local repo
+  repo=$(basename "$repo_url")
+
+  log "Display test files"
+  tree "$repo/"
+  ls -lh "$repo"/test_wavs/*.wav
+
+  pushd "$repo/exp"
+  ln -s pretrained-epoch-29-avg-5-torch-1.10.0.pt pretrained.pt
+  popd
+
+  log "test greedy_search with pretrained.py"
+
+  local sym
+  for sym in 1 2 3; do
+    log "Greedy search with --max-sym-per-frame $sym"
+
+    ./pruned_transducer_stateless3/pretrained.py \
+      --method greedy_search \
+      --max-sym-per-frame "$sym" \
+      --checkpoint "$repo/exp/pretrained.pt" \
+      --lang-dir "$repo/data/lang_char" \
+      "$repo/test_wavs/BAC009S0764W0121.wav" \
+      "$repo/test_wavs/BAC009S0764W0122.wav" \
+      "$repo/test_wavs/BAC009S0764W0123.wav"
+  done
+
+  log "test beam search with pretrained.py"
+
+  local method
+  for method in modified_beam_search beam_search fast_beam_search; do
+    log "$method"
+
+    ./pruned_transducer_stateless3/pretrained.py \
+      --method "$method" \
+      --beam-size 4 \
+      --checkpoint "$repo/exp/pretrained.pt" \
+      --lang-dir "$repo/data/lang_char" \
+      "$repo/test_wavs/BAC009S0764W0121.wav" \
+      "$repo/test_wavs/BAC009S0764W0122.wav" \
+      "$repo/test_wavs/BAC009S0764W0123.wav"
+  done
+
+  echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
+  echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
+  if [[ "${GITHUB_EVENT_NAME}" == "schedule" || "${GITHUB_EVENT_LABEL_NAME}" == "run-decode" ]]; then
+    mkdir -p pruned_transducer_stateless3/exp
+    ln -s "$PWD/$repo/exp/pretrained.pt" pruned_transducer_stateless3/exp/epoch-999.pt
+    ln -s "$PWD/$repo/data/lang_char" data/
+
+    ls -lh data
+    ls -lh pruned_transducer_stateless3/exp
+
+    log "Decoding test and dev"
+
+    # use a small value for decoding with CPU
+    local max_duration=100
+
+    for method in greedy_search fast_beam_search modified_beam_search; do
+      log "Decoding with $method"
+
+      ./pruned_transducer_stateless3/decode.py \
+        --decoding-method "$method" \
+        --epoch 999 \
+        --avg 1 \
+        --max-duration "$max_duration" \
+        --exp-dir pruned_transducer_stateless3/exp
+    done
+
+    rm pruned_transducer_stateless3/exp/*.pt
+  fi
+
+  rm -rf -- "${repo:?}"
+}
+
+download_test_dev_manifests
+test_transducer_stateless3_2022_06_20
+
+ls -lh
diff --git a/.github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh b/.github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
deleted file mode 100755
index c3640cfde..000000000
--- a/.github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-
-log() {
-  # This function is from espnet
-  local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
-}
-
-cd egs/aishell/ASR
-
-git lfs install
-
-fbank_url=https://huggingface.co/csukuangfj/aishell-test-dev-manifests
-log "Downloading pre-commputed fbank from $fbank_url"
-
-git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
-ln -s $PWD/aishell-test-dev-manifests/data .
-
-repo_url=https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
-log "Downloading pre-trained model from $repo_url"
-git clone $repo_url
-repo=$(basename $repo_url)
-
-log "Display test files"
-tree $repo/
-ls -lh $repo/test_wavs/*.wav
-
-pushd $repo/exp
-ln -s pretrained-epoch-29-avg-5-torch-1.10.0.pt pretrained.pt
-popd
-
-for sym in 1 2 3; do
-  log "Greedy search with --max-sym-per-frame $sym"
-
-  ./pruned_transducer_stateless3/pretrained.py \
-    --method greedy_search \
-    --max-sym-per-frame $sym \
-    --checkpoint $repo/exp/pretrained.pt \
-    --lang-dir $repo/data/lang_char \
-    $repo/test_wavs/BAC009S0764W0121.wav \
-    $repo/test_wavs/BAC009S0764W0122.wav \
-    $repo/test_wavs/BAC009S0764W0123.wav
-done
-
-for method in modified_beam_search beam_search fast_beam_search; do
-  log "$method"
-
-  ./pruned_transducer_stateless3/pretrained.py \
-    --method $method \
-    --beam-size 4 \
-    --checkpoint $repo/exp/pretrained.pt \
-    --lang-dir $repo/data/lang_char \
-    $repo/test_wavs/BAC009S0764W0121.wav \
-    $repo/test_wavs/BAC009S0764W0122.wav \
-    $repo/test_wavs/BAC009S0764W0123.wav
-done
-
-echo "GITHUB_EVENT_NAME: ${GITHUB_EVENT_NAME}"
-echo "GITHUB_EVENT_LABEL_NAME: ${GITHUB_EVENT_LABEL_NAME}"
-if [[ x"${GITHUB_EVENT_NAME}" == x"schedule" || x"${GITHUB_EVENT_LABEL_NAME}" == x"run-decode" ]]; then
-  mkdir -p pruned_transducer_stateless3/exp
-  ln -s $PWD/$repo/exp/pretrained.pt pruned_transducer_stateless3/exp/epoch-999.pt
-  ln -s $PWD/$repo/data/lang_char data/
-
-  ls -lh data
-  ls -lh pruned_transducer_stateless3/exp
-
-  log "Decoding test and dev"
-
-  # use a small value for decoding with CPU
-  max_duration=100
-
-  for method in greedy_search fast_beam_search modified_beam_search; do
-    log "Decoding with $method"
-
-    ./pruned_transducer_stateless3/decode.py \
-      --decoding-method $method \
-      --epoch 999 \
-      --avg 1 \
-      --max-duration $max_duration \
-      --exp-dir pruned_transducer_stateless3/exp
-  done
-
-  rm pruned_transducer_stateless3/exp/*.pt
-fi
diff --git a/.github/workflows/aishell.yml b/.github/workflows/aishell.yml
new file mode 100644
index 000000000..e3f327867
--- /dev/null
+++ b/.github/workflows/aishell.yml
@@ -0,0 +1,88 @@
+# CI for the aishell recipe: runs .github/scripts/aishell/ASR/run.sh inside the
+# icefall CPU docker image for every entry of the generated build matrix.
+name: aishell
+
+on:
+  push:
+    branches:
+      - master
+      - refactor-ci
+
+  pull_request:
+    branches:
+      - master
+
+  workflow_dispatch:
+
+  schedule:
+    # minute (0-59)
+    # hour (0-23)
+    # day of the month (1-31)
+    # month (1-12)
+    # day of the week (0-6)
+    # nightly build at 15:50 UTC time every day
+    - cron: "50 15 * * *"
+
+concurrency:
+  group: aishell-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  generate_build_matrix:
+    if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
+
+    # see https://github.com/pytorch/pytorch/pull/50633
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Generating build matrix
+        id: set-matrix
+        run: |
+          # outputting for debugging purposes
+          python ./.github/scripts/docker/generate_build_matrix.py
+          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
+          # "::set-output" is deprecated; step outputs go to $GITHUB_OUTPUT
+          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"
+  aishell:
+    needs: generate_build_matrix
+    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Free space
+        shell: bash
+        run: |
+          df -h
+          rm -rf /opt/hostedtoolcache
+          df -h
+          echo "pwd: $PWD"
+          echo "github.workspace ${{ github.workspace }}"
+
+      # NOTE(review): run.sh reads GITHUB_EVENT_NAME / GITHUB_EVENT_LABEL_NAME,
+      # but only --volume is passed to the container below -- confirm whether
+      # the decode stage is meant to run in this workflow.
+      - name: Run tests
+        uses: addnab/docker-run-action@v3
+        with:
+          image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
+          options: |
+            --volume ${{ github.workspace }}/:/icefall
+          shell: bash
+          run: |
+            export PYTHONPATH=/icefall:$PYTHONPATH
+            cd /icefall
+            git config --global --add safe.directory /icefall
+
+            .github/scripts/aishell/ASR/run.sh
diff --git a/.github/workflows/run-aishell-2022-06-20.yml b/.github/workflows/run-aishell-2022-06-20.yml
deleted file mode 100644
index 53fcb2c03..000000000
--- a/.github/workflows/run-aishell-2022-06-20.yml
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright 2022 Fangjun Kuang (csukuangfj@gmail.com)
-
-# See ../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: run-aishell-2022-06-20
-# pruned RNN-T + reworked model with random combiner
-# https://huggingface.co/csukuangfj/icefall-aishell-pruned-transducer-stateless3-2022-06-20
-
-on:
-  push:
-    branches:
-      - master
-  pull_request:
-    types: [labeled]
-
-  schedule:
-    # minute (0-59)
-    # hour (0-23)
-    # day of the month (1-31)
-    # month (1-12)
-    # day of the week (0-6)
-    # nightly build at 15:50 UTC time every day
-    - cron: "50 15 * * *"
-
-concurrency:
-  group: run_aishell_2022_06_20-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  run_aishell_2022_06_20:
-    if: github.event.label.name == 'ready' || github.event.label.name == 'run-decode' || github.event_name == 'push' || github.event_name == 'schedule'
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest]
-        python-version: [3.8]
-
-      fail-fast: false
-
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          fetch-depth: 0
-
-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-          cache: 'pip'
-          cache-dependency-path: '**/requirements-ci.txt'
-
-      - name: Install Python dependencies
-        run: |
-          grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
-          pip uninstall -y protobuf
-          pip install --no-binary protobuf protobuf==3.20.*
-
-      - name: Cache kaldifeat
-        id: my-cache
-        uses: actions/cache@v2
-        with:
-          path: |
-            ~/tmp/kaldifeat
-          key: cache-tmp-${{ matrix.python-version }}-2023-05-22
-
-      - name: Install kaldifeat
-        if: steps.my-cache.outputs.cache-hit != 'true'
-        shell: bash
-        run: |
-          .github/scripts/install-kaldifeat.sh
-
-      - name: Inference with pre-trained model
-        shell: bash
-        env:
-          GITHUB_EVENT_NAME: ${{ github.event_name }}
-          GITHUB_EVENT_LABEL_NAME: ${{ github.event.label.name }}
-        run: |
-          sudo apt-get -qq install git-lfs tree
-          export PYTHONPATH=$PWD:$PYTHONPATH
-          export PYTHONPATH=~/tmp/kaldifeat/kaldifeat/python:$PYTHONPATH
-          export PYTHONPATH=~/tmp/kaldifeat/build/lib:$PYTHONPATH
-
-          .github/scripts/run-aishell-pruned-transducer-stateless3-2022-06-20.sh
-
-      - name: Display decoding results for aishell pruned_transducer_stateless3
-        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
-        shell: bash
-        run: |
-          cd egs/aishell/ASR/
-          tree ./pruned_transducer_stateless3/exp
-
-          cd pruned_transducer_stateless3
-          echo "results for pruned_transducer_stateless3"
-          echo "===greedy search==="
-          find exp/greedy_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
-          find exp/greedy_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
-
-          echo "===fast_beam_search==="
-          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
-          find exp/fast_beam_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
-
-          echo "===modified beam search==="
-          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for test" {} + | sort -n -k2
-          find exp/modified_beam_search -name "log-*" -exec grep -n --color "best for dev" {} + | sort -n -k2
-
-      - name: Upload decoding results for aishell pruned_transducer_stateless3
-        uses: actions/upload-artifact@v2
-        if: github.event_name == 'schedule' || github.event.label.name == 'run-decode'
-        with:
-          name: aishell-torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu-pruned_transducer_stateless3-2022-06-20
-          path: egs/aishell/ASR/pruned_transducer_stateless3/exp/
diff --git a/.github/workflows/run-yesno-recipe.yml b/.github/workflows/run-yesno-recipe.yml
index a99811815..24b8660f6 100644
--- a/.github/workflows/run-yesno-recipe.yml
+++ b/.github/workflows/run-yesno-recipe.yml
@@ -20,7 +20,6 @@ on:
   push:
     branches:
       - master
-      - refactor-ci
 
   pull_request:
     branches: