From fee1f84b20a5a704428c5eac80de2ac4033e1b27 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Fri, 15 Oct 2021 00:41:33 +0800
Subject: [PATCH] Test pre-trained model in CI (#80)

* Add CI to run pre-trained models.

* Minor fixes.

* Install kaldifeat

* Install a CPU version of PyTorch.

* Fix CI errors.

* Disable decoder layers in pretrained.py if they are not used.

* Clone pre-trained model from GitHub.

* Minor fixes.

* Minor fixes.

* Minor fixes.
---
 .github/workflows/run-pretrained.yml          | 106 ++++++++++++++++++
 .github/workflows/test.yml                    |   5 +
 .../ASR/conformer_ctc/pretrained.py           |  20 +++-
 egs/librispeech/ASR/conformer_ctc/train.py    |  30 +++--
 4 files changed, 150 insertions(+), 11 deletions(-)
 create mode 100644 .github/workflows/run-pretrained.yml

diff --git a/.github/workflows/run-pretrained.yml b/.github/workflows/run-pretrained.yml
new file mode 100644
index 000000000..97d3c32d2
--- /dev/null
+++ b/.github/workflows/run-pretrained.yml
@@ -0,0 +1,106 @@
+# Copyright      2021  Fangjun Kuang  (csukuangfj@gmail.com)
+
+# See ../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: run-pre-trained-conformer-ctc
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    types: [labeled]
+
+jobs:
+  run_pre_trained_conformer_ctc:
+    if: github.event.label.name == 'ready' || github.event_name == 'push'
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [ubuntu-18.04]
+        python-version: [3.6, 3.7, 3.8, 3.9]
+        torch: ["1.8.1"]
+        k2-version: ["1.9.dev20210919"]
+
+      fail-fast: false
+
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Setup Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v1
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install Python dependencies
+        run: |
+          python3 -m pip install --upgrade pip pytest
+          pip install torch==${{ matrix.torch }}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+          pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/
+
+          python3 -m pip install git+https://github.com/lhotse-speech/lhotse
+          python3 -m pip install kaldifeat
+          # We are in ./icefall, which contains a requirements.txt file.
+          pip install -r requirements.txt
+
+      - name: Install graphviz
+        shell: bash
+        run: |
+          python3 -m pip install -qq graphviz
+          sudo apt-get -qq install graphviz
+
+      - name: Download pre-trained model
+        shell: bash
+        run: |
+          sudo apt-get -qq install git-lfs tree sox
+          cd egs/librispeech/ASR
+          mkdir tmp
+          cd tmp
+          git lfs install
+          git clone https://github.com/csukuangfj/icefall-asr-conformer-ctc-bpe-500
+          cd ..
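+          # Show what was downloaded and verify that the test wavs
+          # are real audio files rather than git-lfs pointer stubs.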
+          tree tmp
+          soxi tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/*.flac
+          ls -lh tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/*.flac
+
+      - name: Run CTC decoding
+        shell: bash
+        run: |
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          cd egs/librispeech/ASR
+          ./conformer_ctc/pretrained.py \
+            --num-classes 500 \
+            --checkpoint ./tmp/icefall-asr-conformer-ctc-bpe-500/exp/pretrained.pt \
+            --bpe-model ./tmp/icefall-asr-conformer-ctc-bpe-500/data/lang_bpe_500/bpe.model \
+            --method ctc-decoding \
+            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1089-134686-0001.flac \
+            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1221-135766-0001.flac \
+            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1221-135766-0002.flac
+
+      - name: Run HLG decoding
+        shell: bash
+        run: |
+          export PYTHONPATH=$PWD:$PYTHONPATH
+          cd egs/librispeech/ASR
+          ./conformer_ctc/pretrained.py \
+            --num-classes 500 \
+            --checkpoint ./tmp/icefall-asr-conformer-ctc-bpe-500/exp/pretrained.pt \
+            --words-file ./tmp/icefall-asr-conformer-ctc-bpe-500/data/lang_bpe_500/words.txt \
+            --HLG ./tmp/icefall-asr-conformer-ctc-bpe-500/data/lang_bpe_500/HLG.pt \
+            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1089-134686-0001.flac \
+            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1221-135766-0001.flac \
+            ./tmp/icefall-asr-conformer-ctc-bpe-500/test_wavs/1221-135766-0002.flac
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 150b5258a..c6114ce73 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -84,3 +84,8 @@ jobs:
           echo "lib_path: $lib_path"
           export DYLD_LIBRARY_PATH=$lib_path:$DYLD_LIBRARY_PATH
           pytest ./test
+
+          # Run tests for conformer_ctc
+          cd egs/librispeech/ASR/conformer_ctc
+          pytest
+
diff --git a/egs/librispeech/ASR/conformer_ctc/pretrained.py b/egs/librispeech/ASR/conformer_ctc/pretrained.py
index edbdb5b2e..99bd9c017 100755
--- a/egs/librispeech/ASR/conformer_ctc/pretrained.py
+++ b/egs/librispeech/ASR/conformer_ctc/pretrained.py
@@ -166,6 +166,15 @@ def get_parser():
         """,
     )
 
+    parser.add_argument(
+        "--num-classes",
+        type=int,
+        default=5000,
+        help="""
+        Vocab size in the BPE model.
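+        It must match the vocab size of the BPE model
+        used to train the given checkpoint.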
+        """,
+    )
+
     parser.add_argument(
         "--eos-id",
         type=int,
@@ -199,7 +208,6 @@ def get_params() -> AttributeDict:
         "use_feat_batchnorm": True,
         "feature_dim": 80,
         "nhead": 8,
-        "num_classes": 5000,
         "attention_dim": 512,
         "num_decoder_layers": 6,
         # parameters for decoding
@@ -242,7 +250,13 @@ def main():
     args = parser.parse_args()
 
     params = get_params()
+    if args.method != "attention-decoder":
+        # To save memory, since the attention decoder
+        # will not be used
+        params.num_decoder_layers = 0
+
     params.update(vars(args))
+
     logging.info(f"{params}")
 
     device = torch.device("cpu")
@@ -264,7 +278,7 @@ def main():
     )
 
     checkpoint = torch.load(args.checkpoint, map_location="cpu")
-    model.load_state_dict(checkpoint["model"])
+    model.load_state_dict(checkpoint["model"], strict=False)
     model.to(device)
     model.eval()
 
@@ -305,7 +319,7 @@ def main():
         logging.info("Use CTC decoding")
         bpe_model = spm.SentencePieceProcessor()
         bpe_model.load(params.bpe_model)
-        max_token_id = bpe_model.get_piece_size() - 1
+        max_token_id = params.num_classes - 1
 
         H = k2.ctc_topo(
             max_token=max_token_id,
diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py
index 5554aaa7c..d1cdfa8bb 100755
--- a/egs/librispeech/ASR/conformer_ctc/train.py
+++ b/egs/librispeech/ASR/conformer_ctc/train.py
@@ -96,6 +96,26 @@ def get_parser():
         """,
     )
 
+    parser.add_argument(
+        "--exp-dir",
+        type=str,
+        default="conformer_ctc/exp",
+        help="""The experiment dir.
+        It specifies the directory where all training-related
+        files, e.g., checkpoints and logs, are saved.
+        """,
+    )
+
+    parser.add_argument(
+        "--lang-dir",
+        type=str,
+        default="data/lang_bpe",
+        help="""The lang dir.
+        It contains language-related input files, such as
+        "lexicon.txt".
+        """,
+    )
+
     return parser
 
 
@@ -110,12 +130,6 @@ def get_params() -> AttributeDict:
 
     Explanation of options saved in `params`:
 
-    - exp_dir: It specifies the directory where all training related
-               files, e.g., checkpoints, log, etc, are saved
-
-    - lang_dir: It contains language related input files such as
-                "lexicon.txt"
-
     - best_train_loss: Best training loss so far. It is used to
            select the model that has the lowest training loss. It is
            updated during the training.
@@ -166,8 +180,6 @@ def get_params() -> AttributeDict:
     """
     params = AttributeDict(
         {
-            "exp_dir": Path("conformer_ctc/exp"),
-            "lang_dir": Path("data/lang_bpe"),
            "best_train_loss": float("inf"),
             "best_valid_loss": float("inf"),
             "best_train_epoch": -1,
@@ -638,6 +650,8 @@ def main():
     parser = get_parser()
     LibriSpeechAsrDataModule.add_arguments(parser)
     args = parser.parse_args()
+    args.exp_dir = Path(args.exp_dir)
+    args.lang_dir = Path(args.lang_dir)
 
     world_size = args.world_size
     assert world_size >= 1
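
A minimal, runnable sketch (an editor's illustration, not part of the patch;
build_model below is a stand-in, not the real conformer) of why pretrained.py
now loads the checkpoint with strict=False: when --method is not
attention-decoder, the model is built with num_decoder_layers = 0, so the
checkpoint, which was saved with decoder weights, contains keys the model
does not have. A non-strict load simply ignores those extra keys.

import torch.nn as nn

def build_model(num_decoder_layers: int) -> nn.Module:
    # Stand-in for the conformer CTC model: the encoder is always
    # present; the attention decoder is optional.
    layers = {"encoder": nn.Linear(8, 8)}
    if num_decoder_layers > 0:
        layers["decoder"] = nn.Linear(8, 8)
    return nn.ModuleDict(layers)

# A checkpoint produced by training *with* the attention decoder.
checkpoint = {"model": build_model(num_decoder_layers=6).state_dict()}

# For ctc-decoding and HLG decoding the decoder is not needed,
# so the patch builds the model with num_decoder_layers = 0.
model = build_model(num_decoder_layers=0)

# strict=True would raise an error about unexpected keys
# ("decoder.weight", "decoder.bias"); strict=False ignores them.
missing, unexpected = model.load_state_dict(checkpoint["model"], strict=False)
print("ignored checkpoint keys:", unexpected)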