From ca0d7c5795cdd5b2df984c338f32cd36bf60c0bf Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Thu, 23 Dec 2021 12:35:25 +0800 Subject: [PATCH] Minor fixes. --- ...d.yml => run-pretrained-conformer-ctc.yml} | 0 .../workflows/run-pretrained-transducer.yml | 109 ++++++++++++++++++ README.md | 2 +- egs/librispeech/ASR/RESULTS.md | 23 ++-- egs/librispeech/ASR/transducer/decode.py | 4 +- egs/librispeech/ASR/transducer/export.py | 8 +- egs/librispeech/ASR/transducer/joiner.py | 3 +- egs/librispeech/ASR/transducer/train.py | 4 +- 8 files changed, 132 insertions(+), 21 deletions(-) rename .github/workflows/{run-pretrained.yml => run-pretrained-conformer-ctc.yml} (100%) create mode 100644 .github/workflows/run-pretrained-transducer.yml diff --git a/.github/workflows/run-pretrained.yml b/.github/workflows/run-pretrained-conformer-ctc.yml similarity index 100% rename from .github/workflows/run-pretrained.yml rename to .github/workflows/run-pretrained-conformer-ctc.yml diff --git a/.github/workflows/run-pretrained-transducer.yml b/.github/workflows/run-pretrained-transducer.yml new file mode 100644 index 000000000..23f3ed697 --- /dev/null +++ b/.github/workflows/run-pretrained-transducer.yml @@ -0,0 +1,109 @@ +# Copyright 2021 Fangjun Kuang (csukuangfj@gmail.com) + +# See ../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name: run-pre-trained-transducer + +on: + push: + branches: + - master + pull_request: + types: [labeled] + +jobs: + run_pre_trained_transducer: + if: github.event.label.name == 'ready' || github.event_name == 'push' + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-18.04] + python-version: [3.7, 3.8, 3.9] + torch: ["1.10.0"] + torchaudio: ["0.10.0"] + k2-version: ["1.9.dev20211101"] + + fail-fast: false + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Python dependencies + run: | + python3 -m pip install --upgrade pip pytest + # numpy 1.20.x does not support python 3.6 + pip install numpy==1.19 + pip install torch==${{ matrix.torch }}+cpu torchaudio==${{ matrix.torchaudio }}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html + pip install k2==${{ matrix.k2-version }}+cpu.torch${{ matrix.torch }} -f https://k2-fsa.org/nightly/ + + python3 -m pip install git+https://github.com/lhotse-speech/lhotse + python3 -m pip install kaldifeat + # We are in ./icefall and there is a file: requirements.txt in it + pip install -r requirements.txt + + - name: Install graphviz + shell: bash + run: | + python3 -m pip install -qq graphviz + sudo apt-get -qq install graphviz + + - name: Download pre-trained model + shell: bash + run: | + sudo apt-get -qq install git-lfs tree sox + cd egs/librispeech/ASR + mkdir tmp + cd tmp + git lfs install + git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-bpe-500-2021-12-23 + + cd .. 
+ tree tmp + soxi tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/*.wav + ls -lh tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/*.wav + + - name: Run greedy search decoding + shell: bash + run: | + export PYTHONPATH=$PWD:$PYTHONPATH + cd egs/librispeech/ASR + ./transducer_stateless/pretrained.py \ + --method greedy_search \ + --checkpoint ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/data/lang_bpe_500/bpe.model \ + ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1221-135766-0002.wav + + - name: Run beam search decoding + shell: bash + run: | + export PYTHONPATH=$PWD:$PYTHONPATH + cd egs/librispeech/ASR + ./transducer_stateless/pretrained.py \ + --method beam_search \ + --beam-size 4 \ + --checkpoint ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/data/lang_bpe_500/bpe.model \ + ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-bpe-500-2021-12-23/test_wavs/1221-135766-0002.wav diff --git a/README.md b/README.md index 931fb0198..f0a678839 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ The best WER with greedy search is: | | test-clean | test-other | |-----|------------|------------| -| WER | 3.16 | 7.71 | +| WER | 3.07 | 7.51 | We provide a Colab notebook to run a pre-trained RNN-T conformer model: [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1_u6yK9jDkPwG_NLrZMN2XK7Aeq4suMO2?usp=sharing) diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index 317b1591a..aab2b61e0 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -2,7 +2,10 @@ ### LibriSpeech BPE training results (Transducer) -#### 2021-12-22 +#### Conformer encoder + embedding decoder + +Using commit `fb6a57e9e01dd8aae2af2a6b4568daad8bc8ab32`. + Conformer encoder + non-current decoder. The decoder contains only an embedding layer and a Conv1d (with kernel size 2). @@ -60,8 +63,8 @@ avg=10 ``` -#### 2021-12-17 -Using commit `cb04c8a7509425ab45fae888b0ca71bbbd23f0de`. +#### Conformer encoder + LSTM decoder +Using commit `TODO`. Conformer encoder + LSTM decoder. @@ -69,9 +72,9 @@ The best WER is | | test-clean | test-other | |-----|------------|------------| -| WER | 3.16 | 7.71 | +| WER | 3.07 | 7.51 | -using `--epoch 26 --avg 12` with **greedy search**. +using `--epoch 34 --avg 11` with **greedy search**. 
The training command to reproduce the above WER is: @@ -80,19 +83,19 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" ./transducer/train.py \ --world-size 4 \ - --num-epochs 30 \ + --num-epochs 35 \ --start-epoch 0 \ --exp-dir transducer/exp-lr-2.5-full \ --full-libri 1 \ - --max-duration 250 \ + --max-duration 180 \ --lr-factor 2.5 ``` The decoding command is: ``` -epoch=26 -avg=12 +epoch=34 +avg=11 ./transducer/decode.py \ --epoch $epoch \ @@ -102,7 +105,7 @@ avg=12 --max-duration 100 ``` -You can find the tensorboard log at: +You can find the tensorboard log at: ### LibriSpeech BPE training results (Conformer-CTC) diff --git a/egs/librispeech/ASR/transducer/decode.py b/egs/librispeech/ASR/transducer/decode.py index 752712829..ef0992618 100755 --- a/egs/librispeech/ASR/transducer/decode.py +++ b/egs/librispeech/ASR/transducer/decode.py @@ -70,14 +70,14 @@ def get_parser(): parser.add_argument( "--epoch", type=int, - default=26, + default=34, help="It specifies the checkpoint to use for decoding." "Note: Epoch counts from 0.", ) parser.add_argument( "--avg", type=int, - default=12, + default=11, help="Number of checkpoints to average. Automatically select " "consecutive checkpoints before the checkpoint specified by " "'--epoch'. ", diff --git a/egs/librispeech/ASR/transducer/export.py b/egs/librispeech/ASR/transducer/export.py index c74700feb..3351fbc67 100755 --- a/egs/librispeech/ASR/transducer/export.py +++ b/egs/librispeech/ASR/transducer/export.py @@ -23,8 +23,8 @@ Usage: ./transducer/export.py \ --exp-dir ./transducer/exp \ --bpe-model data/lang_bpe_500/bpe.model \ - --epoch 26 \ - --avg 12 + --epoch 34 \ + --avg 11 It will generate a file exp_dir/pretrained.pt @@ -66,7 +66,7 @@ def get_parser(): parser.add_argument( "--epoch", type=int, - default=26, + default=34, help="It specifies the checkpoint to use for decoding." 
"Note: Epoch counts from 0.", ) @@ -74,7 +74,7 @@ def get_parser(): parser.add_argument( "--avg", type=int, - default=12, + default=11, help="Number of checkpoints to average. Automatically select " "consecutive checkpoints before the checkpoint specified by " "'--epoch'. ", diff --git a/egs/librispeech/ASR/transducer/joiner.py b/egs/librispeech/ASR/transducer/joiner.py index f1968d1d0..2ef3f1de6 100644 --- a/egs/librispeech/ASR/transducer/joiner.py +++ b/egs/librispeech/ASR/transducer/joiner.py @@ -16,7 +16,6 @@ import torch import torch.nn as nn -import torch.nn.functional as F class Joiner(nn.Module): @@ -48,7 +47,7 @@ class Joiner(nn.Module): # Now decoder_out is (N, 1, U, C) logit = encoder_out + decoder_out - logit = F.tanh(logit) + logit = torch.tanh(logit) output = self.output_linear(logit) diff --git a/egs/librispeech/ASR/transducer/train.py b/egs/librispeech/ASR/transducer/train.py index fe3cc386b..dcb75609c 100755 --- a/egs/librispeech/ASR/transducer/train.py +++ b/egs/librispeech/ASR/transducer/train.py @@ -23,7 +23,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" ./transducer/train.py \ --world-size 4 \ - --num-epochs 30 \ + --num-epochs 35 \ --start-epoch 0 \ --exp-dir transducer/exp \ --full-libri 1 \ @@ -92,7 +92,7 @@ def get_parser(): parser.add_argument( "--num-epochs", type=int, - default=30, + default=35, help="Number of epochs to train.", )