From 3b937611994f4f4babafdf26b742af59c0d1f6fe Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 7 Feb 2022 16:31:10 +0800 Subject: [PATCH] Update RESULTS. --- .../run-pretrained-transducer-stateless.yml | 71 +++++++++++++++---- README.md | 6 +- egs/librispeech/ASR/RESULTS.md | 51 +++++++------ .../ASR/transducer_stateless/pretrained.py | 21 +++++- 4 files changed, 109 insertions(+), 40 deletions(-) diff --git a/.github/workflows/run-pretrained-transducer-stateless.yml b/.github/workflows/run-pretrained-transducer-stateless.yml index 5f4a425d9..de66b90c5 100644 --- a/.github/workflows/run-pretrained-transducer-stateless.yml +++ b/.github/workflows/run-pretrained-transducer-stateless.yml @@ -74,24 +74,53 @@ jobs: mkdir tmp cd tmp git lfs install - git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10 + git clone https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07 cd .. tree tmp - soxi tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/*.wav - ls -lh tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/*.wav + soxi tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/*.wav + ls -lh tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/*.wav - - name: Run greedy search decoding + - name: Run greedy search decoding (max-sym-per-frame 1) shell: bash run: | export PYTHONPATH=$PWD:PYTHONPATH cd egs/librispeech/ASR ./transducer_stateless/pretrained.py \ --method greedy_search \ - --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/exp/pretrained.pt \ - --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/data/lang_bpe_500/bpe.model \ - ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1089-134686-0001.wav \ - 
./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1221-135766-0001.wav \ - ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1221-135766-0002.wav + --max-sym-per-frame 1 \ + --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/data/lang_bpe_500/bpe.model \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0002.wav + + - name: Run greedy search decoding (max-sym-per-frame 2) + shell: bash + run: | + export PYTHONPATH=$PWD:PYTHONPATH + cd egs/librispeech/ASR + ./transducer_stateless/pretrained.py \ + --method greedy_search \ + --max-sym-per-frame 2 \ + --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/data/lang_bpe_500/bpe.model \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0002.wav + + - name: Run greedy search decoding (max-sym-per-frame 3) + shell: bash + run: | + export PYTHONPATH=$PWD:PYTHONPATH + cd egs/librispeech/ASR + ./transducer_stateless/pretrained.py \ + --method greedy_search \ + --max-sym-per-frame 3 \ + --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/data/lang_bpe_500/bpe.model \ + 
./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0002.wav - name: Run beam search decoding shell: bash @@ -101,8 +130,22 @@ jobs: ./transducer_stateless/pretrained.py \ --method beam_search \ --beam-size 4 \ - --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/exp/pretrained.pt \ - --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/data/lang_bpe_500/bpe.model \ - ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1089-134686-0001.wav \ - ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1221-135766-0001.wav \ - ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10/test_wavs/1221-135766-0002.wav + --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/data/lang_bpe_500/bpe.model \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0002.wav + + - name: Run modified beam search decoding + shell: bash + run: | + export PYTHONPATH=$PWD:$PYTHONPATH + cd egs/librispeech/ASR + ./transducer_stateless/pretrained.py \ + --method modified_beam_search \ + --beam-size 4 \ + --checkpoint ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/exp/pretrained.pt \ + --bpe-model ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/data/lang_bpe_500/bpe.model \ + 
./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1089-134686-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0001.wav \ + ./tmp/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07/test_wavs/1221-135766-0002.wav diff --git a/README.md b/README.md index 38c25900f..28c9b6ce4 100644 --- a/README.md +++ b/README.md @@ -80,16 +80,16 @@ We provide a Colab notebook to run a pre-trained RNN-T conformer model: [![Open Using Conformer as encoder. The decoder consists of 1 embedding layer and 1 convolutional layer. -The best WER using beam search with beam size 4 is: +The best WER using modified beam search with beam size 4 is: | | test-clean | test-other | |-----|------------|------------| -| WER | 2.68 | 6.72 | +| WER | 2.67 | 6.64 | Note: No auxiliary losses are used in the training and no LMs are used in the decoding. -We provide a Colab notebook to run a pre-trained transducer conformer + stateless decoder model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1Rc4Is-3Yp9LbcEz_Iy8hfyenyHsyjvqE?usp=sharing) +We provide a Colab notebook to run a pre-trained transducer conformer + stateless decoder model: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CO1bXJ-2khDckZIW8zjOPHGSKLHpTDlp?usp=sharing) ### Aishell diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index 0d9cc7ba5..17679ba3d 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -4,62 +4,73 @@ #### Conformer encoder + embedding decoder -Using commit `4c1b3665ee6efb935f4dd93a80ff0e154b13efb6`. +Using commit `TODO`. Conformer encoder + non-recurrent decoder. The decoder contains only an embedding layer and a Conv1d (with kernel size 2). 
The WERs are -| | test-clean | test-other | comment | -|---------------------------|------------|------------|------------------------------------------| -| greedy search | 2.69 | 6.81 | --epoch 71, --avg 15, --max-duration 100 | -| beam search (beam size 4) | 2.68 | 6.72 | --epoch 71, --avg 15, --max-duration 100 | +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|------------------------------------------| +| greedy search (max sym per frame 1) | 2.68 | 6.71 | --epoch 61, --avg 18, --max-duration 100 | +| greedy search (max sym per frame 2) | 2.69 | 6.71 | --epoch 61, --avg 18, --max-duration 100 | +| greedy search (max sym per frame 3) | 2.69 | 6.71 | --epoch 61, --avg 18, --max-duration 100 | +| modified beam search (beam size 4) | 2.67 | 6.64 | --epoch 61, --avg 18, --max-duration 100 | + The training command for reproducing is given below: ``` +cd egs/librispeech/ASR/ +./prepare.sh export CUDA_VISIBLE_DEVICES="0,1,2,3" - ./transducer_stateless/train.py \ --world-size 4 \ --num-epochs 76 \ --start-epoch 0 \ --exp-dir transducer_stateless/exp-full \ --full-libri 1 \ - --max-duration 250 \ - --lr-factor 3 + --max-duration 300 \ + --lr-factor 5 \ + --bpe-model data/lang_bpe_500/bpe.model \ + --modified-transducer-prob 0.25 ``` The tensorboard training log can be found at - + The decoding command is: ``` -epoch=71 -avg=15 +epoch=61 +avg=18 ## greedy search -./transducer_stateless/decode.py \ - --epoch $epoch \ - --avg $avg \ - --exp-dir transducer_stateless/exp-full \ - --bpe-model ./data/lang_bpe_500/bpe.model \ - --max-duration 100 +for sym in 1 2 3; do + ./transducer_stateless/decode.py \ + --epoch $epoch \ + --avg $avg \ + --exp-dir transducer_stateless/exp-full \ + --bpe-model ./data/lang_bpe_500/bpe.model \ + --max-duration 100 \ + --max-sym-per-frame $sym +done + +## modified beam search -## beam search ./transducer_stateless/decode.py \ --epoch $epoch \ --avg $avg \ --exp-dir 
transducer_stateless/exp-full \ --bpe-model ./data/lang_bpe_500/bpe.model \ --max-duration 100 \ - --decoding-method beam_search \ + --context-size 2 \ + --decoding-method modified_beam_search \ --beam-size 4 ``` You can find a pretrained model by visiting -<https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-01-10> +<https://huggingface.co/csukuangfj/icefall-asr-librispeech-transducer-stateless-bpe-500-2022-02-07> #### Conformer encoder + LSTM decoder diff --git a/egs/librispeech/ASR/transducer_stateless/pretrained.py b/egs/librispeech/ASR/transducer_stateless/pretrained.py index e5dba8f0e..9b678f025 100755 --- a/egs/librispeech/ASR/transducer_stateless/pretrained.py +++ b/egs/librispeech/ASR/transducer_stateless/pretrained.py @@ -22,10 +22,11 @@ Usage: --checkpoint ./transducer_stateless/exp/pretrained.pt \ --bpe-model ./data/lang_bpe_500/bpe.model \ --method greedy_search \ + --max-sym-per-frame 1 \ /path/to/foo.wav \ /path/to/bar.wav \ -(1) beam search +(2) beam search ./transducer_stateless/pretrained.py \ --checkpoint ./transducer_stateless/exp/pretrained.pt \ --bpe-model ./data/lang_bpe_500/bpe.model \ @@ -34,6 +35,15 @@ Usage: /path/to/foo.wav \ /path/to/bar.wav \ +(3) modified beam search +./transducer_stateless/pretrained.py \ + --checkpoint ./transducer_stateless/exp/pretrained.pt \ + --bpe-model ./data/lang_bpe_500/bpe.model \ + --method modified_beam_search \ + --beam-size 4 \ + /path/to/foo.wav \ + /path/to/bar.wav \ + You can also use `./transducer_stateless/exp/epoch-xx.pt`. 
Note: ./transducer_stateless/exp/pretrained.pt is generated by @@ -50,7 +60,7 @@ import kaldifeat import sentencepiece as spm import torch import torchaudio -from beam_search import beam_search, greedy_search +from beam_search import beam_search, greedy_search, modified_beam_search from conformer import Conformer from decoder import Decoder from joiner import Joiner @@ -90,6 +100,7 @@ def get_parser(): help="""Possible values are: - greedy_search - beam_search + - modified_beam_search """, ) @@ -107,7 +118,7 @@ def get_parser(): "--beam-size", type=int, default=4, - help="Used only when --method is beam_search", + help="Used only when --method is beam_search and modified_beam_search ", ) parser.add_argument( @@ -300,6 +311,10 @@ def main(): hyp = beam_search( model=model, encoder_out=encoder_out_i, beam=params.beam_size ) + elif params.decoding_method == "modified_beam_search": + hyp = modified_beam_search( + model=model, encoder_out=encoder_out_i, beam=params.beam_size + ) else: raise ValueError(f"Unsupported method: {params.method}")