From 1f5216236f277051920073b07c5611e1b193e88e Mon Sep 17 00:00:00 2001 From: yaozengwei Date: Sun, 12 Jun 2022 11:21:43 +0800 Subject: [PATCH] update RESULTS.md --- egs/librispeech/ASR/RESULTS.md | 170 +++++++++++++++++- .../streaming_decode.py | 55 +++++- 2 files changed, 217 insertions(+), 8 deletions(-) diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index 66410ef40..53a63ddea 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -1,5 +1,163 @@ ## Results +### LibriSpeech BPE training results (Pruned Stateless Conv-Emformer RNN-T) + +[conv_emformer_transducer_stateless](./conv_emformer_transducer_stateless) + +It implements [Emformer](https://arxiv.org/abs/2010.10759) augmented with a convolution module for streaming ASR. +It is modified from [torchaudio](https://github.com/pytorch/audio). + +See for more details. + +#### Training on full librispeech + +The WERs are: + +| | test-clean | test-other | comment | decoding mode | +|-------------------------------------|------------|------------|---------------------------------------------|---------------------| +| greedy search (max sym per frame 1) | 3.63 | 9.61 | --epoch 30 --avg 10 | simulated streaming | +| greedy search (max sym per frame 1) | 3.64 | 9.65 | --epoch 30 --avg 10 | streaming | +| fast beam search | 3.61 | 9.4 | --epoch 30 --avg 10 | simulated streaming | +| fast beam search | 3.58 | 9.5 | --epoch 30 --avg 10 | streaming | +| modified beam search | 3.56 | 9.41 | --epoch 30 --avg 10 | simulated streaming | +| modified beam search | 3.54 | 9.46 | --epoch 30 --avg 10 | streaming | + +The training command is: + +```bash +./conv_emformer_transducer_stateless/train.py \ + --world-size 6 \ + --num-epochs 30 \ + --start-epoch 1 \ + --exp-dir conv_emformer_transducer_stateless/exp \ + --full-libri 1 \ + --max-duration 300 \ + --master-port 12321 \ + --num-encoder-layers 12 \ + --chunk-length 32 \ + --cnn-module-kernel 31 \ + --left-context-length 32 \ + 
--right-context-length 8 \ + --memory-size 32 +``` + +The tensorboard log can be found at + + +The simulated streaming decoding command using greedy search is: +```bash +./conv_emformer_transducer_stateless/decode.py \ + --epoch 30 \ + --avg 10 \ + --exp-dir conv_emformer_transducer_stateless/exp \ + --max-duration 300 \ + --num-encoder-layers 12 \ + --chunk-length 32 \ + --cnn-module-kernel 31 \ + --left-context-length 32 \ + --right-context-length 8 \ + --memory-size 32 \ + --decoding-method greedy_search \ + --use-averaged-model True +``` + +The simulated streaming decoding command using fast beam search is: +```bash +./conv_emformer_transducer_stateless/decode.py \ + --epoch 30 \ + --avg 10 \ + --exp-dir conv_emformer_transducer_stateless/exp \ + --max-duration 300 \ + --num-encoder-layers 12 \ + --chunk-length 32 \ + --cnn-module-kernel 31 \ + --left-context-length 32 \ + --right-context-length 8 \ + --memory-size 32 \ + --decoding-method fast_beam_search \ + --use-averaged-model True \ + --beam 4 \ + --max-contexts 4 \ + --max-states 8 +``` + +The simulated streaming decoding command using modified beam search is: +```bash +./conv_emformer_transducer_stateless/decode.py \ + --epoch 30 \ + --avg 10 \ + --exp-dir conv_emformer_transducer_stateless/exp \ + --max-duration 300 \ + --num-encoder-layers 12 \ + --chunk-length 32 \ + --cnn-module-kernel 31 \ + --left-context-length 32 \ + --right-context-length 8 \ + --memory-size 32 \ + --decoding-method modified_beam_search \ + --use-averaged-model True \ + --beam-size 4 +``` + +The streaming decoding command using greedy search is: +```bash +./conv_emformer_transducer_stateless/streaming_decode.py \ + --epoch 30 \ + --avg 10 \ + --exp-dir conv_emformer_transducer_stateless/exp \ + --num-decode-streams 2000 \ + --num-encoder-layers 12 \ + --chunk-length 32 \ + --cnn-module-kernel 31 \ + --left-context-length 32 \ + --right-context-length 8 \ + --memory-size 32 \ + --decoding-method greedy_search \ + 
--use-averaged-model True +``` + +The streaming decoding command using fast beam search is: +```bash +./conv_emformer_transducer_stateless/streaming_decode.py \ + --epoch 30 \ + --avg 10 \ + --exp-dir conv_emformer_transducer_stateless/exp \ + --num-decode-streams 2000 \ + --num-encoder-layers 12 \ + --chunk-length 32 \ + --cnn-module-kernel 31 \ + --left-context-length 32 \ + --right-context-length 8 \ + --memory-size 32 \ + --decoding-method fast_beam_search \ + --use-averaged-model True \ + --beam 4 \ + --max-contexts 4 \ + --max-states 8 +``` + +The streaming decoding command using modified beam search is: +```bash +./conv_emformer_transducer_stateless/streaming_decode.py \ + --epoch 30 \ + --avg 10 \ + --exp-dir conv_emformer_transducer_stateless/exp \ + --num-decode-streams 2000 \ + --num-encoder-layers 12 \ + --chunk-length 32 \ + --cnn-module-kernel 31 \ + --left-context-length 32 \ + --right-context-length 8 \ + --memory-size 32 \ + --decoding-method modified_beam_search \ + --use-averaged-model True \ + --beam-size 4 +``` + +Pretrained models, training logs, decoding logs, and decoding results +are available at + + ### LibriSpeech BPE training results (Pruned Stateless Emformer RNN-T) [pruned_stateless_emformer_rnnt2](./pruned_stateless_emformer_rnnt2) @@ -280,12 +438,12 @@ The WERs are: | | test-clean | test-other | comment | |-------------------------------------|------------|------------|-------------------------------------------------------------------------------| -| greedy search (max sym per frame 1) | 2.75 | 6.74 | --epoch 30 --avg 6 --use_averaged_model False | -| greedy search (max sym per frame 1) | 2.69 | 6.64 | --epoch 30 --avg 6 --use_averaged_model True | -| fast beam search | 2.72 | 6.67 | --epoch 30 --avg 6 --use_averaged_model False | -| fast beam search | 2.66 | 6.6 | --epoch 30 --avg 6 --use_averaged_model True | -| modified beam search | 2.67 | 6.68 | --epoch 30 --avg 6 --use_averaged_model False | -| modified beam search | 2.62 | 
6.57 | --epoch 30 --avg 6 --use_averaged_model True | +| greedy search (max sym per frame 1) | 2.75 | 6.74 | --epoch 30 --avg 6 --use-averaged-model False | +| greedy search (max sym per frame 1) | 2.69 | 6.64 | --epoch 30 --avg 6 --use-averaged-model True | +| fast beam search | 2.72 | 6.67 | --epoch 30 --avg 6 --use-averaged-model False | +| fast beam search | 2.66 | 6.6 | --epoch 30 --avg 6 --use-averaged-model True | +| modified beam search | 2.67 | 6.68 | --epoch 30 --avg 6 --use-averaged-model False | +| modified beam search | 2.62 | 6.57 | --epoch 30 --avg 6 --use-averaged-model True | The training command is: diff --git a/egs/librispeech/ASR/conv_emformer_transducer_stateless/streaming_decode.py b/egs/librispeech/ASR/conv_emformer_transducer_stateless/streaming_decode.py index 52cb4abf5..2b024fa34 100755 --- a/egs/librispeech/ASR/conv_emformer_transducer_stateless/streaming_decode.py +++ b/egs/librispeech/ASR/conv_emformer_transducer_stateless/streaming_decode.py @@ -16,7 +16,57 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+""" +Usage: +(1) greedy search +./conv_emformer_transducer_stateless/streaming_decode.py \ + --epoch 30 \ + --avg 10 \ + --exp-dir conv_emformer_transducer_stateless/exp \ + --num-decode-streams 2000 \ + --num-encoder-layers 12 \ + --chunk-length 32 \ + --cnn-module-kernel 31 \ + --left-context-length 32 \ + --right-context-length 8 \ + --memory-size 32 \ + --decoding-method greedy_search \ + --use-averaged-model True +(2) modified beam search +./conv_emformer_transducer_stateless/streaming_decode.py \ + --epoch 30 \ + --avg 10 \ + --exp-dir conv_emformer_transducer_stateless/exp \ + --num-decode-streams 2000 \ + --num-encoder-layers 12 \ + --chunk-length 32 \ + --cnn-module-kernel 31 \ + --left-context-length 32 \ + --right-context-length 8 \ + --memory-size 32 \ + --decoding-method modified_beam_search \ + --use-averaged-model True \ + --beam-size 4 + +(3) fast beam search +./conv_emformer_transducer_stateless/streaming_decode.py \ + --epoch 30 \ + --avg 10 \ + --exp-dir conv_emformer_transducer_stateless/exp \ + --num-decode-streams 2000 \ + --num-encoder-layers 12 \ + --chunk-length 32 \ + --cnn-module-kernel 31 \ + --left-context-length 32 \ + --right-context-length 8 \ + --memory-size 32 \ + --decoding-method fast_beam_search \ + --use-averaged-model True \ + --beam 4 \ + --max-contexts 4 \ + --max-states 8 +""" import argparse import logging import warnings @@ -686,8 +736,9 @@ def decode_dataset( ) del streams[i] - key = "greedy_search" - if params.decoding_method == "fast_beam_search": + if params.decoding_method == "greedy_search": + key = "greedy_search" + elif params.decoding_method == "fast_beam_search": key = ( f"beam_{params.beam}_" f"max_contexts_{params.max_contexts}_"