From cc0d4ffa4f115e6345274c203716e935b369cb77 Mon Sep 17 00:00:00 2001 From: pkufool Date: Mon, 11 Apr 2022 15:27:24 +0800 Subject: [PATCH 1/8] Add mix precision support --- egs/librispeech/ASR/pruned_transducer_stateless2/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py index d08fa15b5..c78a0f1c3 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py @@ -870,7 +870,7 @@ def run(rank, world_size, args): valid_cuts += librispeech.dev_other_cuts() valid_dl = librispeech.valid_dataloaders(valid_cuts) - if not params.print_diagnostics: + if not params.print_diagnostics and not params.use_fp16: scan_pessimistic_batches_for_oom( model=model, train_dl=train_dl, From ddd8f9e15ef33aa86f2b3f52278d75cdbe0138de Mon Sep 17 00:00:00 2001 From: pkufool Date: Mon, 11 Apr 2022 15:40:14 +0800 Subject: [PATCH 2/8] Minor fixes --- egs/librispeech/ASR/pruned_transducer_stateless2/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py index c78a0f1c3..31b85d53c 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py @@ -39,7 +39,6 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" --exp-dir pruned_transducer_stateless2/exp \ --full-libri 1 \ --max-duration 550 - """ From a92133ef960a43d3e9f4834594acad2051c9aa22 Mon Sep 17 00:00:00 2001 From: pkufool Date: Mon, 11 Apr 2022 15:41:45 +0800 Subject: [PATCH 3/8] Minor fixes --- egs/librispeech/ASR/pruned_transducer_stateless2/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py index 31b85d53c..577231995 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py @@ -869,7 +869,7 @@ def run(rank, world_size, args): valid_cuts += librispeech.dev_other_cuts() valid_dl = librispeech.valid_dataloaders(valid_cuts) - if not params.print_diagnostics and not params.use_fp16: + if not params.print_diagnostics: scan_pessimistic_batches_for_oom( model=model, train_dl=train_dl, From e8eb0b94d912c08afd9adce3675091f05baf3cf0 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 11 Apr 2022 20:56:11 +0800 Subject: [PATCH 4/8] Updating RESULTS.md; fix in beam_search.py --- egs/librispeech/ASR/README.md | 16 ++-- egs/librispeech/ASR/RESULTS.md | 78 +++++++++++++++++++ .../beam_search.py | 2 +- 3 files changed, 88 insertions(+), 8 deletions(-) diff --git a/egs/librispeech/ASR/README.md b/egs/librispeech/ASR/README.md index a7b2e2c3b..b3e90a052 100644 --- a/egs/librispeech/ASR/README.md +++ b/egs/librispeech/ASR/README.md @@ -9,13 +9,15 @@ for how to run models in this recipe. There are various folders containing the name `transducer` in this folder. The following table lists the differences among them. 
-| | Encoder | Decoder | Comment | -|---------------------------------------|-----------|--------------------|---------------------------------------------------| -| `transducer` | Conformer | LSTM | | -| `transducer_stateless` | Conformer | Embedding + Conv1d | | -| `transducer_lstm` | LSTM | LSTM | | -| `transducer_stateless_multi_datasets` | Conformer | Embedding + Conv1d | Using data from GigaSpeech as extra training data | -| `pruned_transducer_stateless` | Conformer | Embedding + Conv1d | Using k2 pruned RNN-T loss | +| | Encoder | Decoder | Comment | +|---------------------------------------|---------------------|--------------------|---------------------------------------------------| +| `transducer` | Conformer | LSTM | | +| `transducer_stateless` | Conformer | Embedding + Conv1d | | +| `transducer_lstm` | LSTM | LSTM | | +| `transducer_stateless_multi_datasets` | Conformer | Embedding + Conv1d | Using data from GigaSpeech as extra training data | +| `pruned_transducer_stateless` | Conformer | Embedding + Conv1d | Using k2 pruned RNN-T loss | +| `pruned_transducer_stateless2` | Conformer(modified) | Embedding + Conv1d | Using k2 pruned RNN-T loss | + The decoder in `transducer_stateless` is modified from the paper [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/). diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index 6dbc659f7..ce90da356 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -1,5 +1,79 @@ ## Results +### LibriSpeech BPE training results (Pruned Transducer 2) + +This is with a reworked version of the conformer encoder, with many changes. + +[pruned_transducer_stateless2](./pruned_transducer_stateless2) + +using commit `34aad74a2c849542dd5f6359c9e6b527e8782fd6`. +See + +The WERs are: + +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|-------------------------------------------------------------------------------| +| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 25, --avg 8, --max-duration 600 | +| fast beam search | 2.61 | 6.17 | --epoch 25, --avg 8, --max-duration 600 --decoding-method fast_beam_search | +| modified beam search | 2.59 | 6.19 | --epoch 25, --avg 8, --max-duration 600 --decoding-method modified_beam_search| + + +The train and decode commands are: +`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp --world-size 8 --num-epochs 26 --full-libri 1 --max-duration 300` +and: +`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp --epoch 25 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600` + +The Tensorboard log is at + + +The WERs for librispeech 100 hours are: + +Trained with one job: +`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_ws1 --world-size 1 --num-epochs 40 --full-libri 0 --max-duration 300` +and decoded with: +`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp_100h_ws1 --epoch 19 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600`. + +The Tensorboard log is at (learning rate +schedule is not visible due to a since-fixed bug). 
+ +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|-------------------------------------------------------| +| greedy search (max sym per frame 1) | 7.12 | 18.42 | --epoch 19 --avg 8 | +| greedy search (max sym per frame 1) | 6.71 | 17.77 | --epoch 29 --avg 8 | +| fast beam search | 6.58 | 17.27 | --epoch 19 --avg 8 --decoding-method fast_beam_search | + +Trained with two jobs: +`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_ws2 --world-size 2 --num-epochs 40 --full-libri 0 --max-duration 300` +and decoded with: +`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp_100h_ws2 --epoch 19 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600`. + +The Tensorboard log is at +(learning rate schedule is not visible due to a since-fixed bug). + +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|-----------------------| +| greedy search (max sym per frame 1) | 7.05 | 18.77 | --epoch 19, --avg 8 | +| greedy search (max sym per frame 1) | 6.82 | 18.14 | --epoch 29, --avg 8 | +| greedy search (max sym per frame 1) | 6.81 | 17.66 | --epoch 30, --avg 10 | + + +Trained with 4 jobs: +`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_ws4 --world-size 4 --num-epochs 40 --full-libri 0 --max-duration 300` +and decoded with: +`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp_100h_ws4 --epoch 19 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600`. + + +The Tensorboard log is at +(learning rate schedule is not visible due to a since-fixed bug). + +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|-----------------------| +| greedy search (max sym per frame 1) | 7.31 | 19.55 | --epoch 19, --avg 8 | +| greedy search (max sym per frame 1) | 7.08 | 18.59 | --epoch 29, --avg 8 | +| greedy search (max sym per frame 1) | 6.86 | 18.29 | --epoch 30, --avg 10 | + + + ### LibriSpeech BPE training results (Pruned Transducer) Conformer encoder + non-current decoder. The decoder @@ -23,6 +97,10 @@ The WERs are: | modified beam search (beam size 4) | 2.56 | 6.27 | --epoch 42, --avg 11, --max-duration 100 | | beam search (beam size 4) | 2.57 | 6.27 | --epoch 42, --avg 11, --max-duration 100 | + + + + The decoding time for `test-clean` and `test-other` is given below: (A V100 GPU with 32 GB RAM is used for decoding. Note: Not all GPU RAM is used during decoding.) 
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py index 5876d5158..d0e5c083f 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py @@ -89,7 +89,7 @@ def fast_beam_search( # (shape.NumElements(), 1, joiner_dim) # fmt: off current_encoder_out = torch.index_select( - encoder_out[:, t:t + 1, :], 0, shape.row_ids(1) + encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64) ) # fmt: on logits = model.joiner( From ead822477c3c51190b78a61c48cb29ff8c198cba Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 11 Apr 2022 21:01:13 +0800 Subject: [PATCH 5/8] Fix rebase --- egs/librispeech/ASR/pruned_transducer_stateless2/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py index 577231995..d08fa15b5 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py @@ -39,6 +39,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" --exp-dir pruned_transducer_stateless2/exp \ --full-libri 1 \ --max-duration 550 + """ From 65818d16ded697d6b11c65addc002ac5faae2eaf Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 12 Apr 2022 11:48:16 +0800 Subject: [PATCH 6/8] Add more results --- egs/librispeech/ASR/RESULTS.md | 71 +++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index ce90da356..645e24fdc 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -13,9 +13,15 @@ The WERs are: | | test-clean | test-other | comment | |-------------------------------------|------------|------------|-------------------------------------------------------------------------------| -| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 25, --avg 8, --max-duration 600 | -| fast beam search | 2.61 | 6.17 | --epoch 25, --avg 8, --max-duration 600 --decoding-method fast_beam_search | -| modified beam search | 2.59 | 6.19 | --epoch 25, --avg 8, --max-duration 600 --decoding-method modified_beam_search| +| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 25 --avg 8 --max-duration 600 | +| fast beam search | 2.61 | 6.17 | --epoch 25 --avg 8 --max-duration 600 --decoding-method fast_beam_search | +| modified beam search | 2.59 | 6.19 | --epoch 25 --avg 8 --max-duration 600 --decoding-method modified_beam_search| +| greedy search (max sym per frame 1) | 2.70 | 6.04 | --epoch 34 --avg 10 --max-duration 600 | +| fast beam search | 2.66 | 6.00 | --epoch 34 --avg 10 --max-duration 600 --decoding-method fast_beam_search | +| greedy search (max sym per frame 1) | 2.60 | 6.06 | --epoch 37 --avg 10 --max-duration 600 | +| fast beam search | 2.62 | 5.97 | --epoch 37 --avg 10 --max-duration 600 --decoding-method fast_beam_search | + + The train and decode commands are: @@ -23,7 +29,8 @@ The train and decode commands are: and: `python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp --epoch 25 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600` -The Tensorboard log is at +The Tensorboard log is at (apologies, log starts +only from epoch 3). The WERs for librispeech 100 hours are: @@ -40,7 +47,9 @@ schedule is not visible due to a since-fixed bug). 
|-------------------------------------|------------|------------|-------------------------------------------------------| | greedy search (max sym per frame 1) | 7.12 | 18.42 | --epoch 19 --avg 8 | | greedy search (max sym per frame 1) | 6.71 | 17.77 | --epoch 29 --avg 8 | -| fast beam search | 6.58 | 17.27 | --epoch 19 --avg 8 --decoding-method fast_beam_search | +| greedy search (max sym per frame 1) | 6.64 | 17.19 | --epoch 39 --avg 10 | +| fast beam search | 6.58 | 17.27 | --epoch 29 --avg 8 --decoding-method fast_beam_search | +| fast beam search | 6.53 | 16.82 | --epoch 39 --avg 10 --decoding-method fast_beam_search | Trained with two jobs: `python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_ws2 --world-size 2 --num-epochs 40 --full-libri 0 --max-duration 300` @@ -52,9 +61,9 @@ The Tensorboard log is at . +Train command was +`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_fp16 --world-size 1 --num-epochs 40 --full-libri 0 --max-duration 500 --use-fp16 True` + +The Tensorboard log is at + +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|-----------------------| +| greedy search (max sym per frame 1) | 7.10 | 18.79 | --epoch 19 --avg 8 | +| greedy search (max sym per frame 1) | 6.92 | 18.16 | --epoch 29 --avg 8 | +| greedy search (max sym per frame 1) | 6.89 | 17.75 | --epoch 30 --avg 10 | + +https://tensorboard.dev/experiment/Km7QBHYnSLWs4qQnAJWsaA/ @@ -91,11 +116,11 @@ The WERs are: | | test-clean | test-other | comment | |-------------------------------------|------------|------------|------------------------------------------| -| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 42, --avg 11, --max-duration 100 | -| greedy search (max sym per frame 2) | 2.62 | 6.37 | --epoch 42, --avg 11, --max-duration 100 | -| greedy search (max sym per frame 3) | 2.62 | 6.37 | --epoch 42, --avg 11, --max-duration 100 | -| modified beam search (beam size 4) | 2.56 | 6.27 | --epoch 42, --avg 11, --max-duration 100 | -| beam search (beam size 4) | 2.57 | 6.27 | --epoch 42, --avg 11, --max-duration 100 | +| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 42 --avg 11 --max-duration 100 | +| greedy search (max sym per frame 2) | 2.62 | 6.37 | --epoch 42 --avg 11 --max-duration 100 | +| greedy search (max sym per frame 3) | 2.62 | 6.37 | --epoch 42 --avg 11 --max-duration 100 | +| modified beam search (beam size 4) | 2.56 | 6.27 | --epoch 42 --avg 11 --max-duration 100 | +| beam search (beam size 4) | 2.57 | 6.27 | --epoch 42 --avg 11 --max-duration 100 | @@ -189,7 +214,7 @@ The WERs are | | test-clean | test-other | comment | |---------------------------|------------|------------|------------------------------------------| -| greedy search | 2.85 | 6.98 | --epoch 28, --avg 15, --max-duration 100 | +| greedy search | 2.85 | 6.98 | --epoch 28 --avg 15 --max-duration 100 | The training command for reproducing is given below: @@ -249,8 +274,8 @@ The WERs are | | test-clean | test-other | comment | |-------------------------------------|------------|------------|------------------------------------------| -| greedy search (max sym per frame 1) | 2.64 | 6.55 | --epoch 39, --avg 15, --max-duration 100 | -| modified beam search (beam size 4) | 2.61 | 6.46 | --epoch 39, --avg 15, --max-duration 100 | +| greedy search (max sym per frame 1) | 2.64 | 6.55 | --epoch 39 --avg 15 --max-duration 100 | +| modified beam search (beam size 4) | 2.61 
| 6.46 | --epoch 39 --avg 15 --max-duration 100 | The training command for reproducing is given below: @@ -319,10 +344,10 @@ The WERs are | | test-clean | test-other | comment | |-------------------------------------|------------|------------|------------------------------------------| -| greedy search (max sym per frame 1) | 2.67 | 6.67 | --epoch 63, --avg 19, --max-duration 100 | -| greedy search (max sym per frame 2) | 2.67 | 6.67 | --epoch 63, --avg 19, --max-duration 100 | -| greedy search (max sym per frame 3) | 2.67 | 6.67 | --epoch 63, --avg 19, --max-duration 100 | -| modified beam search (beam size 4) | 2.67 | 6.57 | --epoch 63, --avg 19, --max-duration 100 | +| greedy search (max sym per frame 1) | 2.67 | 6.67 | --epoch 63 --avg 19 --max-duration 100 | +| greedy search (max sym per frame 2) | 2.67 | 6.67 | --epoch 63 --avg 19 --max-duration 100 | +| greedy search (max sym per frame 3) | 2.67 | 6.67 | --epoch 63 --avg 19 --max-duration 100 | +| modified beam search (beam size 4) | 2.67 | 6.57 | --epoch 63 --avg 19 --max-duration 100 | The training command for reproducing is given below: From d0a53aad487ff24dc1fca256346cc3350239cfff Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 12 Apr 2022 11:51:15 +0800 Subject: [PATCH 7/8] Fix tensorboard log location --- egs/librispeech/ASR/RESULTS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index 645e24fdc..9f47ac495 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -87,7 +87,7 @@ floats and max-duration increased from 300 to 500, after merging +The Tensorboard log is at | | test-clean | test-other | comment | |-------------------------------------|------------|------------|-----------------------| @@ -95,7 +95,6 @@ The Tensorboard log is at Date: Tue, 12 Apr 2022 12:20:10 +0800 Subject: [PATCH 8/8] Add one more epoch of full expt --- egs/librispeech/ASR/RESULTS.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index 9f47ac495..01637beb1 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -14,12 +14,12 @@ The WERs are: | | test-clean | test-other | comment | |-------------------------------------|------------|------------|-------------------------------------------------------------------------------| | greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 25 --avg 8 --max-duration 600 | -| fast beam search | 2.61 | 6.17 | --epoch 25 --avg 8 --max-duration 600 --decoding-method fast_beam_search | -| modified beam search | 2.59 | 6.19 | --epoch 25 --avg 8 --max-duration 600 --decoding-method modified_beam_search| +| fast beam search | 2.61 | 6.17 | --epoch 25 --avg 8 --max-duration 600 --decoding-method fast_beam_search | +| modified beam search | 2.59 | 6.19 | --epoch 25 --avg 8 --max-duration 600 --decoding-method modified_beam_search | | greedy search (max sym per frame 1) | 2.70 | 6.04 | --epoch 34 --avg 10 --max-duration 600 | -| fast beam search | 2.66 | 6.00 | --epoch 34 --avg 10 --max-duration 600 --decoding-method fast_beam_search | -| greedy search (max sym per frame 1) | 2.60 | 6.06 | --epoch 37 --avg 10 --max-duration 600 | -| fast beam search | 2.62 | 5.97 | --epoch 37 --avg 10 --max-duration 600 --decoding-method fast_beam_search | +| fast beam search | 2.66 | 6.00 | --epoch 34 --avg 10 --max-duration 600 --decoding-method fast_beam_search | +| greedy search (max sym 
per frame 1) | 2.62 | 6.03 | --epoch 38 --avg 10 --max-duration 600 | +| fast beam search | 2.57 | 5.95 | --epoch 38 --avg 10 --max-duration 600 --decoding-method fast_beam_search |
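
For readers who want the background behind "Add mix precision support" ([PATCH 1/8]) and the `--use-fp16 True` runs reported in [PATCH 6/8]: the snippet below is a minimal, generic sketch of how such a flag is usually wired into a PyTorch training loop with `torch.cuda.amp` (autocast plus a gradient scaler). It is not the actual implementation in `pruned_transducer_stateless2/train.py`; the `compute_loss` helper is a placeholder invented for this example.

```python
# Generic sketch of mixed-precision (fp16) training with torch.cuda.amp.
# NOT taken from icefall; `compute_loss` is a placeholder for this example.
import torch
from torch.cuda.amp import GradScaler, autocast


def train_one_epoch(model, optimizer, train_dl, device, use_fp16: bool = False):
    # GradScaler is a no-op when enabled=False, so one loop covers fp32 and fp16.
    scaler = GradScaler(enabled=use_fp16)
    for batch in train_dl:
        optimizer.zero_grad()
        # Run the forward pass (and loss) in half precision when requested.
        with autocast(enabled=use_fp16):
            loss = compute_loss(model, batch, device)
        # Scale the loss so small fp16 gradients do not underflow; the scaler
        # unscales them before the optimizer step and adapts the scale over time.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()


def compute_loss(model, batch, device):
    # Placeholder: a real recipe would compute its transducer loss here.
    features, targets = batch
    return model(features.to(device), targets.to(device))
```

This pattern also gives context for [PATCH 1/8] and [PATCH 3/8], which briefly made the `scan_pessimistic_batches_for_oom` pre-check conditional on `params.use_fp16` and then reverted that change, presumably because peak memory under autocast differs from fp32 training.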