From cc0d4ffa4f115e6345274c203716e935b369cb77 Mon Sep 17 00:00:00 2001 From: pkufool Date: Mon, 11 Apr 2022 15:27:24 +0800 Subject: [PATCH 01/11] Add mix precision support --- egs/librispeech/ASR/pruned_transducer_stateless2/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py index d08fa15b5..c78a0f1c3 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py @@ -870,7 +870,7 @@ def run(rank, world_size, args): valid_cuts += librispeech.dev_other_cuts() valid_dl = librispeech.valid_dataloaders(valid_cuts) - if not params.print_diagnostics: + if not params.print_diagnostics and not params.use_fp16: scan_pessimistic_batches_for_oom( model=model, train_dl=train_dl, From ddd8f9e15ef33aa86f2b3f52278d75cdbe0138de Mon Sep 17 00:00:00 2001 From: pkufool Date: Mon, 11 Apr 2022 15:40:14 +0800 Subject: [PATCH 02/11] Minor fixes --- egs/librispeech/ASR/pruned_transducer_stateless2/train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py index c78a0f1c3..31b85d53c 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py @@ -39,7 +39,6 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" --exp-dir pruned_transducer_stateless2/exp \ --full-libri 1 \ --max-duration 550 - """ From a92133ef960a43d3e9f4834594acad2051c9aa22 Mon Sep 17 00:00:00 2001 From: pkufool Date: Mon, 11 Apr 2022 15:41:45 +0800 Subject: [PATCH 03/11] Minor fixes --- egs/librispeech/ASR/pruned_transducer_stateless2/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py index 31b85d53c..577231995 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py @@ -869,7 +869,7 @@ def run(rank, world_size, args): valid_cuts += librispeech.dev_other_cuts() valid_dl = librispeech.valid_dataloaders(valid_cuts) - if not params.print_diagnostics and not params.use_fp16: + if not params.print_diagnostics: scan_pessimistic_batches_for_oom( model=model, train_dl=train_dl, From e8eb0b94d912c08afd9adce3675091f05baf3cf0 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 11 Apr 2022 20:56:11 +0800 Subject: [PATCH 04/11] Updating RESULTS.md; fix in beam_search.py --- egs/librispeech/ASR/README.md | 16 ++-- egs/librispeech/ASR/RESULTS.md | 78 +++++++++++++++++++ .../beam_search.py | 2 +- 3 files changed, 88 insertions(+), 8 deletions(-) diff --git a/egs/librispeech/ASR/README.md b/egs/librispeech/ASR/README.md index a7b2e2c3b..b3e90a052 100644 --- a/egs/librispeech/ASR/README.md +++ b/egs/librispeech/ASR/README.md @@ -9,13 +9,15 @@ for how to run models in this recipe. There are various folders containing the name `transducer` in this folder. The following table lists the differences among them. -| | Encoder | Decoder | Comment | -|---------------------------------------|-----------|--------------------|---------------------------------------------------| -| `transducer` | Conformer | LSTM | | -| `transducer_stateless` | Conformer | Embedding + Conv1d | | -| `transducer_lstm` | LSTM | LSTM | | -| `transducer_stateless_multi_datasets` | Conformer | Embedding + Conv1d | Using data from GigaSpeech as extra training data | -| `pruned_transducer_stateless` | Conformer | Embedding + Conv1d | Using k2 pruned RNN-T loss | +| | Encoder | Decoder | Comment | +|---------------------------------------|---------------------|--------------------|---------------------------------------------------| +| `transducer` | Conformer | LSTM | | +| `transducer_stateless` | Conformer | Embedding + Conv1d | | +| `transducer_lstm` | LSTM | LSTM | | +| `transducer_stateless_multi_datasets` | Conformer | Embedding + Conv1d | Using data from GigaSpeech as extra training data | +| `pruned_transducer_stateless` | Conformer | Embedding + Conv1d | Using k2 pruned RNN-T loss | +| `pruned_transducer_stateless2` | Conformer(modified) | Embedding + Conv1d | Using k2 pruned RNN-T loss | + The decoder in `transducer_stateless` is modified from the paper [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/). diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index 6dbc659f7..ce90da356 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -1,5 +1,79 @@ ## Results +### LibriSpeech BPE training results (Pruned Transducer 2) + +This is with a reworked version of the conformer encoder, with many changes. + +[pruned_transducer_stateless2](./pruned_transducer_stateless2) + +using commit `34aad74a2c849542dd5f6359c9e6b527e8782fd6`. +See + +The WERs are: + +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|-------------------------------------------------------------------------------| +| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 25, --avg 8, --max-duration 600 | +| fast beam search | 2.61 | 6.17 | --epoch 25, --avg 8, --max-duration 600 --decoding-method fast_beam_search | +| modified beam search | 2.59 | 6.19 | --epoch 25, --avg 8, --max-duration 600 --decoding-method modified_beam_search| + + +The train and decode commands are: +`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp --world-size 8 --num-epochs 26 --full-libri 1 --max-duration 300` +and: +`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp --epoch 25 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600` + +The Tensorboard log is at + + +The WERs for librispeech 100 hours are: + +Trained with one job: +`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_ws1 --world-size 1 --num-epochs 40 --full-libri 0 --max-duration 300` +and decoded with: +`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp_100h_ws1 --epoch 19 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600`. + +The Tensorboard log is at (learning rate +schedule is not visible due to a since-fixed bug). + +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|-------------------------------------------------------| +| greedy search (max sym per frame 1) | 7.12 | 18.42 | --epoch 19 --avg 8 | +| greedy search (max sym per frame 1) | 6.71 | 17.77 | --epoch 29 --avg 8 | +| fast beam search | 6.58 | 17.27 | --epoch 19 --avg 8 --decoding-method fast_beam_search | + +Trained with two jobs: +`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_ws2 --world-size 2 --num-epochs 40 --full-libri 0 --max-duration 300` +and decoded with: +`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp_100h_ws2 --epoch 19 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600`. + +The Tensorboard log is at +(learning rate schedule is not visible due to a since-fixed bug). + +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|-----------------------| +| greedy search (max sym per frame 1) | 7.05 | 18.77 | --epoch 19, --avg 8 | +| greedy search (max sym per frame 1) | 6.82 | 18.14 | --epoch 29, --avg 8 | +| greedy search (max sym per frame 1) | 6.81 | 17.66 | --epoch 30, --avg 10 | + + +Trained with 4 jobs: +`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_ws4 --world-size 4 --num-epochs 40 --full-libri 0 --max-duration 300` +and decoded with: +`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp_100h_ws4 --epoch 19 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600`. + + +The Tensorboard log is at +(learning rate schedule is not visible due to a since-fixed bug). + +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|-----------------------| +| greedy search (max sym per frame 1) | 7.31 | 19.55 | --epoch 19, --avg 8 | +| greedy search (max sym per frame 1) | 7.08 | 18.59 | --epoch 29, --avg 8 | +| greedy search (max sym per frame 1) | 6.86 | 18.29 | --epoch 30, --avg 10 | + + + ### LibriSpeech BPE training results (Pruned Transducer) Conformer encoder + non-current decoder. The decoder @@ -23,6 +97,10 @@ The WERs are: | modified beam search (beam size 4) | 2.56 | 6.27 | --epoch 42, --avg 11, --max-duration 100 | | beam search (beam size 4) | 2.57 | 6.27 | --epoch 42, --avg 11, --max-duration 100 | + + + + The decoding time for `test-clean` and `test-other` is given below: (A V100 GPU with 32 GB RAM is used for decoding. Note: Not all GPU RAM is used during decoding.) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py index 5876d5158..d0e5c083f 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py @@ -89,7 +89,7 @@ def fast_beam_search( # (shape.NumElements(), 1, joiner_dim) # fmt: off current_encoder_out = torch.index_select( - encoder_out[:, t:t + 1, :], 0, shape.row_ids(1) + encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64) ) # fmt: on logits = model.joiner( From ead822477c3c51190b78a61c48cb29ff8c198cba Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 11 Apr 2022 21:01:13 +0800 Subject: [PATCH 05/11] Fix rebase --- egs/librispeech/ASR/pruned_transducer_stateless2/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py index 577231995..d08fa15b5 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py @@ -39,6 +39,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3" --exp-dir pruned_transducer_stateless2/exp \ --full-libri 1 \ --max-duration 550 + """ From bdeff338c2245b3980d0385ffa37e4468aa6a02e Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 12 Apr 2022 09:09:56 +0800 Subject: [PATCH 06/11] Fix CI errors. (#310) --- egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py b/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py index b0eb4d749..3cc472974 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py @@ -174,6 +174,7 @@ def main(): # is defined in local/train_bpe_model.py params.blank_id = sp.piece_to_id("") + params.unk_id = sp.piece_to_id("") params.vocab_size = sp.get_piece_size() logging.info(f"{params}") From 65818d16ded697d6b11c65addc002ac5faae2eaf Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 12 Apr 2022 11:48:16 +0800 Subject: [PATCH 07/11] Add more results --- egs/librispeech/ASR/RESULTS.md | 71 +++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index ce90da356..645e24fdc 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -13,9 +13,15 @@ The WERs are: | | test-clean | test-other | comment | |-------------------------------------|------------|------------|-------------------------------------------------------------------------------| -| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 25, --avg 8, --max-duration 600 | -| fast beam search | 2.61 | 6.17 | --epoch 25, --avg 8, --max-duration 600 --decoding-method fast_beam_search | -| modified beam search | 2.59 | 6.19 | --epoch 25, --avg 8, --max-duration 600 --decoding-method modified_beam_search| +| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 25 --avg 8 --max-duration 600 | +| fast beam search | 2.61 | 6.17 | --epoch 25 --avg 8 --max-duration 600 --decoding-method fast_beam_search | +| modified beam search | 2.59 | 6.19 | --epoch 25 --avg 8 --max-duration 600 --decoding-method modified_beam_search| +| greedy search (max sym per frame 1) | 2.70 | 6.04 | --epoch 34 --avg 10 --max-duration 600 | +| fast beam search | 2.66 | 6.00 | --epoch 34 --avg 10 --max-duration 600 --decoding-method fast_beam_search | +| greedy search (max sym per frame 1) | 2.60 | 6.06 | --epoch 37 --avg 10 --max-duration 600 | +| fast beam search | 2.62 | 5.97 | --epoch 37 --avg 10 --max-duration 600 --decoding-method fast_beam_search | + + The train and decode commands are: @@ -23,7 +29,8 @@ The train and decode commands are: and: `python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp --epoch 25 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600` -The Tensorboard log is at +The Tensorboard log is at (apologies, log starts +only from epoch 3). The WERs for librispeech 100 hours are: @@ -40,7 +47,9 @@ schedule is not visible due to a since-fixed bug). |-------------------------------------|------------|------------|-------------------------------------------------------| | greedy search (max sym per frame 1) | 7.12 | 18.42 | --epoch 19 --avg 8 | | greedy search (max sym per frame 1) | 6.71 | 17.77 | --epoch 29 --avg 8 | -| fast beam search | 6.58 | 17.27 | --epoch 19 --avg 8 --decoding-method fast_beam_search | +| greedy search (max sym per frame 1) | 6.64 | 17.19 | --epoch 39 --avg 10 | +| fast beam search | 6.58 | 17.27 | --epoch 29 --avg 8 --decoding-method fast_beam_search | +| fast beam search | 6.53 | 16.82 | --epoch 39 --avg 10 --decoding-method fast_beam_search | Trained with two jobs: `python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_ws2 --world-size 2 --num-epochs 40 --full-libri 0 --max-duration 300` @@ -52,9 +61,9 @@ The Tensorboard log is at . +Train command was +`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_fp16 --world-size 1 --num-epochs 40 --full-libri 0 --max-duration 500 --use-fp16 True` + +The Tensorboard log is at + +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|-----------------------| +| greedy search (max sym per frame 1) | 7.10 | 18.79 | --epoch 19 --avg 8 | +| greedy search (max sym per frame 1) | 6.92 | 18.16 | --epoch 29 --avg 8 | +| greedy search (max sym per frame 1) | 6.89 | 17.75 | --epoch 30 --avg 10 | + +https://tensorboard.dev/experiment/Km7QBHYnSLWs4qQnAJWsaA/ @@ -91,11 +116,11 @@ The WERs are: | | test-clean | test-other | comment | |-------------------------------------|------------|------------|------------------------------------------| -| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 42, --avg 11, --max-duration 100 | -| greedy search (max sym per frame 2) | 2.62 | 6.37 | --epoch 42, --avg 11, --max-duration 100 | -| greedy search (max sym per frame 3) | 2.62 | 6.37 | --epoch 42, --avg 11, --max-duration 100 | -| modified beam search (beam size 4) | 2.56 | 6.27 | --epoch 42, --avg 11, --max-duration 100 | -| beam search (beam size 4) | 2.57 | 6.27 | --epoch 42, --avg 11, --max-duration 100 | +| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 42 --avg 11 --max-duration 100 | +| greedy search (max sym per frame 2) | 2.62 | 6.37 | --epoch 42 --avg 11 --max-duration 100 | +| greedy search (max sym per frame 3) | 2.62 | 6.37 | --epoch 42 --avg 11 --max-duration 100 | +| modified beam search (beam size 4) | 2.56 | 6.27 | --epoch 42 --avg 11 --max-duration 100 | +| beam search (beam size 4) | 2.57 | 6.27 | --epoch 42 --avg 11 --max-duration 100 | @@ -189,7 +214,7 @@ The WERs are | | test-clean | test-other | comment | |---------------------------|------------|------------|------------------------------------------| -| greedy search | 2.85 | 6.98 | --epoch 28, --avg 15, --max-duration 100 | +| greedy search | 2.85 | 6.98 | --epoch 28 --avg 15 --max-duration 100 | The training command for reproducing is given below: @@ -249,8 +274,8 @@ The WERs are | | test-clean | test-other | comment | |-------------------------------------|------------|------------|------------------------------------------| -| greedy search (max sym per frame 1) | 2.64 | 6.55 | --epoch 39, --avg 15, --max-duration 100 | -| modified beam search (beam size 4) | 2.61 | 6.46 | --epoch 39, --avg 15, --max-duration 100 | +| greedy search (max sym per frame 1) | 2.64 | 6.55 | --epoch 39 --avg 15 --max-duration 100 | +| modified beam search (beam size 4) | 2.61 | 6.46 | --epoch 39 --avg 15 --max-duration 100 | The training command for reproducing is given below: @@ -319,10 +344,10 @@ The WERs are | | test-clean | test-other | comment | |-------------------------------------|------------|------------|------------------------------------------| -| greedy search (max sym per frame 1) | 2.67 | 6.67 | --epoch 63, --avg 19, --max-duration 100 | -| greedy search (max sym per frame 2) | 2.67 | 6.67 | --epoch 63, --avg 19, --max-duration 100 | -| greedy search (max sym per frame 3) | 2.67 | 6.67 | --epoch 63, --avg 19, --max-duration 100 | -| modified beam search (beam size 4) | 2.67 | 6.57 | --epoch 63, --avg 19, --max-duration 100 | +| greedy search (max sym per frame 1) | 2.67 | 6.67 | --epoch 63 --avg 19 --max-duration 100 | +| greedy search (max sym per frame 2) | 2.67 | 6.67 | --epoch 63 --avg 19 --max-duration 100 | +| greedy search (max sym per frame 3) | 2.67 | 6.67 | --epoch 63 --avg 19 --max-duration 100 | +| modified beam search (beam size 4) | 2.67 | 6.57 | --epoch 63 --avg 19 --max-duration 100 | The training command for reproducing is given below: From d0a53aad487ff24dc1fca256346cc3350239cfff Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 12 Apr 2022 11:51:15 +0800 Subject: [PATCH 08/11] Fix tensorboard log location --- egs/librispeech/ASR/RESULTS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index 645e24fdc..9f47ac495 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -87,7 +87,7 @@ floats and max-duration increased from 300 to 500, after merging +The Tensorboard log is at | | test-clean | test-other | comment | |-------------------------------------|------------|------------|-----------------------| @@ -95,7 +95,6 @@ The Tensorboard log is at Date: Tue, 12 Apr 2022 12:20:10 +0800 Subject: [PATCH 09/11] Add one more epoch of full expt --- egs/librispeech/ASR/RESULTS.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index 9f47ac495..01637beb1 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -14,12 +14,12 @@ The WERs are: | | test-clean | test-other | comment | |-------------------------------------|------------|------------|-------------------------------------------------------------------------------| | greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 25 --avg 8 --max-duration 600 | -| fast beam search | 2.61 | 6.17 | --epoch 25 --avg 8 --max-duration 600 --decoding-method fast_beam_search | -| modified beam search | 2.59 | 6.19 | --epoch 25 --avg 8 --max-duration 600 --decoding-method modified_beam_search| +| fast beam search | 2.61 | 6.17 | --epoch 25 --avg 8 --max-duration 600 --decoding-method fast_beam_search | +| modified beam search | 2.59 | 6.19 | --epoch 25 --avg 8 --max-duration 600 --decoding-method modified_beam_search | | greedy search (max sym per frame 1) | 2.70 | 6.04 | --epoch 34 --avg 10 --max-duration 600 | -| fast beam search | 2.66 | 6.00 | --epoch 34 --avg 10 --max-duration 600 --decoding-method fast_beam_search | -| greedy search (max sym per frame 1) | 2.60 | 6.06 | --epoch 37 --avg 10 --max-duration 600 | -| fast beam search | 2.62 | 5.97 | --epoch 37 --avg 10 --max-duration 600 --decoding-method fast_beam_search | +| fast beam search | 2.66 | 6.00 | --epoch 34 --avg 10 --max-duration 600 --decoding-method fast_beam_search | +| greedy search (max sym per frame 1) | 2.62 | 6.03 | --epoch 38 --avg 10 --max-duration 600 | +| fast beam search | 2.57 | 5.95 | --epoch 38 --avg 10 --max-duration 600 --decoding-method fast_beam_search | From 78418ac37cdcfa1c4d0f54fe77901f74644ff96a Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Wed, 13 Apr 2022 13:09:24 +0800 Subject: [PATCH 10/11] fix comments --- egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py | 4 ++-- icefall/diagnostics.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py index d59aa2160..f89d2963e 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py @@ -367,7 +367,7 @@ class ActivationBalancer(torch.nn.Module): min_positive: the minimum, per channel, of the proportion of the time that (x > 0), below which we start to modify the derivatives. max_positive: the maximum, per channel, of the proportion of the time - that (x > 0), below which we start to modify the derivatives. + that (x > 0), above which we start to modify the derivatives. max_factor: the maximum factor by which we modify the derivatives for either the sign constraint or the magnitude constraint; e.g. with max_factor=0.02, the the derivatives would be multiplied by @@ -413,7 +413,7 @@ class DoubleSwishFunction(torch.autograd.Function): """ double_swish(x) = x * torch.sigmoid(x-1) This is a definition, originally motivated by its close numerical - similarity to swish(swish(x), where swish(x) = x * sigmoid(x). + similarity to swish(swish(x)), where swish(x) = x * sigmoid(x). Memory-efficient derivative computation: double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1) diff --git a/icefall/diagnostics.py b/icefall/diagnostics.py index ce4ac1464..bc8fe3069 100644 --- a/icefall/diagnostics.py +++ b/icefall/diagnostics.py @@ -111,7 +111,7 @@ def get_diagnostics_for_dim( options object sizes_same: True if all the tensor sizes are the same on this dimension - stats_type: either "abs" or "positive" or "eigs" or "value", + stats_type: either "abs" or "positive" or "eigs" or "value", imdictates the type of stats we accumulate, abs is mean absolute value, "positive" is proportion of positive to nonnegative values, "eigs" is eigenvalues after doing outer product on this dim, sum From af6ae840ee4a30f67eb3dafb30fd2aeb51e41768 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 13 Apr 2022 20:22:11 +0800 Subject: [PATCH 11/11] Add results for mixed precision with max-duration 300 --- egs/librispeech/ASR/RESULTS.md | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md index 01637beb1..3488535a6 100644 --- a/egs/librispeech/ASR/RESULTS.md +++ b/egs/librispeech/ASR/RESULTS.md @@ -2,9 +2,10 @@ ### LibriSpeech BPE training results (Pruned Transducer 2) +[pruned_transducer_stateless2](./pruned_transducer_stateless2) This is with a reworked version of the conformer encoder, with many changes. -[pruned_transducer_stateless2](./pruned_transducer_stateless2) +#### Training on fulll librispeech using commit `34aad74a2c849542dd5f6359c9e6b527e8782fd6`. See @@ -33,9 +34,9 @@ The Tensorboard log is at . +Train command was +`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_fp16 --world-size 1 --num-epochs 40 --full-libri 0 --max-duration 300 --use-fp16 True` + +The Tensorboard log is at + +| | test-clean | test-other | comment | +|-------------------------------------|------------|------------|-----------------------| +| greedy search (max sym per frame 1) | 7.10 | 18.57 | --epoch 19 --avg 8 | +| greedy search (max sym per frame 1) | 6.81 | 17.84 | --epoch 29 --avg 8 | +| greedy search (max sym per frame 1) | 6.63 | 17.39 | --epoch 30 --avg 10 | + + Trained with 1 job, with --use-fp16=True --max-duration=500, i.e. with half-precision floats and max-duration increased from 300 to 500, after merging . Train command was