diff --git a/egs/librispeech/ASR/README.md b/egs/librispeech/ASR/README.md
index a7b2e2c3b..b3e90a052 100644
--- a/egs/librispeech/ASR/README.md
+++ b/egs/librispeech/ASR/README.md
@@ -9,13 +9,15 @@ for how to run models in this recipe.
 There are various folders containing the name `transducer` in this folder.
 The following table lists the differences among them.
 
-| | Encoder | Decoder | Comment |
-|---------------------------------------|-----------|--------------------|---------------------------------------------------|
-| `transducer` | Conformer | LSTM | |
-| `transducer_stateless` | Conformer | Embedding + Conv1d | |
-| `transducer_lstm` | LSTM | LSTM | |
-| `transducer_stateless_multi_datasets` | Conformer | Embedding + Conv1d | Using data from GigaSpeech as extra training data |
-| `pruned_transducer_stateless` | Conformer | Embedding + Conv1d | Using k2 pruned RNN-T loss |
+| | Encoder | Decoder | Comment |
+|---------------------------------------|----------------------|--------------------|---------------------------------------------------|
+| `transducer` | Conformer | LSTM | |
+| `transducer_stateless` | Conformer | Embedding + Conv1d | |
+| `transducer_lstm` | LSTM | LSTM | |
+| `transducer_stateless_multi_datasets` | Conformer | Embedding + Conv1d | Using data from GigaSpeech as extra training data |
+| `pruned_transducer_stateless` | Conformer | Embedding + Conv1d | Using k2 pruned RNN-T loss |
+| `pruned_transducer_stateless2` | Conformer (modified) | Embedding + Conv1d | Using k2 pruned RNN-T loss |
+
 The decoder in `transducer_stateless` is modified from the paper
 [Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
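As background for the `Embedding + Conv1d` decoder entries in the table above, below is a minimal sketch of a stateless prediction network in the spirit of the cited paper. It is illustrative only, not the recipe's actual `decoder.py`; the class name, dimensions, and context size are assumptions.

```python
import torch
import torch.nn as nn


class StatelessDecoder(nn.Module):
    """A "stateless" RNN-T prediction network: instead of an LSTM, it embeds
    only the last few predicted tokens and mixes them with a causal Conv1d,
    so there is no recurrent state to carry between steps."""

    def __init__(self, vocab_size: int, embed_dim: int = 512, context_size: int = 2):
        super().__init__()
        self.context_size = context_size
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # Depthwise convolution over the last `context_size` tokens.
        self.conv = nn.Conv1d(
            embed_dim, embed_dim, kernel_size=context_size, groups=embed_dim
        )

    def forward(self, y: torch.Tensor) -> torch.Tensor:
        # y: (batch, num_tokens) token IDs
        emb = self.embedding(y).permute(0, 2, 1)  # (batch, embed_dim, num_tokens)
        # Left-pad so the output at position t only sees tokens <= t.
        emb = nn.functional.pad(emb, (self.context_size - 1, 0))
        out = self.conv(emb).permute(0, 2, 1)     # (batch, num_tokens, embed_dim)
        return torch.relu(out)


# Quick shape check with a hypothetical 500-piece BPE vocabulary:
decoder = StatelessDecoder(vocab_size=500)
print(decoder(torch.randint(0, 500, (4, 10))).shape)  # torch.Size([4, 10, 512])
```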
diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index 6dbc659f7..3488535a6 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -1,5 +1,119 @@
 ## Results
 
+### LibriSpeech BPE training results (Pruned Transducer 2)
+
+[pruned_transducer_stateless2](./pruned_transducer_stateless2)
+This uses a reworked version of the conformer encoder, with many changes.
+
+#### Training on full librispeech
+
+Using commit `34aad74a2c849542dd5f6359c9e6b527e8782fd6`.
+See
+
+The WERs are:
+
+| | test-clean | test-other | comment |
+|-------------------------------------|------------|------------|-------------------------------------------------------------------------------|
+| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 25 --avg 8 --max-duration 600 |
+| fast beam search | 2.61 | 6.17 | --epoch 25 --avg 8 --max-duration 600 --decoding-method fast_beam_search |
+| modified beam search | 2.59 | 6.19 | --epoch 25 --avg 8 --max-duration 600 --decoding-method modified_beam_search |
+| greedy search (max sym per frame 1) | 2.70 | 6.04 | --epoch 34 --avg 10 --max-duration 600 |
+| fast beam search | 2.66 | 6.00 | --epoch 34 --avg 10 --max-duration 600 --decoding-method fast_beam_search |
+| greedy search (max sym per frame 1) | 2.62 | 6.03 | --epoch 38 --avg 10 --max-duration 600 |
+| fast beam search | 2.57 | 5.95 | --epoch 38 --avg 10 --max-duration 600 --decoding-method fast_beam_search |
+
+The train and decode commands are:
+`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp --world-size 8 --num-epochs 26 --full-libri 1 --max-duration 300`
+and:
+`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp --epoch 25 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600`
+
+The Tensorboard log is at (apologies, log starts
+only from epoch 3).
+
+#### Training on train-clean-100:
+
+Trained with 1 job:
+`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_ws1 --world-size 1 --num-epochs 40 --full-libri 0 --max-duration 300`
+and decoded with:
+`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp_100h_ws1 --epoch 19 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600`.
+
+The Tensorboard log is at (learning rate
+schedule is not visible due to a since-fixed bug).
+
+| | test-clean | test-other | comment |
+|-------------------------------------|------------|------------|---------------------------------------------------------|
+| greedy search (max sym per frame 1) | 7.12 | 18.42 | --epoch 19 --avg 8 |
+| greedy search (max sym per frame 1) | 6.71 | 17.77 | --epoch 29 --avg 8 |
+| greedy search (max sym per frame 1) | 6.64 | 17.19 | --epoch 39 --avg 10 |
+| fast beam search | 6.58 | 17.27 | --epoch 29 --avg 8 --decoding-method fast_beam_search |
+| fast beam search | 6.53 | 16.82 | --epoch 39 --avg 10 --decoding-method fast_beam_search |
+
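For context on the decode options above: `--epoch 19 --avg 8` loads the checkpoint written at epoch 19 and averages model weights over the last 8 epochs. A simplified sketch of that averaging is shown below; it is not icefall's exact helper, and the checkpoint paths and the `"model"` state-dict key are assumptions for illustration.

```python
import torch


def average_checkpoints(filenames):
    """Average model parameters element-wise across several checkpoints."""
    avg = torch.load(filenames[0], map_location="cpu")["model"]
    for f in filenames[1:]:
        state = torch.load(f, map_location="cpu")["model"]
        for k in avg:
            avg[k] += state[k]
    for k in avg:
        if avg[k].is_floating_point():
            avg[k] /= len(filenames)
        else:
            # Integer buffers (e.g. step counters) use integer division.
            avg[k] //= len(filenames)
    return avg


# e.g. --epoch 19 --avg 8 would average epoch-12.pt through epoch-19.pt:
ckpts = [
    f"pruned_transducer_stateless2/exp_100h_ws1/epoch-{i}.pt" for i in range(12, 20)
]
```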
+Trained with 2 jobs:
+`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_ws2 --world-size 2 --num-epochs 40 --full-libri 0 --max-duration 300`
+and decoded with:
+`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp_100h_ws2 --epoch 19 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600`.
+
+The Tensorboard log is at
+(learning rate schedule is not visible due to a since-fixed bug).
+
+| | test-clean | test-other | comment |
+|-------------------------------------|------------|------------|----------------------|
+| greedy search (max sym per frame 1) | 7.05 | 18.77 | --epoch 19 --avg 8 |
+| greedy search (max sym per frame 1) | 6.82 | 18.14 | --epoch 29 --avg 8 |
+| greedy search (max sym per frame 1) | 6.81 | 17.66 | --epoch 30 --avg 10 |
+
+Trained with 4 jobs:
+`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_ws4 --world-size 4 --num-epochs 40 --full-libri 0 --max-duration 300`
+and decoded with:
+`python3 ./pruned_transducer_stateless2/decode.py --exp-dir pruned_transducer_stateless2/exp_100h_ws4 --epoch 19 --avg 8 --bpe-model ./data/lang_bpe_500/bpe.model --max-duration 600`.
+
+The Tensorboard log is at
+(learning rate schedule is not visible due to a since-fixed bug).
+
+| | test-clean | test-other | comment |
+|-------------------------------------|------------|------------|----------------------|
+| greedy search (max sym per frame 1) | 7.31 | 19.55 | --epoch 19 --avg 8 |
+| greedy search (max sym per frame 1) | 7.08 | 18.59 | --epoch 29 --avg 8 |
+| greedy search (max sym per frame 1) | 6.86 | 18.29 | --epoch 30 --avg 10 |
+
+Trained with 1 job, with --use-fp16=True --max-duration=300, i.e. with half-precision
+floats (but without increasing max-duration), after merging .
+Train command was
+`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_fp16 --world-size 1 --num-epochs 40 --full-libri 0 --max-duration 300 --use-fp16 True`
+
+The Tensorboard log is at
+
+| | test-clean | test-other | comment |
+|-------------------------------------|------------|------------|----------------------|
+| greedy search (max sym per frame 1) | 7.10 | 18.57 | --epoch 19 --avg 8 |
+| greedy search (max sym per frame 1) | 6.81 | 17.84 | --epoch 29 --avg 8 |
+| greedy search (max sym per frame 1) | 6.63 | 17.39 | --epoch 30 --avg 10 |
+
+Trained with 1 job, with --use-fp16=True --max-duration=500, i.e. with half-precision
+floats and max-duration increased from 300 to 500, after merging .
+Train command was
+`python3 ./pruned_transducer_stateless2/train.py --exp-dir=pruned_transducer_stateless2/exp_100h_fp16 --world-size 1 --num-epochs 40 --full-libri 0 --max-duration 500 --use-fp16 True`
+
+The Tensorboard log is at
+
+| | test-clean | test-other | comment |
+|-------------------------------------|------------|------------|----------------------|
+| greedy search (max sym per frame 1) | 7.10 | 18.79 | --epoch 19 --avg 8 |
+| greedy search (max sym per frame 1) | 6.92 | 18.16 | --epoch 29 --avg 8 |
+| greedy search (max sym per frame 1) | 6.89 | 17.75 | --epoch 30 --avg 10 |
+
+
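Regarding `--use-fp16 True`: half precision reduces activation memory, which is what allows raising `--max-duration` (and hence the effective batch size) from 300 to 500 in the second run above. The sketch below shows the standard PyTorch mixed-precision pattern; it is a generic illustration, not icefall's actual `train.py` loop, and the `criterion` and batch layout are placeholders.

```python
import torch
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()


def train_step(model, optimizer, batch, criterion):
    optimizer.zero_grad()
    with autocast():  # run the forward pass in float16 where it is safe
        loss = criterion(model(batch["inputs"]), batch["targets"])
    scaler.scale(loss).backward()  # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)         # unscale gradients, skip step if inf/nan
    scaler.update()                # adapt the loss scale for the next step
    return loss.detach()
```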
 ### LibriSpeech BPE training results (Pruned Transducer)
 
 Conformer encoder + non-recurrent decoder. The decoder
@@ -17,11 +131,15 @@ The WERs are:
 
 | | test-clean | test-other | comment |
 |-------------------------------------|------------|------------|------------------------------------------|
-| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 42, --avg 11, --max-duration 100 |
-| greedy search (max sym per frame 2) | 2.62 | 6.37 | --epoch 42, --avg 11, --max-duration 100 |
-| greedy search (max sym per frame 3) | 2.62 | 6.37 | --epoch 42, --avg 11, --max-duration 100 |
-| modified beam search (beam size 4) | 2.56 | 6.27 | --epoch 42, --avg 11, --max-duration 100 |
-| beam search (beam size 4) | 2.57 | 6.27 | --epoch 42, --avg 11, --max-duration 100 |
+| greedy search (max sym per frame 1) | 2.62 | 6.37 | --epoch 42 --avg 11 --max-duration 100 |
+| greedy search (max sym per frame 2) | 2.62 | 6.37 | --epoch 42 --avg 11 --max-duration 100 |
+| greedy search (max sym per frame 3) | 2.62 | 6.37 | --epoch 42 --avg 11 --max-duration 100 |
+| modified beam search (beam size 4) | 2.56 | 6.27 | --epoch 42 --avg 11 --max-duration 100 |
+| beam search (beam size 4) | 2.57 | 6.27 | --epoch 42 --avg 11 --max-duration 100 |
+
+
+
+
 
 The decoding time for `test-clean` and `test-other` is given below:
 (A V100 GPU with 32 GB RAM is used for decoding. Note: Not all GPU
 RAM is used during decoding.)
@@ -111,7 +229,7 @@ The WERs are
 
 | | test-clean | test-other | comment |
 |---------------------------|------------|------------|------------------------------------------|
-| greedy search | 2.85 | 6.98 | --epoch 28, --avg 15, --max-duration 100 |
+| greedy search | 2.85 | 6.98 | --epoch 28 --avg 15 --max-duration 100 |
 
 The training command for reproducing is given below:
@@ -171,8 +289,8 @@ The WERs are
 
 | | test-clean | test-other | comment |
 |-------------------------------------|------------|------------|------------------------------------------|
-| greedy search (max sym per frame 1) | 2.64 | 6.55 | --epoch 39, --avg 15, --max-duration 100 |
-| modified beam search (beam size 4) | 2.61 | 6.46 | --epoch 39, --avg 15, --max-duration 100 |
+| greedy search (max sym per frame 1) | 2.64 | 6.55 | --epoch 39 --avg 15 --max-duration 100 |
+| modified beam search (beam size 4) | 2.61 | 6.46 | --epoch 39 --avg 15 --max-duration 100 |
 
 The training command for reproducing is given below:
@@ -241,10 +359,10 @@ The WERs are
 
 | | test-clean | test-other | comment |
 |-------------------------------------|------------|------------|------------------------------------------|
-| greedy search (max sym per frame 1) | 2.67 | 6.67 | --epoch 63, --avg 19, --max-duration 100 |
-| greedy search (max sym per frame 2) | 2.67 | 6.67 | --epoch 63, --avg 19, --max-duration 100 |
-| greedy search (max sym per frame 3) | 2.67 | 6.67 | --epoch 63, --avg 19, --max-duration 100 |
-| modified beam search (beam size 4) | 2.67 | 6.57 | --epoch 63, --avg 19, --max-duration 100 |
+| greedy search (max sym per frame 1) | 2.67 | 6.67 | --epoch 63 --avg 19 --max-duration 100 |
+| greedy search (max sym per frame 2) | 2.67 | 6.67 | --epoch 63 --avg 19 --max-duration 100 |
+| greedy search (max sym per frame 3) | 2.67 | 6.67 | --epoch 63 --avg 19 --max-duration 100 |
+| modified beam search (beam size 4) | 2.67 | 6.57 | --epoch 63 --avg 19 --max-duration 100 |
 
 The training command for reproducing is given below:
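A note on the decoding methods named in these tables: "greedy search (max sym per frame 1)" bounds how many non-blank symbols the transducer may emit per encoder frame. The sketch below is schematic single-utterance decoding, not the recipe's batched `decode.py`; the `model.decoder` and `model.joiner` call signatures are simplified assumptions.

```python
import torch


def greedy_search(model, encoder_out, blank_id, max_sym_per_frame=1):
    """Decode one utterance, emitting at most `max_sym_per_frame`
    non-blank tokens per encoder output frame."""
    hyp = []
    for t in range(encoder_out.size(1)):          # loop over encoder frames
        for _ in range(max_sym_per_frame):
            decoder_out = model.decoder(hyp)      # stateless prediction network
            logits = model.joiner(encoder_out[:, t], decoder_out)
            y = int(logits.argmax(dim=-1))
            if y == blank_id:                     # blank: advance to next frame
                break
            hyp.append(y)                         # non-blank: emit, maybe repeat
    return hyp
```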
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py b/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py
index b0eb4d749..3cc472974 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/pretrained.py
@@ -174,6 +174,7 @@ def main():
 
     # <blk> is defined in local/train_bpe_model.py
     params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.piece_to_id("<unk>")
     params.vocab_size = sp.get_piece_size()
 
     logging.info(f"{params}")
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
index fae1d5a96..2e9bf3e0b 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
@@ -89,7 +89,7 @@ def fast_beam_search(
         # (shape.NumElements(), 1, joiner_dim)
         # fmt: off
         current_encoder_out = torch.index_select(
-            encoder_out[:, t:t + 1, :], 0, shape.row_ids(1)
+            encoder_out[:, t:t + 1, :], 0, shape.row_ids(1).to(torch.int64)
         )
         # fmt: on
         logits = model.joiner(
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
index d59aa2160..f89d2963e 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/scaling.py
@@ -367,7 +367,7 @@ class ActivationBalancer(torch.nn.Module):
        min_positive: the minimum, per channel, of the proportion of the time
           that (x > 0), below which we start to modify the derivatives.
        max_positive: the maximum, per channel, of the proportion of the time
-          that (x > 0), below which we start to modify the derivatives.
+          that (x > 0), above which we start to modify the derivatives.
        max_factor: the maximum factor by which we modify the derivatives for
           either the sign constraint or the magnitude constraint;
           e.g. with max_factor=0.02, the derivatives would be multiplied by
@@ -413,7 +413,7 @@ class DoubleSwishFunction(torch.autograd.Function):
     """
    double_swish(x) = x * torch.sigmoid(x-1)
    This is a definition, originally motivated by its close numerical
-   similarity to swish(swish(x), where swish(x) = x * sigmoid(x).
+   similarity to swish(swish(x)), where swish(x) = x * sigmoid(x).
 
    Memory-efficient derivative computation:
     double_swish(x) = x * s, where s(x) = torch.sigmoid(x-1)
diff --git a/icefall/diagnostics.py b/icefall/diagnostics.py
index ce4ac1464..bc8fe3069 100644
--- a/icefall/diagnostics.py
+++ b/icefall/diagnostics.py
@@ -111,7 +111,7 @@ def get_diagnostics_for_dim(
           options object
       sizes_same: True if all the tensor sizes are the same on this dimension
-      stats_type: either "abs" or "positive" or "eigs" or "value", 
+      stats_type: either "abs" or "positive" or "eigs" or "value",
          dictates the type of stats we accumulate, abs is mean absolute value,
          "positive" is proportion of positive to nonnegative values, "eigs"
          is eigenvalues after doing outer product on this dim, sum
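To make the `stats_type` values concrete, here is a hypothetical standalone helper illustrating what each statistic measures along one dimension of a tensor; it is an illustration of the descriptions above, not the accumulation logic actually used in `icefall/diagnostics.py`.

```python
import torch


def dim_stats(x: torch.Tensor, dim: int, stats_type: str) -> torch.Tensor:
    """Compute one of the diagnostics described above for dimension `dim`."""
    n = x.size(dim)
    x = x.transpose(dim, -1).reshape(-1, n)  # (num_samples, n)
    if stats_type == "abs":
        return x.abs().mean(dim=0)           # mean absolute value per index
    if stats_type == "positive":
        return (x > 0).float().mean(dim=0)   # proportion of positive values
    if stats_type == "eigs":
        cov = x.t() @ x                      # outer products summed over samples
        return torch.linalg.eigvalsh(cov)    # eigenvalues of that matrix
    raise ValueError(f"unknown stats_type: {stats_type}")


# Example: eigenvalue diagnostics over the channel dim of a (batch, T, C) tensor.
print(dim_stats(torch.randn(8, 100, 16), dim=2, stats_type="eigs"))
```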