From b65873fb4cc5658413d75b449054d4c0cc83e8ee Mon Sep 17 00:00:00 2001
From: yaozengwei
Date: Sun, 20 Oct 2024 17:48:57 +0800
Subject: [PATCH] update RESULTS.md

---
 egs/librispeech/ASR/README.md          |   8 +-
 egs/librispeech/ASR/RESULTS.md         | 309 +++++++++++++++++++++++++
 egs/librispeech/ASR/zipformer/train.py |   9 +-
 3 files changed, 320 insertions(+), 6 deletions(-)

diff --git a/egs/librispeech/ASR/README.md b/egs/librispeech/ASR/README.md
index 8b87ee19b..0dbfdc931 100644
--- a/egs/librispeech/ASR/README.md
+++ b/egs/librispeech/ASR/README.md
@@ -50,7 +50,7 @@ We place an additional Conv1d layer right after the input embedding layer.
 | `conformer-ctc2`             | Reworked Conformer | Use auxiliary attention head      |
 | `conformer-ctc3`             | Reworked Conformer | Streaming version + delay penalty |
 | `zipformer-ctc`              | Zipformer          | Use auxiliary attention head      |
-| `zipformer`                  | Upgraded Zipformer | Use auxiliary transducer head / attention-decoder head | The latest recipe |
+| `zipformer`                  | Upgraded Zipformer | Use auxiliary transducer head / attention-decoder head (the latest recipe) |
 
 # MMI
 
@@ -58,3 +58,9 @@ | | Encoder | Comment |
 |------------------------------|-----------|---------------------------------------------------|
 | `conformer-mmi`              | Conformer |                                                    |
 | `zipformer-mmi`              | Zipformer | CTC warmup + use HP as decoding graph for decoding |
+
+# CR-CTC
+
+|                              | Encoder            | Comment                      |
+|------------------------------|--------------------|------------------------------|
+| `zipformer`                  | Upgraded Zipformer | CR-CTC can also serve as an auxiliary loss to improve transducer or CTC/AED training (the latest recipe) |

diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index bc7d8a5ef..6a669f072 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -1,5 +1,314 @@
 ## Results
 
+### zipformer (zipformer + pruned-transducer w/ CR-CTC)
+
+See for more details.
+
+[zipformer](./zipformer)
+
+#### Non-streaming
+
+##### large-scale model, number of model parameters: 148824074, i.e., 148.8 M
+
+You can find a pretrained model, training logs, decoding logs, and decoding results at:
+
+
+You can use to deploy it.
+
+| decoding method                      | test-clean | test-other | comment             |
+|--------------------------------------|------------|------------|---------------------|
+| greedy_search                        | 1.90       | 3.96       | --epoch 50 --avg 26 |
+| modified_beam_search                 | 1.88       | 3.95       | --epoch 50 --avg 26 |
+
+The training command using 2 80G-A100 GPUs is:
+```bash
+export CUDA_VISIBLE_DEVICES="0,1"
+# For non-streaming model training:
+./zipformer/train.py \
+  --world-size 2 \
+  --num-epochs 50 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir zipformer/exp-large-cr-ctc-rnnt \
+  --use-cr-ctc 1 \
+  --use-ctc 1 \
+  --use-transducer 1 \
+  --use-attention-decoder 0 \
+  --num-encoder-layers 2,2,4,5,4,2 \
+  --feedforward-dim 512,768,1536,2048,1536,768 \
+  --encoder-dim 192,256,512,768,512,256 \
+  --encoder-unmasked-dim 192,192,256,320,256,192 \
+  --ctc-loss-scale 0.1 \
+  --enable-spec-aug 0 \
+  --cr-loss-scale 0.02 \
+  --time-mask-ratio 2.5 \
+  --full-libri 1 \
+  --max-duration 1400 \
+  --master-port 12345
+```
+
+The decoding command is:
+```bash
+export CUDA_VISIBLE_DEVICES="0"
+for m in greedy_search modified_beam_search; do
+  ./zipformer/decode.py \
+    --epoch 50 \
+    --avg 26 \
+    --exp-dir zipformer/exp-large-cr-ctc-rnnt \
+    --use-cr-ctc 1 \
+    --use-ctc 1 \
+    --use-transducer 1 \
+    --use-attention-decoder 0 \
+    --num-encoder-layers 2,2,4,5,4,2 \
+    --feedforward-dim 512,768,1536,2048,1536,768 \
+    --encoder-dim 192,256,512,768,512,256 \
+    --encoder-unmasked-dim 192,192,256,320,256,192 \
+    --max-duration 300 \
+    --decoding-method $m
+done
+```
+
+### zipformer (zipformer + CR-CTC-AED)
+
+See for more details.
+
+[zipformer](./zipformer)
+
+#### Non-streaming
+
+##### large-scale model, number of model parameters: 174319650, i.e., 174.3 M
+
+You can find a pretrained model, training logs, decoding logs, and decoding results at:
+
+
+You can use to deploy it.
+
+| decoding method                      | test-clean | test-other | comment             |
+|--------------------------------------|------------|------------|---------------------|
+| attention-decoder-rescoring-no-ngram | 1.96       | 4.08       | --epoch 50 --avg 20 |
+
+The training command using 2 80G-A100 GPUs is:
+```bash
+export CUDA_VISIBLE_DEVICES="0,1"
+# For non-streaming model training:
+./zipformer/train.py \
+  --world-size 2 \
+  --num-epochs 50 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir zipformer/exp-large-cr-ctc-aed \
+  --use-cr-ctc 1 \
+  --use-ctc 1 \
+  --use-transducer 0 \
+  --use-attention-decoder 1 \
+  --num-encoder-layers 2,2,4,5,4,2 \
+  --feedforward-dim 512,768,1536,2048,1536,768 \
+  --encoder-dim 192,256,512,768,512,256 \
+  --encoder-unmasked-dim 192,192,256,320,256,192 \
+  --ctc-loss-scale 0.1 \
+  --attention-decoder-loss-scale 0.9 \
+  --enable-spec-aug 0 \
+  --cr-loss-scale 0.02 \
+  --time-mask-ratio 2.5 \
+  --full-libri 1 \
+  --max-duration 1200 \
+  --master-port 12345
+```
+
+The decoding command is:
+```bash
+export CUDA_VISIBLE_DEVICES="0"
+./zipformer/ctc_decode.py \
+  --epoch 50 \
+  --avg 20 \
+  --exp-dir zipformer/exp-large-cr-ctc-aed/ \
+  --use-cr-ctc 1 \
+  --use-ctc 1 \
+  --use-transducer 0 \
+  --use-attention-decoder 1 \
+  --num-encoder-layers 2,2,4,5,4,2 \
+  --feedforward-dim 512,768,1536,2048,1536,768 \
+  --encoder-dim 192,256,512,768,512,256 \
+  --encoder-unmasked-dim 192,192,256,320,256,192 \
+  --max-duration 200 \
+  --decoding-method attention-decoder-rescoring-no-ngram
+```
+
+### zipformer (zipformer + CR-CTC)
+
+See for more details.
+
+[zipformer](./zipformer)
+
+#### Non-streaming
+
+##### small-scale model, number of model parameters: 22118279, i.e., 22.1 M
+
+You can find a pretrained model, training logs, decoding logs, and decoding results at:
+
+
+You can use to deploy it.
+
+| decoding method                      | test-clean | test-other | comment             |
+|--------------------------------------|------------|------------|---------------------|
+| ctc-greedy-search                    | 2.57       | 5.95       | --epoch 50 --avg 25 |
+
+The training command using 2 32G-V100 GPUs is:
+```bash
+export CUDA_VISIBLE_DEVICES="0,1"
+# For non-streaming model training:
+./zipformer/train.py \
+  --world-size 2 \
+  --num-epochs 50 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir zipformer/exp-small/ \
+  --use-cr-ctc 1 \
+  --use-ctc 1 \
+  --use-transducer 0 \
+  --use-attention-decoder 0 \
+  --num-encoder-layers 2,2,2,2,2,2 \
+  --feedforward-dim 512,768,768,768,768,768 \
+  --encoder-dim 192,256,256,256,256,256 \
+  --encoder-unmasked-dim 192,192,192,192,192,192 \
+  --base-lr 0.04 \
+  --enable-spec-aug 0 \
+  --cr-loss-scale 0.2 \
+  --time-mask-ratio 2.5 \
+  --full-libri 1 \
+  --max-duration 850 \
+  --master-port 12345
+```
+
+The decoding command is:
+```bash
+export CUDA_VISIBLE_DEVICES="0"
+for m in ctc-greedy-search; do
+  ./zipformer/ctc_decode.py \
+    --epoch 50 \
+    --avg 25 \
+    --exp-dir zipformer/exp-small \
+    --use-cr-ctc 1 \
+    --use-ctc 1 \
+    --use-transducer 0 \
+    --use-attention-decoder 0 \
+    --num-encoder-layers 2,2,2,2,2,2 \
+    --feedforward-dim 512,768,768,768,768,768 \
+    --encoder-dim 192,256,256,256,256,256 \
+    --encoder-unmasked-dim 192,192,192,192,192,192 \
+    --max-duration 600 \
+    --decoding-method $m
+done
+```
+
+##### medium-scale model, number of model parameters: 64250603, i.e., 64.3 M
+
+You can find a pretrained model, training logs, decoding logs, and decoding results at:
+
+
+You can use to deploy it.
+
+| decoding method                      | test-clean | test-other | comment             |
+|--------------------------------------|------------|------------|---------------------|
+| ctc-greedy-search                    | 2.12       | 4.62       | --epoch 50 --avg 24 |
+
+The training command using 4 32G-V100 GPUs is:
+```bash
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+# For non-streaming model training:
+./zipformer/train.py \
+  --world-size 4 \
+  --num-epochs 50 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir zipformer/exp \
+  --use-cr-ctc 1 \
+  --use-ctc 1 \
+  --use-transducer 0 \
+  --use-attention-decoder 0 \
+  --enable-spec-aug 0 \
+  --cr-loss-scale 0.2 \
+  --time-mask-ratio 2.5 \
+  --full-libri 1 \
+  --max-duration 700 \
+  --master-port 12345
+```
+
+The decoding command is:
+```bash
+export CUDA_VISIBLE_DEVICES="0"
+for m in ctc-greedy-search; do
+  ./zipformer/ctc_decode.py \
+    --epoch 50 \
+    --avg 24 \
+    --exp-dir zipformer/exp \
+    --use-cr-ctc 1 \
+    --use-ctc 1 \
+    --use-transducer 0 \
+    --use-attention-decoder 0 \
+    --max-duration 600 \
+    --decoding-method $m
+done
+```
+
+##### large-scale model, number of model parameters: 147010094, i.e., 147.0 M
+
+You can find a pretrained model, training logs, decoding logs, and decoding results at:
+
+
+You can use to deploy it.
+
+| decoding method                      | test-clean | test-other | comment             |
+|--------------------------------------|------------|------------|---------------------|
+| ctc-greedy-search                    | 2.03       | 4.37       | --epoch 50 --avg 26 |
+
+The training command using 2 80G-A100 GPUs is:
+```bash
+export CUDA_VISIBLE_DEVICES="0,1"
+# For non-streaming model training:
+./zipformer/train.py \
+  --world-size 2 \
+  --num-epochs 50 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir zipformer/exp-large \
+  --use-cr-ctc 1 \
+  --use-ctc 1 \
+  --use-transducer 0 \
+  --use-attention-decoder 0 \
+  --num-encoder-layers 2,2,4,5,4,2 \
+  --feedforward-dim 512,768,1536,2048,1536,768 \
+  --encoder-dim 192,256,512,768,512,256 \
+  --encoder-unmasked-dim 192,192,256,320,256,192 \
+  --enable-spec-aug 0 \
+  --cr-loss-scale 0.2 \
+  --time-mask-ratio 2.5 \
+  --full-libri 1 \
+  --max-duration 1400 \
+  --master-port 12345
+```
+
+The decoding command is:
+```bash
+export CUDA_VISIBLE_DEVICES="0"
+for m in ctc-greedy-search; do
+  ./zipformer/ctc_decode.py \
+    --epoch 50 \
+    --avg 26 \
+    --exp-dir zipformer/exp-large \
+    --use-cr-ctc 1 \
+    --use-ctc 1 \
+    --use-transducer 0 \
+    --use-attention-decoder 0 \
+    --num-encoder-layers 2,2,4,5,4,2 \
+    --feedforward-dim 512,768,1536,2048,1536,768 \
+    --encoder-dim 192,256,512,768,512,256 \
+    --encoder-unmasked-dim 192,192,256,320,256,192 \
+    --max-duration 600 \
+    --decoding-method $m
+done
+```
+
 ### zipformer (zipformer + CTC/AED)
 
 See for more details.

diff --git a/egs/librispeech/ASR/zipformer/train.py b/egs/librispeech/ASR/zipformer/train.py
index 3a8995c81..c074c32ec 100755
--- a/egs/librispeech/ASR/zipformer/train.py
+++ b/egs/librispeech/ASR/zipformer/train.py
@@ -45,11 +45,10 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3"
   --max-duration 1000
 
 It supports training with:
-  - transducer loss (default), with `--use-transducer True --use-ctc False`
-  - ctc loss (not recommended), with `--use-transducer False --use-ctc True`
-  - transducer loss & ctc loss, with `--use-transducer True --use-ctc True`
-  - ctc loss & attention decoder loss, no transducer loss,
-    with `--use-transducer False --use-ctc True --use-attention-decoder True`
+  - transducer loss (default), with `--use-transducer True`
+  - ctc loss, with `--use-ctc True`
+  - attention decoder loss, with `--use-attention-decoder True`
+  - cr-ctc loss, with `--use-cr-ctc True` (use about half the max-duration of regular CTC training, since each batch is processed twice)
 """
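A note on the cr-ctc docstring bullet above: the reason for halving `--max-duration` is that CR-CTC runs each batch through the model twice, as two differently augmented views, which roughly doubles the memory cost per batch. Below is a minimal PyTorch sketch of the idea, not the icefall implementation: the function name `cr_ctc_loss`, the symmetric detached-KL form of the consistency term, and the reduction choices are illustrative assumptions (the `cr_loss_scale` default mirrors the `--cr-loss-scale 0.2` used in the pure CR-CTC commands above).

```python
import torch
import torch.nn.functional as F


def cr_ctc_loss(
    log_probs_a: torch.Tensor,    # (T, N, C) log-posteriors from augmented view A
    log_probs_b: torch.Tensor,    # (T, N, C) log-posteriors from augmented view B
    targets: torch.Tensor,        # (N, S) padded label sequences
    input_lengths: torch.Tensor,  # (N,) valid frames per utterance
    target_lengths: torch.Tensor, # (N,) valid labels per utterance
    cr_loss_scale: float = 0.2,
) -> torch.Tensor:
    """CTC on two augmented views of the same batch, plus a consistency term.

    Because every utterance is processed twice (views A and B), training
    needs roughly half the usual --max-duration to fit in memory.
    """
    # Regular CTC loss on each augmented view.
    ctc_a = F.ctc_loss(log_probs_a, targets, input_lengths, target_lengths,
                       reduction="sum", zero_infinity=True)
    ctc_b = F.ctc_loss(log_probs_b, targets, input_lengths, target_lengths,
                       reduction="sum", zero_infinity=True)

    # Consistency regularization: each view is pulled toward the other's
    # frame-level posterior, with the "teacher" branch detached so the two
    # branches distill each other rather than collapsing jointly.
    kl_ab = F.kl_div(log_probs_a, log_probs_b.detach(),
                     reduction="sum", log_target=True)
    kl_ba = F.kl_div(log_probs_b, log_probs_a.detach(),
                     reduction="sum", log_target=True)
    cr_loss = 0.5 * (kl_ab + kl_ba)

    return ctc_a + ctc_b + cr_loss_scale * cr_loss
```

Note that the commands above disable regular SpecAugment (`--enable-spec-aug 0`) and set `--time-mask-ratio 2.5`, which suggests the recipe applies its own augmentation to the two branches; consult the recipe's train.py for the exact scheme.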