diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index ee5422aba..6f00bc14d 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -1,5 +1,75 @@
 ## Results
 
+### zipformer (zipformer + CTC/AED)
+
+See for more details.
+
+[zipformer](./zipformer)
+
+#### Non-streaming
+
+##### large-scale model, number of model parameters: 174319650, i.e., 174.3 M
+
+You can find a pretrained model, training logs, decoding logs, and decoding results at:
+
+
+You can use <https://github.com/k2-fsa/sherpa> to deploy it.
+
+Results of the CTC head:
+
+| decoding method                      | test-clean | test-other | comment             |
+|--------------------------------------|------------|------------|---------------------|
+| ctc-decoding                         | 2.29       | 5.14       | --epoch 50 --avg 29 |
+| attention-decoder-rescoring-no-ngram | 2.1        | 4.57       | --epoch 50 --avg 29 |
+
+The training command is:
+```bash
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+# For non-streaming model training:
+./zipformer/train.py \
+  --world-size 4 \
+  --num-epochs 50 \
+  --start-epoch 1 \
+  --use-fp16 1 \
+  --exp-dir zipformer/exp-large \
+  --full-libri 1 \
+  --use-ctc 1 \
+  --use-transducer 0 \
+  --use-attention-decoder 1 \
+  --ctc-loss-scale 0.1 \
+  --attention-decoder-loss-scale 0.9 \
+  --num-encoder-layers 2,2,4,5,4,2 \
+  --feedforward-dim 512,768,1536,2048,1536,768 \
+  --encoder-dim 192,256,512,768,512,256 \
+  --encoder-unmasked-dim 192,192,256,320,256,192 \
+  --max-duration 1200 \
+  --master-port 12345
+```
+
+The decoding command is:
+```bash
+export CUDA_VISIBLE_DEVICES="0"
+for m in ctc-decoding attention-decoder-rescoring-no-ngram; do
+  ./zipformer/ctc_decode.py \
+    --epoch 50 \
+    --avg 29 \
+    --exp-dir zipformer/exp-large \
+    --use-ctc 1 \
+    --use-transducer 0 \
+    --use-attention-decoder 1 \
+    --attention-decoder-loss-scale 0.9 \
+    --num-encoder-layers 2,2,4,5,4,2 \
+    --feedforward-dim 512,768,1536,2048,1536,768 \
+    --encoder-dim 192,256,512,768,512,256 \
+    --encoder-unmasked-dim 192,192,256,320,256,192 \
+    --max-duration 100 \
+    --causal 0 \
+    --num-paths 100 \
+    --decoding-method $m
+done
+```
+
+
 ### zipformer (zipformer + pruned stateless transducer + CTC)
 
 See for more details.
diff --git a/egs/librispeech/ASR/zipformer/model.py b/egs/librispeech/ASR/zipformer/model.py
index a5d46dab9..bd1ed26d8 100644
--- a/egs/librispeech/ASR/zipformer/model.py
+++ b/egs/librispeech/ASR/zipformer/model.py
@@ -72,6 +72,8 @@ class AsrModel(nn.Module):
             Whether use transducer head. Default: True.
           use_ctc:
             Whether use CTC head. Default: False.
+          use_attention_decoder:
+            Whether use attention-decoder head. Default: False.
         """
         super().__init__()
 
diff --git a/egs/librispeech/ASR/zipformer/train.py b/egs/librispeech/ASR/zipformer/train.py
index cce058d6c..d87041a52 100755
--- a/egs/librispeech/ASR/zipformer/train.py
+++ b/egs/librispeech/ASR/zipformer/train.py
@@ -48,7 +48,8 @@ It supports training with:
   - transducer loss (default), with `--use-transducer True --use-ctc False`
   - ctc loss (not recommended), with `--use-transducer False --use-ctc True`
   - transducer loss & ctc loss, with `--use-transducer True --use-ctc True`
-  - ctc loss & attention decoder loss, with `--use-ctc True --use-attention-decoder True `
+  - ctc loss & attention decoder loss, no transducer loss,
+    with `--use-transducer False --use-ctc True --use-attention-decoder True`
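
A note for readers of the RESULTS.md hunk: the training command enables both the CTC head and the attention-decoder (AED) head, weighting their losses via `--ctc-loss-scale 0.1` and `--attention-decoder-loss-scale 0.9`. Below is a minimal PyTorch sketch of such a weighted objective; the function and argument names are hypothetical, not icefall's `AsrModel` API, and it assumes a teacher-forced decoder and `pad_id`-padded targets.

```python
import torch
import torch.nn.functional as F


def ctc_aed_loss(
    ctc_log_probs: torch.Tensor,   # (T, N, V): log-softmax output of the CTC head
    decoder_logits: torch.Tensor,  # (N, S, V): teacher-forced attention-decoder logits
    targets: torch.Tensor,         # (N, S): token ids, padded with pad_id
    input_lengths: torch.Tensor,   # (N,): valid encoder frames per utterance
    target_lengths: torch.Tensor,  # (N,): valid tokens per utterance
    ctc_scale: float = 0.1,
    aed_scale: float = 0.9,
    blank_id: int = 0,
    pad_id: int = -1,
) -> torch.Tensor:
    # CTC never reads target positions past target_lengths, so clamping the
    # pad value to a valid token id is safe.
    ctc = F.ctc_loss(
        ctc_log_probs,
        targets.clamp(min=0),
        input_lengths,
        target_lengths,
        blank=blank_id,
        reduction="sum",
        zero_infinity=True,
    )
    # Cross-entropy over the decoder's next-token predictions; padding ignored.
    aed = F.cross_entropy(
        decoder_logits.reshape(-1, decoder_logits.size(-1)),
        targets.reshape(-1),
        ignore_index=pad_id,
        reduction="sum",
    )
    return ctc_scale * ctc + aed_scale * aed
```

In the real recipe the CTC targets and decoder targets are not identical (the decoder sequence carries start/end-of-sequence markers); the point of the sketch is only how the two scales combine the losses.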
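
Likewise, the `attention-decoder-rescoring-no-ngram` method in the decoding command pairs CTC n-best generation (`--num-paths 100`) with AED rescoring. An illustrative sketch of the selection step follows; `score_hypothesis` is a hypothetical stand-in for running the decoder teacher-forced over a candidate and summing its token log-probabilities, and no n-gram LM score enters the total (hence "no-ngram").

```python
from typing import Callable, List, Sequence, Tuple

import torch


def rescore_nbest(
    nbest: List[Tuple[List[int], float]],  # (token_ids, ctc_score) per hypothesis
    encoder_out: torch.Tensor,             # (T, C): encoder frames for one utterance
    score_hypothesis: Callable[[torch.Tensor, Sequence[int]], float],
) -> List[int]:
    """Return the hypothesis maximizing ctc_score + aed_score."""
    best_tokens: List[int] = []
    best_score = float("-inf")
    for tokens, ctc_score in nbest:
        # Attention-decoder score of this candidate given the encoder output.
        aed_score = score_hypothesis(encoder_out, tokens)
        total = ctc_score + aed_score  # no n-gram LM term in this variant
        if total > best_score:
            best_tokens, best_score = tokens, total
    return best_tokens
```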
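
Finally, the model.py hunk documents the new `use_attention_decoder` flag next to `use_transducer` and `use_ctc`. Here is a minimal sketch, assuming the common pattern where each flag gates head construction so that disabled heads contribute no parameters or loss terms; the class name and module choices are illustrative, not `AsrModel`'s real fields.

```python
import torch.nn as nn


class HeadGatedModel(nn.Module):
    def __init__(
        self,
        encoder: nn.Module,
        encoder_dim: int,   # must be divisible by nhead for the decoder layer below
        vocab_size: int,
        use_transducer: bool = True,
        use_ctc: bool = False,
        use_attention_decoder: bool = False,
    ):
        super().__init__()
        assert (
            use_transducer or use_ctc or use_attention_decoder
        ), "at least one head must be enabled"
        self.encoder = encoder
        # Each head exists only when its flag is set.
        self.ctc_output = (
            nn.Sequential(nn.Linear(encoder_dim, vocab_size), nn.LogSoftmax(dim=-1))
            if use_ctc
            else None
        )
        # Stand-in for a transducer joiner; the real one also takes a predictor.
        self.joiner = nn.Linear(encoder_dim, vocab_size) if use_transducer else None
        # Stand-in for a full attention decoder stack.
        self.attention_decoder = (
            nn.TransformerDecoderLayer(d_model=encoder_dim, nhead=4, batch_first=True)
            if use_attention_decoder
            else None
        )
```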