diff --git a/README.md b/README.md
index 23389d483..d5e24bf3e 100644
--- a/README.md
+++ b/README.md
@@ -34,11 +34,12 @@ We do provide a Colab notebook for this recipe.
 
 ### LibriSpeech
 
-We provide 3 models for this recipe:
+We provide 4 models for this recipe:
 
 - [conformer CTC model][LibriSpeech_conformer_ctc]
 - [TDNN LSTM CTC model][LibriSpeech_tdnn_lstm_ctc]
-- [RNN-T Conformer model][LibriSpeech_transducer]
+- [Transducer: Conformer encoder + LSTM decoder][LibriSpeech_transducer]
+- [Transducer: Conformer encoder + Embedding decoder][LibriSpeech_transducer_stateless]
 
 #### Conformer CTC Model
 
@@ -62,9 +63,9 @@ The WER for this model is:
 We provide a Colab notebook to run a pre-trained TDNN LSTM CTC model:
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1kNmDXNMwREi0rZGAOIAOJo93REBuOTcd?usp=sharing)
 
-#### RNN-T Conformer model
+#### Transducer: Conformer encoder + LSTM decoder
 
-Using Conformer as encoder.
+Using Conformer as encoder and LSTM as decoder.
 
 The best WER with greedy search is:
 
@@ -74,6 +75,19 @@ The best WER with greedy search is:
 
 We provide a Colab notebook to run a pre-trained RNN-T conformer model:
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1_u6yK9jDkPwG_NLrZMN2XK7Aeq4suMO2?usp=sharing)
+
+#### Transducer: Conformer encoder + Embedding decoder
+
+Using Conformer as encoder. The decoder consists of 1 embedding layer
+and 1 convolutional layer.
+
+The best WER with beam search with beam size 4 is:
+
+|     | test-clean | test-other |
+|-----|------------|------------|
+| WER | 2.92       | 7.37       |
+
+Note: No auxiliary losses are used in the training and no LMs are used
+in the decoding.
 
 ### Aishell
 
@@ -143,6 +157,7 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
 [LibriSpeech_tdnn_lstm_ctc]: egs/librispeech/ASR/tdnn_lstm_ctc
 [LibriSpeech_conformer_ctc]: egs/librispeech/ASR/conformer_ctc
 [LibriSpeech_transducer]: egs/librispeech/ASR/transducer
+[LibriSpeech_transducer_stateless]: egs/librispeech/ASR/transducer_stateless
 [Aishell_tdnn_lstm_ctc]: egs/aishell/ASR/tdnn_lstm_ctc
 [Aishell_conformer_ctc]: egs/aishell/ASR/conformer_ctc
 [TIMIT_tdnn_lstm_ctc]: egs/timit/ASR/tdnn_lstm_ctc
diff --git a/egs/librispeech/ASR/README.md b/egs/librispeech/ASR/README.md
index 1ee12b0ab..c8ee98d7d 100644
--- a/egs/librispeech/ASR/README.md
+++ b/egs/librispeech/ASR/README.md
@@ -12,5 +12,9 @@ The following table lists the differences among them.
 |                        | Encoder   | Decoder            |
 |------------------------|-----------|--------------------|
 | `transducer`           | Conformer | LSTM               |
-| `transducer_stateless` | Conformer | Conv1d + Embedding |
+| `transducer_stateless` | Conformer | Embedding + Conv1d |
 | `transducer_lstm`      | LSTM      | LSTM               |
+
+The decoder in `transducer_stateless` is modified from the paper
+[Rnn-Transducer with Stateless Prediction Network](https://ieeexplore.ieee.org/document/9054419/).
+We place an additional Conv1d layer right after the input embedding layer.
diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index 19f5b18a7..317b1591a 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -1,11 +1,69 @@
 ## Results
 
-### LibriSpeech BPE training results (RNN-T)
+### LibriSpeech BPE training results (Transducer)
+
+#### 2021-12-22
+
+Conformer encoder + non-recurrent decoder. The decoder
+contains only an embedding layer and a Conv1d (with kernel size 2).
+
+The WERs are
+
+|                           | test-clean | test-other | comment                                  |
+|---------------------------|------------|------------|------------------------------------------|
+| greedy search             | 2.99       | 7.52       | --epoch 20, --avg 10, --max-duration 100 |
+| beam search (beam size 2) | 2.95       | 7.43       |                                          |
+| beam search (beam size 3) | 2.94       | 7.37       |                                          |
+| beam search (beam size 4) | 2.92       | 7.37       |                                          |
+| beam search (beam size 5) | 2.93       | 7.38       |                                          |
+| beam search (beam size 8) | 2.92       | 7.38       |                                          |
+
+The training command for reproducing is given below:
+
+```
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+
+./transducer_stateless/train.py \
+  --world-size 4 \
+  --num-epochs 30 \
+  --start-epoch 0 \
+  --exp-dir transducer_stateless/exp-full \
+  --full-libri 1 \
+  --max-duration 250 \
+  --lr-factor 3
+```
+
+The tensorboard training log can be found at
+
+
+The decoding command is:
+```
+epoch=20
+avg=10
+
+## greedy search
+./transducer_stateless/decode.py \
+  --epoch $epoch \
+  --avg $avg \
+  --exp-dir transducer_stateless/exp-full \
+  --bpe-model ./data/lang_bpe_500/bpe.model \
+  --max-duration 100
+
+## beam search
+./transducer_stateless/decode.py \
+  --epoch $epoch \
+  --avg $avg \
+  --exp-dir transducer_stateless/exp-full \
+  --bpe-model ./data/lang_bpe_500/bpe.model \
+  --max-duration 100 \
+  --decoding-method beam_search \
+  --beam-size 4
+```
+
 #### 2021-12-17
 
 Using commit `cb04c8a7509425ab45fae888b0ca71bbbd23f0de`.
 
-RNN-T + Conformer encoder.
+Conformer encoder + LSTM decoder.
 
 The best WER is
diff --git a/egs/librispeech/ASR/transducer/model.py b/egs/librispeech/ASR/transducer/model.py
index 8a4d3ca69..cb9afd8a2 100644
--- a/egs/librispeech/ASR/transducer/model.py
+++ b/egs/librispeech/ASR/transducer/model.py
@@ -27,11 +27,6 @@ from encoder_interface import EncoderInterface
 
 from icefall.utils import add_sos
 
-assert hasattr(torchaudio.functional, "rnnt_loss"), (
-    f"Current torchaudio version: {torchaudio.__version__}\n"
-    "Please install a version >= 0.10.0"
-)
-
 
 class Transducer(nn.Module):
     """It implements https://arxiv.org/pdf/1211.3711.pdf
@@ -115,6 +110,11 @@ class Transducer(nn.Module):
         # Note: y does not start with SOS
         y_padded = y.pad(mode="constant", padding_value=0)
 
+        assert hasattr(torchaudio.functional, "rnnt_loss"), (
+            f"Current torchaudio version: {torchaudio.__version__}\n"
+            "Please install a version >= 0.10.0"
+        )
+
         loss = torchaudio.functional.rnnt_loss(
             logits=logits,
             targets=y_padded,
diff --git a/egs/librispeech/ASR/transducer_lstm/model.py b/egs/librispeech/ASR/transducer_lstm/model.py
index 8a4d3ca69..cb9afd8a2 100644
--- a/egs/librispeech/ASR/transducer_lstm/model.py
+++ b/egs/librispeech/ASR/transducer_lstm/model.py
@@ -27,11 +27,6 @@ from encoder_interface import EncoderInterface
 
 from icefall.utils import add_sos
 
-assert hasattr(torchaudio.functional, "rnnt_loss"), (
-    f"Current torchaudio version: {torchaudio.__version__}\n"
-    "Please install a version >= 0.10.0"
-)
-
 
 class Transducer(nn.Module):
     """It implements https://arxiv.org/pdf/1211.3711.pdf
@@ -115,6 +110,11 @@ class Transducer(nn.Module):
         # Note: y does not start with SOS
         y_padded = y.pad(mode="constant", padding_value=0)
 
+        assert hasattr(torchaudio.functional, "rnnt_loss"), (
+            f"Current torchaudio version: {torchaudio.__version__}\n"
+            "Please install a version >= 0.10.0"
+        )
+
         loss = torchaudio.functional.rnnt_loss(
             logits=logits,
             targets=y_padded,
diff --git a/egs/librispeech/ASR/transducer_stateless/model.py b/egs/librispeech/ASR/transducer_stateless/model.py
index 7053f621e..2f0f9a183 100644
--- a/egs/librispeech/ASR/transducer_stateless/model.py
+++ b/egs/librispeech/ASR/transducer_stateless/model.py
@@ -27,11 +27,6 @@ from encoder_interface import EncoderInterface
 
 from icefall.utils import add_sos
 
-assert hasattr(torchaudio.functional, "rnnt_loss"), (
-    f"Current torchaudio version: {torchaudio.__version__}\n"
-    "Please install a version >= 0.10.0"
-)
-
 
 class Transducer(nn.Module):
     """It implements https://arxiv.org/pdf/1211.3711.pdf
@@ -113,6 +108,11 @@ class Transducer(nn.Module):
         # Note: y does not start with SOS
         y_padded = y.pad(mode="constant", padding_value=0)
 
+        assert hasattr(torchaudio.functional, "rnnt_loss"), (
+            f"Current torchaudio version: {torchaudio.__version__}\n"
+            "Please install a version >= 0.10.0"
+        )
+
         loss = torchaudio.functional.rnnt_loss(
             logits=logits,
             targets=y_padded,
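
For readers who want the idea behind the new `transducer_stateless` recipe without opening its decoder code, below is a minimal sketch of the stateless prediction network the patch describes: an embedding layer followed by a Conv1d with kernel size 2. The class name `StatelessDecoder`, the argument names, and the left-padding/ReLU details are illustrative assumptions for exposition, not the exact icefall implementation:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class StatelessDecoder(nn.Module):
    """Embedding + Conv1d prediction network (no recurrence).

    Illustrative sketch only; names and details are assumptions.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, context_size: int = 2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.context_size = context_size
        # The Conv1d mixes the embeddings of the last `context_size` labels;
        # this fixed window replaces the hidden state of an LSTM decoder.
        self.conv = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=context_size)

    def forward(self, y: torch.Tensor) -> torch.Tensor:
        # y: (N, U) label indices, not including SOS.
        embedding_out = self.embedding(y)               # (N, U, C)
        embedding_out = embedding_out.permute(0, 2, 1)  # (N, C, U) for Conv1d
        # Left-pad along the label axis so each output position sees only the
        # current and previous labels, and the output length stays U.
        embedding_out = F.pad(embedding_out, pad=(self.context_size - 1, 0))
        out = self.conv(embedding_out)                  # (N, C, U)
        return F.relu(out).permute(0, 2, 1)             # (N, U, C)


# Example usage with toy sizes:
decoder = StatelessDecoder(vocab_size=500, embedding_dim=256)
y = torch.randint(0, 500, (8, 10))  # batch of 8 label sequences of length 10
print(decoder(y).shape)  # torch.Size([8, 10, 256])
```

Because this prediction network conditions only on the last `context_size` labels rather than an unbounded recurrent state, it acts like a small n-gram predictor, which is the sense in which the decoder is "stateless". Separately, the patch moves the `torchaudio.functional.rnnt_loss` availability assertion from module import time into `forward()`, presumably so the model files can still be imported under older torchaudio versions, while the version check continues to fire before the loss is actually computed.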