From 1e4920410ffbdf9cfaa5729bd634ba38a389e87a Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Tue, 9 Nov 2021 20:46:11 +0800
Subject: [PATCH] Update RESULTS using vocab size 500, att rate 0.8

---
 egs/librispeech/ASR/RESULTS.md              | 55 +++++++++++++++++++
 egs/librispeech/ASR/conformer_ctc/decode.py |  5 ++
 egs/librispeech/ASR/conformer_ctc/train.py  | 14 ++++-
 .../ASR/conformer_ctc/transformer.py        | 14 ++++-
 icefall/decode.py                           |  9 ++-
 5 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
index eb679b951..d7229f6b0 100644
--- a/egs/librispeech/ASR/RESULTS.md
+++ b/egs/librispeech/ASR/RESULTS.md
@@ -1,6 +1,61 @@
 ## Results
 
 ### LibriSpeech BPE training results (Conformer-CTC)
+
+#### 2021-11-09
+
+The best WERs, as of 2021-11-09, for the LibriSpeech test datasets are below
+(using HLG decoding + n-gram LM rescoring + attention decoder rescoring):
+
+|     | test-clean | test-other |
+|-----|------------|------------|
+| WER | 2.42       | 5.73       |
+
+Scale values used in n-gram LM rescoring and attention rescoring for the best WERs are:
+| ngram_lm_scale | attention_scale |
+|----------------|-----------------|
+| 2.0            | 2.0             |
+
+
+To reproduce the above result, use the following commands for training:
+
+```
+cd egs/librispeech/ASR
+./prepare.sh
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+./conformer_ctc/train.py \
+  --exp-dir conformer_ctc/exp_500_att0.8 \
+  --lang-dir data/lang_bpe_500 \
+  --att-rate 0.8 \
+  --full-libri 1 \
+  --max-duration 200 \
+  --concatenate-cuts 0 \
+  --world-size 4 \
+  --bucketing-sampler 1 \
+  --start-epoch 0 \
+  --num-epochs 80
+```
+
+and the following command for decoding:
+
+```
+./conformer_ctc/decode.py \
+  --exp-dir conformer_ctc/exp_500_att0.8 \
+  --lang-dir data/lang_bpe_500 \
+  --max-duration 30 \
+  --concatenate-cuts 0 \
+  --bucketing-sampler 1 \
+  --num-paths 1000 \
+  --epoch 77 \
+  --avg 55 \
+  --method attention-decoder \
+  --nbest-scale 0.5
+```
+
+You can find the pre-trained model by visiting
+
+
+
 #### 2021-08-19
 (Wei Kang): Result of https://github.com/k2-fsa/icefall/pull/13
 
diff --git a/egs/librispeech/ASR/conformer_ctc/decode.py b/egs/librispeech/ASR/conformer_ctc/decode.py
index c5ae3ad7d..ad936b6e4 100755
--- a/egs/librispeech/ASR/conformer_ctc/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc/decode.py
@@ -601,6 +601,11 @@ def main():
         G.labels[G.labels >= first_word_disambig_id] = 0
         G = k2.Fsa.from_fsas([G]).to(device)
         G = k2.arc_sort(G)
+        # Save a dummy value so that it can be loaded in C++.
+        # See https://github.com/pytorch/pytorch/issues/67902
+        # for why we need to do this.
+        G["dummy"] = 1
+
         torch.save(G.as_dict(), params.lm_dir / "G_4_gram.pt")
     else:
         logging.info("Loading pre-compiled G_4_gram.pt")
diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py
index 1384204dd..46ea5c60c 100755
--- a/egs/librispeech/ASR/conformer_ctc/train.py
+++ b/egs/librispeech/ASR/conformer_ctc/train.py
@@ -81,7 +81,7 @@ def get_parser():
     parser.add_argument(
         "--num-epochs",
         type=int,
-        default=35,
+        default=78,
         help="Number of epochs to train.",
     )
 
@@ -108,13 +108,22 @@ def get_parser():
     parser.add_argument(
         "--lang-dir",
         type=str,
-        default="data/lang_bpe_5000",
+        default="data/lang_bpe_500",
         help="""The lang dir
         It contains language related input files such as
         "lexicon.txt"
         """,
     )
 
+    parser.add_argument(
+        "--att-rate",
+        type=float,
+        default=0.8,
+        help="""The attention rate.
+        The total loss is (1 - att_rate) * ctc_loss + att_rate * att_loss
+        """,
+    )
+
     return parser
 
 
@@ -198,7 +207,6 @@ def get_params() -> AttributeDict:
         "beam_size": 10,
         "reduction": "sum",
         "use_double_scores": True,
-        "att_rate": 0.7,
         # parameters for Noam
         "weight_decay": 1e-6,
         "lr_factor": 5.0,
diff --git a/egs/librispeech/ASR/conformer_ctc/transformer.py b/egs/librispeech/ASR/conformer_ctc/transformer.py
index 3e6abb695..c9666362f 100644
--- a/egs/librispeech/ASR/conformer_ctc/transformer.py
+++ b/egs/librispeech/ASR/conformer_ctc/transformer.py
@@ -311,7 +311,7 @@ class Transformer(nn.Module):
         self,
         memory: torch.Tensor,
         memory_key_padding_mask: torch.Tensor,
-        token_ids: List[List[int]],
+        token_ids: List[torch.Tensor],
         sos_id: int,
         eos_id: int,
     ) -> torch.Tensor:
@@ -334,6 +334,11 @@ class Transformer(nn.Module):
         """
         # The common part between this function and decoder_forward could be
         # extracted as a separate function.
+        if isinstance(token_ids[0], torch.Tensor):
+            # This branch is executed by torchscript in C++.
+            # See https://github.com/k2-fsa/k2/pull/870
+            # https://github.com/k2-fsa/k2/blob/3c1c18400060415b141ccea0115fd4bf0ad6234e/k2/torch/bin/attention_rescore.cu#L286
+            token_ids = [tolist(t) for t in token_ids]
         ys_in = add_sos(token_ids, sos_id=sos_id)
         ys_in = [torch.tensor(y) for y in ys_in]
@@ -660,7 +665,7 @@ class PositionalEncoding(nn.Module):
         self.xscale = math.sqrt(self.d_model)
         self.dropout = nn.Dropout(p=dropout)
         # not doing: self.pe = None because of errors thrown by torchscript
-        self.pe = torch.zeros(0, 0, dtype=torch.float32)
+        self.pe = torch.zeros(1, 0, self.d_model, dtype=torch.float32)
 
     def extend_pe(self, x: torch.Tensor) -> None:
         """Extend the time t in the positional encoding if required.
@@ -1000,3 +1005,8 @@ def add_eos(token_ids: List[List[int]], eos_id: int) -> List[List[int]]:
       with EOS ID.
     """
     return [utt + [eos_id] for utt in token_ids]
+
+
+def tolist(t: torch.Tensor) -> List[int]:
+    """Used by jit"""
+    return torch.jit.annotate(List[int], t.tolist())
diff --git a/icefall/decode.py b/icefall/decode.py
index 619b3267a..c47c5da27 100644
--- a/icefall/decode.py
+++ b/icefall/decode.py
@@ -224,6 +224,7 @@ class Nbest(object):
         else:
             word_seq = lattice.aux_labels.index(path)
             word_seq = word_seq.remove_axis(word_seq.num_axes - 2)
+        word_seq = word_seq.remove_values_leq(0)
 
         # Each utterance has `num_paths` paths but some of them transduces
         # to the same word sequence, so we need to remove repeated word
@@ -363,7 +364,7 @@ class Nbest(object):
           Return a ragged tensor with 2 axes [utt][path_scores].
          Its dtype is torch.float64.
        """
-        saved_scores = self.fsa.scores
+        saved_scores = self.fsa.scores.clone()
 
         # The `scores` of every arc consists of `am_scores` and `lm_scores`
         self.fsa.scores = self.fsa.scores - self.fsa.lm_scores
@@ -390,10 +391,10 @@ class Nbest(object):
          Return a ragged tensor with 2 axes [utt][path_scores].
          Its dtype is torch.float64.
""" - saved_scores = self.fsa.scores + saved_scores = self.fsa.scores.clone() # The `scores` of every arc consists of `am_scores` and `lm_scores` - self.fsa.scores = self.fsa.lm_scores + self.fsa.scores = self.fsa.lm_scores.clone() lm_scores = self.fsa.get_tot_scores( use_double_scores=True, log_semiring=False @@ -870,6 +871,7 @@ def rescore_with_attention_decoder( ngram_lm_scale_list = [0.01, 0.05, 0.08] ngram_lm_scale_list += [0.1, 0.3, 0.5, 0.6, 0.7, 0.9, 1.0] ngram_lm_scale_list += [1.1, 1.2, 1.3, 1.5, 1.7, 1.9, 2.0] + ngram_lm_scale_list += [2.1, 2.2, 2.3, 2.5, 3.0, 4.0, 5.0] else: ngram_lm_scale_list = [ngram_lm_scale] @@ -877,6 +879,7 @@ def rescore_with_attention_decoder( attention_scale_list = [0.01, 0.05, 0.08] attention_scale_list += [0.1, 0.3, 0.5, 0.6, 0.7, 0.9, 1.0] attention_scale_list += [1.1, 1.2, 1.3, 1.5, 1.7, 1.9, 2.0] + attention_scale_list += [2.1, 2.2, 2.3, 2.5, 3.0, 4.0, 5.0] else: attention_scale_list = [attention_scale]