diff --git a/docs/source/recipes/librispeech/conformer_ctc.rst b/docs/source/recipes/librispeech/conformer_ctc.rst
index 57ac246e1..0c3c76c23 100644
--- a/docs/source/recipes/librispeech/conformer_ctc.rst
+++ b/docs/source/recipes/librispeech/conformer_ctc.rst
@@ -303,6 +303,10 @@ The commonly used options are:
 
     $ cd egs/librispeech/ASR
     $ ./conformer_ctc/decode.py --method ctc-decoding --max-duration 300
+    # Caution: The above command is tested with a model with vocab size 500.
+    # The default settings in the master will not work.
+    # Please see https://github.com/k2-fsa/icefall/issues/103
+    # We will fix it later and delete this note.
 
 And the following command uses attention decoder for rescoring:
 
@@ -328,6 +332,8 @@ Usage:
 .. code-block:: bash
 
     $ cd egs/librispeech/ASR
+    # NOTE: Tested with a model with vocab size 500.
+    # It won't work for a model with vocab size 5000.
     $ ./conformer_ctc/decode.py \
         --epoch 25 \
         --avg 1 \
@@ -399,7 +405,7 @@ Download the pre-trained model
 
 The following commands describe how to download the pre-trained model:
 
-.. code-block::
+.. code-block:: bash
 
     $ cd egs/librispeech/ASR
     $ mkdir tmp
@@ -410,10 +416,23 @@ The following commands describe how to download the pre-trained model:
 
 .. CAUTION::
 
    You have to use ``git lfs`` to download the pre-trained model.
+   Otherwise, you will have the following issue when running ``decode.py``:
+
+   .. code-block::
+
+      _pickle.UnpicklingError: invalid load key, 'v'
+
+   To fix that issue, please use:
+
+   .. code-block:: bash
+
+      cd icefall_asr_librispeech_conformer_ctc
+      git lfs pull
+
 
 .. CAUTION::
 
-   In order to use this pre-trained model, your k2 version has to be v1.7 or later.
+   In order to use this pre-trained model, your k2 version has to be v1.9 or later.
 
 After downloading, you will have the following files:
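A note on the `git lfs` caution added above: without git-lfs, GitHub checks out small text pointer files in place of the real `*.pt` checkpoints. A pointer file begins with the line `version https://git-lfs.github.com/spec/v1`, and `torch.load` chokes on its first byte, which is exactly the `invalid load key, 'v'` error quoted in the docs. A minimal sketch of how one could catch that case early (`load_checkpoint_safely` is a hypothetical helper, not part of icefall):

```python
from pathlib import Path

import torch


def load_checkpoint_safely(path: str):
    """Refuse to unpickle an un-fetched git-lfs pointer file."""
    head = Path(path).read_bytes()[:64]
    if head.startswith(b"version https://git-lfs"):
        raise RuntimeError(
            f"{path} is a git-lfs pointer, not a real checkpoint. "
            "Run `git lfs pull` inside the model directory first."
        )
    # map_location="cpu" so the check also works on machines without a GPU
    return torch.load(path, map_location="cpu")
```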
diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py
index 223c8d993..1384204dd 100755
--- a/egs/librispeech/ASR/conformer_ctc/train.py
+++ b/egs/librispeech/ASR/conformer_ctc/train.py
@@ -362,22 +362,25 @@ def compute_loss(
 
     if params.att_rate != 0.0:
         with torch.set_grad_enabled(is_training):
-            if hasattr(model, "module"):
-                att_loss = model.module.decoder_forward(
-                    encoder_memory,
-                    memory_mask,
-                    token_ids=token_ids,
-                    sos_id=graph_compiler.sos_id,
-                    eos_id=graph_compiler.eos_id,
-                )
-            else:
-                att_loss = model.decoder_forward(
-                    encoder_memory,
-                    memory_mask,
-                    token_ids=token_ids,
-                    sos_id=graph_compiler.sos_id,
-                    eos_id=graph_compiler.eos_id,
-                )
+            mmodel = model.module if hasattr(model, "module") else model
+            # Note: We need to generate an unsorted version of token_ids
+            # `encode_supervisions()` called above sorts text, but
+            # encoder_memory and memory_mask are not sorted, so we
+            # use an unsorted version `supervisions["text"]` to regenerate
+            # the token_ids
+            #
+            # See https://github.com/k2-fsa/icefall/issues/97
+            # for more details
+            unsorted_token_ids = graph_compiler.texts_to_ids(
+                supervisions["text"]
+            )
+            att_loss = mmodel.decoder_forward(
+                encoder_memory,
+                memory_mask,
+                token_ids=unsorted_token_ids,
+                sos_id=graph_compiler.sos_id,
+                eos_id=graph_compiler.eos_id,
+            )
         loss = (1.0 - params.att_rate) * ctc_loss + params.att_rate * att_loss
     else:
         loss = ctc_loss
diff --git a/egs/librispeech/ASR/conformer_mmi/train-with-attention.py b/egs/librispeech/ASR/conformer_mmi/train-with-attention.py
index 8b8994059..011dadd73 100755
--- a/egs/librispeech/ASR/conformer_mmi/train-with-attention.py
+++ b/egs/librispeech/ASR/conformer_mmi/train-with-attention.py
@@ -394,24 +394,16 @@ def compute_loss(
         mmi_loss = loss_fn(dense_fsa_vec=dense_fsa_vec, texts=texts)
 
     if params.att_rate != 0.0:
-        token_ids = graph_compiler.texts_to_ids(texts)
+        token_ids = graph_compiler.texts_to_ids(supervisions["text"])
         with torch.set_grad_enabled(is_training):
-            if hasattr(model, "module"):
-                att_loss = model.module.decoder_forward(
-                    encoder_memory,
-                    memory_mask,
-                    token_ids=token_ids,
-                    sos_id=graph_compiler.sos_id,
-                    eos_id=graph_compiler.eos_id,
-                )
-            else:
-                att_loss = model.decoder_forward(
-                    encoder_memory,
-                    memory_mask,
-                    token_ids=token_ids,
-                    sos_id=graph_compiler.sos_id,
-                    eos_id=graph_compiler.eos_id,
-                )
+            mmodel = model.module if hasattr(model, "module") else model
+            att_loss = mmodel.decoder_forward(
+                encoder_memory,
+                memory_mask,
+                token_ids=token_ids,
+                sos_id=graph_compiler.sos_id,
+                eos_id=graph_compiler.eos_id,
+            )
         loss = (1.0 - params.att_rate) * mmi_loss + params.att_rate * att_loss
     else:
         loss = mmi_loss
diff --git a/egs/librispeech/ASR/conformer_mmi/train.py b/egs/librispeech/ASR/conformer_mmi/train.py
index 6580792ff..c36677762 100755
--- a/egs/librispeech/ASR/conformer_mmi/train.py
+++ b/egs/librispeech/ASR/conformer_mmi/train.py
@@ -394,24 +394,16 @@ def compute_loss(
         mmi_loss = loss_fn(dense_fsa_vec=dense_fsa_vec, texts=texts)
 
     if params.att_rate != 0.0:
-        token_ids = graph_compiler.texts_to_ids(texts)
+        token_ids = graph_compiler.texts_to_ids(supervisions["text"])
         with torch.set_grad_enabled(is_training):
-            if hasattr(model, "module"):
-                att_loss = model.module.decoder_forward(
-                    encoder_memory,
-                    memory_mask,
-                    token_ids=token_ids,
-                    sos_id=graph_compiler.sos_id,
-                    eos_id=graph_compiler.eos_id,
-                )
-            else:
-                att_loss = model.decoder_forward(
-                    encoder_memory,
-                    memory_mask,
-                    token_ids=token_ids,
-                    sos_id=graph_compiler.sos_id,
-                    eos_id=graph_compiler.eos_id,
-                )
+            mmodel = model.module if hasattr(model, "module") else model
+            att_loss = mmodel.decoder_forward(
+                encoder_memory,
+                memory_mask,
+                token_ids=token_ids,
+                sos_id=graph_compiler.sos_id,
+                eos_id=graph_compiler.eos_id,
+            )
         loss = (1.0 - params.att_rate) * mmi_loss + params.att_rate * att_loss
     else:
         loss = mmi_loss
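The core fix in the three training scripts above is the same: `encode_supervisions()` sorts utterances by decreasing duration (as required for CTC training), while `encoder_memory` and `memory_mask` stay in the original batch order, so the token ids fed to `decoder_forward()` must be regenerated from the unsorted `supervisions["text"]`. The `mmodel = model.module if hasattr(model, "module") else model` line just unwraps a `DistributedDataParallel` model so the custom `decoder_forward` method is reachable. A self-contained toy illustration of the ordering bug (all names below are made up; icefall's real helpers are `encode_supervisions()` and `texts_to_ids()`):

```python
# Encoder outputs keep the original batch order, but CTC supervisions
# are sorted by decreasing duration. Tokenizing the *sorted* texts pairs
# row i of the encoder output with the wrong utterance's token ids.
utts = {"utt1": "short", "utt2": "a much longer utterance", "utt3": "medium"}
durations = {"utt1": 1.2, "utt2": 7.9, "utt3": 3.4}

batch_order = ["utt1", "utt2", "utt3"]  # order of encoder_memory rows
sorted_order = sorted(batch_order, key=lambda u: -durations[u])

sorted_texts = [utts[u] for u in sorted_order]   # what the old code tokenized
unsorted_texts = [utts[u] for u in batch_order]  # what the fix tokenizes

# Row 0 of encoder_memory is utt1, yet sorted_texts[0] is utt2's text:
assert sorted_texts[0] != unsorted_texts[0]
```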
diff --git a/icefall/decode.py b/icefall/decode.py
index 62d27dd68..d11920618 100644
--- a/icefall/decode.py
+++ b/icefall/decode.py
@@ -224,6 +224,7 @@ class Nbest(object):
         else:
             word_seq = lattice.aux_labels.index(path)
             word_seq = word_seq.remove_axis(word_seq.num_axes - 2)
+        word_seq = word_seq.remove_values_leq(0)
 
         # Each utterance has `num_paths` paths but some of them transduces
         # to the same word sequence, so we need to remove repeated word
@@ -732,6 +733,12 @@ def rescore_with_whole_lattice(
             logging.info(
                 f"num_arcs before pruning: {inv_lattice.arcs.num_elements()}"
             )
+            logging.info(
+                "This OOM is not an error. You can ignore it. "
+                "If your model does not converge well, or --max-duration "
+                "is too large, or the input sound file is difficult to "
+                "decode, you will meet this exception."
+            )
 
             # NOTE(fangjun): The choice of the threshold 1e-9 is arbitrary here
             # to avoid OOM. You may need to fine tune it.
@@ -864,6 +871,7 @@ def rescore_with_attention_decoder(
         ngram_lm_scale_list = [0.01, 0.05, 0.08]
         ngram_lm_scale_list += [0.1, 0.3, 0.5, 0.6, 0.7, 0.9, 1.0]
         ngram_lm_scale_list += [1.1, 1.2, 1.3, 1.5, 1.7, 1.9, 2.0]
+        ngram_lm_scale_list += [2.1, 2.2, 2.3, 2.5, 3.0, 4.0, 5.0]
     else:
         ngram_lm_scale_list = [ngram_lm_scale]
 
@@ -871,6 +879,7 @@ def rescore_with_attention_decoder(
         attention_scale_list = [0.01, 0.05, 0.08]
         attention_scale_list += [0.1, 0.3, 0.5, 0.6, 0.7, 0.9, 1.0]
         attention_scale_list += [1.1, 1.2, 1.3, 1.5, 1.7, 1.9, 2.0]
+        attention_scale_list += [2.1, 2.2, 2.3, 2.5, 3.0, 4.0, 5.0]
     else:
         attention_scale_list = [attention_scale]
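Regarding `word_seq.remove_values_leq(0)` in `Nbest`: the label sequences read off lattice paths still contain 0 (epsilon) and -1 (the final arc), so two paths with identical words but different epsilon placement would otherwise look distinct and survive deduplication. A plain-Python sketch of the idea (k2 does this on ragged tensors, on the GPU):

```python
# Paths through a lattice, as word-id sequences: 0 is epsilon, -1 marks
# the final arc. The first two paths spell the same words.
paths = [
    [0, 17, 0, 42, -1],
    [17, 0, 0, 42, -1],
    [17, 99, 42, -1],
]


def remove_values_leq(seq, cutoff=0):
    """Keep only values strictly greater than `cutoff`."""
    return [v for v in seq if v > cutoff]


unique = {tuple(remove_values_leq(p)) for p in paths}
assert unique == {(17, 42), (17, 99, 42)}  # duplicates now collapse
```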
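The new `logging.info` in `rescore_with_whole_lattice` is emitted inside an exception handler: composition with the 4-gram LM is first attempted without pruning, and only on a CUDA OOM does the code prune the lattice and retry, so the message really is informational rather than a failure. A schematic of that retry pattern with a simulated OOM (`intersect` and `rescore_with_retry` are stand-ins, not icefall's actual functions):

```python
import logging


def intersect(lattice, lm, pruned=False):
    # Stand-in for the k2 composition; pretend the un-pruned attempt OOMs.
    if not pruned:
        raise RuntimeError("CUDA out of memory (simulated)")
    return f"rescored({lattice}, {lm})"


def rescore_with_retry(lattice, lm, threshold=1e-9):
    try:
        return intersect(lattice, lm)  # fast path: no pruning
    except RuntimeError as e:
        if "out of memory" not in str(e):
            raise
        logging.info(
            "OOM is expected here; pruning with threshold %s and retrying",
            threshold,
        )
        # icefall prunes via k2's arc-posterior pruning before retrying.
        return intersect(lattice, lm, pruned=True)


print(rescore_with_retry("lattice", "G_4gram"))
```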
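Finally, the extended scale lists widen the grid that attention-decoder rescoring searches: for every `(ngram_lm_scale, attention_scale)` pair, `decode.py` combines the three per-path scores and reports a separate WER, so optima above 2.0 can now be found without editing the code. Schematically (a sketch with made-up scores, not icefall's exact code):

```python
# Made-up scores for a single path.
am_score, ngram_lm_score, att_score = 31.7, -12.4, 28.9

ngram_lm_scale_list = [0.01, 0.05, 0.08, 0.1, 0.3, 0.5, 1.0, 2.0, 3.0, 5.0]
attention_scale_list = list(ngram_lm_scale_list)

# Total score per (lm_scale, attention_scale) pair; one WER is reported
# per key, e.g. ngram_lm_scale_0.5_attention_scale_1.0.
tot_scores = {
    (lm_s, att_s): am_score + lm_s * ngram_lm_score + att_s * att_score
    for lm_s in ngram_lm_scale_list
    for att_s in attention_scale_list
}

best_pair = max(tot_scores, key=tot_scores.get)
print(best_pair)  # (0.01, 5.0) for these made-up numbers
```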