diff --git a/docs/source/recipes/librispeech/conformer_ctc.rst b/docs/source/recipes/librispeech/conformer_ctc.rst index bda11ae38..2cb04d1ba 100644 --- a/docs/source/recipes/librispeech/conformer_ctc.rst +++ b/docs/source/recipes/librispeech/conformer_ctc.rst @@ -14,6 +14,12 @@ with the `LibriSpeech `_ dataset. We recommend you to use a GPU or several GPUs to run this recipe. +In this tutorial, you will learn: + + - (1) How to prepare data for training and decoding + - (2) How to start the training, either with a single GPU or multiple GPUs + - (3) How to do decoding after training, with n-gram LM rescoring and attention decoder rescoring + - (4) How to use a pre-trained model, provided by us Data preparation ---------------- @@ -81,12 +87,12 @@ The following options are used quite often: - ``--full-libri`` If it's True, the training part uses all the training data, i.e., - 960 hours. Otherwise, the training part uses only 100 hours subset. + 960 hours. Otherwise, the training part uses only the subset + ``train-clean-100``, which has 100 hours of training data. .. CAUTION:: - The training set is perturbed by two different speeds: - one with a value 0.9 and the other is 1.1. + The training set is perturbed by speed with two factors: 0.9 and 1.1. If ``--full-libri`` is True, each epoch actually processes ``3x960 == 2880`` hours of data. @@ -143,11 +149,11 @@ The following options are used quite often: .. CAUTION:: Only multi-GPU single-machine DDP training is implemented at present. - Mult-GPU multi-machine DDP training will be added later. + Multi-GPU multi-machine DDP training will be added later. - ``--max-duration`` - It specifies number of seconds over all utterances in a + It specifies the number of seconds over all utterances in a batch, before **padding**. If you encounter CUDA OOM, please reduce it. For instance, if your are using V100 NVIDIA GPU, we recommend you to set it to ``200``. 
@@ -157,8 +163,8 @@ The following options are used quite often: Due to padding, the number of seconds of all utterances in a batch will usually be larger than ``--max-duration``. - A large value for ``--max-duration`` may cause OOM during training, - while a small value may increase the training time. You have to + A larger value for ``--max-duration`` may cause OOM during training, + while a smaller value may increase the training time. You have to tune it. @@ -272,6 +278,350 @@ training from epoch 3. Also, it trains for 10 epochs. Decoding -------- +The decoding part uses checkpoints saved by the training part, so you have +to run the training part first. + +.. code-block:: bash + + $ cd egs/librispeech/ASR + $ ./conformer_ctc/decode.py --help + +shows the options for decoding. + +The commonly used options are: + + - ``--method`` + + This specifies the decoding method. + + The following command uses attention decoder for rescoring: + + .. code-block:: + + $ cd egs/librispeech/ASR + $ ./conformer_ctc/decode.py --method attention-decoder --max-duration 30 --lattice-score-scale 0.5 + + - ``--lattice-score-scale`` + + It is used to scale down lattice scores so that we can have more unique + paths for rescoring. + + - ``--max-duration`` + + It has the same meaning as the one during training. A larger + value may cause OOM. + Pre-trained Model ----------------- +We have uploaded the pre-trained model to +`<https://huggingface.co/pkufool/icefall_asr_librispeech_conformer_ctc>`_. + +We describe how to use the pre-trained model to transcribe a sound file or +multiple sound files in the following. + +Install kaldifeat +~~~~~~~~~~~~~~~~~ + +`kaldifeat `_ is used to +extract features for a single sound file or multiple sound files +at the same time. + +Please refer to ``_ for installation. + +Download the pre-trained model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The following commands describe how to download the pre-trained model: + +.. 
code-block:: + + $ cd egs/librispeech/ASR + $ mkdir tmp + $ cd tmp + $ git lfs install + $ git clone https://huggingface.co/pkufool/icefall_asr_librispeech_conformer_ctc + +.. CAUTION:: + + You have to use ``git lfs`` to download the pre-trained model. + +After downloading, you will have the following files: + +.. code-block:: bash + + $ cd egs/librispeech/ASR + $ tree tmp + +.. code-block:: bash + + tmp + `-- icefall_asr_librispeech_conformer_ctc + |-- README.md + |-- data + | |-- lang_bpe + | | |-- HLG.pt + | | |-- bpe.model + | | |-- tokens.txt + | | `-- words.txt + | `-- lm + | `-- G_4_gram.pt + |-- exp + | `-- pretraind.pt + `-- test_wavs + |-- 1089-134686-0001.flac + |-- 1221-135766-0001.flac + |-- 1221-135766-0002.flac + `-- trans.txt + + 6 directories, 11 files + +**File descriptions**: + + - ``data/lang_bpe/HLG.pt`` + + It is the decoding graph. + + - ``data/lang_bpe/bpe.model`` + + It is a sentencepiece model. You can use it to reproduce our results. + + - ``data/lang_bpe/tokens.txt`` + + It contains tokens and their IDs, generated from ``bpe.model``. + Provided only for convenience so that you can look up the SOS/EOS ID easily. + + - ``data/lang_bpe/words.txt`` + + It contains words and their IDs. + + - ``data/lm/G_4_gram.pt`` + + It is a 4-gram LM, useful for LM rescoring. + + - ``exp/pretraind.pt`` + + It contains pre-trained model parameters, obtained by averaging + checkpoints from ``epoch-15.pt`` to ``epoch-34.pt``. + Note: We have removed optimizer ``state_dict`` to reduce file size. + + - ``test_wavs/*.flac`` + + It contains some test sound files from LibriSpeech ``test-clean`` dataset. + + - ``test_wavs/trans.txt`` + + It contains the reference transcripts for the sound files in ``test_wavs/``. + +The information of the test sound files is listed below: + +.. 
code-block:: bash + + $ soxi tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/*.flac + + Input File : 'tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac' + Channels : 1 + Sample Rate : 16000 + Precision : 16-bit + Duration : 00:00:06.62 = 106000 samples ~ 496.875 CDDA sectors + File Size : 116k + Bit Rate : 140k + Sample Encoding: 16-bit FLAC + + Input File : 'tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac' + Channels : 1 + Sample Rate : 16000 + Precision : 16-bit + Duration : 00:00:16.71 = 267440 samples ~ 1253.62 CDDA sectors + File Size : 343k + Bit Rate : 164k + Sample Encoding: 16-bit FLAC + + Input File : 'tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac' + Channels : 1 + Sample Rate : 16000 + Precision : 16-bit + Duration : 00:00:04.83 = 77200 samples ~ 361.875 CDDA sectors + File Size : 105k + Bit Rate : 174k + Sample Encoding: 16-bit FLAC + + Total Duration of 3 files: 00:00:28.16 + +Usage +~~~~~ + +.. code-block:: + + $ cd egs/librispeech/ASR + $ ./conformer_ctc/pretrained.py --help + +displays the help information. + +It supports three decoding methods: + + - HLG decoding + - HLG + n-gram LM rescoring + - HLG + n-gram LM rescoring + attention decoder rescoring + +HLG decoding +^^^^^^^^^^^^ + +HLG decoding uses the best path of the decoding lattice as the decoding result. + +The command to run HLG decoding is: + +.. 
code-block:: bash + + $ cd egs/librispeech/ASR + $ ./conformer_ctc/pretrained.py \ + --checkpoint ./tmp/icefall_asr_librispeech_conformer_ctc/exp/pretraind.pt \ + --words-file ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/words.txt \ + --HLG ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac + +The output is given below: + +.. code-block:: + + 2021-08-20 11:03:05,712 INFO [pretrained.py:217] device: cuda:0 + 2021-08-20 11:03:05,712 INFO [pretrained.py:219] Creating model + 2021-08-20 11:03:11,345 INFO [pretrained.py:238] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt + 2021-08-20 11:03:18,442 INFO [pretrained.py:255] Constructing Fbank computer + 2021-08-20 11:03:18,444 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] + 2021-08-20 11:03:18,507 INFO [pretrained.py:271] Decoding started + 2021-08-20 11:03:18,795 INFO [pretrained.py:300] Use HLG decoding + 2021-08-20 11:03:19,149 INFO [pretrained.py:339] + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac: + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac: + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED + BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + + 
./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac: + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION + + 2021-08-20 11:03:19,149 INFO [pretrained.py:341] Decoding Done + +HLG decoding + LM rescoring +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It uses an n-gram LM to rescore the decoding lattice and the best +path of the rescored lattice is the decoding result. + +The command to run HLG decoding + LM rescoring is: + +.. code-block:: bash + + $ cd egs/librispeech/ASR + $ ./conformer_ctc/pretrained.py \ + --checkpoint ./tmp/icefall_asr_librispeech_conformer_ctc/exp/pretraind.pt \ + --words-file ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/words.txt \ + --HLG ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt \ + --method whole-lattice-rescoring \ + --G ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt \ + --ngram-lm-scale 0.8 \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac + +Its output is: + +.. 
code-block:: + + 2021-08-20 11:12:17,565 INFO [pretrained.py:217] device: cuda:0 + 2021-08-20 11:12:17,565 INFO [pretrained.py:219] Creating model + 2021-08-20 11:12:23,728 INFO [pretrained.py:238] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt + 2021-08-20 11:12:30,035 INFO [pretrained.py:246] Loading G from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt + 2021-08-20 11:13:10,779 INFO [pretrained.py:255] Constructing Fbank computer + 2021-08-20 11:13:10,787 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] + 2021-08-20 11:13:10,798 INFO [pretrained.py:271] Decoding started + 2021-08-20 11:13:11,085 INFO [pretrained.py:305] Use HLG decoding + LM rescoring + 2021-08-20 11:13:11,736 INFO [pretrained.py:339] + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac: + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac: + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED + BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac: + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION + + 2021-08-20 11:13:11,737 INFO [pretrained.py:341] Decoding Done + +HLG decoding + LM rescoring + attention decoder rescoring +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It uses an n-gram LM to rescore the decoding lattice, extracts +n paths from the rescored lattice, rescores the extracted paths 
with +an attention decoder. The path with the highest score is the decoding result. + +The command to run HLG decoding + LM rescoring + attention decoder rescoring is: + +.. code-block:: bash + + $ cd egs/librispeech/ASR + $ ./conformer_ctc/pretrained.py \ + --checkpoint ./tmp/icefall_asr_librispeech_conformer_ctc/exp/pretraind.pt \ + --words-file ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/words.txt \ + --HLG ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt \ + --method attention-decoder \ + --G ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt \ + --ngram-lm-scale 1.3 \ + --attention-decoder-scale 1.2 \ + --lattice-score-scale 0.5 \ + --num-paths 100 \ + --sos-id 1 \ + --eos-id 1 \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac + +The output is below: + +.. 
code-block:: + + 2021-08-20 11:19:11,397 INFO [pretrained.py:217] device: cuda:0 + 2021-08-20 11:19:11,397 INFO [pretrained.py:219] Creating model + 2021-08-20 11:19:17,354 INFO [pretrained.py:238] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt + 2021-08-20 11:19:24,615 INFO [pretrained.py:246] Loading G from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt + 2021-08-20 11:20:04,576 INFO [pretrained.py:255] Constructing Fbank computer + 2021-08-20 11:20:04,584 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] + 2021-08-20 11:20:04,595 INFO [pretrained.py:271] Decoding started + 2021-08-20 11:20:04,854 INFO [pretrained.py:313] Use HLG + LM rescoring + attention decoder rescoring + 2021-08-20 11:20:05,805 INFO [pretrained.py:339] + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac: + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac: + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED + BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac: + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION + + 2021-08-20 11:20:05,805 INFO [pretrained.py:341] Decoding Done + +Colab notebook +-------------- + +We do provide a colab notebook for this recipe showing how to use a pre-trained model. + +|librispeech asr conformer ctc colab notebook| + +.. 
|librispeech asr conformer ctc colab notebook| image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/drive/1huyupXAcHsUrKaWfI83iMEJ6J0Nh0213?usp=sharing + +.. HINT:: + + Due to limited memory provided by Colab, you have to upgrade to Colab Pro to + run ``HLG decoding + LM rescoring`` and + ``HLG decoding + LM rescoring + attention decoder rescoring``. + Otherwise, you can only run ``HLG decoding`` with Colab. + +**Congratulations!** You have finished the librispeech ASR recipe with +conformer CTC models in ``icefall``. diff --git a/egs/librispeech/ASR/conformer_ctc/decode.py b/egs/librispeech/ASR/conformer_ctc/decode.py index 6abcf3385..321ce970e 100755 --- a/egs/librispeech/ASR/conformer_ctc/decode.py +++ b/egs/librispeech/ASR/conformer_ctc/decode.py @@ -57,28 +57,63 @@ def get_parser(): parser.add_argument( "--epoch", type=int, - default=9, + default=34, help="It specifies the checkpoint to use for decoding." "Note: Epoch counts from 0.", ) parser.add_argument( "--avg", type=int, - default=1, + default=20, help="Number of checkpoints to average. Automatically select " "consecutive checkpoints before the checkpoint specified by " "'--epoch'. ", ) + parser.add_argument( + "--method", + type=str, + default="attention-decoder", + help="""Decoding method. + Supported values are: + - (1) 1best. Extract the best path from the decoding lattice as the + decoding result. + - (2) nbest. Extract n paths from the decoding lattice; the path with + the highest score is the decoding result. + - (3) nbest-rescoring. Extract n paths from the decoding lattice, + rescore them with an n-gram LM (e.g., a 4-gram LM), the path with + the highest score is the decoding result. + - (4) whole-lattice. Rescore the decoding lattice with an n-gram LM + (e.g., a 4-gram LM), the best path of rescored lattice is the + decoding result. + - (5) attention-decoder. 
Extract n paths from the LM rescored lattice, + the path with the highest score is the decoding result. + - (6) nbest-oracle. Its WER is the lower bound of any n-best + rescoring method can achieve. Useful for debugging n-best + rescoring method. + """, + ) + + parser.add_argument( + "--num-paths", + type=int, + default=100, + help="""Number of paths for n-best based decoding method. + Used only when "method" is one of the following values: + nbest, nbest-rescoring, attention-decoder, and nbest-oracle + """, + ) + parser.add_argument( "--lattice-score-scale", type=float, default=1.0, - help="The scale to be applied to `lattice.scores`." - "It's needed if you use any kinds of n-best based rescoring. " - "Currently, it is used when the decoding method is: nbest, " - "nbest-rescoring, attention-decoder, and nbest-oracle. " - "A smaller value results in more unique paths.", + help="""The scale to be applied to `lattice.scores`. + It's needed if you use any kinds of n-best based rescoring. + Used only when "method" is one of the following values: + nbest, nbest-rescoring, attention-decoder, and nbest-oracle + A smaller value results in more unique paths. 
+ """, ) return parser @@ -104,21 +139,6 @@ def get_params() -> AttributeDict: "min_active_states": 30, "max_active_states": 10000, "use_double_scores": True, - # Possible values for method: - # - 1best - # - nbest - # - nbest-rescoring - # - whole-lattice-rescoring - # - attention-decoder - # - nbest-oracle - # "method": "nbest", - # "method": "nbest-rescoring", - # "method": "whole-lattice-rescoring", - "method": "attention-decoder", - # "method": "nbest-oracle", - # num_paths is used when method is "nbest", "nbest-rescoring", - # attention-decoder, and nbest-oracle - "num_paths": 100, } ) return params @@ -129,7 +149,7 @@ def decode_one_batch( model: nn.Module, HLG: k2.Fsa, batch: dict, - lexicon: Lexicon, + word_table: k2.SymbolTable, sos_id: int, eos_id: int, G: Optional[k2.Fsa] = None, @@ -163,8 +183,8 @@ def decode_one_batch( It is the return value from iterating `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation for the format of the `batch`. - lexicon: - It contains word symbol table. + word_table: + The word symbol table. sos_id: The token ID of the SOS. 
eos_id: @@ -217,7 +237,7 @@ def decode_one_batch( lattice=lattice, num_paths=params.num_paths, ref_texts=supervisions["text"], - lexicon=lexicon, + word_table=word_table, scale=params.lattice_score_scale, ) @@ -237,7 +257,7 @@ def decode_one_batch( key = f"no_rescore-scale-{params.lattice_score_scale}-{params.num_paths}" # noqa hyps = get_texts(best_path) - hyps = [[lexicon.word_table[i] for i in ids] for ids in hyps] + hyps = [[word_table[i] for i in ids] for ids in hyps] return {key: hyps} assert params.method in [ @@ -283,7 +303,7 @@ def decode_one_batch( ans = dict() for lm_scale_str, best_path in best_path_dict.items(): hyps = get_texts(best_path) - hyps = [[lexicon.word_table[i] for i in ids] for ids in hyps] + hyps = [[word_table[i] for i in ids] for ids in hyps] ans[lm_scale_str] = hyps return ans @@ -293,7 +313,7 @@ def decode_dataset( params: AttributeDict, model: nn.Module, HLG: k2.Fsa, - lexicon: Lexicon, + word_table: k2.SymbolTable, sos_id: int, eos_id: int, G: Optional[k2.Fsa] = None, @@ -309,8 +329,8 @@ def decode_dataset( The neural model. HLG: The decoding graph. - lexicon: - It contains word symbol table. + word_table: + It is the word symbol table. sos_id: The token ID for SOS. 
eos_id: @@ -344,7 +364,7 @@ def decode_dataset( model=model, HLG=HLG, batch=batch, - lexicon=lexicon, + word_table=word_table, G=G, sos_id=sos_id, eos_id=eos_id, @@ -540,7 +560,7 @@ def main(): params=params, model=model, HLG=HLG, - lexicon=lexicon, + word_table=lexicon.word_table, G=G, sos_id=sos_id, eos_id=eos_id, diff --git a/icefall/decode.py b/icefall/decode.py index bcc869e99..de3219401 100644 --- a/icefall/decode.py +++ b/icefall/decode.py @@ -22,8 +22,6 @@ import kaldialign import torch import torch.nn as nn -from icefall.lexicon import Lexicon - def _get_random_paths( lattice: k2.Fsa, @@ -623,7 +621,7 @@ def nbest_oracle( lattice: k2.Fsa, num_paths: int, ref_texts: List[str], - lexicon: Lexicon, + word_table: k2.SymbolTable, scale: float = 1.0, ) -> Dict[str, List[List[int]]]: """Select the best hypothesis given a lattice and a reference transcript. @@ -644,8 +642,8 @@ def nbest_oracle( ref_texts: A list of reference transcript. Each entry contains space(s) separated words - lexicon: - It is used to convert word IDs to word symbols. + word_table: + It is the word symbol table. scale: It's the scale applied to the lattice.scores. A smaller value yields more unique paths. @@ -680,7 +678,7 @@ def nbest_oracle( best_hyp_words = None min_error = float("inf") for hyp_words in hyps: - hyp_words = [lexicon.word_table[i] for i in hyp_words] + hyp_words = [word_table[i] for i in hyp_words] this_error = kaldialign.edit_distance(ref_words, hyp_words)["total"] if this_error < min_error: min_error = this_error