From 39bc8cae94cb3b5824a93b5033136fba546322b9 Mon Sep 17 00:00:00 2001 From: Mingshuang Luo <37799481+luomingshuang@users.noreply.github.com> Date: Wed, 13 Oct 2021 12:20:16 +0800 Subject: [PATCH] Add ctc decoding to pretrained.py on conformer_ctc (#75) * Add ctc-decoding to pretrained.py * update pretrained.py and conformer_ctc.rst * update ctc-decoding for pretrained.py on conformer_ctc * Update pretrained.py * fix the style issue * Update conformer_ctc.rst * Update the running logs --- .../recipes/librispeech/conformer_ctc.rst | 119 +++++++---- .../ASR/conformer_ctc/pretrained.py | 202 +++++++++++------- 2 files changed, 211 insertions(+), 110 deletions(-) diff --git a/docs/source/recipes/librispeech/conformer_ctc.rst b/docs/source/recipes/librispeech/conformer_ctc.rst index 84e99306f..45ad79313 100644 --- a/docs/source/recipes/librispeech/conformer_ctc.rst +++ b/docs/source/recipes/librispeech/conformer_ctc.rst @@ -429,6 +429,7 @@ After downloading, you will have the following files: |-- README.md |-- data | |-- lang_bpe + | | |-- Linv.pt | | |-- HLG.pt | | |-- bpe.model | | |-- tokens.txt @@ -446,6 +447,9 @@ After downloading, you will have the following files: 6 directories, 11 files **File descriptions**: + - ``data/lang_bpe/Linv.pt`` + + It is the lexicon file, with word IDs as labels and token IDs as aux_labels. - ``data/lang_bpe/HLG.pt`` @@ -527,12 +531,58 @@ Usage displays the help information. -It supports three decoding methods: +It supports 4 decoding methods: + - CTC decoding - HLG decoding - HLG + n-gram LM rescoring - HLG + n-gram LM rescoring + attention decoder rescoring +CTC decoding +^^^^^^^^^^^^ + +CTC decoding uses the best path of the decoding lattice as the decoding result +without any LM or lexicon. + +The command to run CTC decoding is: + +.. code-block:: bash + + $ cd egs/librispeech/ASR + $ ./conformer_ctc/pretrained.py \ + --checkpoint ./tmp/icefall_asr_librispeech_conformer_ctc/exp/pretrained.pt \ + --lang-dir ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe \ + --method ctc-decoding \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac \ + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac + +The output is given below: + +.. code-block:: + + 2021-10-13 11:21:50,896 INFO [pretrained.py:236] device: cuda:0 + 2021-10-13 11:21:50,896 INFO [pretrained.py:238] Creating model + 2021-10-13 11:21:56,669 INFO [pretrained.py:255] Constructing Fbank computer + 2021-10-13 11:21:56,670 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] + 2021-10-13 11:21:56,683 INFO [pretrained.py:271] Decoding started + 2021-10-13 11:21:57,341 INFO [pretrained.py:290] Building CTC topology + 2021-10-13 11:21:57,625 INFO [lexicon.py:113] Loading pre-compiled tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/Linv.pt + 2021-10-13 11:21:57,679 INFO [pretrained.py:299] Loading BPE model + 2021-10-13 11:22:00,076 INFO [pretrained.py:314] Use CTC decoding + 2021-10-13 11:22:00,087 INFO [pretrained.py:400] + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac: + AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS + + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac: + GOD AS A DIRECT CONSEQUENCE OF THE SIN WHICH MAN THUS PUNISHED HAD GIVEN HER A LOVELY CHILD WHOSE PLACE WAS ON THAT SAME DISHONOURED + BOSOM TO CONNECT HER PARENT FOR EVER WITH THE RACE AND DESCENT OF MORTALS AND TO BE FINALLY A BLESSED SOUL IN HEAVEN + + ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac: + YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION + + 2021-10-13 11:22:00,087 INFO [pretrained.py:402] Decoding Done + HLG decoding ^^^^^^^^^^^^ @@ -545,8 +595,7 @@ The command to run HLG decoding is: $ cd egs/librispeech/ASR $ ./conformer_ctc/pretrained.py \ --checkpoint ./tmp/icefall_asr_librispeech_conformer_ctc/exp/pretrained.pt \ - --words-file ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/words.txt \ - --HLG ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt \ + --lang-dir ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe \ ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac \ ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac \ ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac @@ -555,14 +604,14 @@ The output is given below: .. code-block:: - 2021-08-20 11:03:05,712 INFO [pretrained.py:217] device: cuda:0 - 2021-08-20 11:03:05,712 INFO [pretrained.py:219] Creating model - 2021-08-20 11:03:11,345 INFO [pretrained.py:238] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt - 2021-08-20 11:03:18,442 INFO [pretrained.py:255] Constructing Fbank computer - 2021-08-20 11:03:18,444 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] - 2021-08-20 11:03:18,507 INFO [pretrained.py:271] Decoding started - 2021-08-20 11:03:18,795 INFO [pretrained.py:300] Use HLG decoding - 2021-08-20 11:03:19,149 INFO [pretrained.py:339] + 2021-10-13 11:25:19,458 INFO [pretrained.py:236] device: cuda:0 + 2021-10-13 11:25:19,458 INFO [pretrained.py:238] Creating model + 2021-10-13 11:25:25,342 INFO [pretrained.py:255] Constructing Fbank computer + 2021-10-13 11:25:25,343 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] + 2021-10-13 11:25:25,356 INFO [pretrained.py:271] Decoding started + 2021-10-13 11:25:26,026 INFO [pretrained.py:327] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt + 2021-10-13 11:25:33,735 INFO [pretrained.py:359] Use HLG decoding + 2021-10-13 11:25:34,013 INFO [pretrained.py:400] ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac: AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS @@ -573,7 +622,7 @@ The output is given below: ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac: YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION - 2021-08-20 11:03:19,149 INFO [pretrained.py:341] Decoding Done + 2021-10-13 11:25:34,014 INFO [pretrained.py:402] Decoding Done HLG decoding + LM rescoring ^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -588,8 +637,7 @@ The command to run HLG decoding + LM rescoring is: $ cd egs/librispeech/ASR $ ./conformer_ctc/pretrained.py \ --checkpoint ./tmp/icefall_asr_librispeech_conformer_ctc/exp/pretrained.pt \ - --words-file ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/words.txt \ - --HLG ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt \ + --lang-dir ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe \ --method whole-lattice-rescoring \ --G ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt \ --ngram-lm-scale 0.8 \ @@ -601,15 +649,15 @@ Its output is: .. code-block:: - 2021-08-20 11:12:17,565 INFO [pretrained.py:217] device: cuda:0 - 2021-08-20 11:12:17,565 INFO [pretrained.py:219] Creating model - 2021-08-20 11:12:23,728 INFO [pretrained.py:238] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt - 2021-08-20 11:12:30,035 INFO [pretrained.py:246] Loading G from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt - 2021-08-20 11:13:10,779 INFO [pretrained.py:255] Constructing Fbank computer - 2021-08-20 11:13:10,787 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] - 2021-08-20 11:13:10,798 INFO [pretrained.py:271] Decoding started - 2021-08-20 11:13:11,085 INFO [pretrained.py:305] Use HLG decoding + LM rescoring - 2021-08-20 11:13:11,736 INFO [pretrained.py:339] + 2021-10-13 11:28:19,129 INFO [pretrained.py:236] device: cuda:0 + 2021-10-13 11:28:19,129 INFO [pretrained.py:238] Creating model + 2021-10-13 11:28:23,531 INFO [pretrained.py:255] Constructing Fbank computer + 2021-10-13 11:28:23,532 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] + 2021-10-13 11:28:23,544 INFO [pretrained.py:271] Decoding started + 2021-10-13 11:28:24,141 INFO [pretrained.py:327] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt + 2021-10-13 11:28:30,752 INFO [pretrained.py:338] Loading G from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt + 2021-10-13 11:28:48,308 INFO [pretrained.py:364] Use HLG decoding + LM rescoring + 2021-10-13 11:28:48,815 INFO [pretrained.py:400] ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac: AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS @@ -620,7 +668,7 @@ Its output is: ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac: YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION - 2021-08-20 11:13:11,737 INFO [pretrained.py:341] Decoding Done + 2021-10-13 11:28:48,815 INFO [pretrained.py:402] Decoding Done HLG decoding + LM rescoring + attention decoder rescoring ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -636,8 +684,7 @@ The command to run HLG decoding + LM rescoring + attention decoder rescoring is: $ cd egs/librispeech/ASR $ ./conformer_ctc/pretrained.py \ --checkpoint ./tmp/icefall_asr_librispeech_conformer_ctc/exp/pretrained.pt \ - --words-file ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/words.txt \ - --HLG ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt \ + --lang-dir ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe \ --method attention-decoder \ --G ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt \ --ngram-lm-scale 1.3 \ @@ -654,15 +701,15 @@ The output is below: .. code-block:: - 2021-08-20 11:19:11,397 INFO [pretrained.py:217] device: cuda:0 - 2021-08-20 11:19:11,397 INFO [pretrained.py:219] Creating model - 2021-08-20 11:19:17,354 INFO [pretrained.py:238] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt - 2021-08-20 11:19:24,615 INFO [pretrained.py:246] Loading G from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt - 2021-08-20 11:20:04,576 INFO [pretrained.py:255] Constructing Fbank computer - 2021-08-20 11:20:04,584 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] - 2021-08-20 11:20:04,595 INFO [pretrained.py:271] Decoding started - 2021-08-20 11:20:04,854 INFO [pretrained.py:313] Use HLG + LM rescoring + attention decoder rescoring - 2021-08-20 11:20:05,805 INFO [pretrained.py:339] + 2021-10-13 11:29:50,106 INFO [pretrained.py:236] device: cuda:0 + 2021-10-13 11:29:50,106 INFO [pretrained.py:238] Creating model + 2021-10-13 11:29:56,063 INFO [pretrained.py:255] Constructing Fbank computer + 2021-10-13 11:29:56,063 INFO [pretrained.py:265] Reading sound files: ['./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0001.flac', './tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac'] + 2021-10-13 11:29:56,077 INFO [pretrained.py:271] Decoding started + 2021-10-13 11:29:56,770 INFO [pretrained.py:327] Loading HLG from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lang_bpe/HLG.pt + 2021-10-13 11:30:04,023 INFO [pretrained.py:338] Loading G from ./tmp/icefall_asr_librispeech_conformer_ctc/data/lm/G_4_gram.pt + 2021-10-13 11:30:18,163 INFO [pretrained.py:372] Use HLG + LM rescoring + attention decoder rescoring + 2021-10-13 11:30:19,367 INFO [pretrained.py:400] ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1089-134686-0001.flac: AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD LIGHT UP HERE AND THERE THE SQUALID QUARTER OF THE BROTHELS @@ -673,7 +720,7 @@ The output is below: ./tmp/icefall_asr_librispeech_conformer_ctc/test_wavs/1221-135766-0002.flac: YET THESE THOUGHTS AFFECTED HESTER PRYNNE LESS WITH HOPE THAN APPREHENSION - 2021-08-20 11:20:05,805 INFO [pretrained.py:341] Decoding Done + 2021-10-13 11:30:19,367 INFO [pretrained.py:402] Decoding Done Colab notebook -------------- diff --git a/egs/librispeech/ASR/conformer_ctc/pretrained.py b/egs/librispeech/ASR/conformer_ctc/pretrained.py index 00812d674..07d3e7269 100755 --- a/egs/librispeech/ASR/conformer_ctc/pretrained.py +++ b/egs/librispeech/ASR/conformer_ctc/pretrained.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 -# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang, +# Mingshuang Luo) # # See ../../../../LICENSE for clarification regarding multiple authors # @@ -19,6 +20,7 @@ import argparse import logging import math +import sentencepiece as spm from typing import List import k2 @@ -28,6 +30,7 @@ import torchaudio from conformer import Conformer from torch.nn.utils.rnn import pad_sequence +from icefall.lexicon import Lexicon from icefall.decode import ( get_lattice, one_best_decoding, @@ -52,14 +55,10 @@ def get_parser(): ) parser.add_argument( - "--words-file", + "--lang-dir", type=str, required=True, - help="Path to words.txt", - ) - - parser.add_argument( - "--HLG", type=str, required=True, help="Path to HLG.pt." + help="Path to lang bpe dir.", ) parser.add_argument( @@ -68,6 +67,10 @@ def get_parser(): default="1best", help="""Decoding method. Possible values are: + (0) ctc-decoding - Use CTC decoding. It uses a sentence + piece model, i.e., lang_dir/bpe.model, to convert + word pieces to words. It needs neither a lexicon + nor an n-gram LM. (1) 1best - Use the best path as decoding output. Only the transformer encoder output is used for decoding. We call it HLG decoding. @@ -249,23 +252,6 @@ def main(): model.to(device) model.eval() - logging.info(f"Loading HLG from {params.HLG}") - HLG = k2.Fsa.from_dict(torch.load(params.HLG, map_location="cpu")) - HLG = HLG.to(device) - if not hasattr(HLG, "lm_scores"): - # For whole-lattice-rescoring and attention-decoder - HLG.lm_scores = HLG.scores.clone() - - if params.method in ["whole-lattice-rescoring", "attention-decoder"]: - logging.info(f"Loading G from {params.G}") - G = k2.Fsa.from_dict(torch.load(params.G, map_location="cpu")) - # Add epsilon self-loops to G as we will compose - # it with the whole lattice later - G = G.to(device) - G = k2.add_epsilon_self_loops(G) - G = k2.arc_sort(G) - G.lm_scores = G.scores.clone() - logging.info("Constructing Fbank computer") opts = kaldifeat.FbankOptions() opts.device = device @@ -299,60 +285,128 @@ def main(): dtype=torch.int32, ) - lattice = get_lattice( - nnet_output=nnet_output, - decoding_graph=HLG, - supervision_segments=supervision_segments, - search_beam=params.search_beam, - output_beam=params.output_beam, - min_active_states=params.min_active_states, - max_active_states=params.max_active_states, - subsampling_factor=params.subsampling_factor, - ) + try: + if params.method == "ctc-decoding": + logging.info("Building CTC topology") + lexicon = Lexicon(params.lang_dir) + max_token_id = max(lexicon.tokens) + H = k2.ctc_topo( + max_token=max_token_id, + modified=False, + device=device, + ) - if params.method == "1best": - logging.info("Use HLG decoding") - best_path = one_best_decoding( - lattice=lattice, use_double_scores=params.use_double_scores - ) - elif params.method == "whole-lattice-rescoring": - logging.info("Use HLG decoding + LM rescoring") - best_path_dict = rescore_with_whole_lattice( - lattice=lattice, - G_with_epsilon_loops=G, - lm_scale_list=[params.ngram_lm_scale], - ) - best_path = next(iter(best_path_dict.values())) - elif params.method == "attention-decoder": - logging.info("Use HLG + LM rescoring + attention decoder rescoring") - rescored_lattice = rescore_with_whole_lattice( - lattice=lattice, G_with_epsilon_loops=G, lm_scale_list=None - ) - best_path_dict = rescore_with_attention_decoder( - lattice=rescored_lattice, - num_paths=params.num_paths, - model=model, - memory=memory, - memory_key_padding_mask=memory_key_padding_mask, - sos_id=params.sos_id, - eos_id=params.eos_id, - nbest_scale=params.nbest_scale, - ngram_lm_scale=params.ngram_lm_scale, - attention_scale=params.attention_decoder_scale, - ) - best_path = next(iter(best_path_dict.values())) + logging.info("Loading BPE model") + bpe_model = spm.SentencePieceProcessor() + bpe_model.load(params.lang_dir + "/bpe.model") - hyps = get_texts(best_path) - word_sym_table = k2.SymbolTable.from_file(params.words_file) - hyps = [[word_sym_table[i] for i in ids] for ids in hyps] + lattice = get_lattice( + nnet_output=nnet_output, + decoding_graph=H, + supervision_segments=supervision_segments, + search_beam=params.search_beam, + output_beam=params.output_beam, + min_active_states=params.min_active_states, + max_active_states=params.max_active_states, + subsampling_factor=params.subsampling_factor, + ) - s = "\n" - for filename, hyp in zip(params.sound_files, hyps): - words = " ".join(hyp) - s += f"{filename}:\n{words}\n\n" - logging.info(s) + logging.info("Use CTC decoding") + best_path = one_best_decoding( + lattice=lattice, use_double_scores=params.use_double_scores + ) + token_ids = get_texts(best_path) + hyps = bpe_model.decode(token_ids) + hyps = [s.split() for s in hyps] - logging.info("Decoding Done") + if params.method in [ + "1best", + "whole-lattice-rescoring", + "attention-decoder", + ]: + logging.info(f"Loading HLG from {params.lang_dir}/HLG.pt") + HLG = k2.Fsa.from_dict( + torch.load(params.lang_dir + "/HLG.pt", map_location="cpu") + ) + HLG = HLG.to(device) + if not hasattr(HLG, "lm_scores"): + # For whole-lattice-rescoring and attention-decoder + HLG.lm_scores = HLG.scores.clone() + + if params.method in [ + "whole-lattice-rescoring", + "attention-decoder", + ]: + logging.info(f"Loading G from {params.G}") + G = k2.Fsa.from_dict(torch.load(params.G, map_location="cpu")) + # Add epsilon self-loops to G as we will compose + # it with the whole lattice later + G = G.to(device) + G = k2.add_epsilon_self_loops(G) + G = k2.arc_sort(G) + G.lm_scores = G.scores.clone() + + lattice = get_lattice( + nnet_output=nnet_output, + decoding_graph=HLG, + supervision_segments=supervision_segments, + search_beam=params.search_beam, + output_beam=params.output_beam, + min_active_states=params.min_active_states, + max_active_states=params.max_active_states, + subsampling_factor=params.subsampling_factor, + ) + + if params.method == "1best": + logging.info("Use HLG decoding") + best_path = one_best_decoding( + lattice=lattice, use_double_scores=params.use_double_scores + ) + elif params.method == "whole-lattice-rescoring": + logging.info("Use HLG decoding + LM rescoring") + best_path_dict = rescore_with_whole_lattice( + lattice=lattice, + G_with_epsilon_loops=G, + lm_scale_list=[params.ngram_lm_scale], + ) + best_path = next(iter(best_path_dict.values())) + elif params.method == "attention-decoder": + logging.info( + "Use HLG + LM rescoring + attention decoder rescoring" + ) + rescored_lattice = rescore_with_whole_lattice( + lattice=lattice, G_with_epsilon_loops=G, lm_scale_list=None + ) + best_path_dict = rescore_with_attention_decoder( + lattice=rescored_lattice, + num_paths=params.num_paths, + model=model, + memory=memory, + memory_key_padding_mask=memory_key_padding_mask, + sos_id=params.sos_id, + eos_id=params.eos_id, + nbest_scale=params.nbest_scale, + ngram_lm_scale=params.ngram_lm_scale, + attention_scale=params.attention_decoder_scale, + ) + best_path = next(iter(best_path_dict.values())) + + hyps = get_texts(best_path) + word_sym_table = k2.SymbolTable.from_file( + params.lang_dir + "/words.txt" + ) + hyps = [[word_sym_table[i] for i in ids] for ids in hyps] + + s = "\n" + for filename, hyp in zip(params.sound_files, hyps): + words = " ".join(hyp) + s += f"{filename}:\n{words}\n\n" + logging.info(s) + + logging.info("Decoding Done") + + except Exception: + raise ValueError("Please use a supported decoding method.") if __name__ == "__main__":