include previously added decoding method

marcoyang 2022-11-02 18:03:56 +08:00
parent 6c8d1f9ef5
commit 9a01b9098d


@@ -131,11 +131,13 @@ from beam_search import (
     greedy_search,
     greedy_search_batch,
     modified_beam_search,
+    modified_beam_search_ngram_rescoring,
     modified_beam_search_rnnlm_shallow_fusion,
 )
 from librispeech import LibriSpeech
 from train import add_model_arguments, get_params, get_transducer_model

+from icefall import NgramLm
 from icefall.checkpoint import (
     average_checkpoints,
     average_checkpoints_with_averaged_model,
@@ -232,6 +234,7 @@ def get_parser():
           - fast_beam_search_nbest
           - fast_beam_search_nbest_oracle
           - fast_beam_search_nbest_LG
+          - modified_beam_search_ngram_rescoring
           - modified-beam-search_rnnlm_shallow_fusion # for rnn lm shallow fusion
         If you use fast_beam_search_nbest_LG, you have to specify
         `--lang-dir`, which should contain `LG.pt`.
@@ -386,7 +389,23 @@ def get_parser():
             last output linear layer
             """,
     )

+    parser.add_argument("--ilm-scale", type=float, default=-0.1)
+    parser.add_argument(
+        "--tokens-ngram",
+        type=int,
+        default=3,
+        help="""Token Ngram used for rescoring.
+        Used only when the decoding method is modified_beam_search_ngram_rescoring""",
+    )
+    parser.add_argument(
+        "--backoff-id",
+        type=int,
+        default=500,
+        help="""ID of the backoff symbol.
+        Used only when the decoding method is modified_beam_search_ngram_rescoring""",
+    )
+
     add_model_arguments(parser)

     return parser
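For readers less familiar with argparse, a standalone sketch (not part of the commit) of how the new options surface: argparse maps `--tokens-ngram` to `args.tokens_ngram`, and so on.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--ilm-scale", type=float, default=-0.1)
parser.add_argument("--tokens-ngram", type=int, default=3)
parser.add_argument("--backoff-id", type=int, default=500)

# e.g. decode.py --tokens-ngram 4 on the command line
args = parser.parse_args(["--tokens-ngram", "4"])
print(args.tokens_ngram, args.backoff_id, args.ilm_scale)  # 4 500 -0.1
```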
@@ -399,6 +418,8 @@ def decode_one_batch(
     batch: dict,
     word_table: Optional[k2.SymbolTable] = None,
     decoding_graph: Optional[k2.Fsa] = None,
+    ngram_lm: Optional[NgramLm] = None,
+    ngram_lm_scale: float = 1.0,
     rnnlm: Optional[RnnLmModel] = None,
     rnnlm_scale: float = 1.0,
 ) -> Dict[str, List[List[str]]]:
@@ -534,6 +555,17 @@ def decode_one_batch(
         )
         for hyp in sp.decode(hyp_tokens):
             hyps.append(hyp.split())
+    elif params.decoding_method == "modified_beam_search_ngram_rescoring":
+        hyp_tokens = modified_beam_search_ngram_rescoring(
+            model=model,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            ngram_lm=ngram_lm,
+            ngram_lm_scale=ngram_lm_scale,
+            beam=params.beam_size,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
     elif params.decoding_method == "modified_beam_search_rnnlm_shallow_fusion":
         hyp_tokens = modified_beam_search_rnnlm_shallow_fusion(
             model=model,
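Conceptually, each beam hypothesis in this method carries a running n-gram LM score alongside the transducer score, and candidates are ranked by the interpolated sum. A hypothetical sketch of that ranking only; the real `modified_beam_search_ngram_rescoring` in beam_search.py additionally tracks the LM's FST state per hypothesis:

```python
# Hypothetical sketch, NOT the beam_search.py implementation: it only
# illustrates how the n-gram score folds into hypothesis ranking.
from dataclasses import dataclass, field
from typing import List


@dataclass
class Hypothesis:
    ys: List[int] = field(default_factory=list)  # emitted token ids
    log_prob: float = 0.0      # accumulated transducer log-probability
    lm_log_prob: float = 0.0   # accumulated n-gram LM log-probability


def ranking_score(hyp: Hypothesis, ngram_lm_scale: float) -> float:
    # Hypotheses are compared on the interpolated score; ngram_lm_scale
    # plays the role of the ngram_lm_scale argument above.
    return hyp.log_prob + ngram_lm_scale * hyp.lm_log_prob


beam = [
    Hypothesis(ys=[5, 9], log_prob=-3.2, lm_log_prob=-4.0),
    Hypothesis(ys=[5, 7], log_prob=-3.5, lm_log_prob=-1.5),
]
best = max(beam, key=lambda h: ranking_score(h, ngram_lm_scale=0.4))
print(best.ys)  # [5, 7]: the LM rescues the slightly worse transducer score
```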
@@ -595,9 +627,11 @@ def decode_dataset(
     sp: spm.SentencePieceProcessor,
     word_table: Optional[k2.SymbolTable] = None,
     decoding_graph: Optional[k2.Fsa] = None,
+    ngram_lm: Optional[NgramLm] = None,
+    ngram_lm_scale: float = 1.0,
     rnnlm: Optional[RnnLmModel] = None,
     rnnlm_scale: float = 1.0,
-) -> Dict[str, List[Tuple[List[str], List[str]]]]:
+) -> Dict[str, List[Tuple[str, List[str], List[str]]]]:
     """Decode dataset.

     Args:
@@ -638,13 +672,6 @@ def decode_dataset(
     for batch_idx, batch in enumerate(dl):
         texts = batch["supervisions"]["text"]
         cut_ids = [cut.id for cut in batch["supervisions"]["cut"]]
-        total_duration = sum(
-            [cut.duration for cut in batch["supervisions"]["cut"]]
-        )
-        logging.info(
-            f"Decoding {batch_idx}-th batch, batch size is {len(cut_ids)}, "
-            f"total duration is {total_duration}"
-        )

         hyps_dict = decode_one_batch(
             params=params,
@@ -653,6 +680,8 @@ def decode_dataset(
             decoding_graph=decoding_graph,
             word_table=word_table,
             batch=batch,
+            ngram_lm=ngram_lm,
+            ngram_lm_scale=ngram_lm_scale,
             rnnlm=rnnlm,
             rnnlm_scale=rnnlm_scale,
         )
@@ -680,7 +709,7 @@ def decode_dataset(
 def save_results(
     params: AttributeDict,
     test_set_name: str,
-    results_dict: Dict[str, List[Tuple[List[int], List[int]]]],
+    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
 ):
     test_set_wers = dict()
     for key, results in results_dict.items():
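The results tuples now carry the cut id alongside the reference and hypothesis words, so consumers unpack three fields. A minimal sketch, assuming entries of the form `(cut_id, ref_words, hyp_words)` with illustrative values:

```python
from typing import List, Tuple

# Toy entry in the post-change shape: (cut_id, ref_words, hyp_words).
results: List[Tuple[str, List[str], List[str]]] = [
    ("lib-001", "hello world".split(), "hello word".split()),
]

for cut_id, ref_words, hyp_words in results:
    # Position-wise mismatches; a toy stand-in for the edit-distance-based
    # WER that the real save_results computes.
    mismatches = sum(r != h for r, h in zip(ref_words, hyp_words))
    print(cut_id, mismatches)  # lib-001 1
```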
@@ -740,6 +769,7 @@ def main():
         "fast_beam_search_nbest_LG",
         "fast_beam_search_nbest_oracle",
         "modified_beam_search",
+        "modified_beam_search_ngram_rescoring",
         "modified_beam_search_rnnlm_shallow_fusion",
     )
     params.res_dir = params.exp_dir / params.decoding_method
@@ -765,13 +795,10 @@ def main():
     else:
         params.suffix += f"-context-{params.context_size}"
         params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
+    params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"

     if "rnnlm" in params.decoding_method:
         params.suffix += f"-rnnlm-lm-scale-{params.rnn_lm_scale}"
-
-    if "ILME" in params.decoding_method:
-        params.suffix += f"-ILME-scale={params.ilm_scale}"

     if params.use_averaged_model:
         params.suffix += "-use-averaged-model"
@@ -884,6 +911,14 @@ def main():
     model.to(device)
     model.eval()

+    lm_filename = f"{params.tokens_ngram}gram.fst.txt"
+    logging.info(f"lm filename: {lm_filename}")
+    ngram_lm = NgramLm(
+        str(params.lang_dir / lm_filename),
+        backoff_id=params.backoff_id,
+        is_binary=False,
+    )
+    logging.info(f"num states: {ngram_lm.lm.num_states}")
     # only load rnnlm if used
     if "rnnlm" in params.decoding_method:
         rnn_lm_scale = params.rnn_lm_scale
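In isolation, loading the token-level n-gram LM uses only the `NgramLm` API visible in this diff; the path below is illustrative, and the snippet naturally requires icefall to be installed and the FST file to exist.

```python
from pathlib import Path

from icefall import NgramLm

lang_dir = Path("data/lang_bpe_500")  # illustrative path, not from the commit
tokens_ngram = 3   # --tokens-ngram
backoff_id = 500   # --backoff-id

lm_filename = f"{tokens_ngram}gram.fst.txt"
ngram_lm = NgramLm(
    str(lang_dir / lm_filename),
    backoff_id=backoff_id,
    is_binary=False,  # the LM is stored as an FST in text form
)
print(ngram_lm.lm.num_states)
```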
@@ -951,6 +986,8 @@
             sp=sp,
             word_table=word_table,
             decoding_graph=decoding_graph,
+            ngram_lm=ngram_lm,
+            ngram_lm_scale=params.ngram_lm_scale,
             rnnlm=rnn_lm_model,
             rnnlm_scale=rnn_lm_scale,
         )