Fix decoding for gigaspeech in the libri + giga setup. (#345)

2025-12-11 06:55:27 +00:00 · 2022-05-05 20:58:46 +08:00 · 2022-05-05 20:58:46 +08:00 · 8635fb4334
commit 8635fb4334
parent e1c3e98980
1 changed files with 104 additions and 27 deletions
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/decode-giga.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/decode-giga.py
@ -69,7 +69,8 @@ import torch.nn as nn
 from asr_datamodule import AsrDataModule
 from beam_search import (
    beam_search,
-    fast_beam_search,
+    fast_beam_search_nbest_oracle,
+    fast_beam_search_one_best,
    greedy_search,
    greedy_search_batch,
    modified_beam_search,
@ -100,27 +101,28 @@ def get_parser():
        "--epoch",
        type=int,
        default=28,
-        help="It specifies the checkpoint to use for decoding."
-        "Note: Epoch counts from 0.",
+        help="""It specifies the checkpoint to use for decoding.
+        Note: Epoch counts from 0.
+        You can specify --avg to use more checkpoints for model averaging.""",
    )
+
+    parser.add_argument(
+        "--iter",
+        type=int,
+        default=0,
+        help="""If positive, --epoch is ignored and it
+        will use the checkpoint exp_dir/checkpoint-iter.pt.
+        You can specify --avg to use more checkpoints for model averaging.
+        """,
+    )
+
    parser.add_argument(
        "--avg",
        type=int,
        default=15,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
-        "'--epoch'. ",
-    )
-
-    parser.add_argument(
-        "--avg-last-n",
-        type=int,
-        default=0,
-        help="""If positive, --epoch and --avg are ignored and it
-        will use the last n checkpoints exp_dir/checkpoint-xxx.pt
-        where xxx is the number of processed batches while
-        saving that checkpoint.
-        """,
+        "'--epoch' and '--iter'",
    )

    parser.add_argument(
@ -146,6 +148,7 @@ def get_parser():
          - beam_search
          - modified_beam_search
          - fast_beam_search
+          - fast_beam_search_nbest_oracle
        """,
    )

@ -165,7 +168,8 @@ def get_parser():
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
-        Used only when --decoding-method is fast_beam_search""",
+        Used only when --decoding-method is
+        fast_beam_search or fast_beam_search_nbest_oracle""",
    )

    parser.add_argument(
@ -173,7 +177,7 @@ def get_parser():
        type=int,
        default=4,
        help="""Used only when --decoding-method is
-        fast_beam_search""",
+        fast_beam_search or fast_beam_search_nbest_oracle""",
    )

    parser.add_argument(
@ -181,7 +185,7 @@ def get_parser():
        type=int,
        default=8,
        help="""Used only when --decoding-method is
-        fast_beam_search""",
+        fast_beam_search or fast_beam_search_nbest_oracle""",
    )

    parser.add_argument(
@ -199,6 +203,23 @@ def get_parser():
        Used only when --decoding_method is greedy_search""",
    )

+    parser.add_argument(
+        "--num-paths",
+        type=int,
+        default=100,
+        help="""Number of paths for computed nbest oracle WER
+        when the decoding method is fast_beam_search_nbest_oracle.
+        """,
+    )
+
+    parser.add_argument(
+        "--nbest-scale",
+        type=float,
+        default=0.5,
+        help="""Scale applied to lattice scores when computing nbest paths.
+        Used only when the decoding_method is fast_beam_search_nbest_oracle.
+        """,
+    )
    return parser


@ -243,7 +264,8 @@ def decode_one_batch(
        for the format of the `batch`.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding_method is fast_beam_search.
+        only when --decoding_method is
+        fast_beam_search or fast_beam_search_nbest_oracle.
    Returns:
      Return the decoding result. See above description for the format of
      the returned dict.
@ -264,7 +286,7 @@ def decode_one_batch(
    hyps = []

    if params.decoding_method == "fast_beam_search":
-        hyp_tokens = fast_beam_search(
+        hyp_tokens = fast_beam_search_one_best(
            model=model,
            decoding_graph=decoding_graph,
            encoder_out=encoder_out,
@ -275,6 +297,21 @@ def decode_one_batch(
        )
        for hyp in sp.decode(hyp_tokens):
            hyps.append(hyp.split())
+    elif params.decoding_method == "fast_beam_search_nbest_oracle":
+        hyp_tokens = fast_beam_search_nbest_oracle(
+            model=model,
+            decoding_graph=decoding_graph,
+            encoder_out=encoder_out,
+            encoder_out_lens=encoder_out_lens,
+            beam=params.beam,
+            max_contexts=params.max_contexts,
+            max_states=params.max_states,
+            num_paths=params.num_paths,
+            ref_texts=sp.encode(supervisions["text"]),
+            nbest_scale=params.nbest_scale,
+        )
+        for hyp in sp.decode(hyp_tokens):
+            hyps.append(hyp.split())
    elif (
        params.decoding_method == "greedy_search"
        and params.max_sym_per_frame == 1
@ -328,6 +365,16 @@ def decode_one_batch(
                f"max_states_{params.max_states}"
            ): hyps
        }
+    elif params.decoding_method == "fast_beam_search_nbest_oracle":
+        return {
+            (
+                f"beam_{params.beam}_"
+                f"max_contexts_{params.max_contexts}_"
+                f"max_states_{params.max_states}_"
+                f"num_paths_{params.num_paths}_"
+                f"nbest_scale_{params.nbest_scale}"
+            ): hyps
+        }
    else:
        return {f"beam_size_{params.beam_size}": hyps}

@ -463,17 +510,30 @@ def main():
        "greedy_search",
        "beam_search",
        "fast_beam_search",
+        "fast_beam_search_nbest_oracle",
        "modified_beam_search",
    )
    params.res_dir = params.exp_dir / "giga" / params.decoding_method

+    if params.iter > 0:
+        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
+    else:
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-    if "fast_beam_search" in params.decoding_method:
+
+    if params.decoding_method == "fast_beam_search":
        params.suffix += f"-beam-{params.beam}"
        params.suffix += f"-max-contexts-{params.max_contexts}"
        params.suffix += f"-max-states-{params.max_states}"
+    elif params.decoding_method == "fast_beam_search_nbest_oracle":
+        params.suffix += f"-beam-{params.beam}"
+        params.suffix += f"-max-contexts-{params.max_contexts}"
+        params.suffix += f"-max-states-{params.max_states}"
+        params.suffix += f"-num-paths-{params.num_paths}"
+        params.suffix += f"-nbest-scale-{params.nbest_scale}"
    elif "beam_search" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam_size}"
+        params.suffix += (
+            f"-{params.decoding_method}-beam-size-{params.beam_size}"
+        )
    else:
        params.suffix += f"-context-{params.context_size}"
        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
@ -490,8 +550,9 @@ def main():
    sp = spm.SentencePieceProcessor()
    sp.load(params.bpe_model)

-    # <blk> is defined in local/train_bpe_model.py
+    # <blk> and <unk> is defined in local/train_bpe_model.py
    params.blank_id = sp.piece_to_id("<blk>")
+    params.unk_id = sp.unk_id()
    params.vocab_size = sp.get_piece_size()

    logging.info(params)
@ -499,8 +560,20 @@ def main():
    logging.info("About to create model")
    model = get_transducer_model(params)

-    if params.avg_last_n > 0:
-        filenames = find_checkpoints(params.exp_dir)[: params.avg_last_n]
+    if params.iter > 0:
+        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+            : params.avg
+        ]
+        if len(filenames) == 0:
+            raise ValueError(
+                f"No checkpoints found for"
+                f" --iter {params.iter}, --avg {params.avg}"
+            )
+        elif len(filenames) < params.avg:
+            raise ValueError(
+                f"Not enough checkpoints ({len(filenames)}) found for"
+                f" --iter {params.iter}, --avg {params.avg}"
+            )
        logging.info(f"averaging {filenames}")
        model.to(device)
        model.load_state_dict(average_checkpoints(filenames, device=device))
@ -519,13 +592,17 @@ def main():
    model.to(device)
    model.eval()
    model.device = device
+    model.unk_id = params.unk_id

    # In beam_search.py, we are using model.decoder() and model.joiner(),
    # so we have to switch to the branch for the GigaSpeech dataset.
    model.decoder = model.decoder_giga
    model.joiner = model.joiner_giga

-    if params.decoding_method == "fast_beam_search":
+    if params.decoding_method in (
+        "fast_beam_search",
+        "fast_beam_search_nbest_oracle",
+    ):
        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
    else:
        decoding_graph = None