mirror of https://github.com/k2-fsa/icefall.git

Support computing nbest oracle WER.

commit 401c1c5143 (parent 1c3b13c7eb)

@@ -21,6 +21,7 @@ from icefall.dataset.librispeech import LibriSpeechAsrDataModule
 from icefall.decode import (
     get_lattice,
     nbest_decoding,
+    nbest_oracle,
     one_best_decoding,
     rescore_with_attention_decoder,
     rescore_with_n_best_list,
@@ -56,6 +57,15 @@ def get_parser():
         "consecutive checkpoints before the checkpoint specified by "
         "'--epoch'. ",
     )
+
+    parser.add_argument(
+        "--scale",
+        type=float,
+        default=1.0,
+        help="The scale to be applied to `lattice.scores`. "
+        "A smaller value results in more unique paths",
+    )
+
     return parser
 
 
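
A note on the new `--scale` flag: k2.random_paths samples paths with probability governed by the lattice scores, so shrinking the scores flattens that distribution and the sampled paths deduplicate into more distinct word sequences. A minimal sketch of the pattern, assuming `lattice` is a k2 FsaVec such as the one returned by get_lattice (the helper name is ours):

import k2

def sample_paths_with_scale(lattice: k2.Fsa, num_paths: int, scale: float):
    # Scale the scores only for sampling; restore them afterwards so that
    # any later rescoring still sees the original scores.
    saved = lattice.scores.clone()
    lattice.scores *= scale  # scale < 1.0 flattens the distribution
    paths = k2.random_paths(lattice, num_paths=num_paths, use_double_scores=True)
    lattice.scores = saved
    return paths
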
@@ -85,10 +95,12 @@ def get_params() -> AttributeDict:
             # - nbest-rescoring
             # - whole-lattice-rescoring
             # - attention-decoder
+            # - nbest-oracle
             # "method": "whole-lattice-rescoring",
             "method": "attention-decoder",
+            # "method": "nbest-oracle",
             # num_paths is used when method is "nbest", "nbest-rescoring",
-            # and attention-decoder
+            # attention-decoder, and nbest-oracle
             "num_paths": 100,
         }
     )
@@ -179,6 +191,19 @@ def decode_one_batch(
         subsampling_factor=params.subsampling_factor,
     )
 
+    if params.method == "nbest-oracle":
+        # Note: You can also pass rescored lattices to it.
+        # We choose the HLG-decoded lattice for speed reasons,
+        # as HLG decoding is faster and its oracle WER
+        # is only slightly worse than that of rescored lattices.
+        return nbest_oracle(
+            lattice=lattice,
+            num_paths=params.num_paths,
+            ref_texts=supervisions["text"],
+            lexicon=lexicon,
+            scale=params.scale,
+        )
+
     if params.method in ["1best", "nbest"]:
         if params.method == "1best":
             best_path = one_best_decoding(
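
As the comment in this hunk notes, a rescored lattice can be fed to nbest_oracle instead of the raw HLG lattice, trading decoding speed for a slightly better oracle WER. A hypothetical sketch of that variant; rescore_with_whole_lattice itself is real, but its argument names and the key format of its return value (assumed here to be a dict keyed by LM scale) are assumptions to check against the actual code:

from icefall.decode import nbest_oracle, rescore_with_whole_lattice

def oracle_on_rescored_lattice(lattice, G, params, supervisions, lexicon):
    # Hypothetical helper: rescore the lattice with the full LM first,
    # then run the oracle selection on the rescored lattice.
    rescored = rescore_with_whole_lattice(
        lattice=lattice,
        G_with_epsilon_loops=G,  # assumed argument name
        lm_scale_list=[1.0],     # assumed argument name
    )
    return nbest_oracle(
        lattice=rescored["lm_scale_1.0"],  # assumed key format
        num_paths=params.num_paths,
        ref_texts=supervisions["text"],
        lexicon=lexicon,
        scale=params.scale,
    )
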
@@ -284,7 +309,6 @@ def decode_dataset(
     results = []
 
     num_cuts = 0
-    tot_num_cuts = len(dl.dataset.cuts)
 
     results = defaultdict(list)
     for batch_idx, batch in enumerate(dl):
@@ -315,8 +339,7 @@ def decode_dataset(
         if batch_idx % 100 == 0:
             logging.info(
                 f"batch {batch_idx}, cuts processed until now is "
-                f"{num_cuts}/{tot_num_cuts} "
-                f"({float(num_cuts)/tot_num_cuts*100:.6f}%)"
+                f"{num_cuts} "
             )
     return results
 
@@ -2,9 +2,12 @@ import logging
 from typing import Dict, List, Optional, Tuple, Union
 
 import k2
+import kaldialign
 import torch
 import torch.nn as nn
 
+from icefall.lexicon import Lexicon
+
 
 def _intersect_device(
     a_fsas: k2.Fsa,
@@ -376,7 +379,7 @@ def rescore_with_n_best_list(
     #
     # num_repeats is also a k2.RaggedInt with 2 axes containing the
     # multiplicities of each path.
-    # num_repeats.num_elements() == unique_word_seqs.num_elements()
+    # num_repeats.num_elements() == unique_word_seqs.tot_size(1)
     #
     # Since k2.ragged.unique_sequences will reorder paths within a seq,
     # `new2old` is a 1-D torch.Tensor mapping from the output path index
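
The corrected comment encodes an invariant: num_repeats stores one multiplicity per unique path, so num_repeats.num_elements() matches unique_word_seqs.tot_size(1), the total number of unique paths across all seqs, not the total number of words. A plain-Python sketch of the deduplication with made-up toy data, using nested lists as stand-ins for k2 ragged tensors:

from collections import Counter

# Toy stand-in for word_seqs with axes [seq][path][word]:
word_seqs = [
    [[5, 9], [5, 9], [7]],   # seq 0: path [5, 9] appears twice
    [[3, 4, 8], [3, 4, 8]],  # seq 1: a single unique path, multiplicity 2
]

unique_word_seqs = []
num_repeats = []
for seq in word_seqs:
    counts = Counter(tuple(p) for p in seq)  # dedup, keeping multiplicities
    unique_word_seqs.append([list(p) for p in counts])
    num_repeats.append(list(counts.values()))

# One multiplicity per unique path, i.e. the analogue of
# num_repeats.num_elements() == unique_word_seqs.tot_size(1):
assert sum(len(r) for r in num_repeats) == sum(len(s) for s in unique_word_seqs)
print(unique_word_seqs)  # [[[5, 9], [7]], [[3, 4, 8]]]
print(num_repeats)       # [[2, 1], [2]]
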
@@ -549,6 +552,76 @@ def rescore_with_whole_lattice(
     return ans
 
 
+def nbest_oracle(
+    lattice: k2.Fsa,
+    num_paths: int,
+    ref_texts: List[str],
+    lexicon: Lexicon,
+    scale: float = 1.0,
+) -> Dict[str, List[List[str]]]:
+    """Select the best hypothesis given a lattice and a reference transcript.
+
+    The basic idea is to extract `num_paths` paths from the given lattice,
+    deduplicate them, and select the one with the smallest edit distance
+    relative to the corresponding reference transcript as the decoding output.
+
+    The decoding result returned by this function is the best result that
+    we can obtain using n-best decoding with any kind of rescoring technique.
+
+    Args:
+      lattice:
+        An FsaVec. It can be the return value of :func:`get_lattice`.
+        Note: We assume its aux_labels contain word IDs.
+      num_paths:
+        The size of `n` in n-best.
+      ref_texts:
+        A list of reference transcripts. Each entry contains
+        space-separated words.
+      lexicon:
+        It is used to convert word IDs to word symbols.
+      scale:
+        The scale applied to `lattice.scores`. A smaller value
+        yields more unique paths.
+    Return:
+      Return a dict with a single key that encodes the parameters used
+      to call this function; its value is the decoding output, one word
+      list per utterance, so `len(ans_dict[key]) == len(ref_texts)`.
+    """
+    saved_scores = lattice.scores.clone()
+
+    lattice.scores *= scale
+    path = k2.random_paths(lattice, num_paths=num_paths, use_double_scores=True)
+    lattice.scores = saved_scores
+
+    word_seq = k2.index(lattice.aux_labels, path)
+    word_seq = k2.ragged.remove_values_leq(word_seq, 0)
+    unique_word_seq, _, _ = k2.ragged.unique_sequences(
+        word_seq, need_num_repeats=False, need_new2old_indexes=False
+    )
+    unique_word_ids = k2.ragged.to_list(unique_word_seq)
+    assert len(unique_word_ids) == len(ref_texts)
+    # unique_word_ids[i] contains all hypotheses of the i-th utterance
+
+    results = []
+    for hyps, ref in zip(unique_word_ids, ref_texts):
+        # Note: hyps is a list of lists of ints;
+        # each sublist contains one hypothesis.
+        ref_words = ref.strip().split()
+        # CAUTION: We don't convert ref_words to ref_word_ids
+        # since there may exist OOV words in ref_words.
+        best_hyp_words = None
+        min_error = float("inf")
+        for hyp_words in hyps:
+            hyp_words = [lexicon.word_table[i] for i in hyp_words]
+            this_error = kaldialign.edit_distance(ref_words, hyp_words)["total"]
+            if this_error < min_error:
+                min_error = this_error
+                best_hyp_words = hyp_words
+        results.append(best_hyp_words)
+
+    return {f"nbest_{num_paths}_scale_{scale}_oracle": results}
+
+
 def rescore_with_attention_decoder(
     lattice: k2.Fsa,
     num_paths: int,
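
To see the selection step of nbest_oracle in isolation, here is a self-contained toy run of its inner loop. The kaldialign.edit_distance call is the same one the function uses; the reference and hypotheses are made up, with the word-ID-to-word mapping already applied:

import kaldialign

ref_words = "THE CAT SAT".split()
# Hypothetical deduplicated n-best hypotheses for one utterance:
hyps = [
    ["THE", "CAT", "SAT", "DOWN"],  # 1 insertion
    ["A", "CAT", "SAT"],            # 1 substitution
    ["THE", "CAT", "SAT"],          # exact match
]

best_hyp_words = None
min_error = float("inf")
for hyp_words in hyps:
    this_error = kaldialign.edit_distance(ref_words, hyp_words)["total"]
    if this_error < min_error:
        min_error = this_error
        best_hyp_words = hyp_words

print(best_hyp_words, min_error)  # ['THE', 'CAT', 'SAT'] 0
# The oracle WER for this utterance is min_error / len(ref_words) = 0.0.
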
@@ -605,7 +678,7 @@ def rescore_with_attention_decoder(
     #
     # num_repeats is also a k2.RaggedInt with 2 axes containing the
     # multiplicities of each path.
-    # num_repeats.num_elements() == unique_word_seqs.num_elements()
+    # num_repeats.num_elements() == unique_word_seqs.tot_size(1)
     #
     # Since k2.ragged.unique_sequences will reorder paths within a seq,
     # `new2old` is a 1-D torch.Tensor mapping from the output path index