Support LG for pruned_transducer_stateless2.

2022-06-21 21:55:08 +08:00 · 2022-06-21 21:55:08 +08:00 · f5af662b7b
commit f5af662b7b
parent 136ee53447
3 changed files with 228 additions and 48 deletions
--- a/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless/decode.py
@ -213,7 +213,7 @@ def get_parser():
    parser.add_argument(
        "--beam",
        type=float,
-        default=8.0,
+        default=20.0,
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
@ -236,7 +236,7 @@ def get_parser():
    parser.add_argument(
        "--max-contexts",
        type=int,
-        default=4,
+        default=8,
        help="""Used only when --decoding-method is
        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
        and fast_beam_search_nbest_oracle""",
@ -320,7 +320,8 @@ def decode_one_batch(
        The word symbol table.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding_method is fast_beam_search.
+        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
    Returns:
      Return the decoding result. See above description for the format of
      the returned dict.
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/beam_search.py
@ -74,6 +74,122 @@ def fast_beam_search_one_best(
    return hyps
 def fast_beam_search_nbest_LG(
    model: Transducer,
    decoding_graph: k2.Fsa,
    encoder_out: torch.Tensor,
    encoder_out_lens: torch.Tensor,
    beam: float,
    max_states: int,
    max_contexts: int,
    num_paths: int,
    nbest_scale: float = 0.5,
    use_double_scores: bool = True,
 ) -> List[List[int]]:
    """Run fast beam search, then pick the best of the sampled n-best paths.

    At most one symbol is emitted per frame. The decoding proceeds as:

      1. Build a lattice with fast beam search.
      2. Sample `num_paths` paths from the lattice (k2.random_paths()).
      3. Remove duplicate paths.
      4. Intersect each remaining path with the lattice and compute the
         total score of every intersection.
      5. Keep, for every utterance, the path with the largest total score.

    Args:
      model:
        An instance of `Transducer`.
      decoding_graph:
        The graph handed to fast beam search. It may be a trivial graph or
        a graph carrying word IDs as aux_labels (e.g. LG or HLG).
      encoder_out:
        Encoder output of shape (N, T, C).
      encoder_out_lens:
        Tensor of shape (N,) with the number of valid (unpadded) frames of
        each sequence in `encoder_out`.
      beam:
        Beam value, similar to the beam used in Kaldi.
      max_states:
        Max states per stream per frame.
      max_contexts:
        Max contexts per stream per frame.
      num_paths:
        Number of paths to extract from the decoded lattice.
      nbest_scale:
        Scale applied to lattice.scores before sampling paths. A smaller
        value yields more unique paths.
      use_double_scores:
        True to compute scores in double precision, False for single
        precision.
    Returns:
      One list of integer IDs per utterance, as produced by `get_texts`
      from the best path of that utterance.
    """
    lattice = fast_beam_search(
        model=model,
        decoding_graph=decoding_graph,
        encoder_out=encoder_out,
        encoder_out_lens=encoder_out_lens,
        beam=beam,
        max_states=max_states,
        max_contexts=max_contexts,
    )
    nbest = Nbest.from_lattice(
        lattice=lattice,
        num_paths=num_paths,
        use_double_scores=use_double_scores,
        nbest_scale=nbest_scale,
    )

    # With aux_labels present, the lattice has token IDs as labels and
    # word IDs as aux_labels.
    has_aux_labels = hasattr(lattice, "aux_labels")

    # What follows is adapted from nbest.intersect().
    path_fsa = k2.invert(nbest.fsa)
    if has_aux_labels:
        # The token IDs are not needed on the path acceptors.
        del path_fsa.aux_labels
    # Zero the path scores so that only the lattice scores contribute.
    path_fsa.scores.zero_()
    path_fsa_with_loops = k2.linear_fsa_with_self_loops(path_fsa)

    # After inversion, word IDs are the labels and token IDs the aux_labels.
    lattice_to_match = k2.invert(lattice) if has_aux_labels else lattice
    lattice_to_match = k2.arc_sort(lattice_to_match)

    # For every path, the index of the lattice FSA it has to be intersected
    # with. When the lattice holds a single FSA, all paths use index 0.
    path_to_utt_map = nbest.shape.row_ids(1)
    if lattice_to_match.shape[0] == 1:
        b_to_a_map = torch.zeros_like(path_to_utt_map)
    else:
        b_to_a_map = path_to_utt_map

    path_lattice = k2.intersect_device(
        lattice_to_match,
        path_fsa_with_loops,
        b_to_a_map=b_to_a_map,
        sorted_match_a=True,
    )
    # path_lattice has word IDs as labels and token IDs as aux_labels.
    path_lattice = k2.top_sort(k2.connect(path_lattice))

    # log_semiring is always True here; the reason is given in
    # https://github.com/k2-fsa/icefall/pull/420
    tot_scores = path_lattice.get_tot_scores(
        use_double_scores=use_double_scores,
        log_semiring=True,
    )

    # Regroup the per-path scores by utterance and take the argmax of each.
    best_hyp_indexes = k2.RaggedTensor(nbest.shape, tot_scores).argmax()
    return get_texts(k2.index_fsa(nbest.fsa, best_hyp_indexes))
 def fast_beam_search_nbest(
    model: Transducer,
    decoding_graph: k2.Fsa,
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/decode.py
@ -50,9 +50,9 @@ Usage:
    --exp-dir ./pruned_transducer_stateless2/exp \
    --max-duration 600 \
    --decoding-method fast_beam_search \
-    --beam 4 \
+    --beam 20.0 \
-    --max-contexts 4 \
+    --max-contexts 8 \
-    --max-states 8
+    --max-states 64
 (5) fast beam search (nbest)
 ./pruned_transducer_stateless2/decode.py \
@ -61,9 +61,9 @@ Usage:
    --exp-dir ./pruned_transducer_stateless2/exp \
    --max-duration 600 \
    --decoding-method fast_beam_search_nbest \
-    --beam 4 \
+    --beam 20.0 \
-    --max-contexts 4 \
+    --max-contexts 8 \
-    --max-states 8 \
+    --max-states 64 \
    --num-paths 200 \
    --nbest-scale 0.5
@ -74,11 +74,22 @@ Usage:
    --exp-dir ./pruned_transducer_stateless2/exp \
    --max-duration 600 \
    --decoding-method fast_beam_search_nbest_oracle \
-    --beam 4 \
+    --beam 20.0 \
-    --max-contexts 4 \
+    --max-contexts 8 \
-    --max-states 8 \
+    --max-states 64 \
    --num-paths 200 \
    --nbest-scale 0.5
 (7) fast beam search (with LG)
 ./pruned_transducer_stateless2/decode.py \
    --epoch 28 \
    --avg 15 \
    --exp-dir ./pruned_transducer_stateless2/exp \
    --max-duration 600 \
    --decoding-method fast_beam_search_nbest_LG \
    --beam 20.0 \
    --max-contexts 8 \
    --max-states 64
 """
@ -96,6 +107,7 @@ from asr_datamodule import LibriSpeechAsrDataModule
 from beam_search import (
    beam_search,
    fast_beam_search_nbest,
    fast_beam_search_nbest_LG,
    fast_beam_search_nbest_oracle,
    fast_beam_search_one_best,
    greedy_search,
@ -109,6 +121,7 @@ from icefall.checkpoint import (
    find_checkpoints,
    load_checkpoint,
 )
 from icefall.lexicon import Lexicon
 from icefall.utils import (
    AttributeDict,
    setup_logger,
@ -175,6 +188,9 @@ def get_parser():
          - fast_beam_search
          - fast_beam_search_nbest
          - fast_beam_search_nbest_oracle
          - fast_beam_search_nbest_LG
        If you use fast_beam_search_nbest_LG, you have to specify
        `--lang-dir`, which should contain `LG.pt`.
        """,
    )
@ -190,31 +206,42 @@ def get_parser():
    parser.add_argument(
        "--beam",
        type=float,
-        default=4,
+        default=20.0,
        help="""A floating point value to calculate the cutoff score during beam
        search (i.e., `cutoff = max-score - beam`), which is the same as the
        `beam` in Kaldi.
-        Used only when --decoding-method is
+        Used only when --decoding-method is fast_beam_search,
-        fast_beam_search, fast_beam_search_nbest, or
+        fast_beam_search_nbest, fast_beam_search_nbest_LG,
-        fast_beam_search_nbest_oracle""",
+        and fast_beam_search_nbest_oracle
        """,
    )
    parser.add_argument(
        "--ngram-lm-scale",
        type=float,
        default=0.01,
        help="""
        Used only when --decoding_method is fast_beam_search_nbest_LG.
        It specifies the scale for n-gram LM scores.
        """,
    )
    parser.add_argument(
        "--max-contexts",
        type=int,
-        default=4,
+        default=8,
        help="""Used only when --decoding-method is
-        fast_beam_search, fast_beam_search_nbest, or
+        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
-        fast_beam_search_nbest_oracle""",
+        and fast_beam_search_nbest_oracle""",
    )
    parser.add_argument(
        "--max-states",
        type=int,
-        default=8,
+        default=64,
        help="""Used only when --decoding-method is
-        fast_beam_search, fast_beam_search_nbest, or
+        fast_beam_search, fast_beam_search_nbest, fast_beam_search_nbest_LG,
-        fast_beam_search_nbest_oracle""",
+        and fast_beam_search_nbest_oracle""",
    )
    parser.add_argument(
@ -237,9 +264,8 @@ def get_parser():
        type=int,
        default=200,
        help="""Number of paths for nbest decoding.
-        Used only when the decoding method is fast_beam_search_nbest or
+        Used only when the decoding method is fast_beam_search_nbest,
-        fast_beam_search_nbest_oracle
+        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
-        """,
    )
    parser.add_argument(
@ -247,9 +273,8 @@ def get_parser():
        type=float,
        default=0.5,
        help="""Scale applied to lattice scores when computing nbest paths.
-        Used only when the decoding method is fast_beam_search_nbest or
+        Used only when the decoding method is fast_beam_search_nbest,
-        fast_beam_search_nbest_oracle
+        fast_beam_search_nbest_LG, and fast_beam_search_nbest_oracle""",
-        """,
    )
    return parser
@ -260,6 +285,7 @@ def decode_one_batch(
    model: nn.Module,
    sp: spm.SentencePieceProcessor,
    batch: dict,
    word_table: Optional[k2.SymbolTable] = None,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[List[str]]]:
    """Decode one batch and return the result in a dict. The dict has the
@ -283,10 +309,12 @@ def decode_one_batch(
        It is the return value from iterating
        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
        for the format of the `batch`.
      word_table:
        The word symbol table.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
-        only when --decoding_method is fast_beam_search,
+        only when --decoding_method is fast_beam_search, fast_beam_search_nbest,
-        fast_beam_search_nbest, or fast_beam_search_nbest_oracle.
+        fast_beam_search_nbest_oracle, and fast_beam_search_nbest_LG.
    Returns:
      Return the decoding result. See above description for the format of
      the returned dict.
@ -318,6 +346,20 @@ def decode_one_batch(
        )
        for hyp in sp.decode(hyp_tokens):
            hyps.append(hyp.split())
    elif params.decoding_method == "fast_beam_search_nbest_LG":
        hyp_tokens = fast_beam_search_nbest_LG(
            model=model,
            decoding_graph=decoding_graph,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            beam=params.beam,
            max_contexts=params.max_contexts,
            max_states=params.max_states,
            num_paths=params.num_paths,
            nbest_scale=params.nbest_scale,
        )
        for hyp in hyp_tokens:
            hyps.append([word_table[i] for i in hyp])
    elif params.decoding_method == "fast_beam_search_nbest":
        hyp_tokens = fast_beam_search_nbest(
            model=model,
@ -402,16 +444,17 @@ def decode_one_batch(
                f"max_states_{params.max_states}"
            ): hyps
        }
-    elif "fast_beam_search_nbest" in params.decoding_method:
+    elif "fast_beam_search" in params.decoding_method:
-        return {
+        key = f"beam_{params.beam}_"
-            (
+        key += f"max_contexts_{params.max_contexts}_"
-                f"beam_{params.beam}_"
+        key += f"max_states_{params.max_states}"
-                f"max_contexts_{params.max_contexts}_"
+        if "nbest" in params.decoding_method:
-                f"max_states_{params.max_states}_"
+            key += f"num_paths_{params.num_paths}_"
-                f"num_paths_{params.num_paths}_"
+            key += f"nbest_scale_{params.nbest_scale}"
-                f"nbest_scale_{params.nbest_scale}"
+            if "LG" in params.decoding_method:
-            ): hyps
+                key += f"_ngram_lm_scale_{params.ngram_lm_scale}"
-        }
+
        return {key: hyps}
    else:
        return {f"beam_size_{params.beam_size}": hyps}
@ -421,6 +464,7 @@ def decode_dataset(
    params: AttributeDict,
    model: nn.Module,
    sp: spm.SentencePieceProcessor,
    word_table: Optional[k2.SymbolTable] = None,
    decoding_graph: Optional[k2.Fsa] = None,
 ) -> Dict[str, List[Tuple[List[str], List[str]]]]:
    """Decode dataset.
@ -434,6 +478,8 @@ def decode_dataset(
        The neural model.
      sp:
        The BPE model.
      word_table:
        The word symbol table.
      decoding_graph:
        The decoding graph. Can be either a `k2.trivial_graph` or HLG, Used
        only when --decoding_method is fast_beam_search,
@ -465,6 +511,7 @@ def decode_dataset(
            params=params,
            model=model,
            sp=sp,
            word_table=word_table,
            decoding_graph=decoding_graph,
            batch=batch,
        )
@ -548,6 +595,7 @@ def main():
        "beam_search",
        "fast_beam_search",
        "fast_beam_search_nbest",
        "fast_beam_search_nbest_LG",
        "fast_beam_search_nbest_oracle",
        "modified_beam_search",
    )
@ -558,16 +606,15 @@ def main():
    else:
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
-    if params.decoding_method == "fast_beam_search":
+    if "fast_beam_search" in params.decoding_method:
        params.suffix += f"-beam-{params.beam}"
        params.suffix += f"-max-contexts-{params.max_contexts}"
        params.suffix += f"-max-states-{params.max_states}"
-    elif "fast_beam_search_nbest" in params.decoding_method:
+        if "nbest" in params.decoding_method:
-        params.suffix += f"-beam-{params.beam}"
+            params.suffix += f"-nbest-scale-{params.nbest_scale}"
-        params.suffix += f"-max-contexts-{params.max_contexts}"
+            params.suffix += f"-num-paths-{params.num_paths}"
-        params.suffix += f"-max-states-{params.max_states}"
+            if "LG" in params.decoding_method:
-        params.suffix += f"-num-paths-{params.num_paths}"
+                params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
-        params.suffix += f"-nbest-scale-{params.nbest_scale}"
    elif "beam_search" in params.decoding_method:
        params.suffix += (
            f"-{params.decoding_method}-beam-size-{params.beam_size}"
@ -632,9 +679,23 @@ def main():
    model.device = device
    if "fast_beam_search" in params.decoding_method:
-        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
+        if params.decoding_method == "fast_beam_search_nbest_LG":
            lexicon = Lexicon(params.lang_dir)
            word_table = lexicon.word_table
            lg_filename = params.lang_dir / "LG.pt"
            logging.info(f"Loading {lg_filename}")
            decoding_graph = k2.Fsa.from_dict(
                torch.load(lg_filename, map_location=device)
            )
            decoding_graph.scores *= params.ngram_lm_scale
        else:
            word_table = None
            decoding_graph = k2.trivial_graph(
                params.vocab_size - 1, device=device
            )
    else:
        decoding_graph = None
        word_table = None
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")
@ -656,6 +717,7 @@ def main():
            params=params,
            model=model,
            sp=sp,
            word_table=word_table,
            decoding_graph=decoding_graph,
        )
@ -664,6 +726,7 @@ def main():
            test_set_name=test_set,
            results_dict=results_dict,
        )
        break
    logging.info("Done!")