Add attention rescoring

2025-08-26 10:16:14 +00:00 · 2024-09-24 10:41:19 +08:00 · 2024-09-24 10:41:19 +08:00 · 83c36ecc18
commit 83c36ecc18
parent ca7dbb085e
1 changed files with 188 additions and 54 deletions
--- a/egs/gigaspeech/ASR/zipformer/ctc_decode.py
+++ b/egs/gigaspeech/ASR/zipformer/ctc_decode.py
@ -21,7 +21,16 @@
 """
 Usage:
-(1) ctc-decoding
+(1) ctc-greedy-search
 ./zipformer/ctc_decode.py \
    --epoch 30 \
    --avg 15 \
    --exp-dir ./zipformer/exp \
    --use-ctc 1 \
    --max-duration 600 \
    --decoding-method ctc-greedy-search
 (2) ctc-decoding
 ./zipformer/ctc_decode.py \
    --epoch 30 \
    --avg 15 \
@ -30,7 +39,7 @@ Usage:
    --max-duration 600 \
    --decoding-method ctc-decoding
-(2) 1best
+(3) 1best
 ./zipformer/ctc_decode.py \
    --epoch 30 \
    --avg 15 \
@ -40,7 +49,7 @@ Usage:
    --hlg-scale 0.6 \
    --decoding-method 1best
-(3) nbest
+(4) nbest
 ./zipformer/ctc_decode.py \
    --epoch 30 \
    --avg 15 \
@ -50,7 +59,7 @@ Usage:
    --hlg-scale 0.6 \
    --decoding-method nbest
-(4) nbest-rescoring
+(5) nbest-rescoring
 ./zipformer/ctc_decode.py \
    --epoch 30 \
    --avg 15 \
@ -62,7 +71,7 @@ Usage:
    --lm-dir data/lm \
    --decoding-method nbest-rescoring
-(5) whole-lattice-rescoring
+(6) whole-lattice-rescoring
 ./zipformer/ctc_decode.py \
    --epoch 30 \
    --avg 15 \
@ -73,6 +82,29 @@ Usage:
    --nbest-scale 1.0 \
    --lm-dir data/lm \
    --decoding-method whole-lattice-rescoring
 (7) attention-decoder-rescoring-no-ngram
 ./zipformer/ctc_decode.py \
    --epoch 30 \
    --avg 15 \
    --exp-dir ./zipformer/exp \
    --use-ctc 1 \
    --use-attention-decoder 1 \
    --max-duration 100 \
    --decoding-method attention-decoder-rescoring-no-ngram
 (8) attention-decoder-rescoring-with-ngram
 ./zipformer/ctc_decode.py \
    --epoch 30 \
    --avg 15 \
    --exp-dir ./zipformer/exp \
    --use-ctc 1 \
    --use-attention-decoder 1 \
    --max-duration 100 \
    --hlg-scale 0.6 \
    --nbest-scale 1.0 \
    --lm-dir data/lm \
    --decoding-method attention-decoder-rescoring-with-ngram
 """
@ -87,9 +119,11 @@ import k2
 import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import GigaSpeechAsrDataModule
 from asr_datamodule import GigaSpeechAsrDataModule
 from gigaspeech_scoring import asr_text_post_processing
 from lhotse import set_caching_enabled
 from train import add_model_arguments, get_model, get_params
 from icefall.checkpoint import (
@ -99,10 +133,13 @@ from icefall.checkpoint import (
    load_checkpoint,
 )
 from icefall.decode import (
    ctc_greedy_search,
    get_lattice,
    nbest_decoding,
    nbest_oracle,
    one_best_decoding,
    rescore_with_attention_decoder_no_ngram,
    rescore_with_attention_decoder_with_ngram,
    rescore_with_n_best_list,
    rescore_with_whole_lattice,
 )
@ -197,23 +234,30 @@ def get_parser():
        default="ctc-decoding",
        help="""Decoding method.
        Supported values are:
-        - (1) ctc-decoding. Use CTC decoding. It uses a sentence piece
+        - (1) ctc-greedy-search. Use CTC greedy search. It uses a sentence piece
          model, i.e., lang_dir/bpe.model, to convert word pieces to words.
          It needs neither a lexicon nor an n-gram LM.
-        - (2) 1best. Extract the best path from the decoding lattice as the
+        - (2) ctc-decoding. Use CTC decoding. It uses a sentence piece
          model, i.e., lang_dir/bpe.model, to convert word pieces to words.
          It needs neither a lexicon nor an n-gram LM.
        - (3) 1best. Extract the best path from the decoding lattice as the
          decoding result.
-        - (3) nbest. Extract n paths from the decoding lattice; the path
+        - (4) nbest. Extract n paths from the decoding lattice; the path
          with the highest score is the decoding result.
-        - (4) nbest-rescoring. Extract n paths from the decoding lattice,
+        - (5) nbest-rescoring. Extract n paths from the decoding lattice,
          rescore them with an n-gram LM (e.g., a 4-gram LM), the path with
          the highest score is the decoding result.
-        - (5) whole-lattice-rescoring. Rescore the decoding lattice with an
+        - (6) whole-lattice-rescoring. Rescore the decoding lattice with an
          n-gram LM (e.g., a 4-gram LM), the best path of rescored lattice
          is the decoding result.
          you have trained an RNN LM using ./rnn_lm/train.py
-        - (6) nbest-oracle. Its WER is the lower bound of any n-best
+        - (7) nbest-oracle. Its WER is the lower bound of any n-best
          rescoring method can achieve. Useful for debugging n-best
          rescoring method.
        - (8) attention-decoder-rescoring-no-ngram. Extract n paths from the decoding
          lattice, rescore them with the attention decoder.
        - (9) attention-decoder-rescoring-with-ngram. Extract n paths from the LM
          rescored lattice, rescore them with the attention decoder.
        """,
    )
@ -256,6 +300,13 @@ def get_parser():
        """,
    )
    parser.add_argument(
        "--skip-scoring",
        type=str2bool,
        default=False,
        help="""Skip scoring, but still save the ASR output (for eval sets).""",
    )
    add_model_arguments(parser)
    return parser
@ -276,17 +327,6 @@ def get_decoding_params() -> AttributeDict:
    return params
 def post_processing(
    results: List[Tuple[str, List[str], List[str]]],
 ) -> List[Tuple[str, List[str], List[str]]]:
    new_results = []
    for key, ref, hyp in results:
        new_ref = asr_text_post_processing(" ".join(ref)).split()
        new_hyp = asr_text_post_processing(" ".join(hyp)).split()
        new_results.append((key, new_ref, new_hyp))
    return new_results
 def decode_one_batch(
    params: AttributeDict,
    model: nn.Module,
@ -365,6 +405,15 @@ def decode_one_batch(
    encoder_out, encoder_out_lens = model.forward_encoder(feature, feature_lens)
    ctc_output = model.ctc_output(encoder_out)  # (N, T, C)
    if params.decoding_method == "ctc-greedy-search":
        hyps = ctc_greedy_search(ctc_output, encoder_out_lens)
        # hyps is a list of str, e.g., ['xxx yyy zzz', ...]
        hyps = bpe_model.decode(hyps)
        # hyps is a list of list of str, e.g., [['xxx', 'yyy', 'zzz'], ... ]
        hyps = [s.split() for s in hyps]
        key = "ctc-greedy-search"
        return {key: hyps}
    supervision_segments = torch.stack(
        (
            supervisions["sequence_idx"],
@ -417,7 +466,27 @@ def decode_one_batch(
        # hyps is a list of list of str, e.g., [['xxx', 'yyy', 'zzz'], ... ]
        hyps = [s.split() for s in hyps]
        key = "ctc-decoding"
-        return {key: hyps}
+        return {key: hyps}  # note: returns words
    if params.decoding_method == "attention-decoder-rescoring-no-ngram":
        best_path_dict = rescore_with_attention_decoder_no_ngram(
            lattice=lattice,
            num_paths=params.num_paths,
            attention_decoder=model.attention_decoder,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            nbest_scale=params.nbest_scale,
        )
        ans = dict()
        for a_scale_str, best_path in best_path_dict.items():
            # token_ids is a list-of-list of IDs
            token_ids = get_texts(best_path)
            # hyps is a list of str, e.g., ['xxx yyy zzz', ...]
            hyps = bpe_model.decode(token_ids)
            # hyps is a list of list of str, e.g., [['xxx', 'yyy', 'zzz'], ... ]
            hyps = [s.split() for s in hyps]
            ans[a_scale_str] = hyps
        return ans
    if params.decoding_method == "nbest-oracle":
        # Note: You can also pass rescored lattices to it.
@ -434,7 +503,7 @@ def decode_one_batch(
        )
        hyps = get_texts(best_path)
        hyps = [[word_table[i] for i in ids] for ids in hyps]
-        key = f"oracle_{params.num_paths}_nbest_scale_{params.nbest_scale}"  # noqa
+        key = f"oracle_{params.num_paths}_nbest-scale-{params.nbest_scale}"  # noqa
        return {key: hyps}
    if params.decoding_method in ["1best", "nbest"]:
@ -442,7 +511,7 @@ def decode_one_batch(
            best_path = one_best_decoding(
                lattice=lattice, use_double_scores=params.use_double_scores
            )
-            key = "no_rescore"
+            key = "no-rescore"
        else:
            best_path = nbest_decoding(
                lattice=lattice,
@ -450,15 +519,16 @@ def decode_one_batch(
                use_double_scores=params.use_double_scores,
                nbest_scale=params.nbest_scale,
            )
-            key = f"no_rescore-nbest-scale-{params.nbest_scale}-{params.num_paths}"  # noqa
+            key = f"no-rescore_nbest-scale-{params.nbest_scale}-{params.num_paths}"  # noqa
        hyps = get_texts(best_path)
        hyps = [[word_table[i] for i in ids] for ids in hyps]
-        return {key: hyps}
+        return {key: hyps}  # note: returns words (IDs looked up in word_table)
    assert params.decoding_method in [
        "nbest-rescoring",
        "whole-lattice-rescoring",
        "attention-decoder-rescoring-with-ngram",
    ]
    lm_scale_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
@ -479,6 +549,21 @@ def decode_one_batch(
            G_with_epsilon_loops=G,
            lm_scale_list=lm_scale_list,
        )
    elif params.decoding_method == "attention-decoder-rescoring-with-ngram":
        # lattice uses a 3-gram LM. We rescore it with a 4-gram LM.
        rescored_lattice = rescore_with_whole_lattice(
            lattice=lattice,
            G_with_epsilon_loops=G,
            lm_scale_list=None,
        )
        best_path_dict = rescore_with_attention_decoder_with_ngram(
            lattice=rescored_lattice,
            num_paths=params.num_paths,
            attention_decoder=model.attention_decoder,
            encoder_out=encoder_out,
            encoder_out_lens=encoder_out_lens,
            nbest_scale=params.nbest_scale,
        )
    else:
        assert False, f"Unsupported decoding method: {params.decoding_method}"
@ -572,39 +657,64 @@ def decode_dataset(
    return results
-def save_results(
+def save_asr_output(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
 ):
-    test_set_wers = dict()
+    """
    Save text produced by ASR.
    """
    for key, results in results_dict.items():
-        recog_path = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
+        recogs_filename = params.res_dir / f"recogs-{test_set_name}-{params.suffix}.txt"
        results = post_processing(results)
        results = sorted(results)
-        store_transcripts(filename=recog_path, texts=results)
+        store_transcripts(filename=recogs_filename, texts=results)
        logging.info(f"The transcripts are stored in {recog_path}")
        logging.info(f"The transcripts are stored in {recogs_filename}")
 def save_wer_results(
    params: AttributeDict,
    test_set_name: str,
    results_dict: Dict[str, List[Tuple[str, List[str], List[str]]]],
 ):
    if params.decoding_method in (
        "attention-decoder-rescoring-with-ngram",
        "whole-lattice-rescoring",
    ):
        # Set it to False since there are too many logs.
        enable_log = False
    else:
        enable_log = True
    test_set_wers = dict()
    for key, results in results_dict.items():
        results = post_processing(results)
        # The following prints out WERs, per-word error statistics and aligned
        # ref/hyp pairs.
        errs_filename = params.res_dir / f"errs-{test_set_name}-{params.suffix}.txt"
-        with open(errs_filename, "w") as f:
+        with open(errs_filename, "w", encoding="utf8") as fd:
-            wer = write_error_stats(f, f"{test_set_name}-{key}", results)
+            wer = write_error_stats(
                fd, f"{test_set_name}_{key}", results, enable_log=enable_log
            )
            test_set_wers[key] = wer
-        logging.info("Wrote detailed error stats to {}".format(errs_filename))
+        logging.info(f"Wrote detailed error stats to {errs_filename}")
    test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
    errs_info = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
    with open(errs_info, "w") as f:
        print("settings\tWER", file=f)
        for key, val in test_set_wers:
            print("{}\t{}".format(key, val), file=f)
-    s = "\nFor {}, WER of different settings are:\n".format(test_set_name)
+    wer_filename = params.res_dir / f"wer-summary-{test_set_name}-{params.suffix}.txt"
-    note = "\tbest for {}".format(test_set_name)
+
    with open(wer_filename, "w", encoding="utf8") as fd:
        print("settings\tWER", file=fd)
        for key, val in test_set_wers:
            print(f"{key}\t{val}", file=fd)
    s = f"\nFor {test_set_name}, WER of different settings are:\n"
    note = f"\tbest for {test_set_name}"
    for key, val in test_set_wers:
-        s += "{}\t{}{}\n".format(key, val, note)
+        s += f"{key}\t{val}{note}\n"
        note = ""
    logging.info(s)
@ -623,20 +733,26 @@ def main():
    params.update(get_decoding_params())
    params.update(vars(args))
    # enable AudioCache
    set_caching_enabled(True)  # lhotse
    assert params.decoding_method in (
        "ctc-greedy-search",
        "ctc-decoding",
        "1best",
        "nbest",
        "nbest-rescoring",
        "whole-lattice-rescoring",
        "nbest-oracle",
        "attention-decoder-rescoring-no-ngram",
        "attention-decoder-rescoring-with-ngram",
    )
    params.res_dir = params.exp_dir / params.decoding_method
    if params.iter > 0:
-        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
+        params.suffix = f"iter-{params.iter}_avg-{params.avg}"
    else:
-        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
+        params.suffix = f"epoch-{params.epoch}_avg-{params.avg}"
    if params.causal:
        assert (
@ -645,11 +761,11 @@ def main():
        assert (
            "," not in params.left_context_frames
        ), "left_context_frames should be one value in decoding."
-        params.suffix += f"-chunk-{params.chunk_size}"
+        params.suffix += f"_chunk-{params.chunk_size}"
-        params.suffix += f"-left-context-{params.left_context_frames}"
+        params.suffix += f"_left-context-{params.left_context_frames}"
    if params.use_averaged_model:
-        params.suffix += "-use-averaged-model"
+        params.suffix += "_use-averaged-model"
    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")
@ -668,8 +784,14 @@ def main():
    params.vocab_size = num_classes
    # <blk> and <unk> are defined in local/train_bpe_model.py
    params.blank_id = 0
    params.eos_id = 1
    params.sos_id = 1
-    if params.decoding_method == "ctc-decoding":
+    if params.decoding_method in [
        "ctc-greedy-search",
        "ctc-decoding",
        "attention-decoder-rescoring-no-ngram",
    ]:
        HLG = None
        H = k2.ctc_topo(
            max_token=max_token_id,
@ -693,6 +815,7 @@ def main():
    if params.decoding_method in (
        "nbest-rescoring",
        "whole-lattice-rescoring",
        "attention-decoder-rescoring-with-ngram",
    ):
        if not (params.lm_dir / "G_4_gram.pt").is_file():
            logging.info("Loading G_4_gram.fst.txt")
@ -724,7 +847,10 @@ def main():
            d = torch.load(params.lm_dir / "G_4_gram.pt", map_location=device)
            G = k2.Fsa.from_dict(d)
-        if params.decoding_method == "whole-lattice-rescoring":
+        if params.decoding_method in [
            "whole-lattice-rescoring",
            "attention-decoder-rescoring-with-ngram",
        ]:
            # Add epsilon self-loops to G as we will compose
            # it with the whole lattice later
            G = k2.add_epsilon_self_loops(G)
@ -825,6 +951,7 @@ def main():
    # we need cut ids to display recognition results.
    args.return_cuts = True
    gigaspeech = GigaSpeechAsrDataModule(args)
    test_cuts = gigaspeech.test_cuts()
@ -832,9 +959,9 @@ def main():
    test_dl = gigaspeech.test_dataloaders(test_cuts)
    test_sets = ["test"]
-    test_dl = [test_dl]
+    test_dls = [test_dl]
-    for test_set, test_dl in zip(test_sets, test_dl):
+    for test_set, test_dl in zip(test_sets, test_dls):
        results_dict = decode_dataset(
            dl=test_dl,
            params=params,
@ -846,12 +973,19 @@ def main():
            G=G,
        )
-        save_results(
+        save_asr_output(
            params=params,
            test_set_name=test_set,
            results_dict=results_dict,
        )
        if not params.skip_scoring:
            save_wer_results(
                params=params,
                test_set_name=test_set,
                results_dict=results_dict,
            )
    logging.info("Done!")