minor fixes

jinzr 2023-07-25 23:26:04 +08:00
parent c6334ae45d
commit 79b8e60f93
2 changed files with 102 additions and 26 deletions

beam_search.py

@@ -779,7 +779,7 @@ def greedy_search_batch(
     )


-def deprecated_greedy_search_batch(
+def deprecated_greedy_search_batch_for_cross_attn(
     model: nn.Module, encoder_out: torch.Tensor
 ) -> List[List[int]]:
     """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
@@ -847,6 +847,87 @@ def deprecated_greedy_search_batch(
     return ans


+def deprecated_greedy_search_batch_for_cross_attn(
+    model: nn.Module,
+    encoder_out: torch.Tensor,
+    encoder_out_lens: torch.Tensor,
+) -> List[List[int]]:
+    """Greedy search in batch mode. It hardcodes --max-sym-per-frame=1.
+
+    Args:
+      model:
+        The transducer model.
+      encoder_out:
+        Output from the encoder. Its shape is (N, T, C), where N >= 1.
+      encoder_out_lens:
+        A 1-D tensor of shape (N,) giving the number of valid frames in
+        encoder_out before padding.
+    Returns:
+      Return a list-of-list of token IDs containing the decoded results.
+      len(ans) equals encoder_out.size(0).
+    """
+    assert encoder_out.ndim == 3
+    assert encoder_out.size(0) >= 1, encoder_out.size(0)
+
+    device = next(model.parameters()).device
+
+    batch_size = encoder_out.size(0)
+    T = encoder_out.size(1)
+
+    blank_id = model.decoder.blank_id
+    unk_id = getattr(model, "unk_id", blank_id)
+    context_size = model.decoder.context_size
+
+    hyps = [[blank_id] * context_size for _ in range(batch_size)]
+
+    decoder_input = torch.tensor(
+        hyps,
+        device=device,
+        dtype=torch.int64,
+    )  # (batch_size, context_size)
+
+    decoder_out = model.decoder(decoder_input, need_pad=False)
+    decoder_out = model.joiner.decoder_proj(decoder_out)
+    encoder_out = model.joiner.encoder_proj(encoder_out)
+    encoder_out_for_attn = encoder_out.unsqueeze(2)  # (batch_size, T, 1, C)
+
+    # decoder_out: (batch_size, 1, decoder_out_dim)
+    for t in range(T):
+        # current_encoder_out = encoder_out[:, t : t + 1, :].unsqueeze(2)  # noqa
+        # current_encoder_out's shape: (batch_size, 1, 1, encoder_out_dim)
+        current_encoder_out = model.joiner.label_level_am_attention(
+            encoder_out,
+            decoder_out,
+            encoder_out_lens,
+        )
+        logits = model.joiner(
+            current_encoder_out,
+            decoder_out.unsqueeze(1),
+            apply_attn=False,
+            project_input=False,
+        )
+        # logits' shape: (batch_size, 1, 1, vocab_size)
+        logits = logits.squeeze(1).squeeze(1)  # (batch_size, vocab_size)
+        assert logits.ndim == 2, logits.shape
+
+        y = logits.argmax(dim=1).tolist()
+        emitted = False
+        for i, v in enumerate(y):
+            if v not in (blank_id, unk_id):
+                hyps[i].append(v)
+                emitted = True
+        if emitted:
+            # update decoder output
+            decoder_input = [h[-context_size:] for h in hyps]
+            decoder_input = torch.tensor(
+                decoder_input,
+                device=device,
+                dtype=torch.int64,
+            )
+            decoder_out = model.decoder(decoder_input, need_pad=False)
+            decoder_out = model.joiner.decoder_proj(decoder_out)
+
+    ans = [h[context_size:] for h in hyps]
+    return ans
+
+
 @dataclass
 class Hypothesis:
     # The predicted tokens so far.
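For orientation, here is a minimal usage sketch for the new search function. It is not part of the commit; the model.encoder(...) call and the features/feature_lens tensors are assumptions chosen for illustration.

# Hypothetical usage sketch (not from this commit): run the cross-attn
# greedy search on a batch of padded encoder outputs.
import torch

model.eval()
with torch.no_grad():
    # Assumed encoder interface: padded features (N, T, F) plus lengths (N,).
    encoder_out, encoder_out_lens = model.encoder(features, feature_lens)
    hyp_tokens = deprecated_greedy_search_batch_for_cross_attn(
        model=model,
        encoder_out=encoder_out,            # (N, T, C)
        encoder_out_lens=encoder_out_lens,  # (N,)
    )
# hyp_tokens[i] holds the token IDs decoded for utterance i.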

decode.py

@@ -108,6 +108,7 @@ import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
 from beam_search import (
     beam_search,
+    deprecated_greedy_search_batch_for_cross_attn,
     fast_beam_search_nbest,
     fast_beam_search_nbest_LG,
     fast_beam_search_nbest_oracle,
@@ -116,7 +117,7 @@ from beam_search import (
     greedy_search_batch,
     modified_beam_search,
 )
-from train import add_model_arguments, get_params, get_model
+from train import add_model_arguments, get_model, get_params

 from icefall.checkpoint import (
     average_checkpoints,
@@ -273,8 +274,7 @@ def get_parser():
         "--context-size",
         type=int,
         default=2,
-        help="The context size in the decoder. 1 means bigram; "
-        "2 means tri-gram",
+        help="The context size in the decoder. 1 means bigram; " "2 means tri-gram",
     )
     parser.add_argument(
         "--max-sym-per-frame",
@@ -425,11 +425,13 @@ def decode_one_batch(
         )
         for hyp in sp.decode(hyp_tokens):
             hyps.append(hyp.split())
-    elif (
-        params.decoding_method == "greedy_search"
-        and params.max_sym_per_frame == 1
-    ):
-        hyp_tokens = greedy_search_batch(
+    elif params.decoding_method == "greedy_search" and params.max_sym_per_frame == 1:
+        # hyp_tokens = greedy_search_batch(
+        #     model=model,
+        #     encoder_out=encoder_out,
+        #     encoder_out_lens=encoder_out_lens,
+        # )
+        hyp_tokens = deprecated_greedy_search_batch_for_cross_attn(
             model=model,
             encoder_out=encoder_out,
             encoder_out_lens=encoder_out_lens,
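Both greedy_search_batch and the cross-attn variant take the same keyword arguments, which is what makes this a drop-in swap. The token IDs they return are turned into word lists the same way as in the fast_beam_search branch above; a sketch of that post-processing, assuming the sentencepiece processor sp used elsewhere in this file:

# Sketch: map returned token IDs back to words, mirroring the other
# decoding branches in this file.
hyps = []
for hyp in sp.decode(hyp_tokens):
    hyps.append(hyp.split())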
@@ -559,9 +561,7 @@ def decode_dataset(
         if batch_idx % log_interval == 0:
             batch_str = f"{batch_idx}/{num_batches}"

-            logging.info(
-                f"batch {batch_str}, cuts processed until now is {num_cuts}"
-            )
+            logging.info(f"batch {batch_str}, cuts processed until now is {num_cuts}")

     return results
@@ -594,8 +594,7 @@ def save_results(
     test_set_wers = sorted(test_set_wers.items(), key=lambda x: x[1])
     errs_info = (
-        params.res_dir
-        / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
+        params.res_dir / f"wer-summary-{test_set_name}-{key}-{params.suffix}.txt"
     )
     with open(errs_info, "w") as f:
         print("settings\tWER", file=f)
@@ -656,9 +655,7 @@ def main():
         if "LG" in params.decoding_method:
             params.suffix += f"-ngram-lm-scale-{params.ngram_lm_scale}"
     elif "beam_search" in params.decoding_method:
-        params.suffix += (
-            f"-{params.decoding_method}-beam-size-{params.beam_size}"
-        )
+        params.suffix += f"-{params.decoding_method}-beam-size-{params.beam_size}"
     else:
         params.suffix += f"-context-{params.context_size}"
         params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"
@@ -690,9 +687,9 @@ def main():
     if not params.use_averaged_model:
         if params.iter > 0:
-            filenames = find_checkpoints(
-                params.exp_dir, iteration=-params.iter
-            )[: params.avg]
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
             if len(filenames) == 0:
                 raise ValueError(
                     f"No checkpoints found for"
@@ -719,9 +716,9 @@ def main():
             model.load_state_dict(average_checkpoints(filenames, device=device))
     else:
         if params.iter > 0:
-            filenames = find_checkpoints(
-                params.exp_dir, iteration=-params.iter
-            )[: params.avg + 1]
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
             if len(filenames) == 0:
                 raise ValueError(
                     f"No checkpoints found for"
@@ -780,9 +777,7 @@ def main():
             decoding_graph.scores *= params.ngram_lm_scale
         else:
             word_table = None
-            decoding_graph = k2.trivial_graph(
-                params.vocab_size - 1, device=device
-            )
+            decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
     else:
         decoding_graph = None
         word_table = None