add conformer exps

2025-12-11 06:55:27 +00:00 · 2024-11-25 15:28:05 +08:00 · 2024-11-25 15:28:05 +08:00 · 2fc53cd7ce
commit 2fc53cd7ce
parent b65873fb4c
6 changed files with 1514 additions and 4 deletions
--- a/egs/librispeech/ASR/conformer_ctc2/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc2/decode.py
@ -44,6 +44,7 @@ from icefall.decode import (
    nbest_oracle,
    one_best_decoding,
    rescore_with_attention_decoder,
+    rescore_with_attention_decoder_no_ngram_old,
    rescore_with_n_best_list,
    rescore_with_rnn_lm,
    rescore_with_whole_lattice,
@ -459,6 +460,27 @@ def decode_one_batch(
        key = "ctc-greedy-search"
        return {key: hyps}

+    if params.method == "attention-decoder-rescoring-no-ngram":
+        best_path_dict = rescore_with_attention_decoder_no_ngram_old(
+            lattice=lattice,
+            num_paths=params.num_paths,
+            model=model,
+            memory=memory,
+            memory_key_padding_mask=memory_key_padding_mask,
+            sos_id=sos_id,
+            eos_id=eos_id,
+        )
+        ans = dict()
+        for a_scale_str, best_path in best_path_dict.items():
+            # token_ids is a list-of-list of IDs
+            token_ids = get_texts(best_path)
+            # hyps is a list of str, e.g., ['xxx yyy zzz', ...]
+            hyps = bpe_model.decode(token_ids)
+            # hyps is a list of list of str, e.g., [['xxx', 'yyy', 'zzz'], ... ]
+            hyps = [s.split() for s in hyps]
+            ans[a_scale_str] = hyps
+        return ans
+
    if params.method == "nbest-oracle":
        # Note: You can also pass rescored lattices to it.
        # We choose the HLG decoded lattice for speed reasons
@ -761,7 +783,7 @@ def main():
    params.sos_id = sos_id
    params.eos_id = eos_id

-    if params.method == "ctc-decoding" or params.method == "ctc-greedy-search":
+    if params.method == "ctc-decoding" or params.method == "ctc-greedy-search" or params.method == "attention-decoder-rescoring-no-ngram":
        HLG = None
        H = k2.ctc_topo(
            max_token=max_token_id,
--- a/egs/librispeech/ASR/conformer_ctc3/decode.py
+++ b/egs/librispeech/ASR/conformer_ctc3/decode.py
@ -72,7 +72,7 @@ import sentencepiece as spm
 import torch
 import torch.nn as nn
 from asr_datamodule import LibriSpeechAsrDataModule
-from train import add_model_arguments, get_ctc_model, get_params
+from train_cr_ctc import add_model_arguments, get_ctc_model, get_params

 from icefall.bpe_graph_compiler import BpeCtcTrainingGraphCompiler
 from icefall.checkpoint import (
@ -458,8 +458,9 @@ def decode_one_batch(
    else:
        encoder_out, encoder_out_lens = model.encoder(feature, feature_lens)

-    nnet_output = model.get_ctc_output(encoder_out)
+    # nnet_output = model.get_ctc_output(encoder_out)
    # nnet_output is (N, T, C)
+    nnet_output = model.ctc_output(encoder_out)  # (N, T, C)

    if params.decoding_method == "ctc-greedy-search":
        timestamps, hyps = ctc_greedy_search(
--- a/egs/librispeech/ASR/conformer_ctc3/model_cr_ctc.py
+++ b/egs/librispeech/ASR/conformer_ctc3/model_cr_ctc.py
@ -0,0 +1,209 @@
+# Copyright  2021-2022  Xiaomi Corp.     (authors: Fangjun Kuang,
+#                                                  Wei Kang,
+#                                                  Zengwei Yao)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Tuple
+
+import k2
+import torch
+import torch.nn as nn
+from encoder_interface import EncoderInterface
+from scaling import ScaledLinear
+from icefall.utils import make_pad_mask, time_warp
+from lhotse.dataset import SpecAugment
+
+
+class CTCModel(nn.Module):
+    """It implements https://www.cs.toronto.edu/~graves/icml_2006.pdf
+    "Connectionist Temporal Classification: Labelling Unsegmented
+    Sequence Data with Recurrent Neural Networks"
+
+    The module wraps an encoder and a CTC output head. `forward` returns a
+    CTC loss and, when `use_cr_ctc` is enabled, an additional consistency
+    regularization (CR) loss computed between two copies of each input.
+    """
+
+    def __init__(
+        self,
+        encoder: EncoderInterface,
+        encoder_dim: int,
+        vocab_size: int,
+    ):
+        """
+        Args:
+          encoder:
+            It is the transcription network in the paper. It accepts
+            two inputs: `x` of (N, T, encoder_dim) and `x_lens` of shape (N,).
+            It returns two tensors: `logits` of shape (N, T, encoder_dim) and
+            `logit_lens` of shape (N,).
+          encoder_dim:
+            The feature embedding dimension.
+          vocab_size:
+            The vocabulary size.
+        """
+        super().__init__()
+        assert isinstance(encoder, EncoderInterface), type(encoder)
+
+        self.encoder = encoder
+        # CTC head: dropout, a projection from encoder_dim to vocab_size and a
+        # log-softmax over the last dimension, so its output is log-probs.
+        self.ctc_output = nn.Sequential(
+            nn.Dropout(p=0.1),
+            ScaledLinear(encoder_dim, vocab_size),
+            nn.LogSoftmax(dim=-1),
+        )
+
+    def forward_ctc(
+        self,
+        encoder_out: torch.Tensor,
+        encoder_out_lens: torch.Tensor,
+        targets: torch.Tensor,
+        target_lengths: torch.Tensor,
+    ) -> torch.Tensor:
+        """Compute CTC loss.
+        Args:
+          encoder_out:
+            Encoder output, of shape (N, T, C).
+          encoder_out_lens:
+            Encoder output lengths, of shape (N,).
+          targets:
+            Target Tensor of shape (sum(target_lengths)). The targets are assumed
+            to be un-padded and concatenated within 1 dimension.
+          target_lengths:
+            Length of each target sequence; passed to
+            `torch.nn.functional.ctc_loss` as `target_lengths`.
+        Returns:
+          The CTC loss summed over the batch (`reduction="sum"`).
+        """
+        # Compute CTC log-prob
+        ctc_output = self.ctc_output(encoder_out)  # (N, T, C)
+
+        # ctc_loss expects time-major log-probs, hence the permute. The targets
+        # and both length tensors are moved to CPU before the call.
+        ctc_loss = torch.nn.functional.ctc_loss(
+            log_probs=ctc_output.permute(1, 0, 2),  # (T, N, C)
+            targets=targets.cpu(),
+            input_lengths=encoder_out_lens.cpu(),
+            target_lengths=target_lengths.cpu(),
+            reduction="sum",
+        )
+        return ctc_loss
+
+    def forward_cr_ctc(
+        self,
+        encoder_out: torch.Tensor,
+        encoder_out_lens: torch.Tensor,
+        targets: torch.Tensor,
+        target_lengths: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Compute CTC loss with consistency regularization loss.
+        Args:
+          encoder_out:
+            Encoder output, of shape (2 * N, T, C).
+          encoder_out_lens:
+            Encoder output lengths, of shape (2 * N,).
+          targets:
+            Target Tensor of shape (2 * sum(target_lengths)). The targets are assumed
+            to be un-padded and concatenated within 1 dimension.
+          target_lengths:
+            Length of each target sequence; passed to
+            `torch.nn.functional.ctc_loss` as `target_lengths`.
+        Returns:
+          A tuple `(ctc_loss, cr_loss)`. Both are sums: `ctc_loss` uses
+          `reduction="sum"` and `cr_loss` is the sum of the masked
+          element-wise KL divergence.
+        """
+        # Compute CTC loss
+        ctc_output = self.ctc_output(encoder_out)  # (2 * N, T, C)
+        ctc_loss = torch.nn.functional.ctc_loss(
+            log_probs=ctc_output.permute(1, 0, 2),  # (T, 2 * N, C)
+            targets=targets.cpu(),
+            input_lengths=encoder_out_lens.cpu(),
+            target_lengths=target_lengths.cpu(),
+            reduction="sum",
+        )
+
+        # Compute consistency regularization loss
+        # The targets are detached, so no gradient flows through them; each
+        # half of the batch is regressed towards the other half's output.
+        exchanged_targets = ctc_output.detach().chunk(2, dim=0)
+        exchanged_targets = torch.cat(
+            [exchanged_targets[1], exchanged_targets[0]], dim=0
+        )  # exchange: [x1, x2] -> [x2, x1]
+        # Both input and target are log-probs, hence log_target=True.
+        cr_loss = nn.functional.kl_div(
+            input=ctc_output,
+            target=exchanged_targets,
+            reduction="none",
+            log_target=True,
+        )  # (2 * N, T, C)
+        # Zero the entries selected by the mask before summing.
+        # NOTE(review): assumes make_pad_mask is True at padded frames - confirm.
+        length_mask = make_pad_mask(encoder_out_lens).unsqueeze(-1)
+        cr_loss = cr_loss.masked_fill(length_mask, 0.0).sum()
+
+        return ctc_loss, cr_loss
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lens: torch.Tensor,
+        y: k2.RaggedTensor,
+        warmup: float = 1.0,
+        use_cr_ctc: bool = False,
+        use_spec_aug: bool = False,
+        spec_augment: Optional[SpecAugment] = None,
+        supervision_segments: Optional[torch.Tensor] = None,
+        time_warp_factor: Optional[int] = 80,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+          x:
+            A 3-D tensor of shape (N, T, C).
+          x_lens:
+            A 1-D tensor of shape (N,). It contains the number of frames in `x`
+            before padding.
+          y:
+            A ragged tensor with 2 axes whose `dim0` equals N. Its values are
+            used as the CTC targets and its row lengths as the target lengths.
+          warmup: a floating point value which increases throughout training;
+            values >= 1.0 are fully warmed up and have all modules present.
+            It is forwarded to the encoder.
+          use_cr_ctc:
+            If True, the batch is duplicated and the consistency
+            regularization loss is computed in addition to the CTC loss.
+          use_spec_aug:
+            Only consulted when `use_cr_ctc` is True. If True, `x` is
+            time-warped once and `spec_augment` is then applied to the
+            duplicated batch.
+          spec_augment:
+            Required when `use_cr_ctc` and `use_spec_aug` are both True; its
+            `time_warp_factor` must be smaller than 1.
+          supervision_segments:
+            Required when `use_cr_ctc` and `use_spec_aug` are both True; it is
+            passed to `time_warp`.
+          time_warp_factor:
+            Passed to `time_warp` as `time_warp_factor`.
+        Returns:
+          A tuple `(ctc_loss, cr_loss)`. When `use_cr_ctc` is False, `cr_loss`
+          is an empty tensor created with `torch.empty(0)`; otherwise both
+          losses are multiplied by 0.5.
+        """
+        assert x.ndim == 3, x.shape
+        assert x_lens.ndim == 1, x_lens.shape
+        assert y.num_axes == 2, y.num_axes
+
+        assert x.size(0) == x_lens.size(0) == y.dim0, (x.shape, x_lens.shape, y.dim0)
+
+        if use_cr_ctc:
+            if use_spec_aug:
+                # NOTE(review): the `< 1` check presumably guarantees that
+                # spec_augment does not time-warp on its own - confirm.
+                assert spec_augment is not None and spec_augment.time_warp_factor < 1
+                # Apply time warping before input duplicating
+                assert supervision_segments is not None
+                x = time_warp(
+                    x,
+                    time_warp_factor=time_warp_factor,
+                    supervision_segments=supervision_segments,
+                )
+                # Independently apply frequency masking and time masking to the two copies
+                x = spec_augment(x.repeat(2, 1, 1))
+            else:
+                x = x.repeat(2, 1, 1)
+            # Lengths and targets are duplicated to match the doubled batch.
+            x_lens = x_lens.repeat(2)
+            y = k2.ragged.cat([y, y], axis=0)
+
+        # Compute encoder outputs
+        encoder_out, encoder_out_lens = self.encoder(x, x_lens, warmup=warmup)
+        assert torch.all(encoder_out_lens > 0)
+
+        # Per-utterance target lengths are the differences of consecutive
+        # row splits of the ragged targets.
+        row_splits = y.shape.row_splits(1)
+        y_lens = row_splits[1:] - row_splits[:-1]
+
+        # Compute CTC loss
+        targets = y.values
+        if not use_cr_ctc:
+            ctc_loss = self.forward_ctc(
+                encoder_out=encoder_out,
+                encoder_out_lens=encoder_out_lens,
+                targets=targets,
+                target_lengths=y_lens,
+            )
+            # Placeholder so that the return type is the same in both modes.
+            cr_loss = torch.empty(0)
+        else:
+            ctc_loss, cr_loss = self.forward_cr_ctc(
+                encoder_out=encoder_out,
+                encoder_out_lens=encoder_out_lens,
+                targets=targets,
+                target_lengths=y_lens,
+            )
+            # NOTE(review): the 0.5 factor presumably averages over the two
+            # copies of each utterance - confirm.
+            ctc_loss = ctc_loss * 0.5
+            cr_loss = cr_loss * 0.5
+
+        return ctc_loss, cr_loss
--- a/egs/librispeech/ASR/conformer_ctc3/train_cr_ctc.py
+++ b/egs/librispeech/ASR/conformer_ctc3/train_cr_ctc.py
--- a/icefall/decode.py
+++ b/icefall/decode.py
@ -1083,6 +1083,185 @@ def rescore_with_attention_decoder(
    return ans


+def rescore_with_attention_decoder_no_ngram_old(
+    lattice: k2.Fsa,
+    num_paths: int,
+    model: torch.nn.Module,
+    memory: torch.Tensor,
+    memory_key_padding_mask: Optional[torch.Tensor],
+    sos_id: int,
+    eos_id: int,
+    attention_scale: Optional[float] = None,
+    use_double_scores: bool = True,
+) -> Dict[str, k2.Fsa]:
+    """This function samples `num_paths` paths from the given lattice and uses
+    an attention decoder to rescore them. No n-gram LM score is involved: the
+    total score of a path is the sum of its lattice scores plus the scaled
+    attention-decoder score. The path with the highest total score is the
+    decoding output.
+
+    Args:
+      lattice:
+        An FsaVec with axes [utt][state][arc].
+      num_paths:
+        Number of paths to sample from the given lattice for rescoring.
+      model:
+        A transformer model. It must provide a `decoder_nll` method accepting
+        `memory`, `memory_key_padding_mask`, `token_ids`, `sos_id` and
+        `eos_id`. See the class "Transformer" in
+        conformer_ctc/transformer.py for its interface.
+      memory:
+        The encoder memory of the given model. It is the output of
+        the last torch.nn.TransformerEncoder layer in the given model.
+        Its shape is `(T, N, C)`.
+      memory_key_padding_mask:
+        The padding mask for memory with shape `(N, T)`. May be None.
+      sos_id:
+        The token ID for SOS.
+      eos_id:
+        The token ID for EOS.
+      attention_scale:
+        Optional. It specifies the scale for attention decoder scores.
+        If None, a built-in list of scales is tried.
+      use_double_scores:
+        Forwarded to `k2.random_paths` as `use_double_scores`.
+    Returns:
+      A dict of FsaVec. Each key is the string `attention_scale_<scale>` and
+      the value is the best decoding path for each utterance in the lattice
+      under that scale.
+    """
+    # path is a ragged tensor with dtype torch.int32.
+    # It has three axes [utt][path][arc_pos]
+    path = k2.random_paths(
+        lattice, num_paths=num_paths, use_double_scores=use_double_scores
+    )
+    # Note that labels, aux_labels and scores contains 0s and -1s.
+    # The last entry in each sublist is -1.
+    # The axes are [path][token_id]
+    labels = k2.ragged.index(lattice.labels.contiguous(), path).remove_axis(0)
+    aux_labels = k2.ragged.index(lattice.aux_labels.contiguous(), path).remove_axis(0)
+    scores = k2.ragged.index(lattice.scores.contiguous(), path).remove_axis(0)
+
+    # Remove -1 from labels as we will use it to construct a linear FSA
+    labels = labels.remove_values_eq(-1)
+    fsa = k2.linear_fsa(labels)
+    fsa.aux_labels = aux_labels.values
+
+    # utt_to_path_shape has axes [utt][path]
+    utt_to_path_shape = path.shape.get_layer(0)
+    # Total lattice score of every sampled path, regrouped per utterance.
+    scores = k2.RaggedTensor(utt_to_path_shape, scores.sum())
+
+    path_to_utt_map = utt_to_path_shape.row_ids(1).to(torch.long)
+    # the shape of memory is (T, N, C), so we use axis=1 here
+    expanded_memory = memory.index_select(1, path_to_utt_map)
+
+    if memory_key_padding_mask is not None:
+        # The shape of memory_key_padding_mask is (N, T), so we
+        # use axis=0 here.
+        expanded_memory_key_padding_mask = memory_key_padding_mask.index_select(
+            0, path_to_utt_map
+        )
+    else:
+        expanded_memory_key_padding_mask = None
+
+    # Drop the 0 and -1 entries so that only real token IDs are left.
+    token_ids = aux_labels.remove_values_leq(0).tolist()
+
+    nll = model.decoder_nll(
+        memory=expanded_memory,
+        memory_key_padding_mask=expanded_memory_key_padding_mask,
+        token_ids=token_ids,
+        sos_id=sos_id,
+        eos_id=eos_id,
+    )
+    assert nll.ndim == 2
+    assert nll.shape[0] == len(token_ids)
+
+    # One score per path: the negated sum of the per-token NLLs.
+    attention_scores = -nll.sum(dim=1)
+
+    if attention_scale is None:
+        attention_scale_list = [0.01, 0.05, 0.08]
+        attention_scale_list += [0.1, 0.3, 0.5, 0.6, 0.7, 0.9, 1.0]
+        attention_scale_list += [1.1, 1.2, 1.3, 1.5, 1.7, 1.9, 2.0]
+        attention_scale_list += [2.1, 2.2, 2.3, 2.5, 3.0, 4.0, 5.0]
+    else:
+        attention_scale_list = [attention_scale]
+
+    ans = dict()
+
+    for a_scale in attention_scale_list:
+        tot_scores = scores.values + a_scale * attention_scores
+        ragged_tot_scores = k2.RaggedTensor(utt_to_path_shape, tot_scores)
+        # Index of the best-scoring path of each utterance.
+        max_indexes = ragged_tot_scores.argmax()
+        best_path = k2.index_fsa(fsa, max_indexes)
+
+        key = f"attention_scale_{a_scale}"
+        ans[key] = best_path
+
+    return ans
+
+
 def rescore_with_attention_decoder_with_ngram(
    lattice: k2.Fsa,
    num_paths: int,
--- a/icefall/utils.py
+++ b/icefall/utils.py
@ -983,7 +983,8 @@ def write_error_stats_with_timestamps(
        hyp_count = corr + hyp_sub + ins

        print(f"{word}   {corr} {tot_errs} {ref_count} {hyp_count}", file=f)
-    return float(tot_err_rate), float(mean_delay), float(var_delay)
+    # return float(tot_err_rate), float(mean_delay), float(var_delay)
+    return float(tot_err_rate), mean_delay, var_delay


 def write_surt_error_stats(