Add memory to model

Daniel Povey 2023-05-01 20:47:09 +08:00
parent 6f5c4688ef
commit fa696e919b
3 changed files with 350 additions and 39 deletions


@@ -26,15 +26,16 @@ from icefall.utils import add_sos, make_pad_mask
 from scaling import penalize_abs_values_gt, ScaledLinear
-class Transducer(nn.Module):
+class PromptedTransducer(nn.Module):
 """It implements https://arxiv.org/pdf/1211.3711.pdf
 "Sequence Transduction with Recurrent Neural Networks"
 """
 def __init__(
 self,
 encoder_embed: nn.Module,
 encoder: EncoderInterface,
+text_embed: nn.Module,
+text_encoder: EncoderInterface,
 decoder: nn.Module,
 joiner: nn.Module,
 encoder_dim: int,
@@ -68,6 +69,8 @@ class Transducer(nn.Module):
 self.encoder_embed = encoder_embed
 self.encoder = encoder
+self.text_embed = text_embed
+self.text_encoder = text_encoder
 self.decoder = decoder
 self.joiner = joiner
@@ -86,6 +89,9 @@ class Transducer(nn.Module):
 self,
 x: torch.Tensor,
 x_lens: torch.Tensor,
+text: torch.Tensor,
+style_lens: torch.Tensor,
+text_lens: torch.Tensor,
 y: k2.RaggedTensor,
 prune_range: int = 5,
 am_scale: float = 0.0,
@@ -98,6 +104,21 @@ class Transducer(nn.Module):
 x_lens:
 A 1-D tensor of shape (N,). It contains the number of frames in `x`
 before padding.
+x_lens:
+A 1-D tensor of shape (N,). It contains the number of frames in `x`
+before padding.
+text:
+A 2-D tensor of integer dtype containing prompt text, of shape (N, T).
+It is expected to contain the style prompt (first) and then the content
+prompt.
+style_lens:
+A 1-D tensor of shape (N,), containing the number of elements (bytes)
+within each row of `text` that correspond to the style prompt (these
+are expected to come first).
+text_lens:
+A 1-D tensor of shape (N,). It contains the number of elements (bytes)
+in `text` before padding, which will include the lengths of the
+style plus the content prompt.
 y:
 A ragged tensor with 2 axes [utt][label]. It contains labels of each
 utterance.
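
For concreteness, the `text`, `style_lens` and `text_lens` tensors described above could be built along the following lines. This is only an illustrative sketch (the helper name is made up, not part of this commit); it assumes the style and content prompts arrive as Python strings and are zero-padded to a common length:

    import torch

    def make_prompt_tensors(style_prompts, content_prompts):
        # Encode each (style, content) pair as UTF-8 bytes; the style prompt
        # comes first, matching the layout the docstring above describes.
        rows, style_lens = [], []
        for style, content in zip(style_prompts, content_prompts):
            s = list(style.encode("utf-8"))
            c = list(content.encode("utf-8"))
            rows.append(s + c)
            style_lens.append(len(s))
        text_lens = [len(r) for r in rows]
        # Zero-pad every row to the longest prompt in the batch -> shape (N, T).
        text = torch.zeros(len(rows), max(text_lens), dtype=torch.int64)
        for i, r in enumerate(rows):
            text[i, : len(r)] = torch.tensor(r, dtype=torch.int64)
        return text, torch.tensor(style_lens), torch.tensor(text_lens)
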
@@ -125,14 +146,25 @@ class Transducer(nn.Module):
 assert x.size(0) == x_lens.size(0) == y.dim0
+# logging.info(f"Memory allocated at entry: {torch.cuda.memory_allocated() // 1000000}M")
 x, x_lens = self.encoder_embed(x, x_lens)
+# logging.info(f"Memory allocated after encoder_embed: {torch.cuda.memory_allocated() // 1000000}M")
 src_key_padding_mask = make_pad_mask(x_lens)
 x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C)
-encoder_out, x_lens = self.encoder(x, x_lens, src_key_padding_mask)
+text = text.t() # now (T, N)
+text = self.text_embed(text) # now (T, N, C)
+text_key_padding_mask = make_pad_mask(text_lens)
+memory, text_lens = self.text_encoder(text, text_lens,
+text_key_padding_mask)
+memory = self._add_style_indicator(memory, style_lens)
+memory_key_padding_mask = make_pad_mask(text_lens)
+encoder_out, x_lens = self.encoder(x, x_lens, src_key_padding_mask,
+memory=memory,
+memory_key_padding_mask=memory_key_padding_mask)
 encoder_out = encoder_out.permute(1, 0, 2) # (T, N, C) -> (N, T, C)
 assert torch.all(x_lens > 0)
@@ -217,3 +249,24 @@ class Transducer(nn.Module):
 )
 return (simple_loss, pruned_loss)
+def _add_style_indicator(self, memory: Tensor, style_lens: Tensor):
+"""
+Adds to `memory` an indicator that is 0.1 for positions that correspond to
+the `style prompt` and 0 elsewhere. The scale can be fixed because the
+scale of the memory vector can adjust to compensate (within limits set
+by the balancers).
+Args:
+memory: (memory_len, batch_size, embed_dim)
+style_lens: (batch_size,), a vector of lengths of the style prompt.
+"""
+(memory_len, batch_size, embed_dim) = memory.shape
+indicator = torch.arange(memory_len, device=memory.device).unsqueeze(-1) < style_lens
+indicator = indicator.to(memory.dtype).unsqueeze(-1)
+return memory + indicator
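
As a quick, standalone illustration of the indicator logic in `_add_style_indicator` (not part of the diff): with memory_len = 5 and style_lens = [2, 4], the comparison marks exactly the first style_lens[b] memory positions of each utterance, and the trailing unsqueeze lets the result broadcast against (memory_len, batch_size, embed_dim):

    import torch

    memory_len = 5
    style_lens = torch.tensor([2, 4])
    indicator = torch.arange(memory_len).unsqueeze(-1) < style_lens  # (memory_len, batch_size)
    print(indicator.t())
    # tensor([[ True,  True, False, False, False],
    #         [ True,  True,  True,  True, False]])
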


@@ -71,7 +71,9 @@ from lhotse.utils import fix_random_seed
 from model import Transducer
 from optim import Eden, ScaledAdam
 from torch import Tensor
+from torch import nn
 from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter
@@ -159,6 +161,14 @@ def add_model_arguments(parser: argparse.ArgumentParser):
 help="Embedding dimension in encoder stacks: a single int or comma-separated list."
 )
+parser.add_argument(
+"--text-encoder-dim",
+type=str,
+default="256,256,384,512",
+help="Embedding dimension in text encoder stacks: a comma-separated list of 4 elements, "
+"or you should change other configs in the code."
+)
 parser.add_argument(
 "--query-head-dim",
 type=str,
@@ -547,6 +557,32 @@ def get_encoder_embed(params: AttributeDict) -> nn.Module:
 return encoder_embed
+def get_text_embed(params: AttributeDict) -> nn.Module:
+return nn.Embedding(
+num_embeddings=256, # we encode the text as UTF-8 bytes
+embedding_dim=_to_int_tuple(params.text_encoder_dim)[0],
+)
+def get_text_encoder(params: AttributeDict) -> nn.Module:
+return Zipformer2(
+output_downsampling_factor=8,
+downsampling_factor=(1,2,4,8),
+num_encoder_layers=(2,4,6,6),
+encoder_dim=_to_int_tuple(params.text_encoder_dim),
+encoder_unmasked_dim=(196,196,256,256),
+query_head_dim=(32,32,32,32),
+pos_head_dim=(4,4,4,4),
+value_head_dim=(12,12,12,12),
+pos_dim=48,
+num_heads=(4,4,4,8),
+feedforward_dim=(384,512,768,1024), # could increase this if there is enough data
+cnn_module_kernel=(31,31,15,15),
+dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)),
+warmup_batches=4000.0,
+causal=False,
+)
 def get_encoder_model(params: AttributeDict) -> nn.Module:
 encoder = Zipformer2(
 output_downsampling_factor=2,
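
The comma-separated `--text-encoder-dim` string is parsed per stack by the recipe's `_to_int_tuple` helper; a minimal equivalent, shown here only to make the parsing explicit (assuming the usual icefall-style definition), is:

    def _to_int_tuple(s: str):
        # "256,256,384,512" -> (256, 256, 384, 512), one entry per text-encoder stack
        return tuple(map(int, s.split(",")))

    # get_text_embed() uses the first entry as the byte-embedding dimension,
    # get_text_encoder() passes the whole tuple as encoder_dim, and
    # get_encoder_model() feeds the last entry in as memory_dim (see below).
    assert _to_int_tuple("256,256,384,512") == (256, 256, 384, 512)
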
@@ -566,6 +602,7 @@ def get_encoder_model(params: AttributeDict) -> nn.Module:
 causal=params.causal,
 chunk_size=_to_int_tuple(params.chunk_size),
 left_context_frames=_to_int_tuple(params.left_context_frames),
+memory_dim=_to_int_tuple(params.text_encoder_dim)[-1],
 )
 return encoder
@@ -593,12 +630,16 @@ def get_joiner_model(params: AttributeDict) -> nn.Module:
 def get_transducer_model(params: AttributeDict) -> nn.Module:
 encoder_embed = get_encoder_embed(params)
 encoder = get_encoder_model(params)
+text_embed = get_text_embed(params)
+text_encoder = get_text_encoder(params)
 decoder = get_decoder_model(params)
 joiner = get_joiner_model(params)
 model = Transducer(
 encoder_embed=encoder_embed,
 encoder=encoder,
+text_embed=text_embed,
+text_encoder=text_encoder,
 decoder=decoder,
 joiner=joiner,
 encoder_dim=int(max(params.encoder_dim.split(','))),


@@ -89,6 +89,9 @@ class Zipformer2(EncoderInterface):
 context chunks for causal training; will be rounded to a number of
 chunks. Must not be less than cnn_module_kernel (after factoring in
 rounding and downsampling); an error will be thrown if this is violated.
+memory_dim: if supplied and >0, will be the dimension of the memory embeddings
+passed into the zipformer (e.g. this might be the output of another
+Zipformer used to create embedding vectors.)
 """
 def __init__(
 self,
@@ -103,6 +106,7 @@ class Zipformer2(EncoderInterface):
 num_heads: Union[int, Tuple[int]] = 8,
 feedforward_dim: Union[int, Tuple[int]] = 1536,
 cnn_module_kernel: Union[int, Tuple[int]] = 31,
+memory_dim: int = -1,
 pos_dim: int = 192,
 dropout: FloatLike = None, # see code below for default
 warmup_batches: float = 4000.0,
@@ -160,6 +164,7 @@ class Zipformer2(EncoderInterface):
 pos_head_dim=pos_head_dim[i],
 value_head_dim=value_head_dim[i],
 feedforward_dim=feedforward_dim[i],
+memory_dim=memory_dim,
 dropout=dropout,
 cnn_module_kernel=cnn_module_kernel[i],
 causal=causal,
@@ -271,9 +276,12 @@ class Zipformer2(EncoderInterface):
 def forward(
-self, x: torch.Tensor,
+self,
+x: torch.Tensor,
 x_lens: torch.Tensor,
 src_key_padding_mask: Optional[torch.Tensor] = None,
+memory: Optional[Tensor] = None,
+memory_key_padding_mask: Optional[Tensor] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
 """
 Args:
@@ -284,7 +292,12 @@ class Zipformer2(EncoderInterface):
 `x` before padding.
 src_key_padding_mask:
 The mask for padding, of shape (batch_size, seq_len); True means
 masked position. May be None.
+memory: optionally, the memory embeddings of shape (memory_len, batch_size, memory_dim)
+memory_key_padding_mask: optionally the mask for padding of memory input (for source-
+attention), of shape (batch_size, memory_len); True means
+masked position. May be None.
 Returns:
 Return a tuple containing 2 tensors:
 - embeddings: its shape is (batch_size, output_seq_len, max(encoder_dim))
@@ -298,6 +311,13 @@ class Zipformer2(EncoderInterface):
 attn_mask = self._get_attn_mask(x, chunk_size, left_context_chunks)
+if self.training and memory is not None:
+batch_size = x.shape[1]
+# setting memory to zero should be equivalent to not using the
+# memory input at all, since the Attention module has no biases.
+memory_dropout_rate = 0.05
+memory = memory * (torch.rand(batch_size, 1, device=memory.device) > memory_dropout_rate)
 for i, module in enumerate(self.encoders):
 ds = self.downsampling_factor[i]
 x = convert_num_channels(x, self.encoder_dim[i])
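
The per-utterance memory dropout above relies on broadcasting: a (batch_size, 1) mask multiplied into a (memory_len, batch_size, embed_dim) tensor zeroes out the entire memory sequence for the dropped utterances. A small self-contained check (illustrative only):

    import torch

    memory_len, batch_size, embed_dim = 7, 4, 2
    memory = torch.randn(memory_len, batch_size, embed_dim)
    keep = torch.rand(batch_size, 1) > 0.05   # True = keep this utterance's memory
    dropped = memory * keep                   # broadcasts over memory_len and embed_dim
    for b in range(batch_size):
        assert keep[b, 0] or torch.all(dropped[:, b, :] == 0)
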
@@ -308,6 +328,8 @@ class Zipformer2(EncoderInterface):
 src_key_padding_mask=(None if src_key_padding_mask is None
 else src_key_padding_mask[...,::ds]),
 attn_mask=attn_mask,
+memory=memory,
+memory_key_padding_mask=memory_key_padding_mask,
 )
 outputs.append(x)
@@ -416,6 +438,7 @@ class Zipformer2EncoderLayer(nn.Module):
 dropout: FloatLike = 0.1,
 cnn_module_kernel: int = 31,
 causal: bool = False,
+memory_dim: int = -1,
 attention_skip_rate: FloatLike = ScheduledFloat((0.0, 0.2), (4000.0, 0.05), (16000, 0.0), default=0),
 conv_skip_rate: FloatLike = ScheduledFloat((0.0, 0.2), (4000.0, 0.05), (16000, 0.0), default=0),
 const_attention_rate: FloatLike = ScheduledFloat((0.0, 0.25), (4000.0, 0.025), default=0),
@@ -452,11 +475,25 @@ class Zipformer2EncoderLayer(nn.Module):
 dropout=0.0,
 )
-self.self_attn1 = SelfAttention(embed_dim, num_heads,
-value_head_dim)
+self.self_attn1 = Attention(embed_dim, embed_dim, num_heads,
+value_head_dim)
-self.self_attn2 = SelfAttention(embed_dim, num_heads,
-value_head_dim)
+self.self_attn2 = Attention(embed_dim, embed_dim, num_heads,
+value_head_dim)
+if memory_dim > 0:
+self.attn_weights = MultiheadAttentionWeights(
+memory_dim, embed_dim,
+num_heads=num_heads,
+head_dim=query_head_dim,
+dropout=0.0,
+)
+self.src_attn1 = Attention(memory_dim, embed_dim, num_heads,
+value_head_dim)
+self.src_attn2 = Attention(memory_dim, embed_dim, num_heads,
+value_head_dim)
 self.feed_forward1 = FeedforwardModule(embed_dim,
 (feedforward_dim * 3) // 4,
@@ -579,6 +616,8 @@ class Zipformer2EncoderLayer(nn.Module):
 chunk_size: int = -1,
 attn_mask: Optional[Tensor] = None,
 src_key_padding_mask: Optional[Tensor] = None,
+memory: Optional[Tensor] = None,
+memory_key_padding_mask: Optional[Tensor] = None,
 ) -> Tensor:
 """
 Pass the input through the encoder layer.
@@ -608,11 +647,14 @@ class Zipformer2EncoderLayer(nn.Module):
 pos_emb=pos_emb,
 attn_mask=attn_mask,
 key_padding_mask=src_key_padding_mask,
 )
+if memory is not None and hasattr(self, 'attn_weights'):
+src_attn_weights = self.attn_weights(memory, src, memory_key_padding_mask)
 src = src + self.feed_forward1(src)
-self_attn_dropout_mask = self.get_sequence_dropout_mask(src, attention_skip_rate)
+attn_dropout_mask = self.get_sequence_dropout_mask(src, attention_skip_rate)
 if True:
 selected_attn_weights = attn_weights[0:2]
@@ -630,12 +672,16 @@ class Zipformer2EncoderLayer(nn.Module):
 na = self.balancer_na(self.nonlin_attention(src,
 selected_attn_weights[0:1]))
-src = src + (na if self_attn_dropout_mask is None else na * self_attn_dropout_mask)
+src = src + (na if attn_dropout_mask is None else na * attn_dropout_mask)
 self_attn = self.self_attn1(
 src, attn_weights)
-src = src + (self_attn if self_attn_dropout_mask is None else self_attn * self_attn_dropout_mask)
+src = src + (self_attn if attn_dropout_mask is None else self_attn * attn_dropout_mask)
+if memory is not None and hasattr(self, 'attn_weights'):
+src = src + self.sequence_dropout(self.src_attn1(memory, src_attn_weights),
+attention_skip_rate)
 src = src + self.sequence_dropout(self.conv_module1(src, chunk_size=chunk_size,
 src_key_padding_mask=src_key_padding_mask),
@@ -650,7 +696,11 @@ class Zipformer2EncoderLayer(nn.Module):
 self_attn = self.self_attn2(
 src, attn_weights)
-src = src + (self_attn if self_attn_dropout_mask is None else self_attn * self_attn_dropout_mask)
+src = src + (self_attn if attn_dropout_mask is None else self_attn * attn_dropout_mask)
+if memory is not None and hasattr(self, 'attn_weights'):
+src = src + self.sequence_dropout(self.src_attn2(memory, src_attn_weights),
+attention_skip_rate)
 src = src + self.sequence_dropout(self.conv_module2(src, chunk_size=chunk_size,
 src_key_padding_mask=src_key_padding_mask),
@@ -673,9 +723,9 @@ class Zipformer2Encoder(nn.Module):
 r"""Zipformer2Encoder is a stack of N encoder layers
 Args:
 encoder_layer: an instance of the Zipformer2EncoderLayer() class (required).
 num_layers: the number of sub-encoder-layers in the encoder (required).
 pos_dim: the dimension for the relative positional encoding
 Examples::
 >>> encoder_layer = Zipformer2EncoderLayer(embed_dim=512, nhead=8)
@@ -721,6 +771,8 @@ class Zipformer2Encoder(nn.Module):
 feature_mask: Union[Tensor, float] = 1.0,
 attn_mask: Optional[Tensor] = None,
 src_key_padding_mask: Optional[Tensor] = None,
+memory: Optional[Tensor] = None,
+memory_key_padding_mask: Optional[Tensor] = None,
 ) -> Tensor:
 r"""Pass the input through the encoder layers in turn.
@@ -734,6 +786,10 @@ class Zipformer2Encoder(nn.Module):
 True means masked position. May be None.
 src_key_padding_mask: the mask for padding, of shape (batch_size, seq_len); True means
 masked position. May be None.
+memory: optionally, the memory embeddings of shape (memory_len, batch_size, memory_dim)
+memory_key_padding_mask: optionally the mask for padding of memory input (for source-
+attention), of shape (batch_size, memory_len); True means
+masked position. May be None.
 Returns: a Tensor with the same shape as src.
 """
@@ -751,6 +807,8 @@ class Zipformer2Encoder(nn.Module):
 chunk_size=chunk_size,
 attn_mask=attn_mask,
 src_key_padding_mask=src_key_padding_mask,
+memory=memory,
+memory_key_padding_mask=memory_key_padding_mask,
 )
 output = output * feature_mask
@@ -843,6 +901,8 @@ class DownsampledZipformer2Encoder(nn.Module):
 feature_mask: Union[Tensor, float] = 1.0,
 attn_mask: Optional[Tensor] = None,
 src_key_padding_mask: Optional[Tensor] = None,
+memory: Optional[Tensor] = None,
+memory_key_padding_mask: Optional[Tensor] = None,
 ) -> Tuple[Tensor, Tensor]:
 r"""Downsample, go through encoder, upsample.
@@ -855,6 +915,10 @@ class DownsampledZipformer2Encoder(nn.Module):
 True means masked position. May be None.
 src_key_padding_mask: the mask for padding, of shape (batch_size, seq_len); True means
 masked position. May be None.
+memory: optionally, the memory embeddings of shape (memory_len, batch_size, memory_dim)
+memory_key_padding_mask: optionally the mask for padding of memory input (for source-
+attention), of shape (batch_size, memory_len); True means
+masked position. May be None.
 Returns: a Tensor with the same shape as src.
 """
@@ -870,6 +934,8 @@ class DownsampledZipformer2Encoder(nn.Module):
 feature_mask=feature_mask,
 attn_mask=attn_mask,
 src_key_padding_mask=src_key_padding_mask,
+memory=memory,
+memory_key_padding_mask=memory_key_padding_mask,
 )
 src = self.upsample(src)
 # remove any extra frames that are not a multiple of downsample_factor
@@ -1185,7 +1251,6 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
 query_dim = query_head_dim * num_heads
-# self-attention
 q = x[...,0:query_dim]
 k = x[...,query_dim:2*query_dim]
 # p is the position-encoding query
@@ -1231,9 +1296,9 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
 attn_scores = attn_scores + pos_scores
 if self.training and random.random() < 0.1:
-# This is a harder way of limiting the attention scores to not be
+# This is a way of limiting the attention scores to not be
 # too large. It incurs a penalty if any of them has an absolute
-# value greater than 50.0. this should be outside the normal range
+# value greater than 25.0. this should be outside the normal range
 # of the attention scores. We use this mechanism instead of, say,
 # something added to the loss function involving the entropy,
 # because once the entropy gets very small gradients through the
@@ -1295,29 +1360,31 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
 logging.info(f"name={self.name}, attn_weights_entropy = {attn_weights_entropy}")
-class SelfAttention(nn.Module):
+class Attention(nn.Module):
 """
 The simplest possible attention module. This one works with already-computed attention
 weights, e.g. as computed by RelPositionMultiheadAttentionWeights.
 Args:
-embed_dim: the input and output embedding dimension
+embed_dim_in: the input embedding dimension
+embed_dim_out: the output embedding dimension (normally the same as input)
 num_heads: the number of attention heads
 value_head_dim: the value dimension per head
 """
 def __init__(
 self,
-embed_dim: int,
+embed_dim_in: int,
+embed_dim_out: int,
 num_heads: int,
 value_head_dim: int,
 ) -> None:
 super().__init__()
-self.in_proj = nn.Linear(embed_dim,
+self.in_proj = nn.Linear(embed_dim_in,
 num_heads * value_head_dim,
-bias=True)
+bias=False)
 self.out_proj = ScaledLinear(num_heads * value_head_dim,
-embed_dim, bias=True,
+embed_dim_out, bias=False,
 initial_scale=0.05)
 self.whiten = Whiten(num_groups=1,
@@ -1334,35 +1401,182 @@ class SelfAttention(nn.Module):
 """
 Args:
 x: input tensor, of shape (seq_len, batch_size, embed_dim)
-attn_weights: a tensor of shape (num_heads, batch_size, seq_len, seq_len),
-with seq_len being interpreted as (tgt_seq_len, src_seq_len). Expect
-attn_weights.sum(dim=-1) == 1.
+attn_weights: a tensor of shape (num_heads, batch_size, query_len, key_len),
+Expect attn_weights.sum(dim=-1) == 1.
 Returns:
 a tensor with the same shape as x.
 """
-(seq_len, batch_size, embed_dim) = x.shape
-num_heads = attn_weights.shape[0]
-assert attn_weights.shape == (num_heads, batch_size, seq_len, seq_len)
-x = self.in_proj(x) # (seq_len, batch_size, num_heads * value_head_dim)
-x = x.reshape(seq_len, batch_size, num_heads, -1).permute(2, 1, 0, 3)
-# now x: (num_heads, batch_size, seq_len, value_head_dim)
+(num_heads, batch_size, query_len, key_len) = attn_weights.shape
+x = self.in_proj(x) # (key_len, batch_size, num_heads * value_head_dim)
+x = x.reshape(key_len, batch_size, num_heads, -1).permute(2, 1, 0, 3)
+# now x: (num_heads, batch_size, key_len, value_head_dim)
 value_head_dim = x.shape[-1]
 # todo: see whether there is benefit in overriding matmul
 x = torch.matmul(attn_weights, x)
-# v: (num_heads, batch_size, seq_len, value_head_dim)
+# v: (num_heads, batch_size, query_len, value_head_dim)
 x = x.permute(2, 1, 0, 3).contiguous().view(
-seq_len, batch_size, num_heads * value_head_dim)
+query_len, batch_size, num_heads * value_head_dim)
-# returned value is of shape (seq_len, batch_size, embed_dim), like the input.
+# returned value is of shape (query_len, batch_size, embed_dim), like the input.
 x = self.out_proj(x)
 x = self.whiten(x)
 return x
+class MultiheadAttentionWeights(nn.Module):
+r"""Module that computes multi-head cross-attention weights. Allows src and target
+to have different dims.
+Args:
+key_embed_dim: number of channels of the thing that we'll project to
+make the key (corresponds to source). e.g. 256
+query_embed_dim: number of channels of the thing that we'll project to
+make the query (corresponds to target). e.g. 256
+num_heads: number of heads to compute weights for, e.g. 8
+head_dim: dimension of the query and key, per head. e.g. 24.
+dropout: dropout probability for attn_output_weights. Default: 0.0.
+"""
+def __init__(
+self,
+key_embed_dim: int,
+query_embed_dim: int,
+num_heads: int,
+head_dim: int,
+dropout: float = 0.0,
+) -> None:
+super().__init__()
+self.key_embed_dim = key_embed_dim
+self.query_embed_dim = query_embed_dim
+self.num_heads = num_heads
+self.head_dim = head_dim
+self.dropout = dropout
+self.name = None # will be overwritten in training code; for diagnostics.
+# the initial_scale is supposed to take over the "scaling" factor of
+# head_dim ** -0.5 that has been used in previous forms of attention,
+# dividing it between the query and key. Note: this module is intended
+# to be used with the ScaledAdam optimizer; with most other optimizers,
+# it would be necessary to apply the scaling factor in the forward function.
+self.query_in_proj = ScaledLinear(query_embed_dim,
+head_dim * num_heads,
+bias=True,
+initial_scale=head_dim ** -0.25)
+# weights produced by this module are invariant to adding a constant to
+# the keys, so we don't need a bias for the keys.
+self.key_in_proj = ScaledLinear(key_embed_dim,
+head_dim * num_heads,
+bias=False,
+initial_scale=head_dim ** -0.25)
+self.whiten_keys = Whiten(num_groups=num_heads,
+whitening_limit=_whitening_schedule(3.0),
+prob=(0.025, 0.25),
+grad_scale=0.025)
+def forward(
+self,
+key: Tensor,
+query: Tensor,
+key_padding_mask: Optional[Tensor] = None,
+) -> Tensor:
+r"""
+Args:
+key: input of shape (key_len, batch_size, key_embed_dim)
+query: input of shape (query_len, batch_size, query_embed_dim)
+key_padding_mask: an optional bool tensor of shape (batch_size, key_len). Positions that
+are True in this mask will be ignored as sources in the attention weighting.
+Returns:
+a tensor of attention weights, of shape (num_heads, batch_size, query_len, key_len)
+"""
+q = self.query_in_proj(query)
+k = self.key_in_proj(key)
+head_dim = self.head_dim
+num_heads = self.num_heads
+query_len, batch_size, _ = q.shape
+key_len, _batch_size, _ = k.shape
+assert _batch_size == batch_size
+k = self.whiten_keys(k) # does nothing in the forward pass.
+q = q.reshape(query_len, batch_size, num_heads, head_dim)
+k = k.reshape(key_len, batch_size, num_heads, head_dim)
+# time1 refers to target, time2 refers to source.
+q = q.permute(2, 1, 0, 3) # (head, batch, time1, query_head_dim)
+k = k.permute(2, 1, 3, 0) # (head, batch, d_k, time2)
+attn_scores = torch.matmul(q, k)
+if self.training and random.random() < 0.1:
+# This is a way of limiting the attention scores to not be
+# too large. It incurs a penalty if any of them has an absolute
+# value greater than 25.0. this should be outside the normal range
+# of the attention scores. We use this mechanism instead of, say,
+# something added to the loss function involving the entropy,
+# because once the entropy gets very small gradients through the
+# softmax can become very small, and we'd get zero derivatives. The
+# choice of 1.0e-04 as the scale on the penalty makes this
+# mechanism vulnerable to the absolute scale of the loss function,
+# but we view this as a failsafe to avoid "implausible" parameter
+# values rather than a regularization method that should be active
+# under normal circumstances.
+attn_scores = penalize_abs_values_gt(attn_scores,
+limit=25.0,
+penalty=1.0e-04,
+name=self.name)
+assert attn_scores.shape == (num_heads, batch_size, query_len, key_len)
+if key_padding_mask is not None:
+assert key_padding_mask.shape == (batch_size, key_len), key_padding_mask.shape
+attn_scores = attn_scores.masked_fill(
+key_padding_mask.unsqueeze(1),
+-1000,
+)
+# We use our own version of softmax, defined in scaling.py, which should
+# save a little of the memory used in backprop if we are in
+# automatic mixed precision mode (amp / autocast), by only storing the
+# half-precision output for backprop purposes.
+attn_weights = softmax(attn_scores, dim=-1)
+if random.random() < 0.001:
+self._print_attn_entropy(attn_weights)
+attn_weights = nn.functional.dropout(
+attn_weights, p=self.dropout, training=self.training
+)
+return attn_weights
+def _print_attn_entropy(
+self,
+attn_weights: Tensor):
+# attn_weights: (num_heads, batch_size, seq_len, seq_len)
+(num_heads, batch_size, seq_len, seq_len) = attn_weights.shape
+with torch.no_grad():
+with torch.cuda.amp.autocast(enabled=False):
+attn_weights = attn_weights.to(torch.float32)
+attn_weights_entropy = -((attn_weights + 1.0e-20).log() * attn_weights).sum(
+dim=-1).mean(dim=(1,2))
+logging.info(f"name={self.name}, attn_weights_entropy = {attn_weights_entropy}")
 class FeedforwardModule(nn.Module):
 """Feedforward module in Zipformer2 model.
 """
@@ -1650,12 +1864,14 @@ def _test_zipformer_main(causal: bool = False):
 batch_size = 5
 seq_len = 20
 # Just make sure the forward pass runs.
+memory_dim = 100
 c = Zipformer2(
 encoder_dim=(64, 96), encoder_unmasked_dim=(48, 64), num_heads=(4, 4),
 causal=causal,
 chunk_size=(4,) if causal else (-1,),
-left_context_frames=(64,)
+left_context_frames=(64,),
+memory_dim=memory_dim,
 )
 batch_size = 5
 seq_len = 20
@@ -1663,6 +1879,7 @@ def _test_zipformer_main(causal: bool = False):
 f = c(
 torch.randn(seq_len, batch_size, 64),
 torch.full((batch_size,), seq_len, dtype=torch.int64),
+memory=torch.randn(101, batch_size, memory_dim),
 )
 f[0].sum().backward()
 c.eval()