mirror of https://github.com/k2-fsa/icefall.git, synced 2025-08-13 12:02:21 +00:00

Fix bugs

This commit is contained in:
parent c6c3750cab
commit a75f75bbad
@@ -16,19 +16,23 @@
# limitations under the License.

import copy
import random
import math
import warnings
from typing import Optional, Tuple
from typing import Optional, Tuple, List
import torch_flow_sampling

import torch
from torch import Tensor, nn
from subsampling import Conv2dSubsampling, VggSubsampling
from transformer import Supervisions, TransformerEncoderLayer, TransformerDecoderLayer, encoder_padding_mask, \
LabelSmoothingLoss, PositionalEncoding, pad_sequence, add_sos, add_eos, decoder_padding_mask
LabelSmoothingLoss, PositionalEncoding, pad_sequence, add_sos, add_eos, decoder_padding_mask, \
generate_square_subsequent_mask


class ConformerTrunk(nn.Module):
def __init__(num_features: int,
def __init__(self,
num_features: int,
subsampling_factor: int = 4,
d_model: int = 256,
nhead: int = 4,
@@ -37,6 +41,7 @@ class ConformerTrunk(nn.Module):
dropout: float = 0.1,
cnn_module_kernel: int = 31,
use_feat_batchnorm: bool = True) -> None:
super(ConformerTrunk, self).__init__()
if use_feat_batchnorm:
self.feat_batchnorm = nn.BatchNorm1d(num_features)

@@ -62,12 +67,11 @@ class ConformerTrunk(nn.Module):
cnn_module_kernel,
)

self.encoder = ConformerEncoder(encoder_layer, num_encoder_layers)

self.encoder = ConformerEncoder(encoder_layer, num_layers)

def forward(
self, x: torch.Tensor, supervision: Optional[Supervisions] = None
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
"""
Args:
x:
@@ -78,12 +82,10 @@ class ConformerTrunk(nn.Module):
(CAUTION: It contains length information, i.e., start and number of
frames, before subsampling)

Returns:
Return a tuple containing 2 tensors:
- Encoder output with shape [T, N, C]. It can be used as key and
value for the decoder.
- Encoder output padding mask. It can be used as
memory_key_padding_mask for the decoder. Its shape is [N, T].
Return (output, pos_emb, mask), where:
output: The output embedding, of shape (T, N, C).
pos_emb: The positional embedding (this will be used by ctc_encoder forward).
mask: The output padding mask, a Tensor of bool, of shape [N, T].
It is None if `supervision` is None.
"""
if hasattr(self, 'feat_batchnorm'):
@@ -92,13 +94,15 @@ class ConformerTrunk(nn.Module):
x = x.permute(0, 2, 1) # [N, C, T] -> [N, T, C]

x = self.feat_embed(x)
x = self.encoder_pos(x)
x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C)
mask = encoder_padding_mask(x.size(0), supervisions)
mask = mask.to(x.device) if mask is not None else None
x = self.encoder(x, key_padding_mask=mask) # (T, N, C)

return x, mask
x, pos_emb = self.encoder_pos(x)
x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C)
mask = encoder_padding_mask(x.size(0), supervision)

mask = mask.to(x.device) if mask is not None else None
x = self.encoder(x, pos_emb=pos_emb, key_padding_mask=mask) # (T, N, C)

return x, pos_emb, mask

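# Illustrative usage sketch (not part of the diff above). It assumes this file is
# importable as `conformer` and that the constructor defaults shown above apply;
# shapes follow the docstring of the new ConformerTrunk.forward().
import torch
from conformer import ConformerTrunk  # assumed module name

trunk = ConformerTrunk(num_features=40)   # other arguments left at their defaults
feats = torch.randn(8, 100, 40)           # (N, T, C) acoustic features
output, pos_emb, mask = trunk(feats)      # supervision=None, so mask is None
# output: (T', N, d_model) after subsampling; pos_emb is the relative positional
# embedding to hand to the CTC encoder; mask is None because no supervision was given.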
class BidirectionalConformer(nn.Module):
@@ -178,25 +182,26 @@ class BidirectionalConformer(nn.Module):
in the discrete bottleneck
"""
def __init__(
num_features: int,
num_classes: int,
subsampling_factor: int = 4,
d_model: int = 256,
nhead: int = 4,
dim_feedforward: int = 2048,
num_trunk_encoder_layers: int = 12,
num_ctc_encoder_layers: int = 4,
num_decoder_layers: int = 6,
num_reverse_encoder_layers: int = 4,
num_reverse_decoder_layers: int = 4,
num_self_predictor_layers: int = 3,
bypass_bottleneck: bool = True,
dropout: float = 0.1,
cnn_module_kernel: int = 31,
is_bpe: bool = False,
use_feat_batchnorm: bool = True,
discrete_bottleneck_tot_classes: int = 512,
discrete_bottleneck_num_groups: int = 4
self,
num_features: int,
num_classes: int,
subsampling_factor: int = 4,
d_model: int = 256,
nhead: int = 4,
dim_feedforward: int = 2048,
num_trunk_encoder_layers: int = 12,
num_ctc_encoder_layers: int = 4,
num_decoder_layers: int = 6,
num_reverse_encoder_layers: int = 4,
num_reverse_decoder_layers: int = 4,
num_self_predictor_layers: int = 3,
bypass_bottleneck: bool = True,
dropout: float = 0.1,
cnn_module_kernel: int = 31,
is_bpe: bool = False,
use_feat_batchnorm: bool = True,
discrete_bottleneck_tot_classes: int = 512,
discrete_bottleneck_num_groups: int = 4
) -> None:
super(BidirectionalConformer, self).__init__()

@@ -236,7 +241,7 @@ class BidirectionalConformer(nn.Module):
self.token_embed_scale = d_model ** 0.5
self.token_embed = nn.Embedding(
num_embeddings=self.decoder_num_class, embedding_dim=d_model,
_weight=torch.randn(self.num_classes, d_model) * (1 / self.embed_scale)
_weight=torch.randn(self.decoder_num_class, d_model) * (1 / self.token_embed_scale)
)

decoder_layer = TransformerDecoderLayer(
@@ -315,8 +320,7 @@ class BidirectionalConformer(nn.Module):
if num_self_predictor_layers > 0:
encoder_layer = SimpleCausalEncoderLayer(d_model,
dropout=dropout)
self.self_predictor_encoder = simple_causal_encoder(encoder_layer,
num_self_predictor_layers)
self.self_predictor_encoder = encoder_layer


self.discrete_bottleneck = DiscreteBottleneck(
@@ -326,8 +330,8 @@ class BidirectionalConformer(nn.Module):


def forward(self, x: Tensor, supervision: Optional[Supervisions],
need_softmax: bool = True) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
def forward(self, x: Tensor, supervision: Optional[Supervisions] = None,
need_softmax: bool = True) -> Tuple[Tensor, Tensor, Tensor, Tensor, Optional[Tensor]]:
"""
Forward function that "encodes" the features.

@@ -343,26 +347,29 @@ class BidirectionalConformer(nn.Module):
If true, the last output ("softmax") will be computed. This can be useful
in the reverse model, but only necessary if straight_through_scale != 1.0.

Returns: (memory, bn_memory, sampled, softmax, key_padding_mask), where:
Returns: (memory, bn_memory, pos_emb, sampled, softmax, key_padding_mask), where:

`memory` is a Tensor of shape [T, N, E] i.e. [T, batch_size, embedding_dim] where T
is actually a subsampled form of the num_frames of the input `x`.
If self.bypass_bottleneck, it will be taken before the discrete
bottleneck; otherwise, from after.
`bn_memory` is the same shape as `memory`, but comes after the discrete bottleneck
regardless of the value of self.bypass_bottleneck.
`sampled` is a Tensor of shape [T, N, C] where C corresponds to `discrete_bottleneck_tot_classes`
as given to the constructor. This will be needed for the 'reverse' model.
`softmax` is a "soft" version of `sampled`. Will only be returned if need_softmax == True;
memory: a Tensor of shape [T, N, E] i.e. [T, batch_size, embedding_dim] where T
is actually a subsampled form of the num_frames of the input `x`.
If self.bypass_bottleneck, it will be taken before the discrete
bottleneck; otherwise, from after.
bn_memory: The same shape as `memory`, but comes after the discrete bottleneck
regardless of the value of self.bypass_bottleneck.
pos_emb: The relative positional embedding; will be given to ctc_encoder_forward()
sampled: a Tensor of shape [T, N, C] where C corresponds to `discrete_bottleneck_tot_classes`
as given to the constructor. This will be needed for the 'reverse' model.
softmax: a "soft" version of `sampled`. Will only be returned if need_softmax == True;
else will be None.
key_padding_mask: The padding mask for the "memory" output, a Tensor of bool of
shape [N, T] (only if supervision was supplied, else None).
"""
encoder_output, memory_key_padding_mask = self.trunk(x, supervision)
encoder_output, pos_emb, memory_key_padding_mask = self.trunk(x, supervision)

bn_memory, sampled, softmax = self.discrete_bottleneck(encoder_output)

memory = encoder_output if self.bypass_bottleneck else bn_memory

return (memory, bn_memory, sampled, softmax, memory_key_padding_mask)
return (memory, bn_memory, pos_emb, sampled, softmax, memory_key_padding_mask)

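# Illustrative sketch (not part of the diff above) of how the six return values of
# the new BidirectionalConformer.forward() are consumed; it mirrors the test code
# at the bottom of this file and assumes the file is importable as `conformer`.
import torch
from conformer import BidirectionalConformer  # assumed module name

model = BidirectionalConformer(num_features=40, num_classes=1000)
feats = torch.randn(4, 50, 40)  # (N, T, C)
memory, bn_memory, pos_emb, sampled, softmax, key_padding_mask = model(feats)

# `pos_emb` is threaded through to the CTC branch together with `memory`:
ctc_log_probs = model.ctc_encoder_forward(memory, pos_emb, key_padding_mask)
# ctc_log_probs: (N, T', num_classes), normalized log-probabilities.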
def decoder_forward(
self,
@@ -373,11 +380,13 @@ class BidirectionalConformer(nn.Module):
eos_id: int,
) -> torch.Tensor:
"""
Compute the decoder loss function (given a particular list of hypotheses).

Args:
memory:
It's the first output of forward(), with shape [T, N, E]
The first output of forward(), with shape [T, N, E]
memory_key_padding_mask:
The padding mask from forward()
The padding mask from forward(), a tensor of bool with shape [N, T]
token_ids:
A list-of-list IDs. Each sublist contains IDs for an utterance.
The IDs can be either phone IDs or word piece IDs.
@@ -390,6 +399,7 @@ class BidirectionalConformer(nn.Module):
A scalar, the **sum** of label smoothing loss over utterances
in the batch without any normalization.
"""

ys_in = add_sos(token_ids, sos_id=sos_id)
ys_in = [torch.tensor(y) for y in ys_in]
ys_in_pad = pad_sequence(ys_in, batch_first=True, padding_value=eos_id)
@@ -426,6 +436,7 @@ class BidirectionalConformer(nn.Module):
tgt_mask=tgt_mask,
memory_key_padding_mask=memory_key_padding_mask,
) # (T, N, C)

pred_pad = pred_pad.permute(1, 0, 2) # (T, N, C) -> (N, T, C)
pred_pad = self.decoder_output_layer(pred_pad) # (N, T, C)

@@ -436,6 +447,7 @@ class BidirectionalConformer(nn.Module):
def ctc_encoder_forward(
self,
memory: torch.Tensor,
pos_emb: torch.Tensor,
memory_key_padding_mask: torch.Tensor,
) -> torch.Tensor:
"""
@@ -444,17 +456,19 @@ class BidirectionalConformer(nn.Module):

Args:
memory:
It's the output of forward(), with shape [T, N, E]
It's the output of forward(), with shape (T, N, E)
pos_emb:
Relative positional embedding tensor: (N, 2*T-1, E)
memory_key_padding_mask:
The padding mask from forward()
The padding mask from forward(), a tensor of bool of shape (N, T)

Returns:
A Tensor with shape [N, T, C] where C is the number of classes
(e.g. number of phones or word pieces). Contains normalized
log-probabilities.
"""

x = self.ctc_encoder(memory,
pos_emb,
key_padding_mask=memory_key_padding_mask)
x = self.ctc_output_layer(x)
x = x.permute(1, 0, 2) # (T, N, C) ->(N, T, C)
@@ -470,10 +484,10 @@ class BidirectionalConformer(nn.Module):
softmax: Optional[torch.Tensor],
reverse_gradient: bool = True) -> Tensor:
"""
Returns the total log-prob of the the
labels sampled in the discrete bottleneck layer, as predicted using a relatively
simple model from previous frames sampled from the bottleneck layer.
[Appears on the denominator of an expressin for mutual information].
Returns the total log-prob of the the labels sampled in the discrete
bottleneck layer, as predicted using a relatively simple model that
predicts from previous frames sampled from the bottleneck layer.
[Appears on the denominator of an expression for mutual information].

Args:
memory_shifted:
@@ -482,21 +496,25 @@ class BidirectionalConformer(nn.Module):
(T, N, E) = memory.shape
memory_shifted = torch.cat((torch.zeros(1, N, E), memory[:-1,:,:]), dim=0)
memory_key_padding_mask:
The padding mask from the encoder.
sampled: is a Tensor of shape [T, N, C] where C corresponds to `discrete_bottleneck_tot_classes`
as given to the constructor. This will be needed for the 'reverse' model.
The padding mask from the encoder, of shape [N, T], boolean, True
for masked locations.
sampled: sampled and interpolated one-hot values, as a Tensor of shape [T, N, C]
where C corresponds to `discrete_bottleneck_tot_classes`
as given to the constructor. This will be needed for the 'reverse'
model.
softmax: is a "soft" version of `sampled`; if None, will default to `sampled`.
reverse_gradient: will likely be true. If true, the gradient is reversed twice
in this computation, so that we train predictors with the correct
gradient, i.e. to predict, not anti-predict (since the return value
of this function will appear with positive, not negative, sign in the
loss function, so will be minimized).
The gradient w.r.t. the non-self inputs to this function, though (i.e.
memory_shifted, sampled, softmax) will not be reversed, though.
Returns:
A scalar tensor, the **sum** of label smoothing loss over utterances
in the batch without any normalization.
"""


if reverse_gradient:
# Reversing gradient for memory_shifted puts the gradient back into
# the correct sign; we reversed it in
@@ -506,6 +524,8 @@ class BidirectionalConformer(nn.Module):
# what happens to the gradients).
memory_shifted = ReverseGrad.apply(memory_shifted)

# no mask is needed for self_predictor_encoder; its CNN
# layer uses left-padding only, making it causal.
predictor = self.self_predictor_encoder(memory_shifted)

prob = self.discrete_bottleneck.compute_prob(predictor,
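# Illustrative sketch (not part of the diff above) of the gradient-reversal trick
# described in the docstring and comments of this hunk; a generic implementation,
# not necessarily identical to the ReverseGrad class defined later in this file.
import torch

class _ReverseGrad(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x: torch.Tensor) -> torch.Tensor:
        return x.view_as(x)  # identity in the forward pass

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor:
        return -grad_output  # negate the gradient on the way back

x = torch.randn(3, requires_grad=True)
_ReverseGrad.apply(x).sum().backward()
print(x.grad)  # tensor([-1., -1., -1.]): the gradient arrives with its sign flipped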
@@ -566,15 +586,21 @@ class BidirectionalConformer(nn.Module):
tokens_key_padding_mask = decoder_padding_mask(ys_in_pad, ignore_id=padding_id)


# Let S be the length of the longest sentence (padded)
token_embedding = self.token_embed(tokens_padded) * self.token_embed_scale # (N, S) -> (N, S, C)
# add absolute position-encoding information
token_embedding = self.abs_pos(token_embedding)

token_embedding = token_embedding.permute(1, 0, 2) # (N, S, C) -> (S, N, C)

token_memory = self.reverse_encoder(token_embedding,
src_key_padding_mask=tokens_key_padding_mask)
# token_memory is of shape (S, N, C), if S is length of token sequence.

T = memory.shape[0]
# the targets, here, are the hidden discrete symbols we are predicting
tgt_mask = generate_square_subsequent_mask(T, device=memory.device)

token_embedding = self.token_embed(tokens_padded) * self.token_embed_scale
token_memory = self.reverse_encoder(token_embedding,
src_key_padding_mask=tokens_key_padding_mask)
# tokens_encoded is of shape (S, N, C), if S is length of token sequence.

hidden_predictor = self.reverse_decoder(
tgt=memory_shifted,
memory=token_memory,
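# Illustrative sketch (not part of the diff above) of the square "subsequent"
# (causal) mask that generate_square_subsequent_mask() produces; the real helper
# is imported from transformer.py and may differ in dtype or convention.
import torch

def _square_subsequent_mask(size: int) -> torch.Tensor:
    # Future positions (column > row) get -inf so attention cannot look ahead.
    return torch.triu(torch.full((size, size), float("-inf")), diagonal=1)

print(_square_subsequent_mask(4))
# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])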
@@ -584,25 +610,13 @@ class BidirectionalConformer(nn.Module):

total_prob = self.discrete_bottleneck.compute_prob(
hidden_predictor,
TODO, # HERE
)
sampled,
softmax,
memory_key_padding_mask)

tgt = self.decoder_embed(ys_in_pad) # (N, T) -> (N, T, C)
tgt = self.decoder_pos(tgt)
tgt = tgt.permute(1, 0, 2) # (N, T, C) -> (T, N, C)
pred_pad = self.decoder(
tgt=tgt,
memory=memory,
tgt_mask=tgt_mask,
tgt_key_padding_mask=tgt_key_padding_mask,
memory_key_padding_mask=memory_key_padding_mask,
) # (T, N, C)
pred_pad = pred_pad.permute(1, 0, 2) # (T, N, C) -> (N, T, C)
pred_pad = self.decoder_output_layer(pred_pad) # (N, T, C)
# TODO: consider using a label-smoothed loss.
return total_prob

decoder_loss = self.decoder_criterion(pred_pad, ys_out_pad)

return decoder_loss

class SimpleCausalEncoderLayer(nn.Module):
"""
@@ -650,129 +664,6 @@ class SimpleCausalEncoderLayer(nn.Module):
src = self.norm_final(src)
return src

# for search: SimpleCausalEncoder
def simple_causal_encoder(encoder_layer: nn.Module,
num_layers: int):
return torch.nn.Sequential([copy.deepcopy(encoder_leyer) for _ in range(num_layers)])


class DiscreteBottleneckConformer(Transformer):
"""
Args:
num_features (int): Number of input features
num_classes (int): Number of output classes
subsampling_factor (int): subsampling factor of encoder (the convolution layers before transformers)
d_model (int): attention dimension
nhead (int): number of head
dim_feedforward (int): feedforward dimention
num_encoder_layers (int): number of encoder layers
num_decoder_layers (int): number of decoder layers
dropout (float): dropout rate
cnn_module_kernel (int): Kernel size of convolution module
discrete_bottleneck_pos (int): position in the encoder at which to place
the discrete bottleneck (this many encoder layers will
precede it)

"""

def __init__(
self,
num_features: int,
num_classes: int,
subsampling_factor: int = 4,
d_model: int = 256,
nhead: int = 4,
dim_feedforward: int = 2048,
num_encoder_layers: int = 12,
num_decoder_layers: int = 6,
dropout: float = 0.1,
cnn_module_kernel: int = 31,
normalize_before: bool = True,
vgg_frontend: bool = False,
is_espnet_structure: bool = False,
mmi_loss: bool = True,
use_feat_batchnorm: bool = False,
discrete_bottleneck_pos: int = 8,
discrete_bottleneck_tot_classes: int = 512,
discrete_bottleneck_num_groups: int = 2
) -> None:
super(DiscreteBottleneckConformer, self).__init__(
num_features=num_features,
num_classes=num_classes,
subsampling_factor=subsampling_factor,
d_model=d_model,
nhead=nhead,
dim_feedforward=dim_feedforward,
num_encoder_layers=num_encoder_layers,
num_decoder_layers=num_decoder_layers,
dropout=dropout,
normalize_before=normalize_before,
vgg_frontend=vgg_frontend,
mmi_loss=mmi_loss,
use_feat_batchnorm=use_feat_batchnorm,
)

self.encoder_pos = RelPositionalEncoding(d_model, dropout)

encoder_layer = ConformerEncoderLayer(
d_model,
nhead,
dim_feedforward,
dropout,
cnn_module_kernel,
normalize_before,
is_espnet_structure,
)

discrete_bottleneck = DiscreteBottleneck(dim=d_model,
tot_classes=discrete_bottleneck_tot_classes,
num_groups=discrete_bottleneck_num_groups)

self.encoder = ConformerEncoder(encoder_layer, num_encoder_layers,
discrete_bottleneck=discrete_bottleneck,
discrete_bottleneck_pos=discrete_bottleneck_pos)


self.normalize_before = normalize_before
self.is_espnet_structure = is_espnet_structure
if self.normalize_before and self.is_espnet_structure:
self.after_norm = nn.LayerNorm(d_model)
else:
# Note: TorchScript detects that self.after_norm could be used inside forward()
# and throws an error without this change.
self.after_norm = identity

def run_encoder(
self, x: Tensor, supervisions: Optional[Supervisions] = None
) -> Tuple[Tensor, Optional[Tensor]]:
"""
Args:
x:
The model input. Its shape is [N, T, C].
supervisions:
Supervision in lhotse format.
See https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L32 # noqa
CAUTION: It contains length information, i.e., start and number of
frames, before subsampling
It is read directly from the batch, without any sorting. It is used
to compute encoder padding mask, which is used as memory key padding
mask for the decoder.

Returns:
Tensor: Predictor tensor of dimension (input_length, batch_size, d_model).
Tensor: Mask tensor of dimension (batch_size, input_length)
"""
x = self.feat_embed(x)
x, pos_emb = self.encoder_pos(x)
x = x.permute(1, 0, 2) # (B, T, F) -> (T, B, F)
mask = encoder_padding_mask(x.size(0), supervisions)
if mask is not None:
mask = mask.to(x.device)
x = self.encoder(x, pos_emb, src_key_padding_mask=mask) # (T, B, F)

return x, mask


class ReverseGrad(torch.autograd.Function):
def apply(ctx, x):
@@ -878,9 +769,9 @@ class DiscreteBottleneck(nn.Module):
# [ True, True, True, True]])
self.register_buffer('pred_cross_mask',
((torch.arange(d) // c).unsqueeze(1) >= (torch.arange(d) // c).unsqueeze(0)))
self.reset_parameters()
self._reset_parameters()

def reset_parameters_(self):
def _reset_parameters(self):
if hasattr(self, 'pred_cross'):
torch.nn.init.kaiming_uniform_(self.pred_cross, a=math.sqrt(5))

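# Illustrative sketch (not part of the diff above) of how `pred_cross_mask` is
# built: with d = tot_classes and c = classes_per_group, the comparison yields a
# block lower-triangular mask, so a group's logits may use its own and earlier
# groups only. Small numbers are used here for readability.
import torch

num_groups, classes_per_group = 2, 2
d = num_groups * classes_per_group
c = classes_per_group
group = torch.arange(d) // c                      # tensor([0, 0, 1, 1])
pred_cross_mask = group.unsqueeze(1) >= group.unsqueeze(0)
print(pred_cross_mask)
# tensor([[ True,  True, False, False],
#         [ True,  True, False, False],
#         [ True,  True,  True,  True],
#         [ True,  True,  True,  True]])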
@@ -967,7 +858,8 @@ class DiscreteBottleneck(nn.Module):
This will be useful in computing a loss function that has
a likelihood term with negative sign (i.e. the self-prediction).
We'll later need negate the gradient one more more time
where we give the input to whatever module generated 'x'.
where we give the input to the prediction module that
generated 'x'.

Returns a scalar Tensor represnting the total probability.
"""
@@ -980,6 +872,8 @@ class DiscreteBottleneck(nn.Module):

logprobs = self.pred_linear(x)

# Add "cross-terms" to logprobs; this is a regression that uses earlier
# groups to predict later groups
if self.num_groups > 1:
pred_cross = self.pred_cross * self.pred_cross_mask
t = self.tot_classes
@@ -999,11 +893,12 @@ class DiscreteBottleneck(nn.Module):
logprobs = logprobs.reshape(S, N, self.num_groups, self.classes_per_group)
# Normalize the log-probs (so they sum to one)
logprobs = torch.nn.functional.logsoftmax(logprobs, dim=-1)
logprobs = logprobs.reshape(S, N, C)

if padding_mask is not None:
assert padding_mask.dtype == torch.bool and padding_mask.shape == (N, S)
padding_mask = torch.logical_not(padding_mask).transpose(0, 1).unsqueeze(-1)
# padding_mask.shape == (S, N, E)
assert padding_mask.shape == (S, N, 1)
tot_prob = (logprobs * softmax * padding_mask).sum()
else:
tot_prob = (logprobs * softmax).sum()
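# Illustrative sketch (not part of the diff above) of the group-wise normalization
# performed here: reshape to (S, N, num_groups, classes_per_group), apply log-softmax
# over the last dim so each group sums to one, then reshape back. (The standard
# PyTorch spelling of the function is torch.nn.functional.log_softmax.)
import torch

S, N, num_groups, classes_per_group = 5, 3, 4, 8
C = num_groups * classes_per_group  # tot_classes

logits = torch.randn(S, N, C)
logprobs = logits.reshape(S, N, num_groups, classes_per_group)
logprobs = torch.nn.functional.log_softmax(logprobs, dim=-1)
logprobs = logprobs.reshape(S, N, C)

# Each group of classes_per_group entries is separately normalized:
per_group = logprobs.reshape(S, N, num_groups, classes_per_group).exp().sum(dim=-1)
assert torch.allclose(per_group, torch.ones(S, N, num_groups))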
@@ -1080,8 +975,8 @@ class ConformerEncoderLayer(nn.Module):
self,
src: Tensor,
pos_emb: Tensor,
src_mask: Optional[Tensor] = None,
src_key_padding_mask: Optional[Tensor] = None,
attn_mask: Optional[Tensor] = None,
key_padding_mask: Optional[Tensor] = None,
) -> Tensor:
"""
Pass the input through the encoder layer.
@@ -1089,17 +984,18 @@ class ConformerEncoderLayer(nn.Module):
Args:
src: the sequence to the encoder layer (required).
pos_emb: Positional embedding tensor (required).
src_mask: the mask for the src sequence (optional).
src_key_padding_mask: the mask for the src keys per batch (optional).
attn_mask: the mask for the src sequence (optional).
key_padding_mask: the mask for the src keys per batch (optional).

Shape:
src: (S, N, E).
pos_emb: (N, 2*S-1, E)
src_mask: (S, S).
src_key_padding_mask: (N, S).
attn_mask: (S, S). This probably won't be used, in fact should not
be (e.g. could in principle ensure causal behavior, but
actually the conformer does not support this).
key_padding_mask: (N, S).
S is the source sequence length, N is the batch size, E is the feature number
"""

# macaron style feed forward module
residual = src
src = self.norm_ff_macaron(src)
@@ -1115,8 +1011,8 @@ class ConformerEncoderLayer(nn.Module):
src,
src,
pos_emb=pos_emb,
attn_mask=src_mask,
key_padding_mask=src_key_padding_mask,
attn_mask=attn_mask,
key_padding_mask=key_padding_mask,
)[0]
src = residual + self.dropout(src_att)

@@ -1141,7 +1037,6 @@ class ConformerEncoder(nn.Module):
Args:
encoder_layer: an instance of the ConformerEncoderLayer() class (required).
num_layers: the number of sub-encoder-layers in the encoder (required).
norm: the layer normalization component (optional).

Examples::
>>> encoder_layer = ConformerEncoderLayer(d_model=512, nhead=8)
@@ -1153,7 +1048,7 @@ class ConformerEncoder(nn.Module):

def __init__(self, encoder_layer: nn.Module, num_layers: int) -> None:
super(ConformerEncoder, self).__init__()
self.layers = torch.nn.ModuleList([copy.deepcopy(encoder_leyer) for _ in range(num_layers)])
self.layers = torch.nn.ModuleList([copy.deepcopy(encoder_layer) for _ in range(num_layers)])

def forward(
self,
@@ -1183,7 +1078,6 @@ class ConformerEncoder(nn.Module):
return x


class RelPositionalEncoding(torch.nn.Module):
"""Relative positional encoding module.

@@ -1313,7 +1207,6 @@ class RelPositionMultiheadAttention(nn.Module):
nn.init.xavier_uniform_(self.in_proj.weight)
nn.init.constant_(self.in_proj.bias, 0.0)
nn.init.constant_(self.out_proj.bias, 0.0)

nn.init.xavier_uniform_(self.pos_bias_u)
nn.init.xavier_uniform_(self.pos_bias_v)

@@ -1663,6 +1556,7 @@ class RelPositionMultiheadAttention(nn.Module):
)

attn_output_weights = nn.functional.softmax(attn_output_weights, dim=-1)

attn_output_weights = nn.functional.dropout(
attn_output_weights, p=dropout_p, training=training
)
@@ -1701,7 +1595,7 @@ class CausalConvolutionModule(nn.Module):
self, channels: int, kernel_size: int, bias: bool = True
) -> None:
"""Construct an ConvolutionModule object."""
super(ConvolutionModule, self).__init__()
super(CausalConvolutionModule, self).__init__()
# kernerl_size should be a odd number for 'SAME' padding
assert (kernel_size - 1) % 2 == 0
self.kernel_size = kernel_size
@@ -1752,11 +1646,12 @@ class CausalConvolutionModule(nn.Module):
x = nn.functional.glu(x, dim=1) # (batch, channels, time)

# 1D Depthwise Conv
(B, C, T) = x
(B, C, T) = x.shape
padding = self.kernel_size - 1
x = torch.cat((torch.zeros(B, C, padding, device=x.device, dtype=x.type), x),
dim=2)
x = self.depthwise_conv(x) # <-- This has no padding.
x = self.depthwise_conv(x) # <-- This convolution module does no padding,
# so we padded manually, on the left only.

x = self.activation(self.norm(x))

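# Illustrative sketch (not part of the diff above) of the causal left-padding used
# by CausalConvolutionModule: padding kernel_size - 1 frames on the left of the
# time axis lets a no-padding depthwise Conv1d keep the sequence length while only
# ever looking at the past.
import torch
import torch.nn as nn

B, C, T, kernel_size = 2, 8, 20, 31
x = torch.randn(B, C, T)

padding = kernel_size - 1
x_padded = torch.cat((torch.zeros(B, C, padding, dtype=x.dtype, device=x.device), x), dim=2)

depthwise_conv = nn.Conv1d(C, C, kernel_size, groups=C, padding=0)
y = depthwise_conv(x_padded)
print(y.shape)  # torch.Size([2, 8, 20]): same T, and y[..., t] depends only on x[..., :t + 1]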
@@ -1835,7 +1730,7 @@ class ConvolutionModule(nn.Module):

x = self.pointwise_conv2(x) # (batch, channel, time)

return x.permute(2, 0, 1)
return x.permute(2, 0, 1) # (time, batch channel)


class Swish(torch.nn.Module):
@@ -1851,16 +1746,47 @@ def identity(x):



def test_discrete_bottleneck_conformer():
def _gen_rand_tokens(N: int) -> List[List[int]]:
ans = []
for _ in range(N):
S = random.randint(1, 20)
ans.append([random.randint(3, 30) for _ in range(S)])
return ans

def _gen_supervision(tokens: List[List[int]]):
ans = dict()
N = len(tokens)
ans['sequence_idx'] = torch.arange(N, dtype=torch.int32)
ans['start_frame'] = torch.zeros(N, dtype=torch.int32)
ans['num_frames'] = torch.tensor([ random.randint(20, 35) for _ in tokens])
return ans

def _test_bidirectional_conformer():
num_features = 40
num_classes = 1000
m = DiscreteBottleneckConformer(num_features, num_classes)
m = BidirectionalConformer(num_features, num_classes)
T = 35
N = 10
C = num_features
feats = torch.randn(N, T, C)
ctc_output, _, _ = m(feats)
# [N, T, C].

tokens = _gen_rand_tokens(N)
supervision = _gen_supervision(tokens)
print("tokens = ", tokens)
print("supervision = ", supervision)
# memory: [T, N, C]
(memory, bn_memory, pos_emb, sampled, softmax, key_padding_mask) = m(feats, supervision)

# ctc_output: [N, T, C].
ctc_output = m.ctc_encoder_forward(memory, pos_emb, key_padding_mask)

decoder_loss = m.decoder_forward(memory, key_padding_mask, tokens,
sos_id=1,
eos_id=2)

(T, N, E) = memory.shape
memory_shifted = torch.cat((torch.zeros(1, N, E), memory[:-1,:,:]), dim=0)


if __name__ == '__main__':
test_discrete_bottleneck_conformer()
_test_bidirectional_conformer()

@@ -914,6 +914,9 @@ def encoder_padding_mask(
).unsqueeze(-1)
mask = seq_range_expand >= seq_length_expand

# Assert that in each row, i.e. each utterance, at least one frame is not
# masked. Otherwise it may lead to nan's appearing in the attention computation.
assert torch.all(torch.sum(torch.logical_not(mask), dim=1) != 0)
return mask
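# Illustrative sketch (not part of the diff above) of the mask this function builds
# and of the assertion added here: True marks padded frames, and every utterance
# must keep at least one unmasked frame, otherwise the attention softmax can
# produce NaNs. The variable values below are made up for the example.
import torch

lengths = torch.tensor([7, 5, 3])                 # frames per utterance after subsampling
max_len = int(lengths.max())
seq_range_expand = torch.arange(max_len).unsqueeze(0).expand(len(lengths), max_len)
seq_length_expand = lengths.unsqueeze(-1)
mask = seq_range_expand >= seq_length_expand      # (N, max_len), True where padded

assert torch.all(torch.sum(torch.logical_not(mask), dim=1) != 0)
print(mask)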