Implement Nextformer-style frontend

Daniel Povey 2022-12-15 21:48:32 +08:00
parent 37a8c30136
commit 076b18db60


@@ -735,61 +735,6 @@ class DownsampledZipformerEncoder(nn.Module):
        return self.out_combiner(src_orig, src)

class DownsamplingZipformerEncoder(nn.Module):
    r"""
    DownsamplingZipformerEncoder is a zipformer encoder that downsamples its input
    by a specified factor before feeding it to the zipformer layers.
    """
    def __init__(self,
                 encoder: nn.Module,
                 input_dim: int,
                 output_dim: int,
                 downsample: int):
        super(DownsamplingZipformerEncoder, self).__init__()
        self.downsample_factor = downsample
        self.downsample = AttentionDownsample(input_dim, output_dim, downsample)
        self.encoder = encoder

    def forward(self,
                src: Tensor,
                feature_mask: Union[Tensor, float] = 1.0,
                mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None,
                ) -> Tensor:
        r"""Downsample, then go through the encoder.
        Args:
            src: the sequence to the encoder (required).
            feature_mask: something that broadcasts with src, that we'll multiply `src`
                by at every layer.  feature_mask is expected to be already downsampled by
                self.downsample_factor.
            mask: the mask for the src sequence (optional).  CAUTION: we need to downsample
                this, if we are to support it.  Won't work correctly yet.
            src_key_padding_mask: the mask for the src keys per batch (optional).
        Shape:
            src: (S, N, E).
            mask: (S, S).
            src_key_padding_mask: (N, S).
            S is the source sequence length, N is the batch size, E is the feature number.
        Returns: output of shape (S', N, F), where S' is the downsampled sequence
            length and F is the number of output features (output_dim to constructor).
        """
        src_orig = src
        src = self.downsample(src)
        ds = self.downsample_factor
        if mask is not None:
            mask = mask[::ds, ::ds]
        if src_key_padding_mask is not None:
            # (N, S): downsample along the sequence axis, not the batch axis.
            src_key_padding_mask = src_key_padding_mask[..., ::ds]
        src = self.encoder(
            src, feature_mask=feature_mask, mask=mask,
            src_key_padding_mask=src_key_padding_mask,
        )
        return src
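
As a quick illustration of the strided mask downsampling above (a standalone sketch, not part of this commit): `[::ds]` keeps every ds-th position along an axis, so an (S, S) attention mask and an (N, S) key-padding mask shrink to the downsampled sequence length.

import torch

ds, S, N = 2, 8, 3
attn_mask = torch.zeros(S, S, dtype=torch.bool)         # (S, S)
key_padding_mask = torch.zeros(N, S, dtype=torch.bool)  # (N, S)

# Keep every ds-th query/key position: 8 positions -> 4.
print(attn_mask[::ds, ::ds].shape)        # torch.Size([4, 4])
print(key_padding_mask[..., ::ds].shape)  # torch.Size([3, 4])
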
class AttentionDownsample(torch.nn.Module):
    """
@@ -1734,6 +1679,71 @@ class ScalarMultiply(nn.Module):
    def forward(self, x):
        return x * self.scale

class ConvNeXt(nn.Module):
    """
    Our interpretation of the ConvNeXt module as used in https://arxiv.org/pdf/2206.14747.pdf
    """
    def __init__(self,
                 channels: int,
                 hidden_ratio: int = 4,
                 layerdrop_prob: FloatLike = None):
        super().__init__()
        kernel_size = 7
        pad = (kernel_size - 1) // 2
        hidden_channels = channels * hidden_ratio
        if layerdrop_prob is None:
            layerdrop_prob = ScheduledFloat((0.0, 0.1), (16000.0, 0.01))
        self.layerdrop_prob = layerdrop_prob

        self.depthwise_conv = nn.Conv2d(
            in_channels=channels,
            out_channels=channels,
            groups=channels,
            kernel_size=kernel_size,
            padding=(pad, pad))

        self.pointwise_conv1 = nn.Conv2d(
            in_channels=channels,
            out_channels=hidden_channels,
            kernel_size=1)

        self.hidden_balancer = ActivationBalancer(hidden_channels,
                                                  channel_dim=1,
                                                  min_positive=0.3,
                                                  max_positive=1.0,
                                                  min_abs=0.75,
                                                  max_abs=5.0,
                                                  min_prob=0.25)
        self.activation = SwooshL()

        self.pointwise_conv2 = ScaledConv2d(
            in_channels=hidden_channels,
            out_channels=channels,
            kernel_size=1,
            initial_scale=0.01)

    def forward(self, x: Tensor) -> Tensor:
        """
        x layout: (N, C, H, W), i.e. (batch_size, num_channels, num_frames, num_freqs)
        The returned value has the same shape as x.
        """
        # Layer-drop: only during training (and not when scripting), randomly
        # skip this module with probability layerdrop_prob.
        if (self.training and not torch.jit.is_scripting()
                and random.random() < float(self.layerdrop_prob)):
            return x
        bypass = x
        x = self.depthwise_conv(x)
        x = self.pointwise_conv1(x)
        x = self.hidden_balancer(x)
        x = self.activation(x)
        x = self.pointwise_conv2(x)
        return bypass + x
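
For readers without the repo's scaling.py utilities, the block above is structurally a standard ConvNeXt unit: a depthwise 7x7 convolution, a 1x1 expansion, a nonlinearity, a 1x1 projection, and a residual connection. Below is a minimal self-contained sketch in plain PyTorch, with nn.GELU standing in for SwooshL and ordinary nn.Conv2d standing in for ScaledConv2d and the ActivationBalancer (an illustration of the structure, not this commit's exact module):

import torch
import torch.nn as nn

class ConvNeXtSketch(nn.Module):
    def __init__(self, channels: int, hidden_ratio: int = 4):
        super().__init__()
        hidden = channels * hidden_ratio
        self.depthwise = nn.Conv2d(channels, channels, kernel_size=7,
                                   padding=3, groups=channels)
        self.expand = nn.Conv2d(channels, hidden, kernel_size=1)
        self.act = nn.GELU()                 # stand-in for SwooshL
        self.project = nn.Conv2d(hidden, channels, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # padding=3 with kernel 7 preserves (H, W), so the residual add is valid.
        return x + self.project(self.act(self.expand(self.depthwise(x))))

x = torch.randn(2, 32, 50, 40)               # (N, C, num_frames, num_freqs)
assert ConvNeXtSketch(32)(x).shape == x.shape
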
class Conv2dSubsampling(nn.Module):
    """Convolutional 2D subsampling (to 1/2 length).
@@ -1752,7 +1762,6 @@ class Conv2dSubsampling(nn.Module):
                 layer1_channels: int = 8,
                 layer2_channels: int = 32,
                 layer3_channels: int = 128,
                 bottleneck_channels: int = 64,
                 dropout: FloatLike = 0.1,
                 ) -> None:
        """
@@ -1778,7 +1787,7 @@ class Conv2dSubsampling(nn.Module):
        # training.  (The second one is necessary to stop its bias from getting
        # a too-large gradient).
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=layer1_channels,
@@ -1797,21 +1806,29 @@ class Conv2dSubsampling(nn.Module):
                stride=2,
                padding=0,
            ),
            ActivationBalancer(layer2_channels,
                               channel_dim=1,
                               max_abs=4.0),
            SwooshR(),
            nn.Conv2d(
                in_channels=layer2_channels,
                out_channels=layer3_channels,
                kernel_size=3,
                stride=(1, 2),  # (time, freq)
            ),
            ActivationBalancer(layer3_channels,
                               channel_dim=1,
                               max_abs=4.0),
            SwooshR(),
        )
        self.convnext1 = nn.Sequential(ConvNeXt(layer2_channels),
                                       ConvNeXt(layer2_channels),
                                       ConvNeXt(layer2_channels),
                                       BasicNorm(layer2_channels,
                                                 channel_dim=1))

        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=layer2_channels,
                out_channels=layer3_channels,
                kernel_size=3,
                stride=(1, 2),  # (time, freq)
            ))

        self.convnext2 = nn.Sequential(ConvNeXt(layer3_channels),
                                       ConvNeXt(layer3_channels),
                                       ConvNeXt(layer3_channels),
                                       BasicNorm(layer3_channels,
                                                 channel_dim=1))
        out_height = (((in_channels - 1) // 2) - 1) // 2
        self.scale = nn.Parameter(torch.ones(out_height * layer3_channels))
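
A quick sanity check of the out_height formula (an illustrative computation; the 80-bin input size is an assumption, not from the commit): a kernel-3, stride-2, unpadded convolution maps n to floor((n - 3) / 2) + 1, which equals (n - 1) // 2, and the frequency axis is halved that way twice on the way through.

in_channels = 80  # e.g. 80-bin log-mel filterbank features (assumed)
out_height = (((in_channels - 1) // 2) - 1) // 2
print(out_height)  # 19 -> self.scale has 19 * layer3_channels entries
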
@@ -1839,7 +1856,12 @@ class Conv2dSubsampling(nn.Module):
        # scaling x by 0.1 allows us to use a larger grad-scale in fp16 "amp" (automatic
        # mixed precision) training, since the weights in the first convolution are
        # otherwise the limiting factor for getting infinite gradients.
        x = self.conv1(x)
        x = self.convnext1(x)
        x = self.conv2(x)
        x = self.convnext2(x)
        # Now x is of shape (N, odim, ((T-3)//2 - 1)//2, ((idim-1)//2 - 1)//2)
        b, c, t, f = x.size()
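
The hunk ends here. The usual continuation (a hedged sketch of what forward() presumably does next; the remaining lines are outside this diff) folds channels and frequency bins into one feature vector per frame:

# Assumed continuation, not shown in this hunk:
x = x.transpose(1, 2).reshape(b, t, c * f)  # (N, T', C * F')
# self.scale (size out_height * layer3_channels) would presumably scale this
# flattened feature; the exact ordering depends on code outside the diff.
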