From 9e4b84f3744da210e197fd7f3a327f2aec7decb8 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 12 Jan 2023 20:14:51 +0800
Subject: [PATCH] Simplify Conv2dSubsampling, removing all but one ConvNext
 layer

---
 .../pruned_transducer_stateless7/zipformer.py | 49 +++++--------------
 1 file changed, 12 insertions(+), 37 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
index f4c3fe02e..be5e9a7b1 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@@ -1944,7 +1944,7 @@ class Conv2dSubsampling(nn.Module):
         # training.  (The second one is necessary to stop its bias from getting
         # a too-large gradient).
 
-        self.conv1 = nn.Sequential(
+        self.conv = nn.Sequential(
             nn.Conv2d(
                 in_channels=1,
                 out_channels=layer1_channels,
@@ -1964,21 +1964,9 @@ class Conv2dSubsampling(nn.Module):
                 padding=0,
             ),
             Balancer(layer2_channels,
-                               channel_dim=1,
-                               max_abs=4.0),
+                     channel_dim=1,
+                     max_abs=4.0),
             SwooshR(),
-        )
-
-        self.convnext1 = nn.Sequential(ConvNeXt(layer2_channels, kernel_size=(5, 7)),
-                                       ConvNeXt(layer2_channels, kernel_size=(5, 7)),
-                                       BasicNorm(layer2_channels,
-                                                 channel_dim=1))
-
-
-        cur_width = (in_channels - 1) // 2
-
-
-        self.conv2 = nn.Sequential(
             nn.Conv2d(
                 in_channels=layer2_channels,
                 out_channels=layer3_channels,
@@ -1986,21 +1974,18 @@ class Conv2dSubsampling(nn.Module):
                 stride=(1, 2), # (time, freq)
             ),
             Balancer(layer3_channels,
-                               channel_dim=1,
-                               max_abs=4.0),
+                     channel_dim=1,
+                     max_abs=4.0),
             SwooshR(),
         )
 
-        self.convnext2 = nn.Sequential(ConvNeXt(layer3_channels, kernel_size=(7, 7)),
-                                       ConvNeXt(layer3_channels, kernel_size=(7, 7)),
-                                       ConvNeXt(layer3_channels, kernel_size=(7, 7)))
+        cur_width = (in_channels - 1) // 2
+
+        # just one convnext layer
+        self.convnext = ConvNeXt(layer3_channels, kernel_size=(7, 7))
 
         out_width = (((in_channels - 1) // 2) - 1) // 2
 
-        self.scale = nn.Parameter(torch.ones(out_width * layer3_channels))
-        self.scale_max = 1.0
-        self.scale_min = ScheduledFloat((0.0, 0.9), (4000.0, 0.1))
-
         self.out = nn.Linear(out_width * layer3_channels, out_channels)
         # use a larger than normal grad_scale on this whitening module; there is
         # only one such module, so there is not a concern about adding together
@@ -2031,13 +2016,8 @@ class Conv2dSubsampling(nn.Module):
         # scaling x by 0.1 allows us to use a larger grad-scale in fp16 "amp" (automatic mixed precision)
         # training, since the weights in the first convolution are otherwise the limiting factor for getting infinite
         # gradients.
-        x = self.conv1(x)
-        x = self.convnext1(x)
-
-
-        x = self.conv2(x)
-        x = self.convnext2(x)
-
+        x = self.conv(x)
+        x = self.convnext(x)
 
         # Now x is of shape (N, odim, ((T-3)//2 - 1)//2, ((idim-1)//2 - 1)//2)
         b, c, t, f = x.size()
@@ -2045,13 +2025,8 @@ class Conv2dSubsampling(nn.Module):
         x = x.transpose(1, 2).reshape(b, t, c * f)
         # now x: (N, ((T-1)//2 - 1))//2, out_width * layer3_channels))
 
-        x = x * limit_param_value(self.scale,
-                                  min=float(self.scale_min),
-                                  max=float(self.scale_max),
-                                  training=self.training)
-
-        # Now x is of shape (N, ((T-1)//2 - 1))//2, odim)
         x = self.out(x)
+        # Now x is of shape (N, ((T-1)//2 - 1))//2, odim)
         x = self.out_whiten(x)
         x = self.out_norm(x)
         x = self.dropout(x)