Add another convolutional layer

This commit is contained in:
Daniel Povey 2022-03-30 11:11:32 +08:00
parent 1b8d7defd0
commit ca6337b78a

View File

@@ -954,8 +954,9 @@ class Conv2dSubsampling(nn.Module):
def __init__(self, in_channels: int,
out_channels: int,
layer1_channels: int = 32,
layer2_channels: int = 128) -> None:
layer1_channels: int = 8,
layer2_channels: int = 32,
layer3_channels: int = 128) -> None:
"""
Args:
in_channels:
@@ -973,7 +974,7 @@ class Conv2dSubsampling(nn.Module):
self.conv = nn.Sequential(
ScaledConv2d(
in_channels=1, out_channels=layer1_channels,
kernel_size=3, stride=2
kernel_size=3,
),
ActivationBalancer(channel_dim=1),
DoubleSwish(),
@@ -983,8 +984,14 @@ class Conv2dSubsampling(nn.Module):
),
ActivationBalancer(channel_dim=1),
DoubleSwish(),
ScaledConv2d(
in_channels=layer2_channels, out_channels=layer3_channels,
kernel_size=3, stride=2
),
ActivationBalancer(channel_dim=1),
DoubleSwish(),
)
self.out = ScaledLinear(layer2_channels * (((in_channels - 1) // 2 - 1) // 2), out_channels)
self.out = ScaledLinear(layer3_channels * (((in_channels - 1) // 2 - 1) // 2), out_channels)
# set learn_eps=False because out_norm is preceded by `out`, and `out`
# itself has learned scale, so the extra degree of freedom is not
# needed.