From 437e8b208341bf027744be5d81f0126635150572 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Sun, 13 Mar 2022 23:31:08 +0800
Subject: [PATCH] Reduce max-abs limit from 1000 to 100; introduce 2
 DerivBalancer modules in conv layer.

---
 .../ASR/conformer_ctc/subsampling.py          |  4 ++--
 .../ASR/transducer_stateless/conformer.py     | 22 ++++++++++++++++++-
 .../ASR/transducer_stateless/train.py         |  2 +-
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/ASR/conformer_ctc/subsampling.py b/egs/librispeech/ASR/conformer_ctc/subsampling.py
index 04481aa5b..3a1eda3f1 100644
--- a/egs/librispeech/ASR/conformer_ctc/subsampling.py
+++ b/egs/librispeech/ASR/conformer_ctc/subsampling.py
@@ -327,7 +327,7 @@ class DerivBalancerFunction(torch.autograd.Function):
         max_positive: float,  # e.g. 0.95
         max_factor: float,  # e.g. 0.01
         min_abs: float,  # e.g. 0.2
-        max_abs: float,  # e.g. 1000.0
+        max_abs: float,  # e.g. 100.0
     ) -> Tensor:
         if x.requires_grad:
             if channel_dim < 0:
@@ -547,7 +547,7 @@ class DerivBalancer(torch.nn.Module):
                  max_positive: float = 0.95,
                  max_factor: float = 0.01,
                  min_abs: float = 0.2,
-                 max_abs: float = 1000.0):
+                 max_abs: float = 100.0):
         super(DerivBalancer, self).__init__()
         self.channel_dim = channel_dim
         self.min_positive = min_positive
diff --git a/egs/librispeech/ASR/transducer_stateless/conformer.py b/egs/librispeech/ASR/transducer_stateless/conformer.py
index e6466d8e6..65a8431de 100644
--- a/egs/librispeech/ASR/transducer_stateless/conformer.py
+++ b/egs/librispeech/ASR/transducer_stateless/conformer.py
@@ -847,6 +847,22 @@ class ConvolutionModule(nn.Module):
             padding=0,
             bias=bias,
         )
+
+        # after pointwise_conv1 we put x through a gated linear unit (nn.functional.glu).
+        # For most layers the normal rms value of channels of x seems to be in the range 1 to 4,
+        # but sometimes, for some reason, for layer 0 the rms ends up being very large,
+        # between 50 and 100 for different channels.  This will cause very peaky and
+        # sparse derivatives for the sigmoid gating function, which will tend to make
+        # the loss function not learn effectively.  (for most layers the average absolute values
+        # are in the range 0.5..9.0, and the average p(x>0), i.e. positive proportion,
+        # at the output of pointwise_conv1.output is around 0.35 to 0.45 for different
+        # layers, which likely breaks down as 0.5 for the "linear" half and
+        # 0.2 to 0.3 for the part that goes into the sigmoid.)  The idea is that if we
+        # constrain the rms values to a reasonable range via a constraint of max_abs=10.0,
+        # it will be in a better position to start learning something, i.e. to latch onto
+        # the correct range.
+        self.deriv_balancer1 = DerivBalancer(channel_dim=1, max_abs=10.0)
+
         self.depthwise_conv = ScaledConv1d(
             channels,
             channels,
@@ -857,6 +873,8 @@ class ConvolutionModule(nn.Module):
             bias=bias,
         )
 
+
+        self.deriv_balancer2 = DerivBalancer(channel_dim=1)
         # shape: (channels, 1), broadcasts with (batch, channel, time).
         self.activation = SwishOffset()
 
@@ -885,12 +903,14 @@ class ConvolutionModule(nn.Module):
 
         # GLU mechanism
         x = self.pointwise_conv1(x)  # (batch, 2*channels, time)
+
+        x = self.deriv_balancer1(x)
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)
 
         # 1D Depthwise Conv
         x = self.depthwise_conv(x)
-        # TODO: can have a learned scale in here, or a fixed one.
+        x = self.deriv_balancer2(x)
         x = self.activation(x)
 
         x = self.pointwise_conv2(x)  # (batch, channel, time)
diff --git a/egs/librispeech/ASR/transducer_stateless/train.py b/egs/librispeech/ASR/transducer_stateless/train.py
index 994b89e49..a0395a398 100755
--- a/egs/librispeech/ASR/transducer_stateless/train.py
+++ b/egs/librispeech/ASR/transducer_stateless/train.py
@@ -110,7 +110,7 @@ def get_parser():
     parser.add_argument(
         "--exp-dir",
         type=str,
-        default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95_noexp",
+        default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95_noexp_convderiv",
         help="""The experiment dir.
         It specifies the directory where all training related
         files, e.g., checkpoints, log, etc, are saved