From 437e8b208341bf027744be5d81f0126635150572 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Sun, 13 Mar 2022 23:31:08 +0800
Subject: [PATCH] Reduce max-abs limit from 1000 to 100; introduce 2
 DerivBalancer modules in conv layer.

---
 .../ASR/conformer_ctc/subsampling.py          |  4 ++--
 .../ASR/transducer_stateless/conformer.py     | 22 ++++++++++++++++++-
 .../ASR/transducer_stateless/train.py         |  2 +-
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/ASR/conformer_ctc/subsampling.py b/egs/librispeech/ASR/conformer_ctc/subsampling.py
index 04481aa5b..3a1eda3f1 100644
--- a/egs/librispeech/ASR/conformer_ctc/subsampling.py
+++ b/egs/librispeech/ASR/conformer_ctc/subsampling.py
@@ -327,7 +327,7 @@ class DerivBalancerFunction(torch.autograd.Function):
         max_positive: float,  # e.g. 0.95
         max_factor: float,  # e.g. 0.01
         min_abs: float,  # e.g. 0.2
-        max_abs: float,  # e.g. 1000.0
+        max_abs: float,  # e.g. 100.0
     ) -> Tensor:
         if x.requires_grad:
             if channel_dim < 0:
@@ -547,7 +547,7 @@ class DerivBalancer(torch.nn.Module):
                  max_positive: float = 0.95,
                  max_factor: float = 0.01,
                  min_abs: float = 0.2,
-                 max_abs: float = 1000.0):
+                 max_abs: float = 100.0):
         super(DerivBalancer, self).__init__()
         self.channel_dim = channel_dim
         self.min_positive = min_positive
diff --git a/egs/librispeech/ASR/transducer_stateless/conformer.py b/egs/librispeech/ASR/transducer_stateless/conformer.py
index e6466d8e6..65a8431de 100644
--- a/egs/librispeech/ASR/transducer_stateless/conformer.py
+++ b/egs/librispeech/ASR/transducer_stateless/conformer.py
@@ -847,6 +847,22 @@ class ConvolutionModule(nn.Module):
             padding=0,
             bias=bias,
         )
+
+        # after pointwise_conv1 we put x through a gated linear unit (nn.functional.glu).
+        # For most layers the normal rms value of channels of x seems to be in the range 1 to 4,
+        # but sometimes, for some reason, for layer 0 the rms ends up being very large,
+        # between 50 and 100 for different channels.  This will cause very peaky and
+        # sparse derivatives for the sigmoid gating function, which will tend to make
+        # the loss function not learn effectively.  (for most layers the average absolute values
+        # are in the range 0.5..9.0, and the average p(x>0), i.e. positive proportion,
+        # at the output of pointwise_conv1.output is around 0.35 to 0.45 for different
+        # layers, which likely breaks down as 0.5 for the "linear" half and
+        # 0.2 to 0.3 for the part that goes into the sigmoid.)  The idea is that if we
+        # constrain the rms values to a reasonable range via a constraint of max_abs=10.0,
+        # it will be in a better position to start learning something, i.e. to latch onto
+        # the correct range.
+        self.deriv_balancer1 = DerivBalancer(channel_dim=1, max_abs=10.0)
+
         self.depthwise_conv = ScaledConv1d(
             channels,
             channels,
@@ -857,6 +873,8 @@ class ConvolutionModule(nn.Module):
             bias=bias,
         )
 
+
+        self.deriv_balancer2 = DerivBalancer(channel_dim=1)
         # shape: (channels, 1), broadcasts with (batch, channel, time).
         self.activation = SwishOffset()
 
@@ -885,12 +903,14 @@ class ConvolutionModule(nn.Module):
 
         # GLU mechanism
         x = self.pointwise_conv1(x)  # (batch, 2*channels, time)
+
+        x = self.deriv_balancer1(x)
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)
 
         # 1D Depthwise Conv
         x = self.depthwise_conv(x)
-        # TODO: can have a learned scale in here, or a fixed one.
+        x = self.deriv_balancer2(x)
         x = self.activation(x)
 
         x = self.pointwise_conv2(x)  # (batch, channel, time)
diff --git a/egs/librispeech/ASR/transducer_stateless/train.py b/egs/librispeech/ASR/transducer_stateless/train.py
index 994b89e49..a0395a398 100755
--- a/egs/librispeech/ASR/transducer_stateless/train.py
+++ b/egs/librispeech/ASR/transducer_stateless/train.py
@@ -110,7 +110,7 @@ def get_parser():
     parser.add_argument(
         "--exp-dir",
         type=str,
-        default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95_noexp",
+        default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95_noexp_convderiv",
         help="""The experiment dir.
         It specifies the directory where all training related
         files, e.g., checkpoints, log, etc, are saved