Reduce max-abs limit from 1000 to 100; introduce 2 DerivBalancer modules in conv layer.

Daniel Povey 2022-03-13 23:31:08 +08:00
parent f351777e9c
commit 437e8b2083
3 changed files with 24 additions and 4 deletions

View File

@@ -327,7 +327,7 @@ class DerivBalancerFunction(torch.autograd.Function):
         max_positive: float,  # e.g. 0.95
         max_factor: float,  # e.g. 0.01
         min_abs: float,  # e.g. 0.2
-        max_abs: float,  # e.g. 1000.0
+        max_abs: float,  # e.g. 100.0
     ) -> Tensor:
         if x.requires_grad:
             if channel_dim < 0:
@@ -547,7 +547,7 @@ class DerivBalancer(torch.nn.Module):
                  max_positive: float = 0.95,
                  max_factor: float = 0.01,
                  min_abs: float = 0.2,
-                 max_abs: float = 1000.0):
+                 max_abs: float = 100.0):
         super(DerivBalancer, self).__init__()
         self.channel_dim = channel_dim
         self.min_positive = min_positive

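The two hunks above only change a default, so for context here is a minimal sketch of the mechanism that default feeds into, as suggested by the parameter names: an identity in the forward pass whose backward pass nudges any channel whose mean absolute value falls outside [min_abs, max_abs]. The class name DerivBalancerSketch and the exact gradient formula are illustrative assumptions, not the icefall implementation.

import torch
from torch import Tensor

class DerivBalancerSketch(torch.autograd.Function):
    # Identity in forward; backward perturbs the gradient of channels whose
    # mean |x| is outside [min_abs, max_abs], pushing them back in range.

    @staticmethod
    def forward(ctx, x: Tensor, channel_dim: int, max_factor: float,
                min_abs: float, max_abs: float) -> Tensor:
        sum_dims = [d for d in range(x.ndim) if d != channel_dim]
        mean_abs = x.abs().mean(dim=sum_dims, keepdim=True)
        # +1 where a channel is too small, -1 where too large, 0 otherwise.
        direction = (mean_abs < min_abs).to(x.dtype) - (mean_abs > max_abs).to(x.dtype)
        ctx.save_for_backward(direction, x.sign())
        ctx.max_factor = max_factor
        return x

    @staticmethod
    def backward(ctx, x_grad: Tensor):
        direction, sign = ctx.saved_tensors
        # Modify at most max_factor of the gradient magnitude; subtracting
        # delta acts like the gradient of a penalty on |x|, shrinking
        # too-large channels toward zero and growing too-small ones.
        delta = ctx.max_factor * x_grad.abs() * direction * sign
        return x_grad - delta, None, None, None, None

# Usage with the defaults from the hunk above (max_factor=0.01, min_abs=0.2,
# max_abs=100.0), channels on dim 1:
#   y = DerivBalancerSketch.apply(x, 1, 0.01, 0.2, 100.0)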
View File

@@ -847,6 +847,22 @@ class ConvolutionModule(nn.Module):
             padding=0,
             bias=bias,
         )
+
+        # After pointwise_conv1 we put x through a gated linear unit (nn.functional.glu).
+        # For most layers the normal rms value of channels of x seems to be in the range 1 to 4,
+        # but sometimes, for some reason, for layer 0 the rms ends up being very large,
+        # between 50 and 100 for different channels.  This will cause very peaky and
+        # sparse derivatives for the sigmoid gating function, which will tend to make
+        # the loss function not learn effectively.  (For most layers the average absolute values
+        # are in the range 0.5..9.0, and the average p(x>0), i.e. the positive proportion,
+        # at the output of pointwise_conv1 is around 0.35 to 0.45 for different
+        # layers, which likely breaks down as 0.5 for the "linear" half and
+        # 0.2 to 0.3 for the half that goes into the sigmoid.)  The idea is that if we
+        # constrain the rms values to a reasonable range via a constraint of max_abs=10.0,
+        # the module will be in a better position to start learning something, i.e. to latch
+        # onto the correct range.
+        self.deriv_balancer1 = DerivBalancer(channel_dim=1, max_abs=10.0)
+
         self.depthwise_conv = ScaledConv1d(
             channels,
             channels,
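The new comment claims that channel rms values of 50 to 100 make the sigmoid gate's derivatives peaky and sparse. This stand-alone check illustrates the claim numerically; the rms values 2 and 50 are taken from the ranges quoted in the comment, and nothing here is part of the commit itself:

import torch

for rms in (2.0, 50.0):
    x = torch.randn(1_000_000) * rms
    s = torch.sigmoid(x)
    d = s * (1 - s)  # derivative of sigmoid(x)
    print(f"rms={rms:5.1f}  mean sigmoid'={d.mean():.4f}  "
          f"frac(sigmoid' > 0.01)={(d > 0.01).float().mean():.3f}")

# Roughly: at rms 2 the mean derivative is ~0.15 and ~98% of inputs still get
# gradient through the gate; at rms 50 the mean derivative collapses to ~0.008
# and only ~7% of inputs (the few near zero) receive any gradient at all.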
@@ -857,6 +873,8 @@ class ConvolutionModule(nn.Module):
             bias=bias,
         )
+        self.deriv_balancer2 = DerivBalancer(channel_dim=1)
+
         # shape: (channels, 1), broadcasts with (batch, channel, time).
         self.activation = SwishOffset()
@@ -885,12 +903,14 @@ class ConvolutionModule(nn.Module):
         # GLU mechanism
         x = self.pointwise_conv1(x)  # (batch, 2*channels, time)
+        x = self.deriv_balancer1(x)
         x = nn.functional.glu(x, dim=1)  # (batch, channels, time)

         # 1D Depthwise Conv
         x = self.depthwise_conv(x)
-        # TODO: can have a learned scale in here, or a fixed one.
+        x = self.deriv_balancer2(x)
+
         x = self.activation(x)

         x = self.pointwise_conv2(x)  # (batch, channel, time)

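Read together, the hunks above leave the convolution module's forward computation in the following shape. This condensed sketch only restates the ordering visible in the diff; constructor details (ScaledConv1d arguments, padding, causality) live elsewhere in the file and are omitted:

def conv_module_forward(self, x):
    # x: (batch, channels, time)
    x = self.pointwise_conv1(x)            # (batch, 2*channels, time)
    x = self.deriv_balancer1(x)            # constrain rms before the gate
    x = torch.nn.functional.glu(x, dim=1)  # (batch, channels, time)
    x = self.depthwise_conv(x)
    x = self.deriv_balancer2(x)            # replaces the old learned-scale TODO
    x = self.activation(x)                 # SwishOffset
    return self.pointwise_conv2(x)         # (batch, channels, time)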
View File

@@ -110,7 +110,7 @@ def get_parser():
     parser.add_argument(
         "--exp-dir",
         type=str,
-        default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95_noexp",
+        default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95_noexp_convderiv",
         help="""The experiment dir.
         It specifies the directory where all training related
         files, e.g., checkpoints, log, etc, are saved