Reduce max-abs limit from 1000 to 100; introduce 2 DerivBalancer modules in conv layer.

Daniel Povey 2022-03-13 23:31:08 +08:00
parent f351777e9c
commit 437e8b2083
3 changed files with 24 additions and 4 deletions


@@ -327,7 +327,7 @@ class DerivBalancerFunction(torch.autograd.Function):
max_positive: float, # e.g. 0.95
max_factor: float, # e.g. 0.01
min_abs: float, # e.g. 0.2
max_abs: float, # e.g. 1000.0
max_abs: float, # e.g. 100.0
) -> Tensor:
if x.requires_grad:
if channel_dim < 0:
@@ -547,7 +547,7 @@ class DerivBalancer(torch.nn.Module):
max_positive: float = 0.95,
max_factor: float = 0.01,
min_abs: float = 0.2,
max_abs: float = 1000.0):
max_abs: float = 100.0):
super(DerivBalancer, self).__init__()
self.channel_dim = channel_dim
self.min_positive = min_positive
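For context on what this limit does: the function only does extra work when x.requires_grad, which suggests the balancer leaves the forward output alone and instead nudges the gradient of channels whose mean absolute value drifts outside [min_abs, max_abs]. Below is a rough sketch of that idea under that assumption; the class name is hypothetical and the exact gradient formula is an illustration, not the icefall implementation.

import torch
from torch import Tensor


class SimpleDerivBalancer(torch.autograd.Function):
    """Sketch of the DerivBalancer idea (NOT the icefall code): identity in
    forward; in backward, bias the gradient so that channels whose mean
    absolute value is below min_abs are pushed to grow and channels above
    max_abs are pushed to shrink, by at most max_factor."""

    @staticmethod
    def forward(ctx, x: Tensor, channel_dim: int,
                min_abs: float, max_abs: float, max_factor: float) -> Tensor:
        if x.requires_grad:
            if channel_dim < 0:
                channel_dim += x.ndim
            dims = [d for d in range(x.ndim) if d != channel_dim]
            mean_abs = x.abs().mean(dim=dims, keepdim=True)
            # +max_factor where a channel should grow, -max_factor where it should shrink.
            factor = max_factor * ((mean_abs < min_abs).to(x.dtype)
                                   - (mean_abs > max_abs).to(x.dtype))
            ctx.save_for_backward(factor, torch.sign(x))
        return x

    @staticmethod
    def backward(ctx, grad_output: Tensor):
        factor, sign = ctx.saved_tensors
        # Shifting the gradient against sign(x) where factor > 0 makes the
        # optimizer increase |x|; the opposite happens where factor < 0.
        grad = grad_output - factor * sign * grad_output.abs()
        return grad, None, None, None, None

Applied as y = SimpleDerivBalancer.apply(x, 1, 0.2, 100.0, 0.01), this returns y identical to x in value while biasing updates toward keeping each channel's mean absolute value under 100, mirroring the tightened default above.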


@@ -847,6 +847,22 @@ class ConvolutionModule(nn.Module):
padding=0,
bias=bias,
)
# after pointwise_conv1 we put x through a gated linear unit (nn.functional.glu).
# For most layers the normal rms value of channels of x seems to be in the range 1 to 4,
# but sometimes, for some reason, for layer 0 the rms ends up being very large,
# between 50 and 100 for different channels. This will cause very peaky and
# sparse derivatives for the sigmoid gating function, which will tend to keep
# the model from learning effectively.  (For most layers the average absolute values
# are in the range 0.5..9.0, and the average p(x>0), i.e. positive proportion,
# at the output of pointwise_conv1 is around 0.35 to 0.45 for different
# layers, which likely breaks down as 0.5 for the "linear" half and
# 0.2 to 0.3 for the part that goes into the sigmoid.)  The idea is that if we
# constrain the rms values to a reasonable range via a constraint of max_abs=10.0,
# the gating will be in a better position to start learning, i.e. to latch onto
# the correct range.
self.deriv_balancer1 = DerivBalancer(channel_dim=1, max_abs=10.0)
self.depthwise_conv = ScaledConv1d(
channels,
channels,
@@ -857,6 +873,8 @@ class ConvolutionModule(nn.Module):
bias=bias,
)
self.deriv_balancer2 = DerivBalancer(channel_dim=1)
# shape: (channels, 1), broadcasts with (batch, channel, time).
self.activation = SwishOffset()
@@ -885,12 +903,14 @@ class ConvolutionModule(nn.Module):
# GLU mechanism
x = self.pointwise_conv1(x) # (batch, 2*channels, time)
x = self.deriv_balancer1(x)
x = nn.functional.glu(x, dim=1) # (batch, channels, time)
# 1D Depthwise Conv
x = self.depthwise_conv(x)
# TODO: can have a learned scale in here, or a fixed one.
x = self.deriv_balancer2(x)
x = self.activation(x)
x = self.pointwise_conv2(x) # (batch, channel, time)
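The numbers in the comment above can be sanity-checked: a sigmoid gate whose input has rms around 2 still sees a usable derivative, while an rms around 50 drives sigmoid'(x) = sigmoid(x)(1 - sigmoid(x)) to nearly zero for almost every element. A small illustrative check (synthetic values, not measurements from the model):

import torch

torch.manual_seed(0)
for rms in (2.0, 50.0):
    gate = torch.randn(100_000) * rms  # stand-in for the sigmoid half of the GLU input
    s = torch.sigmoid(gate)
    print(f"rms={rms:5.1f}  mean sigmoid derivative = {(s * (1.0 - s)).mean():.4f}")

This is the "peaky and sparse derivatives" problem that placing deriv_balancer1 before the GLU is meant to head off.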


@@ -110,7 +110,7 @@ def get_parser():
parser.add_argument(
"--exp-dir",
type=str,
default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95_noexp",
default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95_noexp_convderiv",
help="""The experiment dir.
It specifies the directory where all training related
files, e.g., checkpoints, log, etc, are saved