From f4f3d057e7d1c85001173451eaa1df2b550dfd59 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Thu, 8 Dec 2022 18:27:01 +0800
Subject: [PATCH] Cosmetic improvements to convolution module; enable more stats.

---
 .../pruned_transducer_stateless7/zipformer.py | 31 ++++++++++++-------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
index 90112896a..e22db4d34 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@@ -1579,8 +1579,11 @@ class ConvolutionModule(nn.Module):
         # kernerl_size should be a odd number for 'SAME' padding
         assert (kernel_size - 1) % 2 == 0
 
+        bottleneck_dim = channels
+
+
         self.in_proj = LinearWithAuxLoss(
-            channels, 2 * channels,
+            channels, 2 * bottleneck_dim,
             aux_grad_scale=_aux_grad_scale(), prob=_aux_grad_prob_in()
         )
 
@@ -1599,35 +1602,38 @@ class ConvolutionModule(nn.Module):
         # it will be in a better position to start learning something, i.e. to latch onto
         # the correct range.
         self.balancer1 = ActivationBalancer(
-            2 * channels, channel_dim=-1,
+            bottleneck_dim, channel_dim=-1,
             min_positive=ScheduledFloat((0.0, 0.1), (8000.0, 0.05)),
             max_positive=1.0,
             min_abs=1.5,
             max_abs=ScheduledFloat((0.0, 5.0), (8000.0, 10.0), default=1.0),
         )
 
-        self.pre_sigmoid = Identity() # before sigmoid; for diagnostics.
+        self.activation1 = Identity() # for diagnostics
+
         self.sigmoid = nn.Sigmoid()
 
+        self.activation2 = Identity() # for diagnostics
+
         self.depthwise_conv = nn.Conv1d(
-            channels,
-            channels,
+            bottleneck_dim,
+            bottleneck_dim,
             kernel_size,
             stride=1,
             padding=(kernel_size - 1) // 2,
-            groups=channels,
+            groups=bottleneck_dim,
             bias=True,
         )
 
         self.balancer2 = ActivationBalancer(
-            channels, channel_dim=1,
+            bottleneck_dim, channel_dim=1,
             min_positive=ScheduledFloat((0.0, 0.1), (8000.0, 0.05)),
             max_positive=1.0,
             min_abs=ScheduledFloat((0.0, 0.2), (20000.0, 1.0)),
             max_abs=10.0,
         )
 
-        self.activation = SwooshR()
+        self.activation3 = SwooshR()
 
         self.whiten = Whiten(num_groups=1,
                              whitening_limit=_whitening_schedule(7.5),
@@ -1635,7 +1641,7 @@ class ConvolutionModule(nn.Module):
                              grad_scale=0.01)
 
         self.out_proj = LinearWithAuxLoss(
-            channels, channels,
+            bottleneck_dim, channels,
             aux_grad_scale=_aux_grad_scale(), prob=_aux_grad_prob_out(),
             initial_scale=0.05,
         )
@@ -1658,12 +1664,13 @@ class ConvolutionModule(nn.Module):
         """
         x = self.in_proj(x)  # (time, batch, 2*channels)
 
-        x = self.balancer1(x)
         x, s = x.chunk(2, dim=-1)
 
-        s = self.pre_sigmoid(s)
+        s = self.balancer1(s)
         s = self.sigmoid(s)
+        x = self.activation1(x)  # identity.
         x = x * s
+        x = self.activation2(x)  # identity
 
         # (time, batch, channels)
 
@@ -1679,7 +1686,7 @@ class ConvolutionModule(nn.Module):
         x = self.balancer2(x)
         x = x.permute(2, 0, 1)  # (time, batch, channels)
 
-        x = self.activation(x)
+        x = self.activation3(x)
         x = self.whiten(x)  # (time, batch, channels)
 
         x = self.out_proj(x)  # (time, batch, channels)
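
Note (not part of the patch): the one functional change to the forward pass is that balancer1 now constrains only the gate s after the chunk, where it previously ran on the full 2 * channels output of in_proj; the Identity modules activation1/activation2 are pure attachment points so the diagnostics can collect activation stats before and after the gating (the "enable more stats" of the subject line). The sketch below shows just that data flow in plain PyTorch. ConvModuleSketch is a hypothetical name, and the icefall-specific pieces (LinearWithAuxLoss, ActivationBalancer, ScheduledFloat, SwooshR, Whiten) are replaced with standard stand-ins, so only the gating structure and tensor shapes match the patched module.

import torch
import torch.nn as nn


class ConvModuleSketch(nn.Module):
    """Minimal sketch of the revised ConvolutionModule's gating data flow."""

    def __init__(self, channels: int, kernel_size: int):
        super().__init__()
        # kernel_size must be odd for 'SAME' padding, as asserted in the patch.
        assert (kernel_size - 1) % 2 == 0
        bottleneck_dim = channels  # the patch keeps bottleneck_dim == channels
        self.in_proj = nn.Linear(channels, 2 * bottleneck_dim)  # stand-in for LinearWithAuxLoss
        self.sigmoid = nn.Sigmoid()
        self.depthwise_conv = nn.Conv1d(
            bottleneck_dim,
            bottleneck_dim,
            kernel_size,
            stride=1,
            padding=(kernel_size - 1) // 2,
            groups=bottleneck_dim,  # one filter per channel: depthwise
            bias=True,
        )
        self.activation3 = nn.ReLU()  # stand-in for SwooshR
        self.out_proj = nn.Linear(bottleneck_dim, channels)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (time, batch, channels)
        x = self.in_proj(x)         # (time, batch, 2 * bottleneck_dim)
        x, s = x.chunk(2, dim=-1)   # split into value x and gate s
        # balancer1 (omitted here) now constrains only the gate s, pre-sigmoid;
        # activation1/activation2 are Identity hooks for stats collection.
        s = self.sigmoid(s)
        x = x * s                   # GLU-style gating
        x = x.permute(1, 2, 0)      # (batch, channels, time) for Conv1d
        x = self.depthwise_conv(x)  # balancer2 (omitted) would follow here
        x = x.permute(2, 0, 1)      # back to (time, batch, channels)
        x = self.activation3(x)     # whiten (omitted) would follow here
        return self.out_proj(x)     # (time, batch, channels)


# Shape check:
m = ConvModuleSketch(channels=384, kernel_size=31)
y = m(torch.randn(100, 8, 384))
assert y.shape == (100, 8, 384)

Keeping bottleneck_dim as a named variable even while it equals channels presumably lets a later change shrink the post-GLU width without rewiring the depthwise conv, the balancers, or out_proj.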