diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
index ce3b08ae8..adad7ca97 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@@ -1306,7 +1306,8 @@ class AttentionSqueeze(nn.Module):
             max_factor=0.02,
             min_prob=0.1,
         )
-        self.activation = DoubleSwish()  # in bottleneck
+        self.bottleneck_activation = DoubleSwish()  # in bottleneck
+        self.activation = Identity()  # for diagnostics
 
         # the next two balancers are only to stop parameter-magnitude 'drift': we have
         # too many degrees of freedom for the scales of the various activations.
@@ -1331,11 +1332,12 @@ class AttentionSqueeze(nn.Module):
         self.out_proj = ScaledLinear(embed_dim, embed_dim, bias=False,
                                      initial_scale=0.05)
 
-        self.out_whiten = Whiten(num_groups=1,
-                                 whitening_limit=10.0,
-                                 prob=(0.025, 0.25),
-                                 grad_scale=0.01)
-
+        self.out_balancer = ActivationBalancer(
+            embed_dim, channel_dim=-1,
+            min_positive=0.45, max_positive=0.55,
+            min_abs=0.005, max_abs=2.0,
+            min_prob=0.05,
+        )
 
     def forward(self,
                 x: Tensor,
@@ -1358,7 +1360,7 @@ attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len)
         # -> (num_heads, batch_size, seq_len, head_dim)
         bottleneck = torch.matmul(attn_weights, bottleneck)
         bottleneck = self.bottleneck_balancer(bottleneck)
-        bottleneck = self.activation(bottleneck)
+        bottleneck = self.bottleneck_activation(bottleneck)
         bottleneck = bottleneck.permute(2, 1, 0, 3)  # (seq_len, batch_size, num_heads, head_dim)
         bottleneck = bottleneck.reshape(seq_len, batch_size, bottleneck_dim)
         scales = self.from_bottleneck_proj(bottleneck)
@@ -1367,8 +1369,9 @@ attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len)
         x = self.activation_balancer(x)
         scales = self.scale_balancer(scales)
         x = x * scales
+        x = self.activation(x)  # Identity only. For diagnostics.
         x = self.out_proj(x)
-        x = self.out_whiten(x)
+        x = self.out_balancer(x)
         return x
 
 
@@ -1388,11 +1391,10 @@ class FeedforwardModule(nn.Module):
         self.dropout = nn.Dropout(dropout)
         self.out_proj = ScaledLinear(feedforward_dim, embed_dim, initial_scale=0.01)
 
-        self.out_balancer = ActivationBalancer(embed_dim,
-                                               min_positive=0.4, max_positive=0.6,
-                                               min_abs=0.01, max_abs=5.0,
-                                               channel_dim=-1, min_prob=0.1)
-
+        self.out_whitener = Whiten(num_groups=1,
+                                   whitening_limit=10.0,
+                                   prob=(0.025, 0.25),
+                                   grad_scale=0.01)
 
 
     def forward(self, x: Tensor):
@@ -1401,7 +1403,7 @@ class FeedforwardModule(nn.Module):
         x = self.activation(x)
         x = self.dropout(x)
         x = self.out_proj(x)
-        x = self.out_balancer(x)
+        x = self.out_whitener(x)
         return x
 
 
@@ -1447,7 +1449,7 @@ class NonlinAttentionModule(nn.Module):
         # to have a larger mean-offset at the output for some reason.
         self.out_balancer = ActivationBalancer(
             channels, channel_dim=-1,
-            min_positive=0.4, max_positive=0.6,
+            min_positive=0.45, max_positive=0.55,
             min_abs=0.005, max_abs=1.0,
             min_prob=0.05,
         )