diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
index aff5a7704..7c7b39fe9 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@@ -1403,6 +1403,13 @@ class AttentionSqueeze(nn.Module):
             prob=_aux_grad_prob_out(),
             bias=False, initial_scale=0.05)
+        self.out_balancer = ActivationBalancer(
+            embed_dim, channel_dim=-1,
+            min_positive=0.3, max_positive=0.7,
+            min_abs=ScheduledFloat((0.0, 0.002), (8000.0, 0.02), (20000.0, 0.01)),
+        )
+
+
 
     def forward(self,
                 x: Tensor,
                 attn_weights: Tensor):
@@ -1438,6 +1445,7 @@ attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len)
         x = x * scales
         x = self.activation(x)  # Identity only.  For diagnostics.
         x = self.out_proj(x)
+        x = self.out_balancer(x)
         return x
 
 
@@ -1532,7 +1540,7 @@ class NonlinAttentionModule(nn.Module):
 
         self.balancer2 = ActivationBalancer(
             channels, channel_dim=-1,
-            min_positive=0.4, max_positive=0.5,
+            min_positive=0.3, max_positive=0.7,
             min_abs=ScheduledFloat((0.0, 0.001), (8000.0, 0.01), (20000.0, 0.005)),
         )
 
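
For context on the `min_abs` arguments above: each `ScheduledFloat` pair is read as `(batch_count, value)`, with the value interpolated linearly between breakpoints as training progresses. The sketch below illustrates that schedule semantics in standalone form; it is an assumption based on how these schedules behave in icefall's scaling.py, not the actual `ScheduledFloat` implementation.

```python
# Minimal sketch (hypothetical, not the icefall code) of the piecewise-linear
# schedule expressed by ScheduledFloat((0.0, 0.002), (8000.0, 0.02), (20000.0, 0.01)):
# each pair is (batch_count, value); values are interpolated linearly between
# adjacent breakpoints and clamped to the end values outside the range.

from typing import Sequence, Tuple


def scheduled_float(pairs: Sequence[Tuple[float, float]], batch: float) -> float:
    """Piecewise-linear interpolation over `pairs` = [(batch_i, value_i), ...],
    assumed sorted by batch_i; constant before the first and after the last
    breakpoint."""
    if batch <= pairs[0][0]:
        return pairs[0][1]
    if batch >= pairs[-1][0]:
        return pairs[-1][1]
    for (x0, y0), (x1, y1) in zip(pairs, pairs[1:]):
        if x0 <= batch <= x1:
            t = (batch - x0) / (x1 - x0)
            return y0 + t * (y1 - y0)
    raise ValueError("unreachable for sorted breakpoints")


if __name__ == "__main__":
    # The out_balancer schedule added in this diff: min_abs ramps from 0.002
    # up to 0.02 over the first 8k batches, then decays to 0.01 by batch 20k
    # and stays there.
    sched = [(0.0, 0.002), (8000.0, 0.02), (20000.0, 0.01)]
    for b in (0, 4000, 8000, 14000, 20000, 50000):
        print(b, scheduled_float(sched, b))
```

The shape of this schedule mirrors the pattern of the diff's changes: a constraint that tightens early in training (when balancing activations matters most) and then relaxes, in the same way the `min_positive=0.3, max_positive=0.7` bounds are looser than the previous `0.4`/`0.5` pair in `NonlinAttentionModule`.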