Merge branch 'scaled_adam_exp690' into scaled_adam_exp694

Author: Daniel Povey
Date:   2022-12-13 18:48:05 +08:00
Commit: 8d75006d69


@@ -1403,6 +1403,13 @@ class AttentionSqueeze(nn.Module):
                                   prob=_aux_grad_prob_out(),
                                   bias=False, initial_scale=0.05)
+        self.out_balancer = ActivationBalancer(
+            embed_dim, channel_dim=-1,
+            min_positive=0.3, max_positive=0.7,
+            min_abs=ScheduledFloat((0.0, 0.002), (8000.0, 0.02), (20000.0, 0.01)),
+        )
     def forward(self,
                 x: Tensor,
                 attn_weights: Tensor):
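The min_abs argument added here is a ScheduledFloat with breakpoints (0.0, 0.002), (8000.0, 0.02), (20000.0, 0.01). A minimal sketch of how such a schedule could behave, assuming piecewise-linear interpolation between (step, value) breakpoints; the helper name scheduled_value is illustrative, not icefall's API:

def scheduled_value(step: float, points) -> float:
    # points: (step, value) pairs sorted by step; clamp outside the range,
    # interpolate linearly inside it.
    if step <= points[0][0]:
        return points[0][1]
    if step >= points[-1][0]:
        return points[-1][1]
    for (x0, y0), (x1, y1) in zip(points, points[1:]):
        if x0 <= step <= x1:
            t = (step - x0) / (x1 - x0)
            return y0 + t * (y1 - y0)

# The schedule above: min_abs starts at 0.002, rises to 0.02 by step 8000,
# then relaxes to 0.01 by step 20000.
min_abs_points = [(0.0, 0.002), (8000.0, 0.02), (20000.0, 0.01)]
print(scheduled_value(4000.0, min_abs_points))   # ~0.011
print(scheduled_value(14000.0, min_abs_points))  # ~0.015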
@@ -1438,6 +1445,7 @@ attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len)
         x = x * scales
         x = self.activation(x)  # Identity only. For diagnostics.
         x = self.out_proj(x)
+        x = self.out_balancer(x)
         return x
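A minimal sketch of the updated forward ordering, with placeholder modules standing in for the real ones (an ActivationBalancer is typically identity-like in the forward pass and only adjusts gradients in backward, so nn.Identity is used here purely for illustration; this is not the actual AttentionSqueeze implementation):

import torch
import torch.nn as nn

class AttentionSqueezeTailSketch(nn.Module):
    # Illustrative only: shows where the new out_balancer sits in forward().
    def __init__(self, embed_dim: int):
        super().__init__()
        self.activation = nn.Identity()                  # identity, kept for diagnostics
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.out_balancer = nn.Identity()                # stand-in for ActivationBalancer

    def forward(self, x: torch.Tensor, scales: torch.Tensor) -> torch.Tensor:
        x = x * scales
        x = self.activation(x)
        x = self.out_proj(x)
        x = self.out_balancer(x)                         # new: balance the output activations
        return x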
@@ -1532,7 +1540,7 @@ class NonlinAttentionModule(nn.Module):
         self.balancer2 = ActivationBalancer(
             channels, channel_dim=-1,
-            min_positive=0.4, max_positive=0.5,
+            min_positive=0.3, max_positive=0.7,
             min_abs=ScheduledFloat((0.0, 0.001), (8000.0, 0.01), (20000.0, 0.005)),
         )
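This hunk widens balancer2's allowed range for the per-channel fraction of positive activations from (0.4, 0.5) to (0.3, 0.7). A sketch of what that statistic measures, assuming an ActivationBalancer-style module flags channels whose positive fraction falls outside the bounds (the real module acts on gradients in the backward pass; this only computes the statistic):

import torch

def positive_fraction_out_of_bounds(x: torch.Tensor,
                                    min_positive: float,
                                    max_positive: float,
                                    channel_dim: int = -1) -> torch.Tensor:
    # Fraction of positive values per channel, averaged over all other dims.
    channel_dim = channel_dim % x.dim()
    other_dims = tuple(d for d in range(x.dim()) if d != channel_dim)
    pos_frac = (x > 0).float().mean(dim=other_dims)
    return (pos_frac < min_positive) | (pos_frac > max_positive)

x = torch.randn(16, 100, 256)  # (batch, time, channels)
# The old bounds (0.4, 0.5) flag far more channels than the new (0.3, 0.7).
print(positive_fraction_out_of_bounds(x, 0.4, 0.5).float().mean().item())
print(positive_fraction_out_of_bounds(x, 0.3, 0.7).float().mean().item())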