Add hard limit of attention weights to +- 50

Daniel Povey 2022-10-20 14:27:55 +08:00
parent c3c655d0bd
commit 4565d43d5c
2 changed files with 40 additions and 3 deletions

@@ -34,7 +34,8 @@ from scaling import (
     Whiten,
     Identity,
     _diag,
-    random_clamp
+    random_clamp,
+    with_loss,
 )
 from torch import Tensor, nn
@@ -1110,16 +1111,37 @@ class RelPositionMultiheadAttention(nn.Module):
                                        storage_offset=pos_weights.stride(3) * (seq_len - 1))
 
+        # caution: they are really scores at this point.
         attn_output_weights = torch.matmul(q, k) + pos_weights
+
+        # The following is a soft way of encouraging the attention scores to not be too large;
+        # in training time, once they get outside a certain range (currently -5.0..5.0), we
+        # randomly either leave them as-is or truncate them to that range.
         if attn_weights_max is not None:
             attn_output_weights = random_clamp(attn_output_weights,
                                                min=-attn_weights_max,
                                                max=attn_weights_max,
                                                prob=0.5)
 
-        # attn_output_weights: (batch, head, time1, time2)
+        if training and random.random() < 0.1:
+            # This is a harder way of limiting the attention scores to not be too large.
+            # It incurs a penalty if any of them has an absolute value greater than 50.0.
+            # This should be outside the normal range of the attention scores.  We use
+            # this mechanism instead of, say, a limit on entropy, because once the entropy
+            # gets very small, gradients through the softmax can become very small, and
+            # some mechanisms like that become ineffective.
+            attn_weights_limit = 50.0
+            # caution: this penalty will be affected by grad-scaling in amp.
+            # It's OK; this is just an emergency brake, and under normal
+            # conditions it shouldn't be active.
+            attn_weights_penalty = 1.0e-04
+            aux_loss = attn_weights_penalty * (attn_output_weights.abs() -
+                                               attn_weights_limit).relu()
+            attn_output_weights = with_loss(attn_output_weights,
+                                            aux_loss)
+
+        # attn_output_weights: (batch, head, time1, time2)
         attn_output_weights = attn_output_weights.view(
             bsz * num_heads, seq_len, seq_len
        )
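
For intuition, here is a minimal standalone sketch (not part of the commit; the scores tensor is a made-up example) of how the hinge-style penalty behaves: it is exactly zero while |score| <= 50, and beyond that it grows linearly, so only out-of-range scores receive any extra gradient.

import torch

# Made-up attention scores; 60.0 and -75.0 are outside the +-50 range.
scores = torch.tensor([10.0, 49.0, 60.0, -75.0], requires_grad=True)
limit = 50.0
penalty_scale = 1.0e-04

# Hinge penalty: zero for |score| <= limit, linear beyond it.
aux_loss = penalty_scale * (scores.abs() - limit).relu()
aux_loss.sum().backward()

print(aux_loss)     # approx. [0, 0, 1e-3, 2.5e-3]
print(scores.grad)  # approx. [0, 0, 1e-4, -1e-4]; only out-of-range scores get gradient

Because the penalty scale is tiny (1e-4) and the threshold is far outside the range enforced by random_clamp, this acts purely as an emergency brake rather than a regularizer that shapes normal training.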

@@ -608,6 +608,22 @@ class Whiten(nn.Module):
                                  self.whitening_limit,
                                  self.grad_scale)
 
+
+class WithLoss(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x: Tensor, y: Tensor):
+        ctx.y_shape = y.shape
+        return x
+
+    @staticmethod
+    def backward(ctx, ans_grad: Tensor):
+        return ans_grad, torch.ones(ctx.y_shape,
+                                    dtype=ans_grad.dtype,
+                                    device=ans_grad.device)
+
+
+def with_loss(x, y):
+    # returns x but adds y.sum() to the loss function.
+    return WithLoss.apply(x, y)
+
 def _no_op(x: Tensor) -> Tensor:
     if (torch.jit.is_scripting()):
         return x
@@ -617,7 +633,6 @@ def _no_op(x: Tensor) -> Tensor:
         return x.chunk(1, dim=-1)[0]
 
-
 class Identity(torch.nn.Module):
     def __init__(self):
         super(Identity, self).__init__()
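
A small usage sketch of the new helper (again not part of the commit; it assumes scaling.py is importable from the working directory, and the tensors are made up): with_loss returns x unchanged in the forward pass, while in the backward pass y receives a gradient of all ones, which is exactly what adding y.sum() to the training loss would produce.

import torch
from scaling import with_loss  # the helper added in this commit

x = torch.randn(2, 3, requires_grad=True)
y = 0.1 * x.abs()        # some auxiliary penalty computed from x

out = with_loss(x, y)    # forward: out is numerically identical to x
out.sum().backward()

# x.grad is the gradient of out.sum() (all ones) plus the gradient that
# y.sum() would contribute: d/dx [x.sum() + 0.1 * |x|.sum()] = 1 + 0.1 * sign(x).
print(torch.allclose(x.grad, 1.0 + 0.1 * torch.sign(x)))  # True

This keeps the attention code simple: the penalty never has to be threaded back to the top-level loss computation, because autograd delivers its gradient through the same tensor that carries the attention scores.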