Revert scaling, scale only grad.

2025-12-11 06:55:27 +00:00 · 2022-12-05 17:53:23 +08:00 · 2022-12-05 17:53:23 +08:00 · 178eca1c0e
commit 178eca1c0e
parent b93cf0676a
2 changed files with 24 additions and 6 deletions
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
@ -900,6 +900,26 @@ def with_loss(x, y):
    return WithLoss.apply(x, y)
 class ScaleGradFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x: Tensor, alpha: float) -> Tensor:
        ctx.alpha = alpha
        return x
    @staticmethod
    def backward(ctx, grad: Tensor):
        return grad * ctx.alpha, None
 def scale_grad(x: Tensor, alpha: float):
    return ScaleGradFunction.apply(x, alpha)
 class ScaleGrad(nn.Module):
    def __init__(self, alpha: float):
        super().__init__()
        self.alpha = alpha
    def forward(self, x: Tensor) -> Tensor:
        return scale_grad(x, self.alpha)
 class LimitParamValue(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x: Tensor, min: float, max: float):
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@ -43,6 +43,7 @@ from scaling import (
    ScheduledFloat,
    FloatLike,
    limit_param_value,
    ScaleGrad,
 )
 from torch import Tensor, nn
@ -1719,25 +1720,22 @@ class Conv2dSubsampling(nn.Module):
        # a too-large gradient).
        self.conv = nn.Sequential(
-            ScalarMultiply(0.1),
+            nn.Conv2d(
            ScaledConv2d(
                in_channels=1,
                out_channels=layer1_channels,
                kernel_size=3,
                padding=(0, 1),  # (time, freq)
                initial_scale=5.0,
            ),
-            ScalarMultiply(0.25),
+            ScaleGrad(0.1),
            ActivationBalancer(layer1_channels,
                               channel_dim=1),
            DoubleSwish(),
-            ScaledConv2d(
+            nn.Conv2d(
                in_channels=layer1_channels,
                out_channels=layer2_channels,
                kernel_size=3,
                stride=2,
                padding=0,
                initial_scale=5.0,
            ),
            ActivationBalancer(layer2_channels,
                               channel_dim=1),