Remove ExpScale in feedforward layers.

2025-08-09 01:52:41 +00:00 · 2022-03-13 17:29:39 +08:00 · 2022-03-13 17:29:39 +08:00 · f351777e9c
commit f351777e9c
parent 97c0bb82d3
3 changed files with 13 additions and 8 deletions
--- a/egs/librispeech/ASR/conformer_ctc/subsampling.py
+++ b/egs/librispeech/ASR/conformer_ctc/subsampling.py
@ -565,8 +565,13 @@ class DerivBalancer(torch.nn.Module):

 class DoubleSwish(torch.nn.Module):
    def forward(self, x: Tensor) -> Tensor:
-        """Return Swich activation function."""
-        return x * torch.sigmoid(x - 1.0)
+        """Return double-swish activation function which is an approximation to Swish(Swish(x)),
+           that we approximate closely with x * sigmoid(x-1), expressed for more memory-efficient
+           backprop as (x-1) * torch.sigmoid(x - 1) + torch.sigmoid(x - 1)
+        """
+        x1 = x - 1.0
+        s = torch.sigmoid(x1)
+        return (x1 * s) + s  # (x-1) * s + s == x * s


 def _test_exp_scale_swish():
@ -581,10 +586,10 @@ def _test_exp_scale_swish():

    y1 = m1(x1)
    y2 = m2(x2)
-    assert torch.allclose(y1, y2)
+    assert torch.allclose(y1, y2, atol=1e-05)
    y1.sum().backward()
    y2.sum().backward()
-    assert torch.allclose(x1.grad, x2.grad)
+    assert torch.allclose(x1.grad, x2.grad, atol=1e-05)

 def _test_exp_scale_relu():

--- a/egs/librispeech/ASR/transducer_stateless/conformer.py
+++ b/egs/librispeech/ASR/transducer_stateless/conformer.py
@ -19,7 +19,7 @@ import copy
 import math
 import warnings
 from typing import Optional, Tuple, Sequence
-from subsampling import PeLU, ExpScale, SwishExpScale, ExpScaleRelu, DerivBalancer, BasicNorm, ScaledLinear, ScaledConv1d, ScaledConv2d
+from subsampling import PeLU, ExpScale, DoubleSwish, SwishExpScale, ExpScaleRelu, DerivBalancer, BasicNorm, ScaledLinear, ScaledConv1d, ScaledConv2d

 import torch
 from torch import Tensor, nn
@ -159,7 +159,7 @@ class ConformerEncoderLayer(nn.Module):
        self.feed_forward = nn.Sequential(
            ScaledLinear(d_model, dim_feedforward),
            DerivBalancer(channel_dim=-1),
-            SwishExpScale(dim_feedforward, speed=20.0),
+            DoubleSwish(),
            nn.Dropout(dropout),
            ScaledLinear(dim_feedforward, d_model, initial_scale=0.25),
        )
@ -167,7 +167,7 @@ class ConformerEncoderLayer(nn.Module):
        self.feed_forward_macaron = nn.Sequential(
            ScaledLinear(d_model, dim_feedforward),
            DerivBalancer(channel_dim=-1),
-            SwishExpScale(dim_feedforward, speed=20.0),
+            DoubleSwish(),
            nn.Dropout(dropout),
            ScaledLinear(dim_feedforward, d_model, initial_scale=0.25),
        )
--- a/egs/librispeech/ASR/transducer_stateless/train.py
+++ b/egs/librispeech/ASR/transducer_stateless/train.py
@ -110,7 +110,7 @@ def get_parser():
    parser.add_argument(
        "--exp-dir",
        type=str,
-        default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95",
+        default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95_noexp",
        help="""The experiment dir.
        It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved