diff --git a/egs/librispeech/ASR/conformer_ctc/subsampling.py b/egs/librispeech/ASR/conformer_ctc/subsampling.py
index 8d01d8fc0..04481aa5b 100644
--- a/egs/librispeech/ASR/conformer_ctc/subsampling.py
+++ b/egs/librispeech/ASR/conformer_ctc/subsampling.py
@@ -565,8 +565,13 @@ class DerivBalancer(torch.nn.Module):
 
 class DoubleSwish(torch.nn.Module):
     def forward(self, x: Tensor) -> Tensor:
-        """Return Swich activation function."""
-        return x * torch.sigmoid(x - 1.0)
+        """Return the double-swish activation, x * sigmoid(x - 1), which closely approximates
+           Swish(Swish(x)).  It is evaluated as (x - 1) * sigmoid(x - 1) + sigmoid(x - 1), an
+           algebraically identical form that is intended to make backprop more memory-efficient.
+        """
+        x1 = x - 1.0
+        s = torch.sigmoid(x1)
+        return (x1 * s) + s  # (x-1) * s + s == x * s
 
 
 def _test_exp_scale_swish():
@@ -581,10 +586,10 @@ def _test_exp_scale_swish():
 
     y1 = m1(x1)
     y2 = m2(x2)
-    assert torch.allclose(y1, y2)
+    assert torch.allclose(y1, y2, atol=1e-05)
     y1.sum().backward()
     y2.sum().backward()
-    assert torch.allclose(x1.grad, x2.grad)
+    assert torch.allclose(x1.grad, x2.grad, atol=1e-05)
 
 def _test_exp_scale_relu():
 
diff --git a/egs/librispeech/ASR/transducer_stateless/conformer.py b/egs/librispeech/ASR/transducer_stateless/conformer.py
index 3516c2205..e6466d8e6 100644
--- a/egs/librispeech/ASR/transducer_stateless/conformer.py
+++ b/egs/librispeech/ASR/transducer_stateless/conformer.py
@@ -19,7 +19,7 @@ import copy
 import math
 import warnings
 from typing import Optional, Tuple, Sequence
-from subsampling import PeLU, ExpScale, SwishExpScale, ExpScaleRelu, DerivBalancer, BasicNorm, ScaledLinear, ScaledConv1d, ScaledConv2d
+from subsampling import PeLU, ExpScale, DoubleSwish, SwishExpScale, ExpScaleRelu, DerivBalancer, BasicNorm, ScaledLinear, ScaledConv1d, ScaledConv2d
 
 import torch
 from torch import Tensor, nn
@@ -159,7 +159,7 @@ class ConformerEncoderLayer(nn.Module):
         self.feed_forward = nn.Sequential(
             ScaledLinear(d_model, dim_feedforward),
             DerivBalancer(channel_dim=-1),
-            SwishExpScale(dim_feedforward, speed=20.0),
+            DoubleSwish(),
             nn.Dropout(dropout),
             ScaledLinear(dim_feedforward, d_model, initial_scale=0.25),
         )
@@ -167,7 +167,7 @@ class ConformerEncoderLayer(nn.Module):
         self.feed_forward_macaron = nn.Sequential(
             ScaledLinear(d_model, dim_feedforward),
             DerivBalancer(channel_dim=-1),
-            SwishExpScale(dim_feedforward, speed=20.0),
+            DoubleSwish(),
             nn.Dropout(dropout),
             ScaledLinear(dim_feedforward, d_model, initial_scale=0.25),
         )
diff --git a/egs/librispeech/ASR/transducer_stateless/train.py b/egs/librispeech/ASR/transducer_stateless/train.py
index 897cf5411..994b89e49 100755
--- a/egs/librispeech/ASR/transducer_stateless/train.py
+++ b/egs/librispeech/ASR/transducer_stateless/train.py
@@ -110,7 +110,7 @@ def get_parser():
     parser.add_argument(
         "--exp-dir",
         type=str,
-        default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95",
+        default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95_noexp",
         help="""The experiment dir.
         It specifies the directory where all training related
         files, e.g., checkpoints, log, etc, are saved