diff --git a/egs/librispeech/ASR/conformer_ctc/subsampling.py b/egs/librispeech/ASR/conformer_ctc/subsampling.py index 8d01d8fc0..04481aa5b 100644 --- a/egs/librispeech/ASR/conformer_ctc/subsampling.py +++ b/egs/librispeech/ASR/conformer_ctc/subsampling.py @@ -565,8 +565,13 @@ class DerivBalancer(torch.nn.Module): class DoubleSwish(torch.nn.Module): def forward(self, x: Tensor) -> Tensor: - """Return Swich activation function.""" - return x * torch.sigmoid(x - 1.0) + """Return the double-swish activation function, an approximation to Swish(Swish(x)) + that we compute as x * sigmoid(x-1), expressed for more memory-efficient + backprop as (x-1) * torch.sigmoid(x - 1) + torch.sigmoid(x - 1) + """ + x1 = x - 1.0 + s = torch.sigmoid(x1) + return (x1 * s) + s # (x-1) * s + s == x * s def _test_exp_scale_swish(): @@ -581,10 +586,10 @@ def _test_exp_scale_swish(): y1 = m1(x1) y2 = m2(x2) - assert torch.allclose(y1, y2) + assert torch.allclose(y1, y2, atol=1e-05) y1.sum().backward() y2.sum().backward() - assert torch.allclose(x1.grad, x2.grad) + assert torch.allclose(x1.grad, x2.grad, atol=1e-05) def _test_exp_scale_relu(): diff --git a/egs/librispeech/ASR/transducer_stateless/conformer.py b/egs/librispeech/ASR/transducer_stateless/conformer.py index 3516c2205..e6466d8e6 100644 --- a/egs/librispeech/ASR/transducer_stateless/conformer.py +++ b/egs/librispeech/ASR/transducer_stateless/conformer.py @@ -19,7 +19,7 @@ import copy import math import warnings from typing import Optional, Tuple, Sequence -from subsampling import PeLU, ExpScale, SwishExpScale, ExpScaleRelu, DerivBalancer, BasicNorm, ScaledLinear, ScaledConv1d, ScaledConv2d +from subsampling import PeLU, ExpScale, DoubleSwish, SwishExpScale, ExpScaleRelu, DerivBalancer, BasicNorm, ScaledLinear, ScaledConv1d, ScaledConv2d import torch from torch import Tensor, nn @@ -159,7 +159,7 @@ class ConformerEncoderLayer(nn.Module): self.feed_forward = nn.Sequential( ScaledLinear(d_model, dim_feedforward), 
DerivBalancer(channel_dim=-1), - SwishExpScale(dim_feedforward, speed=20.0), + DoubleSwish(), nn.Dropout(dropout), ScaledLinear(dim_feedforward, d_model, initial_scale=0.25), ) @@ -167,7 +167,7 @@ class ConformerEncoderLayer(nn.Module): self.feed_forward_macaron = nn.Sequential( ScaledLinear(d_model, dim_feedforward), DerivBalancer(channel_dim=-1), - SwishExpScale(dim_feedforward, speed=20.0), + DoubleSwish(), nn.Dropout(dropout), ScaledLinear(dim_feedforward, d_model, initial_scale=0.25), ) diff --git a/egs/librispeech/ASR/transducer_stateless/train.py b/egs/librispeech/ASR/transducer_stateless/train.py index 897cf5411..994b89e49 100755 --- a/egs/librispeech/ASR/transducer_stateless/train.py +++ b/egs/librispeech/ASR/transducer_stateless/train.py @@ -110,7 +110,7 @@ def get_parser(): parser.add_argument( "--exp-dir", type=str, - default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95", + default="transducer_stateless/randcombine1_expscale3_rework2c_maxabs1000_maxp0.95_noexp", help="""The experiment dir. It specifies the directory where all training related files, e.g., checkpoints, log, etc, are saved