From 5eafccb36942b3da42024c48b1af237ef1f613ec Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 11 Mar 2022 17:46:33 +0800 Subject: [PATCH] Change how scales are applied; fix residual bug --- .../ASR/transducer_stateless/conformer.py | 17 +++++++++++++---- .../ASR/transducer_stateless/train.py | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/egs/librispeech/ASR/transducer_stateless/conformer.py b/egs/librispeech/ASR/transducer_stateless/conformer.py index 051512969..2c602bbea 100644 --- a/egs/librispeech/ASR/transducer_stateless/conformer.py +++ b/egs/librispeech/ASR/transducer_stateless/conformer.py @@ -229,10 +229,17 @@ class ConformerEncoderLayer(nn.Module): attn_mask=src_mask, key_padding_mask=src_key_padding_mask, )[0] - src = residual + self.dropout(src_att) + # natural rms scale of mha output is about 2 to 6. scaling down by 0.1 takes it + # to 0.2 to 0.6, which is suitable to add to the inputs assuming the output + # of the previous convolution layer had a magnitude of around 1.0 + # (this magnitude of 1.0, or a bit less, like 0.3, is learned but is + # dictated by considerations of what is done to the output of the + # encoder. + post_scale_mha = 0.1 + src = residual + post_scale_mha * self.dropout(src_att) # convolution module - src = residual + self.dropout(self.conv_module(self.scale_conv(src))) + src = src + self.dropout(self.conv_module(self.scale_conv(src))) # feed forward module src = src + self.dropout(self.feed_forward(self.scale_ff(src))) @@ -891,13 +898,15 @@ class ConvolutionModule(nn.Module): # 1D Depthwise Conv x = self.depthwise_conv(x) + + # TODO: can have a learned scale in here, or a fixed one. + x = self.activation(x) + # x is (batch, channels, time) x = x.permute(0, 2, 1) x = self.scale(x) x = x.permute(0, 2, 1) - x = self.activation(x) - x = self.pointwise_conv2(x) # (batch, channel, time) return x.permute(2, 0, 1) diff --git a/egs/librispeech/ASR/transducer_stateless/train.py b/egs/librispeech/ASR/transducer_stateless/train.py index 5d6d72490..b5e9e846f 100755 --- a/egs/librispeech/ASR/transducer_stateless/train.py +++ b/egs/librispeech/ASR/transducer_stateless/train.py @@ -110,7 +110,7 @@ def get_parser(): parser.add_argument( "--exp-dir", type=str, - default="transducer_stateless/randcombine1_expscale3_brelu2swish2_0.1_bnorm2ma0.5_pbs_cinit", + default="transducer_stateless/randcombine1_expscale3_rework", help="""The experiment dir. It specifies the directory where all training related files, e.g., checkpoints, log, etc, are saved