From db7a3b6eea34e532240dae3409c6d64e8eab9806 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Sat, 12 Mar 2022 18:50:02 +0800
Subject: [PATCH] Reduce initial_scale.

---
 egs/librispeech/ASR/transducer_stateless/conformer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/egs/librispeech/ASR/transducer_stateless/conformer.py b/egs/librispeech/ASR/transducer_stateless/conformer.py
index a270cd8ae..9dd6bae4d 100644
--- a/egs/librispeech/ASR/transducer_stateless/conformer.py
+++ b/egs/librispeech/ASR/transducer_stateless/conformer.py
@@ -421,7 +421,7 @@ class RelPositionMultiheadAttention(nn.Module):
         ), "embed_dim must be divisible by num_heads"
 
         self.in_proj = ScaledLinear(embed_dim, 3 * embed_dim, bias=True)
-        self.out_proj = ScaledLinear(embed_dim, embed_dim, bias=True)
+        self.out_proj = ScaledLinear(embed_dim, embed_dim, bias=True, initial_scale=0.25)
 
         # linear transformation for positional encoding.
         self.linear_pos = ScaledLinear(embed_dim, embed_dim, bias=False)
@@ -869,7 +869,7 @@ class ConvolutionModule(nn.Module):
             stride=1,
             padding=0,
             bias=bias,
-            initial_scale=0.5
+            initial_scale=0.25
         )
 
     def forward(self, x: Tensor) -> Tensor:
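
For context, a minimal sketch of what a ScaledLinear-style layer and its initial_scale parameter look like. This assumes the general shape of icefall's implementation (a Linear whose weight and bias are multiplied by learned scales stored in log space, so initial_scale sets the starting multiplier on the layer's output); details such as initial_speed and initialization tweaks are omitted, and ScaledLinearSketch is an illustrative name, not icefall's exact code.

import torch
import torch.nn as nn
import torch.nn.functional as F


class ScaledLinearSketch(nn.Linear):
    def __init__(self, *args, initial_scale: float = 1.0, **kwargs):
        super().__init__(*args, **kwargs)
        # Store scales in log space; exp() keeps the effective scale positive
        # and lets it move multiplicatively during training.
        log_scale = torch.tensor(initial_scale).log()
        self.weight_scale = nn.Parameter(log_scale.clone().detach())
        if self.bias is not None:
            self.bias_scale = nn.Parameter(log_scale.clone().detach())

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Effective weight/bias are the raw parameters times the learned scale.
        weight = self.weight * self.weight_scale.exp()
        bias = None if self.bias is None else self.bias * self.bias_scale.exp()
        return F.linear(x, weight, bias)


# With initial_scale=0.25, as in this patch, the module's output starts at a
# quarter of its default magnitude, shrinking the sub-module's initial
# contribution to the residual stream; the scale remains learnable.
proj = ScaledLinearSketch(256, 256, bias=True, initial_scale=0.25)
y = proj(torch.randn(10, 256))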