Reduce initial_scale.

Daniel Povey 2022-03-12 18:50:02 +08:00
parent b7b2d8970b
commit db7a3b6eea


@@ -421,7 +421,7 @@ class RelPositionMultiheadAttention(nn.Module):
         ), "embed_dim must be divisible by num_heads"
         self.in_proj = ScaledLinear(embed_dim, 3 * embed_dim, bias=True)
-        self.out_proj = ScaledLinear(embed_dim, embed_dim, bias=True)
+        self.out_proj = ScaledLinear(embed_dim, embed_dim, bias=True, initial_scale=0.25)
         # linear transformation for positional encoding.
         self.linear_pos = ScaledLinear(embed_dim, embed_dim, bias=False)
@@ -869,7 +869,7 @@ class ConvolutionModule(nn.Module):
             stride=1,
             padding=0,
             bias=bias,
-            initial_scale=0.5
+            initial_scale=0.25
         )

     def forward(self, x: Tensor) -> Tensor:
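
For readers unfamiliar with ScaledLinear: it is one of icefall's scaled-module wrappers, and initial_scale controls how large the layer's output is at initialization. The following is a minimal sketch of that idea; ToyScaledLinear, its parameter names, and the exact scaling scheme are illustrative assumptions, not the project's actual implementation.

    # Hypothetical sketch of an initial_scale-style layer; the real
    # icefall ScaledLinear may differ in details.
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class ToyScaledLinear(nn.Linear):
        """nn.Linear with a learnable log-scale on weight and bias.

        initial_scale sets the starting value of that scale, so
        initial_scale=0.25 makes the layer's output 4x smaller at
        initialization without reshaping the weight distribution.
        """

        def __init__(self, *args, initial_scale: float = 1.0, **kwargs):
            super().__init__(*args, **kwargs)
            log_scale = torch.tensor(float(initial_scale)).log()
            self.weight_scale = nn.Parameter(log_scale.clone())
            if self.bias is not None:
                self.bias_scale = nn.Parameter(log_scale.clone())

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            weight = self.weight * self.weight_scale.exp()
            bias = None if self.bias is None else self.bias * self.bias_scale.exp()
            return F.linear(x, weight, bias)

    # e.g. out_proj = ToyScaledLinear(256, 256, bias=True, initial_scale=0.25)

Under that reading, lowering initial_scale to 0.25 shrinks the initial output of the attention's out_proj and of the convolution module's final pointwise convolution, which plausibly damps the residual branches' contribution early in training.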