From f6f088489d24b1511f1218f08217123e19ccb985 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Thu, 5 Jan 2023 23:49:42 +0800
Subject: [PATCH] Adjust lr_scales, make them closer to 1.

---
 .../ASR/pruned_transducer_stateless7/zipformer.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
index 75cf6dac7..24502efb5 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@@ -209,7 +209,7 @@ class Zipformer(EncoderInterface):
             )
             # we are adding a new attribute here.
             # this will be interpreted by get_named_parameter_groups_with_lrs().
-            encoder.lr_scale = downsampling_factor[i] ** -0.333
+            encoder.lr_scale = downsampling_factor[i] ** -0.25
             encoders.append(encoder)
         self.encoders = nn.ModuleList(encoders)
 
@@ -1086,7 +1086,7 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
             (4000.0, 0.0))
     ) -> None:
         super().__init__()
-        self.lr_scale = 0.75
+        self.lr_scale = 0.9
         self.embed_dim = embed_dim
         self.num_heads = num_heads
         self.query_head_dim = query_head_dim
@@ -1338,7 +1338,7 @@ class AttentionSqueeze(nn.Module):
                  bottleneck_dim: int = 16):
         super().__init__()
 
-        self.lr_scale = 0.5
+        self.lr_scale = 0.9
 
         self.bottleneck_dim = bottleneck_dim
 
@@ -1480,7 +1480,7 @@ class NonlinAttention(nn.Module):
     ) -> None:
         super().__init__()
 
-        self.lr_scale = 0.75
+        self.lr_scale = 0.9
 
         self.hidden_channels = hidden_channels
 
@@ -1585,6 +1585,9 @@ class ConvolutionModule(nn.Module):
         self.in_proj = nn.Linear(
             channels, 2 * bottleneck_dim,
         )
+        # the gradients on in_proj are a little noisy, likely to do with the
+        # sigmoid in glu.
+        self.in_proj.lr_scale = 0.9
 
         # after in_proj we put x through a gated linear unit (nn.functional.glu).
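
Note (not part of the patch): the comment in the first hunk says the new lr_scale attribute "will be interpreted by get_named_parameter_groups_with_lrs()". The sketch below only illustrates how such a helper could turn per-module lr_scale attributes into optimizer parameter groups; the actual icefall function's signature, traversal order, and handling of nested scales are not reproduced here and may differ.

import torch
import torch.nn as nn

def get_named_parameter_groups_with_lrs(model: nn.Module, lr: float):
    # Sketch only: group parameters by the lr_scale attribute (default 1.0)
    # of the module that directly owns them, and scale the base learning
    # rate of each group accordingly.
    groups = {}
    for module in model.modules():
        scale = getattr(module, "lr_scale", 1.0)
        for param in module.parameters(recurse=False):
            groups.setdefault(scale, []).append(param)
    return [{"params": params, "lr": lr * scale} for scale, params in groups.items()]

# Usage sketch (hypothetical values):
#   optimizer = torch.optim.Adam(get_named_parameter_groups_with_lrs(model, lr=3e-4))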