From f6f088489d24b1511f1218f08217123e19ccb985 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Thu, 5 Jan 2023 23:49:42 +0800
Subject: [PATCH] Adjust lr_scales, make them closer to 1.

---
 .../ASR/pruned_transducer_stateless7/zipformer.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
index 75cf6dac7..24502efb5 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@@ -209,7 +209,7 @@ class Zipformer(EncoderInterface):
             )
             # we are adding a new attribute here.
             # this will be interpreted by get_named_parameter_groups_with_lrs().
-            encoder.lr_scale = downsampling_factor[i] ** -0.333
+            encoder.lr_scale = downsampling_factor[i] ** -0.25
             encoders.append(encoder)
         self.encoders = nn.ModuleList(encoders)
 
@@ -1086,7 +1086,7 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
             (4000.0, 0.0))
     ) -> None:
         super().__init__()
-        self.lr_scale = 0.75
+        self.lr_scale = 0.9
         self.embed_dim = embed_dim
         self.num_heads = num_heads
         self.query_head_dim = query_head_dim
@@ -1338,7 +1338,7 @@ class AttentionSqueeze(nn.Module):
                  bottleneck_dim: int = 16):
         super().__init__()
 
-        self.lr_scale = 0.5
+        self.lr_scale = 0.9
 
         self.bottleneck_dim = bottleneck_dim
 
@@ -1480,7 +1480,7 @@ class NonlinAttention(nn.Module):
     ) -> None:
         super().__init__()
 
-        self.lr_scale = 0.75
+        self.lr_scale = 0.9
 
         self.hidden_channels = hidden_channels
 
@@ -1585,6 +1585,9 @@ class ConvolutionModule(nn.Module):
         self.in_proj = nn.Linear(
             channels, 2 * bottleneck_dim,
         )
+        # the gradients on in_proj are a little noisy, likely to do with the
+        # sigmoid in glu.
+        self.in_proj.lr_scale = 0.9
 
         # after in_proj we put x through a gated linear unit (nn.functional.glu).
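
Note (not part of the patch): the comment in the first hunk says the new lr_scale attribute "will be interpreted by get_named_parameter_groups_with_lrs()". The sketch below only illustrates how such a helper could turn per-module lr_scale attributes into optimizer parameter groups; the actual icefall function's signature, traversal order, and handling of nested scales are not reproduced here and may differ.

import torch
import torch.nn as nn

def get_named_parameter_groups_with_lrs(model: nn.Module, lr: float):
    # Sketch only: group parameters by the lr_scale attribute (default 1.0)
    # of the module that directly owns them, and scale the base learning
    # rate of each group accordingly.
    groups = {}
    for module in model.modules():
        scale = getattr(module, "lr_scale", 1.0)
        for param in module.parameters(recurse=False):
            groups.setdefault(scale, []).append(param)
    return [{"params": params, "lr": lr * scale} for scale, params in groups.items()]

# Usage sketch (hypothetical values):
#   optimizer = torch.optim.Adam(get_named_parameter_groups_with_lrs(model, lr=3e-4))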