diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
index 06dabe420..ca93068a4 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py
@@ -218,9 +218,6 @@ class Zipformer2(EncoderInterface):
                     downsample=downsampling_factor[i],
                     dropout=dropout,
                 )
-            # we are adding a new attribute here.
-            # this will be interpreted by get_named_parameter_groups_with_lrs().
-            encoder.lr_scale = downsampling_factor[i] ** -0.33
 
             encoders.append(encoder)
 
@@ -713,6 +710,8 @@ class Zipformer2EncoderLayer(nn.Module):
             key_padding_mask=src_key_padding_mask,
         )
 
+        src = src + self.feed_forward1(src)
+
         self_attn_dropout_mask = self.get_sequence_dropout_mask(src, attention_skip_rate)
 
         if True:
@@ -733,8 +732,6 @@ class Zipformer2EncoderLayer(nn.Module):
 
         src = src + (na if self_attn_dropout_mask is None else na * self_attn_dropout_mask)
 
-        src = src + self.feed_forward1(src)
-
         self_attn = self.self_attn1(
             src, attn_weights)
 
@@ -1200,7 +1197,6 @@ class RelPositionMultiheadAttentionWeights(nn.Module):
             (4000.0, 0.0))
     ) -> None:
         super().__init__()
-        self.lr_scale = 0.9
         self.embed_dim = embed_dim
         self.num_heads = num_heads
         self.query_head_dim = query_head_dim
@@ -1518,8 +1514,6 @@ class NonlinAttention(nn.Module):
     ) -> None:
         super().__init__()
 
-        self.lr_scale = 0.95
-
         self.hidden_channels = hidden_channels
 
         self.in_proj = nn.Linear(channels, hidden_channels * 3, bias=True)
@@ -1633,7 +1627,6 @@ class ConvolutionModule(nn.Module):
         )
         # the gradients on in_proj are a little noisy, likely to do with the
         # sigmoid in glu.
-        self.in_proj.lr_scale = 0.9
 
         # after in_proj we put x through a gated linear unit (nn.functional.glu).
         # For most layers the normal rms value of channels of x seems to be in the range 1 to 4,
@@ -1862,7 +1855,7 @@ class Conv2dSubsampling(nn.Module):
         out_channels: int,
         layer1_channels: int = 8,
         layer2_channels: int = 32,
-        layer3_channels: int = 64,
+        layer3_channels: int = 128,
         dropout: FloatLike = 0.1,
     ) -> None:
         """