diff --git a/egs/libriheavy/LM/zipformer1/subformer.py b/egs/libriheavy/LM/zipformer1/subformer.py
index aec9be805..50c5df7c6 100644
--- a/egs/libriheavy/LM/zipformer1/subformer.py
+++ b/egs/libriheavy/LM/zipformer1/subformer.py
@@ -892,7 +892,6 @@ class LearnedDownsamplingModule(nn.Module):
                             max_positive=0.6,
                             min_abs=1.0,
                             max_abs=4.0,
-                            grad_scale=0.1,
                             prob=ScheduledFloat((0.0, 1.0), (8000.0, 0.25), default=0.0))

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
index 10069a0ac..44ca6e0a8 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
@@ -897,8 +897,9 @@ class AbsValuePenalizer(nn.Module):
     def forward(self, x: Tensor) -> Tensor:
         if (torch.jit.is_scripting() or not x.requires_grad or
-            (x.is_cuda and self.mem_cutoff(torch.cuda.memory_allocated())) or not self.training or random.random() > self.prob):
+            not self.training or random.random() > self.prob):
+            # or (x.is_cuda and self.mem_cutoff(torch.cuda.memory_allocated()))
             return _no_op(x)  # the _no_op op is to make our diagnostics code work.
         x = penalize_abs_values_gt(x,
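Note: below is a minimal runnable sketch (not icefall's actual code) of the early-exit logic that the scaling.py hunk leaves in place once the CUDA-memory cutoff is commented out. `AbsValuePenalizerSketch` is a hypothetical name, and `_no_op` / `penalize_abs_values_gt` are simplified stand-ins for the real helpers defined elsewhere in scaling.py.

# Sketch of AbsValuePenalizer's gating logic after this change; illustrative only.
import random

import torch
from torch import Tensor, nn


def _no_op(x: Tensor) -> Tensor:
    # Identity; the real version exists so the diagnostics code still sees an op.
    return x


def penalize_abs_values_gt(x: Tensor, limit: float, penalty: float) -> Tensor:
    # Stand-in: the real helper attaches a gradient-based penalty to elements
    # with |value| > limit; here it just returns x unchanged.
    return x


class AbsValuePenalizerSketch(nn.Module):  # hypothetical name
    def __init__(self, limit: float = 1000.0, penalty: float = 1.0e-04,
                 prob: float = 0.25):
        super().__init__()
        self.limit = limit
        self.penalty = penalty
        self.prob = prob

    def forward(self, x: Tensor) -> Tensor:
        # After the change, only scripting, requires_grad, training mode and the
        # random `prob` check gate the penalty; the CUDA-memory cutoff
        # (x.is_cuda and self.mem_cutoff(...)) no longer participates.
        if (torch.jit.is_scripting() or not x.requires_grad or
                not self.training or random.random() > self.prob):
            return _no_op(x)
        return penalize_abs_values_gt(x, limit=self.limit, penalty=self.penalty)

Both hunks appear to be the same kind of simplification: subformer.py drops the grad_scale=0.1 argument from what looks like a constrained-activation (Balancer-style) call whose callee sits outside the hunk, and scaling.py stops consulting torch.cuda.memory_allocated() when deciding whether to skip the penalty.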