diff --git a/egs/libriheavy/LM/zipformer1/subformer.py b/egs/libriheavy/LM/zipformer1/subformer.py
index aec9be805..50c5df7c6 100644
--- a/egs/libriheavy/LM/zipformer1/subformer.py
+++ b/egs/libriheavy/LM/zipformer1/subformer.py
@@ -892,7 +892,6 @@ class LearnedDownsamplingModule(nn.Module):
                             max_positive=0.6,
                             min_abs=1.0,
                             max_abs=4.0,
-                            grad_scale=0.1,
                             prob=ScheduledFloat((0.0, 1.0), (8000.0, 0.25), default=0.0))

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
index 10069a0ac..44ca6e0a8 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
@@ -897,8 +897,9 @@ class AbsValuePenalizer(nn.Module):
     def forward(self, x: Tensor) -> Tensor:
         if (torch.jit.is_scripting() or not x.requires_grad or
-            (x.is_cuda and self.mem_cutoff(torch.cuda.memory_allocated())) or not self.training or random.random() > self.prob):
+            not self.training or random.random() > self.prob):
+            # or (x.is_cuda and self.mem_cutoff(torch.cuda.memory_allocated()))
             return _no_op(x)  # the _no_op op is to make our diagnostics code work.
         x = penalize_abs_values_gt(x,
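Note: below is a minimal runnable sketch (not icefall's actual code) of the early-exit logic that the scaling.py hunk leaves in place once the CUDA-memory cutoff is commented out. `AbsValuePenalizerSketch` is a hypothetical name, and `_no_op` / `penalize_abs_values_gt` are simplified stand-ins for the real helpers defined elsewhere in scaling.py.

# Sketch of AbsValuePenalizer's gating logic after this change; illustrative only.
import random

import torch
from torch import Tensor, nn


def _no_op(x: Tensor) -> Tensor:
    # Identity; the real version exists so the diagnostics code still sees an op.
    return x


def penalize_abs_values_gt(x: Tensor, limit: float, penalty: float) -> Tensor:
    # Stand-in: the real helper attaches a gradient-based penalty to elements
    # with |value| > limit; here it just returns x unchanged.
    return x


class AbsValuePenalizerSketch(nn.Module):  # hypothetical name
    def __init__(self, limit: float = 1000.0, penalty: float = 1.0e-04,
                 prob: float = 0.25):
        super().__init__()
        self.limit = limit
        self.penalty = penalty
        self.prob = prob

    def forward(self, x: Tensor) -> Tensor:
        # After the change, only scripting, requires_grad, training mode and the
        # random `prob` check gate the penalty; the CUDA-memory cutoff
        # (x.is_cuda and self.mem_cutoff(...)) no longer participates.
        if (torch.jit.is_scripting() or not x.requires_grad or
                not self.training or random.random() > self.prob):
            return _no_op(x)
        return penalize_abs_values_gt(x, limit=self.limit, penalty=self.penalty)

Both hunks appear to be the same kind of simplification: subformer.py drops the grad_scale=0.1 argument from what looks like a constrained-activation (Balancer-style) call whose callee sits outside the hunk, and scaling.py stops consulting torch.cuda.memory_allocated() when deciding whether to skip the penalty.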