diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py index dcf814129..830fe497b 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py @@ -1390,6 +1390,7 @@ class LimitParamValue(torch.autograd.Function): @staticmethod def forward(ctx, x: Tensor, min: float, max: float): ctx.save_for_backward(x) + assert max >= min ctx.min = min ctx.max = max return x diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py index 7aa5bf7c9..799b90ff2 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/zipformer.py @@ -459,7 +459,7 @@ class ZipformerEncoderLayer(nn.Module): self.balancer1 = Balancer( embed_dim, channel_dim=-1, min_positive=0.45, max_positive=0.55, - min_abs=1.0, max_abs=4.0, + min_abs=0.2, max_abs=4.0, ) # balancer for output of NonlinAttentionModule @@ -1878,7 +1878,10 @@ class Conv2dSubsampling(nn.Module): prob=(0.025, 0.25), grad_scale=0.02) - self.out_norm = BasicNorm(out_channels) + # max_log_eps=0.0 is to prevent both eps and the output of self.out from + # getting large; without this cap there would be an unnecessary degree of freedom. + self.out_norm = BasicNorm(out_channels, eps=1.0, + min_log_eps=-0.1, max_log_eps=0.0) self.dropout = Dropout2(dropout)