diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
index eb556b4e9..db341a1c9 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/scaling.py
@@ -396,6 +396,8 @@ class LinearWithAuxLossFunction(torch.autograd.Function):
         In the backward pass it will include an auxiliary loss based on predicting
         x from matmul(y, weight).
         """
+        if torch.is_autocast_enabled():
+            x = x.to(torch.float16)
         ctx.save_for_backward(x, weight, alpha)
         ctx.aux_grad_scale = aux_grad_scale
         return torch.matmul(x, weight.t())
@@ -491,10 +493,14 @@ class LinearWithAuxLoss(nn.Module):
         aux_grad_scale = float(self.aux_grad_scale)
         if (not self.training or torch.jit.is_scripting() or aux_grad_scale == 0.0
                 or random.random() > float(self.prob)):
-            return torch.matmul(x, self.weight.t()) + self.bias
+            return torch.nn.functional.linear(x, self.weight, self.bias)
         else:
-            return LinearWithAuxLossFunction.apply(x, self.weight, self.alpha,
-                                                   aux_grad_scale) + self.bias
+            ans = LinearWithAuxLossFunction.apply(x, self.weight, self.alpha,
+                                                  aux_grad_scale)
+            if self.bias is not None:
+                ans += self.bias
+            return ans
+


 def ScaledLinear(*args, initial_scale: float = 1.0,
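
For context, a minimal standalone sketch (not code from this patch; assumes PyTorch >= 1.10) of the two behaviors the change relies on: torch.nn.functional.linear accepts bias=None, whereas `matmul(x, w.t()) + self.bias` raises a TypeError when self.bias is None, and an input cast to float16 under autocast halves the memory that ctx.save_for_backward() holds until the backward pass.

    import torch
    import torch.nn.functional as F

    x = torch.randn(4, 8)
    w = torch.randn(16, 8)

    # F.linear handles an absent bias cleanly; the old matmul-plus-bias
    # form would fail here with "unsupported operand type ... NoneType".
    y = F.linear(x, w, None)  # OK, shape (4, 16)

    # Under autocast, matmul runs in float16 anyway, so saving the input
    # as float16 loses nothing while halving the saved-activation memory.
    if torch.cuda.is_available():
        with torch.autocast("cuda"):
            xc = x.cuda()
            if torch.is_autocast_enabled():
                xc = xc.to(torch.float16)
            print(xc.dtype)  # torch.float16 inside the autocast region

The bias is applied outside LinearWithAuxLossFunction.apply() so that the custom backward (and its auxiliary loss) only ever sees the bias-free matmul, matching the forward computation it saves tensors for.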