Fix issues where grad is None, and unused-grad cases

Daniel Povey 2022-09-22 19:18:16 +08:00
parent c16f795962
commit 24aea947d2
2 changed files with 11 additions and 2 deletions

@@ -563,9 +563,14 @@ class RelPositionMultiheadAttention(nn.Module):
             need_weights=need_weights,
             attn_mask=attn_mask,
         )
-        attn_scores_out = torch.matmul(scores, self.attn_scores_proj_out)
+        if attn_scores_in is not None:
+            attn_scores_out = torch.matmul(scores, self.attn_scores_proj_out)
+            attn_scores_out = attn_scores_out + attn_scores_in
+        else:
+            # Here, add self.attn_scores_proj_in in order to make sure it has
+            # a grad.
+            attn_scores_out = torch.matmul(scores, self.attn_scores_proj_out +
+                                           self.attn_scores_proj_in)
         return x, weights, attn_scores_out
 
     def rel_shift(self, x: Tensor) -> Tensor:
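
The else-branch comment explains the point of the change: when attn_scores_in is None, self.attn_scores_proj_in would otherwise never enter the autograd graph, so its .grad would stay None (which, for example, can trip DDP's check that every parameter received a gradient). Folding it into the matmul keeps it in the graph. A minimal standalone sketch of that effect, with names and shapes made up for illustration rather than taken from the repo:

import torch
import torch.nn as nn

# proj_out / proj_in play the role of attn_scores_proj_out / attn_scores_proj_in.
proj_out = nn.Parameter(torch.randn(4, 4))
proj_in = nn.Parameter(torch.randn(4, 4))
scores = torch.randn(2, 4)

# Old behaviour: proj_in never enters the computation, so it gets no grad.
torch.matmul(scores, proj_out).sum().backward()
print(proj_in.grad)  # None

# New else-branch behaviour: proj_in is added into the weight, so it gets a grad.
torch.matmul(scores, proj_out + proj_in).sum().backward()
print(proj_in.grad is None)  # False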

@@ -127,7 +127,7 @@ class ScaledAdam(Optimizer):
                 # Perform optimization step
                 grad = p.grad
-                if grad.is_sparse:
+                if grad is not None and grad.is_sparse:
                     raise RuntimeError(
                         "ScaledAdam optimizer does not support sparse gradients"
                     )
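
For context, a parameter that received no gradient (for instance because it never took part in the forward pass, or because backward() has not run yet) has p.grad equal to None, so calling .is_sparse on it directly raises an AttributeError. The reordered check short-circuits before touching the attribute. A tiny standalone illustration, not taken from the repo:

import torch

p = torch.nn.Parameter(torch.zeros(2))  # no backward has run, so p.grad is None
grad = p.grad

# Old form:  `if grad.is_sparse:`  would raise
#   AttributeError: 'NoneType' object has no attribute 'is_sparse'
# New form: the `is not None` test short-circuits, so .is_sparse is never
# evaluated on None.
if grad is not None and grad.is_sparse:
    raise RuntimeError("sparse gradients are not supported")
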
@@ -138,6 +138,8 @@ class ScaledAdam(Optimizer):
                 if i == 0:
                     clipping_scale = self._get_clipping_scale(group, p, state)
+                if grad is None:
+                    continue
                 self._step_one_batch(group, p, state, clipping_scale)
@@ -211,6 +213,8 @@ class ScaledAdam(Optimizer):
             for p in group["params"]:
                 state = self.state[p]
                 grad = p.grad
+                if grad is None:
+                    continue
                 if grad.is_sparse:
                     raise RuntimeError(
                         "ScaledAdam optimizer does not support sparse gradients"