mirror of https://github.com/k2-fsa/icefall.git (synced 2025-12-11 06:55:27 +00:00)
Fix issues where grad is None, and unused-grad cases

This commit is contained in:
parent c16f795962
commit 24aea947d2
@@ -563,9 +563,14 @@ class RelPositionMultiheadAttention(nn.Module):
             need_weights=need_weights,
             attn_mask=attn_mask,
         )
-        attn_scores_out = torch.matmul(scores, self.attn_scores_proj_out)
         if attn_scores_in is not None:
+            attn_scores_out = torch.matmul(scores, self.attn_scores_proj_out)
             attn_scores_out = attn_scores_out + attn_scores_in
+        else:
+            # Here, add self.attn_scores_proj_in in order to make sure it has
+            # a grad.
+            attn_scores_out = torch.matmul(scores, self.attn_scores_proj_out +
+                                           self.attn_scores_proj_in)
         return x, weights, attn_scores_out
 
     def rel_shift(self, x: Tensor) -> Tensor:
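Context for the attention hunk above: a parameter that never enters the forward computation ends up with p.grad == None after backward(), so the new else branch folds attn_scores_proj_in into the matmul purely to keep a gradient flowing to it. Below is a minimal, self-contained sketch of that pattern; TinyAttnScores, proj_in and proj_out are hypothetical names, not the icefall module.

import torch
import torch.nn as nn


# Hypothetical mini-module (names are not from icefall) showing the trick in
# the hunk above: fold an otherwise-unused parameter into the computation so
# that autograd still produces a gradient for it.
class TinyAttnScores(nn.Module):
    def __init__(self, dim: int = 4):
        super().__init__()
        self.proj_out = nn.Parameter(torch.randn(dim, dim))
        # Only needed when attn_scores_in is given; without the trick below it
        # would be unused in the other branch and get no gradient.
        self.proj_in = nn.Parameter(torch.randn(dim, dim))

    def forward(self, scores, attn_scores_in=None):
        if attn_scores_in is not None:
            return torch.matmul(scores, self.proj_out) + attn_scores_in
        # Add proj_in into the matmul, as the commit does, so proj_in.grad is
        # not None after backward().
        return torch.matmul(scores, self.proj_out + self.proj_in)


m = TinyAttnScores()
m(torch.randn(2, 4)).sum().backward()
assert m.proj_in.grad is not None  # would fail without "+ self.proj_in"

The cost is that proj_in now also affects the output in the no-input branch; the commit accepts that so this parameter reliably ends up with a gradient, while the ScaledAdam hunks below handle the parameters where p.grad is still None.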
@@ -127,7 +127,7 @@ class ScaledAdam(Optimizer):
 
                 # Perform optimization step
                 grad = p.grad
-                if grad.is_sparse:
+                if grad is not None and grad.is_sparse:
                     raise RuntimeError(
                         "ScaledAdam optimizer does not support sparse gradients"
                     )
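For context on why p.grad can be None here (a sketch, not icefall code): any parameter that takes no part in the forward pass of a batch receives no gradient from backward(), so its .grad stays None.

import torch
import torch.nn as nn

# Hypothetical two-branch model; only the "used" branch enters the loss.
net = nn.ModuleDict({"used": nn.Linear(4, 4), "unused": nn.Linear(4, 4)})
net["used"](torch.randn(2, 4)).sum().backward()

print(net["used"].weight.grad is None)    # False: got a gradient
print(net["unused"].weight.grad is None)  # True: never entered the graph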
@@ -138,6 +138,8 @@ class ScaledAdam(Optimizer):
                 if i == 0:
                     clipping_scale = self._get_clipping_scale(group, p, state)
 
+                if grad is None:
+                    continue
                 self._step_one_batch(group, p, state, clipping_scale)
 
 
@@ -211,6 +213,8 @@ class ScaledAdam(Optimizer):
         for p in group["params"]:
             state = self.state[p]
             grad = p.grad
+            if grad is None:
+                continue
             if grad.is_sparse:
                 raise RuntimeError(
                     "ScaledAdam optimizer does not support sparse gradients"
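The ScaledAdam hunks all add the same guard: a parameter whose grad is None must be skipped before anything dereferences the gradient (grad.is_sparse would raise AttributeError on None). Below is a minimal sketch of that skip pattern, using a plain SGD-style update in place of the real ScaledAdam step.

import torch

# Two parameters; only the first takes part in the loss, so the second ends
# up with grad == None after backward() (the situation the hunks guard for).
params = [torch.randn(3, requires_grad=True) for _ in range(2)]
params[0].sum().backward()

lr = 0.1
for p in params:
    grad = p.grad
    if grad is None:        # the check these hunks add
        continue
    if grad.is_sparse:      # only safe to evaluate after the None check
        raise RuntimeError("sparse gradients are not supported")
    with torch.no_grad():
        p -= lr * grad      # placeholder SGD update; ScaledAdam's real step differs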