Configuration changes: scores limit 5 -> 10, min_prob 0.05 -> 0.1, more aggressive cur_grad_scale increase

commit 84580ec022
parent 9672dffac2
Author: Daniel Povey
Date: 2022-10-22 14:09:53 +08:00

3 changed files with 3 additions and 3 deletions

@@ -649,7 +649,7 @@ class AttentionDownsample(torch.nn.Module):
         scores = (src * self.query).sum(dim=-1, keepdim=True)
         scores = penalize_abs_values_gt(scores,
-                                        limit=5.0,
+                                        limit=10.0,
                                         penalty=1.0e-04)
         weights = scores.softmax(dim=1)
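
For context on this hunk: penalize_abs_values_gt is an identity in the forward pass but, in the backward pass, adds a small extra gradient to elements whose absolute value exceeds `limit`, nudging them back toward the allowed range; raising the limit from 5.0 to 10.0 therefore loosens the soft clamp on the attention scores. A minimal sketch of that mechanism (illustrative only, not the actual icefall implementation):

import torch

class _AbsValuePenalty(torch.autograd.Function):
    # Identity in forward; backward adds an extra gradient of magnitude
    # `penalty` on elements with |x| > limit, so that gradient descent
    # shrinks the offending values.
    @staticmethod
    def forward(ctx, x, limit, penalty):
        ctx.save_for_backward(x)
        ctx.limit, ctx.penalty = limit, penalty
        return x

    @staticmethod
    def backward(ctx, grad_output):
        (x,) = ctx.saved_tensors
        over = (x.abs() > ctx.limit).to(x.dtype)
        return grad_output + ctx.penalty * x.sign() * over, None, None

def penalize_abs_values_gt(x, limit, penalty):
    return _AbsValuePenalty.apply(x, limit, penalty)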

@@ -482,7 +482,7 @@ class ActivationBalancer(torch.nn.Module):
         scale_gain_factor: float = 0.02,
         min_abs: float = 0.2,
         max_abs: float = 100.0,
-        min_prob: float = 0.05,
+        min_prob: float = 0.1,
     ):
         super(ActivationBalancer, self).__init__()
         self.num_channels = num_channels
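
For context on this hunk: ActivationBalancer applies its activation-statistics correction only on a random fraction of batches to keep overhead low, and min_prob is the floor on that probability; raising it from 0.05 to 0.1 keeps the correction firing at least 10% of the time late in training. A rough sketch of such a probability schedule (the decay constants and helper names are illustrative, not the icefall code):

import random

def balancer_apply_prob(batch_count: int, min_prob: float = 0.1) -> float:
    # apply often early in training, then decay toward the min_prob floor
    return max(min_prob, 0.5 ** (1.0 + batch_count / 4000.0))

def should_apply_balancer(batch_count: int) -> bool:
    return random.random() < balancer_apply_prob(batch_count)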

@@ -850,7 +850,7 @@ def train_one_epoch(
         # of the grad scaler is configurable, but we can't configure it to have different
         # behavior depending on the current grad scale.
         cur_grad_scale = scaler._scale.item()
-        if cur_grad_scale < 1.0:
+        if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
             scaler.update(cur_grad_scale * 2.0)
         if cur_grad_scale < 0.01:
             logging.warn(f"Grad scale is small: {cur_grad_scale}")
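
For context on this hunk: torch.cuda.amp.GradScaler only grows its scale after growth_interval consecutive overflow-free steps, so a scale that has collapsed recovers slowly; besides doubling any scale below 1.0 every step, the new condition also doubles a scale below 8.0 once every 400 batches. A simplified sketch of where this check sits in the AMP training step (the loop scaffolding is illustrative; scaler._scale is a private attribute, used here exactly as in the diff):

import logging
import torch

def train_one_epoch_sketch(model, optimizer, train_dl):
    scaler = torch.cuda.amp.GradScaler(enabled=True)
    for batch_idx, batch in enumerate(train_dl):
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            loss = model(batch)  # stand-in for the real loss computation
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        cur_grad_scale = scaler._scale.item()
        if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
            # grow the scale faster than the built-in growth_interval allows
            scaler.update(cur_grad_scale * 2.0)
        if cur_grad_scale < 0.01:
            logging.warning(f"Grad scale is small: {cur_grad_scale}")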