mirror of https://github.com/k2-fsa/icefall.git (synced 2025-12-11 06:55:27 +00:00)
Configuration changes: scores limit 5->10, min_prob 0.05->0.1, cur_grad_scale more aggressive increase
commit 84580ec022 (parent 9672dffac2)
@@ -649,7 +649,7 @@ class AttentionDownsample(torch.nn.Module):
         scores = (src * self.query).sum(dim=-1, keepdim=True)
 
         scores = penalize_abs_values_gt(scores,
-                                        limit=5.0,
+                                        limit=10.0,
                                         penalty=1.0e-04)
 
         weights = scores.softmax(dim=1)
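
For reference, `penalize_abs_values_gt` returns `scores` unchanged in the forward pass and only adds a gradient penalty to elements whose absolute value exceeds `limit`, so raising the limit from 5.0 to 10.0 relaxes the constraint on the attention-downsampling scores before the softmax. Below is a minimal illustrative sketch of that behavior with the same call signature; it is not icefall's actual implementation:

import torch
from torch import Tensor


class _PenalizeAbsValuesGT(torch.autograd.Function):
    # Identity in the forward pass; in the backward pass, elements with
    # |x| > limit receive an extra gradient of size `penalty` that pushes
    # them back toward the limit (i.e. the gradient of penalty * relu(|x| - limit)).

    @staticmethod
    def forward(ctx, x: Tensor, limit: float, penalty: float) -> Tensor:
        ctx.save_for_backward(x)
        ctx.limit = limit
        ctx.penalty = penalty
        return x

    @staticmethod
    def backward(ctx, grad_output: Tensor):
        (x,) = ctx.saved_tensors
        over_limit = (x.abs() > ctx.limit).to(grad_output.dtype)
        extra_grad = ctx.penalty * over_limit * x.sign()
        return grad_output + extra_grad, None, None


def penalize_abs_values_gt_sketch(x: Tensor, limit: float, penalty: float) -> Tensor:
    return _PenalizeAbsValuesGT.apply(x, limit, penalty)

With limit=10.0 and penalty=1.0e-04, this extra gradient only kicks in for scores whose magnitude exceeds 10.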
@@ -482,7 +482,7 @@ class ActivationBalancer(torch.nn.Module):
         scale_gain_factor: float = 0.02,
         min_abs: float = 0.2,
         max_abs: float = 100.0,
-        min_prob: float = 0.05,
+        min_prob: float = 0.1,
     ):
         super(ActivationBalancer, self).__init__()
         self.num_channels = num_channels
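
For context, the ActivationBalancer applies its gradient correction only with a probability that is decayed as training proceeds, and `min_prob` is the floor of that probability; raising it from 0.05 to 0.1 keeps the balancing active at least 10% of the time even late in training. A minimal sketch of such a floored schedule (the decay constant here is an assumption for illustration, not taken from icefall):

import random


def apply_balancer_this_batch(batch_count: int, min_prob: float = 0.1) -> bool:
    # Hypothetical decay: the probability of applying the correction halves
    # every 4000 batches (4000 is an illustrative assumption), but is clamped
    # from below by min_prob.
    prob = max(min_prob, 0.5 ** (1.0 + batch_count / 4000.0))
    return random.random() < prob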
@@ -850,7 +850,7 @@ def train_one_epoch(
                 # of the grad scaler is configurable, but we can't configure it to have different
                 # behavior depending on the current grad scale.
                 cur_grad_scale = scaler._scale.item()
-                if cur_grad_scale < 1.0:
+                if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
                     scaler.update(cur_grad_scale * 2.0)
                 if cur_grad_scale < 0.01:
                     logging.warn("Grad scale is small: {cur_grad_scale}")
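
The change above makes grad-scale recovery more aggressive: previously the AMP scale was only doubled once it had dropped below 1.0; now it is also doubled every 400 batches while it remains below 8.0, so training recovers faster from a run of overflow-induced halvings. A standalone sketch of the updated policy (assumes `scaler` is a torch.cuda.amp.GradScaler; reading `scaler._scale` touches a private attribute, just as in the diff):

import logging

import torch


def maybe_grow_grad_scale(scaler: torch.cuda.amp.GradScaler, batch_idx: int) -> None:
    # Same logic as the diff hunk above, pulled out into a helper.
    cur_grad_scale = scaler._scale.item()
    if cur_grad_scale < 1.0 or (cur_grad_scale < 8.0 and batch_idx % 400 == 0):
        # GradScaler.update() accepts an explicit new scale value.
        scaler.update(cur_grad_scale * 2.0)
    if cur_grad_scale < 0.01:
        # Note the f-string: the context line in the diff omits the "f" prefix
        # and would log the literal "{cur_grad_scale}" placeholder.
        logging.warning(f"Grad scale is small: {cur_grad_scale}")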