Change to exp_5, 1/sqrt(t) component.

This commit is contained in:
Daniel Povey 2021-09-09 14:08:19 +08:00
parent 56a88badd1
commit d0e5b9b8a5
2 changed files with 9 additions and 30 deletions

View File

@ -1028,39 +1028,16 @@ class Gloam(object):
We define 't = s / warm_step', i.e. t is the step s, normalized so that it We define 't = s / warm_step', i.e. t is the step s, normalized so that it
is 1.0 at warm_step. Our formula for the learning rate as a function of is 1.0 at warm_step. Our formula for the learning rate as a function of
t is: t is:
rate = max_lrate * (t <= 1.0 ? t : base_rate = max_lrate * (t <= 1.0 ? t : t ** -0.5)
sqrt((2 + alpha) / (1 + t + alpha t^2))) epoch_rate = [starts at 1.0 but from first_decrease_epoch, start decreasing it
where alpha is chosen so that the 't' and 'alpha t^2' terms are identical by a factor of decay_per_epoch]
at t == knee_factor (this means alpha = 1.0/knee_factor). So the rate = base_rate * epoch_rate
learning rate increases linearly from t=00 to t=1, and decreases
after that. You can see
that sqrt((2 + alpha) / (1 + t + alpha t^2))) is 1.0 when t == 1,
which is why the line and the curve meet at that point.
On the denominator of that ratio, the "t" term makes it decrease a
bit like 1/sqrt(t) in 1 <= t <= warm_step; the "alpha t^2" term
makes it decrease a bit like 1/t for t > warm_step; and the "1"
term makes it decrease a bit slower than 1/sqrt(t) when t is quite
close to 1.0 (so we linger a little, near the maximum learning rate).
This learning rate schedule ultimately decreases more aggressively
than Noam, i.e. as 1 / t instead of 1 / sqrt(t). The reason we
feel this will work better in conjunction with Madam, is that Madam
keeps the norms of the parameters approximately constant throughout
training; whereas with Noam, if there is no weight decay, these
norms tend to increase as training progresses (although rather
unevenly across different parameter tensors).
As the norms of the parameters increase, the relative changes
in parameters get smaller (the step sizes don't change because
Adam normalizes the gradient magnitudes; they'd get smaller otherwise).
So Noam doesn't have to decrease the learning rate too aggressively
because even with a fixed learning rate, the effective learning rate
would be decreasing (again, this only applies without weight decay).
""" """
if step is None: if step is None:
step = self._step step = self._step
t = step / self._warm_step # floating point division.. t is the normalized step. t = step / self._warm_step # floating point division.. t is the normalized step.
base_rate = self._max_lrate * (t if t <= 1.0 else 1.0) base_rate = self._max_lrate * (t if t <= 1.0 else t ** -0.5)
epoch_rate = self._decay_per_epoch ** max(0, self._epoch + 1 - self._first_decrease_epoch) epoch_rate = self._decay_per_epoch ** max(0, self._epoch + 1 - self._first_decrease_epoch)
return base_rate * epoch_rate return base_rate * epoch_rate

View File

@ -134,7 +134,9 @@ def get_params() -> AttributeDict:
{ {
# exp_3, vs. exp_2, is using 5e-04 not 2d-04 as max learning rate. # exp_3, vs. exp_2, is using 5e-04 not 2d-04 as max learning rate.
# exp_4, vs. exp_3, is using the Gloam optimizer with # exp_4, vs. exp_3, is using the Gloam optimizer with
"exp_dir": Path("conformer_lm/exp_4"), # in exp_5, vs. exp_4, we change Gloam to have a 1/sqrt(t) factor
# as well as the exponential part.
"exp_dir": Path("conformer_lm/exp_5"),
"lm_dataset": Path("data/lm_training_5000/lm_data.pt"), "lm_dataset": Path("data/lm_training_5000/lm_data.pt"),
"num_tokens": 5000, "num_tokens": 5000,
"blank_sym": 0, "blank_sym": 0,
@ -526,7 +528,7 @@ def run(rank, world_size, args):
optimizer = Gloam( optimizer = Gloam(
model.parameters(), model.parameters(),
max_lrate=params.max_lrate, max_lrate=params.max_lrate,
first_decrease_epoch=2, first_decrease_epoch=1,
decay_per_epoch=0.85 decay_per_epoch=0.85
) )