mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-19 05:54:20 +00:00
Make LR update period less frequent later in training; fix bug with param_cov freshness, was too fresh
This commit is contained in:
parent
689441b237
commit
b6ee698278
@ -127,7 +127,11 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
|||||||
scalar_max: Maximum absolute value for scalar parameters
|
scalar_max: Maximum absolute value for scalar parameters
|
||||||
size_update_period: The periodicity, in steps, with which we update the size (scale)
|
size_update_period: The periodicity, in steps, with which we update the size (scale)
|
||||||
of the parameter tensor. This is provided to save a little time.
|
of the parameter tensor. This is provided to save a little time.
|
||||||
lr_update_period: The periodicity, in steps, with which we update the learning-rate matrices.
|
lr_update_period: Determines the periodicity, in steps, with which we update the
|
||||||
|
learning-rate matrices. The first number is the periodicity at
|
||||||
|
the start of training, the second number is the periodicity
|
||||||
|
later in training. One step of updating the learning rate matrices
|
||||||
|
can take as long as over 50 minibatches, because SVD on GPU is slow.
|
||||||
** This is important for the speed/optimizaton tradeoff. **
|
** This is important for the speed/optimizaton tradeoff. **
|
||||||
param_cov_period: The periodicity, in steps, with which we update the parameter covariance
|
param_cov_period: The periodicity, in steps, with which we update the parameter covariance
|
||||||
stats.
|
stats.
|
||||||
@ -148,7 +152,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
|||||||
param_max_rms=2.0,
|
param_max_rms=2.0,
|
||||||
scalar_max=2.0,
|
scalar_max=2.0,
|
||||||
size_update_period=4,
|
size_update_period=4,
|
||||||
lr_update_period=200,
|
lr_update_period=(200, 2000),
|
||||||
grad_cov_period=3,
|
grad_cov_period=3,
|
||||||
param_cov_period=100,
|
param_cov_period=100,
|
||||||
max_block_size=1024,
|
max_block_size=1024,
|
||||||
@ -350,7 +354,6 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
|||||||
"""
|
"""
|
||||||
lr = group["lr"]
|
lr = group["lr"]
|
||||||
size_update_period = group["size_update_period"]
|
size_update_period = group["size_update_period"]
|
||||||
lr_update_period = group["lr_update_period"]
|
|
||||||
grad_cov_period = group["grad_cov_period"]
|
grad_cov_period = group["grad_cov_period"]
|
||||||
param_cov_period = group["param_cov_period"]
|
param_cov_period = group["param_cov_period"]
|
||||||
eps = group["eps"]
|
eps = group["eps"]
|
||||||
@ -386,7 +389,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
|||||||
else:
|
else:
|
||||||
if step % param_cov_period == 0:
|
if step % param_cov_period == 0:
|
||||||
self._update_param_cov(group, p, state)
|
self._update_param_cov(group, p, state)
|
||||||
if step % lr_update_period == 0 and step > 0 and "zero_step" in state:
|
if self._is_lr_update_step(group, state):
|
||||||
self._update_lrs(group, p, state)
|
self._update_lrs(group, p, state)
|
||||||
self._zero_exp_avg_sq(state)
|
self._zero_exp_avg_sq(state)
|
||||||
if step % grad_cov_period == 0:
|
if step % grad_cov_period == 0:
|
||||||
@ -463,10 +466,11 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
|||||||
(except batch and trivial and rank-1 dims)
|
(except batch and trivial and rank-1 dims)
|
||||||
"""
|
"""
|
||||||
eps = group["eps"]
|
eps = group["eps"]
|
||||||
lr_update_period = group["lr_update_period"]
|
param_cov_period = group["param_cov_period"]
|
||||||
step = state["step"]
|
step = state["step"]
|
||||||
this_weight = (group["param_cov_freshness"] * lr_update_period /
|
|
||||||
(step + lr_update_period))
|
this_weight = (group["param_cov_freshness"] * param_cov_period /
|
||||||
|
(step + param_cov_period))
|
||||||
|
|
||||||
batch_size = p.shape[0]
|
batch_size = p.shape[0]
|
||||||
numel = p.numel() // batch_size
|
numel = p.numel() // batch_size
|
||||||
@ -507,6 +511,30 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
|
|||||||
state["exp_avg_sq"].zero_()
|
state["exp_avg_sq"].zero_()
|
||||||
state["zero_step"] = state["step"]
|
state["zero_step"] = state["step"]
|
||||||
|
|
||||||
|
def _is_lr_update_step(self,
|
||||||
|
group: dict,
|
||||||
|
state: dict) -> False:
|
||||||
|
"""
|
||||||
|
Returns True if on this step we need to update the learning-rate matrices for this tensor
|
||||||
|
and False if not. The periodicity with which we update them increases from
|
||||||
|
(by default) 200 at the start of training to 2000 later on.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
zero_step = state["zero_step"]
|
||||||
|
except:
|
||||||
|
# This parameter tensor has no learning-rate matrices to estimate
|
||||||
|
return False
|
||||||
|
step = state["step"]
|
||||||
|
period_initial, period_final = group["lr_update_period"]
|
||||||
|
# this formula gradually increases the periodicity from period_initial at the start
|
||||||
|
# to period_final when step >> 4 * period_final
|
||||||
|
cur_update_period = (period_initial +
|
||||||
|
((period_final - period_initial) * step /
|
||||||
|
(step + 4 * period_final)))
|
||||||
|
return (step >= zero_step + cur_update_period)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _update_lrs(self,
|
def _update_lrs(self,
|
||||||
group: dict,
|
group: dict,
|
||||||
p: Tensor,
|
p: Tensor,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user