diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index 1fc5ad5f9..3e4b0ced3 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -49,7 +49,7 @@ class NeutralGradient(Optimizer):
           param_eps:  An epsilon on the rms value of the parameter, such that when the parameter
                gets smaller than this we'll start using a fixed learning rate, not
                decreasing with the parameter norm.
-          cond_eps: An epsilon that limits the condition number of gradient and parameter
+          rel_eps: An epsilon that limits the condition number of gradient and parameter
                covariance matrices
           param_max: To prevent parameter tensors getting too large, we will clip
                elements to -param_max..param_max.
@@ -69,12 +69,12 @@ class NeutralGradient(Optimizer):
         scale_speed=0.1,
         eps=1e-8,
         param_eps=1.0e-05,
-        cond_eps=1.0e-10,
+        rel_eps=1.0e-10,
         param_max=10.0,
-        min_diag_smooth=1.0,
+        min_diag_smooth=0.2,
         max_size=1023,
         stats_period=1,
-        estimate_period=200,
+        estimate_period=50,
     ):

         if not 0.0 <= lr:
@@ -111,7 +111,7 @@ class NeutralGradient(Optimizer):
             eps=eps,
             scale_speed=scale_speed,
             param_eps=param_eps,
-            cond_eps=cond_eps,
+            rel_eps=rel_eps,
             min_diag_smooth=min_diag_smooth,
             param_max=param_max,
             max_size=max_size,
@@ -143,7 +143,7 @@ class NeutralGradient(Optimizer):
             scale_speed = group["scale_speed"]
             param_eps = group["param_eps"]
             eps = group["eps"]
-            cond_eps = group["cond_eps"]
+            rel_eps = group["rel_eps"]
             min_diag_smooth = group["min_diag_smooth"]
             param_max = group["param_max"]
             max_size = group["max_size"]
@@ -267,11 +267,11 @@ class NeutralGradient(Optimizer):
                         self._estimate(p, state, beta3, max_size,
                                        stats_period, estimate_period,
                                        eps, param_eps,
-                                       cond_eps, min_diag_smooth)
+                                       rel_eps, min_diag_smooth)

                     # TEMP!! Override the setting inside _estimate.
-                    #state["ref_exp_avg_sq"][:] = ((exp_avg_sq/bias_correction2 + eps*eps) *
-                    #                              state["ref_exp_avg_sq"]).sqrt()
+                    state["ref_exp_avg_sq"][:] = (exp_avg_sq/bias_correction2 + eps*eps)
+
                     ref_exp_avg_sq = state["ref_exp_avg_sq"]  # computed in self._estimate()
@@ -288,7 +288,7 @@ class NeutralGradient(Optimizer):
                         cur_grad = grad

                     cur_grad = cur_grad * grad_scale
-                    cur_grad = self._precondition_grad(cur_grad, state)
+                    cur_grad = self._multiply_grad(cur_grad, state)
                     cur_grad *= grad_scale

                     if random.random() < 0.004:
@@ -348,11 +348,16 @@ class NeutralGradient(Optimizer):
             grad_cov = state[f"grad_cov_{dim}"]
             if grad_cov.ndim == 1:
                 # We are treating this dimension diagonally because it is too big.
+                # note: other_dims is nonempty because we know ndim != 1 when we get
+                # here.  dim=[] to torch mean() does not work as you would expect.
                 other_dims = [ i for i in range(ndim) if i != dim]
-                grad_cov.mul_(beta3).add_((grad**2).mean(dim=other_dims))
+                grad_cov.mul_(beta3).add_((grad**2).mean(dim=other_dims),
+                                          alpha=(1.0-beta3))
             else:
                 # full-covariance stats.
                 this_beta3 = self._get_this_beta3(beta3, grad.numel(), size)
+                if this_beta3 != beta3 and random.random() < 0.01:
+                    print("this_beta3=", this_beta3)
                 grad_cov.mul_(this_beta3).add_(self._get_cov(grad, dim),
                                                alpha=(1.0-this_beta3))
@@ -365,7 +370,7 @@ class NeutralGradient(Optimizer):
                   estimate_period: int,
                   eps: float,
                   param_eps: float,
-                  cond_eps: float,
+                  rel_eps: float,
                   min_diag_smooth: float
                   ) -> Tensor:
         """
@@ -379,6 +384,7 @@ class NeutralGradient(Optimizer):
         step = state["step"]
         assert step % stats_period == 0
         norm_step = step // stats_period
+        param_rel_eps = 1.0e-04

         scale_change = True
@@ -411,13 +417,25 @@ class NeutralGradient(Optimizer):
                 other_dims = [ i for i in range(ndim) if i != dim]
                 bias_correction3 = 1 - beta3 ** (norm_step + 1)
                 # _smoothed means we have the eps terms.
-                param_var_smoothed = (p**2).mean(dim=other_dims) + param_eps*param_eps
+                param_var_smoothed = (p**2).mean(dim=other_dims)
+                param_var_smoothed.add_(param_eps*param_eps + param_rel_eps * param_var_smoothed.mean())
                 if bias_correction3 < 0.9999:
                     grad_cov = grad_cov / bias_correction3
-                grad_var_smoothed = grad_cov + eps*eps
+                grad_var_smoothed = grad_cov + (eps*eps + rel_eps * grad_cov.mean())
                 ref_exp_avg_sq = update_ref_exp_avg_sq(ref_exp_avg_sq,
                                                        grad_var_smoothed, dim,
                                                        scale_change)
+
+                if True:
+                    def check_close(a, b):
+                        assert ((a-b).abs().sum() < 0.01 * (a.abs()+b.abs()).sum())
+                    param_cov_smoothed = self._estimate_and_smooth_param_cov(p, dim,
+                                                                             param_eps,
+                                                                             rel_eps=param_rel_eps,
+                                                                             min_diag_smooth=min_diag_smooth)
+                    check_close(param_var_smoothed, param_cov_smoothed.diag())
+
+
                 if scale_change:
                     scale_change = False  # only use the scale change once
                 else:
@@ -427,15 +445,16 @@ class NeutralGradient(Optimizer):
                     # to count the overall change in scale once.
                     grad_var_smoothed *= (param_var_smoothed.sum() / grad_var_smoothed.sum())
                 proj[:] = (param_var_smoothed / grad_var_smoothed).sqrt()
+
             else:
                 param_cov_smoothed = self._estimate_and_smooth_param_cov(p, dim,
                                                                          param_eps,
-                                                                         cond_eps=1.0e-04,
+                                                                         rel_eps=param_rel_eps,
                                                                          min_diag_smooth=min_diag_smooth)
                 grad_cov_smoothed = self._smooth_grad_cov(p, grad_cov,
-                                                          eps, norm_step,
+                                                          eps, norm_step+100000, # TEMp
                                                           beta3,
-                                                          cond_eps=cond_eps,
+                                                          rel_eps=rel_eps,
                                                           min_diag_smooth=min_diag_smooth)
                 ref_exp_avg_sq = update_ref_exp_avg_sq(ref_exp_avg_sq,
                                                        grad_cov_smoothed.diag(),
@@ -473,9 +492,9 @@ class NeutralGradient(Optimizer):

         return ans

-    def _precondition_grad(self,
-                           grad: Tensor,
-                           state: dict) -> Tensor:
+    def _multiply_grad(self,
+                       grad: Tensor,
+                       state: dict) -> Tensor:
         """
         Multiply the grad by a positive-semidefinite matrix for each dimension, to
         try to make its covariance the same as that of the parameters (up to a
@@ -511,7 +530,7 @@ class NeutralGradient(Optimizer):

     def _estimate_and_smooth_param_cov(self, p: Tensor, dim: int,
                                        param_eps: float,
-                                       cond_eps: float = 1.0e-10,
+                                       rel_eps: float = 1.0e-10,
                                        min_diag_smooth: float = 0.2) -> Tensor:
         """
         Compute a smoothed version of a covariance matrix for one dimension of
@@ -522,7 +541,7 @@ class NeutralGradient(Optimizer):
           dim: The dimenion that we want the covariances for.
           param_eps: A small epsilon value that represents the minimum root-mean-square
               parameter value that we'll estimate.
-          cond_eps: An epsilon value that limits the condition number of the resulting
+          rel_eps: An epsilon value that limits the condition number of the resulting
               matrix.  Caution: this applies to the variance, not the rms value, so should
               be quite small.
@@ -540,7 +559,7 @@ class NeutralGradient(Optimizer):
         #diag_smooth = min_diag_smooth
         diag_smooth = 0.4
         diag = param_cov.diag()
-        extra_diag = (diag * diag_smooth) + (diag.max() * cond_eps +
+        extra_diag = (diag * diag_smooth) + (diag.mean() * rel_eps +
                                              param_eps * param_eps)
         param_cov.mul_(1-diag_smooth).add_(extra_diag.diag())
         return param_cov
@@ -551,7 +570,7 @@ class NeutralGradient(Optimizer):
                          eps: float,
                          norm_step: int,
                          beta3: float,
-                         cond_eps: float = 1.0e-10,
+                         rel_eps: float = 1.0e-10,
                          min_diag_smooth: float = 0.2) -> Tensor:
         """
         Compute a smoothed version of a covariance matrix for one dimension of
@@ -569,7 +588,7 @@ class NeutralGradient(Optimizer):
               is just step divided by stats_perio.
           beta3: The user-supplied beta value for decaying the gradient covariance
               stats
-          cond_eps: An epsilon value that limits the condition number of the resulting
+          rel_eps: An epsilon value that limits the condition number of the resulting
               matrix.  Caution: this applies to the variance, not the rms value, so should
               be quite small.
           min_diag_smooth: A minimum proportion by which we smooth the covariance
@@ -584,7 +603,7 @@ class NeutralGradient(Optimizer):
         if bias_correction3 < 0.9999:
             grad_cov = grad_cov / bias_correction3

-        rank_per_iter = p.numel() // size  # maximum rank of each iteration's covaraince
+        rank_per_iter = p.numel() // size  # maximum rank of each iteration's covariance
         # the second part of the following formula roughly represents the number of
         # frames that have a "large" weight in the stats.
         num_iters_in_stats = min(norm_step + 1, 1.0 / (1 - beta3))
@@ -598,8 +617,8 @@ class NeutralGradient(Optimizer):
             print(f"grad diag_smooth = {diag_smooth}, shape={p.shape}")

         diag = grad_cov.diag()
-        extra_diag = (diag * diag_smooth) + (diag.max() * cond_eps +
-                                             eps * eps)
+        extra_diag = (diag * diag_smooth).add_(diag.mean() * rel_eps +
+                                               eps * eps)
         grad_cov = (grad_cov * (1-diag_smooth)).add_(extra_diag.diag())
         return grad_cov
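
Note on the numerical change: the diagonal smoothing of the parameter and gradient covariances now adds a floor relative to diag.mean() (scaled by rel_eps / param_rel_eps) instead of diag.max() (scaled by the old cond_eps). The following is a minimal standalone sketch of that smoothing pattern; the function name smooth_cov and its arguments are illustrative only and are not part of optim.py.

    import torch

    def smooth_cov(cov: torch.Tensor,
                   abs_eps: float,
                   rel_eps: float,
                   diag_smooth: float) -> torch.Tensor:
        # Mix the covariance towards its own diagonal, adding a floor that is
        # partly absolute (abs_eps**2) and partly relative to the *mean* variance
        # (diag.mean() * rel_eps), as in the patched _estimate_and_smooth_param_cov
        # and _smooth_grad_cov (previously the relative term used diag.max()).
        diag = cov.diag()
        extra_diag = diag * diag_smooth + (diag.mean() * rel_eps + abs_eps * abs_eps)
        return cov * (1 - diag_smooth) + extra_diag.diag()

    # Example usage with values echoing the new defaults (param_eps=1.0e-05,
    # param_rel_eps=1.0e-04, min_diag_smooth=0.2); the weight shape is arbitrary.
    w = torch.randn(512, 256)
    param_cov = (w.t() @ w) / w.shape[0]          # (256, 256) covariance over dim 1
    smoothed = smooth_cov(param_cov, abs_eps=1.0e-05, rel_eps=1.0e-04, diag_smooth=0.2)

Using diag.mean() rather than diag.max() makes the relative floor insensitive to a single outlier variance, so one large diagonal entry no longer inflates the smoothing applied to every other dimension.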