From 503b79252fdbe063bea04cb6d791ee24d1fa4e93 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Fri, 27 May 2022 20:44:10 +0800
Subject: [PATCH] Add new update and max_eff_lr

---
 .../ASR/pruned_transducer_stateless4/optim.py | 127 +++++++++++++++---
 1 file changed, 110 insertions(+), 17 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless4/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless4/optim.py
index ad0295959..33e4bd5cb 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless4/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless4/optim.py
@@ -462,6 +462,11 @@ class Cain(Optimizer):
         params (iterable): iterable of parameters to optimize or dicts defining
             parameter groups
         lr (float, optional): learning rate (default: 1e-3)
+        max_eff_lr (float, optional): maximum effective learning rate for
+            natural-gradient update; this limits how aggressively we accelerate
+            directions with small gradients.  The idea is that you might want to
+            leave this constant as the regular learning rate decreasess; but
+            we can investigate alternatives here.
         betas (Tuple[float, float], optional): coefficients used for computing
             running averages of gradient and its square (default: (0.9, 0.999))
         eps (float, optional): term added to the denominator to improve
@@ -491,6 +496,7 @@ class Cain(Optimizer):
         self,
         params,
         lr=1e-3,
+        max_eff_lr=3e-3,
         betas=(0.9, 0.98),
         eps=1e-8,
         target_rms=0.1,
@@ -519,6 +525,7 @@ class Cain(Optimizer):
 
         defaults = dict(
             lr=lr,
+            max_eff_lr=max_eff_lr,
             betas=betas,
             eps=eps,
             target_rms=target_rms,
@@ -546,11 +553,14 @@ class Cain(Optimizer):
         for group in self.param_groups:
             beta1, beta2 = group["betas"]
             lr = group["lr"]
+            max_eff_lr = group["max_eff_lr"]
             target_rms = group["target_rms"]
             rms_eps = group["rms_eps"]
             eps = group["eps"]
             rms_max = group["rms_max"]
 
+            assert lr <= max_eff_lr
+
             for p in group["params"]:
                 if p.grad is None:
                     continue
@@ -568,7 +578,7 @@ class Cain(Optimizer):
                 if len(state) == 0:
                     state["step"] = 0
                     # Exponential moving average of gradient values
-                    state["exp_avg"] = torch.zeros_like(
+                    state["delta"] = torch.zeros_like(
                         p, memory_format=torch.preserve_format
                     )
                     # Exponential moving average of squared gradient values
@@ -583,12 +593,17 @@ class Cain(Optimizer):
                         # that also emulates Eve.
                         state["scale_exp_avg_sq"] = torch.zeros((), device=p.device,
                                                                 dtype=p.dtype)
+                        # alpha is a scale related to the natural-gradient update,
+                        # see self._get_denom()
+                        state["alpha"] = torch.ones((), device=p.device,
+                                                     dtype=p.dtype)
 
 
-                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+
+                delta, exp_avg_sq = state["delta"], state["exp_avg_sq"]
                 step = state["step"]
 
-                exp_avg.mul_(beta1)
+                delta.mul_(beta1)
 
                 if p.numel() > 1:
                     # This part learns the scale on the parameters.
@@ -624,10 +639,10 @@ class Cain(Optimizer):
                                                            device=scale_delta.device,
                                                            dtype=scale_delta.dtype),
                                               scale_delta)
-                    exp_avg.add_(p, alpha=scale_delta)
+                    delta.add_(p, alpha=scale_delta)
 
 
-                # forget bias_correction1.  We use normal momentum.  exp_avg really
+                # forget bias_correction1.  We use normal momentum.  delta really
                 # just stores the moving-average gradient step.
                 bias_correction2 = 1 - beta2 ** self._step_for_bias_correction(step)
 
@@ -635,18 +650,15 @@ class Cain(Optimizer):
 
                 # Decay the second moment running average coefficient
                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-                denom = (exp_avg_sq.sqrt()).add_(eps)
-
-
-                this_delta_ref = grad / denom
-                this_delta = grad / (exp_avg_sq + eps*eps)
-
-                renorm_scale = ((this_delta_ref**2).mean() /  ((this_delta**2).mean() + eps*eps)) ** 0.5
+                #denom = (exp_avg_sq.sqrt()).add_(eps)
+                denom = self._get_denom(state, step, exp_avg_sq, bias_correction2,
+                                        eps, lr/max_eff_lr)
 
+                this_delta = grad / denom
 
                 this_delta = self._change_coordinates(this_delta, state, forward=False)
 
-                alpha = -lr*(1-beta1)*(bias_correction2 ** 0.5) * renorm_scale
+                alpha = -lr*(1-beta1)*(bias_correction2 ** 0.5)
                 if p.numel() > 1:
                     if step % 10 == 0:
                         state["param_rms"].fill_((p**2).mean().sqrt())
@@ -654,15 +666,96 @@ class Cain(Optimizer):
                     # scale as a separate scalar.  This is how fast we'd be learning.
                     alpha = (alpha / target_rms) * state["param_rms"].clamp(min=rms_eps)
 
-                # treat/repurpose exp_avg as moving-average of `this_delta`
-                # don't use alpha=alpha as I don't think
-                exp_avg.add_(this_delta, alpha=alpha)
+                delta.add_(this_delta, alpha=alpha)
 
-                p.add_(exp_avg)
+                p.add_(delta)
                 state["step"] += 1
 
         return loss
 
+
+    def _get_denom(self,
+                   state: dict,
+                   step: int,
+                   exp_avg_sq: Tensor,
+                   bias_correction2: float,
+                   eps: float,
+                   beta: float):
+        """
+        This function returns a "denominator" value that we'll divide the gradients by
+        before multiplying by -lr and applying momentum.
+
+        Args:
+           state: state-dict for this parameter, needed for its "alpha" member
+           step: the current step index
+           exp_avg_sq:  Moving-average of gradient-squared for each element of the current
+                 parameter matrix (possibly after a co-ordinate change).
+           bias_correction2: A value that will be normally 1.0, but will be less than 1.0
+                near the start of training or just after we re-estimated the co-ordinate
+                changes; we can divide exp_avg_sq by this to get an expected gradient-squared.
+           eps: Epsilon value that prevents division by zero; dimensionally, this is
+                a gradient magnitude (not squared-gradient magnitude).
+           beta: A value <= 1.0, e.g. 0.1; it equals lr / max_eff_lr, and represents
+               the inverse of the "maximum acceleration" we allow the natural gradient to
+               give, versus regular ADAM.  Think of 1/beta as the maximum normalized
+               gradient root-mean-square value we allow.
+
+        We'll explain what this function does by comparing (i) basic ADAM-like updat,e
+        (ii) basic natural-gradient update, (iii) our update which combines properties
+        of both.
+
+        The basic ADAM-like version of this would be:
+             denom = (exp_avg_sq.sqrt()).add_(eps),
+        so after we divide the gradients by this, they will have unit expected squared value.
+
+        The natural-gradient version of this would be:
+             denom = (exp_avg_sq).add_(eps * eps)
+             base_denom = (exp_avg_sq.sqrt()).add_(eps),
+             denom = denom * (base_denom / denom).mean()
+        what this does is: compute 'denom' with no sqrt(), and then scale it in such
+        a way that ([root-mean-square gradient magnitude] / denom).mean() == 1.0, so
+        the overall learning speed is about the same as the baseline.
+
+        We will be returning:
+
+             denom = alpha * (exp_avg_sq)  + beta * (exp_avg_sq.sqrt() + eps)
+
+        and are aiming for alpha to satisfy the following condition:
+
+             ((exp_avg_sq.sqrt() + eps) / denom).mean() == 1.0           (eqn:1)
+
+        ... (eqn:1) means that the overall learning rate / "rate of progress"
+        is about the same as if denom was just (exp_avg_sq.sqrt() + eps).    Also,
+        the beta term in "denom" ensures that:
+                       (1/denom) <= 1/beta * (1/adam_denom)
+        which ensures that the speed of update for any given element will never
+        be greater than the normal ADAM-type update by more than 1/beta.
+
+        The way we update alpha is as follows.  We always have the previous iteration's
+        alpha, call this alpha_prev.  We compute:
+               denom_prev = (alpha_prev * exp_avg_sq)  + beta * (exp_avg_sq.sqrt() + eps)
+        and:
+               mean_prev = (exp_avg_sq.sqrt() + eps) / denom).mean()
+         .. now, mean_prev varies *at most* like 1/alpha, meaning its maximum sensitivity
+        to alpha is an inverse relationship.  So we we update alpha as:
+               alpha_cur = alpha_prev * mean_prev
+        (the idea is that this will eventually converge; and meanwhile, we'll just have
+        a slightly inaccurate alpha).  If `step` is large, we'll assume alpha has converged
+        and always return the denominator with the old alpha.
+        """
+
+        num_steps = 10 if step < 3 else 2 if step < 100 else 1
+
+        for _ in range(num_steps):
+            # our update for alpha is iterative since there isn't a closed-form solution.
+            alpha = state["alpha"]
+            adam_denom = exp_avg_sq.sqrt().add_(eps)
+            denom = alpha * exp_avg_sq + beta * adam_denom
+            mean_speed = (adam_denom / denom).mean()
+            alpha.fill_(alpha * mean_speed)
+        return denom
+
+
     def _change_coordinates(self,
                             grad: Tensor,
                             state: dict,