Remove the natural gradient stuff while keeping cosmetic changes.

2025-09-08 00:24:19 +00:00 · 2022-05-30 11:56:11 +08:00 · 2022-05-30 11:56:11 +08:00 · b01c09a693
commit b01c09a693
parent 8a96f29a11
2 changed files with 3 additions and 115 deletions
--- a/egs/librispeech/ASR/pruned_transducer_stateless4/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless4/optim.py
@ -61,20 +61,6 @@ class Cain(Optimizer):
           and a 512x512 one.  This does a better job of separating "important"
           from "unimportant" directions in activation space, and empirically
           improved convergence, final loss function and WERs.
-      (v)  Change the Adam-type update, where the rms update value in each parameter
-           is the same, to a (diagonal) natural-gradient-type update, where if a
-           parameter has larger then average gradients, we give it a smaller
-           update.  Because, in principle, this could lead to unbounded update
-           magnitudes, we limit these by introducing the "max_eff_lr" parameter,
-           which defines the maximum learning rate "if we had an Adam-type
-           update".  So what we really implement is a cross between Adam and
-           natural gradient, where larger "max_eff_lr" means more natural-gradient-like
-           (i.e. more aggressive in directions with small gradients).
-           We recommend to set max_eff_lr to the initial learning rate ("lr")
-           in the learning rate schedule so that the update starts out
-           equivalent to Adam.  Empirically this change to natural-gradient-type
-           update, epecially when combined with a faster-decaying learning rate
-           schedule, gives faster convergence.

    The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
    The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.
@ -84,13 +70,6 @@ class Cain(Optimizer):
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
-        max_eff_lr (float, optional): maximum effective learning rate for
-            natural-gradient update; this limits how aggressively we accelerate
-            directions with small gradients.  The idea is that you might want to
-            leave this constant as the regular learning rate decreasess; but
-            we can investigate alternatives here.  If you set this to <= 0,
-            it disables the natural-gradient aspect, giving a more ADAM-like
-            update (in changed co-ordinates)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
@ -122,7 +101,6 @@ class Cain(Optimizer):
        self,
        params,
        lr=1e-3,
-        max_eff_lr=3e-3,
        betas=(0.9, 0.98),
        eps=1e-8,
        target_rms=0.1,
@ -151,7 +129,6 @@ class Cain(Optimizer):

        defaults = dict(
            lr=lr,
-            max_eff_lr=max_eff_lr,
            betas=betas,
            eps=eps,
            target_rms=target_rms,
@ -179,14 +156,11 @@ class Cain(Optimizer):
        for group in self.param_groups:
            beta1, beta2 = group["betas"]
            lr = group["lr"]
-            max_eff_lr = group["max_eff_lr"]
            target_rms = group["target_rms"]
            rms_eps = group["rms_eps"]
            eps = group["eps"]
            rms_max = group["rms_max"]

-            assert lr <= max_eff_lr
-
            for p in group["params"]:
                if p.grad is None:
                    continue
@ -276,11 +250,8 @@ class Cain(Optimizer):

                # Decay the second moment running average coefficient
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-                if p.numel() > 1 and max_eff_lr > 0.0:
-                    denom = self._get_denom(state, step, exp_avg_sq, bias_correction2,
-                                            eps, lr/max_eff_lr)
-                else:
-                    denom = (exp_avg_sq.sqrt()).add_(eps)
+
+                denom = (exp_avg_sq.sqrt()).add_(eps)

                this_delta = grad / denom

@ -302,88 +273,6 @@ class Cain(Optimizer):
        return loss


-    def _get_denom(self,
-                   state: dict,
-                   step: int,
-                   exp_avg_sq: Tensor,
-                   bias_correction2: float,
-                   eps: float,
-                   beta: float):
-        """
-        This function returns a "denominator" value that we'll divide the gradients by
-        before multiplying by -lr and applying momentum.
-
-        Args:
-           state: state-dict for this parameter, needed for its "alpha" member
-           step: the current step index
-           exp_avg_sq:  Moving-average of gradient-squared for each element of the current
-                 parameter matrix (possibly after a co-ordinate change).
-           bias_correction2: A value that will be normally 1.0, but will be less than 1.0
-                near the start of training or just after we re-estimated the co-ordinate
-                changes; we can divide exp_avg_sq by this to get an expected gradient-squared.
-           eps: Epsilon value that prevents division by zero; dimensionally, this is
-                a gradient magnitude (not squared-gradient magnitude).
-           beta: A value <= 1.0, e.g. 0.1; it equals lr / max_eff_lr, and represents
-               the inverse of the "maximum acceleration" we allow the natural gradient to
-               give, versus regular ADAM.  Think of 1/beta as the maximum normalized
-               gradient root-mean-square value we allow.
-
-        We'll explain what this function does by comparing (i) basic ADAM-like updat,e
-        (ii) basic natural-gradient update, (iii) our update which combines properties
-        of both.
-
-        The basic ADAM-like version of this would be:
-             denom = (exp_avg_sq.sqrt()).add_(eps),
-        so after we divide the gradients by this, they will have unit expected squared value.
-
-        The natural-gradient version of this would be:
-             denom = (exp_avg_sq).add_(eps * eps)
-             base_denom = (exp_avg_sq.sqrt()).add_(eps),
-             denom = denom * (base_denom / denom).mean()
-        what this does is: compute 'denom' with no sqrt(), and then scale it in such
-        a way that ([root-mean-square gradient magnitude] / denom).mean() == 1.0, so
-        the overall learning speed is about the same as the baseline.
-
-        We will be returning:
-
-             denom = alpha * (exp_avg_sq)  + beta * exp_avg_sq.sqrt() + eps
-
-        and are aiming for alpha to satisfy the following condition:
-
-             ((exp_avg_sq.sqrt() + eps) / denom).mean() == 1.0           (eqn:1)
-
-        ... (eqn:1) means that the overall learning rate / "rate of progress"
-        is about the same as if denom was just (exp_avg_sq.sqrt() + eps).    Also,
-        the beta term in "denom" ensures that:
-                       (1/denom) <= 1/beta * (1/adam_denom)
-        which ensures that the speed of update for any given element will never
-        be greater than the normal ADAM-type update by more than 1/beta.
-
-        The way we update alpha is as follows.  We always have the previous iteration's
-        alpha, call this alpha_prev.  We compute:
-               denom_prev = (alpha_prev * exp_avg_sq)  + beta * exp_avg_sq.sqrt() + eps
-        and:
-               mean_prev = (exp_avg_sq.sqrt() + eps) / denom).mean()
-         .. now, mean_prev varies *at most* like 1/alpha, meaning its maximum sensitivity
-        to alpha is an inverse relationship.  So we we update alpha as:
-               alpha_cur = alpha_prev * mean_prev
-        (the idea is that this will eventually converge; and meanwhile, we'll just have
-        a slightly inaccurate alpha).  If `step` is large, we'll assume alpha has converged
-        and always return the denominator with the old alpha.
-        """
-
-        num_steps = 10 if step < 3 else 2 if step < 100 else 1
-
-        for _ in range(num_steps):
-            # our update for alpha is iterative since there isn't a closed-form solution.
-            alpha = state["alpha"]
-            adam_denom = exp_avg_sq.sqrt()
-            denom = alpha * exp_avg_sq + beta * adam_denom + eps
-            mean_speed = ((adam_denom + eps) / denom).mean()
-            alpha.fill_(alpha * mean_speed)
-        return denom
-
-
    def _change_coordinates(self,
                            grad: Tensor,
                            state: dict,
--- a/egs/librispeech/ASR/pruned_transducer_stateless4/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless4/train.py
@ -872,8 +872,7 @@ def run(rank, world_size, args):
        logging.info("Using DDP")
        model = DDP(model, device_ids=[rank])

-    optimizer = Cain(model.parameters(), lr=params.initial_lr,
-                     max_eff_lr=params.initial_lr)
+    optimizer = Cain(model.parameters(), lr=params.initial_lr)

    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)