This time get it right; last time I was starting from the wrong base.

Daniel Povey 2022-06-24 19:00:58 +08:00
commit 117d348f70


@@ -52,6 +52,10 @@ class NeutralGradient(Optimizer):
decreasing with the parameter norm.
param_rel_eps: An epsilon that limits how different the parameter rms estimates can
be within the same tensor
param_rel_max: Limits how big we allow the parameter variance eigs to be relative to the
original mean. Setting this to a not-very-large value like 1 ensures that
we only apply the param scaling for smaller-than-average, not larger-than-average,
eigenvalues, which limits the dispersion of eigenvalues of the parameter tensors.
param_max: To prevent parameter tensors getting too large, we will clip elements to
-param_max..param_max.
max_fullcov_size: We will only use the full-covariance (not-diagonal) update for
@@ -85,11 +89,12 @@ class NeutralGradient(Optimizer):
self,
params,
lr=3e-02,
betas=(0.9, 0.98),
betas=(0.9, 0.98, 0.95),
scale_speed=0.1,
grad_eps=1e-10,
param_eps=1.0e-06,
param_rel_eps=1.0e-04,
param_rel_max=1.0,
param_max_rms=2.0,
param_min_rms=1.0e-05,
max_fullcov_size=1023,
@@ -140,6 +145,7 @@ class NeutralGradient(Optimizer):
grad_eps=grad_eps,
param_eps=param_eps,
param_rel_eps=param_rel_eps,
param_rel_max=param_rel_max,
param_max_rms=param_max_rms,
param_min_rms=param_min_rms,
max_fullcov_size=max_fullcov_size,
@@ -174,11 +180,12 @@ class NeutralGradient(Optimizer):
for group in self.param_groups:
lr = group["lr"]
beta1, beta2 = group["betas"]
beta1, beta2, beta3 = group["betas"]
scale_speed = group["scale_speed"]
grad_eps = group["grad_eps"]
param_eps = group["param_eps"]
param_rel_eps = group["param_rel_eps"]
param_rel_max = group["param_rel_max"]
param_max_rms = group["param_max_rms"]
param_min_rms = group["param_min_rms"]
max_fullcov_size = group["max_fullcov_size"]
@@ -271,8 +278,7 @@ class NeutralGradient(Optimizer):
state["step_within_period"] = random.randint(0,
estimate_period-stats_steps)
if param_pow != 1.0:
state["scalar_exp_avg_sq"] = torch.zeros((), **kwargs)
state["scalar_exp_avg_sq"] = torch.zeros((), **kwargs)
used_scale = False
for dim in range(p.ndim):
@@ -285,6 +291,11 @@ class NeutralGradient(Optimizer):
state[f"proj_{dim}"] = torch.ones(size, **kwargs)
else:
state[f"proj_{dim}"] = torch.eye(size, **kwargs)
# we will decay the param stats via beta3; they are only
# accumulated once every `estimate_period`
state[f"param_cov_{dim}"] = torch.zeros(size, size, **kwargs)
if not used_scale:
param_rms = (p**2).mean().sqrt().add_(param_eps)
state[f"proj_{dim}"] *= param_rms
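For reference, a minimal standalone sketch (not part of the diff) of how the new per-dim param_cov stats are meant to evolve: the covariance comes from _get_param_cov further down, its overall size is normalized, and beta3 decays the old stats once per estimate_period.

import torch

size, other, beta3 = 4, 16, 0.95
p2 = torch.randn(other, size)            # parameter reshaped so `dim` is last
param_cov = torch.zeros(size, size)      # plays the role of state[f"param_cov_{dim}"]
for _ in range(3):                       # i.e. once per estimate_period
    cur = torch.matmul(p2.t(), p2) / other               # as in _get_param_cov
    cur = cur / (cur.diag().mean() + 1.0e-20)            # normalize overall size
    param_cov.mul_(beta3).add_(cur, alpha=(1 - beta3))   # decay old stats via beta3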
@@ -389,6 +400,7 @@ class NeutralGradient(Optimizer):
step_within_period = state["step_within_period"]
if step_within_period == estimate_period:
self._estimate_projections(p, state, param_eps, param_rel_eps,
param_rel_max, beta3,
param_pow, grad_pow, grad_min_rand)
state["step_within_period"] = 0
elif step_within_period >= estimate_period - stats_steps:
@@ -437,7 +449,8 @@ class NeutralGradient(Optimizer):
logging.info(f"cos_angle = {cos_angle}, shape={grad.shape}")
alpha = -lr * (1-beta1)
if param_pow != 1.0 or grad_pow != 1.0:
if True:
# Renormalize scale of cur_grad
scalar_exp_avg_sq = state["scalar_exp_avg_sq"]
scalar_exp_avg_sq.mul_(beta2).add_((cur_grad**2).mean()/n_cached_grads, alpha=(1-beta2))
@@ -509,6 +522,8 @@ class NeutralGradient(Optimizer):
state: dict,
param_eps: float,
param_rel_eps: float,
param_rel_max: float,
beta3: float,
param_pow: float,
grad_pow: float,
grad_min_rand: float,
@@ -523,7 +538,6 @@ class NeutralGradient(Optimizer):
co-ordinates. (Thus, applying with forward==True and then forward==False
is not a round trip).
Args:
p: The parameter for which we are estimating the projections. Supplied
because we need this to estimate parameter covariance matrices in
@@ -535,6 +549,7 @@ class NeutralGradient(Optimizer):
param_rel_eps: Another constraint on minimum parameter rms, relative to
sqrt(overall_param_rms), where overall_param_rms is the sqrt of the
parameter variance averaged over the entire tensor.
beta3: discounting parameter for param_cov, applied once every estimate_period.
param_pow: param_pow==1.0 means fully normalizing for parameter scale;
setting to a smaller value closer to 0.0 means partial normalization.
grad_min_rand: minimum proportion of random tensor to mix with the
@@ -557,10 +572,13 @@ class NeutralGradient(Optimizer):
grad_cov = state[f"grad_cov_{dim}"] / count
del state[f"grad_cov_{dim}"] # save memory
self._randomize_lowrank_cov(grad_cov, count, p.shape,
grad_min_rand)
#self._randomize_lowrank_cov(grad_cov, count, p.shape,
# grad_min_rand)
param_cov = self._get_param_cov(p, dim)
cur_param_cov = self._get_param_cov(p, dim)
cur_param_cov = cur_param_cov / (cur_param_cov.diag().mean() + 1.0e-20) # normalize size..
param_cov = state[f"param_cov_{dim}"]
param_cov.mul_(beta3).add_(cur_param_cov, alpha=(1-beta3))
# We want the orthogonal matrix U that diagonalizes P, where
# P is the SPD matrix such that P G^{param_pow} P^T == C^{param_pow},
@@ -570,7 +588,11 @@ class NeutralGradient(Optimizer):
# since the P's are related by taking-to-a-power.
P = self._estimate_proj(grad_cov,
param_cov,
param_pow / grad_pow)
param_pow / grad_pow,
param_rel_eps,
param_rel_max)
# The only thing we want from P is the basis that diagonalizes
# it, i.e. if we do the symmetric svd P = U S U^T, we can
# interpret the shape of U as (canonical_coordinate,
@@ -585,7 +607,8 @@ class NeutralGradient(Optimizer):
proj.fill_(1.0)
self._estimate_param_scale(p, state, param_eps,
param_rel_eps, param_pow)
param_rel_eps, param_rel_max,
param_pow)
def _estimate_param_scale(self,
@@ -593,6 +616,7 @@ class NeutralGradient(Optimizer):
state: dict,
param_eps: float,
param_rel_eps: float,
param_rel_max: float,
param_pow: float) -> None:
"""
This is called from _estimate_projections() after suitably "diagonalizing" bases
@@ -615,8 +639,10 @@ class NeutralGradient(Optimizer):
params_sq_norm = params_sq
if param_pow != 1.0:
params_sq_partnorm = params_sq
# param_diag_vars contains the diagonals of the parameter variances
# on each tensor dim, up to an arbitrary scalar factor.
param_diag_vars = [None] * p.ndim
for _ in range(3):
# Iterate 3 times, this should be enough to converge.
@@ -630,35 +656,39 @@ class NeutralGradient(Optimizer):
# (this is after normalizing previous dimensions' variance)
this_var = params_sq_norm.mean(dim=other_dims, keepdim=True)
params_sq_norm = params_sq_norm / this_var
if param_pow != 1.0:
params_sq_partnorm = params_sq_partnorm / (this_var ** param_pow)
this_var = this_var.reshape(size)
this_scale = (this_var ** (param_pow * 0.5)).reshape(size)
proj = state[f"proj_{dim}"]
if proj.ndim == 1:
proj *= this_scale
if param_diag_vars[dim] is None:
param_diag_vars[dim] = this_var.reshape(size)
else:
# scale the rows; dim 0 of `proj` corresponds to the
# diagonalized space.
proj *= this_scale.unsqueeze(-1)
param_diag_vars[dim] *= this_var.reshape(size)
for dim in range(p.ndim):
size = p.shape[dim]
if size == 1:
continue
param_diag_var = param_diag_vars[dim]
param_diag_var = self._smooth_param_diag_var(param_diag_var,
param_pow,
param_rel_eps,
param_rel_max)
param_scale = param_diag_var ** 0.5
proj = state[f"proj_{dim}"]
if proj.ndim == 1:
proj *= param_scale
else:
# scale the rows; dim 0 of `proj` corresponds to the
# diagonalized space.
proj *= param_scale.unsqueeze(-1)
# the scalar factors on the various dimensions' projections are a little
# arbitrary, and their product is arbitrary too. We normalize the scalar
# factor outside this function, see scalar_exp_avg_sq.
if param_pow != 1.0:
# need to get the overall scale correct, as if we had param_pow == 1.0
scale = (params_sq_partnorm.mean() ** 0.5)
for dim in range(p.ndim):
size = p.shape[dim]
if size == 1:
continue
else:
state[f"proj_{dim}"] *= scale
break
# Reset the squared-gradient stats because we have changed the basis.
state["exp_avg_sq"].fill_(0.0)
if param_pow != 1.0:
state["scalar_exp_avg_sq"].fill_(0.0)
state["scalar_exp_avg_sq"].fill_(0.0)
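As a standalone illustration (a sketch only, on a toy 2-D tensor, ignoring the change of basis and the size==1 dims that the real code handles), the alternating factorization above converges to per-dim variance factors:

import torch

p = torch.randn(8, 16) * torch.rand(8, 1) * torch.rand(1, 16)   # toy 2-D parameter
params_sq = p ** 2
param_diag_vars = [None] * p.ndim
for _ in range(3):                           # 3 sweeps, as in the loop above
    for dim in range(p.ndim):
        other_dims = [d for d in range(p.ndim) if d != dim]
        this_var = params_sq.mean(dim=other_dims, keepdim=True)
        params_sq = params_sq / this_var     # normalize this dim's variance
        v = this_var.reshape(p.shape[dim])
        param_diag_vars[dim] = v if param_diag_vars[dim] is None else param_diag_vars[dim] * v
# each param_diag_vars[d] now holds the per-index variance along dim d, up to a shared scalar factor.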
@@ -726,8 +756,8 @@ class NeutralGradient(Optimizer):
param_cov = torch.matmul(p.t(), p) / num_outer_products
self._randomize_lowrank_cov(param_cov, num_outer_products,
p.shape)
#self._randomize_lowrank_cov(param_cov, num_outer_products,
# p.shape)
return param_cov
@@ -794,8 +824,31 @@ class NeutralGradient(Optimizer):
(proj.t() if forward else proj)).transpose(-1, dim)
def _estimate_proj(self, grad_cov: Tensor, param_cov: Tensor,
param_pow: float = 1.0) -> Tensor:
def _smooth_param_diag_var(self,
param_diag: Tensor,
param_pow: float,
param_rel_eps: float,
param_rel_max: float) -> Tensor:
"""
Applies the smoothing formula to the eigenvalues of the parameter covariance
tensor.
(Actually, when we use this function in _estimate_param_scale, they won't exactly
be the eigenvalues, because we diagonalize relative to the grad_cov,
but they will be parameter covariances in certain directions).
"""
# normalize so mean = 1.0
param_diag = param_diag / (param_diag.mean() + 1.0e-20)
# use 1/(1/x + 1/y) to softly cap param_diag at param_rel_max (a soft version of min(x, y)).
# use param_rel_eps here to prevent division by zero.
param_diag = 1. / (1. / (param_diag + param_rel_eps) + 1. / param_rel_max)
return param_diag ** param_pow
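For instance (a quick hedged check with param_pow=1.0, param_rel_eps=1e-4, param_rel_max=1.0): an eigenvalue well above the mean is softly capped near 1.0, while below-average ones pass through almost unchanged.

d = torch.tensor([0.01, 1.0, 100.0])
d = d / d.mean()                          # -> approx [0.0003, 0.0297, 2.9703]
1. / (1. / (d + 1.0e-04) + 1. / 1.0)      # -> approx [0.0004, 0.029, 0.75]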
def _estimate_proj(self,
grad_cov: Tensor,
param_cov: Tensor,
param_pow: float = 1.0,
param_rel_eps: float = 0.0,
param_rel_max: float = 1.0e+10) -> Tensor:
"""
Return a symmetric positive definite matrix P such that
(P grad_cov P^T == param_cov^{param_pow}),
@@ -839,8 +892,11 @@ class NeutralGradient(Optimizer):
# P = Q^T (Q G Q^T)^{-0.5} Q.
# because C is symmetric, C == U S U^T, we can ignore V.
# S_sqrt is S.sqrt() in the normal case where param_pow == 1.0
S_sqrt = S ** (0.5 * param_pow)
# S_sqrt is S.sqrt() in the limit where param_pow == 1.0,
# param_rel_eps=0, param_rel_max=inf
S_smoothed = self._smooth_param_diag_var(S, param_pow,
param_rel_eps, param_rel_max)
S_sqrt = S_smoothed ** 0.5
Q = (U * S_sqrt).t()
X = torch.matmul(Q, torch.matmul(G, Q.t()))
@@ -856,23 +912,24 @@ class NeutralGradient(Optimizer):
P = torch.matmul(Y, Y.t())
if random.random() < 0.025:
# TEMP:
_,s,_ = P.svd()
print(f"Min,max eig of P: {s.min()},{s.max()}")
if random.random() < 1.0: #0.025:
# TODO: remove this testing code.
assert (P - P.t()).abs().mean() < 0.01 # make sure symmetric.
try:
P = 0.5 * (P + P.t())
_,s,_ = P.svd()
print(f"Min,max eig of P: {s.min()},{s.max()}")
except Exception:
pass
# testing... note, this is only true modulo "eps"
C_check = torch.matmul(torch.matmul(P, G), P)
# C_check should equal C
C_diff = C_check - C
C_smoothed = torch.matmul(Q.t(), Q)
# C_check should equal C_smoothed
C_diff = C_check - C_smoothed
# Roundoff can cause significant differences, so use a fairly large
# relative threshold of 0.01. We may increase this later or even remove the check.
if not C_diff.abs().mean() < 0.01 * C.diag().mean():
print(f"Warning: large C_diff: {C_diff.abs().mean()}, C diag mean: {C.diag().mean()}")
if not C_diff.abs().mean() < 0.01 * C_smoothed.diag().mean():
print(f"Warning: large C_diff: {C_diff.abs().mean()}, C diag mean: {C_smoothed.diag().mean()}")
return P
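A self-contained numerical sketch of what the check above verifies (G and C are random SPD stand-ins for grad_cov and param_cov; the direct formula P = Q^T (Q G Q^T)^{-0.5} Q follows the comment earlier in this method, and the smoothing mirrors _smooth_param_diag_var with param_pow == 1.0):

import torch

torch.manual_seed(0)
n = 5
A, B = torch.randn(n, n), torch.randn(n, n)
G = A @ A.t() + 0.1 * torch.eye(n)                   # stand-in for grad_cov
C = B @ B.t() + 0.1 * torch.eye(n)                   # stand-in for param_cov
U, S, _ = C.svd()
S = S / (S.mean() + 1.0e-20)
S_smoothed = 1. / (1. / (S + 1.0e-04) + 1. / 1.0)    # _smooth_param_diag_var, param_pow=1
Q = (U * S_smoothed ** 0.5).t()
X = torch.matmul(Q, torch.matmul(G, Q.t()))
Ux, Sx, _ = X.svd()
X_inv_sqrt = torch.matmul(Ux * Sx.clamp(min=1.0e-20) ** -0.5, Ux.t())
P = torch.matmul(Q.t(), torch.matmul(X_inv_sqrt, Q))
C_smoothed = torch.matmul(Q.t(), Q)
C_diff = torch.matmul(torch.matmul(P, G), P) - C_smoothed
assert C_diff.abs().mean() < 0.01 * C_smoothed.diag().mean()   # P G P^T == C_smoothed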
@@ -899,7 +956,7 @@ class NeutralGradient(Optimizer):
return
_, beta2 = group["betas"]
_, beta2, _ = group["betas"]
for p in group["params"]:
if p.grad is None:
@@ -1602,9 +1659,11 @@ def _test_eve_cain():
if iter == 0: optim = Eve(m.parameters(), lr=0.003)
elif iter == 1: optim = Cain(m.parameters(), lr=0.03)
elif iter == 2: optim = NeutralGradient(m.parameters(), lr=0.04, max_fullcov_size=10,
estimate_period=500, stats_steps=100)
estimate_period=500, stats_steps=100,
lr_for_speedup=0.02)
elif iter == 3: optim = NeutralGradient(m.parameters(), lr=0.04, max_fullcov_size=1000,
estimate_period=500, stats_steps=100)
estimate_period=500, stats_steps=100,
lr_for_speedup=0.02)
scheduler = Eden(optim, lr_batches=200, lr_epochs=5, verbose=False)
start = timeit.default_timer()
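For completeness, a hedged usage sketch mirroring the options exercised above: the three-element betas (the new beta3 decays the param_cov stats once per estimate_period) and param_rel_max=1.0, which softly caps the relative parameter-variance eigenvalues.

optim = NeutralGradient(m.parameters(), lr=0.04,
                        betas=(0.9, 0.98, 0.95),   # beta3 added in this commit
                        param_rel_eps=1.0e-04,
                        param_rel_max=1.0,         # added in this commit
                        max_fullcov_size=1000,
                        estimate_period=500, stats_steps=100)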