From b3bb2dac6fb338291aa451fbd929c14d590b3e14 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Sun, 10 Jul 2022 06:28:01 +0800
Subject: [PATCH] Iterative, more principled way of estimating param_cov

---
 .../ASR/pruned_transducer_stateless7/optim.py | 104 +++++++++++++++---
 1 file changed, 91 insertions(+), 13 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index 1f6b33b70..67cdbc48f 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -360,23 +360,100 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         ndim = p.ndim
         numel = p.numel()
+        if numel in p.shape:
+            return  # Nothing to do for this parameter matrix, e.g. a bias or a scalar.
+
+        scale_arr = [None] * ndim
+
+        # The small random part is to ensure there are no exact zeros, e.g.
+        # if we initialized some parameters to zero.
+        eps = 1.0e-20
+        cur_p = p + eps * torch.randn_like(p)
+
+        dims_to_sum = list(range(ndim - 1))
+
         for dim in range(ndim):
             size = p.shape[dim]
             if size == 1 or size == p.numel():
                 continue
             param_cov = state[f"param_cov_{dim}"]
             U, S, V = _svd(param_cov)
-            rank = numel // size
-            rms = self._smooth_param_rms(group, S.sqrt(), rank)
+            Q = state[f"Q_{dim}"]
+            Q[:] = U.t()
 
-            if random.random() < 0.0005:
-                logging.info(f"Shape={tuple(p.shape)}, dim={dim}, rank={rank}, size={size}, rms={rms[::10]}")
+            M = cur_p.transpose(dim, -1)
+            N = torch.matmul(M, U)
+            cur_param_var = (N * N).mean(dim=dims_to_sum)
+            # cur_param_var would equal S if the variance stats param_cov_{dim}
+            # had been accumulated from this exact parameter matrix; in fact
+            # they also contain other versions of the parameter covariance, so
+            # they will, in general, be less extreme.  We scale p to match the
+            # accumulated stats; the idea is to ensure it doesn't have any
+            # too-small eigenvalues (where the stats permit).
+            scale = (S.clamp(min=eps) / cur_param_var.clamp(min=eps)).sqrt()
+            N *= scale
+            cur_p = N.transpose(dim, -1)
+
+        # At this point cur_p is the parameter matrix multiplied, in each
+        # non-trivial dim, by an orthogonal matrix and by scalars chosen to
+        # match the accumulated covariance stats (in general this makes a
+        # modest difference, flattening the eigenvalue distribution a bit).
+        # Now we work out the "scaling" part of the learning-rate matrices Q.
+        # We can't do this independently for each dim, because there is a risk
+        # of "counting things twice".  E.g. for a matrix with 2 dims, if we do
+        # the SVD M = U S V^T and consider the covariance on both the left and
+        # the right, S is reflected in both covariances, so it doesn't make
+        # sense to correct for S twice.  Not all parameter matrices have
+        # exactly 2 dims, and we are also dealing with accumulated parameter
+        # stats, which makes things not quite so simple, so we don't want to
+        # just take the sqrt of S.
+
+        # cur_scales[dim] will be a 1-d tensor of shape (size,) = (p.shape[dim],),
+        # containing the scales on the learning-rate matrix for this dimension.
+        # We apply these scales to the parameter matrix before estimating the
+        # cur_scales for the other dims.
+        cur_scales = [None] * ndim
+
+        # debug = random.random() < 0.1
+        debug = True
+        for i in range(4):  # 4 iterations of refinement
+            for dim in range(ndim):
+                size = p.shape[dim]
+                if size == 1 or size == p.numel():
+                    continue
+
+                M = cur_p.transpose(dim, -1)
+                # Correct for the fact that we have already normalized this dim in cur_p.
+                if cur_scales[dim] is not None:
+                    M *= cur_scales[dim]
+                rms = (M**2).mean(dim=dims_to_sum).sqrt()
+                rank = numel // size
+                smoothed_rms = self._smooth_param_rms(group, rms, rank)
+                cur_scales[dim] = smoothed_rms
+                M /= smoothed_rms  # normalize this dim
+
+                if debug:
+                    logging.info(f"i={i} shape={tuple(p.shape)}, dim={dim}, rank={rank}, size={size}, rms={rms[::10]}, smoothed_rms={smoothed_rms[::10]}")
+
+        # Apply the scales in `cur_scales` to Q for each dim; this reflects the
+        # parameter rms values in the parameter-diagonalized space.
+        for dim in range(ndim):
+            if cur_scales[dim] is not None:
+                # Q is indexed [diagonalized_coordinate, canonical_coordinate];
+                # we want to multiply on the diagonalized coordinate.
+                state[f"Q_{dim}"] *= cur_scales[dim].unsqueeze(-1)
+
+        for dim in range(ndim):
+            size = p.shape[dim]
+            if size == 1 or size == p.numel():
+                continue
             Q = state[f"Q_{dim}"]
-            Q[:] = (U * rms).t()
-            if True:
-                # This block does the actual diagonalization.
+            # This block does the diagonalization of the gradient covariance, by
+            # multiplying by an orthogonal matrix (we'll take into account each
+            # element's gradient variance via exp_avg_sq).
+            #
             # Suppose the actual parameter matrix p is M, of shape (-1, size), where
             # the -1 represents all other tensor dims treated as a batch dimension.
             # M_grad is the same shape as M.  We could write a pseudo-loss as
@@ -606,12 +683,13 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         new_mean = (eps + (smooth + 1) * mean)
         ans = rms / new_mean
-        # Apply max_rms
-        max_rms = 5.0
-        ans.clamp_(max=max_eig*2)
-        ans /= ans.mean()
-        ans.clamp_(max=max_eig)
-        ans /= ans.mean()
+        if False:
+            # Apply max_rms
+            max_rms = 5.0
+            ans.clamp_(max=max_rms*2)
+            ans /= ans.mean()
+            ans.clamp_(max=max_rms)
+            ans /= ans.mean()
         return ans
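Note for reviewers (not part of the patch): below is a minimal, self-contained sketch of the rescaling step in the first hunk, where the parameter is rotated into the eigenbasis of the accumulated covariance for one dim and rescaled so that its per-direction variance matches the accumulated eigenvalues S. The toy shapes, the way param_cov is simulated, and the use of torch.linalg.svd in place of the _svd helper are illustrative assumptions, not taken from optim.py.

import torch

# Toy parameter; we process dim=1, so transpose(dim, -1) is a no-op here.
eps = 1.0e-20
p = torch.randn(8, 16)
dim, size = 1, 16
numel = p.numel()

# Simulated accumulated covariance over this dim.  In the optimizer this is
# state[f"param_cov_{dim}"], accumulated over training; here we average the
# covariance of a few noisy copies of p, so the stats differ a little from
# the stats of p itself.
param_cov = torch.zeros(size, size)
for _ in range(4):
    q = p + 0.1 * torch.randn_like(p)
    param_cov += (q.t() @ q) / (numel // size)
param_cov /= 4

# param_cov is symmetric PSD, so U holds its eigenvectors and S its eigenvalues.
U, S, _ = torch.linalg.svd(param_cov)

M = p.transpose(dim, -1)          # put the processed dim last
N = M @ U                         # rotate into the eigenbasis of param_cov
cur_param_var = (N * N).mean(dim=0)

# Rescale each direction so this particular p matches the accumulated stats,
# i.e. no direction is left "too small" where the stats permit more.
scale = (S.clamp(min=eps) / cur_param_var.clamp(min=eps)).sqrt()
N = N * scale
cur_p = N.transpose(dim, -1)

print(torch.allclose((N * N).mean(dim=0), S, rtol=1e-4))  # per-direction variance now matches S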
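Similarly, a sketch (also not part of the patch) of the iterative per-dim scale estimation in the 4-iteration loop: each dim's rms scales are estimated with the other dims' scales already divided out of the parameter, which is what avoids "counting things twice" across dims. The real code smooths the rms values with self._smooth_param_rms(group, rms, rank); here that is replaced by a simple stand-in (normalize to mean 1 and clamp the dynamic range), which is an assumption for illustration only.

import torch

def toy_iterative_scales(p: torch.Tensor, num_iters: int = 4, eps: float = 1.0e-20):
    """Estimate one rms-scale vector per non-trivial dim of p, iteratively,
    normalizing the parameter by each dim's scales before re-estimating the
    others."""
    ndim, numel = p.ndim, p.numel()
    dims_to_sum = list(range(ndim - 1))
    cur_p = p + eps * torch.randn_like(p)
    cur_scales = [None] * ndim
    for _ in range(num_iters):
        for dim in range(ndim):
            size = p.shape[dim]
            if size == 1 or size == numel:
                continue
            M = cur_p.transpose(dim, -1)
            if cur_scales[dim] is not None:
                # Undo this dim's own normalization from the previous iteration.
                M = M * cur_scales[dim]
            rms = (M ** 2).mean(dim=dims_to_sum).sqrt()
            # Stand-in for self._smooth_param_rms(group, rms, rank): keep the
            # mean at 1 and limit how extreme individual scales can get.
            smoothed_rms = (rms / rms.mean()).clamp(min=0.1, max=10.0)
            cur_scales[dim] = smoothed_rms
            cur_p = (M / smoothed_rms).transpose(dim, -1)
    return cur_scales

scales = toy_iterative_scales(torch.randn(8, 16))
print([None if s is None else tuple(s.shape) for s in scales])  # [(8,), (16,)]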