From 3110138ab578eb18c465b2f5ee76cd0428921e19 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Sat, 30 Jul 2022 07:30:32 +0800
Subject: [PATCH 1/2] Smooth grad_cov with eps; add a 4th stage of smoothing,
 this time on Z_inv.

---
 .../ASR/pruned_transducer_stateless7/optim.py | 27 ++++++++++++++++---
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index 945e3ab19..5ebe88b4d 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -114,7 +114,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
                      param covariance equals the dimension of the covariance matrix.
                      param_rms_smooth{0,1} determine the smoothing proportions for other
                      conditions.
-    cov_min,cov_max: [IMPORTANT] 4-tuples of minimums and maximums of the diagonal values of
+    cov_min,cov_max: [IMPORTANT] 5-tuples of minimums and maximums of the diagonal values of
                      covariance matrices, after normalizing to unit-mean. The first 3 are for
                      smoothing the parameter covariance, normalized in 3 different ways:
                        (1) relative to its own diagonal (in a basis that diagonalizes the grad
@@ -123,6 +123,11 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
                        (3) in the canonical basis.
 
                      (4) is for smoothing the grad covariance used for (2)
+
+                 (5) is for smoothing the final learning-rate matrix Z relative to
+                     its own diagonal. Only the cov_max is actually used.
+                     The cov_min ends up not affecting the result, so we set it
+                     to 0.0.
     cov_pow:         This was mainly added for development and experimentation purposes;
                      it allows you to smooth the parameter covariance matrices at the
                      stages (1), (2), (3) of smoothing mentioned above, and also
@@ -162,8 +167,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             lr=3e-02,
             betas=(0.9, 0.98),
             size_lr_scale=0.1,
-            cov_min=(0.025, 0.0025, 0.02, 0.0001),
-            cov_max=(10.0, 80.0, 5.0, 400.0),
+            cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.0),
+            cov_max=(10.0, 80.0, 5.0, 400.0, 10.0),
             cov_pow=(1.0, 1.0, 1.0, 1.0),
             param_rms_smooth0=0.4,
             param_rms_smooth1=0.2,
@@ -877,6 +882,9 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         """
         ndim = len(p_shape)
         P_proj = [None] * ndim
+        denom_rel_eps = group["denom_rel_eps"]
+        eps = group["eps"]
+
         for dim in range(1, ndim):
             size = p_shape[dim]
             try:
@@ -899,6 +907,11 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             # dimensions with zero grad separate from those with nonzero grad.
             G_prime = _diag(torch.matmul(U_g.transpose(2,3), torch.matmul(grad_cov, U_g)))
 
+            # Use the form of the diagonalized gradient matrix that we get after
+            # we add the Adam-type smoothing with epsilon.
+            G_prime += (_mean(G_prime, exclude_dims=[0], keepdim=True) * (denom_rel_eps * denom_rel_eps)
+                        + (eps * eps))
+
             # P_prime is P' above, which represents param_cov in the basis that diagonalizes G_prime.
             # It is not smoothed yet.
             P_prime = torch.matmul(U_g.transpose(2, 3), torch.matmul(param_cov, U_g))
@@ -957,12 +970,18 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
 
                 _check_similar(G_prime, G_prime_check, "G_prime")
 
+            Z_prime_inv_diag = _diag(Z_prime_inv)  # aliased with Z_prime_inv
+            # this is smoothing Z relative to its own diagonal. This is z_inv,
+            # so by applying a minimum here, we are applying a maximum of the
+            # eigs of Z after normalizing so the diagonal is 1.
+            Z_prime_inv_diag *= (1. + 1. / group["cov_max"][4])
 
             # We really want the SVD on Z, which will be used for the learning-rate matrix
             # Q, but Z_prime is better, numerically, to work on because it's closer to
             # being diagonalized.
             U_z_prime, S_z_prime_inv, _ = _svd(Z_prime_inv)
+
             U_z = torch.matmul(U_g, U_z_prime)
 
             # We could obtain Z in two possible ways.
             # Commenting the check below.
@@ -2203,7 +2222,7 @@ def _test_eve_cain():
         fix_random_seed(42)
         Linear = torch.nn.Linear if iter == 0 else ScaledLinear
 
-        hidden_dim = 400
+        hidden_dim = 768
         m = torch.nn.Sequential(Linear(E, hidden_dim),
                                 torch.nn.PReLU(),
                                 Linear(hidden_dim, hidden_dim),

From 17bc002e6e1439a7ad01e6828cfbc7d780418bca Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Sat, 30 Jul 2022 07:45:29 +0800
Subject: [PATCH 2/2] Refactoring that does not affect results.

---
 .../ASR/pruned_transducer_stateless7/optim.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index 5ebe88b4d..4146ec4b4 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -124,10 +124,9 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
 
                      (4) is for smoothing the grad covariance used for (2)
 
-                 (5) is for smoothing the final learning-rate matrix Z relative to
-                     its own diagonal. Only the cov_max is actually used.
-                     The cov_min ends up not affecting the result, so we set it
-                     to 0.0.
+                 (5) is for smoothing the inverse Z^{-1} of the final learning-rate matrix Z
+                     relative to its own diagonal. Only cov_min[4] is actually used; we
+                     ignore cov_max[4].
     cov_pow:         This was mainly added for development and experimentation purposes;
                      it allows you to smooth the parameter covariance matrices at the
                      stages (1), (2), (3) of smoothing mentioned above, and also
@@ -167,8 +166,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             lr=3e-02,
             betas=(0.9, 0.98),
             size_lr_scale=0.1,
-            cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.0),
-            cov_max=(10.0, 80.0, 5.0, 400.0, 10.0),
+            cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.1),
+            cov_max=(10.0, 80.0, 5.0, 400.0, 100.0),
             cov_pow=(1.0, 1.0, 1.0, 1.0),
             param_rms_smooth0=0.4,
             param_rms_smooth1=0.2,
@@ -974,7 +973,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             # this is smoothing Z relative to its own diagonal. This is z_inv,
             # so by applying a minimum here, we are applying a maximum of the
             # eigs of Z after normalizing so the diagonal is 1.
-            Z_prime_inv_diag *= (1. + 1. / group["cov_max"][4])
+            Z_prime_inv_diag *= (1. + group["cov_min"][4])
 
             # We really want the SVD on Z, which will be used for the learning-rate matrix
             # Q, but Z_prime is better, numerically, to work on because it's closer to
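Note on the stage-(5) step that both patches touch: the sketch below is a standalone illustration, not code from optim.py. The helper name smooth_z_inv, the single square (unbatched) Z^{-1} matrix, and the use of Tensor.diagonal() in place of the repository's _diag() helper are assumptions made for the example; it only mirrors the in-place diagonal scaling performed by Z_prime_inv_diag *= (...), and the comments show why the patch-2 defaults keep the scale factor at 1.1, matching patch 1.

# Standalone sketch of the stage-(5) smoothing (illustration only, not code from
# optim.py). Assumptions: a single square Z^{-1} matrix rather than the batched
# matrices the optimizer really works on, Tensor.diagonal() in place of the
# repository's _diag() helper, and an invented function name `smooth_z_inv`.
import torch


def smooth_z_inv(Z_inv: torch.Tensor, cov_min_4: float) -> torch.Tensor:
    """Return a copy of Z^{-1} whose diagonal is scaled by (1 + cov_min_4).

    Boosting the diagonal of Z^{-1} puts a floor on its eigenvalues relative to
    its own diagonal, i.e. it caps the eigenvalues of Z after normalizing Z so
    that its diagonal is 1 (the intent stated in the comments in the patch).
    """
    Z_inv = Z_inv.clone()
    diag = Z_inv.diagonal(dim1=-2, dim2=-1)  # a view aliased with Z_inv, like _diag()
    diag *= (1.0 + cov_min_4)  # mirrors: Z_prime_inv_diag *= (1. + group["cov_min"][4])
    return Z_inv


if __name__ == "__main__":
    Z_inv = torch.eye(4) + 0.01 * torch.randn(4, 4)
    # With the defaults in these patches the scale factor is unchanged:
    #   patch 1:  1 + 1 / cov_max[4] = 1 + 1 / 10.0 = 1.1
    #   patch 2:  1 + cov_min[4]     = 1 + 0.1      = 1.1
    print(smooth_z_inv(Z_inv, cov_min_4=0.1))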