diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index c2e8379be..aebbd7c47 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -109,7 +109,7 @@ class PrAdam(BatchedOptimizer):
                  is the scaling factor on the learning rate of p_scale.
     param_pow: Power on the parameter covariance matrix, 1.0 means learn proportional to
                  parameter rms (1.0 will be too much, should be between 0 and 1.)
-                 This is one of the most important tunable factors, along with max_lr_factor.
+                 This is one of the most important tunable factors, along with param_cov_max.
     param_rms_smooth0: Limiting value of smoothing proportion for parameter matrix, as assumed
                  rank of param covariance [==product of sizes on the other tensor dims]
                  approaches 0.
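The param_pow option above raises the (symmetric, positive semi-definite) parameter covariance to a power before it is used, and the docstring added below notes that any value other than 1.0 causes a speed penalty because it requires SVD. A minimal sketch of what that operation involves, assuming PyTorch and a hypothetical helper name (this is not code from optim.py):

import torch

def matrix_power_psd(cov: torch.Tensor, power: float, eps: float = 1e-8) -> torch.Tensor:
    """Return cov**power for a batch of symmetric PSD matrices of shape (..., n, n)."""
    if power == 1.0:
        return cov  # the cheap path: no decomposition needed
    cov = 0.5 * (cov + cov.transpose(-2, -1))  # enforce exact symmetry
    eigs, U = torch.linalg.eigh(cov)           # the slow step: eigendecomposition / SVD
    eigs = eigs.clamp(min=eps) ** power        # take the spectrum to the chosen power
    return (U * eigs.unsqueeze(-2)) @ U.transpose(-2, -1)

With power = 0.5, for example, directions with large parameter variance are boosted less aggressively than with power = 1.0, which is the kind of smoothing effect the docstring describes.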
@@ -117,24 +117,41 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
                  param covariance equals the dimension of the covaraince matrix.
                  param_rms_smooth{0,1} determine the smoothing proportions for other
                  conditions.
-    max_lr_factor: How much faster we allow any direction in parameter space to learn faster
-                 than the mean... this is a relatively important thing to tune,
-                 along with param_pow.
-    eps: An epsilon to prevent division by zero
+    param_cov_min: [IMPORTANT] A 3-tuple of minimums of the diagonal values of the parameter
+                 covariance, normalized in 3 different ways: relative to its own
+                 diagonal, scaled by the grad covariance, and in the canonical basis.
+                 With param_cov_max, defines how "aggressive" we allow our update to
+                 be.
+    param_cov_max: [IMPORTANT] A 3-tuple of maximums of the diagonal values of the parameter
+                 covariance, normalized in 3 different ways: relative to its own
+                 diagonal, scaled by the grad covariance, and in the canonical basis.
+    param_pow: This was mainly added for development and experimentation purposes;
+                 it allows you to smooth the parameter covariance by taking it to
+                 a power, but if this is not 1.0 it will cause a speed penalty because
+                 it requires SVD.
+    eps: A general-purpose epsilon to prevent division by zero
     param_min_rms: Minimum root-mean-square value of parameter tensor, for purposes of
-                 learning the scale on the parameters (we'll keep it >= this size)
+                 learning the scale on the parameters (we'll constrain the rms of each non-scalar
+                 parameter tensor to be >= this value)
     param_max_rms: Maximum root-mean-square value of parameter tensor, for purposes of
-                 learning the scale on the parameters (we'll keep it <= this size)
-    scalar_max: Maximum absolute value for scalar parameters
+                 learning the scale on the parameters (we'll constrain the rms of each non-scalar
+                 parameter tensor to be <= this value)
+    scalar_max: Maximum absolute value for scalar parameters (applicable if your
+                 model has any parameters with numel() == 1)
     size_update_period: The periodicity, in steps, with which we update the size (scale)
-                 of the parameter tensor.  This is provided to save a little time.
-    lr_update_period: Determines the periodicity, in steps, with which we update the
-                 learning-rate matrices.  The first number is the periodicity at
+                 of the parameter tensor.  This is provided to save a little time
+                 in the update.
+    lr_update_period: [IMPORTANT] A 2-tuple of ints that determines the periodicity, in steps, with
+                 which we update the learning-rate matrices.  The first number is the periodicity at
                  the start of training, the second number is the periodicity
-                 later in training.  One step of updating the learning rate matrices
-                 can take as long as over 50 minibatches, because SVD on GPU is slow.
-                 ** This is important for the speed/optimizaton tradeoff. **
-    max_block_size: The maximum block size in block-diagonal co-ordinate transformations.
+                 later in training, and we gradually increase from one to the other.
+                 The reason for such a complicated schedule is that updating the learning
+                 rate matrices is very slow, principally because it requires SVD, and SVD
+                 seems to have quite slow implementations.
+    max_block_size: [IMPORTANT] The maximum block size in block-diagonal co-ordinate
+                 transformations.  You can probably set this to 512 or 1024.  Larger
+                 values will require more MEMORY and may be a bit slower, but should
+                 lead to better optimization performance.
     """
     def __init__(
         self,
@@ -142,8 +159,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         lr=3e-02,
         betas=(0.9, 0.98),
         size_lr_scale=0.1,
-        min_lr_factor=(0.05, 0.01, 0.01),
-        max_lr_factor=(10.0, 40.0, 10.0),
+        param_cov_min=(0.05, 0.01, 0.01),
+        param_cov_max=(10.0, 40.0, 10.0),
         param_pow=(1.0, 1.0, 1.0),
         param_rms_smooth0=0.4,
         param_rms_smooth1=0.2,
@@ -165,8 +182,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         defaults = dict(
             lr=lr,
             size_lr_scale=size_lr_scale,
-            min_lr_factor=min_lr_factor,
-            max_lr_factor=max_lr_factor,
+            param_cov_min=param_cov_min,
+            param_cov_max=param_cov_max,
             param_pow=param_pow,
             param_rms_smooth0=param_rms_smooth0,
             param_rms_smooth1=param_rms_smooth1,
@@ -649,12 +666,12 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             # so we need to transpose Q as we convert M to the diagonalized co-ordinate.
             M = torch.matmul(M, Q.transpose(-2, -1))  # (batch_size, num_blocks, x, z, y, block_size)
             M = _move_dim(M, 1, -2)  # (batch_size, x, z, y, num_blocks, block_size)
-            M = M.reshape(*M.shape[:-2], size)  # # (batch_size, x, z, y, size)
+            M = M.reshape(*M.shape[:-2], size)  # (batch_size, x, z, y, size)
             cur_p = M.transpose(dim, -1)  # (batch_size, x, size, y, z)

             # cur_param_var is a diagonal parameter variance over dimension `dim`,
             # of the current "slightly-whitened" parameter; it
-            # will have shape something like [1, size, 1]; or [batch_size, 1, size, 1].
+            # will have shape [batch_size, 1, size, 1].
             cur_param_var = _mean(cur_p**2, exclude_dims=[0,dim], keepdim=True)
             # (batch_size, 1, size, 1, 1) if dim==2
@@ -848,6 +865,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of

         # It is not smoothed yet.
         P_prime = torch.matmul(U_g.transpose(2, 3), torch.matmul(param_cov, U_g))
+        P_prime_unsmoothed = P_prime
         P_prime = self._smooth_param_cov(group, p_shape, P_prime, G_prime)

         # C will satisfy: P_prime == torch.matmul(C, C.transpose(2, 3))
@@ -900,7 +918,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             U_z, S, _ = _svd(Z)
             if True:
                 skip = 10 if S.shape[-1] > 40 else 1
-                logging.info(f"Eigs of Z are: {S[0,0,::skip]}")
+                logging.info(f"dim={dim}, G_prime is {G_prime[0,0,::skip]}, Eigs of Z are: {S[0,0,::skip]}")

             # state[f"Q_{dim}"] is indexed: [batch_idx, block_idx, diagonalized_coordinate, canonical_coordinate].
             # so we need to transpose U_z as U_z is indexed
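The matmul-with-Q.transpose pattern in the hunks above is the block-diagonal co-ordinate change: a dimension of size `size` is viewed as (num_blocks, block_size) blocks, each block gets its own block_size x block_size transform, and applying it to a vector then costs roughly size * block_size operations instead of size ** 2. A simplified, unbatched sketch of that transform, assuming orthogonal per-block matrices indexed [block, diagonalized_coordinate, canonical_coordinate] as in the comment above (the helper below is illustrative, not code from optim.py):

import torch

def rotate_blocks(p: torch.Tensor, Q: torch.Tensor, dim: int) -> torch.Tensor:
    """p: any tensor; Q: (num_blocks, block_size, block_size) orthogonal blocks.
    Returns p expressed in the block-diagonalized co-ordinates along `dim`."""
    num_blocks, block_size, _ = Q.shape
    size = p.shape[dim]
    assert size == num_blocks * block_size
    p = p.movedim(dim, -1)                                # (..., size)
    p = p.reshape(*p.shape[:-1], num_blocks, block_size)  # (..., num_blocks, block_size)
    # Q is indexed [block, diagonalized, canonical], so right-multiply by Q^T.
    p = torch.matmul(p.unsqueeze(-2), Q.transpose(-2, -1)).squeeze(-2)
    p = p.reshape(*p.shape[:-2], size)                    # back to (..., size)
    return p.movedim(-1, dim)

If the blocks are orthogonal, as assumed here, the inverse transform is the same computation with Q in place of Q.transpose(-2, -1).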
@@ -917,8 +935,11 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             this_P_proj = _diag(torch.matmul(U_prod, torch.matmul(P_prime, U_prod.transpose(2, 3))))
             P_proj[dim] = this_P_proj.clone().reshape(batch_size, size)
             if True:
+                this_P_proj_unsmoothed = _diag(torch.matmul(U_prod, torch.matmul(P_prime_unsmoothed,
+                                                                                 U_prod.transpose(2, 3))))
+                this_P_proj_unsmoothed = this_P_proj_unsmoothed.clone().reshape(batch_size, size)
                 skip = 10 if P_proj[dim].shape[-1] > 40 else 1
-                logging.info(f"Eigs of P_proj are: {P_proj[dim][0,::skip]}")
+                logging.info(f"dim={dim}, diag of P_proj is: {P_proj[dim][0,::skip]}, diag of unsmoothed P_proj is: {this_P_proj_unsmoothed[0,::skip]}")

         return P_proj
@@ -989,15 +1010,15 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         _diag(P_norm).add_(smooth)

         P_norm = self._smooth_cov(P_norm,
-                                  group["min_lr_factor"][0],
-                                  group["max_lr_factor"][0],
+                                  group["param_cov_min"][0],
+                                  group["param_cov_max"][0],
                                   group["param_pow"][0])
         # Remove the diagonal preconditioning on P_norm, giving us stage-1-smoothed
         # version of P_prime.
         P_prime = P_norm * P_prime_scale

         # Make sure G_prime has unit mean and no eigenvalue is super small.  Note, G_prime
-        # is already diagonal.
+        # is already diagonalized; the variable G_prime is just the tensor of eigenvalues.
         G_prime_mean = _mean(G_prime, exclude_dims=[0], keepdim=True)
         G_prime_smooth = 0.001
         # make sure G_prime has no zero eigs, and is unit mean.
@@ -1012,16 +1033,16 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         P_gnorm = P_prime * G_prime_scale
         # Apply another round of smoothing "relative to G"
         P_gnorm = self._smooth_cov(P_gnorm,
-                                   group["min_lr_factor"][1],
-                                   group["max_lr_factor"][1],
+                                   group["param_cov_min"][1],
+                                   group["param_cov_max"][1],
                                    group["param_pow"][1])
         # Undo the scaling relative to G, so we have stage-2-smoothed version of P_prime.
         P_prime = P_gnorm / G_prime_scale

         # Apply a 3rd round of smoothing in the canonical basis.
         P_prime = self._smooth_cov(P_prime,
-                                   group["min_lr_factor"][2],
-                                   group["max_lr_factor"][2],
+                                   group["param_cov_min"][2],
+                                   group["param_cov_max"][2],
                                    group["param_pow"][2])
         return P_prime
@@ -1349,7 +1370,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         """
         smooth0 = group["param_rms_smooth0"]
         smooth1 = group["param_rms_smooth1"]
-        max_lr_factor = group["max_lr_factor"]
+        param_cov_max = group["param_cov_max"]
         param_pow = group["param_pow"]
         eps = group["eps"]
         batch_size = rms.shape[0]
@@ -1369,9 +1390,9 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of

         ans = rms / new_mean

-        # Apply a `soft min` of max_lr_factor via the formula
+        # Apply a `soft min` of param_cov_max via the formula
         # softmin(x,y) = 1/(1/x + 1/y).
-        ans = 1. / (1. / ans + 1. / max_lr_factor)
+        ans = 1. / (1. / ans + 1. / param_cov_max)

         # and renormalize to mean=1.
         ans /= _mean(ans, exclude_dims=[0], keepdim=True)
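The `soft min` in the final hunk can be read on its own: softmin(x, y) = 1/(1/x + 1/y) behaves like min(x, y) when the two values are far apart but caps smoothly rather than with a hard corner, and the following division restores a mean of 1 over the non-batch dimension. A small self-contained sketch, assuming a (batch_size, size) tensor of positive factors and a scalar cap (the helper name is made up):

import torch

def soft_cap_and_normalize(ans: torch.Tensor, param_cov_max: float) -> torch.Tensor:
    """ans: positive per-direction factors of shape (batch_size, size)."""
    ans = 1.0 / (1.0 / ans + 1.0 / param_cov_max)  # softmin(ans, param_cov_max)
    ans = ans / ans.mean(dim=-1, keepdim=True)     # renormalize to mean = 1 per batch element
    return ans

For instance, with a value of 5.0 and param_cov_max = 10.0, the soft cap gives 1/(1/5 + 1/10) ~= 3.33, noticeably below the hard minimum of 5.0, so the cap starts to bite well before the limit is reached.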