From 0666789cb83e076c28190e7859eb9d5710b22845 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 30 Jul 2022 21:48:54 -0700
Subject: [PATCH] Small numerical improvements; config changes: G_diag scaling
 changed from 1.01 to 1.005, and an eps decreased from 1e-10 to 1e-20

---
 .../ASR/pruned_transducer_stateless7/optim.py   | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index 9ae941109..135c266ef 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -124,9 +124,6 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
 
                    (4) is for smoothing the grad covariance used for (2)
 
-                   (5) is for smoothing the inverse Z^{-1} final learning-rate matrix Z relative to
-                      its own diagonal.  Only the cov_min[4] is actually used, we ignore
-                      cov_max[4]
           cov_pow: This was mainly added for development and experimentation purposes;
                   it allows you to smooth the parameter covariance matrices at the
                   stages (1), (2), (3) of smoothing mentioned above, and also
@@ -166,8 +163,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             lr=3e-02,
             betas=(0.9, 0.98),
             size_lr_scale=0.1,
-            cov_min=(0.025, 0.0025, 0.02, 0.0001, 0.05),
-            cov_max=(10.0, 80.0, 5.0, 400.0, 100.0),
+            cov_min=(0.025, 0.0025, 0.02, 0.0001),
+            cov_max=(10.0, 80.0, 5.0, 400.0),
             cov_pow=(1.0, 1.0, 1.0, 1.0),
             param_rms_smooth0=0.4,
             param_rms_smooth1=0.2,
@@ -720,10 +717,9 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
           p_shape: The shape of the parameter we are optimizing
           P: a Tensor of shape (batch_size, num_blocks, block_size, block_size),
                    containing the parameter covariance
-          G: the  gradient covariance, of shape (batch_size, num_blocks,
+          G: the gradient covariance, of shape (batch_size, num_blocks,
                   block_size, block_size)
 
-
         state[f"param_cov_{dim}"], which is an estimate of the covariance of the parameter
         p, averaged over time, and taken over dimension `dim` of the tensor.
 
@@ -765,7 +761,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
 
         G = G.clone()
         G_diag = _diag(G) # aliased
-        G_diag *= 1.01 # improve its condition, for numerical reasons.
+        G_diag *= 1.005  # slightly scale up G's diagonal to improve its conditioning, for numerical reasons.
         G = self._smooth_cov(G,
                              group["cov_min"][3],
                              group["cov_max"][3],
@@ -774,6 +770,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         # C C^T == G.
         C = G.cholesky()
 
+        P_orig = P.clone()
+
         # treat the last dim of C as being in an arbitrary space, its next-to-last dim
         # is the "canonical" one that we need to sum with the dims of P.
         P_gnorm = torch.matmul(C.transpose(2, 3),
@@ -841,7 +839,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
                 X: the batch of symmetric positive definite tensors we are smoothing;
                    of shape (batch_size, num_blocks, block_size, block_size)
        """
-        eps = 1.0e-10
+        eps = 1.0e-20
         if power != 1.0:
             U, S, _ = _svd(X)
             S_mean = _mean(S, exclude_dims=[0], keepdim=True)
@@ -866,6 +864,7 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             _diag(X_inv).add_(1. / (max_eig * cur_diag_mean))
             X = X_inv.inverse()
             X /= _mean(_diag(X), exclude_dims=[0], keepdim=True).unsqueeze(-1)
+            X = 0.5 * (X + X.transpose(-2, -1))  # make sure X is exactly symmetric.
             return X