From cc388675a9fc233fdddae5e33ba6da23ee6a3935 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 23 Jul 2022 08:24:59 +0800
Subject: [PATCH] Bug fix RE rankj

---
 .../ASR/pruned_transducer_stateless7/optim.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index 87956ed95..debc41890 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -142,8 +142,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             lr=3e-02,
             betas=(0.9, 0.98),
             size_lr_scale=0.1,
-            min_lr_factor=(0.05, 0.05, 0.05),
-            max_lr_factor=(100.0, 100.0, 100.0),
+            min_lr_factor=(0.01, 0.01, 0.01),
+            max_lr_factor=(10.0, 10.0, 10.0),
             param_rms_smooth0=0.75,
             param_rms_smooth1=0.25,
             eps=1.0e-08,
@@ -924,21 +924,23 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         P_norm = P_prime / P_prime_scale
         # Now P is as normalized as we can make it... do smoothing baserd on 'rank',
         # that is intended to compensate for bad estimates of P.
-        batch_size = p_shape[0]
-        size = P_norm.shape[0]  # size of dim we are concerned with right now
-        # `rank` is the rank of P_prime if we were to estimate it from just one
+        (batch_size, num_blocks, block_size, block_size) = P_norm.shape
+        # `rank_per_block` is the rank of each block of P_prime if we were to estimate it from just one
         # parameter tensor.  We average it over time, but actually it won't be changing
         # too much, so `rank` does tell us something.
-        rank = p_shape.numel() // (size * batch_size)
+        size = num_blocks * block_size
+        rank = p_shape.numel() // (size * batch_size)   # actually the rank of each block
         smooth0 = group["param_rms_smooth0"]
         smooth1 = group["param_rms_smooth1"]
         # We want expr for smoothing amount to be of the form: smooth = alpha * size / (beta*rank + size)
+        # for "size" here, we actually want to use block_size, since we are concerned about the
+        # robustness of the covariance within these blocks.
         # param_rms_smooth{0,1} represents the user-specified desired amount of smoothing
         # when rank==0*size and rank==1*size, respectively.
         # from rank==0*size, we get smooth0 = alpha * size/size, so alpha = smooth0.
         # from setting rank==size, we get smooth1 = alpha * size / (beta*size * size) = alpha/(1+beta),
         # so smooth1 == smooth0 / (1+beta), so (1+beta) = smooth0/smooth1, so beta=smooth0/smooth1 - 1
-        smooth = smooth0 * size / ((smooth0/smooth1 - 1) * rank + size)
+        smooth = smooth0 * block_size / ((smooth0/smooth1 - 1) * rank + block_size)
 
         # add rank-dependent smoothing amount to diagonal of P_prime.  _diag() returns an aliased tensor.
         # we don't need to multiply `smooth` by anything, because at this point, P_prime should have
@@ -2084,7 +2086,8 @@ def _test_eve_cain():
     input_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
     output_magnitudes = (1.0 * torch.randn(E, dtype=dtype, device=device)).exp()
 
-    for iter in [3, 2, 1, 0]:
+    #for iter in [3, 2, 1, 0]:  # will restore 1,0 later
+    for iter in [3, 2]:
         fix_random_seed(42)
         Linear = torch.nn.Linear if iter == 0 else ScaledLinear
         # TODO: find out why this is not converging...