From 962e95f1190d1cbaf2dce50a68ad6a0fe45742ed Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Sun, 24 Jul 2022 09:20:53 +0800
Subject: [PATCH] Using a more flexible test. Moved to simpler update, tuned
 differently.

---
 .../ASR/pruned_transducer_stateless7/optim.py | 36 ++++++++++++++-----
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index d1e8ee97d..a8a0a05b0 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -159,8 +159,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
         lr=3e-02,
         betas=(0.9, 0.98),
         size_lr_scale=0.1,
-        param_cov_min=(0.05, 0.01, 0.01),
-        param_cov_max=(10.0, 40.0, 10.0),
+        param_cov_min=(0.05, 0.01, 0.04),
+        param_cov_max=(10.0, 40.0, 5.0),
         param_pow=(1.0, 1.0, 1.0),
         param_rms_smooth0=0.4,
         param_rms_smooth1=0.2,
@@ -418,7 +418,8 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
                 # Only update the parameter-dependent part of the learning
                 # rate matrices at most every other time we reach here, and
                 # less frequently than that later in training.
-                self._update_param_scales(group, p, state, P_proj)
+                #self._update_param_scales(group, p, state, P_proj)
+                self._update_param_scales_simple(group, p, state, P_proj)
 
                 # We won't be doing this any more.
                 #self._diagonalize_grad_cov(group, p, state)
@@ -599,6 +600,13 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             # individual tensor dims
             this_P_proj /= _mean(this_P_proj, exclude_dims=[0], keepdim=True)
 
+            if True:
+                # debug info.
+                scale = this_P_proj.sqrt()
+                step = state["step"]
+                scale_min, scale_max, scale_mean = scale.min().item(), scale.max().item(), scale.mean().item()
+                logging.info(f"step={step}, dim={dim}, size={size}, scale min,max,mean={scale_min,scale_max,scale_mean}")
+
             Q *= this_P_proj.sqrt()
 
     def _update_param_scales(self,
@@ -775,7 +783,9 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             scale = cur_scales[dim].reshape(batch_size, num_blocks, block_size, 1)
 
             # Geometrically interpolate scale with P_proj[dim].sqrt()
-            scale = (scale * P_proj[dim].reshape(batch_size, num_blocks, block_size, 1).sqrt()).sqrt()
+            P_proj_weight = 0.5
+            scale = ((scale ** (1-P_proj_weight)) *
+                     (P_proj[dim].reshape(batch_size, num_blocks, block_size, 1) ** (P_proj_weight * 0.5)))
 
             # The following normalization step will ensure the Frobenius
             # norm is unchanged, from applying this scale: at least,
@@ -787,9 +797,15 @@ param_rms_smooth1: Smoothing proportion for parameter matrix, if assumed rank of
             # individual tensor dims
             scale /= _mean(scale**2, exclude_dims=[0], keepdim=True).sqrt()
 
+
+            if True:
+                # debug info.
+                step = state["step"]
+                scale_min, scale_max, scale_mean = scale.min().item(), scale.max().item(), scale.mean().item()
+                logging.info(f"step={step}, dim={dim}, size={size}, scale min,max,mean={scale_min,scale_max,scale_mean}")
+
             # Q is indexed [batch_index, block_index, diagonalized_coordinate, canonical_coordinate],
             # want to multiply on the diagonalized co-ordinate.
-            # else: Q is indexed [batch_index, canonical_coordinate].
             state[f"Q_{dim}"] *= scale
 
         state["last_param_scale_update"] = state["step"]
@@ -2163,11 +2179,13 @@ def _test_eve_cain():
         fix_random_seed(42)
         Linear = torch.nn.Linear if iter == 0 else ScaledLinear
         # TODO: find out why this is not converging...
-        m = torch.nn.Sequential(Linear(E, 200),
+
+        hidden_dim = 512
+        m = torch.nn.Sequential(Linear(E, hidden_dim),
                                 torch.nn.PReLU(),
-                                Linear(200, 200),
+                                Linear(hidden_dim, hidden_dim),
                                 torch.nn.PReLU(),
-                                Linear(200, E),
+                                Linear(hidden_dim, E),
                                 ).to(device)
 
         train_pairs = [ (100.0 * torch.randn(B, T, E, device=device, dtype=dtype) * input_magnitudes,
@@ -2176,7 +2194,7 @@ def _test_eve_cain():
         if iter == 0: optim = Eve(m.parameters(), lr=0.003)
         elif iter == 1: optim = Cain(m.parameters(), lr=0.03)
         elif iter == 2: optim = PrAdam(m.parameters(), lr=0.03)
-        elif iter == 3: optim = PrAdam(m.parameters(), lr=0.03, max_block_size=100)
+        elif iter == 3: optim = PrAdam(m.parameters(), lr=0.03, max_block_size=256)
         scheduler = Eden(optim, lr_batches=200, lr_epochs=5, verbose=False)
         #TEMP
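
Note (commentary, not part of the patch): the hunk at @@ -775,7 +783,9 @@ replaces the fixed elementwise geometric mean of scale and P_proj[dim].sqrt() with an explicit interpolation weight. Since the old expression (scale * P_proj.sqrt()).sqrt() equals scale ** 0.5 * P_proj ** 0.25, the hard-coded P_proj_weight = 0.5 reproduces the previous behaviour exactly while making the weight easy to tune. The sketch below is illustrative only: the tensor shapes are made up, and plain torch.mean is used in place of the file's _mean helper; everything else mirrors the patched lines.

import torch

# Hypothetical shapes; in optim.py these come from the optimizer state.
batch_size, num_blocks, block_size = 3, 2, 4
scale = torch.rand(batch_size, num_blocks, block_size, 1) + 0.5
P_proj = torch.rand(batch_size, num_blocks, block_size, 1) + 0.5

P_proj_weight = 0.5
# New form: scale ** (1 - w) * (P_proj ** 0.5) ** w.
new_scale = (scale ** (1 - P_proj_weight)) * (P_proj ** (P_proj_weight * 0.5))

# Old form: elementwise geometric mean of scale and P_proj.sqrt().
old_scale = (scale * P_proj.sqrt()).sqrt()
assert torch.allclose(new_scale, old_scale)  # identical when the weight is 0.5

# Renormalize so the mean squared scale over the non-batch dims is 1,
# mirroring the patch's scale /= _mean(scale**2, exclude_dims=[0], keepdim=True).sqrt()
# step (plain torch.mean over dims 1..3 used here instead of _mean).
new_scale = new_scale / (new_scale ** 2).mean(dim=(1, 2, 3), keepdim=True).sqrt()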