From 57957cc04913853befba9da72392bfe5994fa25e Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 15 Jun 2022 12:39:16 +0800
Subject: [PATCH] Add diagnostics

---
 .../ASR/pruned_transducer_stateless7/optim.py | 28 +++++++++++++++----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index 61874ddd1..42e18f31a 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -21,7 +21,7 @@ import torch
 import random
 from torch import Tensor
 from torch.optim import Optimizer
-
+from icefall import diagnostics # only used by the test code (_test_eve_cain)
 
 class NeutralGradient(Optimizer):
     """
@@ -65,8 +65,8 @@ class NeutralGradient(Optimizer):
             self,
             params,
             lr=1e-2,
-            betas=(0.9, 0.98, 0.99),
-            scale_speed=0.05,
+            betas=(0.9, 0.98, 0.98),
+            scale_speed=0.025,
             eps=1e-8,
             param_eps=1.0e-05,
             cond_eps=1.0e-10,
@@ -261,7 +261,7 @@ class NeutralGradient(Optimizer):
                         if step % stats_period == 0:
                             self._accumulate_per_dim_stats(grad, state, beta3, eps)
 
-                        if step % estimate_period == 0 or step in [25, 50, 200, 400]:
+                        if step % estimate_period == 0  or step in [25, 50, 200, 400]:
                             self._estimate(p, state, beta3, max_size,
                                            stats_period, estimate_period,
                                            eps, param_eps,
@@ -526,7 +526,10 @@ class NeutralGradient(Optimizer):
         param_cov = torch.matmul(p.t(), p) / p.shape[0]
 
         # later we may be able to find a more principled formula for this.
-        diag_smooth = max(min_diag_smooth, size / (size + num_outer_products))
+        #if random.random() < 0.2:
+        #    print(f"param diag_smooth = {diag_smooth}, shape={p.shape}")
+        #diag_smooth = min_diag_smooth
+        diag_smooth = 0.4
         diag = param_cov.diag()
         extra_diag = (diag * diag_smooth) + (diag.max() * cond_eps +
                                              param_eps * param_eps)
@@ -582,10 +585,13 @@ class NeutralGradient(Optimizer):
         num_outer_products = rank_per_iter * num_iters_in_stats
         diag_smooth = max(min_diag_smooth,
                           size / (size + num_outer_products))
+        if random.random() < 0.5:
+            print(f"grad diag_smooth = {diag_smooth}, shape={p.shape}")
+
         diag = grad_cov.diag()
         extra_diag = (diag * diag_smooth) + (diag.max() * cond_eps +
                                              eps * eps)
-        grad_cov.mul_(1-diag_smooth).add_(extra_diag.diag())
+        grad_cov = (grad_cov * (1-diag_smooth)).add_(extra_diag.diag())
         return grad_cov
 
     def _get_cov(self, x: Tensor, dim: int) -> Tensor:
@@ -1338,6 +1344,14 @@ def _test_eve_cain():
         start = timeit.default_timer()
         for epoch in range(150):
             scheduler.step_epoch()
+
+            if epoch == 130:
+                opts = diagnostics.TensorDiagnosticOptions(
+                    2 ** 22
+                )  # allow 4 megabytes per sub-module
+                diagnostic = diagnostics.attach_diagnostics(m, opts)
+
+
             for n, (x,y) in enumerate(train_pairs):
                 y_out = m(x)
                 loss = ((y_out - y)**2).mean() * 100.0
@@ -1356,6 +1370,8 @@ def _test_eve_cain():
                 optim.zero_grad()
                 scheduler.step_batch()
 
+        diagnostic.print_diagnostics()
+
         stop = timeit.default_timer()
         print(f"Iter={iter}, Time taken: {stop - start}")