diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
index 384b6407d..d7ebaec5c 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/optim.py
@@ -94,10 +94,6 @@ class NeutralGradient(Optimizer):
             raise ValueError(
                 "Invalid beta parameter at index 1: {}".format(betas[1])
             )
-        if not 0.0 < betas[2] < 1.0:
-            raise ValueError(
-                "Invalid beta parameter at index 2: {}".format(betas[2])
-            )
         if not 0 < scale_speed < 1.0:
             raise ValueError("Invalid scale_speed value: {}".format(scale_speed))
         if not 0.0 <= grad_eps < 0.1:
@@ -199,10 +195,10 @@ class NeutralGradient(Optimizer):
                 if not is_one_axis:
                     # each parameter has a different random time, modulo estimate_period,
-                    # to re-estimate the projections.  "steps_this_period" will
-                    # be reset to 0 when it reaches esetimate_period.
-                    state["steps_this_period"] = random.random(0,
-                                                               estimate_period-stats_steps)
+                    # to re-estimate the projections.  "step_within_period" will increase by
+                    # 1 on each step, and will be reset to 0 when it reaches estimate_period.
+                    state["step_within_period"] = random.randint(0,
+                                                                 estimate_period-stats_steps)

                     used_scale = False
                     for dim in range(p.ndim):
@@ -212,7 +208,7 @@ class NeutralGradient(Optimizer):
                             continue
                         elif size > max_fullcov_size:
                             # diagonal only...
-                            state[f"proj_{dim}"] = torch.ones(size, **kwargs) * param_rms
+                            state[f"proj_{dim}"] = torch.ones(size, **kwargs)
                         else:
                             state[f"proj_{dim}"] = torch.eye(size, **kwargs)
                         if not used_scale:
@@ -279,12 +275,12 @@ class NeutralGradient(Optimizer):
                 else:
                     # The full update.
                     step_within_period = state["step_within_period"]
-                    if step_within_period == estimate_step:
+                    if step_within_period == estimate_period:
                         self._estimate_projections(p, state, param_eps, param_rel_eps, param_pow)
                         state["step_within_period"] = 0
                     if step_within_period >= estimate_period - stats_steps:
-                        self._store_grad_stats(grad, state)
+                        self._store_grad_stats(grad, state, max_fullcov_size)

                     cur_grad = grad
@@ -299,6 +295,10 @@ class NeutralGradient(Optimizer):
                     # stats when we changed the co-ordinates.
                     bias_correction2 = 1 - beta2 ** (min(step, step_within_period) + 1)

+                    cur_grad = cur_grad / (exp_avg_sq.sqrt() + grad_eps)
+                    if bias_correction2 < 0.99:
+                        cur_grad *= bias_correction2
+
                     cur_grad = self._change_coordinates(cur_grad, state, forward=False)

                     if random.random() < 0.004:
@@ -316,7 +316,7 @@ class NeutralGradient(Optimizer):
                         # is from gradient descent.
                         prod = (grad*cur_grad).mean()
                         cos_angle = prod / ((grad**2).mean() * (cur_grad**2).mean()).sqrt()
-                        if random.random() < 0.01 or cos_angle < 0.01:
+                        if random.random() < 0.04 or cos_angle < 0.01:
                             print(f"cos_angle = {cos_angle}, shape={grad.shape}")

                     alpha = -lr * (1-beta1)
@@ -417,22 +417,22 @@ class NeutralGradient(Optimizer):
             size = p.shape[dim]
             if size == 1:
                 continue
-            count = state[f"grad_cov_count_{dim}"]
-            assert count != 0  # we can find a way to deal with this case later,
-                               # if it happens.
-            grad_cov = state[f"grad_cov_{dim}"] / count
-            del state[f"grad_cov_{dim}"]  # save memory
             proj = state[f"proj_{dim}"]
             if proj.ndim == 2:
                 # This dimension gets the full-covariance treatment.
+                count = state[f"grad_cov_count_{dim}"]
+                assert count != 0
+                grad_cov = state[f"grad_cov_{dim}"] / count
+                del state[f"grad_cov_{dim}"]  # save memory
+                self._randomize_lowrank_cov(grad_cov, count)
                 param_cov = self._get_param_cov(p, dim)
                 # P is the SPD matrix such that P G P^T == C^{param_pow},
                 # where G == grad_cov and C == param_cov.
-                P = self._estimate_proj(grad_cov_smoothed,
-                                        param_cov_smoothed,
+                P = self._estimate_proj(grad_cov,
+                                        param_cov,
                                         param_pow)
                 # The only thing we want from P is the basis that diagonalizes
                 # it, i.e. if we do the symmetric svd P = U S U^T, we can
@@ -470,13 +470,12 @@ class NeutralGradient(Optimizer):
         # Rotate `p` to the diagonalize basis.  At this point there is no scaling,
         # just an orthogonal transformation; this function is going to add the
         # scaling to state[f"proj_{dim}"]
-        rotated_p = self._change_coordinates(rotated_p, state, forward=True)
+        rotated_p = self._change_coordinates(p, state, forward=True)

         params_sq = rotated_p**2

         params_sq.add_(param_eps*param_eps +
                        param_rel_eps*param_rel_eps * params_sq.mean())
-        param_var = torch.ones_like(p)

         for _ in range(3):
             # Iterate 3 times, this should be enough to converge.
@@ -486,13 +485,16 @@ class NeutralGradient(Optimizer):
                 continue
             # p will have at least one non-trivial dim.
             other_dims = [ i for i in range(p.ndim) if i != dim ]
-            this_var = param_var.mean(dim=other_dims, keepdim=True)
-            param_var = param_var / this_var
+            # Compute diagonal variance along this dimension
+            # (this is after normalizing previous dimensions' variance)
+            this_var = params_sq.mean(dim=other_dims, keepdim=True)
+            params_sq = params_sq / this_var
             this_var = this_var.reshape(size)
             this_scale = (this_var ** (param_pow * 0.5)).reshape(size)

             proj = state[f"proj_{dim}"]
+            #print(f"iter={_}, dim={dim}, this_scale = {this_scale}")
             if proj.ndim == 1:
                 proj *= this_scale
             else:
@@ -591,6 +593,7 @@ class NeutralGradient(Optimizer):
         # To be confident in our estimate of the covariance, we want `rank` (which
         # actually represents the number of outer products added together)
         # to be at least `required_rank`; otherwise we'll add a random component.
+        size = cov.shape[0]
         required_rank = int(size * 1.2) + 1

         if rank < required_rank:
@@ -598,7 +601,7 @@ class NeutralGradient(Optimizer):
             # params are exactly zero or we sample a zero matrix; the exact value is not
             # going to affect the performance.
             eps = 1.0e-20
-            param_cov_scale = param_cov.diag().mean() + 1.0e-20
+            cov_scale = cov.diag().mean() + 1.0e-20

             # The following formula assumes that the "missing" outer products are
             # expected to be smaller than the ones that we do have, i.e. 0.2 the size.
@@ -606,12 +609,12 @@ class NeutralGradient(Optimizer):
             # i.e. how big it is relative to the trace of the existing matrix.
             missing_rank = (required_rank - rank)
             rand_scale = 0.2 * missing_rank / rank
-            R = torch.randn(size, size)
-            R = torch.matmul(C, C.t())  # positive semidefinite random matrix
+            R = torch.randn(size, size, device=cov.device, dtype=cov.dtype)
+            R = torch.matmul(R, R.t())  # positive semidefinite random matrix
             R_scale = R.diag().mean() + 1.0e-20
             if random.random() < 0.02:
                 print(f"required_rank={required_rank}, rank={rank}, size={size}, R_scale={R_scale}")
-            param_cov.add_(R, alpha=rand_scale * param_cov_scale / R_scale)
+            cov.add_(R, alpha=rand_scale * cov_scale / R_scale)

     def _multiply_on_dim(self, x: Tensor, proj: Tensor, dim: int,
@@ -1356,8 +1359,10 @@ def _test_eve_cain():
         if iter == 0: optim = Eve(m.parameters(), lr=0.003)
         elif iter == 1: optim = Cain(m.parameters(), lr=0.03)
-        elif iter == 2: optim = NeutralGradient(m.parameters(), lr=0.03, max_size=10)
-        elif iter == 3: optim = NeutralGradient(m.parameters(), lr=0.03, max_size=1000)
+        elif iter == 2: optim = NeutralGradient(m.parameters(), lr=0.03, max_fullcov_size=10,
+                                                estimate_period=500, stats_steps=100)
+        elif iter == 3: optim = NeutralGradient(m.parameters(), lr=0.03, max_fullcov_size=1000,
+                                                estimate_period=500, stats_steps=100)
         scheduler = Eden(optim, lr_batches=200, lr_epochs=10, verbose=False)

         start = timeit.default_timer()
@@ -1375,7 +1380,10 @@ def _test_eve_cain():
         for n, (x,y) in enumerate(train_pairs):
             y_out = m(x)
             loss = ((y_out - y)**2).mean() * 100.0
-            avg_loss = 0.95 * avg_loss + 0.05 * loss.item()
+            if epoch == 0 and n == 0:
+                avg_loss = loss.item()
+            else:
+                avg_loss = 0.95 * avg_loss + 0.05 * loss.item()
             if n == 0 and epoch % 10 == 0:
                 norm1 = '%.2e' % (m[0].weight**2).mean().sqrt().item()
                 norm1b = '%.2e' % (m[0].bias**2).mean().sqrt().item()
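
Below is a minimal, illustrative sketch of constructing the optimizer with the keyword arguments exercised by the updated _test_eve_cain test (max_fullcov_size, estimate_period, stats_steps). The toy model, the training data, and the import path are assumptions for illustration, not part of this patch.

# Illustrative sketch only: assumes this optim.py is importable as a local
# module and that NeutralGradient takes the keyword arguments shown above.
import torch
from optim import NeutralGradient  # assumed local import of this file

# Stand-in model; the real test uses a deeper nn.Sequential.
m = torch.nn.Sequential(
    torch.nn.Linear(100, 100),
    torch.nn.ReLU(),
    torch.nn.Linear(100, 100),
)

# max_fullcov_size=10: dims larger than 10 get diagonal-only projections.
# estimate_period / stats_steps control how often the projections are
# re-estimated and over how many steps gradient stats are accumulated
# (the updated test uses estimate_period=500, stats_steps=100).
optim = NeutralGradient(m.parameters(), lr=0.03, max_fullcov_size=10,
                        estimate_period=500, stats_steps=100)

for _ in range(20):
    x = torch.randn(32, 100)
    loss = ((m(x) - x) ** 2).mean() * 100.0
    optim.zero_grad()
    loss.backward()
    optim.step()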