From 2b0727a355d73205c1a91b770902c0da04aec958 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Tue, 5 Apr 2022 00:31:28 +0800
Subject: [PATCH] Fix weight decay formula by adding 1/(1-beta) factor

---
 egs/librispeech/ASR/pruned_transducer_stateless2/optim.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py
index 17450def8..607a4e350 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py
@@ -156,9 +156,12 @@ class Eve(Optimizer):
                 #  epsilon = 1.0 * 0.5 * (1.0e-04 / 0.1) = 1.0e-06.
                 # Note that this is close to the "traditional" value used for weight
                 # decay.
-
+                #
                 # this is the weight-decay amount...
-                weight_decay = (delta ** 2).mean() * (0.5 * (step_size / target_rms) ** 2)
+                #
+                # Regarding the 1/(1-beta) factor below: this is to compensate for the deltas on successive
+                # frames being correlated.  TODO: work out the formal justification for this factor.
+                weight_decay = (delta ** 2).mean() * (0.5 * (step_size / target_rms) ** 2 * (1.0 / (1.0 - beta)))
 
                 p.mul_(1 - weight_decay)
                 p.add_(delta, alpha=-step_size)