From 2b0727a355d73205c1a91b770902c0da04aec958 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 5 Apr 2022 00:31:28 +0800 Subject: [PATCH] Fix weight decay formula by adding 1/1-beta --- egs/librispeech/ASR/pruned_transducer_stateless2/optim.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py index 17450def8..607a4e350 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py @@ -156,9 +156,12 @@ class Eve(Optimizer): # epsilon = 1.0 * 0.5 * (1.0e-04 / 0.1) = 1.0e-06. # Note that this is close to the "traditional" value used for weight # decay. - + # # this is the weight-decay amount... - weight_decay = (delta ** 2).mean() * (0.5 * (step_size / target_rms) ** 2) + # + # Regarding the 1/(1-beta) factor below: this is to compensate for the deltas on successive + # frames being correlated. I have to figure out the justification. + weight_decay = (delta ** 2).mean() * (0.5 * (step_size / target_rms) ** 2 * (1.0 / (1.0 - beta))) p.mul_(1 - weight_decay) p.add_(delta, alpha=-step_size)