diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py b/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py
index a2e0463da..e47c08657 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/optim.py
@@ -100,9 +100,6 @@ class Eve(Optimizer):
                 if p.grad is None:
                     continue
-
-
-
 
                 # Perform optimization step
                 grad = p.grad
                 if grad.is_sparse:
@@ -144,12 +141,3 @@ class Eve(Optimizer):
                 p.addcdiv_(exp_avg, denom, value=-step_size)
 
         return loss
-
-# Note on avg-change per epoch..
-# suppose epoch is 4k iters.
-# if avg-change as rms(diff) / rms(params) equals 0.2, and rms(params) = 0.1,
-# then rm(diff) 0.1 * 0.2, var(diff) = (0.1 * 0.2)**2, = 0.0004. So var(diff per minibatch)
-# = 0.0004 / 4000 = 1e-07, rms(diff per minibatch) = 3.16e-04. So LR would be 3e-04.
-# Suggested lr_schedule?
-#
-# .. 6e-05 is 1/5 of that...
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py
index 92509f4ec..a114dd8f1 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless2/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless2/train.py
@@ -154,15 +154,15 @@ def get_parser():
     parser.add_argument(
         "--lr-begin-steps",
         type=float,
-        default=25000,
+        default=5000,
         help="Number of steps that affects how rapidly the learning rate initially decreases"
     )
 
     parser.add_argument(
-        "--lr-end-epochs",
+        "--lr-epochs",
         type=float,
         default=-1,
-        help="""Number of epochs that affects how rapidly the learning rate finally decreases;
+        help="""Number of epochs for purposes of the learning-rate schedule;
         if -1, will be set the same as --num-epochs
         """
     )
@@ -784,15 +784,15 @@ def run(rank, world_size, args):
         model.parameters(),
         lr=params.initial_lr)
 
-    # The `epoch` variable in the lambda expression binds to the value below
-    # in `for epoch in range(params.start_epoch, params.num_epochs):`. But set it to 0
+    # The `epoch` variable in the lambda expression picks up the value below
+    # in `for epoch in range(params.start_epoch, params.num_epochs):`. Set it to 0
     # here to avoid crash in constructor.
     epoch = 0
-    lr_end_epochs = params.lr_end_epochs if params.lr_end_epochs > 0 else params.num_epochs
+    lr_epochs = params.lr_epochs if params.lr_epochs > 0 else params.num_epochs
     scheduler = torch.optim.lr_scheduler.LambdaLR(
         optimizer,
-        lambda step: (((step + params.lr_begin_steps) / params.lr_begin_steps) ** -0.5 *
-                      ((epoch + lr_end_epochs) / lr_end_epochs) ** -2.0))
+        lambda step: (((step**2 + params.lr_begin_steps**2) / params.lr_begin_steps**2) ** -0.25 *
+                      ((epoch + lr_epochs) / lr_epochs) ** -0.5))
 
 
     if checkpoints and "optimizer" in checkpoints:
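
Note on the revised schedule (editor's sketch, not part of the patch): the new lambda's step-dependent factor ((step**2 + lr_begin_steps**2) / lr_begin_steps**2) ** -0.25 is flat near step 0 and behaves like (step / lr_begin_steps) ** -0.5 once step is much larger than lr_begin_steps, while the epoch-dependent factor is softened from a -2.0 power to -0.5. The snippet below simply evaluates that factor for a few (step, epoch) pairs; lr_begin_steps=5000 matches the new default, and lr_epochs=30 is an assumed value for illustration.

    # Editor's sketch, not part of the patch: evaluate the new LambdaLR factor.
    # lr_begin_steps=5000 is the new default; lr_epochs=30 is an assumed value.
    lr_begin_steps = 5000.0
    lr_epochs = 30.0

    def lr_factor(step: int, epoch: int) -> float:
        # Same expression as the lambda passed to torch.optim.lr_scheduler.LambdaLR;
        # the scheduler multiplies this factor onto the base lr (params.initial_lr).
        step_part = ((step**2 + lr_begin_steps**2) / lr_begin_steps**2) ** -0.25
        epoch_part = ((epoch + lr_epochs) / lr_epochs) ** -0.5
        return step_part * epoch_part

    if __name__ == "__main__":
        for step, epoch in [(0, 0), (5000, 0), (25000, 5), (100000, 29)]:
            print(f"step={step:>6d}  epoch={epoch:>2d}  factor={lr_factor(step, epoch):.4f}")

At step 0 and epoch 0 the factor is 1.0, so params.initial_lr is the peak learning rate; lowering --lr-begin-steps from 25000 to 5000 moves the knee of the step-wise decay correspondingly earlier in training.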