k2-fsa/icefall (mirror of https://github.com/k2-fsa/icefall.git)

Commit: d1e4ae788d
Parent: 82d58629ea

Refactor how learning rate is set.
@@ -16,7 +16,7 @@


 import random
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union

 import torch
 from torch import Tensor
@@ -141,3 +141,152 @@ class Eve(Optimizer):
                 p.addcdiv_(exp_avg, denom, value=-step_size)

         return loss
+
+
+class LRScheduler(object):
+    """
+    Base-class for learning rate schedulers where the learning-rate depends on both the
+    batch and the epoch.
+    """
+    def __init__(self, optimizer: Optimizer, verbose: bool = False):
+        # Attach optimizer
+        if not isinstance(optimizer, Optimizer):
+            raise TypeError('{} is not an Optimizer'.format(
+                type(optimizer).__name__))
+        self.optimizer = optimizer
+        self.verbose = verbose
+
+        for group in optimizer.param_groups:
+            group.setdefault('initial_lr', group['lr'])
+
+        self.base_lrs = [group['initial_lr'] for group in optimizer.param_groups]
+
+        self.epoch = 0
+        self.batch = 0
+
+    def state_dict(self):
+        """Returns the state of the scheduler as a :class:`dict`.
+
+        It contains an entry for every variable in self.__dict__ which
+        is not the optimizer.
+        """
+        return {'base_lrs': self.base_lrs,
+                'epoch': self.epoch,
+                'batch': self.batch}
+
+    def load_state_dict(self, state_dict):
+        """Loads the scheduler's state.
+
+        Args:
+            state_dict (dict): scheduler state. Should be an object returned
+                from a call to :meth:`state_dict`.
+        """
+        self.__dict__.update(state_dict)
+
+    def get_last_lr(self) -> List[float]:
+        """Return last computed learning rate by current scheduler.  Will be a list of float.
+        """
+        return self._last_lr
+
+    def get_lr(self):
+        # Compute list of learning rates from self.epoch and self.batch and
+        # self.base_lrs; this must be overloaded by the user.
+        # e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs]
+        raise NotImplementedError
+
+    def step_batch(self, batch: Optional[int] = None) -> None:
+        # Step the batch index, or just set it.  If `batch` is specified, it
+        # must be the batch index from the start of training, i.e. summed over
+        # all epochs.
+        # If you don't provide 'batch', this should of course be called once
+        # per batch.
+        if batch is not None:
+            self.batch = batch
+        else:
+            self.batch = self.batch + 1
+        self._set_lrs()
+
+    def step_epoch(self, epoch: Optional[int] = None):
+        # Step the epoch index, or just set it.  If you provide the 'epoch' arg,
+        # you should call this at the start of the epoch; if you don't provide
+        # the 'epoch' arg, you should call it at the end of the epoch.
+        if epoch is not None:
+            self.epoch = epoch
+        else:
+            self.epoch = self.epoch + 1
+        self._set_lrs()
+
+    def _set_lrs(self):
+        values = self.get_lr()
+        assert len(values) == len(self.optimizer.param_groups)
+
+        for i, data in enumerate(zip(self.optimizer.param_groups, values)):
+            param_group, lr = data
+            param_group['lr'] = lr
+            self.print_lr(self.verbose, i, lr)
+        self._last_lr = [group['lr'] for group in self.optimizer.param_groups]
+
+    def print_lr(self, is_verbose, group, lr):
+        """Display the current learning rate.
+        """
+        if is_verbose:
+            print(f'Epoch={self.epoch}, batch={self.batch}: adjusting learning rate'
+                  f' of group {group} to {lr:.4e}.')
+
+
+class Eden(LRScheduler):
+    """
+    Eden scheduler.
+
+      lr = initial_lr * (((batch**2 + lr_batches**2) / lr_batches**2) ** -0.25 *
+                         (((epoch**2 + lr_epochs**2) / lr_epochs**2) ** -0.25))
+
+    E.g. suggest initial-lr = 0.003 (passed to optimizer).
+
+    Args:
+        optimizer: the optimizer to change the learning rates on
+        lr_batches: the number of batches after which we start significantly
+            decreasing the learning rate, suggest 5000.
+        lr_epochs: the number of epochs after which we start significantly
+            decreasing the learning rate, suggest 6.
+    """
+    def __init__(self, optimizer: Optimizer,
+                 lr_batches: Union[int, float],
+                 lr_epochs: Union[int, float],
+                 verbose: bool = False):
+        super(Eden, self).__init__(optimizer, verbose)
+        self.lr_batches = lr_batches
+        self.lr_epochs = lr_epochs
+
+    def get_lr(self):
+        factor = (((self.batch**2 + self.lr_batches**2) / self.lr_batches**2) ** -0.25 *
+                  (((self.epoch**2 + self.lr_epochs**2) / self.lr_epochs**2) ** -0.25))
+        return [x * factor for x in self.base_lrs]
+
+
+def _test_eden():
+    m = torch.nn.Linear(100, 100)
+    optim = Eve(m.parameters(), lr=0.003)
+
+    scheduler = Eden(optim, lr_batches=30, lr_epochs=2, verbose=True)
+
+    for epoch in range(10):
+        scheduler.step_epoch(epoch)  # sets epoch to `epoch`
+
+        for step in range(20):
+            x = torch.randn(200, 100).detach()
+            x.requires_grad = True
+            y = m(x)
+            dy = torch.randn(200, 100).detach()
+            f = (y * dy).sum()
+            f.backward()
+
+            optim.step()
+            scheduler.step_batch()
+            optim.zero_grad()
+
+    print("last lr = ", scheduler.get_last_lr())
+    print("state dict = ", scheduler.state_dict())
+
+
+if __name__ == '__main__':
+    _test_eden()
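[Editor's note] The Eden formula in the docstring above is just the product of two independent decay terms, one driven by the global batch count and one by the epoch count. The standalone sketch below is an illustration added for this write-up, not part of the commit; the helper name eden_factor and the sample points are invented. It prints the resulting learning rate for a few points, assuming the suggested initial lr of 0.003, lr_batches=5000 and lr_epochs=6.

def eden_factor(batch: float, epoch: float,
                lr_batches: float = 5000.0, lr_epochs: float = 6.0) -> float:
    # Mirrors the expression in Eden.get_lr(): each term stays close to 1.0
    # while batch << lr_batches (or epoch << lr_epochs) and then decays
    # roughly like batch ** -0.5 (or epoch ** -0.5).
    return (((batch ** 2 + lr_batches ** 2) / lr_batches ** 2) ** -0.25 *
            ((epoch ** 2 + lr_epochs ** 2) / lr_epochs ** 2) ** -0.25)

if __name__ == "__main__":
    initial_lr = 0.003  # value suggested in the docstring above
    for batch, epoch in [(0, 0), (5000, 1), (20000, 3), (100000, 10)]:
        print(f"batch={batch:6d}  epoch={epoch:2d}  "
              f"lr={initial_lr * eden_factor(batch, epoch):.2e}")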
@@ -40,7 +40,7 @@ import math
 import warnings
 from pathlib import Path
 from shutil import copyfile
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple, Union

 import k2
 import sentencepiece as spm
@@ -55,7 +55,7 @@ from lhotse.cut import Cut
 from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
 from model import Transducer
-from optim import Eve
+from optim import Eve, Eden
 from torch import Tensor
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.utils.tensorboard import SummaryWriter
@@ -74,6 +74,7 @@ from icefall.utils import (
     str2bool,
 )

+LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

 def get_parser():
     parser = argparse.ArgumentParser(
@@ -152,7 +153,7 @@ def get_parser():
     )

     parser.add_argument(
-        "--lr-steps",
+        "--lr-batches",
         type=float,
         default=5000,
         help="""Number of steps that affects how rapidly the learning rate decreases.
@@ -378,7 +379,7 @@ def load_checkpoint_if_available(
     params: AttributeDict,
     model: nn.Module,
     optimizer: Optional[torch.optim.Optimizer] = None,
-    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
+    scheduler: Optional[LRSchedulerType] = None,
 ) -> Optional[Dict[str, Any]]:
     """Load checkpoint from file.

@@ -443,7 +444,7 @@ def save_checkpoint(
     params: AttributeDict,
     model: nn.Module,
     optimizer: Optional[torch.optim.Optimizer] = None,
-    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
+    scheduler: Optional[LRSchedulerType] = None,
     sampler: Optional[CutSampler] = None,
     rank: int = 0,
 ) -> None:
@@ -593,7 +594,7 @@ def train_one_epoch(
     params: AttributeDict,
     model: nn.Module,
     optimizer: torch.optim.Optimizer,
-    scheduler: torch.optim.lr_scheduler._LRScheduler,
+    scheduler: LRSchedulerType,
     sp: spm.SentencePieceProcessor,
     train_dl: torch.utils.data.DataLoader,
     valid_dl: torch.utils.data.DataLoader,
@@ -656,17 +657,15 @@ def train_one_epoch(
         # NOTE: We use reduction==sum and loss is computed over utterances
         # in the batch and there is no normalization to it so far.
         loss.backward()
+        scheduler.step_batch(params.batch_idx_train)
         optimizer.step()
         optimizer.zero_grad()
-        scheduler.step()

         if params.print_diagnostics and batch_idx == 5:
             return

-        if (
-            params.batch_idx_train > 0
-            and params.batch_idx_train % params.save_every_n == 0
-        ):
+        if (params.batch_idx_train > 0
+                and params.batch_idx_train % params.save_every_n == 0):
             params.cur_batch_idx = batch_idx
             save_checkpoint_with_global_batch_idx(
                 out_dir=params.exp_dir,
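[Editor's note] The hunk above removes the unconditional per-batch scheduler.step() and instead drives the scheduler from the global batch counter, just before optimizer.step(); a later hunk also calls scheduler.step_epoch(epoch) at the start of each epoch. The sketch below is an illustration only: it assumes a plain torch.optim.SGD optimizer and random data in place of the recipe's Eve optimizer and dataloader, and that it runs next to the optim.py file extended above.

import torch

from optim import Eden  # the scheduler class added in this commit

model = torch.nn.Linear(10, 10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.003)
scheduler = Eden(optimizer, lr_batches=5000, lr_epochs=6)

batch_idx_train = 0  # global batch counter, summed over all epochs
for epoch in range(2):
    scheduler.step_epoch(epoch)  # set the epoch index at the start of the epoch
    for _ in range(3):  # stand-in for the real dataloader loop
        loss = model(torch.randn(4, 10)).sum()
        loss.backward()
        batch_idx_train += 1
        scheduler.step_batch(batch_idx_train)  # lr follows the global batch index
        optimizer.step()
        optimizer.zero_grad()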
@@ -686,13 +685,17 @@
             )

         if batch_idx % params.log_interval == 0:
+            cur_lr = scheduler.get_last_lr()[0]
             logging.info(
                 f"Epoch {params.cur_epoch}, "
                 f"batch {batch_idx}, loss[{loss_info}], "
-                f"tot_loss[{tot_loss}], batch size: {batch_size}"
+                f"tot_loss[{tot_loss}], batch size: {batch_size}, "
+                f"lr: {cur_lr:.2e}"
             )

             if tb_writer is not None:
+                tb_writer.add_scalar("train/learning_rate", cur_lr)
+
                 loss_info.write_summary(
                     tb_writer, "train/current_", params.batch_idx_train
                 )
@@ -784,14 +787,7 @@ def run(rank, world_size, args):
         model.parameters(),
         lr=params.initial_lr)

-    # The `epoch` variable in the lambda expression picks up to the value below
-    # in `for epoch in range(params.start_epoch, params.num_epochs):`. Set it to 0
-    # here to avoid crash in constructor.
-    epoch = 0
-    scheduler = torch.optim.lr_scheduler.LambdaLR(
-        optimizer,
-        lambda step: (((step**2 + params.lr_steps**2) / params.lr_steps**2) ** -0.25 *
-                      (((epoch**2 + params.lr_epochs**2) / params.lr_epochs**2) ** -0.25)))
+    scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)


     if checkpoints and "optimizer" in checkpoints:
@@ -854,19 +850,14 @@ def run(rank, world_size, args):
     )

     for epoch in range(params.start_epoch, params.num_epochs):
+        scheduler.step_epoch(epoch)
         fix_random_seed(params.seed + epoch)
         train_dl.sampler.set_epoch(epoch)

         cur_lr = scheduler.get_last_lr()[0]
         if tb_writer is not None:
-            tb_writer.add_scalar(
-                "train/learning_rate", cur_lr, params.batch_idx_train
-            )
             tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)

-        if rank == 0:
-            logging.info("epoch {}, learning rate {}".format(epoch, cur_lr))
-
         params.cur_epoch = epoch

         train_one_epoch(
@@ -28,15 +28,18 @@ from lhotse.dataset.sampling.base import CutSampler
 from torch.cuda.amp import GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import Optimizer
-from torch.optim.lr_scheduler import _LRScheduler


+# use duck typing for LRScheduler since we have different possibilities, see
+# our class LRScheduler.
+LRSchedulerType = object
+
 def save_checkpoint(
     filename: Path,
     model: Union[nn.Module, DDP],
     params: Optional[Dict[str, Any]] = None,
     optimizer: Optional[Optimizer] = None,
-    scheduler: Optional[_LRScheduler] = None,
+    scheduler: Optional[LRSchedulerType] = None,
     scaler: Optional[GradScaler] = None,
     sampler: Optional[CutSampler] = None,
     rank: int = 0,
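[Editor's note] Unlike the training script, this library-level checkpoint code cannot import the recipe's new LRScheduler class, which is presumably why the annotation falls back to a plain object here. Duck typing is enough because the checkpoint helpers only need the scheduler to expose state_dict() and load_state_dict(), which both torch.optim.lr_scheduler._LRScheduler and the new optim.LRScheduler provide. A tiny sketch of that pattern (the helper name pack_scheduler_state is invented for illustration):

from typing import Any, Dict, Optional

LRSchedulerType = object  # duck typing, as in the hunk above

def pack_scheduler_state(scheduler: Optional[LRSchedulerType]) -> Dict[str, Any]:
    # Only state_dict() is needed, so any scheduler-like object is acceptable.
    return {"scheduler": scheduler.state_dict() if scheduler is not None else None}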
@@ -89,7 +92,7 @@ def load_checkpoint(
     filename: Path,
     model: nn.Module,
     optimizer: Optional[Optimizer] = None,
-    scheduler: Optional[_LRScheduler] = None,
+    scheduler: Optional[LRSchedulerType] = None,
     scaler: Optional[GradScaler] = None,
     sampler: Optional[CutSampler] = None,
     strict: bool = False,
@@ -167,7 +170,7 @@ def save_checkpoint_with_global_batch_idx(
     model: Union[nn.Module, DDP],
     params: Optional[Dict[str, Any]] = None,
     optimizer: Optional[Optimizer] = None,
-    scheduler: Optional[_LRScheduler] = None,
+    scheduler: Optional[LRSchedulerType] = None,
     scaler: Optional[GradScaler] = None,
     sampler: Optional[CutSampler] = None,
     rank: int = 0,