Mirror of https://github.com/k2-fsa/icefall.git, synced 2025-12-11 06:55:27 +00:00
add optim

commit 771583de0f (parent d864f9f979)

icefall/rnn_lm/optim.py (new file, 184 lines)

@@ -0,0 +1,184 @@
# Copyright      2023  Xiaomi Corp.        (authors: Yifan Yang)
#
# See ../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import logging
from collections import defaultdict
from typing import List, Optional, Tuple, Union

import torch
from torch import Tensor
from torch.optim import Optimizer

class LRScheduler(object):
    """
    Base class for learning rate schedulers where the learning rate depends
    on both the batch and the epoch.
    """

    def __init__(self, optimizer: Optimizer, verbose: bool = False):
        # Attach optimizer
        if not isinstance(optimizer, Optimizer):
            raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__))
        self.optimizer = optimizer
        self.verbose = verbose

        for group in optimizer.param_groups:
            group.setdefault("base_lr", group["lr"])

        self.base_lrs = [group["base_lr"] for group in optimizer.param_groups]

        self.epoch = 0
        self.batch = 0

    def state_dict(self):
        """Returns the state of the scheduler as a :class:`dict`.

        It contains an entry for every variable in self.__dict__ which
        is not the optimizer.
        """
        return {
            "base_lrs": self.base_lrs,
            "epoch": self.epoch,
            "batch": self.batch,
        }

    def load_state_dict(self, state_dict):
        """Loads the scheduler's state.

        Args:
          state_dict (dict): scheduler state. Should be an object returned
            from a call to :meth:`state_dict`.
        """
        self.__dict__.update(state_dict)

    def get_last_lr(self) -> List[float]:
        """Return the last learning rates computed by the current scheduler,
        as a list of floats."""
        return self._last_lr

    def get_lr(self):
        # Compute a list of learning rates from self.epoch, self.batch and
        # self.base_lrs; this must be overridden by the user,
        # e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs]
        raise NotImplementedError

    def step_batch(self, batch: Optional[int] = None) -> None:
        # Step the batch index, or just set it.  If `batch` is specified, it
        # must be the batch index from the start of training, i.e. summed
        # over all epochs.
        # You can call this in any order; if you don't provide `batch`, it
        # should of course be called once per batch.
        if batch is not None:
            self.batch = batch
        else:
            self.batch = self.batch + 1
        self._set_lrs()

    def step_epoch(self, epoch: Optional[int] = None):
        # Step the epoch index, or just set it.  If you provide the `epoch`
        # arg, you should call this at the start of the epoch; if you don't,
        # you should call it at the end of the epoch.
        if epoch is not None:
            self.epoch = epoch
        else:
            self.epoch = self.epoch + 1
        self._set_lrs()

    def _set_lrs(self):
        values = self.get_lr()
        assert len(values) == len(self.optimizer.param_groups)

        for i, data in enumerate(zip(self.optimizer.param_groups, values)):
            param_group, lr = data
            param_group["lr"] = lr
            self.print_lr(self.verbose, i, lr)
        self._last_lr = [group["lr"] for group in self.optimizer.param_groups]

    def print_lr(self, is_verbose, group, lr):
        """Display the current learning rate."""
        if is_verbose:
            logging.info(
                f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate"
                f" of group {group} to {lr:.4e}."
            )

class NewBobScheduler(LRScheduler):
    """
    New-Bob scheduler.

    The basic formula is:

        lr = lr * annealing_factor
            if (prev_metric - current_metric) / prev_metric < threshold

    where the metric is the training loss.

    Args:
      optimizer: the optimizer whose learning rates are adjusted.
      annealing_factor: the factor the learning rate is multiplied by when
        annealing is triggered.
      threshold: the relative improvement between consecutive losses below
        which the annealing condition is considered violated.
      patient: the number of additional violations of the annealing
        condition to tolerate before the learning rate is actually reduced.
    """

    def __init__(
        self,
        optimizer: Optimizer,
        annealing_factor: float = 0.5,
        threshold: float = 0.0025,
        patient: int = 0,
        verbose: bool = False,
    ):
        super(NewBobScheduler, self).__init__(optimizer, verbose)
        self.annealing_factor = annealing_factor
        self.threshold = threshold
        self.patient = patient
        self.current_patient = self.patient
        self.prev_metric = None
        self.current_metric = None

    def step_batch(self, current_metric: Tensor) -> None:
        # Unlike the base class, step_batch takes the current value of the
        # metric (e.g. the training loss of this batch) rather than a batch
        # index, and updates the learning rates accordingly.
        self.current_metric = current_metric
        self._set_lrs()

    def get_lr(self):
        """Return the new learning rates, based on comparing
        self.current_metric with self.prev_metric."""
        factor = 1
        if self.prev_metric is not None:
            if self.prev_metric == 0:
                improvement = 0
            else:
                improvement = (
                    self.prev_metric - self.current_metric
                ) / self.prev_metric
            if improvement < self.threshold:
                if self.current_patient == 0:
                    factor = self.annealing_factor
                    self.current_patient = self.patient
                else:
                    self.current_patient -= 1

        self.prev_metric = self.current_metric

        # Update base_lrs in place so that successive annealings compound,
        # as the formula in the class docstring implies.
        self.base_lrs = [x * factor for x in self.base_lrs]
        return self.base_lrs

    def state_dict(self):
        return {
            "base_lrs": self.base_lrs,
            "prev_metric": self.prev_metric,
            "current_metric": self.current_metric,
            "current_patient": self.current_patient,
        }
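Usage note (not part of the commit): the following minimal sketch shows how NewBobScheduler reacts to a sequence of losses, assuming the optim.py above is importable; the toy parameter and the loss values are invented for illustration.

    import torch
    from torch.optim import SGD

    from optim import NewBobScheduler

    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = SGD([param], lr=0.1)
    # patient=1: tolerate one violation of the annealing condition
    # before actually reducing the learning rate.
    scheduler = NewBobScheduler(optimizer, annealing_factor=0.5, patient=1)

    # Clear improvements keep the lr at 0.1; once the relative improvement
    # stays below threshold=0.0025 for more than `patient` batches, the lr
    # is halved.
    for loss in [10.0, 9.0, 8.99, 8.99, 8.99]:
        scheduler.step_batch(loss)
        print(scheduler.get_last_lr())  # [0.1], [0.1], [0.1], [0.05], [0.05]

Note that the scheduler only ever lowers the rate: each trigger multiplies the current rate by annealing_factor, and nothing raises it again.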
The same commit wires the scheduler into the RNN-LM training script (rendered below as a unified diff):

@@ -40,6 +40,7 @@ import torch.optim as optim
 from dataset import get_dataloader
 from lhotse.utils import fix_random_seed
 from model import RnnLmModel
+from optim import NewBobScheduler
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.nn.utils import clip_grad_norm_
 from torch.utils.tensorboard import SummaryWriter
@@ -449,6 +450,7 @@ def train_one_epoch(
     params: AttributeDict,
     model: nn.Module,
     optimizer: torch.optim.Optimizer,
+    scheduler: Optional[torch.optim.lr_scheduler._LRScheduler],
     train_dl: torch.utils.data.DataLoader,
     valid_dl: torch.utils.data.DataLoader,
     model_avg: nn.Module = None,
@@ -471,6 +473,8 @@ def train_one_epoch(
         The stored model averaged from the start of training.
       optimizer:
         The optimizer we are using.
+      scheduler:
+        The learning rate scheduler; we call step_batch() on every batch.
       train_dl:
         Dataloader for the training dataset.
       valid_dl:
@@ -500,6 +504,7 @@ def train_one_epoch(
         # summary stats
         tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info

+        scheduler.step_batch(loss)
         optimizer.zero_grad()
         loss.backward()
         clip_grad_norm_(model.parameters(), 5.0, 2.0)
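One caveat the commit does not address: at this point `loss` is still attached to the autograd graph, and NewBobScheduler stores it as prev_metric, keeping that batch's graph referenced until the next call. A hypothetical variant that stores a plain float instead:

    scheduler.step_batch(loss.detach().item())  # hypothetical; the commit passes the Tensor itself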
@@ -527,6 +532,7 @@ def train_one_epoch(
                 model_avg=model_avg,
                 params=params,
                 optimizer=optimizer,
+                scheduler=scheduler,
                 rank=rank,
             )

@@ -534,11 +540,12 @@ def train_one_epoch(
             # Note: "frames" here means "num_tokens"
             this_batch_ppl = math.exp(loss_info["loss"] / loss_info["frames"])
             tot_ppl = math.exp(tot_loss["loss"] / tot_loss["frames"])
+            cur_lr = scheduler.get_last_lr()[0]
             logging.info(
                 f"Epoch {params.cur_epoch}, "
                 f"batch {batch_idx}, loss[{loss_info}, ppl: {this_batch_ppl}] "
                 f"tot_loss[{tot_loss}, ppl: {tot_ppl}], "
+                f"lr: {cur_lr:.2e}, "
                 f"batch size: {batch_size}"
             )

@@ -656,10 +663,20 @@ def run(rank, world_size, args):
         lr=params.lr,
         weight_decay=params.weight_decay,
     )
+    scheduler = NewBobScheduler(optimizer)
+
     if checkpoints:
         logging.info("Load optimizer state_dict from checkpoint")
         optimizer.load_state_dict(checkpoints["optimizer"])

+    if (
+        checkpoints
+        and "scheduler" in checkpoints
+        and checkpoints["scheduler"] is not None
+    ):
+        logging.info("Loading scheduler state dict")
+        scheduler.load_state_dict(checkpoints["scheduler"])
+
     logging.info(f"Loading LM training data from {params.lm_data}")
     train_dl = get_dataloader(
         filename=params.lm_data,
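For reference, a minimal sketch of the state round trip this checkpoint logic relies on (variable names are illustrative, and save_checkpoint presumably stores scheduler.state_dict() under the "scheduler" key):

    # What gets saved: base_lrs, prev_metric, current_metric, current_patient.
    state = scheduler.state_dict()

    # What loading does: a freshly constructed scheduler takes over that
    # state via LRScheduler.load_state_dict, which simply updates __dict__.
    restored = NewBobScheduler(optimizer)
    restored.load_state_dict(state)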
@@ -674,7 +691,6 @@ def run(rank, world_size, args):
         params=params,
     )

-    # Note: No learning rate scheduler is used here
     for epoch in range(params.start_epoch, params.num_epochs + 1):
         if is_distributed:
             train_dl.sampler.set_epoch(epoch - 1)
@@ -686,6 +702,7 @@ def run(rank, world_size, args):
             model=model,
             model_avg=model_avg,
             optimizer=optimizer,
+            scheduler=scheduler,
             train_dl=train_dl,
             valid_dl=valid_dl,
             tb_writer=tb_writer,
@@ -698,6 +715,7 @@ def run(rank, world_size, args):
         model=model,
         model_avg=model_avg,
         optimizer=optimizer,
+        scheduler=scheduler,
         rank=rank,
     )