add optim

This commit is contained in:
Yifan Yang 2023-06-01 16:37:18 +08:00
parent d864f9f979
commit 771583de0f
2 changed files with 204 additions and 2 deletions

184
icefall/rnn_lm/optim.py Normal file
View File

@ -0,0 +1,184 @@
# Copyright 2023 Xiaomi Corp. (authors: Yifan Yang)
#
# See ../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from collections import defaultdict
from typing import List, Optional, Tuple, Union
import torch
from torch import Tensor
from torch.optim import Optimizer
class LRScheduler(object):
"""
Base-class for learning rate schedulers where the learning-rate depends on both the
batch and the epoch.
"""
def __init__(self, optimizer: Optimizer, verbose: bool = False):
# Attach optimizer
if not isinstance(optimizer, Optimizer):
raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__))
self.optimizer = optimizer
self.verbose = verbose
for group in optimizer.param_groups:
group.setdefault("base_lr", group["lr"])
self.base_lrs = [group["base_lr"] for group in optimizer.param_groups]
self.epoch = 0
self.batch = 0
def state_dict(self):
"""Returns the state of the scheduler as a :class:`dict`.
It contains an entry for every variable in self.__dict__ which
is not the optimizer.
"""
return {
"base_lrs": self.base_lrs,
"epoch": self.epoch,
"batch": self.batch,
}
def load_state_dict(self, state_dict):
"""Loads the schedulers state.
Args:
state_dict (dict): scheduler state. Should be an object returned
from a call to :meth:`state_dict`.
"""
self.__dict__.update(state_dict)
def get_last_lr(self) -> List[float]:
"""Return last computed learning rate by current scheduler. Will be a list of float."""
return self._last_lr
def get_lr(self):
# Compute list of learning rates from self.epoch and self.batch and
# self.base_lrs; this must be overloaded by the user.
# e.g. return [some_formula(self.batch, self.epoch, base_lr) for base_lr in self.base_lrs ]
raise NotImplementedError
def step_batch(self, batch: Optional[int] = None) -> None:
# Step the batch index, or just set it. If `batch` is specified, it
# must be the batch index from the start of training, i.e. summed over
# all epochs.
# You can call this in any order; if you don't provide 'batch', it should
# of course be called once per batch.
if batch is not None:
self.batch = batch
else:
self.batch = self.batch + 1
self._set_lrs()
def step_epoch(self, epoch: Optional[int] = None):
# Step the epoch index, or just set it. If you provide the 'epoch' arg,
# you should call this at the start of the epoch; if you don't provide the 'epoch'
# arg, you should call it at the end of the epoch.
if epoch is not None:
self.epoch = epoch
else:
self.epoch = self.epoch + 1
self._set_lrs()
def _set_lrs(self):
values = self.get_lr()
assert len(values) == len(self.optimizer.param_groups)
for i, data in enumerate(zip(self.optimizer.param_groups, values)):
param_group, lr = data
param_group["lr"] = lr
self.print_lr(self.verbose, i, lr)
self._last_lr = [group["lr"] for group in self.optimizer.param_groups]
def print_lr(self, is_verbose, group, lr):
"""Display the current learning rate."""
if is_verbose:
logging.info(
f"Epoch={self.epoch}, batch={self.batch}: adjusting learning rate"
f" of group {group} to {lr:.4e}."
)
class NewBobScheduler(LRScheduler):
"""
New-Bob Scheduler
The basic formula is:
lr = lr * annealing_factor if (prev_metric - current_metric) / prev_metric < threshold
where metric is training loss.
Args:
optimizer: the optimizer to change the learning rates on
annealing_factor: the annealing factor used in new_bob strategy.
threshold: the rate between losses used to perform learning annealing in new_bob strategy.
patient: when the annealing condition is violated patient times, the learning rate is finally reduced.
"""
def __init__(
self,
optimizer: Optimizer,
annealing_factor: float = 0.5,
threshold: float = 0.0025,
patient: int = 0,
verbose: bool = False,
):
super(NewBobScheduler, self).__init__(optimizer, verbose)
self.annealing_factor = annealing_factor
self.threshold = threshold
self.patient = patient
self.current_patient = self.patient
self.prev_metric = None
self.current_metric = None
def step_batch(self, current_metric: Tensor) -> None:
self.current_metric = current_metric
self._set_lrs()
def get_lr(self):
"""Returns the new lr.
Args:
metric: A number for determining whether to change the lr value.
"""
factor = 1
if self.prev_metric is not None:
if self.prev_metric == 0:
improvement = 0
else:
improvement = (
self.prev_metric - self.current_metric
) / self.prev_metric
if improvement < self.threshold:
if self.current_patient == 0:
factor = self.annealing_factor
self.current_patient = self.patient
else:
self.current_patient -= 1
self.prev_metric = self.current_metric
return [x * factor for x in self.base_lrs]
def state_dict(self):
return {
"base_lrs": self.base_lrs,
"prev_metric": self.prev_metric,
"current_metric": current_metric,
"current_patient": self.current_patient,
}

View File

@ -40,6 +40,7 @@ import torch.optim as optim
from dataset import get_dataloader
from lhotse.utils import fix_random_seed
from model import RnnLmModel
from optim import NewBobScheduler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn.utils import clip_grad_norm_
from torch.utils.tensorboard import SummaryWriter
@ -449,6 +450,7 @@ def train_one_epoch(
params: AttributeDict,
model: nn.Module,
optimizer: torch.optim.Optimizer,
scheduler: Optional[torch.optim.lr_scheduler._LRScheduler],
train_dl: torch.utils.data.DataLoader,
valid_dl: torch.utils.data.DataLoader,
model_avg: nn.Module = None,
@ -471,6 +473,8 @@ def train_one_epoch(
The stored model averaged from the start of training.
optimizer:
The optimizer we are using.
scheduler:
The learning rate scheduler, we call step() every step.
train_dl:
Dataloader for the training dataset.
valid_dl:
@ -500,6 +504,7 @@ def train_one_epoch(
# summary stats
tot_loss = (tot_loss * (1 - 1 / params.reset_interval)) + loss_info
scheduler.step_batch(loss)
optimizer.zero_grad()
loss.backward()
clip_grad_norm_(model.parameters(), 5.0, 2.0)
@ -527,6 +532,7 @@ def train_one_epoch(
model_avg=model_avg,
params=params,
optimizer=optimizer,
scheduler=scheduler,
rank=rank,
)
@ -534,11 +540,12 @@ def train_one_epoch(
# Note: "frames" here means "num_tokens"
this_batch_ppl = math.exp(loss_info["loss"] / loss_info["frames"])
tot_ppl = math.exp(tot_loss["loss"] / tot_loss["frames"])
cur_lr = scheduler.get_last_lr()[0]
logging.info(
f"Epoch {params.cur_epoch}, "
f"batch {batch_idx}, loss[{loss_info}, ppl: {this_batch_ppl}] "
f"tot_loss[{tot_loss}, ppl: {tot_ppl}], "
f"lr: {cur_lr:.2e}, "
f"batch size: {batch_size}"
)
@ -656,10 +663,20 @@ def run(rank, world_size, args):
lr=params.lr,
weight_decay=params.weight_decay,
)
scheduler = NewBobScheduler(optimizer)
if checkpoints:
logging.info("Load optimizer state_dict from checkpoint")
optimizer.load_state_dict(checkpoints["optimizer"])
if (
checkpoints
and "scheduler" in checkpoints
and checkpoints["scheduler"] is not None
):
logging.info("Loading scheduler state dict")
scheduler.load_state_dict(checkpoints["scheduler"])
logging.info(f"Loading LM training data from {params.lm_data}")
train_dl = get_dataloader(
filename=params.lm_data,
@ -674,7 +691,6 @@ def run(rank, world_size, args):
params=params,
)
# Note: No learning rate scheduler is used here
for epoch in range(params.start_epoch, params.num_epochs + 1):
if is_distributed:
train_dl.sampler.set_epoch(epoch - 1)
@ -686,6 +702,7 @@ def run(rank, world_size, args):
model=model,
model_avg=model_avg,
optimizer=optimizer,
scheduler=scheduler,
train_dl=train_dl,
valid_dl=valid_dl,
tb_writer=tb_writer,
@ -698,6 +715,7 @@ def run(rank, world_size, args):
model=model,
model_avg=model_avg,
optimizer=optimizer,
scheduler=scheduler,
rank=rank,
)