WIP: Periodically save a checkpoint after processing a given number of batches.

Fangjun Kuang 2022-03-20 19:42:20 +08:00
parent ad28c8c5eb
commit 1a9e8a5718


@@ -15,7 +15,10 @@
# limitations under the License.
import glob
import logging
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
@@ -151,3 +154,99 @@ def average_checkpoints(
        avg[k] //= n
    return avg


def save_checkpoint_with_global_batch_idx(
    out_dir: Path,
    global_batch_idx: int,
    model: Union[nn.Module, DDP],
    params: Optional[Dict[str, Any]] = None,
    optimizer: Optional[Optimizer] = None,
    scheduler: Optional[_LRScheduler] = None,
    scaler: Optional[GradScaler] = None,
    rank: int = 0,
):
    """Save training info after processing a given number of batches.

    Args:
      out_dir:
        The directory in which to save the checkpoint.
      global_batch_idx:
        The number of batches processed so far from the very start of the
        training. The saved checkpoint will have the filename
        f"{out_dir}/checkpoint-{global_batch_idx}.pt".
      model:
        The neural network model whose `state_dict` will be saved in the
        checkpoint.
      params:
        A dict of training configurations to be saved.
      optimizer:
        The optimizer used in the training. Its `state_dict` will be saved.
      scheduler:
        The learning rate scheduler used in the training. Its `state_dict`
        will be saved.
      scaler:
        The scaler used for mixed precision training. Its `state_dict` will
        be saved.
      rank:
        The rank ID of the current node in DDP training. Set it to 0
        if DDP is not used.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    filename = out_dir / f"checkpoint-{global_batch_idx}.pt"
    save_checkpoint(
        filename=filename,
        model=model,
        params=params,
        optimizer=optimizer,
        scheduler=scheduler,
        scaler=scaler,
        rank=rank,
    )
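
A minimal usage sketch (not part of this diff): inside a training loop, the helper could be invoked every fixed number of batches. The loop variables and the `save_every_n` threshold below are illustrative assumptions, not names introduced by this commit.

# Hypothetical training-loop snippet; `train_dl`, `save_every_n`, `exp_dir`
# and the model/optimizer/scheduler/scaler objects are assumed to be
# defined by the caller.
global_batch_idx = 0
for batch in train_dl:
    global_batch_idx += 1
    # ... forward, backward, optimizer.step() ...
    if global_batch_idx % save_every_n == 0:
        save_checkpoint_with_global_batch_idx(
            out_dir=exp_dir,
            global_batch_idx=global_batch_idx,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
            rank=rank,
        )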


def remove_checkpoints(
    out_dir: Path,
    topk: int,
    rank: int = 0,
):
    """Remove checkpoints from the given directory.

    We assume that a checkpoint filename has the form `checkpoint-xxx.pt`,
    where xxx is a number representing the number of processed batches
    when that checkpoint was saved. Checkpoints are sorted by that number
    and only the `topk` checkpoints with the largest `xxx` are kept.

    Args:
      out_dir:
        The directory containing checkpoints to be removed.
      topk:
        Number of checkpoints to keep.
      rank:
        If using DDP for training, it is the rank of the current node.
        Use 0 if DDP is not used for training.
    """
    assert topk >= 1, topk
    if rank != 0:
        return
    checkpoints = list(glob.glob(f"{out_dir}/checkpoint-[0-9]*.pt"))
    if len(checkpoints) == 0:
        logging.warning(f"No checkpoints found in {out_dir}")
        return

    if len(checkpoints) <= topk:
        return

    pattern = re.compile(r"checkpoint-([0-9]+)\.pt")
    idx_checkpoints = [
        (int(pattern.search(c).group(1)), c) for c in checkpoints
    ]

    idx_checkpoints = sorted(idx_checkpoints, reverse=True, key=lambda x: x[0])
    to_remove = idx_checkpoints[topk:]
    to_remove = [ic[1] for ic in to_remove]
    for c in to_remove:
        os.remove(c)
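
To bound disk usage, the two helpers are presumably meant to be used together: save, then prune. A sketch under that assumption; `keep_last_k` and `exp_dir` are illustrative names, not something defined in this commit.

# After each periodic save, keep only the newest `keep_last_k` checkpoints.
save_checkpoint_with_global_batch_idx(
    out_dir=exp_dir,
    global_batch_idx=global_batch_idx,
    model=model,
    rank=rank,
)
remove_checkpoints(out_dir=exp_dir, topk=keep_last_k, rank=rank)

For example, if exp_dir contains checkpoint-1000.pt, checkpoint-2000.pt and checkpoint-3000.pt, then remove_checkpoints(exp_dir, topk=2) deletes checkpoint-1000.pt and keeps the two checkpoints with the largest batch indices.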