Add averaged model && change start from 0 to 1 && fix typo for gigaspeech (#990)

* Add averaged model && change start from 0 to 1 && fix typo * Update train.py * Set use-averaged-model False for BC --------- Co-authored-by: yifanyang <yifanyeung@yifanyangs-MacBook-Pro.local>
2023-04-09 20:53:47 +08:00 · 2023-04-09 20:53:47 +08:00 · 6434c8eadc
commit 6434c8eadc
parent 136aa94d57
2 changed files with 184 additions and 76 deletions
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/decode.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/decode.py
@ -19,40 +19,40 @@
 Usage:
 (1) greedy search
 ./pruned_transducer_stateless2/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./pruned_transducer_stateless2/exp \
-        --max-duration 600 \
-        --decoding-method greedy_search
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method greedy_search

 (2) beam search
 ./pruned_transducer_stateless2/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./pruned_transducer_stateless2/exp \
-        --max-duration 600 \
-        --decoding-method beam_search \
-        --beam-size 4
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method beam_search \
+    --beam-size 4

 (3) modified beam search
 ./pruned_transducer_stateless2/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./pruned_transducer_stateless2/exp \
-        --max-duration 600 \
-        --decoding-method modified_beam_search \
-        --beam-size 4
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method modified_beam_search \
+    --beam-size 4

 (4) fast beam search
 ./pruned_transducer_stateless2/decode.py \
-        --epoch 28 \
-        --avg 15 \
-        --exp-dir ./pruned_transducer_stateless2/exp \
-        --max-duration 600 \
-        --decoding-method fast_beam_search \
-        --beam 4 \
-        --max-contexts 4 \
-        --max-states 8
+    --epoch 28 \
+    --avg 15 \
+    --exp-dir ./pruned_transducer_stateless2/exp \
+    --max-duration 600 \
+    --decoding-method fast_beam_search \
+    --beam 4 \
+    --max-contexts 4 \
+    --max-states 8
 """


@ -76,12 +76,17 @@ from beam_search import (
 )
 from gigaspeech_scoring import asr_text_post_processing
 from train import get_params, get_transducer_model
-
-from icefall.checkpoint import average_checkpoints, find_checkpoints, load_checkpoint
+from icefall.checkpoint import (
+    average_checkpoints,
+    average_checkpoints_with_averaged_model,
+    find_checkpoints,
+    load_checkpoint,
+)
 from icefall.utils import (
    AttributeDict,
    setup_logger,
    store_transcripts,
+    str2bool,
    write_error_stats,
 )

@ -94,9 +99,9 @@ def get_parser():
    parser.add_argument(
        "--epoch",
        type=int,
-        default=29,
+        default=30,
        help="""It specifies the checkpoint to use for decoding.
-        Note: Epoch counts from 0.
+        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )

@ -119,6 +124,17 @@ def get_parser():
        "'--epoch' and '--iter'",
    )

+    parser.add_argument(
+        "--use-averaged-model",
+        type=str2bool,
+        default=False,
+        help="Whether to load averaged model. Currently it only supports "
+        "using --epoch. If True, it would decode with the averaged model "
+        "over the epoch range from `epoch-avg` (excluded) to `epoch`."
+        "Actually only the models with epoch number of `epoch-avg` and "
+        "`epoch` are loaded for averaging. ",
+    )
+
    parser.add_argument(
        "--exp-dir",
        type=str,
@ -464,6 +480,9 @@ def main():
        params.suffix += f"-context-{params.context_size}"
        params.suffix += f"-max-sym-per-frame-{params.max_sym_per_frame}"

+    if params.use_averaged_model:
+        params.suffix += "-use-averaged-model"
+
    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")

@ -476,7 +495,7 @@ def main():
    sp = spm.SentencePieceProcessor()
    sp.load(params.bpe_model)

-    # <blk> and <unk> is defined in local/train_bpe_model.py
+    # <blk> and <unk> are defined in local/train_bpe_model.py
    params.blank_id = sp.piece_to_id("<blk>")
    params.unk_id = sp.piece_to_id("<unk>")
    params.vocab_size = sp.get_piece_size()
@ -486,37 +505,85 @@ def main():
    logging.info("About to create model")
    model = get_transducer_model(params)

-    if params.iter > 0:
-        filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
-            : params.avg
-        ]
-        if len(filenames) == 0:
-            raise ValueError(
-                f"No checkpoints found for --iter {params.iter}, --avg {params.avg}"
-            )
-        elif len(filenames) < params.avg:
-            raise ValueError(
-                f"Not enough checkpoints ({len(filenames)}) found for"
-                f" --iter {params.iter}, --avg {params.avg}"
-            )
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
-    elif params.avg == 1:
-        load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+    if not params.use_averaged_model:
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
+        elif params.avg == 1:
+            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
+        else:
+            start = params.epoch - params.avg + 1
+            filenames = []
+            for i in range(start, params.epoch + 1):
+                if i >= 1:
+                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
+            logging.info(f"averaging {filenames}")
+            model.to(device)
+            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
-        start = params.epoch - params.avg + 1
-        filenames = []
-        for i in range(start, params.epoch + 1):
-            if start >= 0:
-                filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
-        logging.info(f"averaging {filenames}")
-        model.to(device)
-        model.load_state_dict(average_checkpoints(filenames, device=device))
+        if params.iter > 0:
+            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
+                : params.avg + 1
+            ]
+            if len(filenames) == 0:
+                raise ValueError(
+                    f"No checkpoints found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            elif len(filenames) < params.avg + 1:
+                raise ValueError(
+                    f"Not enough checkpoints ({len(filenames)}) found for"
+                    f" --iter {params.iter}, --avg {params.avg}"
+                )
+            filename_start = filenames[-1]
+            filename_end = filenames[0]
+            logging.info(
+                "Calculating the averaged model over iteration checkpoints"
+                f" from {filename_start} (excluded) to {filename_end}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )
+        else:
+            assert params.avg > 0, params.avg
+            start = params.epoch - params.avg
+            assert start >= 1, start
+            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
+            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
+            logging.info(
+                f"Calculating the averaged model over epoch range from "
+                f"{start} (excluded) to {params.epoch}"
+            )
+            model.to(device)
+            model.load_state_dict(
+                average_checkpoints_with_averaged_model(
+                    filename_start=filename_start,
+                    filename_end=filename_end,
+                    device=device,
+                )
+            )

    model.to(device)
    model.eval()
-    model.device = device

    if params.decoding_method == "fast_beam_search":
        decoding_graph = k2.trivial_graph(params.vocab_size - 1, device=device)
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/train.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/train.py
@ -42,6 +42,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"


 import argparse
+import copy
 import logging
 import warnings
 from pathlib import Path
@ -70,7 +71,10 @@ from torch.utils.tensorboard import SummaryWriter
 from icefall import diagnostics
 from icefall.checkpoint import load_checkpoint, remove_checkpoints
 from icefall.checkpoint import save_checkpoint as save_checkpoint_impl
-from icefall.checkpoint import save_checkpoint_with_global_batch_idx
+from icefall.checkpoint import (
+    save_checkpoint_with_global_batch_idx,
+    update_averaged_model,
+)
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@ -114,10 +118,10 @@ def get_parser():
    parser.add_argument(
        "--start-epoch",
        type=int,
-        default=0,
-        help="""Resume training from from this epoch.
-        If it is positive, it will load checkpoint from
-        transducer_stateless2/exp/epoch-{start_epoch-1}.pt
+        default=1,
+        help="""Resume training from this epoch.
+        If larger than 1, it will load checkpoint from
+        exp-dir/epoch-{start_epoch-1}.pt
        """,
    )

@ -240,7 +244,7 @@ def get_parser():
    parser.add_argument(
        "--keep-last-k",
        type=int,
-        default=20,
+        default=30,
        help="""Only keep this number of checkpoints on disk.
        For instance, if it is 3, there are only 3 checkpoints
        in the exp-dir with filenames `checkpoint-xxx.pt`.
@ -248,6 +252,19 @@ def get_parser():
        """,
    )

+    parser.add_argument(
+        "--average-period",
+        type=int,
+        default=200,
+        help="""Update the averaged model, namely `model_avg`, after processing
+        this number of batches. `model_avg` is a separate version of model,
+        in which each floating-point parameter is the average of all the
+        parameters from the start of training. Each time we take the average,
+        we do: `model_avg = model * (average_period / batch_idx_train) +
+            model_avg * ((batch_idx_train - average_period) / batch_idx_train)`.
+        """,
+    )
+
    parser.add_argument(
        "--use-fp16",
        type=str2bool,
@ -385,6 +402,7 @@ def get_transducer_model(params: AttributeDict) -> nn.Module:
 def load_checkpoint_if_available(
    params: AttributeDict,
    model: nn.Module,
+    model_avg: nn.Module = None,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
 ) -> Optional[Dict[str, Any]]:
@ -392,7 +410,7 @@ def load_checkpoint_if_available(

    If params.start_batch is positive, it will load the checkpoint from
    `params.exp_dir/checkpoint-{params.start_batch}.pt`. Otherwise, if
-    params.start_epoch is positive, it will load the checkpoint from
+    params.start_epoch is larger than 1, it will load the checkpoint from
    `params.start_epoch - 1`.

    Apart from loading state dict for `model` and `optimizer` it also updates
@ -404,6 +422,8 @@ def load_checkpoint_if_available(
        The return value of :func:`get_params`.
      model:
        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
      optimizer:
        The optimizer that we are using.
      scheduler:
@ -413,7 +433,7 @@ def load_checkpoint_if_available(
    """
    if params.start_batch > 0:
        filename = params.exp_dir / f"checkpoint-{params.start_batch}.pt"
-    elif params.start_epoch > 0:
+    elif params.start_epoch > 1:
        filename = params.exp_dir / f"epoch-{params.start_epoch-1}.pt"
    else:
        return None
@ -423,6 +443,7 @@ def load_checkpoint_if_available(
    saved_params = load_checkpoint(
        filename,
        model=model,
+        model_avg=model_avg,
        optimizer=optimizer,
        scheduler=scheduler,
    )
@ -449,7 +470,8 @@ def load_checkpoint_if_available(

 def save_checkpoint(
    params: AttributeDict,
-    model: nn.Module,
+    model: Union[nn.Module, DDP],
+    model_avg: Optional[nn.Module] = None,
    optimizer: Optional[torch.optim.Optimizer] = None,
    scheduler: Optional[LRSchedulerType] = None,
    sampler: Optional[CutSampler] = None,
@ -463,6 +485,8 @@ def save_checkpoint(
        It is returned by :func:`get_params`.
      model:
        The training model.
+      model_avg:
+        The stored model averaged from the start of training.
      optimizer:
        The optimizer used in the training.
      sampler:
@ -476,6 +500,7 @@ def save_checkpoint(
    save_checkpoint_impl(
        filename=filename,
        model=model,
+        model_avg=model_avg,
        params=params,
        optimizer=optimizer,
        scheduler=scheduler,
@ -495,14 +520,14 @@ def save_checkpoint(

 def compute_loss(
    params: AttributeDict,
-    model: nn.Module,
+    model: Union[nn.Module, DDP],
    sp: spm.SentencePieceProcessor,
    batch: dict,
    is_training: bool,
    warmup: float = 1.0,
 ) -> Tuple[Tensor, MetricsTracker]:
    """
-    Compute CTC loss given the model and its inputs.
+    Compute transducer loss given the model and its inputs.

    Args:
      params:
@ -568,7 +593,7 @@ def compute_loss(

 def compute_validation_loss(
    params: AttributeDict,
-    model: nn.Module,
+    model: Union[nn.Module, DDP],
    sp: spm.SentencePieceProcessor,
    valid_dl: torch.utils.data.DataLoader,
    world_size: int = 1,
@ -602,13 +627,14 @@ def compute_validation_loss(

 def train_one_epoch(
    params: AttributeDict,
-    model: nn.Module,
+    model: Union[nn.Module, DDP],
    optimizer: torch.optim.Optimizer,
    scheduler: LRSchedulerType,
    sp: spm.SentencePieceProcessor,
    train_dl: torch.utils.data.DataLoader,
    valid_dl: torch.utils.data.DataLoader,
    scaler: GradScaler,
+    model_avg: Optional[nn.Module] = None,
    tb_writer: Optional[SummaryWriter] = None,
    world_size: int = 1,
    rank: int = 0,
@ -634,6 +660,8 @@ def train_one_epoch(
        Dataloader for the validation dataset.
      scaler:
        The scaler used for mix precision training.
+      model_avg:
+        The stored model averaged from the start of training.
      tb_writer:
        Writer to write log messages to tensorboard.
      world_size:
@ -660,6 +688,7 @@ def train_one_epoch(
            loss, loss_info = compute_loss(
                params=params,
                model=model,
+                model_avg=model_avg,
                sp=sp,
                batch=batch,
                is_training=True,
@ -688,6 +717,7 @@ def train_one_epoch(
                out_dir=params.exp_dir,
                global_batch_idx=params.batch_idx_train,
                model=model,
+                model_avg=model_avg,
                params=params,
                optimizer=optimizer,
                scheduler=scheduler,
@ -791,7 +821,16 @@ def run(rank, world_size, args):
    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

-    checkpoints = load_checkpoint_if_available(params=params, model=model)
+    assert params.save_every_n >= params.average_period
+    model_avg: Optional[nn.Module] = None
+    if rank == 0:
+        # model_avg is only used with rank 0
+        model_avg = copy.deepcopy(model).to(torch.float64)
+
+    assert params.start_epoch > 0, params.start_epoch
+    checkpoints = load_checkpoint_if_available(
+        params=params, model=model, model_avg=model_avg
+    )

    model.to(device)
    if world_size > 1:
@ -850,10 +889,10 @@ def run(rank, world_size, args):
        logging.info("Loading grad scaler state dict")
        scaler.load_state_dict(checkpoints["grad_scaler"])

-    for epoch in range(params.start_epoch, params.num_epochs):
-        scheduler.step_epoch(epoch)
-        fix_random_seed(params.seed + epoch)
-        train_dl.sampler.set_epoch(epoch)
+    for epoch in range(params.start_epoch, params.num_epochs + 1):
+        scheduler.step_epoch(epoch - 1)
+        fix_random_seed(params.seed + epoch - 1)
+        train_dl.sampler.set_epoch(epoch - 1)

        if tb_writer is not None:
            tb_writer.add_scalar("train/epoch", epoch, params.batch_idx_train)
@ -863,6 +902,7 @@ def run(rank, world_size, args):
        train_one_epoch(
            params=params,
            model=model,
+            model_avg=model_avg,
            optimizer=optimizer,
            scheduler=scheduler,
            sp=sp,
@ -881,6 +921,7 @@ def run(rank, world_size, args):
        save_checkpoint(
            params=params,
            model=model,
+            model_avg=model_avg,
            optimizer=optimizer,
            scheduler=scheduler,
            sampler=train_dl.sampler,
@ -896,7 +937,7 @@ def run(rank, world_size, args):


 def scan_pessimistic_batches_for_oom(
-    model: nn.Module,
+    model: Union[nn.Module, DDP],
    train_dl: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    sp: spm.SentencePieceProcessor,