Filter uneven-sized batch (#843)

* add filter_uneven_sized_batch function * set --filter-uneven-sized-batch=True as default
2025-12-10 22:45:27 +00:00 · 2023-01-16 20:15:35 +08:00 · 2023-01-16 20:15:35 +08:00 · 2a463a420d
commit 2a463a420d
parent 5c8e9628cc
2 changed files with 68 additions and 1 deletions
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
@ -82,7 +82,13 @@ from icefall.checkpoint import (
 from icefall.dist import cleanup_dist, setup_dist
 from icefall.env import get_env_info
 from icefall.hooks import register_inf_check_hooks
-from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
+from icefall.utils import (
+    AttributeDict,
+    MetricsTracker,
+    filter_uneven_sized_batch,
+    setup_logger,
+    str2bool,
+)

 LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]

@ -368,6 +374,21 @@ def get_parser():
        help="Whether to use half precision training.",
    )

+    parser.add_argument(
+        "--filter-uneven-sized-batch",
+        type=str2bool,
+        default=True,
+        help="""Whether to filter uneven-sized minibatch.
+        For the uneven-sized batch, the total duration after padding would possibly
+        cause OOM. Hence, for each batch, which is sorted descendingly by length,
+        we simply drop the last few shortest samples, so that the retained total frames
+        (after padding) would not exceed `allowed_max_frames`:
+        `allowed_max_frames = int(max_frames * (1.0 + allowed_excess_duration_ratio))`,
+        where `max_frames = max_duration * 1000 // frame_shift_ms`.
+        We set allowed_excess_duration_ratio=0.1.
+        """,
+    )
+
    add_model_arguments(parser)

    return parser
@ -420,6 +441,9 @@ def get_params() -> AttributeDict:
    """
    params = AttributeDict(
        {
+            "frame_shift_ms": 10.0,
+            # only used when params.filter_uneven_sized_batch is True
+            "allowed_excess_duration_ratio": 0.1,
            "best_train_loss": float("inf"),
            "best_valid_loss": float("inf"),
            "best_train_epoch": -1,
@ -642,6 +666,13 @@ def compute_loss(
     warmup: a floating point value which increases throughout training;
        values >= 1.0 are fully warmed up and have all modules present.
    """
+    if params.filter_uneven_sized_batch:
+        max_frames = params.max_duration * 1000 // params.frame_shift_ms
+        allowed_max_frames = int(
+            max_frames * (1.0 + params.allowed_excess_duration_ratio)
+        )
+        batch = filter_uneven_sized_batch(batch, allowed_max_frames)
+
    device = model.device if isinstance(model, DDP) else next(model.parameters()).device
    feature = batch["inputs"]
    # at entry, feature is (N, T, C)
--- a/icefall/utils.py
+++ b/icefall/utils.py
@ -1395,3 +1395,39 @@ def is_module_available(*modules: str) -> bool:
    import importlib

    return all(importlib.util.find_spec(m) is not None for m in modules)
+
+
+def filter_uneven_sized_batch(batch: dict, allowed_max_frames: int) -> dict:
+    """Drop the shortest samples of a batch so that the retained total frames
+    (after padding) do not exceed `allowed_max_frames`. The batch is assumed
+    to be sorted descendingly by length; it is modified in place and returned.
+
+    Args:
+      batch:
+        A batch of data. See `lhotse.dataset.K2SpeechRecognitionDataset()`
+        for the content in it.
+      allowed_max_frames:
+        The allowed max number of frames in batch.
+    Returns:
+      The batch; unchanged if it already fits, or if the longest sample alone
+      exceeds `allowed_max_frames` (filtering would leave an empty batch).
+    """
+    features = batch["inputs"]
+    supervisions = batch["supervisions"]
+
+    N, T, _ = features.size()
+    assert T == supervisions["num_frames"].max(), (T, supervisions["num_frames"].max())
+    keep_num_utt = allowed_max_frames // T
+    # keep_num_utt == 0 means T > allowed_max_frames; keep the batch as is
+    if keep_num_utt >= N or keep_num_utt == 0:
+        return batch
+    # Note: we assume the samples in the batch are sorted descendingly by length
+    logging.info(
+        f"Filtering uneven-sized batch, original batch size is {N}, "
+        f"retained batch size is {keep_num_utt}."
+    )
+    batch["inputs"] = features[:keep_num_utt]
+    for k, v in supervisions.items():
+        assert len(v) == N, (len(v), N)
+        batch["supervisions"][k] = v[:keep_num_utt]
+    return batch