From 7472ef7d0eeec4a531f199c961d3afd3bf004556 Mon Sep 17 00:00:00 2001
From: Mingshuang Luo <37799481+luomingshuang@users.noreply.github.com>
Date: Tue, 8 Feb 2022 14:35:33 +0800
Subject: [PATCH] update asr_datamodule.py

---
 .../ASR/conformer_ctc/asr_datamodule.py       | 595 +++++++++++++++++-
 .../ASR/tdnn_lstm_ctc/asr_datamodule.py       | 213 +------
 2 files changed, 596 insertions(+), 212 deletions(-)

diff --git a/egs/librispeech/ASR/conformer_ctc/asr_datamodule.py b/egs/librispeech/ASR/conformer_ctc/asr_datamodule.py
index fa1b8cca3..e5fcc5893 120000
--- a/egs/librispeech/ASR/conformer_ctc/asr_datamodule.py
+++ b/egs/librispeech/ASR/conformer_ctc/asr_datamodule.py
@@ -1 +1,594 @@
-../tdnn_lstm_ctc/asr_datamodule.py
\ No newline at end of file
+# Copyright      2021  Piotr Żelasko
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import logging
+from functools import lru_cache
+from pathlib import Path
+
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse.dataset import (
+    BucketingSampler,
+    CutConcatenate,
+    CutMix,
+    K2SpeechRecognitionDataset,
+    PrecomputedFeatures,
+    SingleCutSampler,
+)
+from lhotse.dataset.input_strategies import OnTheFlyFeatures
+from torch.utils.data import DataLoader
+
+from icefall.utils import str2bool
+
+
+class LibriSpeechAsrDataModule:
+    """
+    DataModule for k2 ASR experiments.
+    It assumes there is always one train and valid dataloader,
+    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
+    and test-other).
+
+    It contains all the common data pipeline modules used in ASR
+    experiments, e.g.:
+    - dynamic batch size,
+    - bucketing samplers,
+    - cut concatenation,
+    - augmentation,
+    - on-the-fly feature extraction
+
+    This class should be derived for specific corpora used in ASR tasks.
+    """
+
+    def __init__(self, args: argparse.Namespace):
+        self.args = args
+
+    @classmethod
+    def add_arguments(cls, parser: argparse.ArgumentParser):
+        """Register the ASR data-related command-line options on ``parser``."""
+        # A dedicated group keeps these options together in the --help output.
+        group = parser.add_argument_group(
+            title="ASR data related options",
+            description="These options are used for the preparation of "
+            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
+            "effective batch sizes, sampling strategies, applied data "
+            "augmentations, etc.",
+        )
+        group.add_argument(
+            "--full-libri",
+            type=str2bool,
+            default=True,
+            help="When enabled, use 960h LibriSpeech. "
+            "Otherwise, use 100h subset.",
+        )
+        group.add_argument(
+            "--manifest-dir",
+            type=Path,
+            default=Path("data/fbank"),
+            help="Path to directory with train/valid/test cuts.",
+        )
+        # The duration is in seconds and the default is already a float, so
+        # parse it as float; integer values such as "200" are still accepted.
+        group.add_argument(
+            "--max-duration",
+            type=float,
+            default=200.0,
+            help="Maximum pooled recordings duration (seconds) in a "
+            "single batch. You can reduce it if it causes CUDA OOM.",
+        )
+        group.add_argument(
+            "--bucketing-sampler",
+            type=str2bool,
+            default=True,
+            help="When enabled, the batches will come from buckets of "
+            "similar duration (saves padding frames).",
+        )
+        group.add_argument(
+            "--num-buckets",
+            type=int,
+            default=30,
+            help="The number of buckets for the BucketingSampler "
+            "(you might want to increase it for larger datasets).",
+        )
+        group.add_argument(
+            "--concatenate-cuts",
+            type=str2bool,
+            default=False,
+            help="When enabled, utterances (cuts) will be concatenated "
+            "to minimize the amount of padding.",
+        )
+        group.add_argument(
+            "--duration-factor",
+            type=float,
+            default=1.0,
+            help="Determines the maximum duration of a concatenated cut "
+            "relative to the duration of the longest cut in a batch.",
+        )
+        group.add_argument(
+            "--gap",
+            type=float,
+            default=1.0,
+            help="The amount of padding (in seconds) inserted between "
+            "concatenated cuts. This padding is filled with noise when "
+            "noise augmentation is used.",
+        )
+        group.add_argument(
+            "--on-the-fly-feats",
+            type=str2bool,
+            default=False,
+            help="When enabled, use on-the-fly cut mixing and feature "
+            "extraction. Will drop existing precomputed feature manifests "
+            "if available.",
+        )
+        group.add_argument(
+            "--shuffle",
+            type=str2bool,
+            default=True,
+            help="When enabled (=default), the examples will be "
+            "shuffled for each epoch.",
+        )
+        group.add_argument(
+            "--return-cuts",
+            type=str2bool,
+            default=True,
+            help="When enabled, each batch will have the "
+            "field: batch['supervisions']['cut'] with the cuts that "
+            "were used to construct it.",
+        )
+        group.add_argument(
+            "--num-workers",
+            type=int,
+            default=2,
+            help="The number of training dataloader workers that "
+            "collect the batches.",
+        )
+        group.add_argument(
+            "--enable-spec-aug",
+            type=str2bool,
+            default=True,
+            help="When enabled, use SpecAugment for training dataset.",
+        )
+        group.add_argument(
+            "--spec-aug-time-warp-factor",
+            type=int,
+            default=80,
+            help="Used only when --enable-spec-aug is True. "
+            "It specifies the factor for time warping in SpecAugment. "
+            "Larger values mean more warping. "
+            "A value less than 1 means to disable time warp.",
+        )
+        group.add_argument(
+            "--enable-musan",
+            type=str2bool,
+            default=True,
+            help="When enabled, select noise from MUSAN and mix it "
+            "with training dataset.",
+        )
+
+    def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
+        """Build the training DataLoader (augmentations, dataset, sampler).
+
+        :param cuts_train: the cuts to sample training batches from.
+        :return: a DataLoader over ``cuts_train`` configured from
+            ``self.args``.
+        """
+        transforms = []
+        if self.args.enable_musan:
+            logging.info("Enable MUSAN")
+            # Load the noise manifest only when it is mixed in, so that
+            # --enable-musan=False does not require cuts_musan.json.gz.
+            logging.info("About to get Musan cuts")
+            cuts_musan = load_manifest(
+                self.args.manifest_dir / "cuts_musan.json.gz"
+            )
+            transforms.append(
+                CutMix(
+                    cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True
+                )
+            )
+        else:
+            logging.info("Disable MUSAN")
+
+        if self.args.concatenate_cuts:
+            logging.info(
+                f"Using cut concatenation with duration factor "
+                f"{self.args.duration_factor} and gap {self.args.gap}."
+            )
+            # Cut concatenation should be the first transform in the list,
+            # so that if we e.g. mix noise in, it will fill the gaps between
+            # different utterances.
+            transforms = [
+                CutConcatenate(
+                    duration_factor=self.args.duration_factor, gap=self.args.gap
+                )
+            ] + transforms
+
+        input_transforms = []
+        if self.args.enable_spec_aug:
+            logging.info("Enable SpecAugment")
+            logging.info(
+                f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
+            )
+            input_transforms.append(
+                SpecAugment(
+                    time_warp_factor=self.args.spec_aug_time_warp_factor,
+                    num_frame_masks=10,
+                    features_mask_size=27,
+                    num_feature_masks=2,
+                    frames_mask_size=100,
+                    max_frames_mask_fraction=0.4,
+                )
+            )
+        else:
+            logging.info("Disable SpecAugment")
+
+        if self.args.on_the_fly_feats:
+            # Extract fbank features from audio at load time instead of
+            # reading the precomputed feature manifests.
+            # NOTE: on-the-fly speed perturbation (PerturbSpeed) should be
+            # added to the cut transforms only if it is removed from the
+            # data preparation stage.
+            input_strategy = OnTheFlyFeatures(
+                Fbank(FbankConfig(num_mel_bins=80))
+            )
+        else:
+            input_strategy = PrecomputedFeatures()
+
+        logging.info("About to create train dataset")
+        train = K2SpeechRecognitionDataset(
+            cut_transforms=transforms,
+            input_strategy=input_strategy,
+            input_transforms=input_transforms,
+            return_cuts=self.args.return_cuts,
+        )
+
+        if self.args.bucketing_sampler:
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                shuffle=self.args.shuffle,
+                num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
+                drop_last=True,
+            )
+        else:
+            logging.info("Using SingleCutSampler.")
+            train_sampler = SingleCutSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                shuffle=self.args.shuffle,
+            )
+        logging.info("About to create train dataloader")
+
+        train_dl = DataLoader(
+            train,
+            sampler=train_sampler,
+            batch_size=None,
+            num_workers=self.args.num_workers,
+            persistent_workers=False,
+        )
+
+        return train_dl
+
+    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
+        transforms = []
+        if self.args.concatenate_cuts:
+            transforms = [
+                CutConcatenate(
+                    duration_factor=self.args.duration_factor, gap=self.args.gap
+                )
+            ] + transforms
+
+        logging.info("About to create dev dataset")
+        if self.args.on_the_fly_feats:
+            validate = K2SpeechRecognitionDataset(
+                cut_transforms=transforms,
+                input_strategy=OnTheFlyFeatures(
+                    Fbank(FbankConfig(num_mel_bins=80))
+                ),
+                return_cuts=self.args.return_cuts,
+            )
+        else:
+            validate = K2SpeechRecognitionDataset(
+                cut_transforms=transforms,
+                return_cuts=self.args.return_cuts,
+            )
+        valid_sampler = BucketingSampler(
+            cuts_valid,
+            max_duration=self.args.max_duration,
+            shuffle=False,
+        )
+        logging.info("About to create dev dataloader")
+        valid_dl = DataLoader(
+            validate,
+            sampler=valid_sampler,
+            batch_size=None,
+            num_workers=2,
+            persistent_workers=False,
+        )
+
+        return valid_dl
+
+    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
+        logging.debug("About to create test dataset")
+        test = K2SpeechRecognitionDataset(
+            input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
+            if self.args.on_the_fly_feats
+            else PrecomputedFeatures(),
+            return_cuts=self.args.return_cuts,
+        )
+        sampler = BucketingSampler(
+            cuts, max_duration=self.args.max_duration, shuffle=False
+        )
+        logging.debug("About to create test dataloader")
+        test_dl = DataLoader(
+            test,
+            batch_size=None,
+            sampler=sampler,
+            num_workers=self.args.num_workers,
+        )
+        return test_dl
+
+    @lru_cache()
+    def train_clean_100_cuts(self) -> CutSet:
+        logging.info("About to get train-clean-100 cuts")
+        return load_manifest(
+            self.args.manifest_dir / "cuts_train-clean-100.json.gz"
+        )
+
+    @lru_cache()
+    def train_clean_360_cuts(self) -> CutSet:
+        logging.info("About to get train-clean-360 cuts")
+        return load_manifest(
+            self.args.manifest_dir / "cuts_train-clean-360.json.gz"
+        )
+
+    @lru_cache()
+    def train_other_500_cuts(self) -> CutSet:
+        logging.info("About to get train-other-500 cuts")
+        return load_manifest(
+            self.args.manifest_dir / "cuts_train-other-500.json.gz"
+        )
+
+    @lru_cache()
+    def dev_clean_cuts(self) -> CutSet:
+        logging.info("About to get dev-clean cuts")
+        return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz")
+
+    @lru_cache()
+    def dev_other_cuts(self) -> CutSet:
+        logging.info("About to get dev-other cuts")
+        return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz")
+
+    @lru_cache()
+    def test_clean_cuts(self) -> CutSet:
+        logging.info("About to get test-clean cuts")
+        return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz")
+
+    @lru_cache()
+    def test_other_cuts(self) -> CutSet:
+        logging.info("About to get test-other cuts")
+        return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz")
+
+
+import math
+import random
+import numpy as np
+from typing import Optional, Dict
+
+import torch
+
+from lhotse import CutSet
+
+class SpecAugment(torch.nn.Module):
+    """
+    SpecAugment performs three augmentations:
+    - time warping of the feature matrix
+    - masking of ranges of features (frequency bands)
+    - masking of ranges of frames (time)
+
+    The current implementation works with batches, but processes each example separately
+    in a loop rather than simultaneously to achieve different augmentation parameters for
+    each example.
+    """
+
+    def __init__(
+        self,
+        time_warp_factor: Optional[int] = 80,
+        num_feature_masks: int = 1,
+        features_mask_size: int = 13,
+        num_frame_masks: int = 1,
+        frames_mask_size: int = 70,
+        max_frames_mask_fraction: float = 0.2,
+        p=0.5,
+    ):
+        """
+        SpecAugment's constructor.
+
+        :param time_warp_factor: parameter for the time warping; larger values mean more warping.
+            Set to ``None``, or less than ``1``, to disable.
+        :param num_feature_masks: how many feature masks should be applied. Set to ``0`` to disable.
+        :param features_mask_size: the width of the feature mask (expressed in the number of masked feature bins).
+            This is the ``F`` parameter from the SpecAugment paper.
+        :param num_frame_masks: how many frame (temporal) masks should be applied. Set to ``0`` to disable.
+        :param frames_mask_size: the width of the frame (temporal) masks (expressed in the number of masked frames).
+            This is the ``T`` parameter from the SpecAugment paper.
+        :param max_frames_mask_fraction: limits the size of the frame (temporal) mask to this value times the length
+            of the utterance (or supervision segment).
+            This is the parameter denoted by ``p`` in the SpecAugment paper.
+        :param p: the probability of applying this transform.
+            It is different from ``p`` in the SpecAugment paper!
+        """
+        super().__init__()
+        assert 0 <= p <= 1
+        assert num_feature_masks >= 0
+        assert num_frame_masks >= 0
+        assert features_mask_size > 0
+        assert frames_mask_size > 0
+        self.time_warp_factor = time_warp_factor
+        self.num_feature_masks = num_feature_masks
+        self.features_mask_size = features_mask_size
+        self.num_frame_masks = num_frame_masks
+        self.frames_mask_size = frames_mask_size
+        self.max_frames_mask_fraction = max_frames_mask_fraction
+        self.p = p
+
+    def forward(
+        self,
+        features: torch.Tensor,
+        supervision_segments: Optional[torch.IntTensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Computes SpecAugment for a batch of feature matrices.
+
+        Since the batch will usually already be padded, the user can optionally
+        provide a ``supervision_segments`` tensor that restricts time warping to the
+        supervised areas (masking still covers full matrices); its format is described below.
+
+        :param features: a batch of feature matrices with shape ``(B, T, F)``.
+        :param supervision_segments: an int tensor of shape ``(S, 3)``. ``S`` is the number of
+            supervision segments that exist in ``features`` -- there may be either
+            fewer or more than the batch size.
+            The second dimension encodes three kinds of information:
+            the sequence index of the corresponding feature matrix in `features`,
+            the start frame index, and the number of frames for each segment.
+        :return: an augmented tensor of shape ``(B, T, F)``; the input tensor is not modified.
+        """
+        assert len(features.shape) == 3, (
+            "SpecAugment only supports batches of " "single-channel feature matrices."
+        )
+        features = features.clone()
+        if supervision_segments is None:
+            # No supervisions - apply spec augment to full feature matrices.
+            for sequence_idx in range(features.size(0)):
+                features[sequence_idx] = self._forward_single(features[sequence_idx])
+        else:
+            # Supervisions provided - we will apply time warping only on the supervised areas.
+            for sequence_idx, start_frame, num_frames in supervision_segments:
+                end_frame = start_frame + num_frames
+                features[sequence_idx, start_frame:end_frame] = self._forward_single(
+                    features[sequence_idx, start_frame:end_frame], warp=True, mask=False
+                )
+            # ... and then mask the full feature matrices. Each _forward_single call draws
+            # its own skip decision, so in this mode masks might be applied to different
+            # sequences/examples than the time warping.
+            for sequence_idx in range(features.size(0)):
+                features[sequence_idx] = self._forward_single(
+                    features[sequence_idx], warp=False, mask=True
+                )
+        return features
+
+    def _forward_single(
+        self, features: torch.Tensor, warp: bool = True, mask: bool = True
+    ) -> torch.Tensor:
+        """
+        Apply SpecAugment to a single feature matrix of shape (T, F).
+
+        ``warp`` and ``mask`` select which of the augmentations may run.
+        """
+        if random.random() > self.p:
+            # Randomly choose whether this transform is applied
+            return features
+        if warp and self.time_warp_factor is not None and self.time_warp_factor >= 1:
+            features = time_warp(features, factor=self.time_warp_factor)
+        if mask:
+            from torchaudio.functional import mask_along_axis
+            mean = features.mean()
+            for _ in range(self.num_feature_masks):
+                features = mask_along_axis(
+                    features.unsqueeze(0),
+                    mask_param=self.features_mask_size,
+                    mask_value=mean,
+                    axis=2,
+                ).squeeze(0)
+            # Cap mask count/width so at most max_frames_mask_fraction of T is masked.
+            max_tot_mask_frames = self.max_frames_mask_fraction * features.size(0)
+            num_frame_masks = min(self.num_frame_masks, math.ceil(max_tot_mask_frames / self.frames_mask_size))
+            for _ in range(num_frame_masks):
+                max_mask_frames = min(self.frames_mask_size, max_tot_mask_frames // num_frame_masks)
+                features = mask_along_axis(
+                    features.unsqueeze(0),
+                    mask_param=int(max_mask_frames),
+                    mask_value=mean,
+                    axis=1,
+                ).squeeze(0)
+        return features
+
+    def state_dict(self) -> Dict:
+        return dict(
+            time_warp_factor=self.time_warp_factor,
+            num_feature_masks=self.num_feature_masks,
+            features_mask_size=self.features_mask_size,
+            num_frame_masks=self.num_frame_masks,
+            frames_mask_size=self.frames_mask_size,
+            max_frames_mask_fraction=self.max_frames_mask_fraction,
+            p=self.p,
+        )
+
+    def load_state_dict(self, state_dict: Dict):
+        self.time_warp_factor = state_dict.get(
+            "time_warp_factor", self.time_warp_factor
+        )
+        self.num_feature_masks = state_dict.get(
+            "num_feature_masks", self.num_feature_masks
+        )
+        self.features_mask_size = state_dict.get(
+            "features_mask_size", self.features_mask_size
+        )
+        self.num_frame_masks = state_dict.get("num_frame_masks", self.num_frame_masks)
+        self.frames_mask_size = state_dict.get(
+            "frames_mask_size", self.frames_mask_size
+        )
+        self.max_frames_mask_fraction = state_dict.get(
+            "max_frames_mask_fraction", self.max_frames_mask_fraction
+        )
+        self.p = state_dict.get("p", self.p)
+
+
+def time_warp(features: torch.Tensor, factor: int) -> torch.Tensor:
+    """
+    Time warping as described in the SpecAugment paper.
+    Implementation based on Espresso:
+    https://github.com/freewym/espresso/blob/master/espresso/tools/specaug_interpolate.py#L51
+
+    :param features: input tensor of shape ``(T, F)``
+    :param factor: time warping parameter; the warp center moves by at most ``factor`` frames.
+    :return: a warped tensor of shape ``(T, F)``, or ``features`` itself when no warp is applied.
+    """
+    t = features.size(0)
+    if t - factor <= factor + 1:  # too few frames to sample a warp center
+        return features
+    center = np.random.randint(factor + 1, t - factor)  # upper bound is exclusive
+    warped = np.random.randint(center - factor, center + factor + 1)  # new index of center
+    if warped == center:  # zero shift sampled: nothing to warp
+        return features
+    features = features.unsqueeze(0).unsqueeze(0)  # (1, 1, T, F) for interpolate
+    left = torch.nn.functional.interpolate(
+        features[:, :, :center, :],
+        size=(warped, features.size(3)),
+        mode="bicubic",
+        align_corners=False,
+    )
+    right = torch.nn.functional.interpolate(
+        features[:, :, center:, :],
+        size=(t - warped, features.size(3)),
+        mode="bicubic",
+        align_corners=False,
+    )
+    return torch.cat((left, right), dim=2).squeeze(0).squeeze(0)  # back to (T, F)
\ No newline at end of file
diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
index e5fcc5893..e075a2d03 100644
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -28,6 +28,7 @@ from lhotse.dataset import (
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
+    SpecAugment,
 )
 from lhotse.dataset.input_strategies import OnTheFlyFeatures
 from torch.utils.data import DataLoader
@@ -218,11 +219,10 @@ class LibriSpeechAsrDataModule:
             input_transforms.append(
                 SpecAugment(
                     time_warp_factor=self.args.spec_aug_time_warp_factor,
-                    num_frame_masks=10,
+                    num_frame_masks=2,
                     features_mask_size=27,
                     num_feature_masks=2,
                     frames_mask_size=100,
-                    max_frames_mask_fraction=0.4,
                 )
             )
         else:
@@ -383,212 +383,3 @@ class LibriSpeechAsrDataModule:
     def test_other_cuts(self) -> CutSet:
         logging.info("About to get test-other cuts")
         return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz")
-
-
-import math
-import random
-import numpy as np
-from typing import Optional, Dict
-
-import torch
-
-from lhotse import CutSet
-
-class SpecAugment(torch.nn.Module):
-    """
-    SpecAugment performs three augmentations:
-    - time warping of the feature matrix
-    - masking of ranges of features (frequency bands)
-    - masking of ranges of frames (time)
-
-    The current implementation works with batches, but processes each example separately
-    in a loop rather than simultaneously to achieve different augmentation parameters for
-    each example.
-    """
-
-    def __init__(
-        self,
-        time_warp_factor: Optional[int] = 80,
-        num_feature_masks: int = 1,
-        features_mask_size: int = 13,
-        num_frame_masks: int = 1,
-        frames_mask_size: int = 70,
-        max_frames_mask_fraction: float = 0.2,
-        p=0.5,
-    ):
-        """
-        SpecAugment's constructor.
-
-        :param time_warp_factor: parameter for the time warping; larger values mean more warping.
-            Set to ``None``, or less than ``1``, to disable.
-        :param num_feature_masks: how many feature masks should be applied. Set to ``0`` to disable.
-        :param features_mask_size: the width of the feature mask (expressed in the number of masked feature bins).
-            This is the ``F`` parameter from the SpecAugment paper.
-        :param num_frame_masks: how many frame (temporal) masks should be applied. Set to ``0`` to disable.
-        :param frames_mask_size: the width of the frame (temporal) masks (expressed in the number of masked frames).
-            This is the ``T`` parameter from the SpecAugment paper.
-        :param max_frames_mask_fraction: limits the size of the frame (temporal) mask to this value times the length
-            of the utterance (or supervision segment).
-            This is the parameter denoted by ``p`` in the SpecAugment paper.
-        :param p: the probability of applying this transform.
-            It is different from ``p`` in the SpecAugment paper!
-        """
-        super().__init__()
-        assert 0 <= p <= 1
-        assert num_feature_masks >= 0
-        assert num_frame_masks >= 0
-        assert features_mask_size > 0
-        assert frames_mask_size > 0
-        self.time_warp_factor = time_warp_factor
-        self.num_feature_masks = num_feature_masks
-        self.features_mask_size = features_mask_size
-        self.num_frame_masks = num_frame_masks
-        self.frames_mask_size = frames_mask_size
-        self.max_frames_mask_fraction = max_frames_mask_fraction
-        self.p = p
-
-    def forward(
-        self,
-        features: torch.Tensor,
-        supervision_segments: Optional[torch.IntTensor] = None,
-        *args,
-        **kwargs,
-    ) -> torch.Tensor:
-        """
-        Computes SpecAugment for a batch of feature matrices.
-
-        Since the batch will usually already be padded, the user can optionally
-        provide a ``supervision_segments`` tensor that will be used to apply SpecAugment
-        only to selected areas of the input. The format of this input is described below.
-
-        :param features: a batch of feature matrices with shape ``(B, T, F)``.
-        :param supervision_segments: an int tensor of shape ``(S, 3)``. ``S`` is the number of
-            supervision segments that exist in ``features`` -- there may be either
-            less or more than the batch size.
-            The second dimension encoder three kinds of information:
-            the sequence index of the corresponding feature matrix in `features`,
-            the start frame index, and the number of frames for each segment.
-        :return: an augmented tensor of shape ``(B, T, F)``.
-        """
-        assert len(features.shape) == 3, (
-            "SpecAugment only supports batches of " "single-channel feature matrices."
-        )
-        features = features.clone()
-        if supervision_segments is None:
-            # No supervisions - apply spec augment to full feature matrices.
-            for sequence_idx in range(features.size(0)):
-                features[sequence_idx] = self._forward_single(features[sequence_idx])
-        else:
-            # Supervisions provided - we will apply time warping only on the supervised areas.
-            for sequence_idx, start_frame, num_frames in supervision_segments:
-                end_frame = start_frame + num_frames
-                features[sequence_idx, start_frame:end_frame] = self._forward_single(
-                    features[sequence_idx, start_frame:end_frame], warp=True, mask=False
-                )
-            # ... and then time-mask the full feature matrices. Note that in this mode,
-            # it might happen that masks are applied to different sequences/examples
-            # than the time warping.
-            for sequence_idx in range(features.size(0)):
-                features[sequence_idx] = self._forward_single(
-                    features[sequence_idx], warp=False, mask=True
-                )
-        return features
-
-    def _forward_single(
-        self, features: torch.Tensor, warp: bool = True, mask: bool = True
-    ) -> torch.Tensor:
-        """
-        Apply SpecAugment to a single feature matrix of shape (T, F).
-        """
-        if random.random() > self.p:
-            # Randomly choose whether this transform is applied
-            return features
-        if warp:
-            if self.time_warp_factor is not None and self.time_warp_factor >= 1:
-                features = time_warp(features, factor=self.time_warp_factor)
-        if mask:
-            from torchaudio.functional import mask_along_axis
-
-            mean = features.mean()
-            for _ in range(self.num_feature_masks):
-                features = mask_along_axis(
-                    features.unsqueeze(0),
-                    mask_param=self.features_mask_size,
-                    mask_value=mean,
-                    axis=2,
-                ).squeeze(0)
-            for _ in range(self.num_frame_masks):
-                _max_tot_mask_frames = self.max_frames_mask_fraction * features.size(0)
-                num_frame_masks = min(self.num_frame_masks, math.ceil(_max_tot_mask_frames / self.frames_mask_size))
-                max_mask_frames = min(self.frames_mask_size, _max_tot_mask_frames // num_frame_masks)
-                
-                features = mask_along_axis(
-                    features.unsqueeze(0),
-                    mask_param=max_mask_frames,
-                    mask_value=mean,
-                    axis=1,
-                ).squeeze(0)
-        return features
-
-    def state_dict(self) -> Dict:
-        return dict(
-            time_warp_factor=self.time_warp_factor,
-            num_feature_masks=self.num_feature_masks,
-            features_mask_size=self.features_mask_size,
-            num_frame_masks=self.num_frame_masks,
-            frames_mask_size=self.frames_mask_size,
-            max_frames_mask_fraction=self.max_frames_mask_fraction,
-            p=self.p,
-        )
-
-    def load_state_dict(self, state_dict: Dict):
-        self.time_warp_factor = state_dict.get(
-            "time_warp_factor", self.time_warp_factor
-        )
-        self.num_feature_masks = state_dict.get(
-            "num_feature_masks", self.num_feature_masks
-        )
-        self.features_mask_size = state_dict.get(
-            "features_mask_size", self.features_mask_size
-        )
-        self.num_frame_masks = state_dict.get("num_frame_masks", self.num_frame_masks)
-        self.frames_mask_size = state_dict.get(
-            "frames_mask_size", self.frames_mask_size
-        )
-        self.max_frames_mask_fraction = state_dict.get(
-            "max_frames_mask_fraction", self.max_frames_mask_fraction
-        )
-        self.p = state_dict.get("p", self.p)
-
-
-def time_warp(features: torch.Tensor, factor: int) -> torch.Tensor:
-    """
-    Time warping as described in the SpecAugment paper.
-    Implementation based on Espresso:
-    https://github.com/freewym/espresso/blob/master/espresso/tools/specaug_interpolate.py#L51
-
-    :param features: input tensor of shape ``(T, F)``
-    :param factor: time warping parameter.
-    :return: a warped tensor of shape ``(T, F)``
-    """
-    t = features.size(0)
-    if t - factor <= factor + 1:
-        return features
-    center = np.random.randint(factor + 1, t - factor)
-    warped = np.random.randint(center - factor, center + factor + 1)
-    if warped == center:
-        return features
-    features = features.unsqueeze(0).unsqueeze(0)
-    left = torch.nn.functional.interpolate(
-        features[:, :, :center, :],
-        size=(warped, features.size(3)),
-        mode="bicubic",
-        align_corners=False,
-    )
-    right = torch.nn.functional.interpolate(
-        features[:, :, center:, :],
-        size=(t - warped, features.size(3)),
-        mode="bicubic",
-        align_corners=False,
-    )
-    return torch.cat((left, right), dim=2).squeeze(0).squeeze(0)
\ No newline at end of file