From f8065fb2791b9713d8b5b8af0b5b468bce5567a2 Mon Sep 17 00:00:00 2001 From: Mingshuang Luo <37799481+luomingshuang@users.noreply.github.com> Date: Wed, 16 Feb 2022 18:27:07 +0800 Subject: [PATCH] update this pr --- .../ASR/conformer_ctc/asr_datamodule.py | 595 +----------------- .../ASR/tdnn_lstm_ctc/asr_datamodule.py | 229 ++++++- 2 files changed, 228 insertions(+), 596 deletions(-) diff --git a/egs/librispeech/ASR/conformer_ctc/asr_datamodule.py b/egs/librispeech/ASR/conformer_ctc/asr_datamodule.py index 7570faccc..fa1b8cca3 120000 --- a/egs/librispeech/ASR/conformer_ctc/asr_datamodule.py +++ b/egs/librispeech/ASR/conformer_ctc/asr_datamodule.py @@ -1,594 +1 @@ -# Copyright 2021 Piotr Żelasko -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import argparse -import logging -from functools import lru_cache -from pathlib import Path - -from lhotse import CutSet, Fbank, FbankConfig, load_manifest -from lhotse.dataset import ( - BucketingSampler, - CutConcatenate, - CutMix, - K2SpeechRecognitionDataset, - PrecomputedFeatures, - SingleCutSampler, -) -from lhotse.dataset.input_strategies import OnTheFlyFeatures -from torch.utils.data import DataLoader - -from icefall.utils import str2bool - - -class LibriSpeechAsrDataModule: - """ - DataModule for k2 ASR experiments. - It assumes there is always one train and valid dataloader, - but there can be multiple test dataloaders (e.g. LibriSpeech test-clean - and test-other). - - It contains all the common data pipeline modules used in ASR - experiments, e.g.: - - dynamic batch size, - - bucketing samplers, - - cut concatenation, - - augmentation, - - on-the-fly feature extraction - - This class should be derived for specific corpora used in ASR tasks. - """ - - def __init__(self, args: argparse.Namespace): - self.args = args - - @classmethod - def add_arguments(cls, parser: argparse.ArgumentParser): - group = parser.add_argument_group( - title="ASR data related options", - description="These options are used for the preparation of " - "PyTorch DataLoaders from Lhotse CutSet's -- they control the " - "effective batch sizes, sampling strategies, applied data " - "augmentations, etc.", - ) - group.add_argument( - "--full-libri", - type=str2bool, - default=True, - help="When enabled, use 960h LibriSpeech. " - "Otherwise, use 100h subset.", - ) - group.add_argument( - "--manifest-dir", - type=Path, - default=Path("data/fbank"), - help="Path to directory with train/valid/test cuts.", - ) - group.add_argument( - "--max-duration", - type=int, - default=200.0, - help="Maximum pooled recordings duration (seconds) in a " - "single batch. You can reduce it if it causes CUDA OOM.", - ) - group.add_argument( - "--bucketing-sampler", - type=str2bool, - default=True, - help="When enabled, the batches will come from buckets of " - "similar duration (saves padding frames).", - ) - group.add_argument( - "--num-buckets", - type=int, - default=30, - help="The number of buckets for the BucketingSampler" - "(you might want to increase it for larger datasets).", - ) - group.add_argument( - "--concatenate-cuts", - type=str2bool, - default=False, - help="When enabled, utterances (cuts) will be concatenated " - "to minimize the amount of padding.", - ) - group.add_argument( - "--duration-factor", - type=float, - default=1.0, - help="Determines the maximum duration of a concatenated cut " - "relative to the duration of the longest cut in a batch.", - ) - group.add_argument( - "--gap", - type=float, - default=1.0, - help="The amount of padding (in seconds) inserted between " - "concatenated cuts. This padding is filled with noise when " - "noise augmentation is used.", - ) - group.add_argument( - "--on-the-fly-feats", - type=str2bool, - default=False, - help="When enabled, use on-the-fly cut mixing and feature " - "extraction. Will drop existing precomputed feature manifests " - "if available.", - ) - group.add_argument( - "--shuffle", - type=str2bool, - default=True, - help="When enabled (=default), the examples will be " - "shuffled for each epoch.", - ) - group.add_argument( - "--return-cuts", - type=str2bool, - default=True, - help="When enabled, each batch will have the " - "field: batch['supervisions']['cut'] with the cuts that " - "were used to construct it.", - ) - - group.add_argument( - "--num-workers", - type=int, - default=2, - help="The number of training dataloader workers that " - "collect the batches.", - ) - - group.add_argument( - "--enable-spec-aug", - type=str2bool, - default=True, - help="When enabled, use SpecAugment for training dataset.", - ) - - group.add_argument( - "--spec-aug-time-warp-factor", - type=int, - default=80, - help="Used only when --enable-spec-aug is True. " - "It specifies the factor for time warping in SpecAugment. " - "Larger values mean more warping. " - "A value less than 1 means to disable time warp.", - ) - - group.add_argument( - "--enable-musan", - type=str2bool, - default=True, - help="When enabled, select noise from MUSAN and mix it" - "with training dataset. ", - ) - - def train_dataloaders(self, cuts_train: CutSet) -> DataLoader: - logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" - ) - - transforms = [] - if self.args.enable_musan: - logging.info("Enable MUSAN") - transforms.append( - CutMix( - cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True - ) - ) - else: - logging.info("Disable MUSAN") - - if self.args.concatenate_cuts: - logging.info( - f"Using cut concatenation with duration factor " - f"{self.args.duration_factor} and gap {self.args.gap}." - ) - # Cut concatenation should be the first transform in the list, - # so that if we e.g. mix noise in, it will fill the gaps between - # different utterances. - transforms = [ - CutConcatenate( - duration_factor=self.args.duration_factor, gap=self.args.gap - ) - ] + transforms - - input_transforms = [] - if self.args.enable_spec_aug: - logging.info("Enable SpecAugment") - logging.info( - f"Time warp factor: {self.args.spec_aug_time_warp_factor}" - ) - input_transforms.append( - SpecAugment( - time_warp_factor=self.args.spec_aug_time_warp_factor, - features_mask_size=27, - num_feature_masks=2, - frames_mask_size=100, - ) - ) - else: - logging.info("Disable SpecAugment") - - logging.info("About to create train dataset") - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - if self.args.on_the_fly_feats: - # NOTE: the PerturbSpeed transform should be added only if we - # remove it from data prep stage. - # Add on-the-fly speed perturbation; since originally it would - # have increased epoch size by 3, we will apply prob 2/3 and use - # 3x more epochs. - # Speed perturbation probably should come first before - # concatenation, but in principle the transforms order doesn't have - # to be strict (e.g. could be randomized) - # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa - # Drop feats to be on the safe side. - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures( - Fbank(FbankConfig(num_mel_bins=80)) - ), - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - if self.args.bucketing_sampler: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - bucket_method="equal_duration", - drop_last=True, - ) - else: - logging.info("Using SingleCutSampler.") - train_sampler = SingleCutSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - ) - logging.info("About to create train dataloader") - - train_dl = DataLoader( - train, - sampler=train_sampler, - batch_size=None, - num_workers=self.args.num_workers, - persistent_workers=False, - ) - - return train_dl - - def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: - transforms = [] - if self.args.concatenate_cuts: - transforms = [ - CutConcatenate( - duration_factor=self.args.duration_factor, gap=self.args.gap - ) - ] + transforms - - logging.info("About to create dev dataset") - if self.args.on_the_fly_feats: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures( - Fbank(FbankConfig(num_mel_bins=80)) - ), - return_cuts=self.args.return_cuts, - ) - else: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - return_cuts=self.args.return_cuts, - ) - valid_sampler = BucketingSampler( - cuts_valid, - max_duration=self.args.max_duration, - shuffle=False, - ) - logging.info("About to create dev dataloader") - valid_dl = DataLoader( - validate, - sampler=valid_sampler, - batch_size=None, - num_workers=2, - persistent_workers=False, - ) - - return valid_dl - - def test_dataloaders(self, cuts: CutSet) -> DataLoader: - logging.debug("About to create test dataset") - test = K2SpeechRecognitionDataset( - input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) - if self.args.on_the_fly_feats - else PrecomputedFeatures(), - return_cuts=self.args.return_cuts, - ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False - ) - logging.debug("About to create test dataloader") - test_dl = DataLoader( - test, - batch_size=None, - sampler=sampler, - num_workers=self.args.num_workers, - ) - return test_dl - - @lru_cache() - def train_clean_100_cuts(self) -> CutSet: - logging.info("About to get train-clean-100 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-clean-100.json.gz" - ) - - @lru_cache() - def train_clean_360_cuts(self) -> CutSet: - logging.info("About to get train-clean-360 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-clean-360.json.gz" - ) - - @lru_cache() - def train_other_500_cuts(self) -> CutSet: - logging.info("About to get train-other-500 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-other-500.json.gz" - ) - - @lru_cache() - def dev_clean_cuts(self) -> CutSet: - logging.info("About to get dev-clean cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz") - - @lru_cache() - def dev_other_cuts(self) -> CutSet: - logging.info("About to get dev-other cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz") - - @lru_cache() - def test_clean_cuts(self) -> CutSet: - logging.info("About to get test-clean cuts") - return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz") - - @lru_cache() - def test_other_cuts(self) -> CutSet: - logging.info("About to get test-other cuts") - return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz") - - -import math -import random -import numpy as np -from typing import Optional, Dict - -import torch - -from lhotse import CutSet - -class SpecAugment(torch.nn.Module): - """ - SpecAugment performs three augmentations: - - time warping of the feature matrix - - masking of ranges of features (frequency bands) - - masking of ranges of frames (time) - - The current implementation works with batches, but processes each example separately - in a loop rather than simultaneously to achieve different augmentation parameters for - each example. - """ - - def __init__( - self, - time_warp_factor: Optional[int] = 80, - num_feature_masks: int = 1, - features_mask_size: int = 13, - num_frame_masks: int = 10, - frames_mask_size: int = 70, - max_frames_mask_fraction: float = 0.4, - p=0.5, - ): - """ - SpecAugment's constructor. - - :param time_warp_factor: parameter for the time warping; larger values mean more warping. - Set to ``None``, or less than ``1``, to disable. - :param num_feature_masks: how many feature masks should be applied. Set to ``0`` to disable. - :param features_mask_size: the width of the feature mask (expressed in the number of masked feature bins). - This is the ``F`` parameter from the SpecAugment paper. - :param num_frame_masks: how many frame (temporal) masks should be applied. Set to ``0`` to disable. - This is the maximum (it's also constrained by max_frames_mask_fraction). - :param frames_mask_size: the width of the frame (temporal) masks (expressed in the number of masked frames). - This is the ``T`` parameter from the SpecAugment paper. - :param max_frames_mask_fraction: limits the size of the frame (temporal) mask to this value times the length - of the utterance (or supervision segment). It is the total masked fraction including all masked regions. - This is the parameter denoted by ``p`` in the SpecAugment paper. - :param p: the probability of applying this transform. - It is different from ``p`` in the SpecAugment paper! - """ - super().__init__() - assert 0 <= p <= 1 - assert num_feature_masks >= 0 - assert num_frame_masks >= 0 - assert features_mask_size > 0 - assert frames_mask_size > 0 - self.time_warp_factor = time_warp_factor - self.num_feature_masks = num_feature_masks - self.features_mask_size = features_mask_size - self.num_frame_masks = num_frame_masks - self.frames_mask_size = frames_mask_size - self.max_frames_mask_fraction = max_frames_mask_fraction - self.p = p - - def forward( - self, - features: torch.Tensor, - supervision_segments: Optional[torch.IntTensor] = None, - *args, - **kwargs, - ) -> torch.Tensor: - """ - Computes SpecAugment for a batch of feature matrices. - - Since the batch will usually already be padded, the user can optionally - provide a ``supervision_segments`` tensor that will be used to apply SpecAugment - only to selected areas of the input. The format of this input is described below. - - :param features: a batch of feature matrices with shape ``(B, T, F)``. - :param supervision_segments: an int tensor of shape ``(S, 3)``. ``S`` is the number of - supervision segments that exist in ``features`` -- there may be either - less or more than the batch size. - The second dimension encoder three kinds of information: - the sequence index of the corresponding feature matrix in `features`, - the start frame index, and the number of frames for each segment. - :return: an augmented tensor of shape ``(B, T, F)``. - """ - assert len(features.shape) == 3, ( - "SpecAugment only supports batches of " "single-channel feature matrices." - ) - features = features.clone() - if supervision_segments is None: - # No supervisions - apply spec augment to full feature matrices. - for sequence_idx in range(features.size(0)): - features[sequence_idx] = self._forward_single(features[sequence_idx]) - else: - # Supervisions provided - we will apply time warping only on the supervised areas. - for sequence_idx, start_frame, num_frames in supervision_segments: - end_frame = start_frame + num_frames - features[sequence_idx, start_frame:end_frame] = self._forward_single( - features[sequence_idx, start_frame:end_frame], warp=True, mask=False - ) - # ... and then time-mask the full feature matrices. Note that in this mode, - # it might happen that masks are applied to different sequences/examples - # than the time warping. - for sequence_idx in range(features.size(0)): - features[sequence_idx] = self._forward_single( - features[sequence_idx], warp=False, mask=True - ) - return features - - def _forward_single( - self, features: torch.Tensor, warp: bool = True, mask: bool = True - ) -> torch.Tensor: - """ - Apply SpecAugment to a single feature matrix of shape (T, F). - """ - if random.random() > self.p: - # Randomly choose whether this transform is applied - return features - if warp: - if self.time_warp_factor is not None and self.time_warp_factor >= 1: - features = time_warp(features, factor=self.time_warp_factor) - if mask: - from torchaudio.functional import mask_along_axis - - mean = features.mean() - for _ in range(self.num_feature_masks): - features = mask_along_axis( - features.unsqueeze(0), - mask_param=self.features_mask_size, - mask_value=mean, - axis=2, - ).squeeze(0) - - _max_tot_mask_frames = self.max_frames_mask_fraction * features.size(0) - num_frame_masks = min(self.num_frame_masks, math.ceil(_max_tot_mask_frames / self.frames_mask_size)) - max_mask_frames = min(self.frames_mask_size, _max_tot_mask_frames // num_frame_masks) - - for _ in range(num_frame_masks): - features = mask_along_axis( - features.unsqueeze(0), - mask_param=max_mask_frames, - mask_value=mean, - axis=1, - ).squeeze(0) - return features - - def state_dict(self) -> Dict: - return dict( - time_warp_factor=self.time_warp_factor, - num_feature_masks=self.num_feature_masks, - features_mask_size=self.features_mask_size, - num_frame_masks=self.num_frame_masks, - frames_mask_size=self.frames_mask_size, - max_frames_mask_fraction=self.max_frames_mask_fraction, - p=self.p, - ) - - def load_state_dict(self, state_dict: Dict): - self.time_warp_factor = state_dict.get( - "time_warp_factor", self.time_warp_factor - ) - self.num_feature_masks = state_dict.get( - "num_feature_masks", self.num_feature_masks - ) - self.features_mask_size = state_dict.get( - "features_mask_size", self.features_mask_size - ) - self.num_frame_masks = state_dict.get("num_frame_masks", self.num_frame_masks) - self.frames_mask_size = state_dict.get( - "frames_mask_size", self.frames_mask_size - ) - self.max_frames_mask_fraction = state_dict.get( - "max_frames_mask_fraction", self.max_frames_mask_fraction - ) - self.p = state_dict.get("p", self.p) - - -def time_warp(features: torch.Tensor, factor: int) -> torch.Tensor: - """ - Time warping as described in the SpecAugment paper. - Implementation based on Espresso: - https://github.com/freewym/espresso/blob/master/espresso/tools/specaug_interpolate.py#L51 - - :param features: input tensor of shape ``(T, F)`` - :param factor: time warping parameter. - :return: a warped tensor of shape ``(T, F)`` - """ - t = features.size(0) - if t - factor <= factor + 1: - return features - center = np.random.randint(factor + 1, t - factor) - warped = np.random.randint(center - factor, center + factor + 1) - if warped == center: - return features - features = features.unsqueeze(0).unsqueeze(0) - left = torch.nn.functional.interpolate( - features[:, :, :center, :], - size=(warped, features.size(3)), - mode="bicubic", - align_corners=False, - ) - right = torch.nn.functional.interpolate( - features[:, :, center:, :], - size=(t - warped, features.size(3)), - mode="bicubic", - align_corners=False, - ) - return torch.cat((left, right), dim=2).squeeze(0).squeeze(0) \ No newline at end of file +../tdnn_lstm_ctc/asr_datamodule.py \ No newline at end of file diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py index e075a2d03..351e99f65 100644 --- a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py +++ b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py @@ -28,7 +28,6 @@ from lhotse.dataset import ( K2SpeechRecognitionDataset, PrecomputedFeatures, SingleCutSampler, - SpecAugment, ) from lhotse.dataset.input_strategies import OnTheFlyFeatures from torch.utils.data import DataLoader @@ -179,6 +178,20 @@ class LibriSpeechAsrDataModule: "with training dataset. ", ) + group.add_argument( + "--max-mask-fraction", + type=float, + default=0.4, + help="the max mask fraction for specaugment.", + ) + + group.add_argument( + "--p", + type=float, + default=0.5, + help="the probility for specaugment.", + ) + def train_dataloaders(self, cuts_train: CutSet) -> DataLoader: logging.info("About to get Musan cuts") cuts_musan = load_manifest( @@ -219,10 +232,12 @@ class LibriSpeechAsrDataModule: input_transforms.append( SpecAugment( time_warp_factor=self.args.spec_aug_time_warp_factor, - num_frame_masks=2, + num_frame_masks=10, features_mask_size=27, num_feature_masks=2, frames_mask_size=100, + max_frames_mask_fraction=self.args.max_mask_fraction, + p=self.args.p, ) ) else: @@ -383,3 +398,213 @@ class LibriSpeechAsrDataModule: def test_other_cuts(self) -> CutSet: logging.info("About to get test-other cuts") return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz") + + +import math +import random +import numpy as np +from typing import Optional, Dict + +import torch + +from lhotse import CutSet + +class SpecAugment(torch.nn.Module): + """ + SpecAugment performs three augmentations: + - time warping of the feature matrix + - masking of ranges of features (frequency bands) + - masking of ranges of frames (time) + + The current implementation works with batches, but processes each example separately + in a loop rather than simultaneously to achieve different augmentation parameters for + each example. + """ + + def __init__( + self, + time_warp_factor: Optional[int] = 80, + num_feature_masks: int = 1, + features_mask_size: int = 13, + num_frame_masks: int = 1, + frames_mask_size: int = 70, + max_frames_mask_fraction: float = 0.2, + p=0.5, + ): + """ + SpecAugment's constructor. + + :param time_warp_factor: parameter for the time warping; larger values mean more warping. + Set to ``None``, or less than ``1``, to disable. + :param num_feature_masks: how many feature masks should be applied. Set to ``0`` to disable. + :param features_mask_size: the width of the feature mask (expressed in the number of masked feature bins). + This is the ``F`` parameter from the SpecAugment paper. + :param num_frame_masks: how many frame (temporal) masks should be applied. Set to ``0`` to disable. + :param frames_mask_size: the width of the frame (temporal) masks (expressed in the number of masked frames). + This is the ``T`` parameter from the SpecAugment paper. + :param max_frames_mask_fraction: limits the size of the frame (temporal) mask to this value times the length + of the utterance (or supervision segment). + This is the parameter denoted by ``p`` in the SpecAugment paper. + :param p: the probability of applying this transform. + It is different from ``p`` in the SpecAugment paper! + """ + super().__init__() + assert 0 <= p <= 1 + assert num_feature_masks >= 0 + assert num_frame_masks >= 0 + assert features_mask_size > 0 + assert frames_mask_size > 0 + self.time_warp_factor = time_warp_factor + self.num_feature_masks = num_feature_masks + self.features_mask_size = features_mask_size + self.num_frame_masks = num_frame_masks + self.frames_mask_size = frames_mask_size + self.max_frames_mask_fraction = max_frames_mask_fraction + self.p = p + + def forward( + self, + features: torch.Tensor, + supervision_segments: Optional[torch.IntTensor] = None, + *args, + **kwargs, + ) -> torch.Tensor: + """ + Computes SpecAugment for a batch of feature matrices. + + Since the batch will usually already be padded, the user can optionally + provide a ``supervision_segments`` tensor that will be used to apply SpecAugment + only to selected areas of the input. The format of this input is described below. + + :param features: a batch of feature matrices with shape ``(B, T, F)``. + :param supervision_segments: an int tensor of shape ``(S, 3)``. ``S`` is the number of + supervision segments that exist in ``features`` -- there may be either + less or more than the batch size. + The second dimension encoder three kinds of information: + the sequence index of the corresponding feature matrix in `features`, + the start frame index, and the number of frames for each segment. + :return: an augmented tensor of shape ``(B, T, F)``. + """ + assert len(features.shape) == 3, ( + "SpecAugment only supports batches of " "single-channel feature matrices." + ) + features = features.clone() + if supervision_segments is None: + # No supervisions - apply spec augment to full feature matrices. + for sequence_idx in range(features.size(0)): + features[sequence_idx] = self._forward_single(features[sequence_idx]) + else: + # Supervisions provided - we will apply time warping only on the supervised areas. + for sequence_idx, start_frame, num_frames in supervision_segments: + end_frame = start_frame + num_frames + features[sequence_idx, start_frame:end_frame] = self._forward_single( + features[sequence_idx, start_frame:end_frame], warp=True, mask=False + ) + # ... and then time-mask the full feature matrices. Note that in this mode, + # it might happen that masks are applied to different sequences/examples + # than the time warping. + for sequence_idx in range(features.size(0)): + features[sequence_idx] = self._forward_single( + features[sequence_idx], warp=False, mask=True + ) + return features + + def _forward_single( + self, features: torch.Tensor, warp: bool = True, mask: bool = True + ) -> torch.Tensor: + """ + Apply SpecAugment to a single feature matrix of shape (T, F). + """ + if random.random() > self.p: + # Randomly choose whether this transform is applied + return features + if warp: + if self.time_warp_factor is not None and self.time_warp_factor >= 1: + features = time_warp(features, factor=self.time_warp_factor) + if mask: + from torchaudio.functional import mask_along_axis + + mean = features.mean() + for _ in range(self.num_feature_masks): + features = mask_along_axis( + features.unsqueeze(0), + mask_param=self.features_mask_size, + mask_value=mean, + axis=2, + ).squeeze(0) + + _max_tot_mask_frames = self.max_frames_mask_fraction * features.size(0) + num_frame_masks = min(self.num_frame_masks, math.ceil(_max_tot_mask_frames / self.frames_mask_size)) + max_mask_frames = min(self.frames_mask_size, _max_tot_mask_frames // num_frame_masks) + + for _ in range(num_frame_masks): + features = mask_along_axis( + features.unsqueeze(0), + mask_param=max_mask_frames, + mask_value=mean, + axis=1, + ).squeeze(0) + return features + + def state_dict(self) -> Dict: + return dict( + time_warp_factor=self.time_warp_factor, + num_feature_masks=self.num_feature_masks, + features_mask_size=self.features_mask_size, + num_frame_masks=self.num_frame_masks, + frames_mask_size=self.frames_mask_size, + max_frames_mask_fraction=self.max_frames_mask_fraction, + p=self.p, + ) + + def load_state_dict(self, state_dict: Dict): + self.time_warp_factor = state_dict.get( + "time_warp_factor", self.time_warp_factor + ) + self.num_feature_masks = state_dict.get( + "num_feature_masks", self.num_feature_masks + ) + self.features_mask_size = state_dict.get( + "features_mask_size", self.features_mask_size + ) + self.num_frame_masks = state_dict.get("num_frame_masks", self.num_frame_masks) + self.frames_mask_size = state_dict.get( + "frames_mask_size", self.frames_mask_size + ) + self.max_frames_mask_fraction = state_dict.get( + "max_frames_mask_fraction", self.max_frames_mask_fraction + ) + self.p = state_dict.get("p", self.p) + + +def time_warp(features: torch.Tensor, factor: int) -> torch.Tensor: + """ + Time warping as described in the SpecAugment paper. + Implementation based on Espresso: + https://github.com/freewym/espresso/blob/master/espresso/tools/specaug_interpolate.py#L51 + + :param features: input tensor of shape ``(T, F)`` + :param factor: time warping parameter. + :return: a warped tensor of shape ``(T, F)`` + """ + t = features.size(0) + if t - factor <= factor + 1: + return features + center = np.random.randint(factor + 1, t - factor) + warped = np.random.randint(center - factor, center + factor + 1) + if warped == center: + return features + features = features.unsqueeze(0).unsqueeze(0) + left = torch.nn.functional.interpolate( + features[:, :, :center, :], + size=(warped, features.size(3)), + mode="bicubic", + align_corners=False, + ) + right = torch.nn.functional.interpolate( + features[:, :, center:, :], + size=(t - warped, features.size(3)), + mode="bicubic", + align_corners=False, + ) + return torch.cat((left, right), dim=2).squeeze(0).squeeze(0)