Support using different MUSAN augmentations for the same audio.

In addition, the dataset also returns the original audio without augmentation.
This commit is contained in:
Fangjun Kuang 2025-07-01 16:58:31 +08:00
parent 075e74bcb5
commit 85f6deb8d1

View File

@ -103,13 +103,15 @@ class K2SpeechRecognitionDataset(torch.utils.data.Dataset):
# Sort the cuts by duration so that the first one determines the batch time dimensions. # Sort the cuts by duration so that the first one determines the batch time dimensions.
cuts = cuts.sort_by_duration(ascending=False) cuts = cuts.sort_by_duration(ascending=False)
# Optional CutSet transforms - e.g. padding, or speed perturbation that adjusts if self.cut_transforms:
# the supervision boundaries. orig_cuts = cuts
cuts = cuts.repeat(times=2)
for tnfm in self.cut_transforms: for tnfm in self.cut_transforms:
cuts = tnfm(cuts) cuts = tnfm(cuts)
# Sort the cuts again after transforms cuts = orig_cuts + cuts
cuts = cuts.sort_by_duration(ascending=False)
# Get a tensor with batched feature matrices, shape (B, T, F) # Get a tensor with batched feature matrices, shape (B, T, F)
# Collation performs auto-padding, if necessary. # Collation performs auto-padding, if necessary.
@ -117,7 +119,7 @@ class K2SpeechRecognitionDataset(torch.utils.data.Dataset):
if len(input_tpl) == 3: if len(input_tpl) == 3:
# An input strategy with fault tolerant audio reading mode. # An input strategy with fault tolerant audio reading mode.
# "cuts" may be a subset of the original "cuts" variable, # "cuts" may be a subset of the original "cuts" variable,
# that only has cuts for which we succesfully read the audio. # that only has cuts for which we successfully read the audio.
inputs, _, cuts = input_tpl inputs, _, cuts = input_tpl
else: else:
inputs, _ = input_tpl inputs, _ = input_tpl