From 85f6deb8d18c899000ff07fb51d47708eb42c8c3 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 1 Jul 2025 16:58:31 +0800 Subject: [PATCH] Support using different musan augmentations for the same audio. In addition, it returns the original audio without augmentation. --- .../ASR/zipformer/speech_recognition.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/egs/librispeech/ASR/zipformer/speech_recognition.py b/egs/librispeech/ASR/zipformer/speech_recognition.py index 4a3520b37..828602fcb 100644 --- a/egs/librispeech/ASR/zipformer/speech_recognition.py +++ b/egs/librispeech/ASR/zipformer/speech_recognition.py @@ -103,13 +103,15 @@ class K2SpeechRecognitionDataset(torch.utils.data.Dataset): # Sort the cuts by duration so that the first one determines the batch time dimensions. cuts = cuts.sort_by_duration(ascending=False) - # Optional CutSet transforms - e.g. padding, or speed perturbation that adjusts - # the supervision boundaries. - for tnfm in self.cut_transforms: - cuts = tnfm(cuts) + if self.cut_transforms: + orig_cuts = cuts - # Sort the cuts again after transforms - cuts = cuts.sort_by_duration(ascending=False) + cuts = cuts.repeat(times=2) + + for tnfm in self.cut_transforms: + cuts = tnfm(cuts) + + cuts = orig_cuts + cuts # Get a tensor with batched feature matrices, shape (B, T, F) # Collation performs auto-padding, if necessary. @@ -117,7 +119,7 @@ class K2SpeechRecognitionDataset(torch.utils.data.Dataset): if len(input_tpl) == 3: # An input strategy with fault tolerant audio reading mode. # "cuts" may be a subset of the original "cuts" variable, - # that only has cuts for which we succesfully read the audio. + # that only has cuts for which we successfully read the audio. inputs, _, cuts = input_tpl else: inputs, _ = input_tpl