Fix speed perturbation issue

This commit is contained in:
root 2025-05-15 22:45:04 -07:00
parent bfb4ebeb83
commit 0e8c1db4d0
2 changed files with 7 additions and 6 deletions

View File

@ -632,9 +632,10 @@ class AsrDataModule:
@lru_cache()
def train_cuts_librispeech(self) -> CutSet:
logging.info("About to get train cuts")
# librispeech_path="fixie-ai/librispeech_asr"
librispeech_path = "/workspace/slam/librispeech_asr"
if self.args.huggingface_dataset_path_or_name is not None:
librispeech_path = self.args.huggingface_dataset_path_or_name + "/librispeech_asr"
else:
librispeech_path = "fixie-ai/librispeech_asr"
# 148_688
librispeech_other = load_dataset(
librispeech_path, "other", split="train.500", streaming=True

View File

@ -867,7 +867,7 @@ def run(rank, world_size, args):
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
if c.duration < 1.0 or c.duration > 29.0:
if c.duration < 1.0 or c.duration > 25.0:
logging.warning(
f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
)
@ -892,9 +892,9 @@ def run(rank, world_size, args):
train_cuts = data_module.train_cuts_en_vocalnet()
valid_cuts = data_module.valid_cuts_en_vocalnet()
elif params.dataset_format == "speech_continuation":
# train_cuts = data_module.train_cuts_ultravox()
train_cuts = data_module.train_cuts_ultravox()
# train_cuts = data_module.train_cuts_gigaspeech()
train_cuts = data_module.train_cuts_librispeech()
# train_cuts = data_module.train_cuts_librispeech()
valid_cuts = data_module.valid_cuts_ultravox()
else:
raise ValueError(f"Unknown dataset format: {params.dataset_format}")