working changes for musan mixing

This commit is contained in:
Bailey Hirota 2025-07-15 13:47:59 +09:00
parent 259fafab55
commit 7995b2e909

View File

@ -7,7 +7,7 @@ from lhotse import CutSet, load_manifest
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"): def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str) -> CutSet:
""" """
Updates the storage_path in a CutSet's features to reflect the new dataset-specific Updates the storage_path in a CutSet's features to reflect the new dataset-specific
feature directory structure. feature directory structure.
@ -24,24 +24,8 @@ def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "dat
for cut in cuts: for cut in cuts:
if cut.features is not None: if cut.features is not None:
original_storage_path = Path(cut.features.storage_path) original_storage_path = Path(cut.features.storage_path)
# Check if the path needs updating, i.e., if it's still pointing to the old flat structure
# and isn't already pointing to the new dataset-specific structure.
# We assume old_feature_prefix is 'data/manifests'
# and original_storage_path looks like 'data/manifests/feats_train/feats-12.lca'
# We want to change it to 'data/manifests/<dataset_name>/feats_train/feats-12.lca'
# The check `original_storage_path.parts[1]` ensures it's indeed under 'manifests' and not already processed.
# And `not original_storage_path.parts[2].startswith(dataset_name)` ensures we don't re-process.
if len(original_storage_path.parts) >= 3 and \
original_storage_path.parts[0] == old_feature_prefix.split(os.sep)[0] and \
original_storage_path.parts[1] == old_feature_prefix.split(os.sep)[1] and \
not original_storage_path.parts[2].startswith(dataset_name): # Assumes dataset_name does not start with feats_
# This gives us 'feats_train/feats-12.lca'
# It's important to be robust to potentially different original prefixes
# So we take the part of the path *after* the `old_feature_prefix`
try: try:
relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix) relative_path = original_storage_path.relative_to(old_feature_prefix)
except ValueError: except ValueError:
# If for some reason the path doesn't start with old_feature_prefix, # If for some reason the path doesn't start with old_feature_prefix,
# keep it as is. This can happen if some paths are already absolute or different. # keep it as is. This can happen if some paths are already absolute or different.
@ -49,14 +33,19 @@ def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "dat
updated_cuts.append(cut) updated_cuts.append(cut)
continue continue
# Avoid double-nesting (e.g., reazonspeech/reazonspeech/...)
# Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
new_storage_path = Path("data/manifests") / dataset_name / relative_path_from_old_prefix if relative_path.parts[0] == dataset_name:
cut = cut.with_features(cut.features.with_path(str(new_storage_path))) new_storage_path = Path("data/manifests") / relative_path
# cut.features.storage_path = str(new_storage_path) else:
new_storage_path = Path("data/manifests") / dataset_name / relative_path
logger.info(f"Updating cut {cut.id}: {original_storage_path}{new_storage_path}")
cut.features.storage_path = str(new_storage_path)
updated_cuts.append(cut) updated_cuts.append(cut)
else: else:
logger.warning(f"Skipping update for cut {cut.id}: has no features.")
updated_cuts.append(cut) # No features, or not a path we need to modify updated_cuts.append(cut) # No features, or not a path we need to modify
logger.warning(f"Skipping update for: {original_storage_path}")
return CutSet.from_cuts(updated_cuts) return CutSet.from_cuts(updated_cuts)