working changes for musan mixing

This commit is contained in:
Bailey Hirota 2025-07-15 13:47:59 +09:00
parent 259fafab55
commit 7995b2e909

View File

@@ -7,8 +7,8 @@ from lhotse import CutSet, load_manifest
# Configure logging at INFO level and create the logger named after this module.
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str) -> CutSet:
    """
    Re-root each cut's feature ``storage_path`` under a dataset-specific directory.

    For every cut that has features, the part of its ``storage_path`` that lies
    below ``old_feature_prefix`` is moved under ``data/manifests/<dataset_name>/``.
    For example, with ``old_feature_prefix="data/manifests"``::

        data/manifests/feats_train/feats-12.lca
        -> data/manifests/<dataset_name>/feats_train/feats-12.lca

    A path whose first component below the prefix is already ``dataset_name``
    is not nested a second time.

    Args:
        cuts: The cuts whose feature storage paths should be rewritten.
        dataset_name: Name of the dataset sub-directory to insert.
        old_feature_prefix: The prefix that the original feature paths were
            relative to. This typically corresponds to the root of the
            manifests dir in the original recipe.

    Returns:
        A CutSet built from all input cuts, in iteration order. Cuts without
        features, and cuts whose feature path is not located under
        ``old_feature_prefix``, are passed through unchanged (a warning is
        logged for each).

    Note:
        ``cut.features.storage_path`` is assigned in place, so the cut objects
        yielded by ``cuts`` are themselves modified.
    """
    manifests_root = Path("data/manifests")
    updated_cuts = []

    for cut in cuts:
        features = cut.features
        if features is None:
            logger.warning(f"Skipping update for cut {cut.id}: has no features.")
            updated_cuts.append(cut)
            continue

        original_storage_path = Path(features.storage_path)
        try:
            relative_path = original_storage_path.relative_to(old_feature_prefix)
        except ValueError:
            # The path is not located under old_feature_prefix (e.g. it is
            # absolute or uses a different layout): keep the cut as it is.
            logger.warning(
                f"Feature path '{original_storage_path}' does not start with "
                f"'{old_feature_prefix}'. Skipping update for this cut."
            )
            updated_cuts.append(cut)
            continue

        # Avoid double-nesting (e.g., reazonspeech/reazonspeech/...).
        # `parts` is empty when the storage path equals the prefix itself, so
        # test for that before indexing to avoid an IndexError.
        parts = relative_path.parts
        if parts and parts[0] == dataset_name:
            new_storage_path = manifests_root / relative_path
        else:
            # data/manifests/<dataset_name>/feats_train/feats-12.lca
            new_storage_path = manifests_root / dataset_name / relative_path

        logger.info(
            f"Updating cut {cut.id}: {original_storage_path} -> {new_storage_path}"
        )
        features.storage_path = str(new_storage_path)
        updated_cuts.append(cut)

    return CutSet.from_cuts(updated_cuts)