working changes for musan mixing

2025-12-11 06:55:27 +00:00 · 2025-07-15 13:47:59 +09:00 · 2025-07-15 13:47:59 +09:00 · 7995b2e909
commit 7995b2e909
parent 259fafab55
1 changed files with 21 additions and 32 deletions
--- a/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py
+++ b/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py
@ -7,7 +7,7 @@ from lhotse import CutSet, load_manifest
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"):
+def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str) -> CutSet:
 """
    Updates the storage_path in a CutSet's features to reflect the new dataset-specific
    feature directory structure.
@ -24,24 +24,8 @@ def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "dat
    for cut in cuts:
        if cut.features is not None:
            original_storage_path = Path(cut.features.storage_path)
            # Check if the path needs updating, i.e., if it's still pointing to the old flat structure
            # and isn't already pointing to the new dataset-specific structure.
            # We assume old_feature_prefix is 'data/manifests'
            # and original_storage_path looks like 'data/manifests/feats_train/feats-12.lca'
            # We want to change it to 'data/manifests/<dataset_name>/feats_train/feats-12.lca'
            # The check `original_storage_path.parts[1]` ensures it's indeed under 'manifests' and not already processed.
            # And `not original_storage_path.parts[2].startswith(dataset_name)` ensures we don't re-process.
            if len(original_storage_path.parts) >= 3 and \
                    original_storage_path.parts[0] == old_feature_prefix.split(os.sep)[0] and \
                    original_storage_path.parts[1] == old_feature_prefix.split(os.sep)[1] and \
                    not original_storage_path.parts[2].startswith(dataset_name): # Assumes dataset_name does not start with feats_
                        # This gives us 'feats_train/feats-12.lca'
                # It's important to be robust to potentially different original prefixes
                # So we take the part of the path *after* the `old_feature_prefix`
            try:
-                    relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
+                relative_path = original_storage_path.relative_to(old_feature_prefix)
            except ValueError:
 				# If for some reason the path doesn't start with old_feature_prefix,
 				# keep it as is. This can happen if some paths are already absolute or different.
@ -49,14 +33,19 @@ def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "dat
                updated_cuts.append(cut)
                continue
            # Avoid double-nesting (e.g., reazonspeech/reazonspeech/...)
            # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
-                new_storage_path = Path("data/manifests") / dataset_name / relative_path_from_old_prefix
+			if relative_path.parts[0] == dataset_name:
-                cut = cut.with_features(cut.features.with_path(str(new_storage_path)))
+                    new_storage_path = Path("data/manifests") / relative_path
-                # cut.features.storage_path = str(new_storage_path)
+            else:
                    new_storage_path = Path("data/manifests") / dataset_name / relative_path
            logger.info(f"Updating cut {cut.id}: {original_storage_path} → {new_storage_path}")
            cut.features.storage_path = str(new_storage_path)
            updated_cuts.append(cut)
        else:
            logger.warning(f"Skipping update for cut {cut.id}: has no features.")
            updated_cuts.append(cut) # No features, or not a path we need to modify
            logger.warning(f"Skipping update for: {original_storage_path}")
    return CutSet.from_cuts(updated_cuts)