Add a utility file for updating the storage_path of CutSet features to match the multilingual training recipe's directory structure

2025-08-10 02:22:17 +00:00 · 2025-06-06 11:42:08 +09:00 · 2025-06-06 11:42:08 +09:00 · 052fcc3218
commit 052fcc3218
parent 6255ba5cb2
1 changed files with 68 additions and 48 deletions
--- a/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py
+++ b/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py
@ -1,5 +1,6 @@
 import logging
 from pathlib import Path
 import os # Import os module to handle symlinks
 from lhotse import CutSet, load_manifest
@ -8,7 +9,8 @@ logger = logging.getLogger(__name__)
 def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"):
    """
-    Updates the storage_path in a CutSet's features to reflect the structure in multi_ja_en.
+    Updates the storage_path in a CutSet's features to reflect the new dataset-specific
    feature directory structure.
    Args:
        cuts: The Lhotse CutSet to modify.
@ -18,48 +20,55 @@ def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "dat
                            This typically corresponds to the root of the manifests dir
                            in the original recipe.
    """
-    # updated_cuts = []
+    updated_cuts = []
-    # for cut in cuts:
+    for cut in cuts:
-    #     if cut.features is not None:
+        if cut.features is not None:
-    #         original_storage_path = Path(cut.features.storage_path)
+            original_storage_path = Path(cut.features.storage_path)
-    #         # Check if the path needs updating, i.e., if it's still pointing to the old flat structure
+            # Check if the path needs updating, i.e., if it's still pointing to the old flat structure
-    #         # and isn't already pointing to the new dataset-specific structure.
+            # and isn't already pointing to the new dataset-specific structure.
-    #         # The `startswith` check on the original path is crucial here.
+            # We assume old_feature_prefix is 'data/manifests'
-    #         # Example: 'data/manifests/feats_train/feats-12.lca'
+            # and original_storage_path looks like 'data/manifests/feats_train/feats-12.lca'
-    #         if original_storage_path.parts[0] == old_feature_prefix.split('/')[0] and \
+            # We want to change it to 'data/manifests/<dataset_name>/feats_train/feats-12.lca'
-    #            original_storage_path.parts[1] == old_feature_prefix.split('/')[1] and \
+            
-    #            not original_storage_path.parts[2].startswith(dataset_name):
+            # The checks on `original_storage_path.parts[0]` and `parts[1]` ensure the path is indeed under the old prefix (e.g. 'data/manifests').
            # And `not original_storage_path.parts[2].startswith(dataset_name)` ensures we don't re-process.
            if len(original_storage_path.parts) >= 3 and \
               original_storage_path.parts[0] == old_feature_prefix.split(os.sep)[0] and \
               original_storage_path.parts[1] == old_feature_prefix.split(os.sep)[1] and \
               not original_storage_path.parts[2].startswith(dataset_name): # Assumes dataset_name does not start with feats_
-    #             # Assuming the original feature files were structured like
+                # This gives us 'feats_train/feats-12.lca'
-    #             # data/manifests/feats_train/some_file.lca
+                # It's important to be robust to potentially different original prefixes
-    #             # We want to change them to data/manifests/reazonspeech/feats_train/some_file.lca
+                # So we take the part of the path *after* the `old_feature_prefix`
                try:
                    relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
                except ValueError:
                    # If for some reason the path doesn't start with old_feature_prefix,
                    # keep it as is. This can happen if some paths are already absolute or different.
                    logger.warning(f"Feature path '{original_storage_path}' does not start with '{old_feature_prefix}'. Skipping update for this cut.")
                    updated_cuts.append(cut)
                    continue
-    #             # This gives us 'feats_train/feats-12.lca'
+                # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
-    #             relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
+                new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix
-
+                # cut = cut.with_features(cut.features.with_path(str(new_storage_path)))
-    #             # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
+                cut.features.storage_path = str(new_storage_path)
-    #             new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix
+            updated_cuts.append(cut)
-    #             cut = cut.with_features_path_prefix(cut.features.with_path(str(new_storage_path)))
+        else:
-    #         updated_cuts.append(cut)
+            updated_cuts.append(cut) # No features, or not a path we need to modify
-    #     else:
+    return CutSet.from_cuts(updated_cuts)
    #         updated_cuts.append(cut) # No features, or not a path we need to modify
    # return CutSet.from_cuts(updated_cuts)
    return cuts.with_features_path_prefix(old_feature_prefix + "/" + dataset_name)
 if __name__ == "__main__":
    # The root where the symlinked manifests are located in the multi_ja_en recipe
    multi_recipe_manifests_root = Path("data/manifests")
    # Define the datasets and their *specific* manifest file prefixes
    # The keys are the dataset names (which are also the subdirectory names)
    # The values are the base filename for their cuts (e.g., "reazonspeech_cuts", "mls_eng_cuts")
    dataset_manifest_prefixes = {
        "reazonspeech": "reazonspeech_cuts",
        "mls_english": "mls_eng_cuts",
    }
    # Define the splits. The script will append "_dev.jsonl.gz", "_train.jsonl.gz", etc.
    splits = ["train", "dev", "test"]
    # This is the path segment *inside* the original recipe's data/manifests
@ -78,26 +87,37 @@ if __name__ == "__main__":
        for split in splits:
            # Construct the path to the symlinked manifest file
            manifest_filename = f"{manifest_prefix}_{split}.jsonl.gz"
-            manifest_path = dataset_symlink_dir / manifest_filename
+            symlink_path = dataset_symlink_dir / manifest_filename # This is the path to the symlink itself
-            if manifest_path.is_file():
+            if symlink_path.is_symlink(): # Check if it's actually a symlink
-                logger.info(f"Processing {dataset_name} {split} cuts from symlink: {manifest_path}")
+                # Get the actual path to the target file that the symlink points to
-                try:
+                # Lhotse's load_manifest will follow this symlink automatically.
-                    # Load the manifest (Lhotse will follow the symlink)
+                target_path = os.path.realpath(symlink_path)
-                    cuts = load_manifest(manifest_path)
+                logger.info(f"Processing symlink '{symlink_path}' pointing to '{target_path}'")
-
+            elif symlink_path.is_file(): # If it's a regular file (not a symlink)
-                    # Update the storage_path within the loaded cuts
+                logger.info(f"Processing regular file: {symlink_path}")
-                    # The `old_feature_prefix` is still 'data/manifests' as that's what the original
+                target_path = symlink_path # Use its own path as target
                    # paths in the underlying manifest refer to.
                    updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
                    # Save the updated cuts back to the *symlinked* path.
                    # Lhotse will write to the target of the symlink.
                    updated_cuts.to_file(manifest_path)
                    logger.info(f"Updated {dataset_name} {split} cuts saved to: {manifest_path}")
                except Exception as e:
                    logger.error(f"Error processing {manifest_path}: {e}", exc_info=True) # Print full traceback
            else:
-                logger.warning(f"Manifest file not found (symlink target might be missing or file name mismatch): {manifest_path}")
+                logger.warning(f"Manifest file not found or neither a file nor a symlink: {symlink_path}")
                continue # Skip to next iteration
            try:
                # Load the manifest. Lhotse will resolve the symlink internally for reading.
                cuts = load_manifest(symlink_path) # Use symlink_path here, Lhotse handles resolution for loading
                # Update the storage_path within the loaded cuts (in memory)
                updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
                # --- CRITICAL CHANGE HERE ---
                # Save the *modified* CutSet to the path of the symlink *itself*.
                # This will overwrite the symlink with the new file, effectively
                # breaking the symlink and creating a new file in its place.
                os.unlink(symlink_path)
                updated_cuts.to_file(symlink_path)
                logger.info(f"Updated {dataset_name} {split} cuts saved (overwriting symlink) to: {symlink_path}")
            except Exception as e:
                logger.error(f"Error processing {symlink_path}: {e}", exc_info=True)
    logger.info("CutSet path updating complete.")