add utility file for updating the storage_path of cutsets for use in the multilingual training recipe directory structure

2025-08-09 10:02:22 +00:00 · 2025-06-06 11:42:08 +09:00 · 2025-06-06 11:42:08 +09:00 · 052fcc3218
commit 052fcc3218
parent 6255ba5cb2
1 changed files with 68 additions and 48 deletions
--- a/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py
+++ b/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py
@ -1,5 +1,6 @@
 import logging
 from pathlib import Path
+import os # Import os module to handle symlinks

 from lhotse import CutSet, load_manifest

@ -8,7 +9,8 @@ logger = logging.getLogger(__name__)

 def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"):
    """
-    Updates the storage_path in a CutSet's features to reflect the structure in multi_ja_en.
+    Updates the storage_path in a CutSet's features to reflect the new dataset-specific
+    feature directory structure.

    Args:
        cuts: The Lhotse CutSet to modify.
@ -18,48 +20,55 @@ def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "dat
                            This typically corresponds to the root of the manifests dir
                            in the original recipe.
    """
-    # updated_cuts = []
-    # for cut in cuts:
-    #     if cut.features is not None:
-    #         original_storage_path = Path(cut.features.storage_path)
+    updated_cuts = []
+    for cut in cuts:
+        if cut.features is not None:
+            original_storage_path = Path(cut.features.storage_path)

-    #         # Check if the path needs updating, i.e., if it's still pointing to the old flat structure
-    #         # and isn't already pointing to the new dataset-specific structure.
-    #         # The `startswith` check on the original path is crucial here.
-    #         # Example: 'data/manifests/feats_train/feats-12.lca'
-    #         if original_storage_path.parts[0] == old_feature_prefix.split('/')[0] and \
-    #            original_storage_path.parts[1] == old_feature_prefix.split('/')[1] and \
-    #            not original_storage_path.parts[2].startswith(dataset_name):
+            # Check if the path needs updating, i.e., if it's still pointing to the old flat structure
+            # and isn't already pointing to the new dataset-specific structure.
+            # We assume old_feature_prefix is 'data/manifests'
+            # and original_storage_path looks like 'data/manifests/feats_train/feats-12.lca'
+            # We want to change it to 'data/manifests/<dataset_name>/feats_train/feats-12.lca'
+            
+            # The `parts[0]` and `parts[1]` checks confirm the path sits under `old_feature_prefix` (only its first two components are compared).
+            # `not original_storage_path.parts[2].startswith(dataset_name)` skips paths that were already rewritten, so they are not re-processed.
+            if len(original_storage_path.parts) >= 3 and \
+               original_storage_path.parts[0] == old_feature_prefix.split(os.sep)[0] and \
+               original_storage_path.parts[1] == old_feature_prefix.split(os.sep)[1] and \
+               not original_storage_path.parts[2].startswith(dataset_name): # Assumes dataset_name does not start with feats_

-    #             # Assuming the original feature files were structured like
-    #             # data/manifests/feats_train/some_file.lca
-    #             # We want to change them to data/manifests/reazonspeech/feats_train/some_file.lca
+                # This gives us 'feats_train/feats-12.lca'
+                # It's important to be robust to potentially different original prefixes
+                # So we take the part of the path *after* the `old_feature_prefix`
+                try:
+                    relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
+                except ValueError:
+                    # If for some reason the path doesn't start with old_feature_prefix,
+                    # keep it as is. This can happen if some paths are already absolute or different.
+                    logger.warning(f"Feature path '{original_storage_path}' does not start with '{old_feature_prefix}'. Skipping update for this cut.")
+                    updated_cuts.append(cut)
+                    continue

-    #             # This gives us 'feats_train/feats-12.lca'
-    #             relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
-
-    #             # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
-    #             new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix
-    #             cut = cut.with_features_path_prefix(cut.features.with_path(str(new_storage_path)))
-    #         updated_cuts.append(cut)
-    #     else:
-    #         updated_cuts.append(cut) # No features, or not a path we need to modify
-    # return CutSet.from_cuts(updated_cuts)
-    return cuts.with_features_path_prefix(old_feature_prefix + "/" + dataset_name)
+                # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
+                new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix
+                # cut = cut.with_features(cut.features.with_path(str(new_storage_path)))
+                cut.features.storage_path = str(new_storage_path)
+            updated_cuts.append(cut)
+        else:
+            updated_cuts.append(cut) # No features, or not a path we need to modify
+    return CutSet.from_cuts(updated_cuts)

 if __name__ == "__main__":
    # The root where the symlinked manifests are located in the multi_ja_en recipe
    multi_recipe_manifests_root = Path("data/manifests")

    # Define the datasets and their *specific* manifest file prefixes
-    # The keys are the dataset names (which are also the subdirectory names)
-    # The values are the base filename for their cuts (e.g., "reazonspeech_cuts", "mls_eng_cuts")
    dataset_manifest_prefixes = {
        "reazonspeech": "reazonspeech_cuts",
        "mls_english": "mls_eng_cuts",
    }

-    # Define the splits. The script will append "_dev.jsonl.gz", "_train.jsonl.gz", etc.
    splits = ["train", "dev", "test"]

    # This is the path segment *inside* the original recipe's data/manifests
@ -78,26 +87,37 @@ if __name__ == "__main__":
        for split in splits:
            # Construct the path to the symlinked manifest file
            manifest_filename = f"{manifest_prefix}_{split}.jsonl.gz"
-            manifest_path = dataset_symlink_dir / manifest_filename
+            symlink_path = dataset_symlink_dir / manifest_filename # This is the path to the symlink itself

-            if manifest_path.is_file():
-                logger.info(f"Processing {dataset_name} {split} cuts from symlink: {manifest_path}")
-                try:
-                    # Load the manifest (Lhotse will follow the symlink)
-                    cuts = load_manifest(manifest_path)
-
-                    # Update the storage_path within the loaded cuts
-                    # The `old_feature_prefix` is still 'data/manifests' as that's what the original
-                    # paths in the underlying manifest refer to.
-                    updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
-
-                    # Save the updated cuts back to the *symlinked* path.
-                    # Lhotse will write to the target of the symlink.
-                    updated_cuts.to_file(manifest_path)
-                    logger.info(f"Updated {dataset_name} {split} cuts saved to: {manifest_path}")
-                except Exception as e:
-                    logger.error(f"Error processing {manifest_path}: {e}", exc_info=True) # Print full traceback
+            if symlink_path.is_symlink(): # Check if it's actually a symlink
+                # Get the actual path to the target file that the symlink points to
+                # Lhotse's load_manifest will follow this symlink automatically.
+                target_path = os.path.realpath(symlink_path)
+                logger.info(f"Processing symlink '{symlink_path}' pointing to '{target_path}'")
+            elif symlink_path.is_file(): # If it's a regular file (not a symlink)
+                logger.info(f"Processing regular file: {symlink_path}")
+                target_path = symlink_path # Use its own path as target
            else:
-                logger.warning(f"Manifest file not found (symlink target might be missing or file name mismatch): {manifest_path}")
+                logger.warning(f"Manifest file not found or neither a file nor a symlink: {symlink_path}")
+                continue # Skip to next iteration
+
+
+            try:
+                # Load the manifest. Lhotse will resolve the symlink internally for reading.
+                cuts = load_manifest(symlink_path) # Use symlink_path here, Lhotse handles resolution for loading
+
+                # Update the storage_path within the loaded cuts (in memory)
+                updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
+
+                # --- CRITICAL CHANGE HERE ---
+                # Save the *modified* CutSet to the path of the symlink *itself*.
+                # The existing entry (symlink or regular file) is unlinked first, so the
+                # write creates a new regular file there; a symlink's target is left untouched.
+                os.unlink(symlink_path)
+                updated_cuts.to_file(symlink_path)
+                logger.info(f"Updated {dataset_name} {split} cuts saved (overwriting symlink) to: {symlink_path}")
+
+            except Exception as e:
+                logger.error(f"Error processing {symlink_path}: {e}", exc_info=True)

    logger.info("CutSet path updating complete.")