Add utility file for updating the storage_path of CutSets for use in the multilingual training recipe directory structure

This commit is contained in:
Kinan Martin 2025-06-06 11:42:08 +09:00
parent 6255ba5cb2
commit 052fcc3218

View File

@ -1,5 +1,6 @@
import logging
from pathlib import Path
import os # Import os module to handle symlinks
from lhotse import CutSet, load_manifest
@ -8,7 +9,8 @@ logger = logging.getLogger(__name__)
def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"):
"""
Updates the storage_path in a CutSet's features to reflect the structure in multi_ja_en.
Updates the storage_path in a CutSet's features to reflect the new dataset-specific
feature directory structure.
Args:
cuts: The Lhotse CutSet to modify.
@ -18,48 +20,55 @@ def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "dat
This typically corresponds to the root of the manifests dir
in the original recipe.
"""
# updated_cuts = []
# for cut in cuts:
# if cut.features is not None:
# original_storage_path = Path(cut.features.storage_path)
updated_cuts = []
for cut in cuts:
if cut.features is not None:
original_storage_path = Path(cut.features.storage_path)
# # Check if the path needs updating, i.e., if it's still pointing to the old flat structure
# # and isn't already pointing to the new dataset-specific structure.
# # The `startswith` check on the original path is crucial here.
# # Example: 'data/manifests/feats_train/feats-12.lca'
# if original_storage_path.parts[0] == old_feature_prefix.split('/')[0] and \
# original_storage_path.parts[1] == old_feature_prefix.split('/')[1] and \
# not original_storage_path.parts[2].startswith(dataset_name):
# Check if the path needs updating, i.e., if it's still pointing to the old flat structure
# and isn't already pointing to the new dataset-specific structure.
# We assume old_feature_prefix is 'data/manifests'
# and original_storage_path looks like 'data/manifests/feats_train/feats-12.lca'
# We want to change it to 'data/manifests/<dataset_name>/feats_train/feats-12.lca'
# The checks on `original_storage_path.parts[0]` and `original_storage_path.parts[1]` ensure the path is indeed under 'data/manifests'.
# And `not original_storage_path.parts[2].startswith(dataset_name)` ensures we don't re-process a path that was already updated.
if len(original_storage_path.parts) >= 3 and \
original_storage_path.parts[0] == old_feature_prefix.split(os.sep)[0] and \
original_storage_path.parts[1] == old_feature_prefix.split(os.sep)[1] and \
not original_storage_path.parts[2].startswith(dataset_name): # Assumes dataset_name does not start with feats_
# # Assuming the original feature files were structured like
# # data/manifests/feats_train/some_file.lca
# # We want to change them to data/manifests/reazonspeech/feats_train/some_file.lca
# This gives us 'feats_train/feats-12.lca'
# It's important to be robust to potentially different original prefixes
# So we take the part of the path *after* the `old_feature_prefix`
try:
relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
except ValueError:
# If for some reason the path doesn't start with old_feature_prefix,
# keep it as is. This can happen if some paths are already absolute or different.
logger.warning(f"Feature path '{original_storage_path}' does not start with '{old_feature_prefix}'. Skipping update for this cut.")
updated_cuts.append(cut)
continue
# # This gives us 'feats_train/feats-12.lca'
# relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
# # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
# new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix
# cut = cut.with_features_path_prefix(cut.features.with_path(str(new_storage_path)))
# updated_cuts.append(cut)
# else:
# updated_cuts.append(cut) # No features, or not a path we need to modify
# return CutSet.from_cuts(updated_cuts)
return cuts.with_features_path_prefix(old_feature_prefix + "/" + dataset_name)
# Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix
# cut = cut.with_features(cut.features.with_path(str(new_storage_path)))
cut.features.storage_path = str(new_storage_path)
updated_cuts.append(cut)
else:
updated_cuts.append(cut) # No features, or not a path we need to modify
return CutSet.from_cuts(updated_cuts)
if __name__ == "__main__":
# The root where the symlinked manifests are located in the multi_ja_en recipe
multi_recipe_manifests_root = Path("data/manifests")
# Define the datasets and their *specific* manifest file prefixes
# The keys are the dataset names (which are also the subdirectory names)
# The values are the base filename for their cuts (e.g., "reazonspeech_cuts", "mls_eng_cuts")
dataset_manifest_prefixes = {
"reazonspeech": "reazonspeech_cuts",
"mls_english": "mls_eng_cuts",
}
# Define the splits. The script will append "_dev.jsonl.gz", "_train.jsonl.gz", etc.
splits = ["train", "dev", "test"]
# This is the path segment *inside* the original recipe's data/manifests
@ -78,26 +87,37 @@ if __name__ == "__main__":
for split in splits:
# Construct the path to the symlinked manifest file
manifest_filename = f"{manifest_prefix}_{split}.jsonl.gz"
manifest_path = dataset_symlink_dir / manifest_filename
symlink_path = dataset_symlink_dir / manifest_filename # This is the path to the symlink itself
if manifest_path.is_file():
logger.info(f"Processing {dataset_name} {split} cuts from symlink: {manifest_path}")
try:
# Load the manifest (Lhotse will follow the symlink)
cuts = load_manifest(manifest_path)
# Update the storage_path within the loaded cuts
# The `old_feature_prefix` is still 'data/manifests' as that's what the original
# paths in the underlying manifest refer to.
updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
# Save the updated cuts back to the *symlinked* path.
# Lhotse will write to the target of the symlink.
updated_cuts.to_file(manifest_path)
logger.info(f"Updated {dataset_name} {split} cuts saved to: {manifest_path}")
except Exception as e:
logger.error(f"Error processing {manifest_path}: {e}", exc_info=True) # Print full traceback
if symlink_path.is_symlink(): # Check if it's actually a symlink
# Get the actual path to the target file that the symlink points to
# Lhotse's load_manifest will follow this symlink automatically.
target_path = os.path.realpath(symlink_path)
logger.info(f"Processing symlink '{symlink_path}' pointing to '{target_path}'")
elif symlink_path.is_file(): # If it's a regular file (not a symlink)
logger.info(f"Processing regular file: {symlink_path}")
target_path = symlink_path # Use its own path as target
else:
logger.warning(f"Manifest file not found (symlink target might be missing or file name mismatch): {manifest_path}")
logger.warning(f"Manifest file not found or neither a file nor a symlink: {symlink_path}")
continue # Skip to next iteration
try:
# Load the manifest. Lhotse will resolve the symlink internally for reading.
cuts = load_manifest(symlink_path) # Use symlink_path here, Lhotse handles resolution for loading
# Update the storage_path within the loaded cuts (in memory)
updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
# NOTE: Save the *modified* CutSet to the path of the symlink *itself*.
# The existing entry at that path is unlinked first, so the write below
# creates a new file in its place, effectively breaking the symlink
# instead of writing through it to the symlink's target.
os.unlink(symlink_path)
updated_cuts.to_file(symlink_path)
logger.info(f"Updated {dataset_name} {split} cuts saved (overwriting symlink) to: {symlink_path}")
except Exception as e:
logger.error(f"Error processing {symlink_path}: {e}", exc_info=True)
logger.info("CutSet path updating complete.")