Add a utility script that updates the storage_path of CutSet features to match the directory structure of the multilingual training recipe

This commit is contained in:
Kinan Martin 2025-06-06 11:42:08 +09:00
parent 6255ba5cb2
commit 052fcc3218

View File

@ -1,5 +1,6 @@
import logging import logging
from pathlib import Path from pathlib import Path
import os # Import os module to handle symlinks
from lhotse import CutSet, load_manifest from lhotse import CutSet, load_manifest
@ -8,7 +9,8 @@ logger = logging.getLogger(__name__)
def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"): def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"):
""" """
Updates the storage_path in a CutSet's features to reflect the structure in multi_ja_en. Updates the storage_path in a CutSet's features to reflect the new dataset-specific
feature directory structure.
Args: Args:
cuts: The Lhotse CutSet to modify. cuts: The Lhotse CutSet to modify.
@ -18,48 +20,55 @@ def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "dat
This typically corresponds to the root of the manifests dir This typically corresponds to the root of the manifests dir
in the original recipe. in the original recipe.
""" """
# updated_cuts = [] updated_cuts = []
# for cut in cuts: for cut in cuts:
# if cut.features is not None: if cut.features is not None:
# original_storage_path = Path(cut.features.storage_path) original_storage_path = Path(cut.features.storage_path)
# # Check if the path needs updating, i.e., if it's still pointing to the old flat structure # Check if the path needs updating, i.e., if it's still pointing to the old flat structure
# # and isn't already pointing to the new dataset-specific structure. # and isn't already pointing to the new dataset-specific structure.
# # The `startswith` check on the original path is crucial here. # We assume old_feature_prefix is 'data/manifests'
# # Example: 'data/manifests/feats_train/feats-12.lca' # and original_storage_path looks like 'data/manifests/feats_train/feats-12.lca'
# if original_storage_path.parts[0] == old_feature_prefix.split('/')[0] and \ # We want to change it to 'data/manifests/<dataset_name>/feats_train/feats-12.lca'
# original_storage_path.parts[1] == old_feature_prefix.split('/')[1] and \
# not original_storage_path.parts[2].startswith(dataset_name): # The check `original_storage_path.parts[1]` ensures it's indeed under 'manifests' and not already processed.
# And `not original_storage_path.parts[2].startswith(dataset_name)` ensures we don't re-process.
if len(original_storage_path.parts) >= 3 and \
original_storage_path.parts[0] == old_feature_prefix.split(os.sep)[0] and \
original_storage_path.parts[1] == old_feature_prefix.split(os.sep)[1] and \
not original_storage_path.parts[2].startswith(dataset_name): # Assumes dataset_name does not start with feats_
# # Assuming the original feature files were structured like # This gives us 'feats_train/feats-12.lca'
# # data/manifests/feats_train/some_file.lca # It's important to be robust to potentially different original prefixes
# # We want to change them to data/manifests/reazonspeech/feats_train/some_file.lca # So we take the part of the path *after* the `old_feature_prefix`
try:
relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
except ValueError:
# If for some reason the path doesn't start with old_feature_prefix,
# keep it as is. This can happen if some paths are already absolute or different.
logger.warning(f"Feature path '{original_storage_path}' does not start with '{old_feature_prefix}'. Skipping update for this cut.")
updated_cuts.append(cut)
continue
# # This gives us 'feats_train/feats-12.lca' # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
# relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix) new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix
# cut = cut.with_features(cut.features.with_path(str(new_storage_path)))
# # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca cut.features.storage_path = str(new_storage_path)
# new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix updated_cuts.append(cut)
# cut = cut.with_features_path_prefix(cut.features.with_path(str(new_storage_path))) else:
# updated_cuts.append(cut) updated_cuts.append(cut) # No features, or not a path we need to modify
# else: return CutSet.from_cuts(updated_cuts)
# updated_cuts.append(cut) # No features, or not a path we need to modify
# return CutSet.from_cuts(updated_cuts)
return cuts.with_features_path_prefix(old_feature_prefix + "/" + dataset_name)
if __name__ == "__main__": if __name__ == "__main__":
# The root where the symlinked manifests are located in the multi_ja_en recipe # The root where the symlinked manifests are located in the multi_ja_en recipe
multi_recipe_manifests_root = Path("data/manifests") multi_recipe_manifests_root = Path("data/manifests")
# Define the datasets and their *specific* manifest file prefixes # Define the datasets and their *specific* manifest file prefixes
# The keys are the dataset names (which are also the subdirectory names)
# The values are the base filename for their cuts (e.g., "reazonspeech_cuts", "mls_eng_cuts")
dataset_manifest_prefixes = { dataset_manifest_prefixes = {
"reazonspeech": "reazonspeech_cuts", "reazonspeech": "reazonspeech_cuts",
"mls_english": "mls_eng_cuts", "mls_english": "mls_eng_cuts",
} }
# Define the splits. The script will append "_dev.jsonl.gz", "_train.jsonl.gz", etc.
splits = ["train", "dev", "test"] splits = ["train", "dev", "test"]
# This is the path segment *inside* the original recipe's data/manifests # This is the path segment *inside* the original recipe's data/manifests
@ -78,26 +87,37 @@ if __name__ == "__main__":
for split in splits: for split in splits:
# Construct the path to the symlinked manifest file # Construct the path to the symlinked manifest file
manifest_filename = f"{manifest_prefix}_{split}.jsonl.gz" manifest_filename = f"{manifest_prefix}_{split}.jsonl.gz"
manifest_path = dataset_symlink_dir / manifest_filename symlink_path = dataset_symlink_dir / manifest_filename # This is the path to the symlink itself
if manifest_path.is_file(): if symlink_path.is_symlink(): # Check if it's actually a symlink
logger.info(f"Processing {dataset_name} {split} cuts from symlink: {manifest_path}") # Get the actual path to the target file that the symlink points to
try: # Lhotse's load_manifest will follow this symlink automatically.
# Load the manifest (Lhotse will follow the symlink) target_path = os.path.realpath(symlink_path)
cuts = load_manifest(manifest_path) logger.info(f"Processing symlink '{symlink_path}' pointing to '{target_path}'")
elif symlink_path.is_file(): # If it's a regular file (not a symlink)
# Update the storage_path within the loaded cuts logger.info(f"Processing regular file: {symlink_path}")
# The `old_feature_prefix` is still 'data/manifests' as that's what the original target_path = symlink_path # Use its own path as target
# paths in the underlying manifest refer to.
updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
# Save the updated cuts back to the *symlinked* path.
# Lhotse will write to the target of the symlink.
updated_cuts.to_file(manifest_path)
logger.info(f"Updated {dataset_name} {split} cuts saved to: {manifest_path}")
except Exception as e:
logger.error(f"Error processing {manifest_path}: {e}", exc_info=True) # Print full traceback
else: else:
logger.warning(f"Manifest file not found (symlink target might be missing or file name mismatch): {manifest_path}") logger.warning(f"Manifest file not found or neither a file nor a symlink: {symlink_path}")
continue # Skip to next iteration
try:
# Load the manifest. Lhotse will resolve the symlink internally for reading.
cuts = load_manifest(symlink_path) # Use symlink_path here, Lhotse handles resolution for loading
# Update the storage_path within the loaded cuts (in memory)
updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
# --- CRITICAL CHANGE HERE ---
# Remove the existing entry at symlink_path first, then save the *modified* CutSet there.
# Unlinking a symlink removes only the link, so the subsequent write creates a new
# regular file in its place instead of writing through the link to its target.
os.unlink(symlink_path)
updated_cuts.to_file(symlink_path)
logger.info(f"Updated {dataset_name} {split} cuts saved (overwriting symlink) to: {symlink_path}")
except Exception as e:
logger.error(f"Error processing {symlink_path}: {e}", exc_info=True)
logger.info("CutSet path updating complete.") logger.info("CutSet path updating complete.")