mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 10:02:22 +00:00
Add a utility script that updates the storage_path of cutset features to match the directory structure of the multilingual training recipe
This commit is contained in:
parent
6255ba5cb2
commit
052fcc3218
@ -1,5 +1,6 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import os # Import os module to handle symlinks
|
||||
|
||||
from lhotse import CutSet, load_manifest
|
||||
|
||||
@ -8,7 +9,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"):
|
||||
"""
|
||||
Updates the storage_path in a CutSet's features to reflect the structure in multi_ja_en.
|
||||
Updates the storage_path in a CutSet's features to reflect the new dataset-specific
|
||||
feature directory structure.
|
||||
|
||||
Args:
|
||||
cuts: The Lhotse CutSet to modify.
|
||||
@ -18,48 +20,55 @@ def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "dat
|
||||
This typically corresponds to the root of the manifests dir
|
||||
in the original recipe.
|
||||
"""
|
||||
# updated_cuts = []
|
||||
# for cut in cuts:
|
||||
# if cut.features is not None:
|
||||
# original_storage_path = Path(cut.features.storage_path)
|
||||
updated_cuts = []
|
||||
for cut in cuts:
|
||||
if cut.features is not None:
|
||||
original_storage_path = Path(cut.features.storage_path)
|
||||
|
||||
# # Check if the path needs updating, i.e., if it's still pointing to the old flat structure
|
||||
# # and isn't already pointing to the new dataset-specific structure.
|
||||
# # The `startswith` check on the original path is crucial here.
|
||||
# # Example: 'data/manifests/feats_train/feats-12.lca'
|
||||
# if original_storage_path.parts[0] == old_feature_prefix.split('/')[0] and \
|
||||
# original_storage_path.parts[1] == old_feature_prefix.split('/')[1] and \
|
||||
# not original_storage_path.parts[2].startswith(dataset_name):
|
||||
# Check if the path needs updating, i.e., if it's still pointing to the old flat structure
|
||||
# and isn't already pointing to the new dataset-specific structure.
|
||||
# We assume old_feature_prefix is 'data/manifests'
|
||||
# and original_storage_path looks like 'data/manifests/feats_train/feats-12.lca'
|
||||
# We want to change it to 'data/manifests/<dataset_name>/feats_train/feats-12.lca'
|
||||
|
||||
# The checks on `original_storage_path.parts[0]` and `parts[1]` ensure the path is indeed under the old prefix (e.g. 'data/manifests').
|
||||
# And `not original_storage_path.parts[2].startswith(dataset_name)` ensures we don't re-process.
|
||||
if len(original_storage_path.parts) >= 3 and \
|
||||
original_storage_path.parts[0] == old_feature_prefix.split(os.sep)[0] and \
|
||||
original_storage_path.parts[1] == old_feature_prefix.split(os.sep)[1] and \
|
||||
not original_storage_path.parts[2].startswith(dataset_name): # Assumes dataset_name does not start with feats_
|
||||
|
||||
# # Assuming the original feature files were structured like
|
||||
# # data/manifests/feats_train/some_file.lca
|
||||
# # We want to change them to data/manifests/reazonspeech/feats_train/some_file.lca
|
||||
# This gives us 'feats_train/feats-12.lca'
|
||||
# It's important to be robust to potentially different original prefixes
|
||||
# So we take the part of the path *after* the `old_feature_prefix`
|
||||
try:
|
||||
relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
|
||||
except ValueError:
|
||||
# If for some reason the path doesn't start with old_feature_prefix,
|
||||
# keep it as is. This can happen if some paths are already absolute or different.
|
||||
logger.warning(f"Feature path '{original_storage_path}' does not start with '{old_feature_prefix}'. Skipping update for this cut.")
|
||||
updated_cuts.append(cut)
|
||||
continue
|
||||
|
||||
# # This gives us 'feats_train/feats-12.lca'
|
||||
# relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
|
||||
|
||||
# # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
|
||||
# new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix
|
||||
# cut = cut.with_features_path_prefix(cut.features.with_path(str(new_storage_path)))
|
||||
# updated_cuts.append(cut)
|
||||
# else:
|
||||
# updated_cuts.append(cut) # No features, or not a path we need to modify
|
||||
# return CutSet.from_cuts(updated_cuts)
|
||||
return cuts.with_features_path_prefix(old_feature_prefix + "/" + dataset_name)
|
||||
# Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
|
||||
new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix
|
||||
# cut = cut.with_features(cut.features.with_path(str(new_storage_path)))
|
||||
cut.features.storage_path = str(new_storage_path)
|
||||
updated_cuts.append(cut)
|
||||
else:
|
||||
updated_cuts.append(cut) # No features, or not a path we need to modify
|
||||
return CutSet.from_cuts(updated_cuts)
|
||||
|
||||
if __name__ == "__main__":
|
||||
# The root where the symlinked manifests are located in the multi_ja_en recipe
|
||||
multi_recipe_manifests_root = Path("data/manifests")
|
||||
|
||||
# Define the datasets and their *specific* manifest file prefixes
|
||||
# The keys are the dataset names (which are also the subdirectory names)
|
||||
# The values are the base filename for their cuts (e.g., "reazonspeech_cuts", "mls_eng_cuts")
|
||||
dataset_manifest_prefixes = {
|
||||
"reazonspeech": "reazonspeech_cuts",
|
||||
"mls_english": "mls_eng_cuts",
|
||||
}
|
||||
|
||||
# Define the splits. The script will append "_dev.jsonl.gz", "_train.jsonl.gz", etc.
|
||||
splits = ["train", "dev", "test"]
|
||||
|
||||
# This is the path segment *inside* the original recipe's data/manifests
|
||||
@ -78,26 +87,37 @@ if __name__ == "__main__":
|
||||
for split in splits:
|
||||
# Construct the path to the symlinked manifest file
|
||||
manifest_filename = f"{manifest_prefix}_{split}.jsonl.gz"
|
||||
manifest_path = dataset_symlink_dir / manifest_filename
|
||||
symlink_path = dataset_symlink_dir / manifest_filename # This is the path to the symlink itself
|
||||
|
||||
if manifest_path.is_file():
|
||||
logger.info(f"Processing {dataset_name} {split} cuts from symlink: {manifest_path}")
|
||||
try:
|
||||
# Load the manifest (Lhotse will follow the symlink)
|
||||
cuts = load_manifest(manifest_path)
|
||||
|
||||
# Update the storage_path within the loaded cuts
|
||||
# The `old_feature_prefix` is still 'data/manifests' as that's what the original
|
||||
# paths in the underlying manifest refer to.
|
||||
updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
|
||||
|
||||
# Save the updated cuts back to the *symlinked* path.
|
||||
# Lhotse will write to the target of the symlink.
|
||||
updated_cuts.to_file(manifest_path)
|
||||
logger.info(f"Updated {dataset_name} {split} cuts saved to: {manifest_path}")
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing {manifest_path}: {e}", exc_info=True) # Print full traceback
|
||||
if symlink_path.is_symlink(): # Check if it's actually a symlink
|
||||
# Get the actual path to the target file that the symlink points to
|
||||
# Lhotse's load_manifest will follow this symlink automatically.
|
||||
target_path = os.path.realpath(symlink_path)
|
||||
logger.info(f"Processing symlink '{symlink_path}' pointing to '{target_path}'")
|
||||
elif symlink_path.is_file(): # If it's a regular file (not a symlink)
|
||||
logger.info(f"Processing regular file: {symlink_path}")
|
||||
target_path = symlink_path # Use its own path as target
|
||||
else:
|
||||
logger.warning(f"Manifest file not found (symlink target might be missing or file name mismatch): {manifest_path}")
|
||||
logger.warning(f"Manifest file not found or neither a file nor a symlink: {symlink_path}")
|
||||
continue # Skip to next iteration
|
||||
|
||||
|
||||
try:
|
||||
# Load the manifest. Lhotse will resolve the symlink internally for reading.
|
||||
cuts = load_manifest(symlink_path) # Use symlink_path here, Lhotse handles resolution for loading
|
||||
|
||||
# Update the storage_path within the loaded cuts (in memory)
|
||||
updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
|
||||
|
||||
# --- CRITICAL CHANGE HERE ---
|
||||
# Save the *modified* CutSet to the path of the symlink *itself*.
|
||||
# The existing path is removed first (os.unlink below), so the write creates a new
|
||||
# regular file in its place rather than writing through to the symlink's target.
|
||||
os.unlink(symlink_path)
|
||||
updated_cuts.to_file(symlink_path)
|
||||
logger.info(f"Updated {dataset_name} {split} cuts saved (overwriting symlink) to: {symlink_path}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing {symlink_path}: {e}", exc_info=True)
|
||||
|
||||
logger.info("CutSet path updating complete.")
|
Loading…
x
Reference in New Issue
Block a user