Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-10 02:22:17 +00:00)

Commit 052fcc3218 (parent 6255ba5cb2): add a utility file for updating the storage_path of cutsets, for use with the multilingual training recipe's directory structure.
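For orientation, the rewrite implemented below moves each dataset's feature files from a flat layout into a per-dataset subdirectory (example paths taken from the code comments):

    data/manifests/feats_train/feats-12.lca
        -> data/manifests/reazonspeech/feats_train/feats-12.lca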
import logging
from pathlib import Path
import os  # Import os module to handle symlinks

from lhotse import CutSet, load_manifest

logger = logging.getLogger(__name__)


def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"):
    """
    Updates the storage_path in a CutSet's features to reflect the new
    dataset-specific feature directory structure.

    Args:
      cuts: The Lhotse CutSet to modify.
      dataset_name: The name of the dataset; also used as the per-dataset
        subdirectory name.
      old_feature_prefix: The prefix under which the feature files were
        originally stored. This typically corresponds to the root of the
        manifests dir in the original recipe.
    """
    updated_cuts = []
    for cut in cuts:
        if cut.features is not None:
            original_storage_path = Path(cut.features.storage_path)

            # Check if the path needs updating, i.e., if it is still pointing to
            # the old flat structure and not already to the new dataset-specific
            # structure. We assume old_feature_prefix is 'data/manifests' and
            # original_storage_path looks like
            # 'data/manifests/feats_train/feats-12.lca'; we want to change it to
            # 'data/manifests/<dataset_name>/feats_train/feats-12.lca'.
            # Comparing the first two path components ensures the path is indeed
            # under the prefix, and the startswith check ensures we do not
            # re-process a path that already contains the dataset name
            # (this assumes dataset_name does not start with 'feats_').
            prefix_parts = Path(old_feature_prefix).parts
            if (
                len(original_storage_path.parts) >= 3
                and original_storage_path.parts[:2] == prefix_parts[:2]
                and not original_storage_path.parts[2].startswith(dataset_name)
            ):
                try:
                    # Take the part of the path *after* old_feature_prefix,
                    # e.g. 'feats_train/feats-12.lca'.
                    relative_path_from_old_prefix = original_storage_path.relative_to(
                        old_feature_prefix
                    )
                    # Construct the new path:
                    # data/manifests/<dataset_name>/feats_train/feats-12.lca
                    new_storage_path = (
                        Path(old_feature_prefix)
                        / dataset_name
                        / relative_path_from_old_prefix
                    )
                    cut.features.storage_path = str(new_storage_path)
                except ValueError:
                    # The path does not start with old_feature_prefix; this can
                    # happen if some paths are already absolute or different.
                    # Keep it as is.
                    logger.warning(
                        f"Feature path '{original_storage_path}' does not start "
                        f"with '{old_feature_prefix}'. Skipping update for this cut."
                    )
        # Every cut is kept: modified, skipped, or without features.
        updated_cuts.append(cut)

    return CutSet.from_cuts(updated_cuts)
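As a minimal standalone sketch of the rewrite rule above, using only pathlib (the filename is the one from the docstring example):

    from pathlib import Path

    old_path = Path("data/manifests/feats_train/feats-12.lca")
    prefix = "data/manifests"
    dataset = "reazonspeech"

    relative = old_path.relative_to(prefix)       # feats_train/feats-12.lca
    new_path = Path(prefix) / dataset / relative
    print(new_path)  # data/manifests/reazonspeech/feats_train/feats-12.lca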
if __name__ == "__main__":
    # The root where the symlinked manifests are located in the multi_ja_en recipe.
    multi_recipe_manifests_root = Path("data/manifests")

    # Define the datasets and their *specific* manifest file prefixes.
    # The keys are the dataset names (which are also the subdirectory names);
    # the values are the base filenames of their cuts.
    dataset_manifest_prefixes = {
        "reazonspeech": "reazonspeech_cuts",
        "mls_english": "mls_eng_cuts",
    }

    # The splits; the script appends "_train.jsonl.gz", "_dev.jsonl.gz", etc.
    splits = ["train", "dev", "test"]

    # This is the path segment *inside* the original recipe's data/manifests
    # ... (lines elided in the diff: they define `original_feature_base_path`,
    # `dataset_symlink_dir`, and the loop over
    # `dataset_manifest_prefixes.items()` that yields `dataset_name` and
    # `manifest_prefix`) ...
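A hedged sketch of the manifest paths the loop below will look for, assuming `dataset_symlink_dir = multi_recipe_manifests_root / dataset_name` (its definition is among the elided lines):

    for name, prefix in dataset_manifest_prefixes.items():
        for split in splits:
            print(multi_recipe_manifests_root / name / f"{prefix}_{split}.jsonl.gz")
    # e.g. data/manifests/reazonspeech/reazonspeech_cuts_train.jsonl.gz
    #      data/manifests/mls_english/mls_eng_cuts_dev.jsonl.gz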
        for split in splits:
            # Construct the path to the symlinked manifest file.
            manifest_filename = f"{manifest_prefix}_{split}.jsonl.gz"
            # This is the path to the symlink itself.
            symlink_path = dataset_symlink_dir / manifest_filename

            if symlink_path.is_symlink():
                # Resolve the target file the symlink points to (for logging);
                # Lhotse's load_manifest will follow the symlink automatically.
                target_path = os.path.realpath(symlink_path)
                logger.info(f"Processing symlink '{symlink_path}' pointing to '{target_path}'")
            elif symlink_path.is_file():
                # A regular file rather than a symlink.
                logger.info(f"Processing regular file: {symlink_path}")
                target_path = symlink_path  # Use its own path as the target.
            else:
                logger.warning(
                    f"Manifest file not found or neither a file nor a symlink: {symlink_path}"
                )
                continue  # Skip to the next split.

            try:
                # Load the manifest; Lhotse resolves the symlink internally for reading.
                cuts = load_manifest(symlink_path)

                # Update the storage_path within the loaded cuts (in memory).
                # old_feature_prefix is still 'data/manifests', as that is what the
                # original paths in the underlying manifest refer to.
                updated_cuts = update_paths(
                    cuts, dataset_name, old_feature_prefix=original_feature_base_path
                )

                # --- CRITICAL CHANGE HERE ---
                # Save the *modified* CutSet to the path of the symlink *itself*.
                # Unlinking first removes the symlink (its target is untouched), so
                # the write below creates a new regular file in its place.
                os.unlink(symlink_path)
                updated_cuts.to_file(symlink_path)
                logger.info(
                    f"Updated {dataset_name} {split} cuts saved (overwriting symlink) to: {symlink_path}"
                )
            except Exception as e:
                logger.error(f"Error processing {symlink_path}: {e}", exc_info=True)

    logger.info("CutSet path updating complete.")
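To illustrate why the script unlinks before writing, a small sketch with a hypothetical path (not part of the commit): writing through a symlink would modify its target, while removing the link first makes the subsequent write create an independent file:

    import os
    from pathlib import Path

    link = Path("copy.jsonl.gz")  # hypothetical symlink to an original manifest
    if link.is_symlink():
        os.unlink(link)           # removes only the link; the target is untouched
    link.write_bytes(b"...")      # stand-in for updated_cuts.to_file(link)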