import logging
import os
from pathlib import Path
from typing import Optional

from lhotse import CutSet, load_manifest

logger = logging.getLogger(__name__)


def _rebase_storage_path(
    storage_path: str, dataset_name: str, old_prefix: Path
) -> Optional[str]:
    """Return ``storage_path`` with ``dataset_name`` inserted after ``old_prefix``.

    Returns ``None`` when the path must be left alone: it is not located under
    ``old_prefix``, it is the prefix itself, or its first component after the
    prefix already equals ``dataset_name``.
    """
    try:
        relative = Path(storage_path).relative_to(old_prefix)
    except ValueError:
        # Not under the old prefix (e.g. an absolute or unrelated path).
        return None
    if not relative.parts or relative.parts[0] == dataset_name:
        return None
    return str(old_prefix / dataset_name / relative)


def update_paths(
    cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"
) -> CutSet:
    """
    Updates the storage_path in a CutSet's features to reflect the new
    dataset-specific feature directory structure.

    A storage path such as ``data/manifests/feats_train/feats-12.lca`` becomes
    ``data/manifests/<dataset_name>/feats_train/feats-12.lca``.

    Cuts are passed through unchanged when they have no features, when their
    storage path is not located under ``old_feature_prefix``, or when the first
    path component after the prefix is already ``dataset_name`` (so applying
    the function a second time does not nest the directory twice).

    Args:
        cuts: The Lhotse CutSet to modify.
        dataset_name: Name of the sub-directory inserted right after the prefix
            (e.g. "reazonspeech").
        old_feature_prefix: The directory prefix the existing storage paths
            start with.

    Returns:
        A CutSet built from every input cut, in iteration order. Rewritten
        cuts are modified in place: ``cut.features.storage_path`` is
        reassigned on the cut objects yielded by ``cuts``.
    """
    old_prefix = Path(old_feature_prefix)
    updated_cuts = []
    for cut in cuts:
        features = cut.features
        if features is not None:
            new_storage_path = _rebase_storage_path(
                features.storage_path, dataset_name, old_prefix
            )
            if new_storage_path is not None:
                features.storage_path = new_storage_path
        updated_cuts.append(cut)
    return CutSet.from_cuts(updated_cuts)


def _replace_manifest(cuts: CutSet, manifest_path: Path) -> None:
    """Write ``cuts`` to ``manifest_path``, replacing the entry that is there.

    The data is first written to a temporary sibling file and then moved into
    place with ``os.replace``. If ``manifest_path`` is a symlink, the link
    itself is replaced by a regular file and the link target is not modified.
    When writing fails, the existing entry is left as it was.
    """
    # Keep the original file name as the suffix so the manifest format can
    # still be inferred from the extension (".jsonl.gz").
    tmp_path = manifest_path.with_name(f"tmp-{manifest_path.name}")
    try:
        cuts.to_file(tmp_path)
        os.replace(tmp_path, manifest_path)
    finally:
        # After a successful replace the temporary name no longer exists;
        # after a failed write it may hold a partial file.
        if tmp_path.exists():
            tmp_path.unlink()


def _process_manifest(
    symlink_path: Path, dataset_name: str, split: str, old_feature_prefix: str
) -> None:
    """Load one manifest, rewrite its feature paths and store it in place."""
    # ``is_file`` follows symlinks, so it is False for a missing entry and
    # for a symlink whose target does not exist.
    if not symlink_path.is_file():
        logger.warning(f"Manifest file not found or neither a file nor a symlink: {symlink_path}")
        return

    if symlink_path.is_symlink():
        target_path = os.path.realpath(symlink_path)
        logger.info(f"Processing symlink '{symlink_path}' pointing to '{target_path}'")
    else:
        logger.info(f"Processing regular file: {symlink_path}")

    try:
        # Loading goes through the symlink; the result is held in memory.
        cuts = load_manifest(symlink_path)
        updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=old_feature_prefix)
        # The updated manifest takes the place of the symlink itself.
        _replace_manifest(updated_cuts, symlink_path)
        logger.info(f"Updated {dataset_name} {split} cuts saved (overwriting symlink) to: {symlink_path}")
    except Exception as e:
        # Script boundary: report the failure and continue with the next manifest.
        logger.error(f"Error processing {symlink_path}: {e}", exc_info=True)


def main() -> None:
    """Rewrite feature paths in every dataset/split manifest of the recipe."""
    # NOTE(review): the logging setup, the definition of
    # `original_feature_base_path`, and the outer loop over datasets are not
    # visible in the reviewed diff. They are reconstructed here from the names
    # the visible code uses - confirm against the full file.
    logging.basicConfig(level=logging.INFO)

    # The root where the symlinked manifests are located in the multi_ja_en recipe
    multi_recipe_manifests_root = Path("data/manifests")

    # Define the datasets and their *specific* manifest file prefixes
    dataset_manifest_prefixes = {
        "reazonspeech": "reazonspeech_cuts",
        "mls_english": "mls_eng_cuts",
    }

    splits = ["train", "dev", "test"]

    # This is the path segment *inside* the original recipe's data/manifests
    original_feature_base_path = "data/manifests"

    for dataset_name, manifest_prefix in dataset_manifest_prefixes.items():
        dataset_symlink_dir = multi_recipe_manifests_root / dataset_name
        for split in splits:
            # Construct the path to the symlinked manifest file
            manifest_filename = f"{manifest_prefix}_{split}.jsonl.gz"
            symlink_path = dataset_symlink_dir / manifest_filename
            _process_manifest(
                symlink_path,
                dataset_name,
                split,
                old_feature_prefix=original_feature_base_path,
            )

    logger.info("CutSet path updating complete.")


if __name__ == "__main__":
    main()