From 6e70cdc65843cedd090a2d453781d501eb972137 Mon Sep 17 00:00:00 2001
From: Bailey Hirota <baileyhirota@icloud.com>
Date: Thu, 10 Jul 2025 15:32:03 +0900
Subject: [PATCH] update musan paths

---
 .../ASR/local/utils/update_cutset_paths.py    | 35 ++++++++++++++-----
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py b/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py
index c1c418938..8071c9023 100644
--- a/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py
+++ b/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py
@@ -30,15 +30,15 @@ def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "dat
             # We assume old_feature_prefix is 'data/manifests'
             # and original_storage_path looks like 'data/manifests/feats_train/feats-12.lca'
             # We want to change it to 'data/manifests/<dataset_name>/feats_train/feats-12.lca'
-            
+
             # The check `original_storage_path.parts[1]` ensures it's indeed under 'manifests' and not already processed.
             # And `not original_storage_path.parts[2].startswith(dataset_name)` ensures we don't re-process.
             if len(original_storage_path.parts) >= 3 and \
-               original_storage_path.parts[0] == old_feature_prefix.split(os.sep)[0] and \
-               original_storage_path.parts[1] == old_feature_prefix.split(os.sep)[1] and \
-               not original_storage_path.parts[2].startswith(dataset_name): # Assumes dataset_name does not start with feats_
+                    original_storage_path.parts[0] == old_feature_prefix.split(os.sep)[0] and \
+                    original_storage_path.parts[1] == old_feature_prefix.split(os.sep)[1] and \
+                    not original_storage_path.parts[2].startswith(dataset_name): # Assumes dataset_name does not start with feats_
 
                 # This gives us 'feats_train/feats-12.lca'
                 # It's important to be robust to potentially different original prefixes
                 # So we take the part of the path *after* the `old_feature_prefix`
                 try:
@@ -65,9 +65,10 @@ if __name__ == "__main__":
 
     # Define the datasets and their *specific* manifest file prefixes
     dataset_manifest_prefixes = {
         "reazonspeech": "reazonspeech_cuts",
         "mls_english": "mls_eng_cuts",
+        "musan": "musan_cuts",
     }
 
     splits = ["train", "dev", "test"]
 
@@ -77,6 +78,22 @@ if __name__ == "__main__":
     # then this is 'data/manifests'
     original_feature_base_path = "data/manifests"
 
+    # NOTE(review): "musan" is also listed in dataset_manifest_prefixes, so the
+    # loop below visits it again -- confirm both passes are intended.
+    musan_manifest_path = multi_recipe_manifests_root / "musan" / "musan_cuts.jsonl.gz"
+    if musan_manifest_path.exists():
+        logger.info(f"Processing musan manifest: {musan_manifest_path}")
+        try:
+            musan_cuts = load_manifest(musan_manifest_path)
+            updated_musan_cuts = update_paths(
+                musan_cuts, "musan", old_feature_prefix=original_feature_base_path
+            )
+            updated_musan_cuts.to_file(musan_manifest_path)
+            logger.info(f"Updated musan cuts saved to: {musan_manifest_path}")
+        except Exception as e:
+            logger.error(f"Error processing musan manifest {musan_manifest_path}: {e}", exc_info=True)
+    else:
+        logger.warning(f"Musan manifest not found at {musan_manifest_path}, skipping.")
 
     for dataset_name, manifest_prefix in dataset_manifest_prefixes.items():
         dataset_symlink_dir = multi_recipe_manifests_root / dataset_name
@@ -120,4 +137,4 @@ if __name__ == "__main__":
             except Exception as e:
                 logger.error(f"Error processing {symlink_path}: {e}", exc_info=True)
 
-    logger.info("CutSet path updating complete.")
\ No newline at end of file
+    logger.info("CutSet path updating complete.")