From ce894a7ba2a0df51393dc4167defc34cde0c9570 Mon Sep 17 00:00:00 2001
From: Kinan Martin
Date: Wed, 4 Jun 2025 10:12:39 +0900
Subject: [PATCH] Combined updates. Changed BBPE path structure, changed dataset path structure, added script to update cutset paths. WIP

---
 .../ASR/local/prepare_lang_bbpe.py            |  17 ++-
 egs/multi_ja_en/ASR/local/train_bbpe_model.py |  45 ++++++--
 .../ASR/local/utils/update_cutset_paths.py    | 103 ++++++++++++++++++
 .../ASR/local/validate_bpe_lexicon.py         |   2 +-
 egs/multi_ja_en/ASR/prepare.sh                |  45 ++++----
 .../ASR/zipformer/multi_dataset.py            |  12 +-
 egs/multi_ja_en/ASR/zipformer/train.py        |   5 +-
 7 files changed, 185 insertions(+), 44 deletions(-)
 create mode 100644 egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py

diff --git a/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py b/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py
index 6134710ad..31b5c4f2f 100755
--- a/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py
+++ b/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py
@@ -21,7 +21,7 @@
 This script takes as input `lang_dir`, which should contain::
 
-    - lang_dir/bbpe.model,
+    - lang_dir/bbpe_2000/bbpe.model
     - lang_dir/words.txt
 
 and generates the following files in the directory `lang_dir`:
 
@@ -173,7 +173,8 @@ def get_args():
         "--lang-dir",
         type=str,
         help="""Input and output directory.
-        It should contain the bpe.model and words.txt
+        It should contain the words.txt file and the
+        bbpe model in a subdirectory (e.g., bbpe_2000/bbpe.model).
         """,
     )
 
@@ -184,6 +185,13 @@ def get_args():
         help="The out of vocabulary word in lexicon.",
     )
 
+    parser.add_argument(
+        "--vocab-size",
+        type=int,
+        default=2000,  # Add a default value for vocab_size for consistency
+        help="Vocabulary size used for BPE training (determines the bbpe model directory).",
+    )
+
     parser.add_argument(
         "--debug",
         type=str2bool,
@@ -205,6 +213,9 @@ def main():
     args = get_args()
     lang_dir = Path(args.lang_dir)
     model_file = lang_dir / "bbpe.model"
+
+    if not model_file.is_file():
+        raise FileNotFoundError(f"BPE model not found at: {model_file}")
 
     word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
 
@@ -216,7 +227,7 @@ def main():
         if w in words:
             words.remove(w)
 
-    lexicon, token_sym_table = generate_lexicon(model_file, words, args.oov)
+    lexicon, token_sym_table = generate_lexicon(str(model_file), words, args.oov)
 
     lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
 
diff --git a/egs/multi_ja_en/ASR/local/train_bbpe_model.py b/egs/multi_ja_en/ASR/local/train_bbpe_model.py
index d104f2717..e51193f3e 100755
--- a/egs/multi_ja_en/ASR/local/train_bbpe_model.py
+++ b/egs/multi_ja_en/ASR/local/train_bbpe_model.py
@@ -33,7 +33,7 @@ from pathlib import Path
 import sentencepiece as spm
 
 from icefall import byte_encode
-from icefall.utils import tokenize_by_ja_char
+from icefall.utils import str2bool, tokenize_by_ja_char
 
 
 def get_args():
@@ -41,9 +41,7 @@ def get_args():
     parser.add_argument(
         "--lang-dir",
        type=str,
-        help="""Input and output directory.
-        The generated bpe.model is saved to this directory.
- """, + help="""Input directory.""", ) parser.add_argument( @@ -58,6 +56,27 @@ def get_args(): help="Vocabulary size for BPE training", ) + parser.add_argument( + "--output-model", + type=str, + help="Path to save the trained BPE model.", + required=True, + ) + + parser.add_argument( + "--input-sentence-size", + type=int, + default=1000000, # Added default value + help="Maximum number of sentences to load for BPE training.", + ) + + parser.add_argument( + "--shuffle-input-sentence", + type=str2bool, + default=True, # Added default value + help="Whether to shuffle input sentences.", + ) + return parser.parse_args() @@ -71,17 +90,20 @@ def main(): args = get_args() vocab_size = args.vocab_size lang_dir = Path(args.lang_dir) + output_model = Path(args.output_model) + input_sentence_size = args.input_sentence_size + shuffle_input_sentence = args.shuffle_input_sentence model_type = "unigram" - model_prefix = f"{lang_dir}/{model_type}_{vocab_size}" - model_file = Path(model_prefix + ".model") - if model_file.is_file(): - print(f"{model_file} exists - skipping") + model_prefix = str(output_model.parent / f"{model_type}_{vocab_size}") + temp_model_file = Path(model_prefix + ".model") + + if output_model.is_file(): + print(f"{output_model} exists - skipping") return character_coverage = 1.0 - input_sentence_size = 100000000 user_defined_symbols = ["", ""] unk_id = len(user_defined_symbols) @@ -100,6 +122,7 @@ def main(): model_type=model_type, model_prefix=model_prefix, input_sentence_size=input_sentence_size, + shuffle_input_sentence=shuffle_input_sentence, character_coverage=character_coverage, user_defined_symbols=user_defined_symbols, unk_id=unk_id, @@ -107,8 +130,8 @@ def main(): eos_id=-1, ) - shutil.copyfile(model_file, f"{lang_dir}/bbpe.model") + shutil.move(str(temp_model_file), str(output_model)) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py b/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py new file mode 100644 index 000000000..f1a312e18 --- /dev/null +++ b/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py @@ -0,0 +1,103 @@ +import logging +from pathlib import Path + +from lhotse import CutSet, load_manifest + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"): + """ + Updates the storage_path in a CutSet's features to reflect the structure in multi_ja_en. + + Args: + cuts: The Lhotse CutSet to modify. + dataset_name: The name of the dataset (e.g., "reazonspeech", "mls_english") + which corresponds to the new subdirectory for features. + old_feature_prefix: The prefix that the original feature paths were relative to. + This typically corresponds to the root of the manifests dir + in the original recipe. + """ + # updated_cuts = [] + # for cut in cuts: + # if cut.features is not None: + # original_storage_path = Path(cut.features.storage_path) + + # # Check if the path needs updating, i.e., if it's still pointing to the old flat structure + # # and isn't already pointing to the new dataset-specific structure. + # # The `startswith` check on the original path is crucial here. 
+    #         # Example: 'data/manifests/feats_train/feats-12.lca'
+    #         if original_storage_path.parts[0] == old_feature_prefix.split('/')[0] and \
+    #            original_storage_path.parts[1] == old_feature_prefix.split('/')[1] and \
+    #            not original_storage_path.parts[2].startswith(dataset_name):
+
+    #             # Assuming the original feature files were structured like
+    #             # data/manifests/feats_train/some_file.lca
+    #             # We want to change them to data/manifests/reazonspeech/feats_train/some_file.lca
+
+    #             # This gives us 'feats_train/feats-12.lca'
+    #             relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
+
+    #             # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
+    #             new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix
+    #             cut = cut.with_features_path_prefix(cut.features.with_path(str(new_storage_path)))
+    #             updated_cuts.append(cut)
+    #         else:
+    #             updated_cuts.append(cut)  # No features, or not a path we need to modify
+    # return CutSet.from_cuts(updated_cuts)
+    return cuts.with_features_path_prefix(old_feature_prefix + "/" + dataset_name)
+
+if __name__ == "__main__":
+    # The root where the symlinked manifests are located in the multi_ja_en recipe
+    multi_recipe_manifests_root = Path("data/manifests")
+
+    # Define the datasets and their *specific* manifest file prefixes
+    # The keys are the dataset names (which are also the subdirectory names)
+    # The values are the base filename for their cuts (e.g., "reazonspeech_cuts", "mls_eng_cuts")
+    dataset_manifest_prefixes = {
+        "reazonspeech": "reazonspeech_cuts",
+        "mls_english": "mls_eng_cuts",
+    }
+
+    # Define the splits. The script will append "_dev.jsonl.gz", "_train.jsonl.gz", etc.
+    splits = ["train", "dev", "test"]
+
+    # This is the path segment *inside* the original recipe's data/manifests
+    # that your features were stored under.
+    # e.g., if original path was /original/recipe/data/manifests/feats_train/file.lca
+    # then this is 'data/manifests'
+    original_feature_base_path = "data/manifests"
+
+
+    for dataset_name, manifest_prefix in dataset_manifest_prefixes.items():
+        dataset_symlink_dir = multi_recipe_manifests_root / dataset_name
+        if not dataset_symlink_dir.is_dir():
+            logger.warning(f"Dataset symlink directory not found: {dataset_symlink_dir}. Skipping {dataset_name}.")
+            continue
+
+        for split in splits:
+            # Construct the path to the symlinked manifest file
+            manifest_filename = f"{manifest_prefix}_{split}.jsonl.gz"
+            manifest_path = dataset_symlink_dir / manifest_filename
+
+            if manifest_path.is_file():
+                logger.info(f"Processing {dataset_name} {split} cuts from symlink: {manifest_path}")
+                try:
+                    # Load the manifest (Lhotse will follow the symlink)
+                    cuts = load_manifest(manifest_path)
+
+                    # Update the storage_path within the loaded cuts
+                    # The `old_feature_prefix` is still 'data/manifests' as that's what the original
+                    # paths in the underlying manifest refer to.
+                    updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
+
+                    # Save the updated cuts back to the *symlinked* path.
+                    # Lhotse will write to the target of the symlink.
+                    updated_cuts.to_file(manifest_path)
+                    logger.info(f"Updated {dataset_name} {split} cuts saved to: {manifest_path}")
+                except Exception as e:
+                    logger.error(f"Error processing {manifest_path}: {e}", exc_info=True)  # Print full traceback
+            else:
+                logger.warning(f"Manifest file not found (symlink target might be missing or file name mismatch): {manifest_path}")
+
+    logger.info("CutSet path updating complete.")
\ No newline at end of file
diff --git a/egs/multi_ja_en/ASR/local/validate_bpe_lexicon.py b/egs/multi_ja_en/ASR/local/validate_bpe_lexicon.py
index 721bb48e7..4e843acf5 120000
--- a/egs/multi_ja_en/ASR/local/validate_bpe_lexicon.py
+++ b/egs/multi_ja_en/ASR/local/validate_bpe_lexicon.py
@@ -1 +1 @@
-../../../librispeech/ASR/local/validate_bpe_lexicon.py
\ No newline at end of file
+/root/icefall/egs/librispeech/ASR/local/validate_bpe_lexicon.py
\ No newline at end of file
diff --git a/egs/multi_ja_en/ASR/prepare.sh b/egs/multi_ja_en/ASR/prepare.sh
index c96e622d0..4c6332683 100755
--- a/egs/multi_ja_en/ASR/prepare.sh
+++ b/egs/multi_ja_en/ASR/prepare.sh
@@ -140,28 +140,29 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
     bbpe_dir=$lang_dir/bbpe_${vocab_size}
     mkdir -p $bbpe_dir
 
-    if [ ! -f $lang_dir/transcript_chars.txt ]; then
+    if [ ! -f $bbpe_dir/transcript_chars.txt ]; then
       ./local/prepare_for_bpe_model.py \
-        --lang-dir ./$lang_dir \
+        --lang-dir $bbpe_dir \
         --text $lang_dir/text
     fi
 
-    if [ ! -f $lang_dir/text_words_segmentation ]; then
+    if [ ! -f $bbpe_dir/text_words_segmentation ]; then
       python3 ./local/text2segments.py \
         --input-file ./data/lang_char/text \
-        --output-file $lang_dir/text_words_segmentation
-
+        --output-file $bbpe_dir/text_words_segmentation
       cat ../../mls_english/ASR/data/lang/transcript.txt \
-        >> $lang_dir/text_words_segmentation
+        >> $bbpe_dir/text_words_segmentation
     fi
 
-    cat $lang_dir/text_words_segmentation | sed 's/ /\n/g' \
-      | sort -u | sed '/^$/d' | uniq > $lang_dir/words_no_ids.txt
+    if [ ! -f $bbpe_dir/words_no_ids.txt ]; then
+      cat $bbpe_dir/text_words_segmentation | sed 's/ /\n/g' \
+        | sort -u | sed '/^$/d' | uniq > $bbpe_dir/words_no_ids.txt
+    fi
 
-    if [ ! -f $lang_dir/words.txt ]; then
+    if [ ! -f $bbpe_dir/words.txt ]; then
       python3 ./local/prepare_words.py \
-        --input-file $lang_dir/words_no_ids.txt \
-        --output-file $lang_dir/words.txt
+        --input-file $bbpe_dir/words_no_ids.txt \
+        --output-file $bbpe_dir/words.txt
     fi
 
     if [ ! -f $bbpe_dir/bbpe.model ]; then
@@ -169,26 +170,28 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
       --lang-dir $lang_dir \
       --vocab-size $vocab_size \
       --transcript $lang_dir/text \
-      --output-model $bbpe_dir/bbpe.model  # Specify output path
+      --output-model $bbpe_dir/bbpe.model \
+      --input-sentence-size 2000000  # Example: limit to 2 million sentences
     fi
 
-    if [ ! -f $lang_dir/L_disambig.pt ]; then
-      ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
+    if [ ! -f $bbpe_dir/L_disambig.pt ]; then
+      ./local/prepare_lang_bbpe.py --lang-dir $bbpe_dir --vocab-size $vocab_size
 
-      log "Validating $lang_dir/lexicon.txt"
+      log "Validating $bbpe_dir/lexicon.txt"
       ln -svf $(realpath ../../multi_zh_en/ASR/local/validate_bpe_lexicon.py) local/
       ./local/validate_bpe_lexicon.py \
-        --lexicon $lang_dir/lexicon.txt \
-        --bpe-model $bbpe_dir/bbpe.model  # Use the model in the bbpe subdir
+        --lexicon $bbpe_dir/lexicon.txt \
+        --bpe-model $bbpe_dir/bbpe.model
     fi
 
+    # Remove top-level files (if they were created)
     rm -f $lang_dir/lexicon.txt $lang_dir/L_disambig.pt
   done
 
-  # Optionally, create a symlink for consistency if other parts of the recipe expect data/lang/bpe_2000
-  # if [ -d $lang_dir/bbpe_2000 ] && [ ! -e $lang_dir/bpe_2000 ]; then
-  #   ln -s bbpe_2000 $lang_dir/bpe_2000
-  # fi
+  # Optional symlink
+  if [ -d $lang_dir/bbpe_2000 ] && [ ! -e $lang_dir/bpe_2000 ]; then
+    ln -s bbpe_2000 $lang_dir/bpe_2000
+  fi
 fi
 
 log "prepare.sh: PREPARATION DONE"
\ No newline at end of file
diff --git a/egs/multi_ja_en/ASR/zipformer/multi_dataset.py b/egs/multi_ja_en/ASR/zipformer/multi_dataset.py
index 171dccf5b..eb1bd5fae 100644
--- a/egs/multi_ja_en/ASR/zipformer/multi_dataset.py
+++ b/egs/multi_ja_en/ASR/zipformer/multi_dataset.py
@@ -29,12 +29,12 @@ class MultiDataset:
 
         logging.info("Loading Reazonspeech TRAIN set in lazy mode")
         reazonspeech_train_cuts = load_manifest_lazy(
-            self.manifest_dir / "reazonspeech_cuts_train.jsonl.gz"
+            self.manifest_dir / "reazonspeech/reazonspeech_cuts_train.jsonl.gz"
         )
 
         logging.info("Loading MLS English TRAIN set in lazy mode")
         mls_eng_train_cuts = load_manifest_lazy(
-            self.manifest_dir / "mls_eng_cuts_train.jsonl.gz"
+            self.manifest_dir / "mls_english/mls_eng_cuts_train.jsonl.gz"
         )
 
         return CutSet.mux(
@@ -51,12 +51,12 @@ class MultiDataset:
 
         logging.info("Loading Reazonspeech DEV set in lazy mode")
         reazonspeech_dev_cuts = load_manifest_lazy(
-            self.manifest_dir / "reazonspeech_cuts_dev.jsonl.gz"
+            self.manifest_dir / "reazonspeech/reazonspeech_cuts_dev.jsonl.gz"
         )
 
         logging.info("Loading MLS English DEV set in lazy mode")
         mls_eng_dev_cuts = load_manifest_lazy(
-            self.manifest_dir / "mls_eng_cuts_dev.jsonl.gz"
+            self.manifest_dir / "mls_english/mls_eng_cuts_dev.jsonl.gz"
         )
 
         return CutSet.mux(
@@ -73,12 +73,12 @@ class MultiDataset:
 
         logging.info("Loading Reazonspeech TEST set in lazy mode")
         reazonspeech_test_cuts = load_manifest_lazy(
-            self.manifest_dir / "reazonspeech_cuts_test.jsonl.gz"
+            self.manifest_dir / "reazonspeech/reazonspeech_cuts_test.jsonl.gz"
         )
 
         logging.info("Loading MLS English TEST set in lazy mode")
         mls_eng_test_cuts = load_manifest_lazy(
-            self.manifest_dir / "mls_eng_cuts_test.jsonl.gz"
+            self.manifest_dir / "mls_english/mls_eng_cuts_test.jsonl.gz"
         )
 
         return CutSet.mux(
diff --git a/egs/multi_ja_en/ASR/zipformer/train.py b/egs/multi_ja_en/ASR/zipformer/train.py
index 3eb9b94e8..e3e7bfaf2 100755
--- a/egs/multi_ja_en/ASR/zipformer/train.py
+++ b/egs/multi_ja_en/ASR/zipformer/train.py
@@ -327,7 +327,7 @@ def get_parser():
     parser.add_argument(
         "--bpe-model",
         type=str,
-        default="data/lang_bbpe_2000/bbpe.model",
+        default="data/lang/bbpe_2000/bbpe.model",
         help="Path to the BPE model",
     )
 
@@ -1120,7 +1120,7 @@ def run(rank, world_size, args):
 
     # <blk> is defined in local/prepare_lang_char.py
     params.blank_id = sentencepiece_processor.piece_to_id("<blk>")
-    arams.vocab_size = sentencepiece_processor.get_piece_size()
+    params.vocab_size = sentencepiece_processor.get_piece_size()
 
     if not params.use_transducer:
         params.ctc_loss_scale = 1.0
@@ -1393,6 +1393,7 @@ def main():
     MultiDatasetAsrDataModule.add_arguments(parser)
     args = parser.parse_args()
     args.exp_dir = Path(args.exp_dir)
+    print(args)
 
     world_size = args.world_size
     assert world_size >= 1
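
Usage sketch (illustration only, not part of the diff): with the layout this patch introduces, stage 5 of prepare.sh would drive the new scripts roughly as follows, assuming $lang_dir expands to data/lang (as the updated --bpe-model default in train.py suggests) and a vocabulary size of 2000:

  ./local/train_bbpe_model.py \
    --lang-dir data/lang \
    --vocab-size 2000 \
    --transcript data/lang/text \
    --output-model data/lang/bbpe_2000/bbpe.model \
    --input-sentence-size 2000000

  ./local/prepare_lang_bbpe.py \
    --lang-dir data/lang/bbpe_2000 \
    --vocab-size 2000

  # Rewrite feature paths inside the per-dataset cut manifests
  # (expects data/manifests/reazonspeech/ and data/manifests/mls_english/ to exist):
  python3 ./local/utils/update_cutset_paths.py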