From ce894a7ba2a0df51393dc4167defc34cde0c9570 Mon Sep 17 00:00:00 2001
From: Kinan Martin
Date: Wed, 4 Jun 2025 10:12:39 +0900
Subject: [PATCH] Combined updates. Changed BBPE path structure, changed dataset path structure, added script to update cutset paths. WIP

---
 .../ASR/local/prepare_lang_bbpe.py            |  17 ++-
 egs/multi_ja_en/ASR/local/train_bbpe_model.py |  45 ++++++--
 .../ASR/local/utils/update_cutset_paths.py    | 103 ++++++++++++++++++
 .../ASR/local/validate_bpe_lexicon.py         |   2 +-
 egs/multi_ja_en/ASR/prepare.sh                |  45 ++++----
 .../ASR/zipformer/multi_dataset.py            |  12 +-
 egs/multi_ja_en/ASR/zipformer/train.py        |   5 +-
 7 files changed, 185 insertions(+), 44 deletions(-)
 create mode 100644 egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py

diff --git a/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py b/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py
index 6134710ad..31b5c4f2f 100755
--- a/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py
+++ b/egs/multi_ja_en/ASR/local/prepare_lang_bbpe.py
@@ -21,7 +21,7 @@
 This script takes as input `lang_dir`, which should contain::
 
-    - lang_dir/bbpe.model,
+    - lang_dir/bbpe_2000/bbpe.model
     - lang_dir/words.txt
 
 and generates the following files in the directory `lang_dir`:
 
@@ -173,7 +173,8 @@ def get_args():
         "--lang-dir",
         type=str,
         help="""Input and output directory.
-        It should contain the bpe.model and words.txt
+        It should contain the words.txt file and the
+        bbpe model in a subdirectory (e.g., bbpe_2000/bbpe.model).
         """,
     )
 
@@ -184,6 +185,13 @@ def get_args():
         help="The out of vocabulary word in lexicon.",
     )
 
+    parser.add_argument(
+        "--vocab-size",
+        type=int,
+        default=2000,  # Add a default value for vocab_size for consistency
+        help="Vocabulary size used for BPE training (determines the bbpe model directory).",
+    )
+
     parser.add_argument(
         "--debug",
         type=str2bool,
@@ -205,6 +213,9 @@ def main():
     args = get_args()
     lang_dir = Path(args.lang_dir)
     model_file = lang_dir / "bbpe.model"
+
+    if not model_file.is_file():
+        raise FileNotFoundError(f"BPE model not found at: {model_file}")
 
     word_sym_table = k2.SymbolTable.from_file(lang_dir / "words.txt")
 
@@ -216,7 +227,7 @@ def main():
         if w in words:
             words.remove(w)
 
-    lexicon, token_sym_table = generate_lexicon(model_file, words, args.oov)
+    lexicon, token_sym_table = generate_lexicon(str(model_file), words, args.oov)
 
     lexicon_disambig, max_disambig = add_disambig_symbols(lexicon)
 
diff --git a/egs/multi_ja_en/ASR/local/train_bbpe_model.py b/egs/multi_ja_en/ASR/local/train_bbpe_model.py
index d104f2717..e51193f3e 100755
--- a/egs/multi_ja_en/ASR/local/train_bbpe_model.py
+++ b/egs/multi_ja_en/ASR/local/train_bbpe_model.py
@@ -33,7 +33,7 @@ from pathlib import Path
 import sentencepiece as spm
 
 from icefall import byte_encode
-from icefall.utils import tokenize_by_ja_char
+from icefall.utils import str2bool, tokenize_by_ja_char
 
 
 def get_args():
@@ -41,9 +41,7 @@ def get_args():
     parser.add_argument(
         "--lang-dir",
        type=str,
-        help="""Input and output directory.
-        The generated bpe.model is saved to this directory.
- """, + help="""Input directory.""", ) parser.add_argument( @@ -58,6 +56,27 @@ def get_args(): help="Vocabulary size for BPE training", ) + parser.add_argument( + "--output-model", + type=str, + help="Path to save the trained BPE model.", + required=True, + ) + + parser.add_argument( + "--input-sentence-size", + type=int, + default=1000000, # Added default value + help="Maximum number of sentences to load for BPE training.", + ) + + parser.add_argument( + "--shuffle-input-sentence", + type=str2bool, + default=True, # Added default value + help="Whether to shuffle input sentences.", + ) + return parser.parse_args() @@ -71,17 +90,20 @@ def main(): args = get_args() vocab_size = args.vocab_size lang_dir = Path(args.lang_dir) + output_model = Path(args.output_model) + input_sentence_size = args.input_sentence_size + shuffle_input_sentence = args.shuffle_input_sentence model_type = "unigram" - model_prefix = f"{lang_dir}/{model_type}_{vocab_size}" - model_file = Path(model_prefix + ".model") - if model_file.is_file(): - print(f"{model_file} exists - skipping") + model_prefix = str(output_model.parent / f"{model_type}_{vocab_size}") + temp_model_file = Path(model_prefix + ".model") + + if output_model.is_file(): + print(f"{output_model} exists - skipping") return character_coverage = 1.0 - input_sentence_size = 100000000 user_defined_symbols = ["", ""] unk_id = len(user_defined_symbols) @@ -100,6 +122,7 @@ def main(): model_type=model_type, model_prefix=model_prefix, input_sentence_size=input_sentence_size, + shuffle_input_sentence=shuffle_input_sentence, character_coverage=character_coverage, user_defined_symbols=user_defined_symbols, unk_id=unk_id, @@ -107,8 +130,8 @@ def main(): eos_id=-1, ) - shutil.copyfile(model_file, f"{lang_dir}/bbpe.model") + shutil.move(str(temp_model_file), str(output_model)) if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py b/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py new file mode 100644 index 000000000..f1a312e18 --- /dev/null +++ b/egs/multi_ja_en/ASR/local/utils/update_cutset_paths.py @@ -0,0 +1,103 @@ +import logging +from pathlib import Path + +from lhotse import CutSet, load_manifest + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def update_paths(cuts: CutSet, dataset_name: str, old_feature_prefix: str = "data/manifests"): + """ + Updates the storage_path in a CutSet's features to reflect the structure in multi_ja_en. + + Args: + cuts: The Lhotse CutSet to modify. + dataset_name: The name of the dataset (e.g., "reazonspeech", "mls_english") + which corresponds to the new subdirectory for features. + old_feature_prefix: The prefix that the original feature paths were relative to. + This typically corresponds to the root of the manifests dir + in the original recipe. + """ + # updated_cuts = [] + # for cut in cuts: + # if cut.features is not None: + # original_storage_path = Path(cut.features.storage_path) + + # # Check if the path needs updating, i.e., if it's still pointing to the old flat structure + # # and isn't already pointing to the new dataset-specific structure. + # # The `startswith` check on the original path is crucial here. 
+    #         # Example: 'data/manifests/feats_train/feats-12.lca'
+    #         if original_storage_path.parts[0] == old_feature_prefix.split('/')[0] and \
+    #            original_storage_path.parts[1] == old_feature_prefix.split('/')[1] and \
+    #            not original_storage_path.parts[2].startswith(dataset_name):
+
+    #             # Assuming the original feature files were structured like
+    #             # data/manifests/feats_train/some_file.lca
+    #             # We want to change them to data/manifests/reazonspeech/feats_train/some_file.lca
+
+    #             # This gives us 'feats_train/feats-12.lca'
+    #             relative_path_from_old_prefix = original_storage_path.relative_to(old_feature_prefix)
+
+    #             # Construct the new path: data/manifests/<dataset_name>/feats_train/feats-12.lca
+    #             new_storage_path = Path(old_feature_prefix) / dataset_name / relative_path_from_old_prefix
+    #             cut = cut.with_features_path_prefix(cut.features.with_path(str(new_storage_path)))
+    #             updated_cuts.append(cut)
+    #         else:
+    #             updated_cuts.append(cut)  # No features, or not a path we need to modify
+    # return CutSet.from_cuts(updated_cuts)
+    return cuts.with_features_path_prefix(old_feature_prefix + "/" + dataset_name)
+
+if __name__ == "__main__":
+    # The root where the symlinked manifests are located in the multi_ja_en recipe
+    multi_recipe_manifests_root = Path("data/manifests")
+
+    # Define the datasets and their *specific* manifest file prefixes
+    # The keys are the dataset names (which are also the subdirectory names)
+    # The values are the base filename for their cuts (e.g., "reazonspeech_cuts", "mls_eng_cuts")
+    dataset_manifest_prefixes = {
+        "reazonspeech": "reazonspeech_cuts",
+        "mls_english": "mls_eng_cuts",
+    }
+
+    # Define the splits. The script will append "_dev.jsonl.gz", "_train.jsonl.gz", etc.
+    splits = ["train", "dev", "test"]
+
+    # This is the path segment *inside* the original recipe's data/manifests
+    # that your features were stored under.
+    # e.g., if original path was /original/recipe/data/manifests/feats_train/file.lca
+    # then this is 'data/manifests'
+    original_feature_base_path = "data/manifests"
+
+
+    for dataset_name, manifest_prefix in dataset_manifest_prefixes.items():
+        dataset_symlink_dir = multi_recipe_manifests_root / dataset_name
+        if not dataset_symlink_dir.is_dir():
+            logger.warning(f"Dataset symlink directory not found: {dataset_symlink_dir}. Skipping {dataset_name}.")
+            continue
+
+        for split in splits:
+            # Construct the path to the symlinked manifest file
+            manifest_filename = f"{manifest_prefix}_{split}.jsonl.gz"
+            manifest_path = dataset_symlink_dir / manifest_filename
+
+            if manifest_path.is_file():
+                logger.info(f"Processing {dataset_name} {split} cuts from symlink: {manifest_path}")
+                try:
+                    # Load the manifest (Lhotse will follow the symlink)
+                    cuts = load_manifest(manifest_path)
+
+                    # Update the storage_path within the loaded cuts
+                    # The `old_feature_prefix` is still 'data/manifests' as that's what the original
+                    # paths in the underlying manifest refer to.
+                    updated_cuts = update_paths(cuts, dataset_name, old_feature_prefix=original_feature_base_path)
+
+                    # Save the updated cuts back to the *symlinked* path.
+                    # Lhotse will write to the target of the symlink.
+                    updated_cuts.to_file(manifest_path)
+                    logger.info(f"Updated {dataset_name} {split} cuts saved to: {manifest_path}")
+                except Exception as e:
+                    logger.error(f"Error processing {manifest_path}: {e}", exc_info=True)  # Print full traceback
+            else:
+                logger.warning(f"Manifest file not found (symlink target might be missing or file name mismatch): {manifest_path}")
+
+    logger.info("CutSet path updating complete.")
\ No newline at end of file
diff --git a/egs/multi_ja_en/ASR/local/validate_bpe_lexicon.py b/egs/multi_ja_en/ASR/local/validate_bpe_lexicon.py
index 721bb48e7..4e843acf5 120000
--- a/egs/multi_ja_en/ASR/local/validate_bpe_lexicon.py
+++ b/egs/multi_ja_en/ASR/local/validate_bpe_lexicon.py
@@ -1 +1 @@
-../../../librispeech/ASR/local/validate_bpe_lexicon.py
\ No newline at end of file
+/root/icefall/egs/librispeech/ASR/local/validate_bpe_lexicon.py
\ No newline at end of file
diff --git a/egs/multi_ja_en/ASR/prepare.sh b/egs/multi_ja_en/ASR/prepare.sh
index c96e622d0..4c6332683 100755
--- a/egs/multi_ja_en/ASR/prepare.sh
+++ b/egs/multi_ja_en/ASR/prepare.sh
@@ -140,28 +140,29 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
     bbpe_dir=$lang_dir/bbpe_${vocab_size}
     mkdir -p $bbpe_dir
 
-    if [ ! -f $lang_dir/transcript_chars.txt ]; then
+    if [ ! -f $bbpe_dir/transcript_chars.txt ]; then
       ./local/prepare_for_bpe_model.py \
-        --lang-dir ./$lang_dir \
+        --lang-dir $bbpe_dir \
         --text $lang_dir/text
     fi
 
-    if [ ! -f $lang_dir/text_words_segmentation ]; then
+    if [ ! -f $bbpe_dir/text_words_segmentation ]; then
       python3 ./local/text2segments.py \
         --input-file ./data/lang_char/text \
-        --output-file $lang_dir/text_words_segmentation
-
+        --output-file $bbpe_dir/text_words_segmentation
       cat ../../mls_english/ASR/data/lang/transcript.txt \
-        >> $lang_dir/text_words_segmentation
+        >> $bbpe_dir/text_words_segmentation
     fi
 
-    cat $lang_dir/text_words_segmentation | sed 's/ /\n/g' \
-      | sort -u | sed '/^$/d' | uniq > $lang_dir/words_no_ids.txt
+    if [ ! -f $bbpe_dir/words_no_ids.txt ]; then
+      cat $bbpe_dir/text_words_segmentation | sed 's/ /\n/g' \
+        | sort -u | sed '/^$/d' | uniq > $bbpe_dir/words_no_ids.txt
+    fi
 
-    if [ ! -f $lang_dir/words.txt ]; then
+    if [ ! -f $bbpe_dir/words.txt ]; then
       python3 ./local/prepare_words.py \
-        --input-file $lang_dir/words_no_ids.txt \
-        --output-file $lang_dir/words.txt
+        --input-file $bbpe_dir/words_no_ids.txt \
+        --output-file $bbpe_dir/words.txt
     fi
 
     if [ ! -f $bbpe_dir/bbpe.model ]; then
@@ -169,26 +170,28 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
       --lang-dir $lang_dir \
       --vocab-size $vocab_size \
       --transcript $lang_dir/text \
-      --output-model $bbpe_dir/bbpe.model  # Specify output path
+      --output-model $bbpe_dir/bbpe.model \
+      --input-sentence-size 2000000  # Example: limit to 2 million sentences
     fi
 
-    if [ ! -f $lang_dir/L_disambig.pt ]; then
-      ./local/prepare_lang_bbpe.py --lang-dir $lang_dir
+    if [ ! -f $bbpe_dir/L_disambig.pt ]; then
+      ./local/prepare_lang_bbpe.py --lang-dir $bbpe_dir --vocab-size $vocab_size
 
-      log "Validating $lang_dir/lexicon.txt"
+      log "Validating $bbpe_dir/lexicon.txt"
       ln -svf $(realpath ../../multi_zh_en/ASR/local/validate_bpe_lexicon.py) local/
       ./local/validate_bpe_lexicon.py \
-        --lexicon $lang_dir/lexicon.txt \
-        --bpe-model $bbpe_dir/bbpe.model  # Use the model in the bbpe subdir
+        --lexicon $bbpe_dir/lexicon.txt \
+        --bpe-model $bbpe_dir/bbpe.model
     fi
 
+    # Remove top-level files (if they were created)
     rm -f $lang_dir/lexicon.txt $lang_dir/L_disambig.pt
   done
 
-  # Optionally, create a symlink for consistency if other parts of the recipe expect data/lang/bpe_2000
-  # if [ -d $lang_dir/bbpe_2000 ] && [ ! -e $lang_dir/bpe_2000 ]; then
-  #   ln -s bbpe_2000 $lang_dir/bpe_2000
-  # fi
+  # Optional symlink
+  if [ -d $lang_dir/bbpe_2000 ] && [ ! -e $lang_dir/bpe_2000 ]; then
+    ln -s bbpe_2000 $lang_dir/bpe_2000
+  fi
 fi
 
 log "prepare.sh: PREPARATION DONE"
\ No newline at end of file
diff --git a/egs/multi_ja_en/ASR/zipformer/multi_dataset.py b/egs/multi_ja_en/ASR/zipformer/multi_dataset.py
index 171dccf5b..eb1bd5fae 100644
--- a/egs/multi_ja_en/ASR/zipformer/multi_dataset.py
+++ b/egs/multi_ja_en/ASR/zipformer/multi_dataset.py
@@ -29,12 +29,12 @@ class MultiDataset:
 
         logging.info("Loading Reazonspeech TRAIN set in lazy mode")
         reazonspeech_train_cuts = load_manifest_lazy(
-            self.manifest_dir / "reazonspeech_cuts_train.jsonl.gz"
+            self.manifest_dir / "reazonspeech/reazonspeech_cuts_train.jsonl.gz"
         )
 
         logging.info("Loading MLS English TRAIN set in lazy mode")
         mls_eng_train_cuts = load_manifest_lazy(
-            self.manifest_dir / "mls_eng_cuts_train.jsonl.gz"
+            self.manifest_dir / "mls_english/mls_eng_cuts_train.jsonl.gz"
         )
 
         return CutSet.mux(
@@ -51,12 +51,12 @@ class MultiDataset:
 
         logging.info("Loading Reazonspeech DEV set in lazy mode")
         reazonspeech_dev_cuts = load_manifest_lazy(
-            self.manifest_dir / "reazonspeech_cuts_dev.jsonl.gz"
+            self.manifest_dir / "reazonspeech/reazonspeech_cuts_dev.jsonl.gz"
         )
 
         logging.info("Loading MLS English DEV set in lazy mode")
         mls_eng_dev_cuts = load_manifest_lazy(
-            self.manifest_dir / "mls_eng_cuts_dev.jsonl.gz"
+            self.manifest_dir / "mls_english/mls_eng_cuts_dev.jsonl.gz"
         )
 
         return CutSet.mux(
@@ -73,12 +73,12 @@ class MultiDataset:
 
         logging.info("Loading Reazonspeech TEST set in lazy mode")
         reazonspeech_test_cuts = load_manifest_lazy(
-            self.manifest_dir / "reazonspeech_cuts_test.jsonl.gz"
+            self.manifest_dir / "reazonspeech/reazonspeech_cuts_test.jsonl.gz"
         )
 
         logging.info("Loading MLS English TEST set in lazy mode")
         mls_eng_test_cuts = load_manifest_lazy(
-            self.manifest_dir / "mls_eng_cuts_test.jsonl.gz"
+            self.manifest_dir / "mls_english/mls_eng_cuts_test.jsonl.gz"
         )
 
         return CutSet.mux(
diff --git a/egs/multi_ja_en/ASR/zipformer/train.py b/egs/multi_ja_en/ASR/zipformer/train.py
index 3eb9b94e8..e3e7bfaf2 100755
--- a/egs/multi_ja_en/ASR/zipformer/train.py
+++ b/egs/multi_ja_en/ASR/zipformer/train.py
@@ -327,7 +327,7 @@ def get_parser():
     parser.add_argument(
         "--bpe-model",
         type=str,
-        default="data/lang_bbpe_2000/bbpe.model",
+        default="data/lang/bbpe_2000/bbpe.model",
         help="Path to the BPE model",
     )
 
@@ -1120,7 +1120,7 @@ def run(rank, world_size, args):
 
     # <blk> is defined in local/prepare_lang_char.py
     params.blank_id = sentencepiece_processor.piece_to_id("<blk>")
-    arams.vocab_size = sentencepiece_processor.get_piece_size()
+    params.vocab_size = sentencepiece_processor.get_piece_size()
 
     if not params.use_transducer:
         params.ctc_loss_scale = 1.0
@@ -1393,6 +1393,7 @@ def main():
     MultiDatasetAsrDataModule.add_arguments(parser)
     args = parser.parse_args()
     args.exp_dir = Path(args.exp_dir)
+    print(args)
 
     world_size = args.world_size
     assert world_size >= 1
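
Usage sketch (illustration only, not part of the diff): with the layout this patch introduces, stage 5 of prepare.sh would drive the new scripts roughly as follows, assuming $lang_dir expands to data/lang (as the updated --bpe-model default in train.py suggests) and a vocabulary size of 2000:

  ./local/train_bbpe_model.py \
    --lang-dir data/lang \
    --vocab-size 2000 \
    --transcript data/lang/text \
    --output-model data/lang/bbpe_2000/bbpe.model \
    --input-sentence-size 2000000

  ./local/prepare_lang_bbpe.py \
    --lang-dir data/lang/bbpe_2000 \
    --vocab-size 2000

  # Rewrite feature paths inside the per-dataset cut manifests
  # (expects data/manifests/reazonspeech/ and data/manifests/mls_english/ to exist):
  python3 ./local/utils/update_cutset_paths.py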