From daff070d6849ceab9cd8637566614cd5f1e84a10 Mon Sep 17 00:00:00 2001 From: Kinan Martin Date: Thu, 15 May 2025 07:24:26 +0900 Subject: [PATCH] restore version of mls_english compute_fbank_mls_english.py and prepare.sh from commit 547f5c5 --- .../ASR/local/compute_fbank_mls_english.py | 6 +++++- egs/mls_english/ASR/prepare.sh | 13 ++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/egs/mls_english/ASR/local/compute_fbank_mls_english.py b/egs/mls_english/ASR/local/compute_fbank_mls_english.py index 8c5cae842..e9bd81551 100644 --- a/egs/mls_english/ASR/local/compute_fbank_mls_english.py +++ b/egs/mls_english/ASR/local/compute_fbank_mls_english.py @@ -120,6 +120,11 @@ def main(): cut_sets = make_cutset_blueprints(mls_eng_hf_dataset_path) for part, cut_set in cut_sets: logging.info(f"Processing {part}") + cut_set = cut_set.save_audios( + num_jobs=num_jobs, + storage_path=(args.audio_dir / part).as_posix(), + ) # makes new cutset that loads audio from paths to actual audio files + cut_set = cut_set.compute_and_store_features( extractor=extractor, num_jobs=num_jobs, @@ -127,7 +132,6 @@ def main(): storage_type=LilcomChunkyWriter, ) - cut_set = cut_set.save_audios(args.audio_dir / part) # makes new cutset that uses paths to actual audio files cut_set.to_file(args.manifest_dir / f"mls_eng_cuts_{part}.jsonl.gz") logging.info("All fbank computed for MLS English.") diff --git a/egs/mls_english/ASR/prepare.sh b/egs/mls_english/ASR/prepare.sh index ad1c10080..14ca69dae 100755 --- a/egs/mls_english/ASR/prepare.sh +++ b/egs/mls_english/ASR/prepare.sh @@ -60,9 +60,9 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then --audio-dir data/audio \ --dl-dir $dl_dir/mls_english # --dl-dir /root/datasets/parler-tts--mls_eng - python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_train.jsonl.gz - python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_dev.jsonl.gz - python local/validate_manifest.py --manifest data/manifests/mls_english_cuts_test.jsonl.gz + python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_train.jsonl.gz + python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_dev.jsonl.gz + python local/validate_manifest.py --manifest data/manifests/mls_eng_cuts_test.jsonl.gz touch data/manifests/.mls_english-validated.done fi fi @@ -71,7 +71,10 @@ if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then log "Stage 2: Prepare transcript for BPE training" if [ ! -f data/lang/transcript.txt ]; then log "Generating transcripts for BPE training" - ./local/utils/generate_transcript.py --lang-dir data/lang + python local/utils/generate_transcript.py \ + --dataset-path $dl_dir/mls_english \ + --lang-dir data/lang \ + --split train fi fi @@ -83,7 +86,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then mkdir -p $bpe_dir if [ ! -f $bpe_dir/bpe.model ]; then - ./local/train_bpe_model.py \ + python local/train_bpe_model.py \ --lang-dir $bpe_dir \ --vocab-size $vocab_size \ --transcript data/lang/transcript.txt