From e69e1c04b2e824ffda04ac2bdcb02db6e2fd75e2 Mon Sep 17 00:00:00 2001
From: Kinan Martin
Date: Wed, 16 Apr 2025 07:15:25 +0900
Subject: [PATCH] separate transcript prep stage from bpe train stage

---
 egs/mls_english/ASR/prepare.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/egs/mls_english/ASR/prepare.sh b/egs/mls_english/ASR/prepare.sh
index 27aaa5e37..eb42510b9 100644
--- a/egs/mls_english/ASR/prepare.sh
+++ b/egs/mls_english/ASR/prepare.sh
@@ -45,12 +45,15 @@ mkdir -p data/lang
 lang_dir=data/lang
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Prepare BPE tokenizer"
-
+  log "Stage 1: Prepare transcript for BPE training"
   if [ ! -f $lang_dir/transcript.txt ]; then
     log "Generating transcripts for BPE training"
     ./local/utils/generate_transcript.py --lang-dir $lang_dir
   fi
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Prepare BPE tokenizer"
 
   for vocab_size in ${vocab_sizes[@]}; do
     log "Training BPE model with vocab_size=${vocab_size}"