fix stage 5 output pathing

This commit is contained in:
Kinan Martin 2025-05-15 09:11:40 +09:00
parent daff070d68
commit e6615df4eb

View File

@@ -112,9 +112,9 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
fi fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Prepare Byte BPE based lang" log "Stage 5: Prepare Byte BPE based lang in data/lang"
lang_dir=data/lang
# Check if required char-based lang data exists # Check if required char-based lang data exists
if [ ! -d ../../reazonspeech/ASR/data/lang_char ] && [ ! -d ./data/lang_char ]; then if [ ! -d ../../reazonspeech/ASR/data/lang_char ] && [ ! -d ./data/lang_char ]; then
@@ -123,25 +123,22 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
fi fi
# Check if BPE data from MLS English exists # Check if BPE data from MLS English exists
if [ ! -d ../../mls_english/ASR/data/lang ] ; then if [ ! -d ../../mls_english/ASR/data/lang/bpe_2000 ] || [ ! -f ../../mls_english/ASR/data/lang/transcript.txt ]; then
log "Abort! Please run ../../mls_english/ASR/prepare.sh --stage 3 --stop-stage 3" log "Abort! Please ensure ../../mls_english/ASR/data/lang/bpe_2000 and ../../mls_english/ASR/data/lang/transcript.txt exist."
log "Please run ../../mls_english/ASR/prepare.sh --stage 3 --stop-stage 3 if you haven't already."
exit 1 exit 1
fi fi
cd data/ # Create the target lang directory if it doesn't exist
# Symlink the full lang directory from MLS English (which includes bpe_2000/ and transcript.txt) mkdir -p $lang_dir
if [ ! -d ./lang_bpe_2000 ]; then
ln -svf $(realpath ../../../mls_english/ASR/data/lang) lang_bpe_2000 # Combine Japanese char-level text and English BPE transcript
fi cat data/lang_char/text ../../mls_english/ASR/data/lang/transcript.txt \
cd ../ > $lang_dir/text
for vocab_size in ${vocab_sizes[@]}; do for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bbpe_${vocab_size} bbpe_dir=$lang_dir/bbpe_${vocab_size}
mkdir -p $lang_dir mkdir -p $bbpe_dir
# Combine Japanese char-level text and English BPE transcript
cat data/lang_char/text data/lang_bpe_2000/transcript.txt \
> $lang_dir/text
if [ ! -f $lang_dir/transcript_chars.txt ]; then if [ ! -f $lang_dir/transcript_chars.txt ]; then
./local/prepare_for_bpe_model.py \ ./local/prepare_for_bpe_model.py \
@@ -154,7 +151,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
--input-file ./data/lang_char/text \ --input-file ./data/lang_char/text \
--output-file $lang_dir/text_words_segmentation --output-file $lang_dir/text_words_segmentation
cat ./data/lang_bpe_2000/transcript.txt \ cat ../../mls_english/ASR/data/lang/transcript.txt \
>> $lang_dir/text_words_segmentation >> $lang_dir/text_words_segmentation
fi fi
@@ -167,11 +164,12 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
--output-file $lang_dir/words.txt --output-file $lang_dir/words.txt
fi fi
if [ ! -f $lang_dir/bbpe.model ]; then if [ ! -f $bbpe_dir/bbpe.model ]; then
./local/train_bbpe_model.py \ ./local/train_bbpe_model.py \
--lang-dir $lang_dir \ --lang-dir $lang_dir \
--vocab-size $vocab_size \ --vocab-size $vocab_size \
--transcript $lang_dir/text --transcript $lang_dir/text \
--output-model $bbpe_dir/bbpe.model # Specify output path
fi fi
if [ ! -f $lang_dir/L_disambig.pt ]; then if [ ! -f $lang_dir/L_disambig.pt ]; then
@@ -181,9 +179,16 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
ln -svf $(realpath ../../multi_zh_en/ASR/local/validate_bpe_lexicon.py) local/ ln -svf $(realpath ../../multi_zh_en/ASR/local/validate_bpe_lexicon.py) local/
./local/validate_bpe_lexicon.py \ ./local/validate_bpe_lexicon.py \
--lexicon $lang_dir/lexicon.txt \ --lexicon $lang_dir/lexicon.txt \
--bpe-model $lang_dir/bbpe.model --bpe-model $bbpe_dir/bbpe.model # Use the model in the bbpe subdir
fi fi
rm -f $lang_dir/lexicon.txt $lang_dir/L_disambig.pt
done done
# Optionally, create a symlink for consistency if other parts of the recipe expect data/lang/bpe_2000
# if [ -d $lang_dir/bbpe_2000 ] && [ ! -e $lang_dir/bpe_2000 ]; then
# ln -s bbpe_2000 $lang_dir/bpe_2000
# fi
fi fi
log "prepare.sh: PREPARATION DONE" log "prepare.sh: PREPARATION DONE"