mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
fix stage 5 output pathing
This commit is contained in:
parent
daff070d68
commit
e6615df4eb
@ -112,9 +112,9 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||||
log "Stage 5: Prepare Byte BPE based lang"
|
log "Stage 5: Prepare Byte BPE based lang in data/lang"
|
||||||
|
lang_dir=data/lang
|
||||||
|
|
||||||
# Check if required char-based lang data exists
|
# Check if required char-based lang data exists
|
||||||
if [ ! -d ../../reazonspeech/ASR/data/lang_char ] && [ ! -d ./data/lang_char ]; then
|
if [ ! -d ../../reazonspeech/ASR/data/lang_char ] && [ ! -d ./data/lang_char ]; then
|
||||||
@ -123,25 +123,22 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Check if BPE data from MLS English exists
|
# Check if BPE data from MLS English exists
|
||||||
if [ ! -d ../../mls_english/ASR/data/lang ] ; then
|
if [ ! -d ../../mls_english/ASR/data/lang/bpe_2000 ] || [ ! -f ../../mls_english/ASR/data/lang/transcript.txt ]; then
|
||||||
log "Abort! Please run ../../mls_english/ASR/prepare.sh --stage 3 --stop-stage 3"
|
log "Abort! Please ensure ../../mls_english/ASR/data/lang/bpe_2000 and ../../mls_english/ASR/data/lang/transcript.txt exist."
|
||||||
|
log "Please run ../../mls_english/ASR/prepare.sh --stage 3 --stop-stage 3 if you haven't already."
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
cd data/
|
# Create the target lang directory if it doesn't exist
|
||||||
# Symlink the full lang directory from MLS English (which includes bpe_2000/ and transcript.txt)
|
mkdir -p $lang_dir
|
||||||
if [ ! -d ./lang_bpe_2000 ]; then
|
|
||||||
ln -svf $(realpath ../../../mls_english/ASR/data/lang) lang_bpe_2000
|
# Combine Japanese char-level text and English BPE transcript
|
||||||
fi
|
cat data/lang_char/text ../../mls_english/ASR/data/lang/transcript.txt \
|
||||||
cd ../
|
> $lang_dir/text
|
||||||
|
|
||||||
for vocab_size in ${vocab_sizes[@]}; do
|
for vocab_size in ${vocab_sizes[@]}; do
|
||||||
lang_dir=data/lang_bbpe_${vocab_size}
|
bbpe_dir=$lang_dir/bbpe_${vocab_size}
|
||||||
mkdir -p $lang_dir
|
mkdir -p $bbpe_dir
|
||||||
|
|
||||||
# Combine Japanese char-level text and English BPE transcript
|
|
||||||
cat data/lang_char/text data/lang_bpe_2000/transcript.txt \
|
|
||||||
> $lang_dir/text
|
|
||||||
|
|
||||||
if [ ! -f $lang_dir/transcript_chars.txt ]; then
|
if [ ! -f $lang_dir/transcript_chars.txt ]; then
|
||||||
./local/prepare_for_bpe_model.py \
|
./local/prepare_for_bpe_model.py \
|
||||||
@ -154,7 +151,7 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
--input-file ./data/lang_char/text \
|
--input-file ./data/lang_char/text \
|
||||||
--output-file $lang_dir/text_words_segmentation
|
--output-file $lang_dir/text_words_segmentation
|
||||||
|
|
||||||
cat ./data/lang_bpe_2000/transcript.txt \
|
cat ../../mls_english/ASR/data/lang/transcript.txt \
|
||||||
>> $lang_dir/text_words_segmentation
|
>> $lang_dir/text_words_segmentation
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -167,11 +164,12 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
--output-file $lang_dir/words.txt
|
--output-file $lang_dir/words.txt
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -f $lang_dir/bbpe.model ]; then
|
if [ ! -f $bbpe_dir/bbpe.model ]; then
|
||||||
./local/train_bbpe_model.py \
|
./local/train_bbpe_model.py \
|
||||||
--lang-dir $lang_dir \
|
--lang-dir $lang_dir \
|
||||||
--vocab-size $vocab_size \
|
--vocab-size $vocab_size \
|
||||||
--transcript $lang_dir/text
|
--transcript $lang_dir/text \
|
||||||
|
--output-model $bbpe_dir/bbpe.model # Specify output path
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
if [ ! -f $lang_dir/L_disambig.pt ]; then
|
||||||
@ -181,9 +179,16 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|||||||
ln -svf $(realpath ../../multi_zh_en/ASR/local/validate_bpe_lexicon.py) local/
|
ln -svf $(realpath ../../multi_zh_en/ASR/local/validate_bpe_lexicon.py) local/
|
||||||
./local/validate_bpe_lexicon.py \
|
./local/validate_bpe_lexicon.py \
|
||||||
--lexicon $lang_dir/lexicon.txt \
|
--lexicon $lang_dir/lexicon.txt \
|
||||||
--bpe-model $lang_dir/bbpe.model
|
--bpe-model $bbpe_dir/bbpe.model # Use the model in the bbpe subdir
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
rm -f $lang_dir/lexicon.txt $lang_dir/L_disambig.pt
|
||||||
done
|
done
|
||||||
|
|
||||||
|
# Optionally, create a symlink for consistency if other parts of the recipe expect data/lang/bpe_2000
|
||||||
|
# if [ -d $lang_dir/bbpe_2000 ] && [ ! -e $lang_dir/bpe_2000 ]; then
|
||||||
|
# ln -s bbpe_2000 $lang_dir/bpe_2000
|
||||||
|
# fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
log "prepare.sh: PREPARATION DONE"
|
log "prepare.sh: PREPARATION DONE"
|
Loading…
x
Reference in New Issue
Block a user