from local

This commit is contained in:
dohe0342 2023-02-21 14:35:51 +09:00
parent 4036a73bcc
commit 2639b73933
3 changed files with 23 additions and 49 deletions

Binary file not shown.

View File

@ -172,68 +172,42 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Stage 7: Prepare bigram P"
for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bpe_${vocab_size}
if [ ! -f $lang_dir/transcript_tokens.txt ]; then
./local/convert_transcript_words_to_tokens.py \
--lexicon $lang_dir/lexicon.txt \
--transcript $lang_dir/transcript_words.txt \
--oov "<UNK>" \
> $lang_dir/transcript_tokens.txt
fi
if [ ! -f $lang_dir/P.arpa ]; then
./shared/make_kn_lm.py \
-ngram-order 2 \
-text $lang_dir/transcript_tokens.txt \
-lm $lang_dir/P.arpa
fi
if [ ! -f $lang_dir/P.fst.txt ]; then
python3 -m kaldilm \
--read-symbol-table="$lang_dir/tokens.txt" \
--disambig-symbol='#0' \
--max-order=2 \
$lang_dir/P.arpa > $lang_dir/P.fst.txt
fi
done
fi
if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
log "Stage 8: Prepare G"
log "Stage 7: Prepare G"
# We assume you have install kaldilm, if not, please install
# it using: pip install kaldilm
mkdir -p data/lm
if [ ! -f data/lm/G_3_gram.fst.txt ]; then
if [ ! -f data/lm/G_4_gram_small.fst.txt ]; then
# It is used in building HLG
python3 -m kaldilm \
--read-symbol-table="data/lang_phone/words.txt" \
--disambig-symbol='#0' \
--max-order=3 \
$dl_dir/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt
fi
if [ ! -f data/lm/G_4_gram.fst.txt ]; then
# It is used for LM rescoring
python3 -m kaldilm \
--read-symbol-table="data/lang_phone/words.txt" \
--read-symbol-table="data/lang/words.txt" \
--disambig-symbol='#0' \
--max-order=4 \
$dl_dir/lm/4-gram.arpa > data/lm/G_4_gram.fst.txt
--max-arpa-warnings=-1 \
$dl_dir/lm/4gram_small.arpa > data/lm/G_4_gram_small.fst.txt
fi
if [ ! -f data/lm/G_4_gram_big.fst.txt ]; then
# It is used for LM rescoring
python3 -m kaldilm \
--read-symbol-table="data/lang/words.txt" \
--disambig-symbol='#0' \
--max-order=4 \
--max-arpa-warnings=-1 \
$dl_dir/lm/4gram_big.arpa > data/lm/G_4_gram_big.fst.txt
fi
fi
if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
log "Stage 9: Compile HLG"
./local/compile_hlg.py --lang-dir data/lang_phone
if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
log "Stage 8: Compile HLG"
for vocab_size in ${vocab_sizes[@]}; do
lang_dir=data/lang_bpe_${vocab_size}
./local/compile_hlg.py --lang-dir $lang_dir
if [ ! -f $lang_dir/HLG.pt ]; then
./local/compile_hlg.py \
--lang-dir $lang_dir \
--lm G_4_gram_small
fi
done
fi