From 16f1799ef3d65ee0a84a3c96a24f03f58d68f023 Mon Sep 17 00:00:00 2001
From: wgb14
Date: Sat, 13 Nov 2021 23:59:50 -0500
Subject: [PATCH] support HLG for BPE

---
Reviewer notes (this section is ignored by `git am`):

* local/compile_hlg.py is added as a symlink to the librispeech recipe's
  script.
* Stage 8 now calls ./shared/make_kn_lm.py to build data/lm/3-gram.arpa and
  data/lm/4-gram.arpa from data/lang_phone/transcript_words.txt when they do
  not exist yet.
* G_3_gram.fst.txt is now compiled from data/lm/3-gram.arpa.  The submitted
  revision read "$data/lm/3-gram.arpa"; nothing in this patch sets a `data`
  variable, and both the existence guard and the make_kn_lm.py output use the
  literal relative path, so the stray "$" has been dropped.
* Stage 9 keeps the phone-lexicon compile_hlg.py call commented out, as
  submitted.  NOTE(review): confirm this is intentional; if so, consider
  deleting the line instead of leaving dead code.
* Line breaks, blank context lines and indentation were reconstructed from
  the hunk line counts -- verify against the target tree before applying.
  The post-image blob id on the prepare.sh "index" line predates the "$data"
  fix.

 egs/gigaspeech/ASR/local/compile_hlg.py |  1 +
 egs/gigaspeech/ASR/prepare.sh           | 18 ++++++++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)
 create mode 120000 egs/gigaspeech/ASR/local/compile_hlg.py

diff --git a/egs/gigaspeech/ASR/local/compile_hlg.py b/egs/gigaspeech/ASR/local/compile_hlg.py
new file mode 120000
index 000000000..471aa7fb4
--- /dev/null
+++ b/egs/gigaspeech/ASR/local/compile_hlg.py
@@ -0,0 +1 @@
+../../../librispeech/ASR/local/compile_hlg.py
\ No newline at end of file
diff --git a/egs/gigaspeech/ASR/prepare.sh b/egs/gigaspeech/ASR/prepare.sh
index 47ed2dc5f..19aad1d75 100755
--- a/egs/gigaspeech/ASR/prepare.sh
+++ b/egs/gigaspeech/ASR/prepare.sh
@@ -249,13 +249,27 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
   # it using: pip install kaldilm

   mkdir -p data/lm
+  if [ ! -f data/lm/3-gram.arpa ]; then
+    ./shared/make_kn_lm.py \
+      -ngram-order 3 \
+      -text "data/lang_phone/transcript_words.txt" \
+      -lm data/lm/3-gram.arpa
+  fi
+
   if [ ! -f data/lm/G_3_gram.fst.txt ]; then
     # It is used in building HLG
     python3 -m kaldilm \
       --read-symbol-table="data/lang_phone/words.txt" \
       --disambig-symbol='#0' \
       --max-order=3 \
-      $dl_dir/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt
+      data/lm/3-gram.arpa > data/lm/G_3_gram.fst.txt
+  fi
+
+  if [ ! -f data/lm/4-gram.arpa ]; then
+    ./shared/make_kn_lm.py \
+      -ngram-order 4 \
+      -text "data/lang_phone/transcript_words.txt" \
+      -lm data/lm/4-gram.arpa
   fi

   if [ ! -f data/lm/G_4_gram.fst.txt ]; then
@@ -270,7 +284,7 @@ fi

 if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
   log "Stage 9: Compile HLG"
-  ./local/compile_hlg.py --lang-dir data/lang_phone
+  # ./local/compile_hlg.py --lang-dir data/lang_phone

   for vocab_size in ${vocab_sizes[@]}; do
     lang_dir=data/lang_bpe_${vocab_size}