diff --git a/egs/tedlium2/ASR/.prepare.sh.swp b/egs/tedlium2/ASR/.prepare.sh.swp index 1b5b08a23..ab728ea20 100644 Binary files a/egs/tedlium2/ASR/.prepare.sh.swp and b/egs/tedlium2/ASR/.prepare.sh.swp differ diff --git a/egs/tedlium2/ASR/.prepare_ted3.sh.swp b/egs/tedlium2/ASR/.prepare_ted3.sh.swp index dba02ec97..53d4ccd3c 100644 Binary files a/egs/tedlium2/ASR/.prepare_ted3.sh.swp and b/egs/tedlium2/ASR/.prepare_ted3.sh.swp differ diff --git a/egs/tedlium2/ASR/prepare.sh b/egs/tedlium2/ASR/prepare.sh index 31e0ac32e..0d8c04923 100755 --- a/egs/tedlium2/ASR/prepare.sh +++ b/egs/tedlium2/ASR/prepare.sh @@ -172,68 +172,42 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then fi if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then - log "Stage 7: Prepare bigram P" - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - - if [ ! -f $lang_dir/transcript_tokens.txt ]; then - ./local/convert_transcript_words_to_tokens.py \ - --lexicon $lang_dir/lexicon.txt \ - --transcript $lang_dir/transcript_words.txt \ - --oov "" \ - > $lang_dir/transcript_tokens.txt - fi - - if [ ! -f $lang_dir/P.arpa ]; then - ./shared/make_kn_lm.py \ - -ngram-order 2 \ - -text $lang_dir/transcript_tokens.txt \ - -lm $lang_dir/P.arpa - fi - - if [ ! -f $lang_dir/P.fst.txt ]; then - python3 -m kaldilm \ - --read-symbol-table="$lang_dir/tokens.txt" \ - --disambig-symbol='#0' \ - --max-order=2 \ - $lang_dir/P.arpa > $lang_dir/P.fst.txt - fi - done -fi - -if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then - log "Stage 8: Prepare G" + log "Stage 7: Prepare G" # We assume you have install kaldilm, if not, please install # it using: pip install kaldilm mkdir -p data/lm - if [ ! -f data/lm/G_3_gram.fst.txt ]; then + if [ ! -f data/lm/G_4_gram_small.fst.txt ]; then # It is used in building HLG python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $dl_dir/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt - fi - - if [ ! -f data/lm/G_4_gram.fst.txt ]; then - # It is used for LM rescoring - python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ + --read-symbol-table="data/lang/words.txt" \ --disambig-symbol='#0' \ --max-order=4 \ - $dl_dir/lm/4-gram.arpa > data/lm/G_4_gram.fst.txt + --max-arpa-warnings=-1 \ + $dl_dir/lm/4gram_small.arpa > data/lm/G_4_gram_small.fst.txt + fi + + if [ ! -f data/lm/G_4_gram_big.fst.txt ]; then + # It is used for LM rescoring + python3 -m kaldilm \ + --read-symbol-table="data/lang/words.txt" \ + --disambig-symbol='#0' \ + --max-order=4 \ + --max-arpa-warnings=-1 \ + $dl_dir/lm/4gram_big.arpa > data/lm/G_4_gram_big.fst.txt fi fi -if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then - log "Stage 9: Compile HLG" - ./local/compile_hlg.py --lang-dir data/lang_phone +if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then + log "Stage 8: Compile HLG" for vocab_size in ${vocab_sizes[@]}; do lang_dir=data/lang_bpe_${vocab_size} - ./local/compile_hlg.py --lang-dir $lang_dir + + if [ ! -f $lang_dir/HLG.pt ]; then + ./local/compile_hlg.py \ + --lang-dir $lang_dir \ + --lm G_4_gram_small + fi done fi -