#!/usr/bin/env bash

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -eou pipefail
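
# Run the upstream data/LM preparation first. Both scripts are sourced
# with ".", so anything they define (e.g. the log function and the
# vocab_sizes array used below) stays available in this script.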
. prepare.sh --stage -1 --stop-stage 6 || exit 1

. prepare_lm.sh --stage 0 --stop-stage 0 || exit 1

log "Running prepare_mmi.sh"
stage=0
stop_stage=100

. shared/parse_options.sh || exit 1

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Prepare bigram token-level P for MMI training"

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}

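    # Convert the word-level transcript to token sequences using the
    # lexicon (out-of-vocabulary words map to <UNK>), so that a
    # token-level bigram LM can be estimated from it.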
    if [ ! -f $lang_dir/transcript_tokens.txt ]; then
      ./local/convert_transcript_words_to_tokens.py \
        --lexicon $lang_dir/lexicon.txt \
        --transcript $lang_dir/transcript_words.txt \
        --oov "<UNK>" \
        > $lang_dir/transcript_tokens.txt
    fi
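
    # Estimate a bigram (order-2) Kneser-Ney LM on the token transcripts;
    # the output P.arpa is in ARPA format.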
    if [ ! -f $lang_dir/P.arpa ]; then
      ./shared/make_kn_lm.py \
        -ngram-order 2 \
        -text $lang_dir/transcript_tokens.txt \
        -lm $lang_dir/P.arpa
    fi
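
    # Compile the ARPA LM into an FST in text form (P.fst.txt) with kaldilm,
    # using the token symbol table and '#0' as the disambiguation symbol.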
    if [ ! -f $lang_dir/P.fst.txt ]; then
      python3 -m kaldilm \
        --read-symbol-table="$lang_dir/tokens.txt" \
        --disambig-symbol='#0' \
        --max-order=2 \
        $lang_dir/P.arpa > $lang_dir/P.fst.txt
    fi
  done
fi