mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 18:12:19 +00:00
104 lines
2.7 KiB
Bash
Executable File
104 lines
2.7 KiB
Bash
Executable File
#!/usr/bin/env bash

# Data preparation for an SLU recipe, run as five numbered stages:
#   1: manifests, 2: fbank features, 3: lexicon, 4: unigram LMs + lang files,
#   5: HLG compilation.
# Stage N runs only when stage <= N <= stop_stage.

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# Abort on command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -euo pipefail

# First and last stage to run (both inclusive).
stage=1
stop_stage=5

# Dataset location (the default is a placeholder path); used by stages 1 and 3.
data_dir=path/to/fluent/speech/commands
# Root directory under which the output directories below are placed.
target_root_dir=data/

# NOTE(review): these paths are derived before parse_options.sh is sourced.
# If that script lets callers override target_root_dir (presumably Kaldi-style
# `--name value` handling -- confirm), the paths below still use the default
# root unless they are overridden individually.
lang_dir=${target_root_dir}/lang_phone
lm_dir=${target_root_dir}/lm
manifest_dir=${target_root_dir}/manifests
fbanks_dir=${target_root_dir}/fbanks

. shared/parse_options.sh || exit 1

# Quoted so that directories containing spaces or glob characters work.
mkdir -p "$lang_dir"
mkdir -p "$lm_dir"

#######################################
# Print a timestamped message tagged with the caller's location.
# Arguments: the message words; they are joined via "$*", and backslash
#            escapes in them are interpreted because of `echo -e`.
# Outputs:   "<YYYY-mm-dd HH:MM:SS> (<file>:<line>:<function>) <message>"
#            on stdout.
#######################################
log() {
  # This function is from espnet
  # Source file of the caller, with any leading directories stripped.
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "data_dir: $data_dir"

# Stage 1: run `lhotse prepare slu` on the dataset, writing into $manifest_dir.
# All expansions are quoted so a dataset path containing spaces or glob
# characters is passed through as a single argument.
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare slu manifest"
  mkdir -p "$manifest_dir"
  lhotse prepare slu "$data_dir" "$manifest_dir"
fi

# Stage 2: run local/compute_fbank_slu.py with the manifest directory and the
# fbank output directory as its two arguments.
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Compute fbank for SLU"
  mkdir -p "$fbanks_dir"
  python ./local/compute_fbank_slu.py "$manifest_dir" "$fbanks_dir"
fi

# Stage 3: run local/generate_lexicon.py with the dataset directory and
# $lm_dir as its two arguments.
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Prepare lang"
  # NOTE: "<UNK> SIL" is added for implementation convenience
  # as the graph compiler code requires that there is an OOV word
  # in the lexicon.
  python ./local/generate_lexicon.py "$data_dir" "$lm_dir"
fi

# Stage 4: build unigram ARPA LMs for the "transcript" and "frames" word
# lists, run prepare_lang.py, convert each ARPA file to G_*.fst.txt with
# kaldilm, then move each variant's files into $lm_dir/<variant>/ with the
# "_<variant>" suffix stripped from the file names.
if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Train LM"

  # We use a unigram G
  for kind in transcript frames; do
    ./shared/make_kn_lm.py \
      -ngram-order 1 \
      -text "$lm_dir/words_${kind}.txt" \
      -lm "$lm_dir/G_${kind}.arpa"
  done

  python ./local/prepare_lang.py "$lm_dir"

  # NOTE(review): the guard tests the pre-move location; the file is moved
  # into $lm_dir/<variant>/ below, so after a successful run the guarded path
  # no longer exists and a re-run converts again.
  for kind in transcript frames; do
    if [ ! -f "$lm_dir/G_${kind}.fst.txt" ]; then
      python -m kaldilm \
        --read-symbol-table="$lm_dir/words_${kind}.txt" \
        "$lm_dir/G_${kind}.arpa" > "$lm_dir/G_${kind}.fst.txt"
    fi
  done

  mkdir -p "$lm_dir/frames"
  mkdir -p "$lm_dir/transcript"

  # Grant the same permission bits as before (rwx for everyone), but only on
  # the LM directory this stage works in. The original recursed over ".",
  # which made every file in the working directory world-writable and
  # executable, and would abort the script (set -e) on any file there that
  # the current user may not chmod.
  chmod -R a+rwx -- "$lm_dir"

  for kind in frames transcript; do
    for src in \
      "G_${kind}.arpa" \
      "G_${kind}.fst.txt" \
      "L_disambig_${kind}.pt" \
      "L_${kind}.pt" \
      "lexicon_disambig_${kind}.txt" \
      "tokens_${kind}.txt" \
      "words_${kind}.txt"; do
      # Drop the "_<variant>" suffix, e.g. G_frames.arpa -> G.arpa.
      dst=${src//"_${kind}"/}
      mv -- "$lm_dir/$src" "$lm_dir/$kind/$dst"
    done
  done
fi

# Stage 5: run local/compile_hlg.py once per variant directory populated by
# stage 4 (frames, then transcript).
if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
  log "Stage 5: Compile HLG"
  ./local/compile_hlg.py --lang-dir "$lm_dir/frames"
  ./local/compile_hlg.py --lang-dir "$lm_dir/transcript"
fi