update prepare char dict

This commit is contained in:
Yuekai Zhang 2022-07-07 11:06:51 +08:00
parent 58fdc7aa7e
commit a820c86337
4 changed files with 27 additions and 13 deletions

View File

@ -0,0 +1 @@
../../../wenetspeech/ASR/local/prepare_lang.py

View File

@ -0,0 +1 @@
../../../wenetspeech/ASR/local/prepare_words.py

View File

@ -0,0 +1 @@
../../../wenetspeech/ASR/local/text2segments.py

View File

@ -112,30 +112,41 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
fi fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 6: Prepare char based lang" log "Stage 5: Prepare char based lang"
lang_char_dir=data/lang_char lang_char_dir=data/lang_char
mkdir -p $lang_char_dir mkdir -p $lang_char_dir
# Prepare text. # Prepare text.
grep "\"text\":" data/manifests/aishell2_supervisions_train.jsonl.gz \ # Note: on Linux, you can install jq with the following commands:
| sed -e 's/["text:\t ]*//g' | sed 's/,//g' \ # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
| ./local/text2token.py -t "char" > $lang_char_dir/text # 2. chmod +x ./jq
# 3. cp jq /usr/bin
if [ ! -f $lang_char_dir/text ]; then
gunzip -c data/manifests/aishell2_supervisions_train.jsonl.gz \
| jq '.text' | sed 's/"//g' \
| ./local/text2token.py -t "char" > $lang_char_dir/text
fi
# Prepare words.txt # Run Chinese word segmentation on the text;
grep "\"text\":" data/manifests/aishell2_supervisions_train.jsonl.gz \ # this will take about 15 minutes.
| sed -e 's/["text:\t]*//g' | sed 's/,//g' \ # If you can't install paddle-tiny with Python 3.8, please refer to
| ./local/text2token.py -t "char" > $lang_char_dir/text_words # https://github.com/fxsjy/jieba/issues/920
if [ ! -f $lang_char_dir/text_words_segmentation ]; then
python3 ./local/text2segments.py \
--input-file $lang_char_dir/text \
--output-file $lang_char_dir/text_words_segmentation
fi
cat $lang_char_dir/text_words | sed 's/ /\n/g' | sort -u | sed '/^$/d' \ cat $lang_char_dir/text_words_segmentation | sed 's/ /\n/g' \
| uniq > $lang_char_dir/words_no_ids.txt | sort -u | sed '/^$/d' | uniq > $lang_char_dir/words_no_ids.txt
if [ ! -f $lang_char_dir/words.txt ]; then if [ ! -f $lang_char_dir/words.txt ]; then
./local/prepare_words.py \ python3 ./local/prepare_words.py \
--input-file $lang_char_dir/words_no_ids.txt --input-file $lang_char_dir/words_no_ids.txt \
--output-file $lang_char_dir/words.txt --output-file $lang_char_dir/words.txt
fi fi
if [ ! -f $lang_char_dir/L_disambig.pt ]; then if [ ! -f $lang_char_dir/L_disambig.pt ]; then
./local/prepare_char.py python3 ./local/prepare_char.py
fi fi
fi fi