From a820c86337e0eaf8e93e1bd77eb9a6578e6f19ce Mon Sep 17 00:00:00 2001 From: Yuekai Zhang Date: Thu, 7 Jul 2022 11:06:51 +0800 Subject: [PATCH] update prepare char dict --- egs/aishell2/ASR/local/prepare_lang.py | 1 + egs/aishell2/ASR/local/prepare_words.py | 1 + egs/aishell2/ASR/local/text2segments.py | 1 + egs/aishell2/ASR/prepare.sh | 37 ++++++++++++++++--------- 4 files changed, 27 insertions(+), 13 deletions(-) create mode 120000 egs/aishell2/ASR/local/prepare_lang.py create mode 120000 egs/aishell2/ASR/local/prepare_words.py create mode 120000 egs/aishell2/ASR/local/text2segments.py diff --git a/egs/aishell2/ASR/local/prepare_lang.py b/egs/aishell2/ASR/local/prepare_lang.py new file mode 120000 index 000000000..5d88dc1c8 --- /dev/null +++ b/egs/aishell2/ASR/local/prepare_lang.py @@ -0,0 +1 @@ +../../../wenetspeech/ASR/local/prepare_lang.py \ No newline at end of file diff --git a/egs/aishell2/ASR/local/prepare_words.py b/egs/aishell2/ASR/local/prepare_words.py new file mode 120000 index 000000000..e58fabb8f --- /dev/null +++ b/egs/aishell2/ASR/local/prepare_words.py @@ -0,0 +1 @@ +../../../wenetspeech/ASR/local/prepare_words.py \ No newline at end of file diff --git a/egs/aishell2/ASR/local/text2segments.py b/egs/aishell2/ASR/local/text2segments.py new file mode 120000 index 000000000..7d68a39c3 --- /dev/null +++ b/egs/aishell2/ASR/local/text2segments.py @@ -0,0 +1 @@ +../../../wenetspeech/ASR/local/text2segments.py \ No newline at end of file diff --git a/egs/aishell2/ASR/prepare.sh b/egs/aishell2/ASR/prepare.sh index cffcb0bdd..9b0c0182a 100755 --- a/egs/aishell2/ASR/prepare.sh +++ b/egs/aishell2/ASR/prepare.sh @@ -112,30 +112,41 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then fi if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 6: Prepare char based lang" + log "Stage 5: Prepare char based lang" lang_char_dir=data/lang_char mkdir -p $lang_char_dir # Prepare text. 
- grep "\"text\":" data/manifests/aishell2_supervisions_train.jsonl.gz \ - | sed -e 's/["text:\t ]*//g' | sed 's/,//g' \ - | ./local/text2token.py -t "char" > $lang_char_dir/text + # Note: in Linux, you can install jq with the following commands: + # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 + # 2. chmod +x ./jq + # 3. cp jq /usr/bin + if [ ! -f $lang_char_dir/text ]; then + gunzip -c data/manifests/aishell2_supervisions_train.jsonl.gz \ + | jq '.text' | sed 's/"//g' \ + | ./local/text2token.py -t "char" > $lang_char_dir/text + fi - # Prepare words.txt - grep "\"text\":" data/manifests/aishell2_supervisions_train.jsonl.gz \ - | sed -e 's/["text:\t]*//g' | sed 's/,//g' \ - | ./local/text2token.py -t "char" > $lang_char_dir/text_words + # Perform Chinese word segmentation on the text; + # this will take about 15 minutes. + # If you can't install paddle-tiny with Python 3.8, please refer to + # https://github.com/fxsjy/jieba/issues/920 + if [ ! -f $lang_char_dir/text_words_segmentation ]; then + python3 ./local/text2segments.py \ + --input-file $lang_char_dir/text \ + --output-file $lang_char_dir/text_words_segmentation + fi - cat $lang_char_dir/text_words | sed 's/ /\n/g' | sort -u | sed '/^$/d' \ - | uniq > $lang_char_dir/words_no_ids.txt + cat $lang_char_dir/text_words_segmentation | sed 's/ /\n/g' \ + | sort -u | sed '/^$/d' | uniq > $lang_char_dir/words_no_ids.txt if [ ! -f $lang_char_dir/words.txt ]; then - ./local/prepare_words.py \ - --input-file $lang_char_dir/words_no_ids.txt + python3 ./local/prepare_words.py \ + --input-file $lang_char_dir/words_no_ids.txt \ --output-file $lang_char_dir/words.txt fi if [ ! -f $lang_char_dir/L_disambig.pt ]; then - ./local/prepare_char.py + python3 ./local/prepare_char.py fi fi \ No newline at end of file