From a820c86337e0eaf8e93e1bd77eb9a6578e6f19ce Mon Sep 17 00:00:00 2001
From: Yuekai Zhang <zhangyuekai@foxmail.com>
Date: Thu, 7 Jul 2022 11:06:51 +0800
Subject: [PATCH] update prepare char dict

---
 egs/aishell2/ASR/local/prepare_lang.py  |  1 +
 egs/aishell2/ASR/local/prepare_words.py |  1 +
 egs/aishell2/ASR/local/text2segments.py |  1 +
 egs/aishell2/ASR/prepare.sh             | 37 ++++++++++++++++---------
 4 files changed, 27 insertions(+), 13 deletions(-)
 create mode 120000 egs/aishell2/ASR/local/prepare_lang.py
 create mode 120000 egs/aishell2/ASR/local/prepare_words.py
 create mode 120000 egs/aishell2/ASR/local/text2segments.py

diff --git a/egs/aishell2/ASR/local/prepare_lang.py b/egs/aishell2/ASR/local/prepare_lang.py
new file mode 120000
index 000000000..5d88dc1c8
--- /dev/null
+++ b/egs/aishell2/ASR/local/prepare_lang.py
@@ -0,0 +1 @@
+../../../wenetspeech/ASR/local/prepare_lang.py
\ No newline at end of file
diff --git a/egs/aishell2/ASR/local/prepare_words.py b/egs/aishell2/ASR/local/prepare_words.py
new file mode 120000
index 000000000..e58fabb8f
--- /dev/null
+++ b/egs/aishell2/ASR/local/prepare_words.py
@@ -0,0 +1 @@
+../../../wenetspeech/ASR/local/prepare_words.py
\ No newline at end of file
diff --git a/egs/aishell2/ASR/local/text2segments.py b/egs/aishell2/ASR/local/text2segments.py
new file mode 120000
index 000000000..7d68a39c3
--- /dev/null
+++ b/egs/aishell2/ASR/local/text2segments.py
@@ -0,0 +1 @@
+../../../wenetspeech/ASR/local/text2segments.py
\ No newline at end of file
diff --git a/egs/aishell2/ASR/prepare.sh b/egs/aishell2/ASR/prepare.sh
index cffcb0bdd..9b0c0182a 100755
--- a/egs/aishell2/ASR/prepare.sh
+++ b/egs/aishell2/ASR/prepare.sh
@@ -112,30 +112,41 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
 fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 6: Prepare char based lang"
+  log "Stage 5: Prepare char based lang"
   lang_char_dir=data/lang_char
   mkdir -p $lang_char_dir
 
   # Prepare text.
-  grep "\"text\":" data/manifests/aishell2_supervisions_train.jsonl.gz \
-    | sed -e 's/["text:\t ]*//g' | sed 's/,//g' \
-    | ./local/text2token.py -t "char" > $lang_char_dir/text
+  # Note: on Linux, you can install jq with the following commands:
+  # 1. wget -O jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
+  # 2. chmod +x ./jq
+  # 3. cp jq /usr/bin
+  if [ ! -f $lang_char_dir/text ]; then
+    gunzip -c data/manifests/aishell2_supervisions_train.jsonl.gz \
+      | jq '.text' | sed 's/"//g' \
+      | ./local/text2token.py -t "char" > $lang_char_dir/text
+  fi
 
-  # Prepare words.txt
-  grep "\"text\":" data/manifests/aishell2_supervisions_train.jsonl.gz \
-    | sed -e 's/["text:\t]*//g' | sed 's/,//g' \
-    | ./local/text2token.py -t "char" > $lang_char_dir/text_words
+  # Run Chinese word segmentation on the text;
+  # this takes about 15 minutes.
+  # If you can't install paddle-tiny with Python 3.8, please refer to
+  # https://github.com/fxsjy/jieba/issues/920
+  if [ ! -f $lang_char_dir/text_words_segmentation ]; then
+    python3 ./local/text2segments.py \
+      --input-file $lang_char_dir/text \
+      --output-file $lang_char_dir/text_words_segmentation
+  fi
 
-  cat $lang_char_dir/text_words | sed 's/ /\n/g' | sort -u | sed '/^$/d' \
-    | uniq > $lang_char_dir/words_no_ids.txt
+  cat $lang_char_dir/text_words_segmentation | sed 's/ /\n/g' \
+    | sort -u | sed '/^$/d' | uniq > $lang_char_dir/words_no_ids.txt
 
   if [ ! -f $lang_char_dir/words.txt ]; then
-    ./local/prepare_words.py \
-      --input-file $lang_char_dir/words_no_ids.txt
+    python3 ./local/prepare_words.py \
+      --input-file $lang_char_dir/words_no_ids.txt \
       --output-file $lang_char_dir/words.txt
   fi
 
   if [ ! -f $lang_char_dir/L_disambig.pt ]; then
-    ./local/prepare_char.py
+    python3 ./local/prepare_char.py
   fi
 fi
\ No newline at end of file