diff --git a/egs/commonvoice/ASR/local/word_segment_yue.py b/egs/commonvoice/ASR/local/word_segment_yue.py
index e5f645d80..55eaf6db1 100755
--- a/egs/commonvoice/ASR/local/word_segment_yue.py
+++ b/egs/commonvoice/ASR/local/word_segment_yue.py
@@ -30,6 +30,7 @@ from pathlib import Path
 from typing import List
 
 import pycantonese
+from preprocess_commonvoice import normalize_text
 from tqdm.auto import tqdm
 
 from icefall.utils import is_cjk
@@ -54,6 +55,13 @@ def get_parser():
         type=str,
         help="The output directory",
     )
+    parser.add_argument(
+        "--lang",
+        "-l",
+        default="yue",
+        type=str,
+        help="The language",
+    )
     return parser
 
 
@@ -102,13 +110,15 @@ if __name__ == "__main__":
 
     input_file = Path(args.input_file)
     output_dir = Path(args.output_dir)
+    lang = args.lang  # plain language code (e.g. "yue"), not a filesystem path
 
     assert input_file.is_file(), f"{input_file} does not exist"
     assert output_dir.is_dir(), f"{output_dir} does not exist"
 
     lines = input_file.read_text(encoding="utf-8").strip().split("\n")
+    norm_lines = [normalize_text(line, lang) for line in lines]
 
-    text_words_segments = get_word_segments(lines)
+    text_words_segments = get_word_segments(norm_lines)
     with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f:
         f.writelines(text_words_segments)
 
diff --git a/egs/commonvoice/ASR/prepare.sh b/egs/commonvoice/ASR/prepare.sh
index e90deea44..498b51ba3 100755
--- a/egs/commonvoice/ASR/prepare.sh
+++ b/egs/commonvoice/ASR/prepare.sh
@@ -194,7 +194,8 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
           # Get words.txt and words_no_ids.txt
           ./local/word_segment_yue.py \
             --input-file $lang_dir/text \
-            --output-dir $lang_dir
+            --output-dir $lang_dir \
+            --lang $lang
 
           mv $lang_dir/text $lang_dir/_text
           cp $lang_dir/transcript_words.txt $lang_dir/text