Normalize transcripts before Cantonese word segmentation.

word_segment_yue.py gains a --lang/-l option (default "yue") and runs every
input line through normalize_text(line, lang) before get_word_segments();
prepare.sh forwards its $lang to the script.

The language code is kept as a plain str (it is not a filesystem path, and a
pathlib.Path never compares equal to a str).

NOTE(review): this patch was rebuilt from a whitespace-flattened copy, so
leading indentation and blank context lines are reconstructed; if a hunk does
not match, apply with `git apply --ignore-whitespace`. The first file's
"index" hash predates the `lang` fix and is informational only for text
patches.

diff --git a/egs/commonvoice/ASR/local/word_segment_yue.py b/egs/commonvoice/ASR/local/word_segment_yue.py
index e5f645d80..55eaf6db1 100755
--- a/egs/commonvoice/ASR/local/word_segment_yue.py
+++ b/egs/commonvoice/ASR/local/word_segment_yue.py
@@ -30,6 +30,7 @@ from pathlib import Path
 from typing import List
 
 import pycantonese
+from preprocess_commonvoice import normalize_text
 from tqdm.auto import tqdm
 
 from icefall.utils import is_cjk
@@ -54,6 +55,13 @@ def get_parser():
         type=str,
         help="The output directory",
     )
+    parser.add_argument(
+        "--lang",
+        "-l",
+        default="yue",
+        type=str,
+        help="The language",
+    )
 
     return parser
 
@@ -102,13 +110,16 @@ if __name__ == "__main__":
 
     input_file = Path(args.input_file)
     output_dir = Path(args.output_dir)
+    # A language code (e.g. "yue"), not a filesystem path: keep it a plain str.
+    lang = args.lang
 
     assert input_file.is_file(), f"{input_file} does not exist"
     assert output_dir.is_dir(), f"{output_dir} does not exist"
 
     lines = input_file.read_text(encoding="utf-8").strip().split("\n")
+    norm_lines = [normalize_text(line, lang) for line in lines]
 
-    text_words_segments = get_word_segments(lines)
+    text_words_segments = get_word_segments(norm_lines)
     with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f:
         f.writelines(text_words_segments)
 
diff --git a/egs/commonvoice/ASR/prepare.sh b/egs/commonvoice/ASR/prepare.sh
index e90deea44..498b51ba3 100755
--- a/egs/commonvoice/ASR/prepare.sh
+++ b/egs/commonvoice/ASR/prepare.sh
@@ -194,7 +194,8 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
       # Get words.txt and words_no_ids.txt
       ./local/word_segment_yue.py \
         --input-file $lang_dir/text \
-        --output-dir $lang_dir
+        --output-dir $lang_dir \
+        --lang $lang
 
       mv $lang_dir/text $lang_dir/_text
       cp $lang_dir/transcript_words.txt $lang_dir/text