updated scripts for text

2024-03-12 12:40:44 +08:00 · 2024-03-12 12:40:44 +08:00 · d887bf8c63
commit d887bf8c63
parent d45e4c61e1
2 changed files with 13 additions and 2 deletions
--- a/egs/commonvoice/ASR/local/word_segment_yue.py
+++ b/egs/commonvoice/ASR/local/word_segment_yue.py
@@ -30,6 +30,7 @@ from pathlib import Path
 from typing import List

 import pycantonese
+from preprocess_commonvoice import normalize_text
 from tqdm.auto import tqdm

 from icefall.utils import is_cjk
@@ -54,6 +55,13 @@ def get_parser():
        type=str,
        help="The output directory",
    )
+    parser.add_argument(
+        "--lang",
+        "-l",
+        default="yue",
+        type=str,
+        help="The language",
+    )
    return parser


@@ -102,13 +110,15 @@ if __name__ == "__main__":

    input_file = Path(args.input_file)
    output_dir = Path(args.output_dir)
+    lang = args.lang  # language code string (e.g. "yue"), not a filesystem path

    assert input_file.is_file(), f"{input_file} does not exist"
    assert output_dir.is_dir(), f"{output_dir} does not exist"

    lines = input_file.read_text(encoding="utf-8").strip().split("\n")
+    norm_lines = [normalize_text(line, lang) for line in lines]

-    text_words_segments = get_word_segments(lines)
+    text_words_segments = get_word_segments(norm_lines)
    with open(output_dir / "transcript_words.txt", "w+", encoding="utf-8") as f:
        f.writelines(text_words_segments)

--- a/egs/commonvoice/ASR/prepare.sh
+++ b/egs/commonvoice/ASR/prepare.sh
@@ -194,7 +194,8 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
          # Get words.txt and words_no_ids.txt
          ./local/word_segment_yue.py \
            --input-file $lang_dir/text \
-            --output-dir $lang_dir
+            --output-dir $lang_dir \
+            --lang $lang

          mv $lang_dir/text $lang_dir/_text
          cp $lang_dir/transcript_words.txt $lang_dir/text