updated scripts for text norm

jinzr 2024-03-13 10:57:59 +08:00
parent 09a358a23e
commit b30a4d6162
2 changed files with 20 additions and 7 deletions


@@ -55,6 +55,7 @@ def normalize_text(utt: str, language: str) -> str:
     elif language in ["yue", "zh-HK"]:
         # Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
         # Not sure why they decided to do this...
+        # Non-en/zh-yue tokens are manually removed here
         return (
             utt.replace("", "")
             .replace("", " ")
@@ -80,6 +81,7 @@ def normalize_text(utt: str, language: str) -> str:
             .replace("", "")
             .replace("", "")
             .replace("", "")
+            .replace("·", "")
             .upper()
         )
     else:
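A minimal usage sketch of the yue/zh-HK branch after this change may help. It is illustrative only: several replace targets above were lost in this rendering of the diff, so it assumes they are full-width punctuation that is likewise stripped or turned into spaces. The one effect visible in this commit is that the middle dot "·" is now removed before Latin text is upper-cased.

# Illustrative sketch, not part of the commit; the exact output depends on
# the full replace chain shown (only partially) above.
from preprocess_commonvoice import normalize_text

print(normalize_text("占士·邦 is a spy", "yue"))
# expected along the lines of: "占士邦 IS A SPY"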


@@ -26,6 +26,7 @@ files in the directory "data/lang_char":
 import argparse
 import logging
+import re
 from pathlib import Path
 from typing import List
@@ -33,7 +34,7 @@ import pycantonese
 from preprocess_commonvoice import normalize_text
 from tqdm.auto import tqdm
-from icefall.utils import is_cjk
+from icefall.utils import is_cjk, tokenize_by_CJK_char
 def get_parser():
@@ -73,23 +74,28 @@ def get_word_segments(lines: List[str]) -> List[str]:
     for line in tqdm(lines, desc="Segmenting lines"):
         try:
-            # code switching
-            if len(line.strip().split(" ")) > 1:
+            if is_cs(line):  # code switching
                 segments = []
-                for segment in line.strip().split(" "):
+                curr_str = ""
+                for segment in tokenize_by_CJK_char(line).split(" "):
                     if segment.strip() == "":
                         continue
                     try:
                         if not is_cjk(segment[0]):  # en segment
+                            if curr_str:
+                                segments.extend(pycantonese.segment(curr_str))
+                                curr_str = ""
                             segments.append(segment)
                         else:  # zh segment
-                            segments.extend(pycantonese.segment(segment))
+                            curr_str += segment
+                            # segments.extend(pycantonese.segment(segment))
                     except Exception as e:
                         logging.error(f"Failed to process segment: {segment}")
                         raise e
+                if curr_str:  # process the last segment
+                    segments.extend(pycantonese.segment(curr_str))
                 new_lines.append(" ".join(segments) + "\n")
-            # not code switching
-            else:
+            else:  # not code switching
                 new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
         except Exception as e:
             logging.error(f"Failed to process line: {line}")
@@ -104,6 +110,11 @@ def get_words(lines: List[str]) -> List[str]:
     return list(words)
+def is_cs(line: str) -> bool:
+    english_markers = r"[a-zA-Z]+"
+    return bool(re.search(english_markers, line))
 if __name__ == "__main__":
     parser = get_parser()
     args = parser.parse_args()
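Taken together, the change stops splitting code-switched lines on whitespace and instead buffers consecutive CJK characters, so pycantonese segments whole Cantonese spans rather than isolated characters while English tokens pass through untouched; is_cs() simply looks for Latin letters to decide whether a line is code-switched at all. Below is a self-contained sketch of that flow, not the committed code; it assumes icefall.utils.tokenize_by_CJK_char returns the line with spaces inserted around every CJK character (so each Cantonese character becomes its own token while Latin words stay intact) and that pycantonese.segment returns a list of Cantonese words.

# Sketch only: reproduces the new buffering logic outside the script.
import pycantonese
from icefall.utils import is_cjk, tokenize_by_CJK_char

line = "我哋用gradient descent嚟做training"
segments, curr_str = [], ""
for token in tokenize_by_CJK_char(line).split(" "):
    if not token.strip():
        continue
    if is_cjk(token[0]):
        # buffer consecutive Cantonese characters so pycantonese can see the
        # whole span and segment it into words rather than single characters
        curr_str += token
    else:
        # flush the buffered Cantonese span before keeping the English token
        if curr_str:
            segments.extend(pycantonese.segment(curr_str))
            curr_str = ""
        segments.append(token)
if curr_str:  # flush the trailing Cantonese span
    segments.extend(pycantonese.segment(curr_str))

print(" ".join(segments))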