diff --git a/egs/commonvoice/ASR/local/preprocess_commonvoice.py b/egs/commonvoice/ASR/local/preprocess_commonvoice.py
index 0da827a60..c0be499c4 100755
--- a/egs/commonvoice/ASR/local/preprocess_commonvoice.py
+++ b/egs/commonvoice/ASR/local/preprocess_commonvoice.py
@@ -55,6 +55,7 @@ def normalize_text(utt: str, language: str) -> str:
     elif language in ["yue", "zh-HK"]:
         # Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
         # Not sure why they decided to do this...
+        # Non-en/zh-yue tokens are manually removed here
         return (
             utt.replace(",", "")
             .replace("。", " ")
@@ -80,6 +81,7 @@ def normalize_text(utt: str, language: str) -> str:
             .replace("》", "")
             .replace("…", "")
             .replace("⋯", "")
+            .replace("·", "")
             .upper()
         )
     else:
diff --git a/egs/commonvoice/ASR/local/word_segment_yue.py b/egs/commonvoice/ASR/local/word_segment_yue.py
index 716993654..51894da3a 100755
--- a/egs/commonvoice/ASR/local/word_segment_yue.py
+++ b/egs/commonvoice/ASR/local/word_segment_yue.py
@@ -26,6 +26,7 @@ files in the directory "data/lang_char":
 
 import argparse
 import logging
+import re
 from pathlib import Path
 from typing import List
 
@@ -33,7 +34,7 @@ import pycantonese
 from preprocess_commonvoice import normalize_text
 from tqdm.auto import tqdm
 
-from icefall.utils import is_cjk
+from icefall.utils import is_cjk, tokenize_by_CJK_char
 
 
 def get_parser():
@@ -73,23 +74,28 @@ def get_word_segments(lines: List[str]) -> List[str]:
 
     for line in tqdm(lines, desc="Segmenting lines"):
         try:
-            # code switching
-            if len(line.strip().split(" ")) > 1:
+            if is_cs(line):  # code switching
                 segments = []
-                for segment in line.strip().split(" "):
+                curr_str = ""
+                for segment in tokenize_by_CJK_char(line).split(" "):
                     if segment.strip() == "":
                         continue
                     try:
                         if not is_cjk(segment[0]):  # en segment
+                            if curr_str:
+                                segments.extend(pycantonese.segment(curr_str))
+                                curr_str = ""
                             segments.append(segment)
                         else:  # zh segment
-                            segments.extend(pycantonese.segment(segment))
+                            curr_str += segment
+                            # segments.extend(pycantonese.segment(segment))
                     except Exception as e:
                         logging.error(f"Failed to process segment: {segment}")
                         raise e
+                if curr_str:  # process the last segment
+                    segments.extend(pycantonese.segment(curr_str))
                 new_lines.append(" ".join(segments) + "\n")
-            # not code switching
-            else:
+            else:  # not code switching
                 new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
         except Exception as e:
             logging.error(f"Failed to process line: {line}")
@@ -104,6 +110,11 @@ def get_words(lines: List[str]) -> List[str]:
     return list(words)
 
 
+def is_cs(line: str) -> bool:
+    english_markers = r"[a-zA-Z]+"
+    return bool(re.search(english_markers, line))
+
+
 if __name__ == "__main__":
     parser = get_parser()
     args = parser.parse_args()