diff --git a/egs/commonvoice/ASR/local/preprocess_commonvoice.py b/egs/commonvoice/ASR/local/preprocess_commonvoice.py index 3be85ed15..cc88ef8d7 100755 --- a/egs/commonvoice/ASR/local/preprocess_commonvoice.py +++ b/egs/commonvoice/ASR/local/preprocess_commonvoice.py @@ -56,48 +56,16 @@ def normalize_text(utt: str, language: str) -> str: # Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese # Not sure why they decided to do this... # None en/zh-yue tokens are manually removed here - return ( - utt.replace(",", "") - .replace("。", " ") - .replace("?", "") - .replace("!", "") - .replace("?", "") - .replace("!", "") - .replace("‘", "") - .replace("、", "") - .replace(",", "") - .replace(".", "") - .replace(":", "") - .replace(";", "") - .replace("「", "") - .replace("」", "") - .replace("“", "") - .replace("”", "") - .replace("\\", "") - .replace("~", "") - .replace("—", "") - .replace("ㄧ", "") - .replace("《", "") - .replace("》", "") - .replace("…", "") - .replace("⋯", "") - .replace("·", "") - .replace("﹒", "") - .replace(".", "") - .replace(":", "") - .replace("︰", "") - .replace("﹖", "") - .replace("(", "") - .replace(")", "") - .replace("-", "") - .replace("~", "") - .replace(";", "") - .replace("", "") - .replace("﹔", "") - .replace("/", "") - .replace("A", "") - .replace("B", "") - .upper() + + # fmt: off + tokens_to_remove = [",", "。", "?", "!", "?", "!", "‘", "、", ",", "\.", ":", ";", "「", "」", "“", "”", "~", "—", "ㄧ", "《", "》", "…", "⋯", "·", "﹒", ".", ":", "︰", "﹖", "(", ")", "-", "~", ";", "", "⠀", "﹔", "/", "A", "B", "–", "‧"] + + # fmt: on + utt = utt.upper().replace("\\", "") + return re.sub( + pattern="|".join([f"[{token}]" for token in tokens_to_remove]), + repl="", + string=utt, ) else: raise NotImplementedError(