Update preprocess_commonvoice.py

This commit is contained in:
jinzr 2024-03-13 10:36:50 +08:00
parent a39aa8a59d
commit 09a358a23e

View File

@@ -56,8 +56,7 @@ def normalize_text(utt: str, language: str) -> str:
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese # Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
# Not sure why they decided to do this... # Not sure why they decided to do this...
return ( return (
utt.replace(" ", "") utt.replace("", "")
.replace("", "")
.replace("", " ") .replace("", " ")
.replace("", "") .replace("", "")
.replace("", "") .replace("", "")
@@ -65,6 +64,22 @@ def normalize_text(utt: str, language: str) -> str:
.replace("!", "") .replace("!", "")
.replace("", "") .replace("", "")
.replace("", "") .replace("", "")
.replace(",", "")
.replace(".", "")
.replace(":", "")
.replace(";", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("\\", "")
.replace("~", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.upper() .upper()
) )
else: else: