Update preprocess_commonvoice.py

This commit is contained in:
jinzr 2024-03-15 19:31:35 +08:00
parent d9a0ab59db
commit 6993183dd7

View File

@@ -56,48 +56,16 @@ def normalize_text(utt: str, language: str) -> str:
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese # Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
# Not sure why they decided to do this... # Not sure why they decided to do this...
# Non-en/zh-yue tokens are manually removed here # Non-en/zh-yue tokens are manually removed here
return (
utt.replace("", "") # fmt: off
.replace("", " ") tokens_to_remove = ["", "", "", "", "?", "!", "", "", ",", "\.", ":", ";", "", "", "", "", "~", "", "", "", "", "", "", "·", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""]
.replace("", "")
.replace("", "") # fmt: on
.replace("?", "") utt = utt.upper().replace("\\", "")
.replace("!", "") return re.sub(
.replace("", "") pattern="|".join([f"[{token}]" for token in tokens_to_remove]),
.replace("", "") repl="",
.replace(",", "") string=utt,
.replace(".", "")
.replace(":", "")
.replace(";", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("\\", "")
.replace("~", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("·", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.replace("", "")
.upper()
) )
else: else:
raise NotImplementedError( raise NotImplementedError(