Merge c053b7c8f0772e22fc1481aa7e61a8d03d020f5f into 0904e490c5fb424dc5cb4d14ae468e4d32a07dc4

This commit is contained in:
Mohammad Sadegh Gholizadeh 2025-11-28 11:44:09 +08:00 committed by GitHub
commit 6fccd13ad9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 6 additions and 1 deletions

View File

@ -52,6 +52,11 @@ def normalize_text(utt: str, language: str) -> str:
return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper()
elif language == "pl":
return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper()
elif language == "fa":
utt = utt.replace("ي", "ی").replace("ك", "ک")
utt = re.sub(r"[^\u0600-\u06FF0-9\u06F0-\u06F9\s]|[.,?!\-]", "", utt)
utt = re.sub(r"\s+", " ", utt).strip()
return utt
elif language in ["yue", "zh-HK"]:
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
# Not sure why they decided to do this...

View File

@ -42,7 +42,7 @@ use_invalidated=false
# - speech
dl_dir=$PWD/download
release=cv-corpus-12.0-2022-12-07
release=cv-corpus-12.0-2022-12-07 ## -> consider changing the release name, or download the file manually and move it to the download folder.
lang=fr
perturb_speed=false