Merge c053b7c8f0772e22fc1481aa7e61a8d03d020f5f into 34fc1fdf0d8ff520e2bb18267d046ca207c78ef9

This commit is contained in:
Mohammad Gholizadeh 2025-08-02 08:31:33 +00:00 committed by GitHub
commit a852b92de6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 6 additions and 1 deletions

View File

@ -52,6 +52,11 @@ def normalize_text(utt: str, language: str) -> str:
return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper()
elif language == "pl":
return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper()
elif language == "fa":
utt = utt.replace("ي", "ی").replace("ك", "ک")
utt = re.sub(r"[^\u0600-\u06FF0-9\u06F0-\u06F9\s]|[.,?!\-]", "", utt)
utt = re.sub(r"\s+", " ", utt).strip()
return utt
elif language in ["yue", "zh-HK"]:
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
# Not sure why they decided to do this...

View File

@ -42,7 +42,7 @@ use_invalidated=false
# - speech
dl_dir=$PWD/download
release=cv-corpus-12.0-2022-12-07
release=cv-corpus-12.0-2022-12-07 ## -> consider changing the release name, or download the file manually and move it to the download folder.
lang=fr
perturb_speed=false