Adding Farsi Common Voice (CV) preprocessing

This commit is contained in:
Mohammad Gholizadeh 2025-08-02 09:27:18 +01:00
parent 34fc1fdf0d
commit c053b7c8f0
2 changed files with 6 additions and 1 deletions

View File

@ -52,6 +52,11 @@ def normalize_text(utt: str, language: str) -> str:
return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper() return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper()
elif language == "pl": elif language == "pl":
return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper() return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper()
elif language == "fa":
utt = utt.replace("ي", "ی").replace("ك", "ک")
utt = re.sub(r"[^\u0600-\u06FF0-9\u06F0-\u06F9\s]|[.,?!\-]", "", utt)
utt = re.sub(r"\s+", " ", utt).strip()
return utt
elif language in ["yue", "zh-HK"]: elif language in ["yue", "zh-HK"]:
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese # Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
# Not sure why they decided to do this... # Not sure why they decided to do this...

View File

@ -42,7 +42,7 @@ use_invalidated=false
# - speech # - speech
dl_dir=$PWD/download dl_dir=$PWD/download
release=cv-corpus-12.0-2022-12-07 release=cv-corpus-12.0-2022-12-07 ## -> consider changing the release name, or download the file manually and move it to the download folder.
lang=fr lang=fr
perturb_speed=false perturb_speed=false