diff --git a/egs/commonvoice/ASR/local/preprocess_commonvoice.py b/egs/commonvoice/ASR/local/preprocess_commonvoice.py index cc88ef8d7..bb6bab195 100755 --- a/egs/commonvoice/ASR/local/preprocess_commonvoice.py +++ b/egs/commonvoice/ASR/local/preprocess_commonvoice.py @@ -52,6 +52,11 @@ def normalize_text(utt: str, language: str) -> str: return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper() elif language == "pl": return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper() + elif language == "fa": + utt = utt.replace("ي", "ی").replace("ك", "ک") + utt = re.sub(r"[^\u0600-\u06FF0-9\u06F0-\u06F9\s]|[.,?!\-]", "", utt) + utt = re.sub(r"\s+", " ", utt).strip() + return utt elif language in ["yue", "zh-HK"]: # Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese # Not sure why they decided to do this... diff --git a/egs/commonvoice/ASR/prepare.sh b/egs/commonvoice/ASR/prepare.sh index 200114a86..fe9f5e33f 100755 --- a/egs/commonvoice/ASR/prepare.sh +++ b/egs/commonvoice/ASR/prepare.sh @@ -42,7 +42,7 @@ use_invalidated=false # - speech dl_dir=$PWD/download -release=cv-corpus-12.0-2022-12-07 +release=cv-corpus-12.0-2022-12-07 ## -> consider changing the release name, or download the file manually and move it to the download folder. lang=fr perturb_speed=false