mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 10:02:22 +00:00
adding farsi cv preprocessing
This commit is contained in:
parent
34fc1fdf0d
commit
c053b7c8f0
@ -52,6 +52,11 @@ def normalize_text(utt: str, language: str) -> str:
|
|||||||
return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper()
|
return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper()
|
||||||
elif language == "pl":
|
elif language == "pl":
|
||||||
return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper()
|
return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper()
|
||||||
|
elif language == "fa":
|
||||||
|
utt = utt.replace("ي", "ی").replace("ك", "ک")
|
||||||
|
utt = re.sub(r"[^\u0600-\u06FF0-9\u06F0-\u06F9\s]|[.,?!\-]", "", utt)
|
||||||
|
utt = re.sub(r"\s+", " ", utt).strip()
|
||||||
|
return utt
|
||||||
elif language in ["yue", "zh-HK"]:
|
elif language in ["yue", "zh-HK"]:
|
||||||
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
|
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
|
||||||
# Not sure why they decided to do this...
|
# Not sure why they decided to do this...
|
||||||
|
@ -42,7 +42,7 @@ use_invalidated=false
|
|||||||
# - speech
|
# - speech
|
||||||
|
|
||||||
dl_dir=$PWD/download
|
dl_dir=$PWD/download
|
||||||
release=cv-corpus-12.0-2022-12-07
|
release=cv-corpus-12.0-2022-12-07 ## -> consider changing the release name, or download the file manually and move it into the download folder.
|
||||||
lang=fr
|
lang=fr
|
||||||
perturb_speed=false
|
perturb_speed=false
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user