diff --git a/egs/commonvoice/ASR/local/preprocess_commonvoice.py b/egs/commonvoice/ASR/local/preprocess_commonvoice.py index 0eafed174..a9cfce502 100755 --- a/egs/commonvoice/ASR/local/preprocess_commonvoice.py +++ b/egs/commonvoice/ASR/local/preprocess_commonvoice.py @@ -52,7 +52,9 @@ def normalize_text(utt: str, language: str) -> str: return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper() elif language == "pl": return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper() - elif language == "yue": + elif language in ["yue", "zh-HK"]: + # Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese + # Not sure why they decided to do this... return ( utt.replace(" ", "") .replace(",", "") @@ -60,6 +62,7 @@ def normalize_text(utt: str, language: str) -> str: .replace("?", "") .replace("!", "") .replace("?", "") + .replace("!", "") .replace("‘", "") .replace("、", "") .upper() diff --git a/egs/commonvoice/ASR/prepare.sh b/egs/commonvoice/ASR/prepare.sh index f71210e3f..c4942b0c4 100755 --- a/egs/commonvoice/ASR/prepare.sh +++ b/egs/commonvoice/ASR/prepare.sh @@ -174,7 +174,7 @@ if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then fi if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then - if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then + if [ $lang == "yue" ] || [ $lang == "zh-TW" ] || [ $lang == "zh-CN" ] || [ $lang == "zh-HK" ]; then log "Stage 9: Prepare Char based lang" lang_dir=data/${lang}/lang_char/ mkdir -p $lang_dir @@ -190,7 +190,7 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then gunzip -c data/${lang}/manifests/cv-${lang}_supervisions_train.jsonl.gz \ | jq '.text' | sed 's/"//g' > $lang_dir/text - if [ $lang == "yue" ]; then + if [ $lang == "yue" ] || [ $lang == "zh-HK" ]; then # Get words.txt and words_no_ids.txt ./local/word_segment_yue.py \ --input-file $lang_dir/text \ @@ -299,7 +299,7 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then # We assume you have install kaldilm, if not, please install # it using: pip install kaldilm - if [ $lang == "yue" ] || [ $lang == "zh_TW" ] || [ $lang == "zh_CN" ] || [ $lang == "zh_HK" ]; then + if [ $lang == "yue" ] || [ $lang == "zh-TW" ] || [ $lang == "zh-CN" ] || [ $lang == "zh-HK" ]; then lang_dir=data/${lang}/lang_char mkdir -p $lang_dir/lm