From af62891812f7e082a6a4b76690e03255fc7df16c Mon Sep 17 00:00:00 2001 From: jinzr Date: Thu, 7 Mar 2024 19:27:54 +0800 Subject: [PATCH] init commit --- egs/commonvoice/ASR/local/compile_hlg.py | 1 - egs/commonvoice/ASR/local/compile_lg.py | 1 - egs/commonvoice/ASR/local/preprocess_commonvoice.py | 9 +++++++++ 3 files changed, 9 insertions(+), 2 deletions(-) delete mode 120000 egs/commonvoice/ASR/local/compile_hlg.py delete mode 120000 egs/commonvoice/ASR/local/compile_lg.py diff --git a/egs/commonvoice/ASR/local/compile_hlg.py b/egs/commonvoice/ASR/local/compile_hlg.py deleted file mode 120000 index 471aa7fb4..000000000 --- a/egs/commonvoice/ASR/local/compile_hlg.py +++ /dev/null @@ -1 +0,0 @@ -../../../librispeech/ASR/local/compile_hlg.py \ No newline at end of file diff --git a/egs/commonvoice/ASR/local/compile_lg.py b/egs/commonvoice/ASR/local/compile_lg.py deleted file mode 120000 index 462d6d3fb..000000000 --- a/egs/commonvoice/ASR/local/compile_lg.py +++ /dev/null @@ -1 +0,0 @@ -../../../librispeech/ASR/local/compile_lg.py \ No newline at end of file diff --git a/egs/commonvoice/ASR/local/preprocess_commonvoice.py b/egs/commonvoice/ASR/local/preprocess_commonvoice.py index c0f4ca427..dbacdd821 100755 --- a/egs/commonvoice/ASR/local/preprocess_commonvoice.py +++ b/egs/commonvoice/ASR/local/preprocess_commonvoice.py @@ -52,6 +52,15 @@ def normalize_text(utt: str, language: str) -> str: return re.sub(r"[^A-ZÀÂÆÇÉÈÊËÎÏÔŒÙÛÜ' ]", "", utt).upper() elif language == "pl": return re.sub(r"[^a-ząćęłńóśźżA-ZĄĆĘŁŃÓŚŹŻ' ]", "", utt).upper() + elif language == "yue": + return ( + utt.replace(" ", "") + .replace(",", "") + .replace("。", " ") + .replace("?", "") + .replace("!", "") + .replace("?", "") + ) else: raise NotImplementedError( f"""