Update preprocess_gigaspeech2.py

This commit is contained in:
Yifan Yang 2024-04-02 15:31:35 +08:00 committed by GitHub
parent e35741583c
commit 4a6405fe34
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -63,11 +63,11 @@ def normalize_text(
text = re.sub(r"\u0039", r"\u0E59", text) text = re.sub(r"\u0039", r"\u0E59", text)
# Currency symbols mapping # Currency symbols mapping
text = re.sub(r"\u0024", r"", text) # $ text = re.sub(r"\u0024", r"\u0E14\u0E2D\u0E25\u0E25\u0E32\u0E23\u0E4C", text) # $ -> ดอลลาร์
text = re.sub(r"\u00A3", r"", text) text = re.sub(r"\u00A3", r"\u0E1B\u0E2D\u0E19\u0E14\u0E4C", text) # £ -> ปอนด์
text = re.sub(r"\u00A5", r"\u", text) text = re.sub(r"\u00A5", r"\u0E2E\u0E22\u0E27\u0E31\u0E19", text) # ¥ -> หยวน
text = re.sub(r"\u00AC", r"\u", text) text = re.sub(r"\u20AC", r"\u0E22\u0E39\u0E42\u0E23", text) # € -> ยูโร
text = re.sub(r"\u0E3F", r"\u0E1A\u0E32\u0E17", text) # ฿ -> บาท
# Remove blank symbols # Remove blank symbols
text = re.sub(r"\s", "", utt) text = re.sub(r"\s", "", utt)