icefall/egs/libriheavy/ASR/zipformer/text_normalization.py
2023-07-19 22:04:11 +08:00

27 lines
713 B
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
# Curly ("smart") quotation marks mapped to their plain ASCII counterparts.
# The code points are written as explicit escapes so that the mapping stays
# unambiguous even if an editor or web viewer hides or strips the characters.
_QUOTE_TRANSLATION = str.maketrans(
    {
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
    }
)


def replace_full_width_symbol(s: str) -> str:
    """Replace curly quotation marks with their ASCII counterparts.

    Args:
      s:
        The input text.

    Returns:
      A copy of ``s`` in which U+201C / U+201D are replaced by ``"`` and
      U+2018 / U+2019 are replaced by ``'``. All other characters are left
      unchanged.
    """
    # A single translate() pass handles all four symbols at once.
    return s.translate(_QUOTE_TRANSLATION)
def upper_ref_text(text: str) -> str:
    """Normalize text to upper case, dropping most punctuation.

    The steps are applied in this order:

      1. Quotation marks are normalized with ``replace_full_width_symbol``.
      2. All characters are upper-cased.
      3. Every ``--`` is replaced by a single space.
      4. Every character that is not an ASCII letter, an ASCII digit,
         whitespace, an apostrophe or a hyphen is removed.

    Args:
      text:
        The input text.

    Returns:
      The normalized text.
    """
    text = replace_full_width_symbol(text)
    text = text.upper()  # upper case all characters

    # Replace double hyphens with a space before filtering; otherwise they
    # would survive the filter below, which keeps single hyphens.
    text = text.replace("--", " ")

    # Only keep alpha-numeric characters, whitespace, hyphen and apostrophe.
    # A raw string is used so that ``\s`` reaches the regex engine as-is
    # instead of being an invalid string escape sequence.
    text = re.sub(r"[^a-zA-Z0-9\s'-]+", "", text)
    return text
def simple_normalization(text: str) -> str:
    """Apply light normalization that keeps case and punctuation.

    Quotation marks are normalized with ``replace_full_width_symbol`` and
    every ``--`` is then replaced by a single space.

    Args:
      text:
        The input text.

    Returns:
      The normalized text.
    """
    return replace_full_width_symbol(text).replace("--", " ")