icefall/egs/libriheavy/ASR/zipformer/text_normalization.py
Wei Kang 238b45bea8
Libriheavy recipe (zipformer) (#1261)
* initial commit for libriheavy

* Data prepare pipeline

* Fix train.py

* Fix decode.py

* Add results

* minor fixes

* black

* black

* Incorporate PR https://github.com/k2-fsa/icefall/pull/1269

---------

Co-authored-by: zr_jin <peter.jin.cn@gmail.com>
2023-11-23 01:22:57 +08:00

51 lines
1.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from num2words import num2words
def remove_punc_to_upper(text: str) -> str:
text = text.replace("", "'")
text = text.replace("", "'")
tokens = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'")
s_list = [x.upper() if x in tokens else " " for x in text]
s = " ".join("".join(s_list).split()).strip()
return s
def word_normalization(word: str) -> str:
# 1. Use full word for some abbreviation
# 2. Convert digits to english words
# 3. Convert ordinal number to english words
if word == "MRS":
return "MISSUS"
if word == "MR":
return "MISTER"
if word == "ST":
return "SAINT"
if word == "ECT":
return "ET CETERA"
if word[-2:] in ("ST", "ND", "RD", "TH") and word[:-2].isnumeric(): # e.g 9TH, 6TH
word = num2words(word[:-2], to="ordinal")
word = word.replace("-", " ")
if word.isnumeric():
num = int(word)
if num > 1500 and num < 2030:
word = num2words(word, to="year")
else:
word = num2words(word)
word = word.replace("-", " ")
return word.upper()
def text_normalization(text: str) -> str:
text = text.upper()
return " ".join([word_normalization(x) for x in text.split()])
if __name__ == "__main__":
assert remove_punc_to_upper("I like this 《book>") == "I LIKE THIS BOOK"
assert (
text_normalization("Hello Mrs st 21st world 3rd she 99th MR")
== "HELLO MISSUS SAINT TWENTY FIRST WORLD THIRD SHE NINETY NINTH MISTER"
)