mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
* initial commit for libriheavy * Data prepare pipeline * Fix train.py * Fix decode.py * Add results * minor fixes * black * black * Incorporate PR https://github.com/k2-fsa/icefall/pull/1269 --------- Co-authored-by: zr_jin <peter.jin.cn@gmail.com>
51 lines
1.5 KiB
Python
51 lines
1.5 KiB
Python
from num2words import num2words
|
||
|
||
|
||
def remove_punc_to_upper(text: str) -> str:
    """Strip punctuation from ``text`` and return it in upper case.

    Curly single quotes are first folded to the ASCII apostrophe. Every
    character that is not an ASCII letter, an ASCII digit or an apostrophe
    is then treated as a separator, and the remaining pieces are joined
    with single spaces.

    Args:
      text: The raw input string.

    Returns:
      The upper-cased string with no leading, trailing or repeated spaces.
    """
    # Fold typographic quotes so that contractions such as "don’t" keep
    # their apostrophe instead of being split in two.
    for curly_quote in ("‘", "’"):
        text = text.replace(curly_quote, "'")

    allowed = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'")
    filtered = "".join(ch.upper() if ch in allowed else " " for ch in text)

    # split() with no argument drops empty fields, so runs of separators
    # collapse and nothing is left at either end.
    return " ".join(filtered.split())
|
||
|
||
|
||
def word_normalization(word: str) -> str:
    """Normalize a single word; the result is returned in upper case.

    The following rules are applied:

      1. Use the full word for some abbreviations (compared against the
         exact upper-case spelling, e.g. ``MRS`` -> ``MISSUS``).
      2. Convert ordinal numbers to English words, e.g. ``21ST`` ->
         ``TWENTY FIRST``.
      3. Convert digits to English words. Values strictly between 1500 and
         2030 are spelled as years.

    Hyphens in the spelled-out numbers are replaced by spaces.

    Args:
      word: The word to normalize. The abbreviation and ordinal-suffix
        checks are case-sensitive and only match upper-case input.

    Returns:
      The normalized word (possibly several space-separated words).
    """
    # NOTE(review): "ECT" looks like it may be a typo for "ETC"; kept
    # unchanged so existing normalization output is preserved - confirm.
    abbreviations = {
        "MRS": "MISSUS",
        "MR": "MISTER",
        "ST": "SAINT",
        "ECT": "ET CETERA",
    }
    if word in abbreviations:
        return abbreviations[word]

    # str.isdecimal() is used rather than str.isnumeric(): isnumeric() is
    # also true for characters such as "²" or "½", which int() rejects, so
    # such tokens used to raise instead of being passed through.
    if word[-2:] in ("ST", "ND", "RD", "TH") and word[:-2].isdecimal():  # e.g 9TH, 6TH
        word = num2words(word[:-2], to="ordinal")
        word = word.replace("-", " ")

    if word.isdecimal():
        num = int(word)
        if 1500 < num < 2030:
            word = num2words(word, to="year")
        else:
            word = num2words(word)
        word = word.replace("-", " ")

    return word.upper()
|
||
|
||
|
||
def text_normalization(text: str) -> str:
    """Normalize every whitespace-separated word of ``text``.

    The text is upper-cased, split on whitespace, each word is passed
    through ``word_normalization`` and the results are joined with single
    spaces.

    Args:
      text: The input text.

    Returns:
      The normalized text.
    """
    normalized_words = []
    for token in text.upper().split():
        normalized_words.append(word_normalization(token))
    return " ".join(normalized_words)
|
||
|
||
|
||
if __name__ == "__main__":
    # Minimal self-test, executed only when this file is run as a script.
    cleaned = remove_punc_to_upper("I like this 《book>")
    assert cleaned == "I LIKE THIS BOOK"

    # Covers abbreviation expansion and ordinal spelling in one sentence.
    normalized = text_normalization("Hello Mrs st 21st world 3rd she 99th MR")
    expected = "HELLO MISSUS SAINT TWENTY FIRST WORLD THIRD SHE NINETY NINTH MISTER"
    assert normalized == expected
|