# icefall/egs/libriheavy/ASR/zipformer/text_normalization.py

import re

# Cardinal spellings that are looked up rather than composed: every value
# from 0 through 19, then the multiples of ten from 20 through 90.
words = dict(
    enumerate(
        (
            "zero one two three four five six seven eight nine "
            "ten eleven twelve thirteen fourteen fifteen sixteen "
            "seventeen eighteen nineteen"
        ).split()
    )
)
words.update(
    zip(
        range(20, 100, 10),
        "twenty thirty forty fifty sixty seventy eighty ninety".split(),
    )
)
# Ordinal spellings for 0 through 20; the list index is the number itself.
ordinal_nums = (
    "zeroth first second third fourth fifth sixth seventh eighth ninth "
    "tenth eleventh twelfth thirteenth fourteenth fifteenth sixteenth "
    "seventeenth eighteenth nineteenth twentieth"
).split()

# The same table keyed by integer, for ``dict``-style lookups.
num_ordinal_dict = dict(enumerate(ordinal_nums))


def year_to_words(num: int):
    """Spell out ``num`` the way a calendar year is read aloud.

    Handled ranges:

    * 1501-1999: century pair plus remainder, e.g. 1830 -> "eighteen thirty",
      1900 -> "nineteen hundred", 1905 -> "nineteen oh five".
    * 2000: "TWO THOUSAND".
    * 2001-2099: "TWO THOUSAND AND <remainder>", e.g. "TWO THOUSAND AND five".
    * anything else: delegated to ``num_to_words`` as a plain number.

    Note that the casing of the result is mixed (the 2000+ prefixes are
    upper-case, everything else is lower-case); the callers in this file
    upper-case the result themselves.

    Args:
        num: The number to spell out. Must be an ``int``.

    Returns:
        The spelled-out form of ``num``.
    """
    assert isinstance(num, int), num
    if 1500 < num < 2000:
        century, remainder = divmod(num, 100)
        if remainder == 0:
            # Round centuries are read "nineteen hundred", not "nineteen zero".
            return words[century] + " hundred"
        if remainder < 10:
            # Single-digit remainders take a spoken "oh": "nineteen oh five".
            return words[century] + " oh " + words[remainder]
        return words[century] + " " + num_to_words(remainder)
    if num == 2000:
        return "TWO THOUSAND"
    if 2000 < num < 2100:
        return "TWO THOUSAND AND " + num_to_words(num % 100)
    # Outside the ranges above the "two thousand and ..." reading would be
    # wrong (e.g. 2100), so spell the value as an ordinary number instead.
    return num_to_words(num)


def num_to_words(num: int):
    """Return the English words for a non-negative integer.

    Values strictly between 1500 and 2030 are treated as calendar years and
    delegated to ``year_to_words``. Other values below one million are
    spelled out, e.g. 42 -> "forty two", 300 -> "three hundred",
    305 -> "three hundred and five", 5000 -> "five thousand".

    Values of one million or more cannot be spelled out by this function and
    are returned as their decimal digit string, so the result is always a
    ``str`` and callers can safely apply string methods to it.

    Args:
        num: The integer to convert.

    Returns:
        The spelled-out number, or ``str(num)`` when ``num >= 1000000``.

    Raises:
        KeyError: If ``num`` is negative.
    """
    # Numbers in this range are read as years ("eighteen thirty").
    if 1500 < num < 2030:
        return year_to_words(num)

    if num < 20:
        return words[num]
    if num < 100:
        tens, units = divmod(num, 10)
        if units == 0:
            return words[tens * 10]
        return words[tens * 10] + " " + words[units]
    if num < 1000:
        hundreds, remainder = divmod(num, 100)
        if remainder == 0:
            # Exact hundreds must not pick up a trailing "and zero".
            return words[hundreds] + " hundred"
        return words[hundreds] + " hundred and " + num_to_words(remainder)
    if num < 1000000:
        thousands, remainder = divmod(num, 1000)
        if remainder == 0:
            # Exact thousands must not pick up a trailing "zero".
            return num_to_words(thousands) + " thousand"
        return num_to_words(thousands) + " thousand " + num_to_words(remainder)
    # Too large to spell out: fall back to the digits, but as a string so the
    # return type stays consistent.
    return str(num)


def num_to_ordinal_word(num: int):
    """Return the upper-cased English ordinal for ``num``.

    Values 0-20 are read straight from ``num_ordinal_dict``. For larger
    values the cardinal spelling from ``num_to_words`` is taken and only its
    final word is turned into an ordinal, e.g. 25 -> "TWENTY FIFTH",
    30 -> "THIRTIETH".

    Args:
        num: The integer to convert.

    Returns:
        The ordinal spelling in upper case. If ``num_to_words`` cannot spell
        the value and hands back digits, the digits followed by "TH" are
        returned.
    """
    if num in num_ordinal_dict:
        return num_ordinal_dict[num].upper()

    # str() guards against a non-string result for values that cannot be
    # spelled out; lower() evens out the mixed casing used for years.
    cardinal = str(num_to_words(num)).lower()
    head, sep, last = cardinal.rpartition(" ")

    # Irregular endings (one -> first, twelve -> twelfth, ...) come from the
    # existing 0-20 tables.
    cardinal_to_ordinal = {
        words[n]: ordinal for n, ordinal in num_ordinal_dict.items()
    }
    if last in cardinal_to_ordinal:
        last = cardinal_to_ordinal[last]
    elif last.endswith("y"):
        # thirty -> thirtieth, forty -> fortieth, ...
        last = last[:-1] + "ieth"
    else:
        # hundred -> hundredth, thousand -> thousandth; bare digits get their
        # "th" suffix back.
        last += "th"
    return (head + sep + last).upper()


def replace_full_width_symbol(s: str) -> str:
    """Return ``s`` with curly quotation marks replaced by ASCII quotes.

    Left and right curly double quotes become ``"``; left and right curly
    single quotes become ``'``. All other characters are left untouched.
    """
    quote_table = str.maketrans(
        {
            "“": '"',
            "”": '"',
            "‘": "'",
            "’": "'",
        }
    )
    return s.translate(quote_table)


def upper_normalization(text: str) -> str:
    """Upper-case ``text`` and strip it down to a restricted character set.

    Steps, in order:

    1. Curly quotes are replaced via ``replace_full_width_symbol``.
    2. The text is upper-cased.
    3. Every hyphen is replaced by a space.
    4. Every character that is not an ASCII letter, an ASCII digit,
       whitespace or an apostrophe is removed.

    Args:
        text: The raw input text.

    Returns:
        The normalized text.
    """
    text = replace_full_width_symbol(text)
    text = text.upper()  # upper case all characters

    # Hyphens are turned into spaces (not kept), so hyphenated compounds end
    # up as separate words.
    text = text.replace("-", " ")
    # Keep only alpha-numeric characters, whitespace and apostrophes. The
    # pattern is a raw string so that "\s" reaches the regex engine without
    # triggering Python's invalid-escape-sequence warning.
    text = re.sub(r"[^a-zA-Z0-9\s']+", "", text)
    return text


def word_normalization(word: str) -> str:
    """Expand a single upper-cased token into its spoken form.

    * Abbreviations: "MRS" -> "MISSUS", "MR" -> "MISTER", "ST" -> "SAINT",
      "ETC" / "ECT" -> "ET CETERA".
    * Tokens made only of decimal digits are spelled out with
      ``num_to_words`` and upper-cased.
    * Decimal digits followed by "TH" (e.g. "9TH") are converted with
      ``num_to_ordinal_word``.
    * Anything else is returned unchanged.

    Args:
        word: The token to normalize; the abbreviation and "TH" checks are
            case-sensitive and expect upper case.

    Returns:
        The expanded token, or ``word`` itself when no rule applies.
    """
    abbreviations = {
        "MRS": "MISSUS",
        "MR": "MISTER",
        "ST": "SAINT",
        "ETC": "ET CETERA",
        # Common misspelling of "etc"; kept for backward compatibility.
        "ECT": "ET CETERA",
    }
    expansion = abbreviations.get(word)
    if expansion is not None:
        return expansion

    # isdecimal() rather than isnumeric(): the latter also accepts characters
    # such as superscripts and vulgar fractions, which int() rejects.
    if word.isdecimal():
        # str() keeps this safe even if the converter hands back a non-string
        # for a value it cannot spell out.
        return str(num_to_words(int(word))).upper()

    # e.g. 9TH, 6TH. The whole prefix must be digits, otherwise int() would
    # raise on tokens like "1ATH".
    if word[-2:] == "TH" and word[:-2].isdecimal():
        return num_to_ordinal_word(int(word[:-2]))

    return word


def simple_normalization(text: str) -> str:
    """Apply a light cleanup to ``text`` without changing its casing.

    Curly quotes are replaced via ``replace_full_width_symbol`` and every
    double hyphen ("--") is replaced by a single space.
    """
    return replace_full_width_symbol(text).replace("--", " ")


if __name__ == "__main__":
    # Quick manual check: print a digit string next to its normalized form.
    sample = str(1830)
    print(sample, word_normalization(sample))