import re words = { 0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine", 10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen", 15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen", 19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty", 50: "fifty", 60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety", } ordinal_nums = [ "zeroth", "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth", "twentieth", ] num_ordinal_dict = {num: ordinal_nums[num] for num in range(21)} def year_to_words(num: int): assert isinstance(num, int), num # check if a num is representing a year if num > 1500 and num < 2000: return words[num // 100] + " " + num_to_words(num % 100) elif num == 2000: return "TWO THOUSAND" elif num > 2000: return "TWO THOUSAND AND " + num_to_words(num % 100) else: return num_to_words(num) def num_to_words(num: int): # Return the English words of a integer number # If this is a year number if num > 1500 and num < 2030: return year_to_words(num) if num < 20: return words[num] if num < 100: if num % 10 == 0: return words[num // 10 * 10] else: return words[num // 10 * 10] + " " + words[num % 10] if num < 1000: return words[num // 100] + " hundred and " + num_to_words(num % 100) if num < 1000000: return num_to_words(num // 1000) + " thousand " + num_to_words(num % 1000) return num def num_to_ordinal_word(num: int): return num_ordinal_dict.get(num, num_to_words(num)).upper() def replace_full_width_symbol(s: str) -> str: # replace full-width symbol with theri half width counterpart s = s.replace("“", '"') s = s.replace("”", '"') s = s.replace("‘", "'") s = s.replace("’", "'") return s def upper_normalization(text: str) -> str: text = replace_full_width_symbol(text) text = text.upper() # upper case all characters # Only keep all alpha-numeric characters, hypen and apostrophe text = text.replace("-", " ") text = re.sub("[^a-zA-Z0-9\s']+", "", text) return text def word_normalization(word: str) -> str: if word == "MRS": return "MISSUS" if word == "MR": return "MISTER" if word == "ST": return "SAINT" if word == "ECT": return "ET CETERA" if word.isnumeric(): word = num_to_words(int(word)) return word.upper() if word[-2:] == "TH" and word[0].isnumeric(): # e.g 9TH, 6TH return num_to_ordinal_word(int(word[:-2])).upper() return word def simple_normalization(text: str) -> str: text = replace_full_width_symbol(text) text = text.replace("--", " ") return text if __name__ == "__main__": s = str(1830) out = word_normalization(s) print(s, out)