add text normalization for librispeech test sets

2023-09-14 18:36:09 +08:00 · 2023-09-14 18:36:09 +08:00 · 84ff2ab67c
commit 84ff2ab67c
parent f9ef9f38eb
1 changed files with 152 additions and 0 deletions
--- a/egs/libriheavy/ASR/zipformer_prompt_asr/ls_text_normalization.py
+++ b/egs/libriheavy/ASR/zipformer_prompt_asr/ls_text_normalization.py
@ -0,0 +1,152 @@
+import re
+
+words = {
+    0: "zero",
+    1: "one",
+    2: "two",
+    3: "three",
+    4: "four",
+    5: "five",
+    6: "six",
+    7: "seven",
+    8: "eight",
+    9: "nine",
+    10: "ten",
+    11: "eleven",
+    12: "twelve",
+    13: "thirteen",
+    14: "fourteen",
+    15: "fifteen",
+    16: "sixteen",
+    17: "seventeen",
+    18: "eighteen",
+    19: "nineteen",
+    20: "twenty",
+    30: "thirty",
+    40: "forty",
+    50: "fifty",
+    60: "sixty",
+    70: "seventy",
+    80: "eighty",
+    90: "ninety",
+}
+ordinal_nums = [
+    "zeroth",
+    "first",
+    "second",
+    "third",
+    "fourth",
+    "fifth",
+    "sixth",
+    "seventh",
+    "eighth",
+    "ninth",
+    "tenth",
+    "eleventh",
+    "twelfth",
+    "thirteenth",
+    "fourteenth",
+    "fifteenth",
+    "sixteenth",
+    "seventeenth",
+    "eighteenth",
+    "nineteenth",
+    "twentieth",
+]
+
+num_ordinal_dict = {num: ordinal_nums[num] for num in range(21)}
+
+
+def year_to_words(num: int):
+    assert isinstance(num, int), num
+    # check if a num is representing a year
+    if num > 1500 and num < 2000:
+        return words[num // 100] + " " + num_to_words(num % 100)
+    elif num == 2000:
+        return "TWO THOUSAND"
+    elif num > 2000:
+        return "TWO THOUSAND AND " + num_to_words(num % 100)
+    else:
+        return num_to_words(num)
+
+
+def num_to_words(num: int):
+    # Return the English words of a integer number
+
+    # If this is a year number
+    if num > 1500 and num < 2030:
+        return year_to_words(num)
+
+    if num < 20:
+        return words[num]
+    if num < 100:
+        if num % 10 == 0:
+            return words[num // 10 * 10]
+        else:
+            return words[num // 10 * 10] + " " + words[num % 10]
+    if num < 1000:
+        return words[num // 100] + " hundred and " + num_to_words(num % 100)
+    if num < 1000000:
+        return num_to_words(num // 1000) + " thousand " + num_to_words(num % 1000)
+    return num
+
+
+def num_to_ordinal_word(num: int):
+
+    return num_ordinal_dict.get(num, num_to_words(num)).upper()
+
+
+def replace_full_width_symbol(s: str) -> str:
+    # replace full-width symbol with theri half width counterpart
+    s = s.replace("“", '"')
+    s = s.replace("”", '"')
+    s = s.replace("‘", "'")
+    s = s.replace("’", "'")
+
+    return s
+
+
+def decoding_normalization(text: str) -> str:
+    text = replace_full_width_symbol(text)
+
+    # Only keep all alpha-numeric characters, hypen and apostrophe
+    text = text.replace("-", " ")
+    text = re.sub("[^a-zA-Z0-9\s']+", "", text)
+    return text
+
+
+def word_normalization(word: str) -> str:
+    # 1 .Use full word for some abbreviation
+    # 2. Convert digits to english words
+    # 3. Convert ordinal number to english words
+    if word == "MRS":
+        return "MISSUS"
+    if word == "MR":
+        return "MISTER"
+    if word == "ST":
+        return "SAINT"
+    if word == "ECT":
+        return "ET CETERA"
+    if word.isnumeric():
+        word = num_to_words(int(word))
+        return str(word).upper()
+    if word[-2:] == "TH" and word[0].isnumeric():  #  e.g 9TH, 6TH
+        return num_to_ordinal_word(int(word[:-2])).upper()
+    if word[0] == "\'":
+        return word[1:]
+
+    return word
+
+
+def simple_normalization(text: str) -> str:
+    text = replace_full_width_symbol(text)
+    text = text.replace("--", " ")
+
+    return text
+
+
+if __name__ == "__main__":
+
+    s = str(1830)
+    out = word_normalization(s)
+    print(s, out)