add more normalizations such as number/year to words; fix a few bugs when feeding input to WER computation

2025-12-11 06:55:27 +00:00 · 2023-07-20 15:50:50 +08:00 · 2023-07-20 15:50:50 +08:00 · 754ac00509
commit 754ac00509
parent 5532bb1683
2 changed files with 156 additions and 15 deletions
--- a/egs/libriheavy/ASR/zipformer/decode.py
+++ b/egs/libriheavy/ASR/zipformer/decode.py
@ -118,7 +118,12 @@ from beam_search import (
    greedy_search_batch,
    modified_beam_search,
 )
-from text_normalization import simple_normalization, upper_normalization
+from lhotse.cut import Cut
 from text_normalization import (
    simple_normalization,
    upper_normalization,
    word_normalization,
 )
 from train import add_model_arguments, get_model, get_params
 from icefall.checkpoint import (
@ -802,14 +807,29 @@ def main():
    args.return_cuts = True
    libriheavy = LibriHeavyAsrDataModule(args)
    def add_texts(c: Cut):
        text = c.supervisions[0].text
        c.supervisions[0].texts = [text]
        return c
    test_clean_cuts = libriheavy.test_clean_cuts()
    test_other_cuts = libriheavy.test_other_cuts()
    ls_test_clean_cuts = libriheavy.librispeech_test_clean_cuts()
    ls_test_other_cuts = libriheavy.librispeech_test_other_cuts()
    ls_test_clean_cuts = ls_test_clean_cuts.map(add_texts)
    ls_test_other_cuts = ls_test_other_cuts.map(add_texts)
    test_clean_dl = libriheavy.test_dataloaders(test_clean_cuts)
    test_other_dl = libriheavy.test_dataloaders(test_other_cuts)
    ls_test_clean_dl = libriheavy.test_dataloaders(ls_test_clean_cuts)
    ls_test_other_dl = libriheavy.test_dataloaders(ls_test_other_cuts)
-    test_sets = ["libriheavy-test-clean", "libriheavy-test-other"]
+    # test_sets = ["libriheavy-test-clean", "libriheavy-test-other", "librispeech-test-clean", "librispeech-test-other"]
-    test_dl = [test_clean_dl, test_other_dl]
+    # test_dl = [test_clean_dl, test_other_dl, ls_test_clean_dl, ls_test_other_dl]
    test_sets = ["librispeech-test-clean", "librispeech-test-other"]
    test_dl = [ls_test_clean_dl, ls_test_other_dl]
    for test_set, test_dl in zip(test_sets, test_dl):
        results_dict = decode_dataset(
@ -834,12 +854,12 @@ def main():
            for k in results_dict:
                new_ans = []
                for item in results_dict[k]:
-                    id, hyp, ref = item
+                    id, ref, hyp = item
-                    hyp = [upper_normalization(w.upper()) for w in hyp]
+                    hyp = upper_normalization(" ".join(hyp)).split()
                    hyp = [word_normalization(w) for w in hyp]
                    hyp = " ".join(hyp).split()
                    hyp = [w for w in hyp if w != ""]
-                    ref = [upper_normalization(w.upper()) for w in ref]
+                    new_ans.append((id, ref, hyp))
                    ref = [w for w in ref if w != ""]
                    new_ans.append((id, hyp, ref))
                new_res[k] = new_ans
            save_results(
--- a/egs/libriheavy/ASR/zipformer/text_normalization.py
+++ b/egs/libriheavy/ASR/zipformer/text_normalization.py
@ -1,5 +1,101 @@
 import re
 words = {
    0: "zero",
    1: "one",
    2: "two",
    3: "three",
    4: "four",
    5: "five",
    6: "six",
    7: "seven",
    8: "eight",
    9: "nine",
    10: "ten",
    11: "eleven",
    12: "twelve",
    13: "thirteen",
    14: "fourteen",
    15: "fifteen",
    16: "sixteen",
    17: "seventeen",
    18: "eighteen",
    19: "nineteen",
    20: "twenty",
    30: "thirty",
    40: "forty",
    50: "fifty",
    60: "sixty",
    70: "seventy",
    80: "eighty",
    90: "ninety",
 }
 ordinal_nums = [
    "zeroth",
    "first",
    "second",
    "third",
    "fourth",
    "fifth",
    "sixth",
    "seventh",
    "eighth",
    "ninth",
    "tenth",
    "eleventh",
    "twelfth",
    "thirteenth",
    "fourteenth",
    "fifteenth",
    "sixteenth",
    "seventeenth",
    "eighteenth",
    "nineteenth",
    "twentieth",
 ]
 num_ordinal_dict = {num: ordinal_nums[num] for num in range(21)}
 def year_to_words(num: int):
    assert isinstance(num, int), num
    # check if a num is representing a year
    if num > 1500 and num < 2000:
        return words[num // 100] + " " + num_to_words(num % 100)
    elif num == 2000:
        return "TWO THOUSAND"
    elif num > 2000:
        return "TWO THOUSAND AND " + num_to_words(num % 100)
    else:
        return num_to_words(num)
 def num_to_words(num: int):
    # Return the English words of a integer number
    # If this is a year number
    if num > 1500 and num < 2030:
        return year_to_words(num)
    if num < 20:
        return words[num]
    if num < 100:
        if num % 10 == 0:
            return words[num // 10 * 10]
        else:
            return words[num // 10 * 10] + " " + words[num % 10]
    if num < 1000:
        return words[num // 100] + " hundred and " + num_to_words(num % 100)
    if num < 1000000:
        return num_to_words(num // 1000) + " thousand " + num_to_words(num % 1000)
    return num
 def num_to_ordinal_word(num: int):
    return num_ordinal_dict.get(num, num_to_words(num)).upper()
 def replace_full_width_symbol(s: str) -> str:
    # replace full-width symbol with theri half width counterpart
    s = s.replace("“", '"')
@ -10,18 +106,43 @@ def replace_full_width_symbol(s: str) -> str:
    return s
-def upper_ref_text(text: str) -> str:
+def upper_normalization(text: str) -> str:
    text = replace_full_width_symbol(text)
    text = text.upper()  # upper case all characters
    # Only keep all alpha-numeric characters, hypen and apostrophe
-    text = text.replace("--", " ")
+    text = text.replace("-", " ")
-    text = re.sub("[^a-zA-Z0-9\s\'-]+", "", text)
+    text = re.sub("[^a-zA-Z0-9\s']+", "", text)
    return text
 def word_normalization(word: str) -> str:
    if word == "MRS":
        return "MISSUS"
    if word == "MR":
        return "MISTER"
    if word == "ST":
        return "SAINT"
    if word == "ECT":
        return "ET CETERA"
    if word.isnumeric():
        word = num_to_words(int(word))
        return word.upper()
    if word[-2:] == "TH" and word[0].isnumeric():  #  e.g 9TH, 6TH
        return num_to_ordinal_word(int(word[:-2])).upper()
    return word
 def simple_normalization(text: str) -> str:
    text = replace_full_width_symbol(text)
    text = text.replace("--", " ")
    return text
 if __name__ == "__main__":
    s = str(1830)
    out = word_normalization(s)
    print(s, out)