update text processing

This commit is contained in:
Nagendra Goel 2022-09-29 17:13:31 +00:00
parent d327a9dbd8
commit b441c1d5a3

View File

@ -16,6 +16,28 @@ def get_args():
return parser.parse_args() return parser.parse_args()
def to_upper(match_obj):
    """Return the text matched by ``match_obj`` converted to uppercase.

    Written as a replacement function for regex substitution: it receives
    a match object and returns the replacement string.

    Args:
        match_obj: a regex match object.

    Returns:
        The whole matched substring, uppercased.
    """
    # group() with no arguments is the entire match, which is always a
    # string (possibly empty), so no None check is needed here.
    return match_obj.group().upper()
def insert_groups_and_capitalize_3(match):
    """Join capture groups 1, 2 and 3 with single spaces and uppercase them.

    Args:
        match: a regex match object with at least three capture groups.

    Returns:
        The three captured pieces separated by spaces, in uppercase.
    """
    first, second, third = match.group(1, 2, 3)
    spaced = f"{first} {second} {third}"
    return spaced.upper()
def insert_groups_and_capitalize_2(match):
    """Join capture groups 1 and 2 with a single space and uppercase them.

    Args:
        match: a regex match object with at least two capture groups.

    Returns:
        The two captured pieces separated by a space, in uppercase.
    """
    first, second = match.group(1, 2)
    spaced = f"{first} {second}"
    return spaced.upper()
def insert_groups_and_capitalize_1(match):
    """Return capture group 1 of ``match`` in uppercase.

    Only the first capture group is kept; any matched text outside that
    group is dropped from the result.

    Args:
        match: a regex match object with at least one capture group.

    Returns:
        The text of capture group 1, uppercased.
    """
    captured = match.group(1)
    return str(captured).upper()
def insert_groups_and_capitalize_1s(match):
    """Return capture group 1 of ``match`` in uppercase, followed by ``'s``.

    Args:
        match: a regex match object with at least one capture group.

    Returns:
        The uppercased text of capture group 1 with a lowercase ``'s``
        suffix appended.
    """
    letter = str(match.group(1)).upper()
    return f"{letter}'s"
# fmt: off # fmt: off
class FisherSwbdNormalizer: class FisherSwbdNormalizer:
"""Note: the functions "normalize" and "keep" implement the logic """Note: the functions "normalize" and "keep" implement the logic
@ -31,23 +53,23 @@ class FisherSwbdNormalizer:
self.remove_regexp_before = re.compile( self.remove_regexp_before = re.compile(
r"|".join([ r"|".join([
# special symbols # special symbols
r"\[\[SKIP.*\]\]", r"\[\[skip.*\]\]",
r"\[SKIP.*\]", r"\[skip.*\]",
r"\[PAUSE.*\]", r"\[pause.*\]",
r"\[SILENCE\]", r"\[silence\]",
r"<B_ASIDE>", r"<b_aside>",
r"<E_ASIDE>", r"<e_aside>",
]) ])
) )
# tuples of (pattern, replacement) # tuples of (pattern, replacement)
# note: Kaldi replaces sighs, coughs, etc with [noise]. # note: Kaldi replaces sighs, coughs, etc with [noise].
# We don't do that here. # We don't do that here.
# We also uppercase the text as the first operation. # We also lowercase the text as the first operation.
self.replace_regexps: Tuple[re.Pattern, str] = [ self.replace_regexps: Tuple[re.Pattern, str] = [
# SWBD: # SWBD:
# [LAUGHTER-STORY] -> STORY # [LAUGHTER-STORY] -> STORY
(re.compile(r"\[LAUGHTER-(.*?)\]"), r"\1"), (re.compile(r"\[laughter-(.*?)\]"), r"\1"),
# [WEA[SONABLE]-/REASONABLE] # [WEA[SONABLE]-/REASONABLE]
(re.compile(r"\[\S+/(\S+)\]"), r"\1"), (re.compile(r"\[\S+/(\S+)\]"), r"\1"),
# -[ADV]AN[TAGE]- -> AN # -[ADV]AN[TAGE]- -> AN
@ -58,19 +80,22 @@ class FisherSwbdNormalizer:
# -[AN]Y- -> Y- # -[AN]Y- -> Y-
(re.compile(r"-?\[.*?\](\w+)-?"), r"\1-"), (re.compile(r"-?\[.*?\](\w+)-?"), r"\1-"),
# special tokens # special tokens
(re.compile(r"\[LAUGH.*?\]"), r"[LAUGHTER]"), (re.compile(r"\[laugh.*?\]"), r"[laughter]"),
(re.compile(r"\[SIGH.*?\]"), r"[SIGH]"), (re.compile(r"\[sigh.*?\]"), r"[sigh]"),
(re.compile(r"\[COUGH.*?\]"), r"[COUGH]"), (re.compile(r"\[cough.*?\]"), r"[cough]"),
(re.compile(r"\[MN.*?\]"), r"[VOCALIZED-NOISE]"), (re.compile(r"\[mn.*?\]"), r"[vocalized-noise]"),
(re.compile(r"\[BREATH.*?\]"), r"[BREATH]"), (re.compile(r"\[breath.*?\]"), r"[breath]"),
(re.compile(r"\[LIPSMACK.*?\]"), r"[LIPSMACK]"), (re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
(re.compile(r"\[SNEEZE.*?\]"), r"[SNEEZE]"), (re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
# abbreviations # abbreviations
(re.compile(r"(\w)\.(\w)\.(\w)",), r"\1 \2 \3"), (re.compile(r"(\w)\.(\w)\.(\w)",), insert_groups_and_capitalize_3),
(re.compile(r"(\w)\.(\w)",), r"\1 \2"), (re.compile(r"(\w)\.(\w)",), insert_groups_and_capitalize_2),
(re.compile(r"([a-h,j-z])\.",), insert_groups_and_capitalize_1),
(re.compile(r"\._",), r" "), (re.compile(r"\._",), r" "),
(re.compile(r"_(\w)",), r"\1"), (re.compile(r"_(\w)",), insert_groups_and_capitalize_1),
(re.compile(r"(\w)\.s",), r"\1's"), (re.compile(r"(\w)\.s",), insert_groups_and_capitalize_1s),
(re.compile(r"([A-Z])\'s",), insert_groups_and_capitalize_1s),
(re.compile(r"(\s\w\b|^\w\b)",), insert_groups_and_capitalize_1),
# words between apostrophes # words between apostrophes
(re.compile(r"'(\S*?)'"), r"\1"), (re.compile(r"'(\S*?)'"), r"\1"),
# dangling dashes (2 passes) # dangling dashes (2 passes)
@ -78,6 +103,8 @@ class FisherSwbdNormalizer:
(re.compile(r"\s-\s"), r" "), (re.compile(r"\s-\s"), r" "),
# special symbol with trailing dash # special symbol with trailing dash
(re.compile(r"(\[.*?\])-"), r"\1"), (re.compile(r"(\[.*?\])-"), r"\1"),
# Replace every remaining dash with a space
(re.compile(r"-"), r" "),
] ]
# unwanted symbols in the transcripts # unwanted symbols in the transcripts
@ -97,7 +124,7 @@ class FisherSwbdNormalizer:
self.whitespace_regexp = re.compile(r"\s+") self.whitespace_regexp = re.compile(r"\s+")
def normalize(self, text: str) -> str: def normalize(self, text: str) -> str:
text = text.upper() text = text.lower()
# first remove # first remove
text = self.remove_regexp_before.sub("", text) text = self.remove_regexp_before.sub("", text)
@ -153,10 +180,11 @@ def main():
def test(): def test():
normalizer = FisherSwbdNormalizer() normalizer = FisherSwbdNormalizer()
for text in [ for text in [
"[laughterr]", "[laughterr] [SILENCE]",
"[laugh] oh this is great [silence] <B_ASIDE> yes", "[laugh] oh this is great [silence] <B_ASIDE> yes",
"[laugh] oh this is [laught] this is great [silence] <B_ASIDE> yes", "[laugh] oh this is [laught] this is great [silence] <B_ASIDE> yes",
"i don't kn- - know a.b.c's", "i don't kn- - know A.B.C's",
"so x. corp is good?",
"'absolutely yes", "'absolutely yes",
"absolutely' yes", "absolutely' yes",
"'absolutely' yes", "'absolutely' yes",
@ -172,6 +200,9 @@ def test():
"[WEA[SONABLE]-/REASONABLE]", "[WEA[SONABLE]-/REASONABLE]",
"[VOCALIZED-NOISE]-", "[VOCALIZED-NOISE]-",
"~BULL", "~BULL",
"Frank E Peretti P E R E T T I",
"yeah yeah like Double O Seven hes supposed to do it",
"P A P E R paper",
]: ]:
print(text) print(text)
print(normalizer.normalize(text)) print(normalizer.normalize(text))
@ -179,5 +210,5 @@ def test():
if __name__ == "__main__": if __name__ == "__main__":
# test() test()
main() # main()