mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-27 02:34:21 +00:00
update text processing
This commit is contained in:
parent
d327a9dbd8
commit
b441c1d5a3
@ -16,6 +16,28 @@ def get_args():
|
|||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
# replacement function to convert lowercase letter to uppercase
|
||||||
|
def to_upper(match_obj):
    """Return the full text of a regex match converted to upper case.

    Suitable as the ``repl`` callable for ``re.sub``.

    Args:
        match_obj: an ``re.Match`` object.

    Returns:
        The whole matched substring (group 0), upper-cased.
    """
    # group() with no arguments is the entire match, which is always a
    # string for a successful match, so no None check is needed (and an
    # implicit ``return None`` would make re.sub fail anyway).
    return match_obj.group().upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_3(match):
    """Join capture groups 1-3 of ``match`` with single spaces, upper-cased.

    Appears as the replacement element paired with the three-letter
    abbreviation pattern in ``FisherSwbdNormalizer.replace_regexps``.
    """
    first, second, third = match.group(1, 2, 3)
    spaced = "{} {} {}".format(first, second, third)
    return spaced.upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_2(match):
    """Join capture groups 1 and 2 of ``match`` with a space, upper-cased.

    Appears as the replacement element paired with the two-letter
    abbreviation pattern in ``FisherSwbdNormalizer.replace_regexps``.
    """
    left, right = match.group(1, 2)
    spaced = "{} {}".format(left, right)
    return spaced.upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_1(match):
    """Return capture group 1 of ``match`` converted to upper case.

    Everything outside group 1 that the pattern matched is dropped from
    the result; any whitespace captured inside group 1 is kept as-is.
    """
    captured = str(match.group(1))
    return captured.upper()
|
||||||
|
|
||||||
|
|
||||||
|
def insert_groups_and_capitalize_1s(match):
    """Return capture group 1 of ``match`` upper-cased, followed by ``'s``."""
    letter = str(match.group(1)).upper()
    return letter + "'s"
|
||||||
|
|
||||||
|
|
||||||
# fmt: off
|
# fmt: off
|
||||||
class FisherSwbdNormalizer:
|
class FisherSwbdNormalizer:
|
||||||
"""Note: the functions "normalize" and "keep" implement the logic
|
"""Note: the functions "normalize" and "keep" implement the logic
|
||||||
@ -31,23 +53,23 @@ class FisherSwbdNormalizer:
|
|||||||
self.remove_regexp_before = re.compile(
|
self.remove_regexp_before = re.compile(
|
||||||
r"|".join([
|
r"|".join([
|
||||||
# special symbols
|
# special symbols
|
||||||
r"\[\[SKIP.*\]\]",
|
r"\[\[skip.*\]\]",
|
||||||
r"\[SKIP.*\]",
|
r"\[skip.*\]",
|
||||||
r"\[PAUSE.*\]",
|
r"\[pause.*\]",
|
||||||
r"\[SILENCE\]",
|
r"\[silence\]",
|
||||||
r"<B_ASIDE>",
|
r"<b_aside>",
|
||||||
r"<E_ASIDE>",
|
r"<e_aside>",
|
||||||
])
|
])
|
||||||
)
|
)
|
||||||
|
|
||||||
# tuples of (pattern, replacement)
|
# tuples of (pattern, replacement)
|
||||||
# note: Kaldi replaces sighs, coughs, etc with [noise].
|
# note: Kaldi replaces sighs, coughs, etc with [noise].
|
||||||
# We don't do that here.
|
# We don't do that here.
|
||||||
# We also uppercase the text as the first operation.
|
# We also lowercase the text as the first operation.
|
||||||
self.replace_regexps: Tuple[re.Pattern, str] = [
|
self.replace_regexps: Tuple[re.Pattern, str] = [
|
||||||
# SWBD:
|
# SWBD:
|
||||||
# [LAUGHTER-STORY] -> STORY
|
# [LAUGHTER-STORY] -> STORY
|
||||||
(re.compile(r"\[LAUGHTER-(.*?)\]"), r"\1"),
|
(re.compile(r"\[laughter-(.*?)\]"), r"\1"),
|
||||||
# [WEA[SONABLE]-/REASONABLE]
|
# [WEA[SONABLE]-/REASONABLE]
|
||||||
(re.compile(r"\[\S+/(\S+)\]"), r"\1"),
|
(re.compile(r"\[\S+/(\S+)\]"), r"\1"),
|
||||||
# -[ADV]AN[TAGE]- -> AN
|
# -[ADV]AN[TAGE]- -> AN
|
||||||
@ -58,19 +80,22 @@ class FisherSwbdNormalizer:
|
|||||||
# -[AN]Y- -> Y-
|
# -[AN]Y- -> Y-
|
||||||
(re.compile(r"-?\[.*?\](\w+)-?"), r"\1-"),
|
(re.compile(r"-?\[.*?\](\w+)-?"), r"\1-"),
|
||||||
# special tokens
|
# special tokens
|
||||||
(re.compile(r"\[LAUGH.*?\]"), r"[LAUGHTER]"),
|
(re.compile(r"\[laugh.*?\]"), r"[laughter]"),
|
||||||
(re.compile(r"\[SIGH.*?\]"), r"[SIGH]"),
|
(re.compile(r"\[sigh.*?\]"), r"[sigh]"),
|
||||||
(re.compile(r"\[COUGH.*?\]"), r"[COUGH]"),
|
(re.compile(r"\[cough.*?\]"), r"[cough]"),
|
||||||
(re.compile(r"\[MN.*?\]"), r"[VOCALIZED-NOISE]"),
|
(re.compile(r"\[mn.*?\]"), r"[vocalized-noise]"),
|
||||||
(re.compile(r"\[BREATH.*?\]"), r"[BREATH]"),
|
(re.compile(r"\[breath.*?\]"), r"[breath]"),
|
||||||
(re.compile(r"\[LIPSMACK.*?\]"), r"[LIPSMACK]"),
|
(re.compile(r"\[lipsmack.*?\]"), r"[lipsmack]"),
|
||||||
(re.compile(r"\[SNEEZE.*?\]"), r"[SNEEZE]"),
|
(re.compile(r"\[sneeze.*?\]"), r"[sneeze]"),
|
||||||
# abbreviations
|
# abbreviations
|
||||||
(re.compile(r"(\w)\.(\w)\.(\w)",), r"\1 \2 \3"),
|
(re.compile(r"(\w)\.(\w)\.(\w)",), insert_groups_and_capitalize_3),
|
||||||
(re.compile(r"(\w)\.(\w)",), r"\1 \2"),
|
(re.compile(r"(\w)\.(\w)",), insert_groups_and_capitalize_2),
|
||||||
|
(re.compile(r"([a-h,j-z])\.",), insert_groups_and_capitalize_1),
|
||||||
(re.compile(r"\._",), r" "),
|
(re.compile(r"\._",), r" "),
|
||||||
(re.compile(r"_(\w)",), r"\1"),
|
(re.compile(r"_(\w)",), insert_groups_and_capitalize_1),
|
||||||
(re.compile(r"(\w)\.s",), r"\1's"),
|
(re.compile(r"(\w)\.s",), insert_groups_and_capitalize_1s),
|
||||||
|
(re.compile(r"([A-Z])\'s",), insert_groups_and_capitalize_1s),
|
||||||
|
(re.compile(r"(\s\w\b|^\w\b)",), insert_groups_and_capitalize_1),
|
||||||
# words between apostrophes
|
# words between apostrophes
|
||||||
(re.compile(r"'(\S*?)'"), r"\1"),
|
(re.compile(r"'(\S*?)'"), r"\1"),
|
||||||
# dangling dashes (2 passes)
|
# dangling dashes (2 passes)
|
||||||
@ -78,6 +103,8 @@ class FisherSwbdNormalizer:
|
|||||||
(re.compile(r"\s-\s"), r" "),
|
(re.compile(r"\s-\s"), r" "),
|
||||||
# special symbol with trailing dash
|
# special symbol with trailing dash
|
||||||
(re.compile(r"(\[.*?\])-"), r"\1"),
|
(re.compile(r"(\[.*?\])-"), r"\1"),
|
||||||
|
# Just remove all dashes
|
||||||
|
(re.compile(r"-"), r" "),
|
||||||
]
|
]
|
||||||
|
|
||||||
# unwanted symbols in the transcripts
|
# unwanted symbols in the transcripts
|
||||||
@ -97,7 +124,7 @@ class FisherSwbdNormalizer:
|
|||||||
self.whitespace_regexp = re.compile(r"\s+")
|
self.whitespace_regexp = re.compile(r"\s+")
|
||||||
|
|
||||||
def normalize(self, text: str) -> str:
|
def normalize(self, text: str) -> str:
|
||||||
text = text.upper()
|
text = text.lower()
|
||||||
|
|
||||||
# first remove
|
# first remove
|
||||||
text = self.remove_regexp_before.sub("", text)
|
text = self.remove_regexp_before.sub("", text)
|
||||||
@ -153,10 +180,11 @@ def main():
|
|||||||
def test():
|
def test():
|
||||||
normalizer = FisherSwbdNormalizer()
|
normalizer = FisherSwbdNormalizer()
|
||||||
for text in [
|
for text in [
|
||||||
"[laughterr]",
|
"[laughterr] [SILENCE]",
|
||||||
"[laugh] oh this is great [silence] <B_ASIDE> yes",
|
"[laugh] oh this is great [silence] <B_ASIDE> yes",
|
||||||
"[laugh] oh this is [laught] this is great [silence] <B_ASIDE> yes",
|
"[laugh] oh this is [laught] this is great [silence] <B_ASIDE> yes",
|
||||||
"i don't kn- - know a.b.c's",
|
"i don't kn- - know A.B.C's",
|
||||||
|
"so x. corp is good?",
|
||||||
"'absolutely yes",
|
"'absolutely yes",
|
||||||
"absolutely' yes",
|
"absolutely' yes",
|
||||||
"'absolutely' yes",
|
"'absolutely' yes",
|
||||||
@ -172,6 +200,9 @@ def test():
|
|||||||
"[WEA[SONABLE]-/REASONABLE]",
|
"[WEA[SONABLE]-/REASONABLE]",
|
||||||
"[VOCALIZED-NOISE]-",
|
"[VOCALIZED-NOISE]-",
|
||||||
"~BULL",
|
"~BULL",
|
||||||
|
"Frank E Peretti P E R E T T I",
|
||||||
|
"yeah yeah like Double O Seven he’s supposed to do it",
|
||||||
|
"P A P E R paper",
|
||||||
]:
|
]:
|
||||||
print(text)
|
print(text)
|
||||||
print(normalizer.normalize(text))
|
print(normalizer.normalize(text))
|
||||||
@ -179,5 +210,5 @@ def test():
|
|||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # test() only prints each built-in sample transcript followed by its
    # normalized form; uncomment it manually when debugging the regexps.
    # Keep main() as the script entry point so that running the file
    # performs the real processing instead of the debug printout.
    # test()
    main()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user