From 1ce9a8b3c4ce2300550757dbc026e96327371347 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 30 May 2023 20:11:30 +0800 Subject: [PATCH] add preprocessing --- egs/must_c/ST/local/preprocess_must_c.py | 29 +++++++++---- egs/must_c/ST/local/remove_punctuation.py | 41 +++++++++++++++++++ .../ST/local/test_remove_punctuation.py | 17 ++++++++ 3 files changed, 79 insertions(+), 8 deletions(-) create mode 100644 egs/must_c/ST/local/remove_punctuation.py create mode 100755 egs/must_c/ST/local/test_remove_punctuation.py diff --git a/egs/must_c/ST/local/preprocess_must_c.py b/egs/must_c/ST/local/preprocess_must_c.py index 22c063f72..10d0ba5c3 100755 --- a/egs/must_c/ST/local/preprocess_must_c.py +++ b/egs/must_c/ST/local/preprocess_must_c.py @@ -1,4 +1,13 @@ #!/usr/bin/env python3 +""" +This script normalizes transcripts from supervisions. + +Usage: + ./local/preprocess_must_c.py \ + --manifest-dir ./data/manifests/v1.0/ \ + --tgt-lang de +""" + import argparse import logging import re @@ -33,10 +42,18 @@ def preprocess_must_c(manifest_dir: Path, tgt_lang: str): prefix = "must_c" suffix = "jsonl.gz" - parts = ["dev"] + parts = ["dev", "tst-COMMON", "tst-HE", "train"] for p in parts: + logging.info(f"Processing {p}") name = f"en-{tgt_lang}_{p}" + # norm: normalization + # rm: remove punctuation + dst_name = manifest_dir / f"must_c_supervisions_{name}_norm_rm.jsonl.gz" + if dst_name.is_file(): + logging.info(f"{dst_name} exists - skipping") + continue + manifests = read_manifests_if_cached( dataset_parts=name, output_dir=manifest_dir, @@ -48,14 +65,10 @@ def preprocess_must_c(manifest_dir: Path, tgt_lang: str): raise RuntimeError(f"Processing {p} failed.") supervisions = manifests[name]["supervisions"] - if True: - supervisions2 = supervisions.transform_text(normalize_punctuation_lang) + supervisions = supervisions.transform_text(normalize_punctuation_lang) + supervisions = supervisions.transform_text(lambda x: x.lower()) - for s, s2 in zip(supervisions, 
supervisions2):
-                if s.text != s2.text:
-                    print(s.text)
-                    print(s2.text)
-                    print("-" * 10)
+        supervisions.to_file(dst_name)
 
 
 def main():
diff --git a/egs/must_c/ST/local/remove_punctuation.py b/egs/must_c/ST/local/remove_punctuation.py
new file mode 100644
index 000000000..723946ec3
--- /dev/null
+++ b/egs/must_c/ST/local/remove_punctuation.py
@@ -0,0 +1,41 @@
+# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
+import re
+import string
+
+
+def remove_punctuation(s: str) -> str:
+    """Remove punctuation (keeping ' and <space>) and squeeze whitespace; port of
+    https://github.com/espnet/espnet/blob/master/utils/remove_punctuation.pl
+    """
+
+    # Remove punctuation except apostrophe
+    # s/<space>/spacemark/g;  # for scoring
+    s = re.sub("<space>", "spacemark", s)
+
+    # s/'/apostrophe/g;
+    s = re.sub("'", "apostrophe", s)
+
+    # s/[[:punct:]]//g;
+    s = s.translate(str.maketrans("", "", string.punctuation))
+    # string punctuation returns the following string
+    # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
+    # See
+    # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
+
+    # s/apostrophe/'/g;
+    s = re.sub("apostrophe", "'", s)
+
+    # s/spacemark/<space>/g;  # for scoring
+    s = re.sub("spacemark", "<space>", s)
+
+    # remove whitespace
+    # s/\s+/ /g;
+    s = re.sub(r"\s+", " ", s)
+
+    # s/^\s+//;
+    s = re.sub(r"^\s+", "", s)
+
+    # s/\s+$//;
+    s = re.sub(r"\s+$", "", s)
+
+    return s
diff --git a/egs/must_c/ST/local/test_remove_punctuation.py b/egs/must_c/ST/local/test_remove_punctuation.py
new file mode 100755
index 000000000..a4f318550
--- /dev/null
+++ b/egs/must_c/ST/local/test_remove_punctuation.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+
+from remove_punctuation import remove_punctuation
+
+
+def test_remove_punctuation():
+    s = "a,b'c!#"
+    n = remove_punctuation(s)
+    assert n == "ab'c", n
+
+    s = " ab "  # remove leading and trailing spaces
+    n = remove_punctuation(s)
+    assert n == "ab", n
+
+
+if __name__ == "__main__":
+    test_remove_punctuation()