From 1ce9a8b3c4ce2300550757dbc026e96327371347 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Tue, 30 May 2023 20:11:30 +0800
Subject: [PATCH] add preprocessing

---
 egs/must_c/ST/local/preprocess_must_c.py      | 29 +++++++++----
 egs/must_c/ST/local/remove_punctuation.py     | 41 +++++++++++++++++++
 .../ST/local/test_remove_punctuation.py       | 17 ++++++++
 3 files changed, 79 insertions(+), 8 deletions(-)
 create mode 100644 egs/must_c/ST/local/remove_punctuation.py
 create mode 100755 egs/must_c/ST/local/test_remove_punctuation.py
diff --git a/egs/must_c/ST/local/preprocess_must_c.py b/egs/must_c/ST/local/preprocess_must_c.py
index 22c063f72..10d0ba5c3 100755
--- a/egs/must_c/ST/local/preprocess_must_c.py
+++ b/egs/must_c/ST/local/preprocess_must_c.py
@@ -1,4 +1,13 @@
 #!/usr/bin/env python3
+"""
+This script normalizes and lowercases supervision transcripts, then saves them.
+
+Usage:
+  ./local/preprocess_must_c.py \
+    --manifest-dir ./data/manifests/v1.0/ \
+    --tgt-lang de
+"""
+
 import argparse
 import logging
 import re
@@ -33,10 +42,18 @@ def preprocess_must_c(manifest_dir: Path, tgt_lang: str):
 
     prefix = "must_c"
     suffix = "jsonl.gz"
-    parts = ["dev"]
+    parts = ["dev", "tst-COMMON", "tst-HE", "train"]
     for p in parts:
+        logging.info(f"Processing {p}")
         name = f"en-{tgt_lang}_{p}"
 
+        # norm: normalization
+        # rm: remove punctuation (NOTE(review): not applied in this patch?)
+        dst_name = manifest_dir / f"must_c_supervisions_{name}_norm_rm.jsonl.gz"
+        if dst_name.is_file():
+            logging.info(f"{dst_name} exists - skipping")
+            continue
+
         manifests = read_manifests_if_cached(
             dataset_parts=name,
             output_dir=manifest_dir,
@@ -48,14 +65,10 @@ def preprocess_must_c(manifest_dir: Path, tgt_lang: str):
             raise RuntimeError(f"Processing {p} failed.")
 
         supervisions = manifests[name]["supervisions"]
-        if True:
-            supervisions2 = supervisions.transform_text(normalize_punctuation_lang)
+        supervisions = supervisions.transform_text(normalize_punctuation_lang)
+        supervisions = supervisions.transform_text(lambda x: x.lower())
 
-        for s, s2 in zip(supervisions, supervisions2):
-            if s.text != s2.text:
-                print(s.text)
-                print(s2.text)
-                print("-" * 10)
+        supervisions.to_file(dst_name)
 
 
 def main():
diff --git a/egs/must_c/ST/local/remove_punctuation.py b/egs/must_c/ST/local/remove_punctuation.py
new file mode 100644
index 000000000..723946ec3
--- /dev/null
+++ b/egs/must_c/ST/local/remove_punctuation.py
@@ -0,0 +1,41 @@
+# Copyright    2023  Xiaomi Corp.        (authors: Fangjun Kuang)
+import re
+import string
+
+
+def remove_punctuation(s: str) -> str:
+    """Remove ASCII punctuation from ``s`` and normalize its whitespace.
+
+    It follows
+    https://github.com/espnet/espnet/blob/master/utils/remove_punctuation.pl
+    but protects apostrophes and the ``<space>`` token directly rather than
+    through the placeholder words "apostrophe" and "spacemark", so input
+    that already contains those words is left untouched.
+
+    Args:
+      s:
+        The text to clean.
+
+    Returns:
+      ``s`` without the characters of ``string.punctuation`` (apostrophes
+      and ``<space>`` tokens are kept), with every whitespace run collapsed
+      to one space and no leading or trailing whitespace.
+    """
+    # string.punctuation is the following string
+    # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
+    # See
+    # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
+    #
+    # s/[[:punct:]]//g; but the apostrophe is dropped from the deletion set
+    table = str.maketrans("", "", string.punctuation.replace("'", ""))
+
+    # "<space>" is kept for scoring: clean only the text between the tokens.
+    pieces = [p.translate(table) for p in s.split("<space>")]
+    s = "<space>".join(pieces)
+
+    # s/\s+/ /g;  s/^\s+//;  s/\s+$//;
+    # Raw string: "\s" in a plain literal is an invalid escape sequence.
+    s = re.sub(r"\s+", " ", s)
+    s = s.strip()
+
+    return s
diff --git a/egs/must_c/ST/local/test_remove_punctuation.py b/egs/must_c/ST/local/test_remove_punctuation.py
new file mode 100755
index 000000000..a4f318550
--- /dev/null
+++ b/egs/must_c/ST/local/test_remove_punctuation.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python3
+
+from remove_punctuation import remove_punctuation
+
+
+def test_remove_punctuation():
+    """Punctuation is removed, apostrophes stay, outer spaces are trimmed."""
+    for text, wanted in (
+        ("a,b'c!#", "ab'c"),
+        ("  ab  ", "ab"),  # remove leading and trailing spaces
+    ):
+        got = remove_punctuation(text)
+        assert got == wanted, got
+
+
+if __name__ == "__main__":
+    test_remove_punctuation()  # run the check when executed as a script