add normalize punctuation

This commit is contained in:
Fangjun Kuang 2023-05-30 19:11:11 +08:00
parent 1aeffa73bc
commit c850cb862f
5 changed files with 550 additions and 0 deletions

View File

@ -0,0 +1,169 @@
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
import re
def normalize_punctuation(s: str, lang: str) -> str:
    """Normalize the punctuation of a string.

    This function implements
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl

    Each substitution is preceded by a comment showing the corresponding
    Perl rule from the moses script.

    Args:
      s:
        A string to be normalized.
      lang:
        The language to which `s` belongs.
    Returns:
      Return a normalized string.
    """
    # s/\r//g;
    s = re.sub(r"\r", "", s)

    # remove extra spaces
    # s/\(/ \(/g;
    s = re.sub(r"\(", " (", s)  # add a space before (
    # s/\)/\) /g; s/ +/ /g;
    s = re.sub(r"\)", ") ", s)  # add a space after )
    s = re.sub(r" +", " ", s)  # convert multiple spaces to one
    # s/\) ([\.\!\:\?\;\,])/\)$1/g;
    s = re.sub(r"\) ([\.\!\:\?\;\,])", r")\1", s)
    # s/\( /\(/g;
    s = re.sub(r"\( ", "(", s)  # remove space after (
    # s/ \)/\)/g;
    s = re.sub(r" \)", ")", s)  # remove space before )
    # s/(\d) \%/$1\%/g;
    s = re.sub(r"(\d) \%", r"\1%", s)  # remove space between a digit and %
    # s/ :/:/g;
    s = re.sub(r" :", ":", s)  # remove space before :
    # s/ ;/;/g;
    s = re.sub(r" ;", ";", s)  # remove space before ;

    # normalize unicode punctuation
    # s/\`/\'/g;
    s = re.sub(r"`", "'", s)  # replace ` with '
    # s/\'\'/ \" /g;
    # NOTE: unlike the Perl rule above, no spaces are inserted around "
    s = re.sub(r"''", '"', s)  # replace '' with "

    # s/„/\"/g;
    s = re.sub("\u201e", '"', s)  # replace „ with "
    # s/“/\"/g;
    s = re.sub("\u201c", '"', s)  # replace “ with "
    # s/”/\"/g;
    s = re.sub("\u201d", '"', s)  # replace ” with "
    # s/–/-/g;
    s = re.sub("\u2013", "-", s)  # replace – (en dash) with -
    # s/—/ - /g; s/ +/ /g;
    s = re.sub("\u2014", " - ", s)  # replace — (em dash) with " - "
    s = re.sub(r" +", " ", s)  # convert multiple spaces to one
    # s/´/\'/g;
    s = re.sub("´", "'", s)
    # s/([a-z])‘([a-z])/$1\'$2/gi;
    s = re.sub("([a-z])\u2018([a-z])", r"\1'\2", s, flags=re.IGNORECASE)
    # s/([a-z])’([a-z])/$1\'$2/gi;
    s = re.sub("([a-z])\u2019([a-z])", r"\1'\2", s, flags=re.IGNORECASE)
    # s/‘/\'/g;
    s = re.sub("\u2018", "'", s)
    # s/‚/\'/g;
    s = re.sub("\u201a", "'", s)
    # s/’/\"/g;
    s = re.sub("\u2019", '"', s)
    # s/''/\"/g;
    s = re.sub(r"''", '"', s)
    # s/´´/\"/g;
    s = re.sub(r"´´", '"', s)
    # s/…/.../g;
    s = re.sub("\u2026", "...", s)

    # French quotes
    # s/ « / \"/g;
    s = re.sub(" « ", ' "', s)
    # s/« /\"/g;
    s = re.sub("« ", '"', s)
    # s/«/\"/g;
    s = re.sub("«", '"', s)
    # s/ » /\" /g;
    s = re.sub(" » ", '" ', s)
    # s/ »/\"/g;
    s = re.sub(" »", '"', s)
    # s/»/\"/g;
    s = re.sub("»", '"', s)

    # handle pseudo-spaces
    # NOTE(review): the leading "space" in the Perl rules below is
    # U+00A0 NO-BREAK SPACE, restored here as \u00a0 — confirm against
    # the moses script.
    # s/ \%/\%/g;
    s = re.sub("\u00a0%", "%", s)
    # s/nº /nº /g;
    s = re.sub("nº\u00a0", "nº ", s)
    # s/ :/:/g;
    s = re.sub("\u00a0:", ":", s)
    # s/ ºC/ ºC/g;
    s = re.sub("\u00a0ºC", " ºC", s)
    # s/ cm/ cm/g;
    s = re.sub("\u00a0cm", " cm", s)
    # s/ \?/\?/g;
    s = re.sub("\u00a0\\?", "?", s)
    # s/ \!/\!/g;
    s = re.sub("\u00a0!", "!", s)
    # s/ ;/;/g;
    s = re.sub("\u00a0;", ";", s)
    # s/, /, /g; s/ +/ /g;
    s = re.sub(",\u00a0", ", ", s)
    s = re.sub(r" +", " ", s)

    if lang == "en":
        # English "quotation," followed by comma, style
        # s/\"([,\.]+)/$1\"/g;
        s = re.sub(r'"([,\.]+)', r'\1"', s)
    elif lang in ("cs", "cz"):
        # Czech is confused
        pass
    else:
        # German/Spanish/French "quotation", followed by comma, style
        # s/,\"/\",/g;
        s = re.sub(r',"', '",', s)
        # s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
        s = re.sub(r'(\.+)"(\s*[^<])', r'"\1\2', s)

    # decimal/thousand separator depends on the language
    if lang in ("de", "es", "cz", "cs", "fr"):
        # s/(\d) (\d)/$1,$2/g;
        s = re.sub(r"(\d) (\d)", r"\1,\2", s)
    else:
        # s/(\d) (\d)/$1.$2/g;
        s = re.sub(r"(\d) (\d)", r"\1.\2", s)

    return s

View File

@ -0,0 +1,76 @@
#!/usr/bin/env python3
import argparse
import logging
import re
from pathlib import Path
from functools import partial
from normalize_punctuation import normalize_punctuation
from lhotse.recipes.utils import read_manifests_if_cached
def get_args():
    """Define and parse the command-line arguments of this script."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--manifest-dir",
        type=Path,
        required=True,
        help="Manifest directory",
    )
    ap.add_argument(
        "--tgt-lang",
        type=str,
        required=True,
        help="Target language, e.g., zh, de, fr.",
    )
    return ap.parse_args()
def preprocess_must_c(manifest_dir: Path, tgt_lang: str):
    """Normalize punctuation of MuST-C supervision texts.

    Loads the cached supervision manifest of the ``en-{tgt_lang}`` "dev"
    split from ``manifest_dir``, applies :func:`normalize_punctuation`
    to every supervision text, and prints each utterance whose text
    changed (original, normalized, then a separator) for inspection.

    Args:
      manifest_dir:
        Directory containing the cached lhotse manifests
        (``must_c_*.jsonl.gz``).
      tgt_lang:
        Target language code, e.g., de, zh, fr.

    Raises:
      RuntimeError: if the manifest of a split cannot be loaded.
    """
    print(manifest_dir)
    normalize_punctuation_lang = partial(normalize_punctuation, lang=tgt_lang)

    prefix = "must_c"
    suffix = "jsonl.gz"
    # TODO(review): only "dev" is processed here; presumably "train",
    # "tst-COMMON", and "tst-HE" should be added as well — confirm.
    parts = ["dev"]
    for p in parts:
        name = f"en-{tgt_lang}_{p}"
        manifests = read_manifests_if_cached(
            dataset_parts=name,
            output_dir=manifest_dir,
            prefix=prefix,
            suffix=suffix,
            types=("supervisions",),
        )
        if name not in manifests:
            raise RuntimeError(f"Processing {p} failed.")

        supervisions = manifests[name]["supervisions"]
        # Removed a leftover `if True:` debug wrapper around this code.
        supervisions2 = supervisions.transform_text(normalize_punctuation_lang)

        # Show only the utterances whose text was changed.
        for s, s2 in zip(supervisions, supervisions2):
            if s.text != s2.text:
                print(s.text)
                print(s2.text)
                print("-" * 10)
def main():
    """Entry point: parse arguments, set up logging, run preprocessing."""
    args = get_args()

    log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)
    logging.info(vars(args))

    assert args.manifest_dir.is_dir(), args.manifest_dir

    preprocess_must_c(manifest_dir=args.manifest_dir, tgt_lang=args.tgt_lang)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,196 @@
#!/usr/bin/env python3
from normalize_punctuation import normalize_punctuation
def test_normalize_punctuation():
    """Exercise normalize_punctuation rule by rule.

    Each case is preceded by the Perl rule (from the moses
    normalize-punctuation.perl script) that it covers.  Non-ASCII test
    characters are written as \\uXXXX escapes.
    """
    # s/\r//g;
    s = "a\r\nb\r\n"
    n = normalize_punctuation(s, lang="en")
    assert "\r" not in n
    assert len(s) - 2 == len(n), (len(s), len(n))

    # s/\(/ \(/g;
    s = "(ab (c"
    n = normalize_punctuation(s, lang="en")
    assert n == " (ab (c", n

    # s/\)/\) /g;
    s = "a)b c)"
    n = normalize_punctuation(s, lang="en")
    assert n == "a) b c) "

    # s/ +/ /g;
    s = " a  b   c   d  "
    n = normalize_punctuation(s, lang="en")
    assert n == " a b c d "

    # s/\) ([\.\!\:\?\;\,])/\)$1/g;
    for i in ".!:?;,":
        s = f"a) {i}"
        n = normalize_punctuation(s, lang="en")
        assert n == f"a){i}"

    # s/\( /\(/g;
    s = "a( b"
    n = normalize_punctuation(s, lang="en")
    assert n == "a (b", n

    # s/ \)/\)/g;
    s = "ab ) a"
    n = normalize_punctuation(s, lang="en")
    assert n == "ab) a", n

    # s/(\d) \%/$1\%/g;
    s = "1 %a"
    n = normalize_punctuation(s, lang="en")
    assert n == "1%a", n

    # s/ :/:/g;
    s = "a :"
    n = normalize_punctuation(s, lang="en")
    assert n == "a:", n

    # s/ ;/;/g;
    s = "a ;"
    n = normalize_punctuation(s, lang="en")
    assert n == "a;", n

    # s/\`/\'/g;
    s = "`a`"
    n = normalize_punctuation(s, lang="en")
    assert n == "'a'", n

    # s/\'\'/ \" /g;
    s = "''a''"
    n = normalize_punctuation(s, lang="en")
    assert n == '"a"', n

    # s/„/\"/g;
    s = '\u201ea"'
    n = normalize_punctuation(s, lang="en")
    assert n == '"a"', n

    # s/“/\"/g;
    s = "\u201ca\u201e"
    n = normalize_punctuation(s, lang="en")
    assert n == '"a"', n

    # s/”/\"/g;
    s = "\u201ca\u201d"
    n = normalize_punctuation(s, lang="en")
    assert n == '"a"', n

    # s/–/-/g;
    s = "a\u2013b"
    n = normalize_punctuation(s, lang="en")
    assert n == "a-b", n

    # s/—/ - /g; s/ +/ /g;
    s = "a\u2014b"
    n = normalize_punctuation(s, lang="en")
    assert n == "a - b", n

    # s/´/\'/g;
    s = "a´b"
    n = normalize_punctuation(s, lang="en")
    assert n == "a'b", n

    # s/([a-z])‘([a-z])/$1\'$2/gi;
    # s/([a-z])’([a-z])/$1\'$2/gi;
    for i in "\u2018\u2019":
        s = f"a{i}B"
        n = normalize_punctuation(s, lang="en")
        assert n == "a'B", n

        s = f"A{i}B"
        n = normalize_punctuation(s, lang="en")
        assert n == "A'B", n

        s = f"A{i}b"
        n = normalize_punctuation(s, lang="en")
        assert n == "A'b", n

    # s/‘/\'/g;
    # s/‚/\'/g;
    for i in "\u2018\u201a":
        s = f"a{i}b"
        n = normalize_punctuation(s, lang="en")
        assert n == "a'b", n

    # s/’/\"/g;
    s = "\u2019"
    n = normalize_punctuation(s, lang="en")
    assert n == '"', n

    # s/''/\"/g;
    s = "''"
    n = normalize_punctuation(s, lang="en")
    assert n == '"', n

    # s/´´/\"/g;
    s = "´´"
    n = normalize_punctuation(s, lang="en")
    assert n == '"', n

    # s/…/.../g;
    s = "\u2026"
    n = normalize_punctuation(s, lang="en")
    assert n == "...", n

    # s/ « / \"/g;
    s = "a « b"
    n = normalize_punctuation(s, lang="en")
    assert n == 'a "b', n

    # s/« /\"/g;
    s = "a « b"
    n = normalize_punctuation(s, lang="en")
    assert n == 'a "b', n

    # s/«/\"/g;
    s = "a«b"
    n = normalize_punctuation(s, lang="en")
    assert n == 'a"b', n

    # s/ » /\" /g;
    s = " » "
    n = normalize_punctuation(s, lang="en")
    assert n == '" ', n

    # s/ »/\"/g;
    s = " »"
    n = normalize_punctuation(s, lang="en")
    assert n == '"', n

    # s/»/\"/g;
    s = "»"
    n = normalize_punctuation(s, lang="en")
    assert n == '"', n

    # s/ \%/\%/g;  (leading U+00A0 NO-BREAK SPACE)
    s = "\u00a0%"
    n = normalize_punctuation(s, lang="en")
    assert n == "%", n

    # s/ :/:/g;  (leading U+00A0 NO-BREAK SPACE)
    s = "\u00a0:"
    n = normalize_punctuation(s, lang="en")
    assert n == ":", n

    # s/(\d) (\d)/$1.$2/g;
    s = "2 3"
    n = normalize_punctuation(s, lang="en")
    assert n == "2.3", n

    # s/(\d) (\d)/$1,$2/g;
    s = "2 3"
    n = normalize_punctuation(s, lang="de")
    assert n == "2,3", n
def main():
    """Run all tests in this file."""
    test_normalize_punctuation()


if __name__ == "__main__":
    main()

108
egs/must_c/ST/prepare.sh Executable file
View File

@ -0,0 +1,108 @@
#!/usr/bin/env bash

# Prepare data for the MuST-C speech translation (ST) recipe:
#   stage 0: download the musan corpus
#   stage 1: prepare the musan manifest
#   stage 2: prepare the MuST-C manifests with lhotse
#   stage 3: normalize punctuation of the transcripts

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -eou pipefail

nj=10            # number of parallel jobs for "lhotse prepare must-c"
stage=-1         # first stage to run
stop_stage=100   # last stage to run
version=v1.0     # MuST-C release version
tgt_lang=de      # target language of the en-$tgt_lang pair

dl_dir=$PWD/download
must_c_dir=$dl_dir/must-c/$version/en-$tgt_lang/data

# We assume dl_dir (download dir) contains the following
# directories and files.

# - $dl_dir/must-c/$version/en-$tgt_lang/data/{dev,train,tst-COMMON,tst-HE}
#
#   Please go to https://ict.fbk.eu/must-c-releases/
#   to download and untar the dataset if you have not already done this.

# - $dl_dir/musan
#   This directory contains the following directories downloaded from
#   http://www.openslr.org/17/
#
#   - music
#   - noise
#   - speech

# NOTE(review): sourcing parse_options.sh presumably allows overriding the
# variables above from the command line — see shared/parse_options.sh.
. shared/parse_options.sh || exit 1

# vocab size for sentence piece models.
# It will generate
#  data/lang_bpe_${tgt_lang}_xxx
#  data/lang_bpe_${tgt_lang}_yyy
# if the array contains xxx, yyy
vocab_sizes=(
  # 5000
  # 2000
  # 1000
  500
)

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

# Abort early if the MuST-C data has not been downloaded and unpacked.
if [ ! -d $must_c_dir ]; then
  log "$must_c_dir does not exist"
  exit 1
fi

# All four splits must be present.
for d in dev train tst-COMMON tst-HE; do
  if [ ! -d $must_c_dir/$d ]; then
    log "$must_c_dir/$d does not exist!"
    exit 1
  fi
done

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download musan"
  if [ ! -d $dl_dir/musan ]; then
    lhotse download musan $dl_dir
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to data/musan
  mkdir -p data/manifests
  # The hidden .musan.done file marks this stage as complete so that
  # reruns of the script skip it.
  if [ ! -e data/manifests/.musan.done ]; then
    lhotse prepare musan $dl_dir/musan data/manifests
    touch data/manifests/.musan.done
  fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare must-c $version manifest for target language $tgt_lang"
  mkdir -p data/manifests/$version
  if [ ! -e data/manifests/$version/.done ]; then
    lhotse prepare must-c \
      -j $nj \
      --tgt-lang $tgt_lang \
      $dl_dir/must-c/$version/ \
      data/manifests/$version/
    touch data/manifests/$version/.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Text normalization"
  # Prints the supervisions whose text is changed by the normalization.
  ./local/preprocess_must_c.py \
    --manifest-dir ./data/manifests/$version/ \
    --tgt-lang $tgt_lang
fi

1
egs/must_c/ST/shared Symbolic link
View File

@ -0,0 +1 @@
../../../icefall/shared