From c850cb862fc0819d9bc7003b14a00ade227753dd Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Tue, 30 May 2023 19:11:11 +0800
Subject: [PATCH] add normalize punctuation

---
 egs/must_c/ST/local/normalize_punctuation.py  | 169 +++++++++++++++
 egs/must_c/ST/local/preprocess_must_c.py      |  76 +++++++
 .../ST/local/test_normalize_punctuation.py    | 196 ++++++++++++++++++
 egs/must_c/ST/prepare.sh                      | 108 ++++++++++
 egs/must_c/ST/shared                          |   1 +
 5 files changed, 550 insertions(+)
 create mode 100644 egs/must_c/ST/local/normalize_punctuation.py
 create mode 100755 egs/must_c/ST/local/preprocess_must_c.py
 create mode 100755 egs/must_c/ST/local/test_normalize_punctuation.py
 create mode 100755 egs/must_c/ST/prepare.sh
 create mode 120000 egs/must_c/ST/shared

diff --git a/egs/must_c/ST/local/normalize_punctuation.py b/egs/must_c/ST/local/normalize_punctuation.py
new file mode 100644
index 000000000..efd47e091
--- /dev/null
+++ b/egs/must_c/ST/local/normalize_punctuation.py
@@ -0,0 +1,169 @@
+# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
+import re
+
+
+def normalize_punctuation(s: str, lang: str) -> str:
+    """
+    This function implements
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl
+
+    Args:
+      s:
+        A string to be normalized.
+      lang:
+        The language to which `s` belongs
+    Returns:
+      Return a normalized string.
+    """
+    # s/\r//g;
+    s = re.sub("\r", "", s)
+
+    # remove extra spaces
+    # s/\(/ \(/g;
+    s = re.sub("\(", " (", s)  # add a space before (
+
+    # s/\)/\) /g; s/ +/ /g;
+    s = re.sub("\)", ") ", s)  # add a space after )
+    s = re.sub(" +", " ", s)  # convert multiple spaces to one
+
+    # s/\) ([\.\!\:\?\;\,])/\)$1/g;
+    s = re.sub("\) ([\.\!\:\?\;\,])", r")\1", s)
+
+    # s/\( /\(/g;
+    s = re.sub("\( ", "(", s)  # remove space after (
+
+    # s/ \)/\)/g;
+    s = re.sub(" \)", ")", s)  # remove space before )
+
+    # s/(\d) \%/$1\%/g;
+    s = re.sub("(\d) \%", r"\1%", s)  # remove space between a digit and %
+
+    # s/ :/:/g;
+    s = re.sub(" :", ":", s)  # remove space before :
+
+    # s/ ;/;/g;
+    s = re.sub(" ;", ";", s)  # remove space before ;
+
+    # normalize unicode punctuation
+    # s/\`/\'/g;
+    s = re.sub("`", "'", s)  # replace ` with '
+
+    # s/\'\'/ \" /g;
+    s = re.sub("''", '"', s)  # replace '' with "
+
+    # s/„/\"/g;
+    s = re.sub("„", '"', s)  # replace „ with "
+
+    # s/“/\"/g;
+    s = re.sub("“", '"', s)  # replace “ with "
+
+    # s/”/\"/g;
+    s = re.sub("”", '"', s)  # replace ” with "
+
+    # s/–/-/g;
+    s = re.sub("–", "-", s)  # replace – with -
+
+    # s/—/ - /g; s/ +/ /g;
+    s = re.sub("—", " - ", s)
+    s = re.sub(" +", " ", s)  # convert multiple spaces to one
+
+    # s/´/\'/g;
+    s = re.sub("´", "'", s)
+
+    # s/([a-z])‘([a-z])/$1\'$2/gi;
+    s = re.sub("([a-z])‘([a-z])", r"\1'\2", s, flags=re.IGNORECASE)
+
+    # s/([a-z])’([a-z])/$1\'$2/gi;
+    s = re.sub("([a-z])’([a-z])", r"\1'\2", s, flags=re.IGNORECASE)
+
+    # s/‘/\'/g;
+    s = re.sub("‘", "'", s)
+
+    # s/‚/\'/g;
+    s = re.sub("‚", "'", s)
+
+    # s/’/\"/g;
+    s = re.sub("’", '"', s)
+
+    # s/''/\"/g;
+    s = re.sub("''", '"', s)
+
+    # s/´´/\"/g;
+    s = re.sub("´´", '"', s)
+
+    # s/…/.../g;
+    s = re.sub("…", "...", s)
+
+    # French quotes
+
+    # s/ « / \"/g;
+    s = re.sub(" « ", ' "', s)
+
+    # s/« /\"/g;
+    s = re.sub("« ", '"', s)
+
+    # s/«/\"/g;
+    s = re.sub("«", '"', s)
+
+    # s/ » /\" /g;
+    s = re.sub(" » ", '" ', s)
+
+    # s/ »/\"/g;
+    s = re.sub(" »", '"', s)
+
+    # s/»/\"/g;
+    s = re.sub("»", '"', s)
+
+    # handle pseudo-spaces
+
+    # s/ \%/\%/g;
+    s = re.sub(" %", r"%", s)
+
+    # s/nº /nº /g;
+    s = re.sub("nº ", "nº ", s)
+
+    # s/ :/:/g;
+    s = re.sub(" :", ":", s)
+
+    # s/ ºC/ ºC/g;
+    s = re.sub(" ºC", " ºC", s)
+
+    # s/ cm/ cm/g;
+    s = re.sub(" cm", " cm", s)
+
+    # s/ \?/\?/g;
+    s = re.sub(" \?", "?", s)
+
+    # s/ \!/\!/g;
+    s = re.sub(" \!", "!", s)
+
+    # s/ ;/;/g;
+    s = re.sub(" ;", ";", s)
+
+    # s/, /, /g; s/ +/ /g;
+    s = re.sub(", ", ", ", s)
+    s = re.sub(" +", " ", s)
+
+    if lang == "en":
+        # English "quotation," followed by comma, style
+        # s/\"([,\.]+)/$1\"/g;
+        s = re.sub('"([,\.]+)', r'\1"', s)
+    elif lang in ("cs", "cz"):
+        # Czech is confused
+        pass
+    else:
+        # German/Spanish/French "quotation", followed by comma, style
+        # s/,\"/\",/g;
+        s = re.sub(',"', '",', s)
+
+        # s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
+        s = re.sub('(\.+)"(\s*[^<])', r'"\1\2', s)
+
+    if lang in ("de", "es", "cz", "cs", "fr"):
+        # s/(\d) (\d)/$1,$2/g;
+        s = re.sub("(\d) (\d)", r"\1,\2", s)
+    else:
+        # s/(\d) (\d)/$1.$2/g;
+        s = re.sub("(\d) (\d)", r"\1.\2", s)
+
+    return s
diff --git a/egs/must_c/ST/local/preprocess_must_c.py b/egs/must_c/ST/local/preprocess_must_c.py
new file mode 100755
index 000000000..22c063f72
--- /dev/null
+++ b/egs/must_c/ST/local/preprocess_must_c.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import re
+from pathlib import Path
+from functools import partial
+
+from normalize_punctuation import normalize_punctuation
+from lhotse.recipes.utils import read_manifests_if_cached
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--manifest-dir",
+        type=Path,
+        required=True,
+        help="Manifest directory",
+    )
+    parser.add_argument(
+        "--tgt-lang",
+        type=str,
+        required=True,
+        help="Target language, e.g., zh, de, fr.",
+    )
+    return parser.parse_args()
+
+
+def preprocess_must_c(manifest_dir: Path, tgt_lang: str):
+    print(manifest_dir)
+
+    normalize_punctuation_lang = partial(normalize_punctuation, lang=tgt_lang)
+
+    prefix = "must_c"
+    suffix = "jsonl.gz"
+    parts = ["dev"]
+    for p in parts:
+        name = f"en-{tgt_lang}_{p}"
+
+        manifests = read_manifests_if_cached(
+            dataset_parts=name,
+            output_dir=manifest_dir,
+            prefix=prefix,
+            suffix=suffix,
+            types=("supervisions",),
+        )
+        if name not in manifests:
+            raise RuntimeError(f"Processing {p} failed.")
+
+        supervisions = manifests[name]["supervisions"]
+        if True:
+            supervisions2 = supervisions.transform_text(normalize_punctuation_lang)
+
+            for s, s2 in zip(supervisions, supervisions2):
+                if s.text != s2.text:
+                    print(s.text)
+                    print(s2.text)
+                    print("-" * 10)
+
+
+def main():
+    args = get_args()
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+    logging.basicConfig(format=formatter, level=logging.INFO)
+
+    logging.info(vars(args))
+    assert args.manifest_dir.is_dir(), args.manifest_dir
+
+    preprocess_must_c(
+        manifest_dir=args.manifest_dir,
+        tgt_lang=args.tgt_lang,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/must_c/ST/local/test_normalize_punctuation.py b/egs/must_c/ST/local/test_normalize_punctuation.py
new file mode 100755
index 000000000..28ecfb4f1
--- /dev/null
+++ b/egs/must_c/ST/local/test_normalize_punctuation.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+
+from normalize_punctuation import normalize_punctuation
+
+
+def test_normalize_punctuation():
+    # s/\r//g;
+    s = "a\r\nb\r\n"
+    n = normalize_punctuation(s, lang="en")
+    assert "\r" not in n
+    assert len(s) - 2 == len(n), (len(s), len(n))
+
+    # s/\(/ \(/g;
+    s = "(ab (c"
+    n = normalize_punctuation(s, lang="en")
+    assert n == " (ab (c", n
+
+    # s/\)/\) /g;
+    s = "a)b c)"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "a) b c) "
+
+    # s/ +/ /g;
+    s = " a  b   c  d "
+    n = normalize_punctuation(s, lang="en")
+    assert n == " a b c d "
+
+    # s/\) ([\.\!\:\?\;\,])/\)$1/g;
+    for i in ".!:?;,":
+        s = f"a) {i}"
+        n = normalize_punctuation(s, lang="en")
+        assert n == f"a){i}"
+
+    # s/\( /\(/g;
+    s = "a( b"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "a (b", n
+
+    # s/ \)/\)/g;
+    s = "ab ) a"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "ab) a", n
+
+    # s/(\d) \%/$1\%/g;
+    s = "1 %a"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "1%a", n
+
+    # s/ :/:/g;
+    s = "a :"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "a:", n
+
+    # s/ ;/;/g;
+    s = "a ;"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "a;", n
+
+    # s/\`/\'/g;
+    s = "`a`"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "'a'", n
+
+    # s/\'\'/ \" /g;
+    s = "''a''"
+    n = normalize_punctuation(s, lang="en")
+    assert n == '"a"', n
+
+    # s/„/\"/g;
+    s = '„a"'
+    n = normalize_punctuation(s, lang="en")
+    assert n == '"a"', n
+
+    # s/“/\"/g;
+    s = "“a„"
+    n = normalize_punctuation(s, lang="en")
+    assert n == '"a"', n
+
+    # s/”/\"/g;
+    s = "“a”"
+    n = normalize_punctuation(s, lang="en")
+    assert n == '"a"', n
+
+    # s/–/-/g;
+    s = "a–b"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "a-b", n
+
+    # s/—/ - /g; s/ +/ /g;
+    s = "a—b"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "a - b", n
+
+    # s/´/\'/g;
+    s = "a´b"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "a'b", n
+
+    # s/([a-z])‘([a-z])/$1\'$2/gi;
+    for i in "‘’":
+        s = f"a{i}B"
+        n = normalize_punctuation(s, lang="en")
+        assert n == "a'B", n
+
+        s = f"A{i}B"
+        n = normalize_punctuation(s, lang="en")
+        assert n == "A'B", n
+
+        s = f"A{i}b"
+        n = normalize_punctuation(s, lang="en")
+        assert n == "A'b", n
+
+    # s/‘/\'/g;
+    # s/‚/\'/g;
+    for i in "‘‚":
+        s = f"a{i}b"
+        n = normalize_punctuation(s, lang="en")
+        assert n == "a'b", n
+
+    # s/’/\"/g;
+    s = "’"
+    n = normalize_punctuation(s, lang="en")
+    assert n == '"', n
+
+    # s/''/\"/g;
+    s = "''"
+    n = normalize_punctuation(s, lang="en")
+    assert n == '"', n
+
+    # s/´´/\"/g;
+    s = "´´"
+    n = normalize_punctuation(s, lang="en")
+    assert n == '"', n
+
+    # s/…/.../g;
+    s = "…"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "...", n
+
+    # s/ « / \"/g;
+    s = "a « b"
+    n = normalize_punctuation(s, lang="en")
+    assert n == 'a "b', n
+
+    # s/« /\"/g;
+    s = "a « b"
+    n = normalize_punctuation(s, lang="en")
+    assert n == 'a "b', n
+
+    # s/«/\"/g;
+    s = "a«b"
+    n = normalize_punctuation(s, lang="en")
+    assert n == 'a"b', n
+
+    # s/ » /\" /g;
+    s = " » "
+    n = normalize_punctuation(s, lang="en")
+    assert n == '" ', n
+
+    # s/ »/\"/g;
+    s = " »"
+    n = normalize_punctuation(s, lang="en")
+    assert n == '"', n
+
+    # s/»/\"/g;
+    s = "»"
+    n = normalize_punctuation(s, lang="en")
+    assert n == '"', n
+
+    # s/ \%/\%/g;
+    s = " %"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "%", n
+
+    # s/ :/:/g;
+    s = " :"
+    n = normalize_punctuation(s, lang="en")
+    assert n == ":", n
+
+    # s/(\d) (\d)/$1.$2/g;
+    s = "2 3"
+    n = normalize_punctuation(s, lang="en")
+    assert n == "2.3", n
+
+    # s/(\d) (\d)/$1,$2/g;
+    s = "2 3"
+    n = normalize_punctuation(s, lang="de")
+    assert n == "2,3", n
+
+
+def main():
+    test_normalize_punctuation()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/must_c/ST/prepare.sh b/egs/must_c/ST/prepare.sh
new file mode 100755
index 000000000..75a9aa27e
--- /dev/null
+++ b/egs/must_c/ST/prepare.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+set -eou pipefail
+
+nj=10
+stage=-1
+stop_stage=100
+
+version=v1.0
+tgt_lang=de
+dl_dir=$PWD/download
+
+must_c_dir=$dl_dir/must-c/$version/en-$tgt_lang/data
+
+# We assume dl_dir (download dir) contains the following
+# directories and files.
+# - $dl_dir/must-c/$version/en-$tgt_lang/data/{dev,train,tst-COMMON,tst-HE}
+#
+# Please go to https://ict.fbk.eu/must-c-releases/
+# to download and untar the dataset if you have not already done this.
+
+# - $dl_dir/musan
+# This directory contains the following directories downloaded from
+# http://www.openslr.org/17/
+#
+# - music
+# - noise
+# - speech
+
+. shared/parse_options.sh || exit 1
+
+# vocab size for sentence piece models.
+# It will generate
+# data/lang_bpe_${tgt_lang}_xxx
+# data/lang_bpe_${tgt_lang}_yyy
+# if the array contains xxx, yyy
+vocab_sizes=(
+  # 5000
+  # 2000
+  # 1000
+  500
+)
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+mkdir -p data
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+log "dl_dir: $dl_dir"
+
+if [ ! -d $must_c_dir ]; then
+  log "$must_c_dir does not exist"
+  exit 1
+fi
+
+for d in dev train tst-COMMON tst-HE; do
+  if [ ! -d $must_c_dir/$d ]; then
+    log "$must_c_dir/$d does not exist!"
+    exit 1
+  fi
+done
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Stage 0: Download musan"
+  if [ ! -d $dl_dir/musan ]; then
+    lhotse download musan $dl_dir
+  fi
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare musan manifest"
+  # We assume that you have downloaded the musan corpus
+  # to $dl_dir/musan
+  mkdir -p data/manifests
+  if [ ! -e data/manifests/.musan.done ]; then
+    lhotse prepare musan $dl_dir/musan data/manifests
+    touch data/manifests/.musan.done
+  fi
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Prepare must-c $version manifest for target language $tgt_lang"
+  mkdir -p data/manifests/$version
+  if [ ! -e data/manifests/$version/.done ]; then
+    lhotse prepare must-c \
+      -j $nj \
+      --tgt-lang $tgt_lang \
+      $dl_dir/must-c/$version/ \
+      data/manifests/$version/
+
+    touch data/manifests/$version/.done
+  fi
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Text normalization"
+  ./local/preprocess_must_c.py \
+    --manifest-dir ./data/manifests/$version/ \
+    --tgt-lang $tgt_lang
+fi
diff --git a/egs/must_c/ST/shared b/egs/must_c/ST/shared
new file mode 120000
index 000000000..4cbd91a7e
--- /dev/null
+++ b/egs/must_c/ST/shared
@@ -0,0 +1 @@
+../../../icefall/shared
\ No newline at end of file
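
A quick way to sanity-check the ported normalizer is to run it on a handful of strings and compare the output by eye against what the Moses normalize-punctuation.perl script produces. The sketch below is only illustrative: it assumes it is run from egs/must_c/ST/local/ after applying the patch (so that normalize_punctuation.py is importable), and the sample sentences are made up.

#!/usr/bin/env python3
# Illustrative sketch: print before/after pairs for a few hand-written strings.
# Assumes the current working directory is egs/must_c/ST/local/ from this patch.
from normalize_punctuation import normalize_punctuation

samples = [
    ("«Bonjour» , dit-il …", "fr"),
    ("He said : “wait” , then left .", "en"),
    ("Es kostet 1 000 Euro", "de"),
]

for text, lang in samples:
    print(f"[{lang}] {text!r} -> {normalize_punctuation(text, lang=lang)!r}")

The new test file can also be executed directly (e.g., python3 local/test_normalize_punctuation.py from egs/must_c/ST/), since it runs its checks through a main() entry point.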