add normalize punctuation

This commit is contained in:
Fangjun Kuang 2023-05-30 19:11:11 +08:00
parent 1aeffa73bc
commit c850cb862f
5 changed files with 550 additions and 0 deletions

View File

@ -0,0 +1,169 @@
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
import re
def normalize_punctuation(s: str, lang: str) -> str:
    """Normalize the punctuation of a string.

    This function implements
    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/normalize-punctuation.perl

    Each substitution is preceded by a comment showing the corresponding
    Perl rule from the moses script.

    Args:
      s:
        A string to be normalized.
      lang:
        The language to which `s` belongs.
    Returns:
      Return a normalized string.
    """
    # s/\r//g;
    s = re.sub(r"\r", "", s)

    # remove extra spaces
    # s/\(/ \(/g;
    s = re.sub(r"\(", " (", s)  # add a space before (
    # s/\)/\) /g; s/ +/ /g;
    s = re.sub(r"\)", ") ", s)  # add a space after )
    s = re.sub(r" +", " ", s)  # convert multiple spaces to one
    # s/\) ([\.\!\:\?\;\,])/\)$1/g;
    s = re.sub(r"\) ([\.\!\:\?\;\,])", r")\1", s)
    # s/\( /\(/g;
    s = re.sub(r"\( ", "(", s)  # remove space after (
    # s/ \)/\)/g;
    s = re.sub(r" \)", ")", s)  # remove space before )
    # s/(\d) \%/$1\%/g;
    s = re.sub(r"(\d) \%", r"\1%", s)  # remove space between a digit and %
    # s/ :/:/g;
    s = re.sub(r" :", ":", s)  # remove space before :
    # s/ ;/;/g;
    s = re.sub(r" ;", ";", s)  # remove space before ;

    # normalize unicode punctuation
    # s/\`/\'/g;
    s = re.sub(r"`", "'", s)  # replace ` with '
    # s/\'\'/ \" /g;
    # NOTE: unlike the Perl rule above, no spaces are inserted around "
    s = re.sub(r"''", '"', s)  # replace '' with "

    # s/„/\"/g;
    s = re.sub("\u201e", '"', s)  # replace „ with "
    # s/“/\"/g;
    s = re.sub("\u201c", '"', s)  # replace “ with "
    # s/”/\"/g;
    s = re.sub("\u201d", '"', s)  # replace ” with "
    # s/–/-/g;
    s = re.sub("\u2013", "-", s)  # replace – (en dash) with -
    # s/—/ - /g; s/ +/ /g;
    s = re.sub("\u2014", " - ", s)  # replace — (em dash) with " - "
    s = re.sub(r" +", " ", s)  # convert multiple spaces to one
    # s/´/\'/g;
    s = re.sub("´", "'", s)
    # s/([a-z])‘([a-z])/$1\'$2/gi;
    s = re.sub("([a-z])\u2018([a-z])", r"\1'\2", s, flags=re.IGNORECASE)
    # s/([a-z])’([a-z])/$1\'$2/gi;
    s = re.sub("([a-z])\u2019([a-z])", r"\1'\2", s, flags=re.IGNORECASE)
    # s/‘/\'/g;
    s = re.sub("\u2018", "'", s)
    # s/‚/\'/g;
    s = re.sub("\u201a", "'", s)
    # s/’/\"/g;
    s = re.sub("\u2019", '"', s)
    # s/''/\"/g;
    s = re.sub(r"''", '"', s)
    # s/´´/\"/g;
    s = re.sub(r"´´", '"', s)
    # s/…/.../g;
    s = re.sub("\u2026", "...", s)

    # French quotes
    # s/ « / \"/g;
    s = re.sub(" « ", ' "', s)
    # s/« /\"/g;
    s = re.sub("« ", '"', s)
    # s/«/\"/g;
    s = re.sub("«", '"', s)
    # s/ » /\" /g;
    s = re.sub(" » ", '" ', s)
    # s/ »/\"/g;
    s = re.sub(" »", '"', s)
    # s/»/\"/g;
    s = re.sub("»", '"', s)

    # handle pseudo-spaces
    # NOTE(review): the leading "space" in the Perl rules below is
    # U+00A0 NO-BREAK SPACE, restored here as \u00a0 — confirm against
    # the moses script.
    # s/ \%/\%/g;
    s = re.sub("\u00a0%", "%", s)
    # s/nº /nº /g;
    s = re.sub("nº\u00a0", "nº ", s)
    # s/ :/:/g;
    s = re.sub("\u00a0:", ":", s)
    # s/ ºC/ ºC/g;
    s = re.sub("\u00a0ºC", " ºC", s)
    # s/ cm/ cm/g;
    s = re.sub("\u00a0cm", " cm", s)
    # s/ \?/\?/g;
    s = re.sub("\u00a0\\?", "?", s)
    # s/ \!/\!/g;
    s = re.sub("\u00a0!", "!", s)
    # s/ ;/;/g;
    s = re.sub("\u00a0;", ";", s)
    # s/, /, /g; s/ +/ /g;
    s = re.sub(",\u00a0", ", ", s)
    s = re.sub(r" +", " ", s)

    if lang == "en":
        # English "quotation," followed by comma, style
        # s/\"([,\.]+)/$1\"/g;
        s = re.sub(r'"([,\.]+)', r'\1"', s)
    elif lang in ("cs", "cz"):
        # Czech is confused
        pass
    else:
        # German/Spanish/French "quotation", followed by comma, style
        # s/,\"/\",/g;
        s = re.sub(r',"', '",', s)
        # s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
        s = re.sub(r'(\.+)"(\s*[^<])', r'"\1\2', s)

    # decimal/thousand separator depends on the language
    if lang in ("de", "es", "cz", "cs", "fr"):
        # s/(\d) (\d)/$1,$2/g;
        s = re.sub(r"(\d) (\d)", r"\1,\2", s)
    else:
        # s/(\d) (\d)/$1.$2/g;
        s = re.sub(r"(\d) (\d)", r"\1.\2", s)

    return s

View File

@ -0,0 +1,76 @@
#!/usr/bin/env python3
import argparse
import logging
import re
from pathlib import Path
from functools import partial
from normalize_punctuation import normalize_punctuation
from lhotse.recipes.utils import read_manifests_if_cached
def get_args():
    """Define and parse the command-line arguments of this script."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--manifest-dir",
        type=Path,
        required=True,
        help="Manifest directory",
    )
    ap.add_argument(
        "--tgt-lang",
        type=str,
        required=True,
        help="Target language, e.g., zh, de, fr.",
    )
    return ap.parse_args()
def preprocess_must_c(manifest_dir: Path, tgt_lang: str):
    """Normalize punctuation of MuST-C supervision texts.

    Loads the cached supervision manifest of the ``en-{tgt_lang}`` "dev"
    split from ``manifest_dir``, applies :func:`normalize_punctuation`
    to every supervision text, and prints each utterance whose text
    changed (original, normalized, then a separator) for inspection.

    Args:
      manifest_dir:
        Directory containing the cached lhotse manifests
        (``must_c_*.jsonl.gz``).
      tgt_lang:
        Target language code, e.g., de, zh, fr.

    Raises:
      RuntimeError: if the manifest of a split cannot be loaded.
    """
    print(manifest_dir)
    normalize_punctuation_lang = partial(normalize_punctuation, lang=tgt_lang)

    prefix = "must_c"
    suffix = "jsonl.gz"
    # TODO(review): only "dev" is processed here; presumably "train",
    # "tst-COMMON", and "tst-HE" should be added as well — confirm.
    parts = ["dev"]
    for p in parts:
        name = f"en-{tgt_lang}_{p}"
        manifests = read_manifests_if_cached(
            dataset_parts=name,
            output_dir=manifest_dir,
            prefix=prefix,
            suffix=suffix,
            types=("supervisions",),
        )
        if name not in manifests:
            raise RuntimeError(f"Processing {p} failed.")

        supervisions = manifests[name]["supervisions"]
        # Removed a leftover `if True:` debug wrapper around this code.
        supervisions2 = supervisions.transform_text(normalize_punctuation_lang)

        # Show only the utterances whose text was changed.
        for s, s2 in zip(supervisions, supervisions2):
            if s.text != s2.text:
                print(s.text)
                print(s2.text)
                print("-" * 10)
def main():
    """Entry point: parse arguments, set up logging, run preprocessing."""
    args = get_args()

    log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)
    logging.info(vars(args))

    assert args.manifest_dir.is_dir(), args.manifest_dir

    preprocess_must_c(manifest_dir=args.manifest_dir, tgt_lang=args.tgt_lang)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,196 @@
#!/usr/bin/env python3
from normalize_punctuation import normalize_punctuation
def test_normalize_punctuation():
    """Exercise normalize_punctuation rule by rule.

    Each case is preceded by the Perl rule (from the moses
    normalize-punctuation.perl script) that it covers.  Non-ASCII test
    characters are written as \\uXXXX escapes.
    """
    # s/\r//g;
    s = "a\r\nb\r\n"
    n = normalize_punctuation(s, lang="en")
    assert "\r" not in n
    assert len(s) - 2 == len(n), (len(s), len(n))

    # s/\(/ \(/g;
    s = "(ab (c"
    n = normalize_punctuation(s, lang="en")
    assert n == " (ab (c", n

    # s/\)/\) /g;
    s = "a)b c)"
    n = normalize_punctuation(s, lang="en")
    assert n == "a) b c) "

    # s/ +/ /g;
    s = " a  b   c   d  "
    n = normalize_punctuation(s, lang="en")
    assert n == " a b c d "

    # s/\) ([\.\!\:\?\;\,])/\)$1/g;
    for i in ".!:?;,":
        s = f"a) {i}"
        n = normalize_punctuation(s, lang="en")
        assert n == f"a){i}"

    # s/\( /\(/g;
    s = "a( b"
    n = normalize_punctuation(s, lang="en")
    assert n == "a (b", n

    # s/ \)/\)/g;
    s = "ab ) a"
    n = normalize_punctuation(s, lang="en")
    assert n == "ab) a", n

    # s/(\d) \%/$1\%/g;
    s = "1 %a"
    n = normalize_punctuation(s, lang="en")
    assert n == "1%a", n

    # s/ :/:/g;
    s = "a :"
    n = normalize_punctuation(s, lang="en")
    assert n == "a:", n

    # s/ ;/;/g;
    s = "a ;"
    n = normalize_punctuation(s, lang="en")
    assert n == "a;", n

    # s/\`/\'/g;
    s = "`a`"
    n = normalize_punctuation(s, lang="en")
    assert n == "'a'", n

    # s/\'\'/ \" /g;
    s = "''a''"
    n = normalize_punctuation(s, lang="en")
    assert n == '"a"', n

    # s/„/\"/g;
    s = '\u201ea"'
    n = normalize_punctuation(s, lang="en")
    assert n == '"a"', n

    # s/“/\"/g;
    s = "\u201ca\u201e"
    n = normalize_punctuation(s, lang="en")
    assert n == '"a"', n

    # s/”/\"/g;
    s = "\u201ca\u201d"
    n = normalize_punctuation(s, lang="en")
    assert n == '"a"', n

    # s/–/-/g;
    s = "a\u2013b"
    n = normalize_punctuation(s, lang="en")
    assert n == "a-b", n

    # s/—/ - /g; s/ +/ /g;
    s = "a\u2014b"
    n = normalize_punctuation(s, lang="en")
    assert n == "a - b", n

    # s/´/\'/g;
    s = "a´b"
    n = normalize_punctuation(s, lang="en")
    assert n == "a'b", n

    # s/([a-z])‘([a-z])/$1\'$2/gi;
    # s/([a-z])’([a-z])/$1\'$2/gi;
    for i in "\u2018\u2019":
        s = f"a{i}B"
        n = normalize_punctuation(s, lang="en")
        assert n == "a'B", n

        s = f"A{i}B"
        n = normalize_punctuation(s, lang="en")
        assert n == "A'B", n

        s = f"A{i}b"
        n = normalize_punctuation(s, lang="en")
        assert n == "A'b", n

    # s/‘/\'/g;
    # s/‚/\'/g;
    for i in "\u2018\u201a":
        s = f"a{i}b"
        n = normalize_punctuation(s, lang="en")
        assert n == "a'b", n

    # s/’/\"/g;
    s = "\u2019"
    n = normalize_punctuation(s, lang="en")
    assert n == '"', n

    # s/''/\"/g;
    s = "''"
    n = normalize_punctuation(s, lang="en")
    assert n == '"', n

    # s/´´/\"/g;
    s = "´´"
    n = normalize_punctuation(s, lang="en")
    assert n == '"', n

    # s/…/.../g;
    s = "\u2026"
    n = normalize_punctuation(s, lang="en")
    assert n == "...", n

    # s/ « / \"/g;
    s = "a « b"
    n = normalize_punctuation(s, lang="en")
    assert n == 'a "b', n

    # s/« /\"/g;
    s = "a « b"
    n = normalize_punctuation(s, lang="en")
    assert n == 'a "b', n

    # s/«/\"/g;
    s = "a«b"
    n = normalize_punctuation(s, lang="en")
    assert n == 'a"b', n

    # s/ » /\" /g;
    s = " » "
    n = normalize_punctuation(s, lang="en")
    assert n == '" ', n

    # s/ »/\"/g;
    s = " »"
    n = normalize_punctuation(s, lang="en")
    assert n == '"', n

    # s/»/\"/g;
    s = "»"
    n = normalize_punctuation(s, lang="en")
    assert n == '"', n

    # s/ \%/\%/g;  (leading U+00A0 NO-BREAK SPACE)
    s = "\u00a0%"
    n = normalize_punctuation(s, lang="en")
    assert n == "%", n

    # s/ :/:/g;  (leading U+00A0 NO-BREAK SPACE)
    s = "\u00a0:"
    n = normalize_punctuation(s, lang="en")
    assert n == ":", n

    # s/(\d) (\d)/$1.$2/g;
    s = "2 3"
    n = normalize_punctuation(s, lang="en")
    assert n == "2.3", n

    # s/(\d) (\d)/$1,$2/g;
    s = "2 3"
    n = normalize_punctuation(s, lang="de")
    assert n == "2,3", n
def main():
    """Run all tests in this file."""
    test_normalize_punctuation()


if __name__ == "__main__":
    main()

108
egs/must_c/ST/prepare.sh Executable file
View File

@ -0,0 +1,108 @@
#!/usr/bin/env bash

# Prepare data for the MuST-C speech translation (ST) recipe:
#   stage 0: download the musan corpus
#   stage 1: prepare the musan manifest
#   stage 2: prepare the MuST-C manifests with lhotse
#   stage 3: normalize punctuation of the transcripts

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -eou pipefail

nj=10            # number of parallel jobs for "lhotse prepare must-c"
stage=-1         # first stage to run
stop_stage=100   # last stage to run
version=v1.0     # MuST-C release version
tgt_lang=de      # target language of the en-$tgt_lang pair

dl_dir=$PWD/download
must_c_dir=$dl_dir/must-c/$version/en-$tgt_lang/data

# We assume dl_dir (download dir) contains the following
# directories and files.

# - $dl_dir/must-c/$version/en-$tgt_lang/data/{dev,train,tst-COMMON,tst-HE}
#
#   Please go to https://ict.fbk.eu/must-c-releases/
#   to download and untar the dataset if you have not already done this.

# - $dl_dir/musan
#   This directory contains the following directories downloaded from
#   http://www.openslr.org/17/
#
#   - music
#   - noise
#   - speech

# NOTE(review): sourcing parse_options.sh presumably allows overriding the
# variables above from the command line — see shared/parse_options.sh.
. shared/parse_options.sh || exit 1

# vocab size for sentence piece models.
# It will generate
#  data/lang_bpe_${tgt_lang}_xxx
#  data/lang_bpe_${tgt_lang}_yyy
# if the array contains xxx, yyy
vocab_sizes=(
  # 5000
  # 2000
  # 1000
  500
)

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

# Abort early if the MuST-C data has not been downloaded and unpacked.
if [ ! -d $must_c_dir ]; then
  log "$must_c_dir does not exist"
  exit 1
fi

# All four splits must be present.
for d in dev train tst-COMMON tst-HE; do
  if [ ! -d $must_c_dir/$d ]; then
    log "$must_c_dir/$d does not exist!"
    exit 1
  fi
done

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download musan"
  if [ ! -d $dl_dir/musan ]; then
    lhotse download musan $dl_dir
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to data/musan
  mkdir -p data/manifests
  # The hidden .musan.done file marks this stage as complete so that
  # reruns of the script skip it.
  if [ ! -e data/manifests/.musan.done ]; then
    lhotse prepare musan $dl_dir/musan data/manifests
    touch data/manifests/.musan.done
  fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare must-c $version manifest for target language $tgt_lang"
  mkdir -p data/manifests/$version
  if [ ! -e data/manifests/$version/.done ]; then
    lhotse prepare must-c \
      -j $nj \
      --tgt-lang $tgt_lang \
      $dl_dir/must-c/$version/ \
      data/manifests/$version/
    touch data/manifests/$version/.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Text normalization"
  # Prints the supervisions whose text is changed by the normalization.
  ./local/preprocess_must_c.py \
    --manifest-dir ./data/manifests/$version/ \
    --tgt-lang $tgt_lang
fi

1
egs/must_c/ST/shared Symbolic link
View File

@ -0,0 +1 @@
../../../icefall/shared