add preprocessing

This commit is contained in:
Fangjun Kuang 2023-05-30 20:11:30 +08:00
parent c850cb862f
commit 1ce9a8b3c4
3 changed files with 79 additions and 8 deletions

View File

@ -1,4 +1,13 @@
#!/usr/bin/env python3
"""
This script normalizes transcripts from supervisions.
Usage:
./local/preprocess_must_c.py \
--manifest-dir ./data/manifests/v1.0/ \
--tgt-lang de
"""
import argparse
import logging
import re
@ -33,10 +42,18 @@ def preprocess_must_c(manifest_dir: Path, tgt_lang: str):
prefix = "must_c"
suffix = "jsonl.gz"
parts = ["dev"]
parts = ["dev", "tst-COMMON", "tst-HE", "train"]
for p in parts:
logging.info(f"Processing {p}")
name = f"en-{tgt_lang}_{p}"
# norm: normalization
# rm: remove punctuation
dst_name = manifest_dir / f"must_c_supervisions_{name}_norm_rm.jsonl.gz"
if dst_name.is_file():
logging.info(f"{dst_name} exists - skipping")
continue
manifests = read_manifests_if_cached(
dataset_parts=name,
output_dir=manifest_dir,
@ -48,14 +65,10 @@ def preprocess_must_c(manifest_dir: Path, tgt_lang: str):
raise RuntimeError(f"Processing {p} failed.")
supervisions = manifests[name]["supervisions"]
if True:
supervisions2 = supervisions.transform_text(normalize_punctuation_lang)
supervisions = supervisions.transform_text(normalize_punctuation_lang)
supervisions = supervisions.transform_text(lambda x: x.lower())
for s, s2 in zip(supervisions, supervisions2):
if s.text != s2.text:
print(s.text)
print(s2.text)
print("-" * 10)
supervisions.to_file(dst_name)
def main():

View File

@ -0,0 +1,41 @@
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
import re
import string
def remove_punctuation(s: str) -> str:
    """Remove ASCII punctuation (except the apostrophe) and tidy whitespace.

    The behavior follows
    https://github.com/espnet/espnet/blob/master/utils/remove_punctuation.pl
    with one deliberate difference: the Perl script protects the apostrophe
    and the "<space>" token by temporarily rewriting them to the words
    "apostrophe" and "spacemark", so any input that already contains those
    words gets corrupted. Here the protected items are simply never touched,
    so such words pass through unchanged.

    Args:
      s:
        The text to clean.

    Returns:
      A copy of ``s`` in which every character of ``string.punctuation``
      other than the apostrophe is deleted, every literal "<space>" token is
      kept as is, each run of whitespace is collapsed to a single space, and
      leading/trailing whitespace is removed.
    """
    # string.punctuation is the following string
    #   !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    # Exclude the apostrophe from the set of characters to delete.
    # See
    # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
    delete_table = str.maketrans("", "", string.punctuation.replace("'", ""))

    # "<space>" is kept for scoring. Splitting on it keeps its angle brackets
    # out of reach of the punctuation removal applied to the other pieces.
    s = "<space>".join(
        piece.translate(delete_table) for piece in s.split("<space>")
    )

    # Collapse every run of whitespace into one space (s/\s+/ /g in Perl).
    # A raw string is required: "\s" is an invalid escape in a normal string.
    s = re.sub(r"\s+", " ", s)

    # After the collapse the only whitespace left is " ", so stripping
    # spaces is equivalent to s/^\s+// followed by s/\s+$//.
    return s.strip(" ")

View File

@ -0,0 +1,17 @@
#!/usr/bin/env python3
from remove_punctuation import remove_punctuation
def test_remove_punctuation():
    """Run remove_punctuation() on a few inputs and compare with expectations."""
    cases = (
        # punctuation is dropped while the apostrophe is kept
        ("a,b'c!#", "ab'c"),
        # remove leading and trailing spaces
        (" ab ", "ab"),
    )
    for text, expected in cases:
        actual = remove_punctuation(text)
        # On failure, show the value that was actually produced.
        assert actual == expected, actual


# Allow running this file directly, without a test runner.
if __name__ == "__main__":
    test_remove_punctuation()