mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-12-11 06:55:27 +00:00
add preprocessing
This commit is contained in:
parent
c850cb862f
commit
1ce9a8b3c4
@ -1,4 +1,13 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
This script normalizes transcripts from supervisions.
|
||||
|
||||
Usage:
|
||||
./local/preprocess_must_c.py \
|
||||
--manifest-dir ./data/manifests/v1.0/ \
|
||||
--tgt-lang de
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import re
|
||||
@ -33,10 +42,18 @@ def preprocess_must_c(manifest_dir: Path, tgt_lang: str):
|
||||
|
||||
prefix = "must_c"
|
||||
suffix = "jsonl.gz"
|
||||
parts = ["dev"]
|
||||
parts = ["dev", "tst-COMMON", "tst-HE", "train"]
|
||||
for p in parts:
|
||||
logging.info(f"Processing {p}")
|
||||
name = f"en-{tgt_lang}_{p}"
|
||||
|
||||
# norm: normalization
|
||||
# rm: remove punctuation
|
||||
dst_name = manifest_dir / f"must_c_supervisions_{name}_norm_rm.jsonl.gz"
|
||||
if dst_name.is_file():
|
||||
logging.info(f"{dst_name} exists - skipping")
|
||||
continue
|
||||
|
||||
manifests = read_manifests_if_cached(
|
||||
dataset_parts=name,
|
||||
output_dir=manifest_dir,
|
||||
@ -48,14 +65,10 @@ def preprocess_must_c(manifest_dir: Path, tgt_lang: str):
|
||||
raise RuntimeError(f"Processing {p} failed.")
|
||||
|
||||
supervisions = manifests[name]["supervisions"]
|
||||
if True:
|
||||
supervisions2 = supervisions.transform_text(normalize_punctuation_lang)
|
||||
supervisions = supervisions.transform_text(normalize_punctuation_lang)
|
||||
supervisions = supervisions.transform_text(lambda x: x.lower())
|
||||
|
||||
for s, s2 in zip(supervisions, supervisions2):
|
||||
if s.text != s2.text:
|
||||
print(s.text)
|
||||
print(s2.text)
|
||||
print("-" * 10)
|
||||
supervisions.to_file(dst_name)
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
41
egs/must_c/ST/local/remove_punctuation.py
Normal file
41
egs/must_c/ST/local/remove_punctuation.py
Normal file
@ -0,0 +1,41 @@
|
||||
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
import re
|
||||
import string
|
||||
|
||||
|
||||
def remove_punctuation(s: str) -> str:
    """Strip ASCII punctuation from ``s`` while keeping apostrophes.

    It is modelled on
    https://github.com/espnet/espnet/blob/master/utils/remove_punctuation.pl

    Unlike a literal port of that script, no placeholder words are
    substituted into the text and back again, so an input that happens to
    contain the words "apostrophe" or "spacemark" is left intact instead of
    being rewritten to "'" or "<space>".

    Args:
      s:
        The text to normalize.
    Returns:
      Return ``s`` with every character of ``string.punctuation`` except
      the apostrophe removed, every ``<space>`` token preserved, runs of
      whitespace collapsed to a single space, and leading and trailing
      whitespace removed.
    """
    # string.punctuation is the following string
    #   !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    # The apostrophe is excluded so that it survives the deletion below.
    table = str.maketrans("", "", string.punctuation.replace("'", ""))

    # "<space>" is a token kept for scoring. Its angle brackets are
    # punctuation, so the text is split on the token, each piece is cleaned
    # on its own, and the token is re-inserted when joining.
    pieces = [piece.translate(table) for piece in s.split("<space>")]
    s = "<space>".join(pieces)

    # Collapse whitespace runs; a raw string avoids the invalid "\s" escape.
    s = re.sub(r"\s+", " ", s)

    # After collapsing, at most one blank remains at either end.
    return s.strip()
|
||||
17
egs/must_c/ST/local/test_remove_punctuation.py
Executable file
17
egs/must_c/ST/local/test_remove_punctuation.py
Executable file
@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from remove_punctuation import remove_punctuation
|
||||
|
||||
|
||||
def test_remove_punctuation():
    """Check remove_punctuation() against (input, expected output) pairs."""
    cases = (
        # punctuation is dropped, the apostrophe is kept
        ("a,b'c!#", "ab'c"),
        # remove leading and trailing spaces
        (" ab ", "ab"),
    )
    for text, expected in cases:
        n = remove_punctuation(text)
        assert n == expected, n


if __name__ == "__main__":
    test_remove_punctuation()
|
||||
Loading…
x
Reference in New Issue
Block a user