mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-12-11 06:55:27 +00:00
add preprocessing
This commit is contained in:
parent
c850cb862f
commit
1ce9a8b3c4
@ -1,4 +1,13 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
This script normalizes transcripts from supervisions.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
./local/preprocess_must_c.py \
|
||||||
|
--manifest-dir ./data/manifests/v1.0/ \
|
||||||
|
--tgt-lang de
|
||||||
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
@ -33,10 +42,18 @@ def preprocess_must_c(manifest_dir: Path, tgt_lang: str):
|
|||||||
|
|
||||||
prefix = "must_c"
|
prefix = "must_c"
|
||||||
suffix = "jsonl.gz"
|
suffix = "jsonl.gz"
|
||||||
parts = ["dev"]
|
parts = ["dev", "tst-COMMON", "tst-HE", "train"]
|
||||||
for p in parts:
|
for p in parts:
|
||||||
|
logging.info(f"Processing {p}")
|
||||||
name = f"en-{tgt_lang}_{p}"
|
name = f"en-{tgt_lang}_{p}"
|
||||||
|
|
||||||
|
# norm: normalization
|
||||||
|
# rm: remove punctuation
|
||||||
|
dst_name = manifest_dir / f"must_c_supervisions_{name}_norm_rm.jsonl.gz"
|
||||||
|
if dst_name.is_file():
|
||||||
|
logging.info(f"{dst_name} exists - skipping")
|
||||||
|
continue
|
||||||
|
|
||||||
manifests = read_manifests_if_cached(
|
manifests = read_manifests_if_cached(
|
||||||
dataset_parts=name,
|
dataset_parts=name,
|
||||||
output_dir=manifest_dir,
|
output_dir=manifest_dir,
|
||||||
@ -48,14 +65,10 @@ def preprocess_must_c(manifest_dir: Path, tgt_lang: str):
|
|||||||
raise RuntimeError(f"Processing {p} failed.")
|
raise RuntimeError(f"Processing {p} failed.")
|
||||||
|
|
||||||
supervisions = manifests[name]["supervisions"]
|
supervisions = manifests[name]["supervisions"]
|
||||||
if True:
|
supervisions = supervisions.transform_text(normalize_punctuation_lang)
|
||||||
supervisions2 = supervisions.transform_text(normalize_punctuation_lang)
|
supervisions = supervisions.transform_text(lambda x: x.lower())
|
||||||
|
|
||||||
for s, s2 in zip(supervisions, supervisions2):
|
supervisions.to_file(dst_name)
|
||||||
if s.text != s2.text:
|
|
||||||
print(s.text)
|
|
||||||
print(s2.text)
|
|
||||||
print("-" * 10)
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|||||||
41
egs/must_c/ST/local/remove_punctuation.py
Normal file
41
egs/must_c/ST/local/remove_punctuation.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
# Copyright 2023 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
import re
|
||||||
|
import string
|
||||||
|
|
||||||
|
|
||||||
|
def remove_punctuation(s: str) -> str:
    """Strip ASCII punctuation from ``s`` and normalize its whitespace.

    Modeled on
    https://github.com/espnet/espnet/blob/master/utils/remove_punctuation.pl

    Every character of ``string.punctuation`` is deleted except the
    apostrophe. Literal ``<space>`` tokens (used for scoring) are kept
    intact. Afterwards each run of whitespace is collapsed into a single
    space and leading/trailing whitespace is removed.

    The Perl script protects apostrophes and ``<space>`` tokens by
    temporarily rewriting them to the words "apostrophe" and "spacemark".
    A literal port of that trick corrupts any input that already contains
    those words (e.g. "the apostrophe key" would become "the ' key"), so
    this implementation deletes the punctuation directly instead.

    Args:
      s:
        The text to clean.

    Returns:
      The text without punctuation (apostrophes and ``<space>`` tokens
      preserved) and with normalized whitespace.
    """
    # string.punctuation is  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    # Keep the apostrophe by excluding it from the set of deleted characters.
    # See
    # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
    delete_table = str.maketrans("", "", string.punctuation.replace("'", ""))

    # Split on the scoring token so that its "<" and ">" survive; the pieces
    # in between cannot contain "<space>", and stripping them cannot create a
    # new one because "<" and ">" are themselves removed.
    pieces = (piece.translate(delete_table) for piece in s.split("<space>"))
    s = "<space>".join(pieces)

    # s/\s+/ /g;  s/^\s+//;  s/\s+$//;
    # Raw string: "\s" is an invalid escape sequence in a normal literal.
    return re.sub(r"\s+", " ", s).strip()
|
||||||
17
egs/must_c/ST/local/test_remove_punctuation.py
Executable file
17
egs/must_c/ST/local/test_remove_punctuation.py
Executable file
@ -0,0 +1,17 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
from remove_punctuation import remove_punctuation
|
||||||
|
|
||||||
|
|
||||||
|
def test_remove_punctuation():
    """Check remove_punctuation() against a table of (input, expected) pairs."""
    cases = (
        # punctuation is dropped, the apostrophe is kept
        ("a,b'c!#", "ab'c"),
        # remove leading and trailing spaces
        (" ab ", "ab"),
    )
    for raw, expected in cases:
        actual = remove_punctuation(raw)
        assert actual == expected, actual
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_remove_punctuation()
|
||||||
Loading…
x
Reference in New Issue
Block a user