This commit is contained in:
yfyeung 2024-04-02 07:24:27 +00:00
parent 9d1f0b5022
commit e35741583c

View File

@ -18,6 +18,7 @@
import logging import logging
import re import re
from pathlib import Path from pathlib import Path
import unicodedata
from lhotse import CutSet, SupervisionSegment from lhotse import CutSet, SupervisionSegment
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
@ -25,11 +26,56 @@ from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import str2bool from icefall.utils import str2bool
def get_args():
    """Parse the command-line options of this script.

    Returns:
      The ``argparse.Namespace`` produced by the parser. Its ``lang``
      attribute holds the value given to ``--lang`` or ``None`` when the
      option is omitted (no default is configured).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--lang",
        type=str,
        help="Language of the data to preprocess, e.g. Thai.",
    )

    return parser.parse_args()
# Single-pass character table for Thai transcripts: ASCII digits are mapped to
# the Thai digits U+0E50..U+0E59 and the listed currency signs are deleted.
# str.translate is used instead of re.sub because "\u" escapes are only valid
# in regex patterns, not in replacement templates (re.sub raises "bad escape").
_THAI_CHAR_TABLE = str.maketrans(
    "0123456789",
    "\u0E50\u0E51\u0E52\u0E53\u0E54\u0E55\u0E56\u0E57\u0E58\u0E59",
    # "$" (U+0024), pound sign (U+00A3), yen sign (U+00A5) and U+00AC.
    # NOTE(review): these symbols are dropped because no spoken-form
    # replacement text is available here. U+00AC is the NOT SIGN; the euro
    # sign is U+20AC -- confirm which code point was intended.
    "\u0024\u00A3\u00A5\u00AC",
)


def normalize_text(
    text: str,
    lang: str,
) -> str:
    """Normalize a transcript string.

    The text is NFKC-normalized, upper-cased, and every parenthesized span
    ``(...)`` is replaced by a single space. Afterwards:

      - if ``lang == "Thai"``: ASCII digits are converted to Thai digits,
        the currency signs listed in ``_THAI_CHAR_TABLE`` are removed, and
        all whitespace is deleted;
      - otherwise: runs of whitespace are collapsed to one space and the
        result is stripped.

    Args:
      text:
        The transcript to normalize.
      lang:
        Language name; only the exact value ``"Thai"`` selects the
        Thai-specific rules.

    Returns:
      The normalized transcript.
    """
    text = unicodedata.normalize("NFKC", text)

    # Convert to upper case
    text = text.upper()

    # Remove brackets with content
    text = re.sub(r"\([^\)]*\)", " ", text)

    # Language-related normalization
    if lang == "Thai":
        # Digit mapping and currency symbol removal
        text = text.translate(_THAI_CHAR_TABLE)

        # Remove blank symbols
        text = re.sub(r"\s", "", text)
    else:
        text = re.sub(r"\s+", " ", text).strip()

    return text
def preprocess_gigaspeech2(args): def preprocess_gigaspeech2(args):
@ -79,7 +125,8 @@ def main():
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO) logging.basicConfig(format=formatter, level=logging.INFO)
preprocess_gigaspeech2() args = get_args()
preprocess_gigaspeech2(args)
if __name__ == "__main__": if __name__ == "__main__":