mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
update
This commit is contained in:
parent
9d1f0b5022
commit
e35741583c
@ -18,6 +18,7 @@
|
|||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
from lhotse import CutSet, SupervisionSegment
|
from lhotse import CutSet, SupervisionSegment
|
||||||
from lhotse.recipes.utils import read_manifests_if_cached
|
from lhotse.recipes.utils import read_manifests_if_cached
|
||||||
@ -25,11 +26,56 @@ from lhotse.recipes.utils import read_manifests_if_cached
|
|||||||
from icefall.utils import str2bool
|
from icefall.utils import str2bool
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      An ``argparse.Namespace`` with a single attribute ``lang`` (``str`` or
      ``None`` when the option is omitted).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--lang",
        type=str,
        help="Language of the transcripts. 'Thai' enables the Thai-specific "
        "text normalization; any other value uses the generic normalization.",
    )

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
# Translation table applied to Thai transcripts in a single pass:
#   * ASCII digits 0-9 are mapped to the Thai digits U+0E50..U+0E59;
#   * currency symbols are deleted (mapped to None).
# NOTE(review): the intended replacements for U+00A5 (YEN SIGN) and U+00AC
# were lost, so they are removed like "$" and U+00A3 (POUND SIGN) -- confirm.
# NOTE(review): U+00AC is NOT SIGN; the euro sign is U+20AC -- confirm which
# code point was meant.
_THAI_CHAR_TABLE = {
    **{ord("0") + i: 0x0E50 + i for i in range(10)},
    **dict.fromkeys((0x0024, 0x00A3, 0x00A5, 0x00AC)),
}

_BRACKETED_PATTERN = re.compile(r"\([^\)]*\)")
_WHITESPACE_PATTERN = re.compile(r"\s+")


def normalize_text(
    text: str,
    lang: str,
) -> str:
    """Normalize a transcript.

    Steps, in order:
      1. Unicode NFKC normalization.
      2. Upper-casing.
      3. Replacing every ``(...)`` group (brackets and content) by a space.
      4. If ``lang == "Thai"``: map ASCII digits to Thai digits, drop the
         currency symbols listed in ``_THAI_CHAR_TABLE`` and remove all
         whitespace. Otherwise: collapse whitespace runs to a single space
         and strip leading/trailing whitespace.

    Args:
      text:
        The transcript to normalize.
      lang:
        Language name; only the exact value ``"Thai"`` selects the
        Thai-specific rules.

    Returns:
      The normalized transcript.
    """
    text = unicodedata.normalize("NFKC", text)

    # Convert to upper case
    text = text.upper()

    # Remove brackets with content
    text = _BRACKETED_PATTERN.sub(" ", text)

    # Language-related normalization
    if lang == "Thai":
        # str.translate avoids regex replacement templates, where "\uXXXX"
        # is rejected as a bad escape.
        text = text.translate(_THAI_CHAR_TABLE)

        # Remove blank symbols
        text = _WHITESPACE_PATTERN.sub("", text)
    else:
        text = _WHITESPACE_PATTERN.sub(" ", text).strip()

    return text
|
||||||
|
|
||||||
|
|
||||||
def preprocess_gigaspeech2(args):
|
def preprocess_gigaspeech2(args):
|
||||||
@ -79,7 +125,8 @@ def main():
|
|||||||
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||||
|
|
||||||
preprocess_gigaspeech2()
|
args = get_args()
|
||||||
|
preprocess_gigaspeech2(args)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
Loading…
x
Reference in New Issue
Block a user