updated scripts for text norm

jinzr 2024-03-13 10:57:59 +08:00
parent 09a358a23e
commit b30a4d6162
2 changed files with 20 additions and 7 deletions


@@ -55,6 +55,7 @@ def normalize_text(utt: str, language: str) -> str:
     elif language in ["yue", "zh-HK"]:
         # Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
         # Not sure why they decided to do this...
+        # Non-en/zh-yue tokens are manually removed here
         return (
             utt.replace("", "")
             .replace("", " ")
@@ -80,6 +81,7 @@ def normalize_text(utt: str, language: str) -> str:
             .replace("", "")
             .replace("", "")
             .replace("", "")
+            .replace("·", "")
             .upper()
         )
     else:
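A minimal usage sketch of the yue/zh-HK branch after this change may help. It is illustrative only: several replace targets above were lost in this rendering of the diff, so it assumes they are full-width punctuation that is likewise stripped or turned into spaces. The one effect visible in this commit is that the middle dot "·" is now removed before Latin text is upper-cased.

# Illustrative sketch, not part of the commit; the exact output depends on
# the full replace chain shown (only partially) above.
from preprocess_commonvoice import normalize_text

print(normalize_text("占士·邦 is a spy", "yue"))
# expected along the lines of: "占士邦 IS A SPY"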


@@ -26,6 +26,7 @@ files in the directory "data/lang_char":
 import argparse
 import logging
+import re
 from pathlib import Path
 from typing import List
@@ -33,7 +34,7 @@ import pycantonese
 from preprocess_commonvoice import normalize_text
 from tqdm.auto import tqdm
-from icefall.utils import is_cjk
+from icefall.utils import is_cjk, tokenize_by_CJK_char
 def get_parser():
@@ -73,23 +74,28 @@ def get_word_segments(lines: List[str]) -> List[str]:
     for line in tqdm(lines, desc="Segmenting lines"):
         try:
-            # code switching
-            if len(line.strip().split(" ")) > 1:
+            if is_cs(line):  # code switching
                 segments = []
-                for segment in line.strip().split(" "):
+                curr_str = ""
+                for segment in tokenize_by_CJK_char(line).split(" "):
                     if segment.strip() == "":
                         continue
                     try:
                         if not is_cjk(segment[0]):  # en segment
+                            if curr_str:
+                                segments.extend(pycantonese.segment(curr_str))
+                                curr_str = ""
                             segments.append(segment)
                         else:  # zh segment
-                            segments.extend(pycantonese.segment(segment))
+                            curr_str += segment
+                            # segments.extend(pycantonese.segment(segment))
                     except Exception as e:
                         logging.error(f"Failed to process segment: {segment}")
                         raise e
+                if curr_str:  # process the last segment
+                    segments.extend(pycantonese.segment(curr_str))
                 new_lines.append(" ".join(segments) + "\n")
-            # not code switching
-            else:
+            else:  # not code switching
                 new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
         except Exception as e:
             logging.error(f"Failed to process line: {line}")
@@ -104,6 +110,11 @@ def get_words(lines: List[str]) -> List[str]:
     return list(words)
+def is_cs(line: str) -> bool:
+    english_markers = r"[a-zA-Z]+"
+    return bool(re.search(english_markers, line))
 if __name__ == "__main__":
     parser = get_parser()
     args = parser.parse_args()
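Taken together, the change stops splitting code-switched lines on whitespace and instead buffers consecutive CJK characters, so pycantonese segments whole Cantonese spans rather than isolated characters while English tokens pass through untouched; is_cs() simply looks for Latin letters to decide whether a line is code-switched at all. Below is a self-contained sketch of that flow, not the committed code; it assumes icefall.utils.tokenize_by_CJK_char returns the line with spaces inserted around every CJK character (so each Cantonese character becomes its own token while Latin words stay intact) and that pycantonese.segment returns a list of Cantonese words.

# Sketch only: reproduces the new buffering logic outside the script.
import pycantonese
from icefall.utils import is_cjk, tokenize_by_CJK_char

line = "我哋用gradient descent嚟做training"
segments, curr_str = [], ""
for token in tokenize_by_CJK_char(line).split(" "):
    if not token.strip():
        continue
    if is_cjk(token[0]):
        # buffer consecutive Cantonese characters so pycantonese can see the
        # whole span and segment it into words rather than single characters
        curr_str += token
    else:
        # flush the buffered Cantonese span before keeping the English token
        if curr_str:
            segments.extend(pycantonese.segment(curr_str))
            curr_str = ""
        segments.append(token)
if curr_str:  # flush the trailing Cantonese span
    segments.extend(pycantonese.segment(curr_str))

print(" ".join(segments))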