Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-26 18:24:18 +00:00)
updated scripts for text norm
commit b30a4d6162 (parent 09a358a23e)
@@ -55,6 +55,7 @@ def normalize_text(utt: str, language: str) -> str:
     elif language in ["yue", "zh-HK"]:
         # Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
         # Not sure why they decided to do this...
+        # Non-en/zh-yue tokens are manually removed here
         return (
             utt.replace(",", "")
             .replace("。", " ")
@@ -80,6 +81,7 @@ def normalize_text(utt: str, language: str) -> str:
             .replace("》", "")
             .replace("…", "")
+            .replace("⋯", "")
             .replace("·", "")
             .upper()
         )
     else:
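For readers skimming the diff: a minimal, self-contained sketch of what this Cantonese branch does. normalize_yue_sketch is a hypothetical name, it only covers the punctuation characters visible in the hunks above (the real normalize_text handles more), and the sample transcript is illustrative.

def normalize_yue_sketch(utt: str) -> str:
    # Sketch only: drop the CJK punctuation shown in the hunks above and
    # upper-case any embedded English; "。" becomes a space, as in the diff.
    for ch in ",》…⋯·":
        utt = utt.replace(ch, "")
    return utt.replace("。", " ").upper()


# Example (hypothetical transcript):
#   normalize_yue_sketch("我哋去咗diy工作坊,好好玩。")
#   -> "我哋去咗DIY工作坊好好玩 "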
@@ -26,6 +26,7 @@ files in the directory "data/lang_char":

 import argparse
 import logging
+import re
 from pathlib import Path
 from typing import List

@@ -33,7 +34,7 @@ import pycantonese
 from preprocess_commonvoice import normalize_text
 from tqdm.auto import tqdm

-from icefall.utils import is_cjk
+from icefall.utils import is_cjk, tokenize_by_CJK_char


 def get_parser():
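The segmentation change below leans on tokenize_by_CJK_char from icefall.utils. As a rough, hypothetical illustration (not icefall's actual implementation, and using a deliberately narrow CJK range), the effect is to give every CJK character its own token while leaving Latin-script words intact:

import re

_CJK_CHAR = re.compile(r"([\u4e00-\u9fff])")  # narrow CJK range, illustration only


def tokenize_by_cjk_char_sketch(line: str) -> str:
    # Split around every CJK character, then rejoin with single spaces so
    # "我想聽jazz音樂" becomes "我 想 聽 jazz 音 樂".
    parts = _CJK_CHAR.split(line.strip())
    return " ".join(p.strip() for p in parts if p.strip())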
@@ -73,23 +74,28 @@ def get_word_segments(lines: List[str]) -> List[str]:
     for line in tqdm(lines, desc="Segmenting lines"):
         try:
-            # code switching
-            if len(line.strip().split(" ")) > 1:
+            if is_cs(line):  # code switching
                 segments = []
-                for segment in line.strip().split(" "):
+                curr_str = ""
+                for segment in tokenize_by_CJK_char(line).split(" "):
                     if segment.strip() == "":
                         continue
                     try:
                         if not is_cjk(segment[0]):  # en segment
+                            if curr_str:
+                                segments.extend(pycantonese.segment(curr_str))
+                                curr_str = ""
                             segments.append(segment)
                         else:  # zh segment
-                            segments.extend(pycantonese.segment(segment))
+                            curr_str += segment
+                            # segments.extend(pycantonese.segment(segment))
                     except Exception as e:
                         logging.error(f"Failed to process segment: {segment}")
                         raise e
+                if curr_str:  # process the last segment
+                    segments.extend(pycantonese.segment(curr_str))
                 new_lines.append(" ".join(segments) + "\n")
-            # not code switching
-            else:
+            else:  # not code switching
                 new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
         except Exception as e:
             logging.error(f"Failed to process line: {line}")
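To make the buffering idea above concrete: consecutive CJK characters are accumulated into curr_str so that pycantonese.segment sees whole phrases rather than isolated characters, and the buffer is flushed whenever an English token appears (and once more at the end of the line). Below is a self-contained sketch of that logic; segment_cs_line is a hypothetical name, and a simple regex stands in for icefall's is_cjk and tokenize_by_CJK_char.

import re
from typing import List

import pycantonese

_CJK_CHAR = re.compile(r"([\u4e00-\u9fff])")  # stand-in for is_cjk, illustration only


def segment_cs_line(line: str) -> List[str]:
    # Put spaces around each CJK character (as tokenize_by_CJK_char would).
    spaced = " ".join(p.strip() for p in _CJK_CHAR.split(line.strip()) if p.strip())
    segments: List[str] = []
    curr_str = ""  # buffer of consecutive CJK characters
    for token in spaced.split(" "):
        if not token:
            continue
        if not _CJK_CHAR.match(token):  # English token: flush the CJK buffer first
            if curr_str:
                segments.extend(pycantonese.segment(curr_str))
                curr_str = ""
            segments.append(token)
        else:  # CJK character: keep accumulating so segmentation has context
            curr_str += token
    if curr_str:  # flush whatever is left at the end of the line
        segments.extend(pycantonese.segment(curr_str))
    return segments


# e.g. segment_cs_line("我 like 音樂") -> roughly ["我", "like", "音樂"];
# exact word boundaries depend on pycantonese's segmenter.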
@@ -104,6 +110,11 @@ def get_words(lines: List[str]) -> List[str]:
     return list(words)


+def is_cs(line: str) -> bool:
+    english_markers = r"[a-zA-Z]+"
+    return bool(re.search(english_markers, line))
+
+
 if __name__ == "__main__":
     parser = get_parser()
     args = parser.parse_args()
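A quick runnable check of the new helper (the definition is restated verbatim so the snippet stands alone): a line counts as code-switched as soon as it contains a single ASCII letter, which is a coarse but cheap heuristic.

import re


def is_cs(line: str) -> bool:
    english_markers = r"[a-zA-Z]+"
    return bool(re.search(english_markers, line))


assert is_cs("我鍾意python")        # mixed Cantonese/English -> True
assert not is_cs("我鍾意程式設計")  # pure Cantonese -> False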