mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
updated scripts for text norm
This commit is contained in:
parent
09a358a23e
commit
b30a4d6162
@ -55,6 +55,7 @@ def normalize_text(utt: str, language: str) -> str:
|
|||||||
elif language in ["yue", "zh-HK"]:
|
elif language in ["yue", "zh-HK"]:
|
||||||
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
|
# Mozilla Common Voice uses both "yue" and "zh-HK" for Cantonese
|
||||||
# Not sure why they decided to do this...
|
# Not sure why they decided to do this...
|
||||||
|
# Non-en/zh-yue tokens are manually removed here
|
||||||
return (
|
return (
|
||||||
utt.replace(",", "")
|
utt.replace(",", "")
|
||||||
.replace("。", " ")
|
.replace("。", " ")
|
||||||
@ -80,6 +81,7 @@ def normalize_text(utt: str, language: str) -> str:
|
|||||||
.replace("》", "")
|
.replace("》", "")
|
||||||
.replace("…", "")
|
.replace("…", "")
|
||||||
.replace("⋯", "")
|
.replace("⋯", "")
|
||||||
|
.replace("·", "")
|
||||||
.upper()
|
.upper()
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
@ -26,6 +26,7 @@ files in the directory "data/lang_char":
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -33,7 +34,7 @@ import pycantonese
|
|||||||
from preprocess_commonvoice import normalize_text
|
from preprocess_commonvoice import normalize_text
|
||||||
from tqdm.auto import tqdm
|
from tqdm.auto import tqdm
|
||||||
|
|
||||||
from icefall.utils import is_cjk
|
from icefall.utils import is_cjk, tokenize_by_CJK_char
|
||||||
|
|
||||||
|
|
||||||
def get_parser():
|
def get_parser():
|
||||||
@ -73,23 +74,28 @@ def get_word_segments(lines: List[str]) -> List[str]:
|
|||||||
|
|
||||||
for line in tqdm(lines, desc="Segmenting lines"):
|
for line in tqdm(lines, desc="Segmenting lines"):
|
||||||
try:
|
try:
|
||||||
# code switching
|
if is_cs(line): # code switching
|
||||||
if len(line.strip().split(" ")) > 1:
|
|
||||||
segments = []
|
segments = []
|
||||||
for segment in line.strip().split(" "):
|
curr_str = ""
|
||||||
|
for segment in tokenize_by_CJK_char(line).split(" "):
|
||||||
if segment.strip() == "":
|
if segment.strip() == "":
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
if not is_cjk(segment[0]): # en segment
|
if not is_cjk(segment[0]): # en segment
|
||||||
|
if curr_str:
|
||||||
|
segments.extend(pycantonese.segment(curr_str))
|
||||||
|
curr_str = ""
|
||||||
segments.append(segment)
|
segments.append(segment)
|
||||||
else: # zh segment
|
else: # zh segment
|
||||||
segments.extend(pycantonese.segment(segment))
|
curr_str += segment
|
||||||
|
# segments.extend(pycantonese.segment(segment))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Failed to process segment: {segment}")
|
logging.error(f"Failed to process segment: {segment}")
|
||||||
raise e
|
raise e
|
||||||
|
if curr_str: # process the last segment
|
||||||
|
segments.extend(pycantonese.segment(curr_str))
|
||||||
new_lines.append(" ".join(segments) + "\n")
|
new_lines.append(" ".join(segments) + "\n")
|
||||||
# not code switching
|
else: # not code switching
|
||||||
else:
|
|
||||||
new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
|
new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Failed to process line: {line}")
|
logging.error(f"Failed to process line: {line}")
|
||||||
@ -104,6 +110,11 @@ def get_words(lines: List[str]) -> List[str]:
|
|||||||
return list(words)
|
return list(words)
|
||||||
|
|
||||||
|
|
||||||
|
# Report whether `line` contains at least one ASCII letter (a-z or A-Z);
# the segmentation loop in this file treats such lines as code-switching.
def is_cs(line: str) -> bool:
|
||||||
|
# Pattern: a run of one or more ASCII letters, either case.
english_markers = r"[a-zA-Z]+"
|
||||||
|
# re.search scans the whole string; bool() maps match/None to True/False.
return bool(re.search(english_markers, line))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = get_parser()
|
parser = get_parser()
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user