Update preprocess_mdcc.py

This commit is contained in:
jinzr 2024-03-08 19:22:31 +08:00
parent 8b96efe9e5
commit 336944c9b3

View File

@@ -89,9 +89,9 @@ def get_word_segments(lines: List[str]) -> List[str]:
for line in tqdm(lines, desc="Segmenting lines"): for line in tqdm(lines, desc="Segmenting lines"):
try: try:
# code switching # code switching
if len(line.split(" ")) > 1: if len(line.strip().split(" ")) > 1:
segments = [] segments = []
for segment in line.split(" "): for segment in line.strip().split(" "):
try: try:
if not is_cjk(segment[0]): # en segment if not is_cjk(segment[0]): # en segment
segments.append(segment) segments.append(segment)