Update preprocess_mdcc.py

This commit is contained in:
jinzr 2024-03-08 19:20:21 +08:00
parent ba01616103
commit c8dbc01308

View File

@@ -92,16 +92,21 @@ def get_word_segments(lines: List[str]) -> List[str]:
if len(line.split(" ")) > 1: if len(line.split(" ")) > 1:
segments = [] segments = []
for segment in line.split(" "): for segment in line.split(" "):
try:
if not is_cjk(segment[0]): # en segment if not is_cjk(segment[0]): # en segment
segments.append(segment) segments.append(segment)
else: # zh segment else: # zh segment
segments.extend(pycantonese.segment(segment)) segments.extend(pycantonese.segment(segment))
except Exception as e:
logging.error(f"Failed to process segment: {segment}")
raise e
new_lines.append(" ".join(segments) + "\n") new_lines.append(" ".join(segments) + "\n")
# not code switching # not code switching
else: else:
new_lines.append(" ".join(pycantonese.segment(line)) + "\n") new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
except: except:
logging.error(f"Failed to process line: {line}") logging.error(f"Failed to process line: {line}")
raise e
return new_lines return new_lines