Update preprocess_mdcc.py

This commit is contained in:
jinzr 2024-03-08 19:18:19 +08:00
parent 1c509a24f8
commit ba01616103

View File

@ -26,6 +26,7 @@ files in the directory "data/lang_char":
""" """
import argparse import argparse
import logging
from pathlib import Path from pathlib import Path
from typing import List from typing import List
@ -86,18 +87,21 @@ def get_word_segments(lines: List[str]) -> List[str]:
new_lines = [] new_lines = []
for line in tqdm(lines, desc="Segmenting lines"): for line in tqdm(lines, desc="Segmenting lines"):
# code switching try:
if len(line.split(" ")) > 1: # code switching
segments = [] if len(line.split(" ")) > 1:
for segment in line.split(" "): segments = []
if not is_cjk(segment[0]): # en segment for segment in line.split(" "):
segments.append(segment) if not is_cjk(segment[0]): # en segment
else: # zh segment segments.append(segment)
segments.extend(pycantonese.segment(segment)) else: # zh segment
new_lines.append(" ".join(segments) + "\n") segments.extend(pycantonese.segment(segment))
# not code switching new_lines.append(" ".join(segments) + "\n")
else: # not code switching
new_lines.append(" ".join(pycantonese.segment(line)) + "\n") else:
new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
except:
logging.error(f"Failed to process line: {line}")
return new_lines return new_lines