mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
Update preprocess_mdcc.py
This commit is contained in:
parent
ba01616103
commit
c8dbc01308
@ -92,16 +92,21 @@ def get_word_segments(lines: List[str]) -> List[str]:
|
|||||||
if len(line.split(" ")) > 1:
|
if len(line.split(" ")) > 1:
|
||||||
segments = []
|
segments = []
|
||||||
for segment in line.split(" "):
|
for segment in line.split(" "):
|
||||||
if not is_cjk(segment[0]): # en segment
|
try:
|
||||||
segments.append(segment)
|
if not is_cjk(segment[0]): # en segment
|
||||||
else: # zh segment
|
segments.append(segment)
|
||||||
segments.extend(pycantonese.segment(segment))
|
else: # zh segment
|
||||||
|
segments.extend(pycantonese.segment(segment))
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"Failed to process segment: {segment}")
|
||||||
|
raise e
|
||||||
new_lines.append(" ".join(segments) + "\n")
|
new_lines.append(" ".join(segments) + "\n")
|
||||||
# not code switching
|
# not code switching
|
||||||
else:
|
else:
|
||||||
new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
|
new_lines.append(" ".join(pycantonese.segment(line)) + "\n")
|
||||||
except:
|
except:
|
||||||
logging.error(f"Failed to process line: {line}")
|
logging.error(f"Failed to process line: {line}")
|
||||||
|
raise e
|
||||||
return new_lines
|
return new_lines
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user