From c8dbc0130840f85d8c15a18d9540fd998d9d1045 Mon Sep 17 00:00:00 2001 From: jinzr Date: Fri, 8 Mar 2024 19:20:21 +0800 Subject: [PATCH] Update preprocess_mdcc.py --- egs/mdcc/ASR/local/preprocess_mdcc.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/egs/mdcc/ASR/local/preprocess_mdcc.py b/egs/mdcc/ASR/local/preprocess_mdcc.py index bc7b0a4d7..9c2c4f4e8 100755 --- a/egs/mdcc/ASR/local/preprocess_mdcc.py +++ b/egs/mdcc/ASR/local/preprocess_mdcc.py @@ -92,16 +92,21 @@ def get_word_segments(lines: List[str]) -> List[str]: if len(line.split(" ")) > 1: segments = [] for segment in line.split(" "): - if not is_cjk(segment[0]): # en segment - segments.append(segment) - else: # zh segment - segments.extend(pycantonese.segment(segment)) + try: + if not is_cjk(segment[0]): # en segment + segments.append(segment) + else: # zh segment + segments.extend(pycantonese.segment(segment)) + except Exception as e: + logging.error(f"Failed to process segment: {segment}") + raise e new_lines.append(" ".join(segments) + "\n") # not code switching else: new_lines.append(" ".join(pycantonese.segment(line)) + "\n") except: logging.error(f"Failed to process line: {line}") + raise return new_lines