From ba016161036c60dc18d6601431a0b13797f31ec5 Mon Sep 17 00:00:00 2001 From: jinzr Date: Fri, 8 Mar 2024 19:18:19 +0800 Subject: [PATCH] Update preprocess_mdcc.py --- egs/mdcc/ASR/local/preprocess_mdcc.py | 28 +++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/egs/mdcc/ASR/local/preprocess_mdcc.py b/egs/mdcc/ASR/local/preprocess_mdcc.py index 751b0a8b8..bc7b0a4d7 100755 --- a/egs/mdcc/ASR/local/preprocess_mdcc.py +++ b/egs/mdcc/ASR/local/preprocess_mdcc.py @@ -26,6 +26,7 @@ files in the directory "data/lang_char": """ import argparse +import logging from pathlib import Path from typing import List @@ -86,18 +87,21 @@ def get_word_segments(lines: List[str]) -> List[str]: new_lines = [] for line in tqdm(lines, desc="Segmenting lines"): - # code switching - if len(line.split(" ")) > 1: - segments = [] - for segment in line.split(" "): - if not is_cjk(segment[0]): # en segment - segments.append(segment) - else: # zh segment - segments.extend(pycantonese.segment(segment)) - new_lines.append(" ".join(segments) + "\n") - # not code switching - else: - new_lines.append(" ".join(pycantonese.segment(line)) + "\n") + try: + # code switching + if len(line.split(" ")) > 1: + segments = [] + for segment in line.split(" "): + if not is_cjk(segment[0]): # en segment + segments.append(segment) + else: # zh segment + segments.extend(pycantonese.segment(segment)) + new_lines.append(" ".join(segments) + "\n") + # not code switching + else: + new_lines.append(" ".join(pycantonese.segment(line)) + "\n") + except: + logging.error(f"Failed to process line: {line}") return new_lines