From ecffd6ce93d94740a918c37fdd4eaf96a714aa84 Mon Sep 17 00:00:00 2001 From: zr_jin <60612200+JinZr@users.noreply.github.com> Date: Sat, 2 Sep 2023 17:50:09 +0800 Subject: [PATCH] Update preprocess_kespeech.py --- egs/multi_zh-hans/ASR/local/preprocess_kespeech.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py b/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py index c434ead7e..79b3a14dc 100755 --- a/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py +++ b/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py @@ -78,6 +78,8 @@ def preprocess_kespeech(speed_perturb: bool = False): list(manifests.keys()), dataset_parts, ) + logging_threshold = 50 + logging_count = 0 for partition, m in manifests.items(): logging.info(f"Processing {partition}") @@ -97,7 +99,8 @@ def preprocess_kespeech(speed_perturb: bool = False): orig_text = text sup.text = normalize_text(sup.text) text = str(sup.text) - if len(orig_text) != len(text): + if len(orig_text) != len(text) and logging_count < logging_threshold: + logging_count += 1 logging.info( f"\nOriginal text vs normalized text:\n{orig_text}\n{text}" )