Update egs/multi_zh-hans/ASR/local/preprocess_kespeech.py

Co-authored-by: Fangjun Kuang <csukuangfj@gmail.com>
This commit is contained in:
zr_jin 2023-09-07 15:09:02 +08:00 committed by GitHub
parent 465ff40470
commit 5fb9730a61
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -95,14 +95,12 @@ def preprocess_kespeech(speed_perturb: bool = False):
m["supervisions"] = m["supervisions"].filter(has_no_oov) m["supervisions"] = m["supervisions"].filter(has_no_oov)
logging.info(f"Normalizing text in {partition}") logging.info(f"Normalizing text in {partition}")
for sup in m["supervisions"]: for sup in m["supervisions"]:
text = str(sup.text) orig_text = sup.text
orig_text = text
sup.text = normalize_text(sup.text) sup.text = normalize_text(sup.text)
text = str(sup.text) if logging_count < logging_threshold and len(orig_text) != len(sup.text):
if len(orig_text) != len(text) and logging_count < logging_threshold:
logging_count += 1 logging_count += 1
logging.info( logging.info(
f"\nOriginal text vs normalized text:\n{orig_text}\n{text}" f"\nOriginal text vs normalized text:\n{orig_text}\n{sup.text}"
) )
# Create long-recording cut manifests. # Create long-recording cut manifests.