mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-12-11 06:55:27 +00:00
Update preprocess_kespeech.py
This commit is contained in:
parent
40a7c17a3c
commit
ecffd6ce93
@@ -78,6 +78,8 @@ def preprocess_kespeech(speed_perturb: bool = False):
         list(manifests.keys()),
         dataset_parts,
     )
+    logging_threshold = 50
+    logging_count = 0
 
     for partition, m in manifests.items():
         logging.info(f"Processing {partition}")
@@ -97,7 +99,8 @@ def preprocess_kespeech(speed_perturb: bool = False):
             orig_text = text
             sup.text = normalize_text(sup.text)
             text = str(sup.text)
-            if len(orig_text) != len(text):
+            if len(orig_text) != len(text) and logging_count < logging_threshold:
+                logging_count += 1
                 logging.info(
                     f"\nOriginal text vs normalized text:\n{orig_text}\n{text}"
                 )
Loading…
x
Reference in New Issue
Block a user