Merge branch 'dev_multi_zh-hans' of https://github.com/JinZr/icefall into dev_multi_zh-hans

This commit is contained in:
jinzr 2023-09-07 15:12:11 +08:00
commit 14328f0995
2 changed files with 4 additions and 6 deletions

View File

@@ -72,7 +72,7 @@ def get_parser():
"--num-splits", "--num-splits",
type=int, type=int,
required=True, required=True,
help="The number of splits of the L subset", help="The number of splits of the given subset",
) )
parser.add_argument( parser.add_argument(

View File

@@ -95,14 +95,12 @@ def preprocess_kespeech(speed_perturb: bool = False):
m["supervisions"] = m["supervisions"].filter(has_no_oov) m["supervisions"] = m["supervisions"].filter(has_no_oov)
logging.info(f"Normalizing text in {partition}") logging.info(f"Normalizing text in {partition}")
for sup in m["supervisions"]: for sup in m["supervisions"]:
text = str(sup.text) orig_text = sup.text
orig_text = text
sup.text = normalize_text(sup.text) sup.text = normalize_text(sup.text)
text = str(sup.text) if logging_count < logging_threshold and len(orig_text) != len(sup.text):
if len(orig_text) != len(text) and logging_count < logging_threshold:
logging_count += 1 logging_count += 1
logging.info( logging.info(
f"\nOriginal text vs normalized text:\n{orig_text}\n{text}" f"\nOriginal text vs normalized text:\n{orig_text}\n{sup.text}"
) )
# Create long-recording cut manifests. # Create long-recording cut manifests.