This commit is contained in:
JinZr 2023-07-26 13:48:46 +08:00
parent a91c90636b
commit 24296d8d48
2 changed files with 57 additions and 16 deletions

View File

@ -125,6 +125,7 @@ from icefall.checkpoint import (
find_checkpoints, find_checkpoints,
load_checkpoint, load_checkpoint,
) )
from lhotse.cut import Cut
from icefall.lexicon import Lexicon from icefall.lexicon import Lexicon
from icefall.utils import ( from icefall.utils import (
AttributeDict, AttributeDict,
@ -792,11 +793,19 @@ def main():
# test_clean_dl = librispeech.test_dataloaders(test_clean_cuts) # test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
# test_other_dl = librispeech.test_dataloaders(test_other_cuts) # test_other_dl = librispeech.test_dataloaders(test_other_cuts)
def remove_short_utt(c: Cut):
    """Decide whether cut ``c`` is long enough to be kept for decoding.

    Intended for use as a filter predicate over a set of cuts.

    Args:
        c: The cut to inspect; only ``c.num_frames`` and ``c.id`` are read.

    Returns:
        True when the reduced frame count computed below is positive,
        False otherwise. A warning naming the cut is logged whenever
        False is returned.
    """
    # NOTE(review): this arithmetic presumably mirrors the frame reduction
    # performed by the model's front-end subsampling -- confirm against the
    # encoder used by this recipe before changing the constants.
    subsampled_len = ((c.num_frames - 7) // 2 + 1) // 2

    if subsampled_len > 0:
        return True

    # Too short: nothing would be left after the reduction, so drop the cut
    # and leave a trace of which one was excluded.
    logging.warning(
        f"Excluding cut with ID: {c.id} from decoding, num_frames: {c.num_frames}"
    )
    return False
test_sets_cuts = multi_dataset.test_cuts() test_sets_cuts = multi_dataset.test_cuts()
test_sets = test_sets_cuts.keys() test_sets = test_sets_cuts.keys()
test_dl = [ test_dl = [
librispeech.test_dataloaders(test_sets_cuts[cuts_name]) librispeech.test_dataloaders(test_sets_cuts[cuts_name].filter(remove_short_utt))
for cuts_name in test_sets for cuts_name in test_sets
] ]

View File

@ -221,22 +221,31 @@ class MultiDataset:
logging.info("About to get multidataset test cuts") logging.info("About to get multidataset test cuts")
# Aidatatang_200zh # Aidatatang_200zh
logging.info("Loading Aidatatang_200zh TEST set in lazy mode") logging.info("Loading Aidatatang_200zh set in lazy mode")
aidatatang_test_cuts = load_manifest_lazy( aidatatang_test_cuts = load_manifest_lazy(
self.fbank_dir / "aidatatang_cuts_test.jsonl.gz" self.fbank_dir / "aidatatang_cuts_test.jsonl.gz"
) )
aidatatang_dev_cuts = load_manifest_lazy(
self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz"
)
# AISHELL # AISHELL
logging.info("Loading Aishell TEST set in lazy mode") logging.info("Loading Aishell set in lazy mode")
aishell_test_cuts = load_manifest_lazy( aishell_test_cuts = load_manifest_lazy(
self.fbank_dir / "aishell_cuts_test.jsonl.gz" self.fbank_dir / "aishell_cuts_test.jsonl.gz"
) )
aishell_dev_cuts = load_manifest_lazy(
self.fbank_dir / "aishell_cuts_dev.jsonl.gz"
)
# AISHELL-2 # AISHELL-2
logging.info("Loading Aishell-2 TEST set in lazy mode") logging.info("Loading Aishell-2 set in lazy mode")
aishell2_test_cuts = load_manifest_lazy( aishell2_test_cuts = load_manifest_lazy(
self.fbank_dir / "aishell2_cuts_test.jsonl.gz" self.fbank_dir / "aishell2_cuts_test.jsonl.gz"
) )
aishell2_dev_cuts = load_manifest_lazy(
self.fbank_dir / "aishell2_cuts_dev.jsonl.gz"
)
# AISHELL-4 # AISHELL-4
logging.info("Loading Aishell-4 TEST set in lazy mode") logging.info("Loading Aishell-4 TEST set in lazy mode")
@ -245,40 +254,63 @@ class MultiDataset:
) )
# Ali-Meeting # Ali-Meeting
logging.info("Loading Ali-Meeting TEST set in lazy mode") logging.info("Loading Ali-Meeting set in lazy mode")
alimeeting_test_cuts = load_manifest_lazy( alimeeting_test_cuts = load_manifest_lazy(
self.fbank_dir / "alimeeting-far_cuts_test.jsonl.gz" self.fbank_dir / "alimeeting-far_cuts_test.jsonl.gz"
) )
alimeeting_eval_cuts = load_manifest_lazy(
self.fbank_dir / "alimeeting-far_cuts_eval.jsonl.gz"
)
# MagicData # MagicData
logging.info("Loading MagicData TEST set in lazy mode") logging.info("Loading MagicData set in lazy mode")
magicdata_test_cuts = load_manifest_lazy( magicdata_test_cuts = load_manifest_lazy(
self.fbank_dir / "magicdata_cuts_test.jsonl.gz" self.fbank_dir / "magicdata_cuts_test.jsonl.gz"
) )
magicdata_dev_cuts = load_manifest_lazy(
self.fbank_dir / "magicdata_cuts_dev.jsonl.gz"
)
# KeSpeech # KeSpeech
logging.info("Loading KeSpeech TEST set in lazy mode") logging.info("Loading KeSpeech set in lazy mode")
kespeech_test_cuts = load_manifest_lazy( kespeech_test_cuts = load_manifest_lazy(
self.fbank_dir / "kespeech" / "kespeech-asr_cuts_test.jsonl.gz" self.fbank_dir / "kespeech" / "kespeech-asr_cuts_test.jsonl.gz"
) )
kespeech_dev_phase1_cuts = load_manifest_lazy(
self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase1.jsonl.gz"
)
kespeech_dev_phase2_cuts = load_manifest_lazy(
self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase2.jsonl.gz"
)
# WeNetSpeech # WeNetSpeech
logging.info("Loading WeNetSpeech TEST set in lazy mode") logging.info("Loading WeNetSpeech set in lazy mode")
wenetspeech_test_meeting_cuts = load_manifest_lazy( wenetspeech_test_meeting_cuts = load_manifest_lazy(
self.fbank_dir / "wenetspeech" / "cuts_TEST_MEETING.jsonl.gz" self.fbank_dir / "wenetspeech" / "cuts_TEST_MEETING.jsonl.gz"
) )
wenetspeech_test_net_cuts = load_manifest_lazy( wenetspeech_test_net_cuts = load_manifest_lazy(
self.fbank_dir / "wenetspeech" / "cuts_TEST_NET.jsonl.gz" self.fbank_dir / "wenetspeech" / "cuts_TEST_NET.jsonl.gz"
) )
wenetspeech_dev_cuts = load_manifest_lazy(
self.fbank_dir / "wenetspeech" / "cuts_DEV.jsonl.gz"
)
return { return {
"aidatatang": aidatatang_test_cuts, "aidatatang_test": aidatatang_test_cuts,
# "alimeeting": alimeeting_test_cuts, "aidatatang_dev": aidatatang_dev_cuts,
"aishell": aishell_test_cuts, "alimeeting_test": alimeeting_test_cuts,
"aishell-2": aishell2_test_cuts, "alimeeting_eval": alimeeting_eval_cuts,
"aishell_test": aishell_test_cuts,
"aishell_dev": aishell_dev_cuts,
"aishell-2_test": aishell2_test_cuts,
"aishell-2_dev": aishell2_dev_cuts,
"aishell-4": aishell4_test_cuts, "aishell-4": aishell4_test_cuts,
"magicdata": magicdata_test_cuts, "magicdata_test": magicdata_test_cuts,
"kespeech": kespeech_test_cuts, "magicdata_dev": magicdata_dev_cuts,
"wenetspeech-meeting": wenetspeech_test_meeting_cuts, "kespeech-asr_test": kespeech_test_cuts,
"wenetspeech-net": wenetspeech_test_net_cuts, "kespeech-asr_dev_phase1": kespeech_dev_phase1_cuts,
"kespeech-asr_dev_phase2": kespeech_dev_phase2_cuts,
"wenetspeech-meeting_test": wenetspeech_test_meeting_cuts,
"wenetspeech-net_test": wenetspeech_test_net_cuts,
"wenetspeech_dev": wenetspeech_dev_cuts,
} }