# Review notes. These lines sit ahead of the first "diff --git" header and are
# not part of any hunk.
#
# decode.py
#   * Imports `Cut` from `lhotse.cut`.
#   * Adds a nested helper `remove_short_utt(c)` inside `main()`. It computes
#     T = ((c.num_frames - 7) // 2 + 1) // 2, logs a warning naming the cut id
#     and its num_frames when T <= 0, and returns `T > 0`.
#     NOTE(review): T presumably mirrors the frame count left after the
#     model's front-end subsampling -- confirm against the encoder in use.
#   * Every test cut set is now passed through `.filter(remove_short_utt)`
#     before it is handed to `librispeech.test_dataloaders(...)`.
#
# multi_dataset.py (test-cuts method of `MultiDataset`)
#   * Additionally loads dev/eval manifests: aidatatang dev, aishell dev,
#     aishell2 dev, alimeeting-far eval, magicdata dev, kespeech dev_phase1
#     and dev_phase2, wenetspeech DEV.
#   * The returned dict keys gain a split suffix ("_test", "_dev", "_eval",
#     ...); the previously commented-out alimeeting test entry is enabled;
#     the "aishell-4" key is left as it was.
#
# NOTE(review): line breaks and indentation below were reconstructed from a
# whitespace-collapsed copy of this patch. Blank context lines were placed so
# that every hunk matches its "@@" line counts; the position of the blank line
# between `test_sets_cuts = ...` and `test_sets = ...` is an assumption --
# verify against the target file before applying.
diff --git a/egs/multi_zh-hans/ASR/zipformer/decode.py b/egs/multi_zh-hans/ASR/zipformer/decode.py
index df7124c0b..2ab7244ed 100755
--- a/egs/multi_zh-hans/ASR/zipformer/decode.py
+++ b/egs/multi_zh-hans/ASR/zipformer/decode.py
@@ -125,6 +125,7 @@ from icefall.checkpoint import (
     find_checkpoints,
     load_checkpoint,
 )
+from lhotse.cut import Cut
 from icefall.lexicon import Lexicon
 from icefall.utils import (
     AttributeDict,
@@ -792,11 +793,19 @@ def main():
     # test_clean_dl = librispeech.test_dataloaders(test_clean_cuts)
     # test_other_dl = librispeech.test_dataloaders(test_other_cuts)
 
+    def remove_short_utt(c: Cut):
+        T = ((c.num_frames - 7) // 2 + 1) // 2
+        if T <= 0:
+            logging.warning(
+                f"Excluding cut with ID: {c.id} from decoding, num_frames: {c.num_frames}"
+            )
+        return T > 0
+
     test_sets_cuts = multi_dataset.test_cuts()
 
     test_sets = test_sets_cuts.keys()
     test_dl = [
-        librispeech.test_dataloaders(test_sets_cuts[cuts_name])
+        librispeech.test_dataloaders(test_sets_cuts[cuts_name].filter(remove_short_utt))
         for cuts_name in test_sets
     ]
 
diff --git a/egs/multi_zh-hans/ASR/zipformer/multi_dataset.py b/egs/multi_zh-hans/ASR/zipformer/multi_dataset.py
index 09e98ba0c..542e16955 100644
--- a/egs/multi_zh-hans/ASR/zipformer/multi_dataset.py
+++ b/egs/multi_zh-hans/ASR/zipformer/multi_dataset.py
@@ -221,22 +221,31 @@ class MultiDataset:
         logging.info("About to get multidataset test cuts")
 
         # Aidatatang_200zh
-        logging.info("Loading Aidatatang_200zh TEST set in lazy mode")
+        logging.info("Loading Aidatatang_200zh set in lazy mode")
         aidatatang_test_cuts = load_manifest_lazy(
             self.fbank_dir / "aidatatang_cuts_test.jsonl.gz"
         )
+        aidatatang_dev_cuts = load_manifest_lazy(
+            self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz"
+        )
 
         # AISHELL
-        logging.info("Loading Aishell TEST set in lazy mode")
+        logging.info("Loading Aishell set in lazy mode")
         aishell_test_cuts = load_manifest_lazy(
             self.fbank_dir / "aishell_cuts_test.jsonl.gz"
         )
+        aishell_dev_cuts = load_manifest_lazy(
+            self.fbank_dir / "aishell_cuts_dev.jsonl.gz"
+        )
 
         # AISHELL-2
-        logging.info("Loading Aishell-2 TEST set in lazy mode")
+        logging.info("Loading Aishell-2 set in lazy mode")
         aishell2_test_cuts = load_manifest_lazy(
             self.fbank_dir / "aishell2_cuts_test.jsonl.gz"
         )
+        aishell2_dev_cuts = load_manifest_lazy(
+            self.fbank_dir / "aishell2_cuts_dev.jsonl.gz"
+        )
 
         # AISHELL-4
         logging.info("Loading Aishell-4 TEST set in lazy mode")
@@ -245,40 +254,63 @@ class MultiDataset:
         )
 
         # Ali-Meeting
-        logging.info("Loading Ali-Meeting TEST set in lazy mode")
+        logging.info("Loading Ali-Meeting set in lazy mode")
         alimeeting_test_cuts = load_manifest_lazy(
             self.fbank_dir / "alimeeting-far_cuts_test.jsonl.gz"
         )
+        alimeeting_eval_cuts = load_manifest_lazy(
+            self.fbank_dir / "alimeeting-far_cuts_eval.jsonl.gz"
+        )
 
         # MagicData
-        logging.info("Loading MagicData TEST set in lazy mode")
+        logging.info("Loading MagicData set in lazy mode")
         magicdata_test_cuts = load_manifest_lazy(
             self.fbank_dir / "magicdata_cuts_test.jsonl.gz"
         )
+        magicdata_dev_cuts = load_manifest_lazy(
+            self.fbank_dir / "magicdata_cuts_dev.jsonl.gz"
+        )
 
         # KeSpeech
-        logging.info("Loading KeSpeech TEST set in lazy mode")
+        logging.info("Loading KeSpeech set in lazy mode")
         kespeech_test_cuts = load_manifest_lazy(
             self.fbank_dir / "kespeech" / "kespeech-asr_cuts_test.jsonl.gz"
         )
+        kespeech_dev_phase1_cuts = load_manifest_lazy(
+            self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase1.jsonl.gz"
+        )
+        kespeech_dev_phase2_cuts = load_manifest_lazy(
+            self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase2.jsonl.gz"
+        )
 
         # WeNetSpeech
-        logging.info("Loading WeNetSpeech TEST set in lazy mode")
+        logging.info("Loading WeNetSpeech set in lazy mode")
         wenetspeech_test_meeting_cuts = load_manifest_lazy(
             self.fbank_dir / "wenetspeech" / "cuts_TEST_MEETING.jsonl.gz"
         )
         wenetspeech_test_net_cuts = load_manifest_lazy(
             self.fbank_dir / "wenetspeech" / "cuts_TEST_NET.jsonl.gz"
         )
+        wenetspeech_dev_cuts = load_manifest_lazy(
+            self.fbank_dir / "wenetspeech" / "cuts_DEV.jsonl.gz"
+        )
 
         return {
-            "aidatatang": aidatatang_test_cuts,
-            # "alimeeting": alimeeting_test_cuts,
-            "aishell": aishell_test_cuts,
-            "aishell-2": aishell2_test_cuts,
+            "aidatatang_test": aidatatang_test_cuts,
+            "aidatatang_dev": aidatatang_dev_cuts,
+            "alimeeting_test": alimeeting_test_cuts,
+            "alimeeting_eval": alimeeting_eval_cuts,
+            "aishell_test": aishell_test_cuts,
+            "aishell_dev": aishell_dev_cuts,
+            "aishell-2_test": aishell2_test_cuts,
+            "aishell-2_dev": aishell2_dev_cuts,
             "aishell-4": aishell4_test_cuts,
-            "magicdata": magicdata_test_cuts,
-            "kespeech": kespeech_test_cuts,
-            "wenetspeech-meeting": wenetspeech_test_meeting_cuts,
-            "wenetspeech-net": wenetspeech_test_net_cuts,
+            "magicdata_test": magicdata_test_cuts,
+            "magicdata_dev": magicdata_dev_cuts,
+            "kespeech-asr_test": kespeech_test_cuts,
+            "kespeech-asr_dev_phase1": kespeech_dev_phase1_cuts,
+            "kespeech-asr_dev_phase2": kespeech_dev_phase2_cuts,
+            "wenetspeech-meeting_test": wenetspeech_test_meeting_cuts,
+            "wenetspeech-net_test": wenetspeech_test_net_cuts,
+            "wenetspeech_dev": wenetspeech_dev_cuts,
         }