From 49b0a6d95260da9dbb2c2caed55556bc8bfce67d Mon Sep 17 00:00:00 2001 From: JinZr <60612200+JinZr@users.noreply.github.com> Date: Fri, 21 Jul 2023 09:51:24 +0800 Subject: [PATCH] removed redundant wenetspeech M and S sets --- .../ASR/zipformer/multi_dataset.py | 162 ++++++++---------- 1 file changed, 75 insertions(+), 87 deletions(-) diff --git a/egs/multi_zh-hans/ASR/zipformer/multi_dataset.py b/egs/multi_zh-hans/ASR/zipformer/multi_dataset.py index 3fabac6d3..f2cd80393 100644 --- a/egs/multi_zh-hans/ASR/zipformer/multi_dataset.py +++ b/egs/multi_zh-hans/ASR/zipformer/multi_dataset.py @@ -44,8 +44,6 @@ class MultiDataset: - kespeech/kespeech-asr_cuts_train_phase1.jsonl.gz - kespeech/kespeech-asr_cuts_train_phase2.jsonl.gz - wenetspeech/cuts_L.jsonl.gz - - wenetspeech/cuts_M.jsonl.gz - - wenetspeech/cuts_S.jsonl.gz """ self.fbank_dir = Path(fbank_dir) @@ -55,166 +53,156 @@ class MultiDataset: # THCHS-30 logging.info("Loading THCHS-30 in lazy mode") thchs_30_cuts = load_manifest_lazy( - self.fbank_dir / "thchs_30_cuts_train.jsonl.gz" - ) + self.fbank_dir / "thchs_30_cuts_train.jsonl.gz" + ) # AISHELL-1 logging.info("Loading Aishell-1 in lazy mode") aishell_cuts = load_manifest_lazy( - self.fbank_dir / "aishell_cuts_train.jsonl.gz" - ) + self.fbank_dir / "aishell_cuts_train.jsonl.gz" + ) # AISHELL-2 logging.info("Loading Aishell-2 in lazy mode") aishell_2_cuts = load_manifest_lazy( - self.fbank_dir / "aishell2_cuts_train.jsonl.gz" - ) + self.fbank_dir / "aishell2_cuts_train.jsonl.gz" + ) # AISHELL-4 logging.info("Loading Aishell-4 in lazy mode") aishell_4_L_cuts = load_manifest_lazy( - self.fbank_dir / "aishell4_cuts_train_L.jsonl.gz" - ) + self.fbank_dir / "aishell4_cuts_train_L.jsonl.gz" + ) aishell_4_M_cuts = load_manifest_lazy( - self.fbank_dir / "aishell4_cuts_train_M.jsonl.gz" - ) + self.fbank_dir / "aishell4_cuts_train_M.jsonl.gz" + ) aishell_4_S_cuts = load_manifest_lazy( - self.fbank_dir / "aishell4_cuts_train_S.jsonl.gz" - ) + self.fbank_dir / "aishell4_cuts_train_S.jsonl.gz" + ) # ST-CMDS logging.info("Loading ST-CMDS in lazy mode") - stcmds_cuts = load_manifest_lazy( - self.fbank_dir / "stcmds_cuts_train.jsonl.gz" - ) + stcmds_cuts = load_manifest_lazy(self.fbank_dir / "stcmds_cuts_train.jsonl.gz") # Primewords logging.info("Loading Primewords in lazy mode") primewords_cuts = load_manifest_lazy( - self.fbank_dir / "primewords_cuts_train.jsonl.gz" - ) + self.fbank_dir / "primewords_cuts_train.jsonl.gz" + ) # MagicData logging.info("Loading MagicData in lazy mode") magicdata_cuts = load_manifest_lazy( - self.fbank_dir / "magicdata_cuts_train.jsonl.gz" - ) + self.fbank_dir / "magicdata_cuts_train.jsonl.gz" + ) # Aidatatang_200zh logging.info("Loading Aidatatang_200zh in lazy mode") aidatatang_200zh_cuts = load_manifest_lazy( - self.fbank_dir / "aidatatang_cuts_train.jsonl.gz" - ) + self.fbank_dir / "aidatatang_cuts_train.jsonl.gz" + ) # Ali-Meeting logging.info("Loading Ali-Meeting in lazy mode") alimeeting_cuts = load_manifest_lazy( - self.fbank_dir / "alimeeting-far_cuts_train.jsonl.gz" - ) + self.fbank_dir / "alimeeting-far_cuts_train.jsonl.gz" + ) # WeNetSpeech logging.info("Loading WeNetSpeech in lazy mode") wenetspeech_L_cuts = load_manifest_lazy( - self.fbank_dir / "wenetspeech" / "cuts_L.jsonl.gz" - ) - wenetspeech_M_cuts = load_manifest_lazy( - self.fbank_dir / "wenetspeech" / "cuts_M.jsonl.gz" - ) - wenetspeech_S_cuts = load_manifest_lazy( - self.fbank_dir / "wenetspeech" / "cuts_S.jsonl.gz" - ) + self.fbank_dir / "wenetspeech" / "cuts_L.jsonl.gz" + ) # KeSpeech logging.info("Loading KeSpeech in lazy mode") kespeech_1_cuts = load_manifest_lazy( - self.fbank_dir / "kespeech" / "kespeech-asr_cuts_train_phase1.jsonl.gz" - ) + self.fbank_dir / "kespeech" / "kespeech-asr_cuts_train_phase1.jsonl.gz" + ) kespeech_2_cuts = load_manifest_lazy( - self.fbank_dir / "kespeech" / "kespeech-asr_cuts_train_phase2.jsonl.gz" - ) + self.fbank_dir / "kespeech" / "kespeech-asr_cuts_train_phase2.jsonl.gz" + ) return CutSet.mux( - thchs_30_cuts, - aishell_cuts, - aishell_2_cuts, - aishell_4_L_cuts, - aishell_4_M_cuts, - aishell_4_S_cuts, - stcmds_cuts, - primewords_cuts, - magicdata_cuts, - aidatatang_200zh_cuts, - alimeeting_cuts, - wenetspeech_L_cuts, - wenetspeech_M_cuts, - wenetspeech_S_cuts, - kespeech_1_cuts, - kespeech_2_cuts, - weights=[ - len(thchs_30_cuts), - len(aishell_cuts), - len(aishell_2_cuts), - len(aishell_4_L_cuts), - len(aishell_4_M_cuts), - len(aishell_4_S_cuts), - len(stcmds_cuts), - len(primewords_cuts), - len(magicdata_cuts), - len(aidatatang_200zh_cuts), - len(alimeeting_cuts), - len(wenetspeech_L_cuts), - len(wenetspeech_M_cuts), - len(wenetspeech_S_cuts), - len(kespeech_1_cuts), - len(kespeech_2_cuts), - ], - ) + thchs_30_cuts, + aishell_cuts, + aishell_2_cuts, + aishell_4_L_cuts, + aishell_4_M_cuts, + aishell_4_S_cuts, + stcmds_cuts, + primewords_cuts, + magicdata_cuts, + aidatatang_200zh_cuts, + alimeeting_cuts, + wenetspeech_L_cuts, + kespeech_1_cuts, + kespeech_2_cuts, + weights=[ + len(thchs_30_cuts), + len(aishell_cuts), + len(aishell_2_cuts), + len(aishell_4_L_cuts), + len(aishell_4_M_cuts), + len(aishell_4_S_cuts), + len(stcmds_cuts), + len(primewords_cuts), + len(magicdata_cuts), + len(aidatatang_200zh_cuts), + len(alimeeting_cuts), + len(wenetspeech_L_cuts), + len(kespeech_1_cuts), + len(kespeech_2_cuts), + ], + ) def dev_cuts(self) -> CutSet: logging.info("About to get multidataset dev cuts") # Aidatatang_200zh logging.info("Loading Aidatatang_200zh DEV set in lazy mode") - aidatatang_dev_cuts = load_manifest_lazy(self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz") + aidatatang_dev_cuts = load_manifest_lazy( + self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz" + ) # AISHELL logging.info("Loading Aishell DEV set in lazy mode") aishell_dev_cuts = load_manifest_lazy( - self.fbank_dir / "aishell_cuts_dev.jsonl.gz" - ) + self.fbank_dir / "aishell_cuts_dev.jsonl.gz" + ) # AISHELL-2 logging.info("Loading Aishell-2 DEV set in lazy mode") aishell2_dev_cuts = load_manifest_lazy( - self.fbank_dir / "aishell2_cuts_dev.jsonl.gz" - ) + self.fbank_dir / "aishell2_cuts_dev.jsonl.gz" + ) # Ali-Meeting logging.info("Loading Ali-Meeting DEV set in lazy mode") alimeeting_dev_cuts = load_manifest_lazy( - self.fbank_dir / "alimeeting-far_cuts_eval.jsonl.gz" - ) + self.fbank_dir / "alimeeting-far_cuts_eval.jsonl.gz" + ) # MagicData logging.info("Loading MagicData DEV set in lazy mode") magicdata_dev_cuts = load_manifest_lazy( - self.fbank_dir / "magicdata_cuts_dev.jsonl.gz" - ) + self.fbank_dir / "magicdata_cuts_dev.jsonl.gz" + ) # KeSpeech logging.info("Loading KeSpeech DEV set in lazy mode") kespeech_dev_phase1_cuts = load_manifest_lazy( - self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase1.jsonl.gz" - ) + self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase1.jsonl.gz" + ) kespeech_dev_phase2_cuts = load_manifest_lazy( - self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase2.jsonl.gz" - ) + self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase2.jsonl.gz" + ) # WeNetSpeech logging.info("Loading WeNetSpeech DEV set in lazy mode") wenetspeech_dev_cuts = load_manifest_lazy( - self.fbank_dir / "wenetspeech" / "cuts_DEV.jsonl.gz" - ) + self.fbank_dir / "wenetspeech" / "cuts_DEV.jsonl.gz" + ) return wenetspeech_dev_cuts # return [