removed redundant wenetspeech M and S sets

This commit is contained in:
JinZr 2023-07-21 09:51:24 +08:00
parent dfd9bb4dc9
commit 49b0a6d952

View File

@@ -44,8 +44,6 @@ class MultiDataset:
- kespeech/kespeech-asr_cuts_train_phase1.jsonl.gz - kespeech/kespeech-asr_cuts_train_phase1.jsonl.gz
- kespeech/kespeech-asr_cuts_train_phase2.jsonl.gz - kespeech/kespeech-asr_cuts_train_phase2.jsonl.gz
- wenetspeech/cuts_L.jsonl.gz - wenetspeech/cuts_L.jsonl.gz
- wenetspeech/cuts_M.jsonl.gz
- wenetspeech/cuts_S.jsonl.gz
""" """
self.fbank_dir = Path(fbank_dir) self.fbank_dir = Path(fbank_dir)
@@ -84,9 +82,7 @@ class MultiDataset:
# ST-CMDS # ST-CMDS
logging.info("Loading ST-CMDS in lazy mode") logging.info("Loading ST-CMDS in lazy mode")
stcmds_cuts = load_manifest_lazy( stcmds_cuts = load_manifest_lazy(self.fbank_dir / "stcmds_cuts_train.jsonl.gz")
self.fbank_dir / "stcmds_cuts_train.jsonl.gz"
)
# Primewords # Primewords
logging.info("Loading Primewords in lazy mode") logging.info("Loading Primewords in lazy mode")
@@ -117,12 +113,6 @@ class MultiDataset:
wenetspeech_L_cuts = load_manifest_lazy( wenetspeech_L_cuts = load_manifest_lazy(
self.fbank_dir / "wenetspeech" / "cuts_L.jsonl.gz" self.fbank_dir / "wenetspeech" / "cuts_L.jsonl.gz"
) )
wenetspeech_M_cuts = load_manifest_lazy(
self.fbank_dir / "wenetspeech" / "cuts_M.jsonl.gz"
)
wenetspeech_S_cuts = load_manifest_lazy(
self.fbank_dir / "wenetspeech" / "cuts_S.jsonl.gz"
)
# KeSpeech # KeSpeech
logging.info("Loading KeSpeech in lazy mode") logging.info("Loading KeSpeech in lazy mode")
@@ -146,8 +136,6 @@ class MultiDataset:
aidatatang_200zh_cuts, aidatatang_200zh_cuts,
alimeeting_cuts, alimeeting_cuts,
wenetspeech_L_cuts, wenetspeech_L_cuts,
wenetspeech_M_cuts,
wenetspeech_S_cuts,
kespeech_1_cuts, kespeech_1_cuts,
kespeech_2_cuts, kespeech_2_cuts,
weights=[ weights=[
@@ -163,8 +151,6 @@ class MultiDataset:
len(aidatatang_200zh_cuts), len(aidatatang_200zh_cuts),
len(alimeeting_cuts), len(alimeeting_cuts),
len(wenetspeech_L_cuts), len(wenetspeech_L_cuts),
len(wenetspeech_M_cuts),
len(wenetspeech_S_cuts),
len(kespeech_1_cuts), len(kespeech_1_cuts),
len(kespeech_2_cuts), len(kespeech_2_cuts),
], ],
@@ -175,7 +161,9 @@ class MultiDataset:
# Aidatatang_200zh # Aidatatang_200zh
logging.info("Loading Aidatatang_200zh DEV set in lazy mode") logging.info("Loading Aidatatang_200zh DEV set in lazy mode")
aidatatang_dev_cuts = load_manifest_lazy(self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz") aidatatang_dev_cuts = load_manifest_lazy(
self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz"
)
# AISHELL # AISHELL
logging.info("Loading Aishell DEV set in lazy mode") logging.info("Loading Aishell DEV set in lazy mode")