removed redundant wenetspeech M and S sets

This commit is contained in:
JinZr 2023-07-21 09:51:24 +08:00
parent dfd9bb4dc9
commit 49b0a6d952

View File

@ -44,8 +44,6 @@ class MultiDataset:
- kespeech/kespeech-asr_cuts_train_phase1.jsonl.gz - kespeech/kespeech-asr_cuts_train_phase1.jsonl.gz
- kespeech/kespeech-asr_cuts_train_phase2.jsonl.gz - kespeech/kespeech-asr_cuts_train_phase2.jsonl.gz
- wenetspeech/cuts_L.jsonl.gz - wenetspeech/cuts_L.jsonl.gz
- wenetspeech/cuts_M.jsonl.gz
- wenetspeech/cuts_S.jsonl.gz
""" """
self.fbank_dir = Path(fbank_dir) self.fbank_dir = Path(fbank_dir)
@ -55,166 +53,156 @@ class MultiDataset:
# THCHS-30 # THCHS-30
logging.info("Loading THCHS-30 in lazy mode") logging.info("Loading THCHS-30 in lazy mode")
thchs_30_cuts = load_manifest_lazy( thchs_30_cuts = load_manifest_lazy(
self.fbank_dir / "thchs_30_cuts_train.jsonl.gz" self.fbank_dir / "thchs_30_cuts_train.jsonl.gz"
) )
# AISHELL-1 # AISHELL-1
logging.info("Loading Aishell-1 in lazy mode") logging.info("Loading Aishell-1 in lazy mode")
aishell_cuts = load_manifest_lazy( aishell_cuts = load_manifest_lazy(
self.fbank_dir / "aishell_cuts_train.jsonl.gz" self.fbank_dir / "aishell_cuts_train.jsonl.gz"
) )
# AISHELL-2 # AISHELL-2
logging.info("Loading Aishell-2 in lazy mode") logging.info("Loading Aishell-2 in lazy mode")
aishell_2_cuts = load_manifest_lazy( aishell_2_cuts = load_manifest_lazy(
self.fbank_dir / "aishell2_cuts_train.jsonl.gz" self.fbank_dir / "aishell2_cuts_train.jsonl.gz"
) )
# AISHELL-4 # AISHELL-4
logging.info("Loading Aishell-4 in lazy mode") logging.info("Loading Aishell-4 in lazy mode")
aishell_4_L_cuts = load_manifest_lazy( aishell_4_L_cuts = load_manifest_lazy(
self.fbank_dir / "aishell4_cuts_train_L.jsonl.gz" self.fbank_dir / "aishell4_cuts_train_L.jsonl.gz"
) )
aishell_4_M_cuts = load_manifest_lazy( aishell_4_M_cuts = load_manifest_lazy(
self.fbank_dir / "aishell4_cuts_train_M.jsonl.gz" self.fbank_dir / "aishell4_cuts_train_M.jsonl.gz"
) )
aishell_4_S_cuts = load_manifest_lazy( aishell_4_S_cuts = load_manifest_lazy(
self.fbank_dir / "aishell4_cuts_train_S.jsonl.gz" self.fbank_dir / "aishell4_cuts_train_S.jsonl.gz"
) )
# ST-CMDS # ST-CMDS
logging.info("Loading ST-CMDS in lazy mode") logging.info("Loading ST-CMDS in lazy mode")
stcmds_cuts = load_manifest_lazy( stcmds_cuts = load_manifest_lazy(self.fbank_dir / "stcmds_cuts_train.jsonl.gz")
self.fbank_dir / "stcmds_cuts_train.jsonl.gz"
)
# Primewords # Primewords
logging.info("Loading Primewords in lazy mode") logging.info("Loading Primewords in lazy mode")
primewords_cuts = load_manifest_lazy( primewords_cuts = load_manifest_lazy(
self.fbank_dir / "primewords_cuts_train.jsonl.gz" self.fbank_dir / "primewords_cuts_train.jsonl.gz"
) )
# MagicData # MagicData
logging.info("Loading MagicData in lazy mode") logging.info("Loading MagicData in lazy mode")
magicdata_cuts = load_manifest_lazy( magicdata_cuts = load_manifest_lazy(
self.fbank_dir / "magicdata_cuts_train.jsonl.gz" self.fbank_dir / "magicdata_cuts_train.jsonl.gz"
) )
# Aidatatang_200zh # Aidatatang_200zh
logging.info("Loading Aidatatang_200zh in lazy mode") logging.info("Loading Aidatatang_200zh in lazy mode")
aidatatang_200zh_cuts = load_manifest_lazy( aidatatang_200zh_cuts = load_manifest_lazy(
self.fbank_dir / "aidatatang_cuts_train.jsonl.gz" self.fbank_dir / "aidatatang_cuts_train.jsonl.gz"
) )
# Ali-Meeting # Ali-Meeting
logging.info("Loading Ali-Meeting in lazy mode") logging.info("Loading Ali-Meeting in lazy mode")
alimeeting_cuts = load_manifest_lazy( alimeeting_cuts = load_manifest_lazy(
self.fbank_dir / "alimeeting-far_cuts_train.jsonl.gz" self.fbank_dir / "alimeeting-far_cuts_train.jsonl.gz"
) )
# WeNetSpeech # WeNetSpeech
logging.info("Loading WeNetSpeech in lazy mode") logging.info("Loading WeNetSpeech in lazy mode")
wenetspeech_L_cuts = load_manifest_lazy( wenetspeech_L_cuts = load_manifest_lazy(
self.fbank_dir / "wenetspeech" / "cuts_L.jsonl.gz" self.fbank_dir / "wenetspeech" / "cuts_L.jsonl.gz"
) )
wenetspeech_M_cuts = load_manifest_lazy(
self.fbank_dir / "wenetspeech" / "cuts_M.jsonl.gz"
)
wenetspeech_S_cuts = load_manifest_lazy(
self.fbank_dir / "wenetspeech" / "cuts_S.jsonl.gz"
)
# KeSpeech # KeSpeech
logging.info("Loading KeSpeech in lazy mode") logging.info("Loading KeSpeech in lazy mode")
kespeech_1_cuts = load_manifest_lazy( kespeech_1_cuts = load_manifest_lazy(
self.fbank_dir / "kespeech" / "kespeech-asr_cuts_train_phase1.jsonl.gz" self.fbank_dir / "kespeech" / "kespeech-asr_cuts_train_phase1.jsonl.gz"
) )
kespeech_2_cuts = load_manifest_lazy( kespeech_2_cuts = load_manifest_lazy(
self.fbank_dir / "kespeech" / "kespeech-asr_cuts_train_phase2.jsonl.gz" self.fbank_dir / "kespeech" / "kespeech-asr_cuts_train_phase2.jsonl.gz"
) )
return CutSet.mux( return CutSet.mux(
thchs_30_cuts, thchs_30_cuts,
aishell_cuts, aishell_cuts,
aishell_2_cuts, aishell_2_cuts,
aishell_4_L_cuts, aishell_4_L_cuts,
aishell_4_M_cuts, aishell_4_M_cuts,
aishell_4_S_cuts, aishell_4_S_cuts,
stcmds_cuts, stcmds_cuts,
primewords_cuts, primewords_cuts,
magicdata_cuts, magicdata_cuts,
aidatatang_200zh_cuts, aidatatang_200zh_cuts,
alimeeting_cuts, alimeeting_cuts,
wenetspeech_L_cuts, wenetspeech_L_cuts,
wenetspeech_M_cuts, kespeech_1_cuts,
wenetspeech_S_cuts, kespeech_2_cuts,
kespeech_1_cuts, weights=[
kespeech_2_cuts, len(thchs_30_cuts),
weights=[ len(aishell_cuts),
len(thchs_30_cuts), len(aishell_2_cuts),
len(aishell_cuts), len(aishell_4_L_cuts),
len(aishell_2_cuts), len(aishell_4_M_cuts),
len(aishell_4_L_cuts), len(aishell_4_S_cuts),
len(aishell_4_M_cuts), len(stcmds_cuts),
len(aishell_4_S_cuts), len(primewords_cuts),
len(stcmds_cuts), len(magicdata_cuts),
len(primewords_cuts), len(aidatatang_200zh_cuts),
len(magicdata_cuts), len(alimeeting_cuts),
len(aidatatang_200zh_cuts), len(wenetspeech_L_cuts),
len(alimeeting_cuts), len(kespeech_1_cuts),
len(wenetspeech_L_cuts), len(kespeech_2_cuts),
len(wenetspeech_M_cuts), ],
len(wenetspeech_S_cuts), )
len(kespeech_1_cuts),
len(kespeech_2_cuts),
],
)
def dev_cuts(self) -> CutSet: def dev_cuts(self) -> CutSet:
logging.info("About to get multidataset dev cuts") logging.info("About to get multidataset dev cuts")
# Aidatatang_200zh # Aidatatang_200zh
logging.info("Loading Aidatatang_200zh DEV set in lazy mode") logging.info("Loading Aidatatang_200zh DEV set in lazy mode")
aidatatang_dev_cuts = load_manifest_lazy(self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz") aidatatang_dev_cuts = load_manifest_lazy(
self.fbank_dir / "aidatatang_cuts_dev.jsonl.gz"
)
# AISHELL # AISHELL
logging.info("Loading Aishell DEV set in lazy mode") logging.info("Loading Aishell DEV set in lazy mode")
aishell_dev_cuts = load_manifest_lazy( aishell_dev_cuts = load_manifest_lazy(
self.fbank_dir / "aishell_cuts_dev.jsonl.gz" self.fbank_dir / "aishell_cuts_dev.jsonl.gz"
) )
# AISHELL-2 # AISHELL-2
logging.info("Loading Aishell-2 DEV set in lazy mode") logging.info("Loading Aishell-2 DEV set in lazy mode")
aishell2_dev_cuts = load_manifest_lazy( aishell2_dev_cuts = load_manifest_lazy(
self.fbank_dir / "aishell2_cuts_dev.jsonl.gz" self.fbank_dir / "aishell2_cuts_dev.jsonl.gz"
) )
# Ali-Meeting # Ali-Meeting
logging.info("Loading Ali-Meeting DEV set in lazy mode") logging.info("Loading Ali-Meeting DEV set in lazy mode")
alimeeting_dev_cuts = load_manifest_lazy( alimeeting_dev_cuts = load_manifest_lazy(
self.fbank_dir / "alimeeting-far_cuts_eval.jsonl.gz" self.fbank_dir / "alimeeting-far_cuts_eval.jsonl.gz"
) )
# MagicData # MagicData
logging.info("Loading MagicData DEV set in lazy mode") logging.info("Loading MagicData DEV set in lazy mode")
magicdata_dev_cuts = load_manifest_lazy( magicdata_dev_cuts = load_manifest_lazy(
self.fbank_dir / "magicdata_cuts_dev.jsonl.gz" self.fbank_dir / "magicdata_cuts_dev.jsonl.gz"
) )
# KeSpeech # KeSpeech
logging.info("Loading KeSpeech DEV set in lazy mode") logging.info("Loading KeSpeech DEV set in lazy mode")
kespeech_dev_phase1_cuts = load_manifest_lazy( kespeech_dev_phase1_cuts = load_manifest_lazy(
self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase1.jsonl.gz" self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase1.jsonl.gz"
) )
kespeech_dev_phase2_cuts = load_manifest_lazy( kespeech_dev_phase2_cuts = load_manifest_lazy(
self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase2.jsonl.gz" self.fbank_dir / "kespeech" / "kespeech-asr_cuts_dev_phase2.jsonl.gz"
) )
# WeNetSpeech # WeNetSpeech
logging.info("Loading WeNetSpeech DEV set in lazy mode") logging.info("Loading WeNetSpeech DEV set in lazy mode")
wenetspeech_dev_cuts = load_manifest_lazy( wenetspeech_dev_cuts = load_manifest_lazy(
self.fbank_dir / "wenetspeech" / "cuts_DEV.jsonl.gz" self.fbank_dir / "wenetspeech" / "cuts_DEV.jsonl.gz"
) )
return wenetspeech_dev_cuts return wenetspeech_dev_cuts
# return [ # return [