load_manifest_lazy for asr_datamodule.py (#453)

This commit is contained in:
Mingshuang Luo 2022-06-29 17:45:30 +08:00 committed by GitHub
parent 29e407fd04
commit c10aec5656
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -192,13 +192,6 @@ class WenetSpeechAsrDataModule:
             "with training dataset. ",
         )
-        group.add_argument(
-            "--lazy-load",
-            type=str2bool,
-            default=True,
-            help="lazily open CutSets to avoid OOM (for L|XL subset)",
-        )
         group.add_argument(
             "--training-subset",
             type=str,
@@ -420,17 +413,10 @@ class WenetSpeechAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        if self.args.lazy_load:
-            logging.info("use lazy cuts")
-            cuts_train = CutSet.from_jsonl_lazy(
-                self.args.manifest_dir
-                / f"cuts_{self.args.training_subset}.jsonl.gz"
-            )
-        else:
-            cuts_train = CutSet.from_file(
-                self.args.manifest_dir
-                / f"cuts_{self.args.training_subset}.jsonl.gz"
-            )
+        cuts_train = load_manifest_lazy(
+            self.args.manifest_dir
+            / f"cuts_{self.args.training_subset}.jsonl.gz"
+        )
         return cuts_train

     @lru_cache()