From f66b266aa49e6227f62f15f36f8d1db1949e834c Mon Sep 17 00:00:00 2001 From: Yuekai Zhang Date: Tue, 23 Jan 2024 17:40:15 +0800 Subject: [PATCH] fix executor --- .../local/compute_fbank_wenetspeech_splits.py | 69 ++++++++++--------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py index dd95a24d6..e2ae10883 100755 --- a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py +++ b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py @@ -33,7 +33,7 @@ from lhotse import ( set_caching_enabled, ) -from icefall.utils import str2bool +from icefall.utils import str2bool, get_executor # Torch's multithreaded behavior needs to be disabled or # it wastes a lot of CPU and slow things down. # Do this outside of main() in case it needs to take effect @@ -137,43 +137,44 @@ def compute_fbank_wenetspeech_splits(args): set_audio_duration_mismatch_tolerance(0.01) # 10ms tolerance set_caching_enabled(False) - for i in range(start, stop): - idx = f"{i + 1}".zfill(num_digits) - logging.info(f"Processing {idx}/{num_splits}") + with get_executor() as ex: # Initialize the executor only once. + for i in range(start, stop): + idx = f"{i + 1}".zfill(num_digits) + logging.info(f"Processing {idx}/{num_splits}") + + cuts_path = output_dir / f"cuts_{subset}.{idx}.jsonl.gz" + if cuts_path.is_file(): + logging.info(f"{cuts_path} exists - skipping") + continue - cuts_path = output_dir / f"cuts_{subset}.{idx}.jsonl.gz" - if cuts_path.is_file(): - logging.info(f"{cuts_path} exists - skipping") - continue + raw_cuts_path = output_dir / f"cuts_{subset}_raw.{idx}.jsonl.gz" - raw_cuts_path = output_dir / f"cuts_{subset}_raw.{idx}.jsonl.gz" + logging.info(f"Loading {raw_cuts_path}") + cut_set = CutSet.from_file(raw_cuts_path) - logging.info(f"Loading {raw_cuts_path}") - cut_set = CutSet.from_file(raw_cuts_path) + logging.info("Splitting cuts into smaller chunks.") + cut_set = cut_set.trim_to_supervisions( + keep_overlapping=False, min_duration=None + ) - logging.info("Splitting cuts into smaller chunks.") - cut_set = cut_set.trim_to_supervisions( - keep_overlapping=False, min_duration=None - ) - - logging.info("Computing features") - # cut_set = cut_set.compute_and_store_features_batch( - # extractor=extractor, - # storage_path=f"{output_dir}/feats_{subset}_{idx}", - # num_workers=args.num_workers, - # batch_duration=args.batch_duration, - # storage_type=LilcomChunkyWriter, - # overwrite=True, - # ) - cut_set = cut_set.compute_and_store_features( - extractor=extractor, - storage_path=f"{output_dir}/feats_{subset}_{idx}", - num_jobs=args.num_workers, - executor=extractor, - storage_type=LilcomChunkyWriter, - ) - logging.info(f"Saving to {cuts_path}") - cut_set.to_file(cuts_path) + logging.info("Computing features") + # cut_set = cut_set.compute_and_store_features_batch( + # extractor=extractor, + # storage_path=f"{output_dir}/feats_{subset}_{idx}", + # num_workers=args.num_workers, + # batch_duration=args.batch_duration, + # storage_type=LilcomChunkyWriter, + # overwrite=True, + # ) + cut_set = cut_set.compute_and_store_features( + extractor=extractor, + storage_path=f"{output_dir}/feats_{subset}_{idx}", + num_jobs=args.num_workers, + executor=ex, + storage_type=LilcomChunkyWriter, + ) + logging.info(f"Saving to {cuts_path}") + cut_set.to_file(cuts_path) def main():