diff --git a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py index e2ae10883..fdc8e4a06 100755 --- a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py +++ b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py @@ -137,44 +137,44 @@ def compute_fbank_wenetspeech_splits(args): set_audio_duration_mismatch_tolerance(0.01) # 10ms tolerance set_caching_enabled(False) - with get_executor() as ex: # Initialize the executor only once. - for i in range(start, stop): - idx = f"{i + 1}".zfill(num_digits) - logging.info(f"Processing {idx}/{num_splits}") - - cuts_path = output_dir / f"cuts_{subset}.{idx}.jsonl.gz" - if cuts_path.is_file(): - logging.info(f"{cuts_path} exists - skipping") - continue + #with get_executor() as ex: # Initialize the executor only once. + for i in range(start, stop): + idx = f"{i + 1}".zfill(num_digits) + logging.info(f"Processing {idx}/{num_splits}") + + cuts_path = output_dir / f"cuts_{subset}.{idx}.jsonl.gz" + if cuts_path.is_file(): + logging.info(f"{cuts_path} exists - skipping") + continue - raw_cuts_path = output_dir / f"cuts_{subset}_raw.{idx}.jsonl.gz" + raw_cuts_path = output_dir / f"cuts_{subset}_raw.{idx}.jsonl.gz" - logging.info(f"Loading {raw_cuts_path}") - cut_set = CutSet.from_file(raw_cuts_path) + logging.info(f"Loading {raw_cuts_path}") + cut_set = CutSet.from_file(raw_cuts_path) - logging.info("Splitting cuts into smaller chunks.") - cut_set = cut_set.trim_to_supervisions( - keep_overlapping=False, min_duration=None - ) + logging.info("Splitting cuts into smaller chunks.") + cut_set = cut_set.trim_to_supervisions( + keep_overlapping=False, min_duration=None + ) - logging.info("Computing features") - # cut_set = cut_set.compute_and_store_features_batch( - # extractor=extractor, - # storage_path=f"{output_dir}/feats_{subset}_{idx}", - # num_workers=args.num_workers, - # batch_duration=args.batch_duration, - # storage_type=LilcomChunkyWriter, - # overwrite=True, - # ) - cut_set = cut_set.compute_and_store_features( - extractor=extractor, - storage_path=f"{output_dir}/feats_{subset}_{idx}", - num_jobs=args.num_workers, - executor=ex, - storage_type=LilcomChunkyWriter, - ) - logging.info(f"Saving to {cuts_path}") - cut_set.to_file(cuts_path) + logging.info("Computing features") + cut_set = cut_set.compute_and_store_features_batch( + extractor=extractor, + storage_path=f"{output_dir}/feats_{subset}_{idx}", + num_workers=args.num_workers, + batch_duration=args.batch_duration, + storage_type=LilcomChunkyWriter, + overwrite=True, + ) + # cut_set = cut_set.compute_and_store_features( + # extractor=extractor, + # storage_path=f"{output_dir}/feats_{subset}_{idx}", + # num_jobs=args.num_workers, + # executor=ex, + # storage_type=LilcomChunkyWriter, + # ) + logging.info(f"Saving to {cuts_path}") + cut_set.to_file(cuts_path) def main():