Fix executor: create it once with get_executor() and pass it, not the extractor, to compute_and_store_features

2025-12-11 06:55:27 +00:00 · 2024-01-23 17:40:15 +08:00 · 2024-01-23 17:40:15 +08:00 · f66b266aa4
commit f66b266aa4
parent e46e9b77ee
1 changed files with 35 additions and 34 deletions
--- a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py
+++ b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py
@ -33,7 +33,7 @@ from lhotse import (
    set_caching_enabled,
 )
-from icefall.utils import str2bool
+from icefall.utils import str2bool, get_executor
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slows things down.
 # Do this outside of main() in case it needs to take effect
@ -137,43 +137,44 @@ def compute_fbank_wenetspeech_splits(args):
    set_audio_duration_mismatch_tolerance(0.01)  # 10ms tolerance
    set_caching_enabled(False)
-    for i in range(start, stop):
+    with get_executor() as ex:  # Initialize the executor only once.
-        idx = f"{i + 1}".zfill(num_digits)
+        for i in range(start, stop):
-        logging.info(f"Processing {idx}/{num_splits}")
+            idx = f"{i + 1}".zfill(num_digits)
            logging.info(f"Processing {idx}/{num_splits}")
-        cuts_path = output_dir / f"cuts_{subset}.{idx}.jsonl.gz"
+            cuts_path = output_dir / f"cuts_{subset}.{idx}.jsonl.gz"
-        if cuts_path.is_file():
+            if cuts_path.is_file():
-            logging.info(f"{cuts_path} exists - skipping")
+                logging.info(f"{cuts_path} exists - skipping")
-            continue
+                continue
-        raw_cuts_path = output_dir / f"cuts_{subset}_raw.{idx}.jsonl.gz"
+            raw_cuts_path = output_dir / f"cuts_{subset}_raw.{idx}.jsonl.gz"
-        logging.info(f"Loading {raw_cuts_path}")
+            logging.info(f"Loading {raw_cuts_path}")
-        cut_set = CutSet.from_file(raw_cuts_path)
+            cut_set = CutSet.from_file(raw_cuts_path)
-        logging.info("Splitting cuts into smaller chunks.")
+            logging.info("Splitting cuts into smaller chunks.")
-        cut_set = cut_set.trim_to_supervisions(
+            cut_set = cut_set.trim_to_supervisions(
-            keep_overlapping=False, min_duration=None
+                keep_overlapping=False, min_duration=None
-        )
+            )
-        logging.info("Computing features")
+            logging.info("Computing features")
-        # cut_set = cut_set.compute_and_store_features_batch(
+            # cut_set = cut_set.compute_and_store_features_batch(
-        #     extractor=extractor,
+            #     extractor=extractor,
-        #     storage_path=f"{output_dir}/feats_{subset}_{idx}",
+            #     storage_path=f"{output_dir}/feats_{subset}_{idx}",
-        #     num_workers=args.num_workers,
+            #     num_workers=args.num_workers,
-        #     batch_duration=args.batch_duration,
+            #     batch_duration=args.batch_duration,
-        #     storage_type=LilcomChunkyWriter,
+            #     storage_type=LilcomChunkyWriter,
-        #     overwrite=True,
+            #     overwrite=True,
-        # )
+            # )
-        cut_set = cut_set.compute_and_store_features(
+            cut_set = cut_set.compute_and_store_features(
-            extractor=extractor,
+                extractor=extractor,
-            storage_path=f"{output_dir}/feats_{subset}_{idx}",
+                storage_path=f"{output_dir}/feats_{subset}_{idx}",
-            num_jobs=args.num_workers,
+                num_jobs=args.num_workers,
-            executor=extractor,
+                executor=ex,
-            storage_type=LilcomChunkyWriter,
+                storage_type=LilcomChunkyWriter,
-        )
+            )
-        logging.info(f"Saving to {cuts_path}")
+            logging.info(f"Saving to {cuts_path}")
-        cut_set.to_file(cuts_path)
+            cut_set.to_file(cuts_path)
 def main():