From ec699675845455c855fa16b5790f1ff1690c035e Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 29 Jul 2022 11:17:19 +0800 Subject: [PATCH] Set overwrite=True when extracting features in batches. (#487) --- .../ASR/local/compute_fbank_gigaspeech_dev_test.py | 7 ++----- .../ASR/local/compute_fbank_gigaspeech_splits.py | 7 ++----- .../ASR/local/compute_fbank_gigaspeech_dev_test.py | 1 + .../ASR/local/compute_fbank_gigaspeech_splits.py | 1 + egs/spgispeech/ASR/local/compute_fbank_musan.py | 1 + egs/spgispeech/ASR/local/compute_fbank_spgispeech.py | 2 ++ .../ASR/local/compute_fbank_wenetspeech_dev_test.py | 1 + .../ASR/local/compute_fbank_wenetspeech_splits.py | 1 + 8 files changed, 11 insertions(+), 10 deletions(-) diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_dev_test.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_dev_test.py index 9f1039893..8209ee3ec 100755 --- a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_dev_test.py +++ b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_dev_test.py @@ -20,11 +20,7 @@ import logging from pathlib import Path import torch -from lhotse import ( - CutSet, - KaldifeatFbank, - KaldifeatFbankConfig, -) +from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig # Torch's multithreaded behavior needs to be disabled or # it wastes a lot of CPU and slow things down. @@ -69,6 +65,7 @@ def compute_fbank_gigaspeech_dev_test(): storage_path=f"{in_out_dir}/feats_{partition}", num_workers=num_workers, batch_duration=batch_duration, + overwrite=True, ) cut_set = cut_set.trim_to_supervisions( keep_overlapping=False, min_duration=None diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py index 9dd3c046d..6410249db 100755 --- a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py +++ b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py @@ -22,11 +22,7 @@ from datetime import datetime from pathlib import Path import torch -from lhotse import ( - CutSet, - KaldifeatFbank, - KaldifeatFbankConfig, -) +from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig # Torch's multithreaded behavior needs to be disabled or # it wastes a lot of CPU and slow things down. @@ -120,6 +116,7 @@ def compute_fbank_gigaspeech_splits(args): storage_path=f"{output_dir}/feats_XL_{idx}", num_workers=args.num_workers, batch_duration=args.batch_duration, + overwrite=True, ) logging.info("About to split cuts into smaller chunks.") diff --git a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py index 68d93d2c5..c0c7ef8c5 100644 --- a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py +++ b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py @@ -68,6 +68,7 @@ def compute_fbank_gigaspeech_dev_test(): storage_path=f"{in_out_dir}/{prefix}_feats_{partition}", num_workers=num_workers, batch_duration=batch_duration, + overwrite=True, ) cut_set = cut_set.trim_to_supervisions( keep_overlapping=False, min_duration=None diff --git a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py index f826f064e..5587106e5 100644 --- a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py +++ b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py @@ -126,6 +126,7 @@ def compute_fbank_gigaspeech_splits(args): storage_path=f"{output_dir}/{prefix}_feats_XL_{idx}", num_workers=args.num_workers, batch_duration=args.batch_duration, + overwrite=True, ) logging.info("About to split cuts into smaller chunks.") diff --git a/egs/spgispeech/ASR/local/compute_fbank_musan.py b/egs/spgispeech/ASR/local/compute_fbank_musan.py index b88286c41..70372af2b 100755 --- a/egs/spgispeech/ASR/local/compute_fbank_musan.py +++ b/egs/spgispeech/ASR/local/compute_fbank_musan.py @@ -92,6 +92,7 @@ def compute_fbank_musan(): batch_duration=500, num_workers=4, storage_type=LilcomChunkyWriter, + overwrite=True, ) ) diff --git a/egs/spgispeech/ASR/local/compute_fbank_spgispeech.py b/egs/spgispeech/ASR/local/compute_fbank_spgispeech.py index b67754e2a..8116e7605 100755 --- a/egs/spgispeech/ASR/local/compute_fbank_spgispeech.py +++ b/egs/spgispeech/ASR/local/compute_fbank_spgispeech.py @@ -119,6 +119,7 @@ def compute_fbank_spgispeech(args): batch_duration=500, num_workers=4, storage_type=LilcomChunkyWriter, + overwrite=True, ) cs.to_file(cuts_train_idx_path) @@ -138,6 +139,7 @@ def compute_fbank_spgispeech(args): batch_duration=500, num_workers=4, storage_type=LilcomChunkyWriter, + overwrite=True, ) diff --git a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_dev_test.py b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_dev_test.py index c10a84d21..8a9f6ed30 100755 --- a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_dev_test.py +++ b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_dev_test.py @@ -75,6 +75,7 @@ def compute_fbank_wenetspeech_dev_test(): num_workers=num_workers, batch_duration=batch_duration, storage_type=LilcomHdf5Writer, + overwrite=True, ) logging.info(f"Saving to {cuts_path}") diff --git a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py index bf9a03b20..a882b6113 100755 --- a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py +++ b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py @@ -140,6 +140,7 @@ def compute_fbank_wenetspeech_splits(args): num_workers=args.num_workers, batch_duration=args.batch_duration, storage_type=LilcomChunkyWriter, + overwrite=True, ) logging.info(f"Saving to {cuts_path}")