From 42863b1f8eace0c4df59ef5f3b81a80bb909dffe Mon Sep 17 00:00:00 2001 From: marcoyang Date: Tue, 9 Jan 2024 10:14:23 +0800 Subject: [PATCH] change starting index; support different subsets --- .../local/compute_fbank_gigaspeech_splits.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py index 1c71be0f9..176eb8a84 100755 --- a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py +++ b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py @@ -51,6 +51,14 @@ def get_parser(): "Determines batch size dynamically.", ) + parser.add_argument( + "--subset", + type=str, + default="XL", + choices=["XL", "L", "M", "S", "XS"], + help="Which subset to work with", + ) + parser.add_argument( "--num-splits", type=int, @@ -76,7 +84,7 @@ def get_parser(): def compute_fbank_gigaspeech_splits(args): num_splits = args.num_splits - output_dir = "data/fbank/XL_split" + output_dir = f"data/fbank/{args.subset}_split" output_dir = Path(output_dir) assert output_dir.exists(), f"{output_dir} does not exist!" @@ -96,15 +104,15 @@ def compute_fbank_gigaspeech_splits(args): logging.info(f"device: {device}") for i in range(start, stop): - idx = f"{i + 1}".zfill(num_digits) + idx = f"{i}".zfill(num_digits) logging.info(f"Processing {idx}/{num_splits}") - cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz" + cuts_path = output_dir / f"cuts_{args.subset}.{idx}.jsonl.gz" if cuts_path.is_file(): logging.info(f"{cuts_path} exists - skipping") continue - raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz" + raw_cuts_path = output_dir / f"cuts_{args.subset}_raw.{idx}.jsonl.gz" logging.info(f"Loading {raw_cuts_path}") cut_set = CutSet.from_file(raw_cuts_path) @@ -113,7 +121,7 @@ def compute_fbank_gigaspeech_splits(args): cut_set = cut_set.compute_and_store_features_batch( extractor=extractor, - storage_path=f"{output_dir}/feats_XL_{idx}", + storage_path=f"{output_dir}/feats_{args.subset}_{idx}", num_workers=args.num_workers, batch_duration=args.batch_duration, overwrite=True,