Change split file index from `i + 1` to `i`; support different subsets via `--subset` instead of hard-coded XL

This commit is contained in:
marcoyang 2024-01-09 10:14:23 +08:00
parent f42258caf8
commit 42863b1f8e

View File

@ -51,6 +51,14 @@ def get_parser():
"Determines batch size dynamically.",
)
parser.add_argument(
"--subset",
type=str,
default="XL",
choices=["XL", "L", "M", "S", "XS"],
help="Which subset to work with",
)
parser.add_argument(
"--num-splits",
type=int,
@ -76,7 +84,7 @@ def get_parser():
def compute_fbank_gigaspeech_splits(args):
num_splits = args.num_splits
output_dir = "data/fbank/XL_split"
output_dir = f"data/fbank/{args.subset}_split"
output_dir = Path(output_dir)
assert output_dir.exists(), f"{output_dir} does not exist!"
@ -96,15 +104,15 @@ def compute_fbank_gigaspeech_splits(args):
logging.info(f"device: {device}")
for i in range(start, stop):
idx = f"{i + 1}".zfill(num_digits)
idx = f"{i}".zfill(num_digits)
logging.info(f"Processing {idx}/{num_splits}")
cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz"
cuts_path = output_dir / f"cuts_{args.subset}.{idx}.jsonl.gz"
if cuts_path.is_file():
logging.info(f"{cuts_path} exists - skipping")
continue
raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz"
raw_cuts_path = output_dir / f"cuts_{args.subset}_raw.{idx}.jsonl.gz"
logging.info(f"Loading {raw_cuts_path}")
cut_set = CutSet.from_file(raw_cuts_path)
@ -113,7 +121,7 @@ def compute_fbank_gigaspeech_splits(args):
cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor,
storage_path=f"{output_dir}/feats_XL_{idx}",
storage_path=f"{output_dir}/feats_{args.subset}_{idx}",
num_workers=args.num_workers,
batch_duration=args.batch_duration,
overwrite=True,