Merge pull request #2 from csukuangfj/fix-giga

Split manifests into 2000 pieces.
Wang, Guanbo 2021-11-30 00:28:58 -05:00 committed by GitHub
commit b8beb00ecc
2 changed files with 23 additions and 2 deletions

@@ -62,6 +62,20 @@ def get_parser():
         required=True,
         help="The number of splits of the XL subset",
     )
+    parser.add_argument(
+        "--start",
+        type=int,
+        default=0,
+        help="Process pieces starting from this number (inclusive).",
+    )
+    parser.add_argument(
+        "--stop",
+        type=int,
+        default=-1,
+        help="Stop processing pieces at this number (exclusive).",
+    )
     return parser
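The two new flags select a half-open range of manifest pieces; the default --stop of -1 means "run to the last piece". Below is a minimal sketch of how the values resolve (the select_range helper is hypothetical, not part of the recipe):

# Hypothetical helper, not in the recipe: shows how --start/--stop resolve
# to a half-open range of piece indices.
def select_range(start: int, stop: int, num_splits: int) -> range:
    if stop < start:  # covers the default stop=-1: go to the end
        stop = num_splits
    stop = min(stop, num_splits)
    return range(start, stop)

# With 2000 pieces, the defaults cover everything:
assert len(select_range(0, -1, 2000)) == 2000
# A second worker could take the upper half only:
assert select_range(1000, 2000, 2000) == range(1000, 2000)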
@ -73,13 +87,20 @@ def compute_fbank_gigaspeech_splits(args):
     num_digits = len(str(num_splits))
+    start = args.start
+    stop = args.stop
+    if stop < start:
+        stop = num_splits
+    stop = min(stop, num_splits)
     device = torch.device("cpu")
     if torch.cuda.is_available():
         device = torch.device("cuda", 0)
     extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
     logging.info(f"device: {device}")
-    for i in range(num_splits):
+    for i in range(start, stop):
         idx = f"{i + 1}".zfill(num_digits)
         logging.info(f"Processing {idx}/{num_splits}")
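Because the loop now runs over range(start, stop), several jobs can cover the 2000 pieces concurrently as long as their ranges do not overlap. The sketch below is illustrative only (shard_ranges is made up for this example); it reuses the zfill naming from the loop above to show which zero-padded piece ids each job would handle.

# Illustrative only: carve num_splits pieces into contiguous, non-overlapping
# (start, stop) pairs to pass as --start/--stop, one pair per parallel job.
def shard_ranges(num_splits: int, num_jobs: int):
    base, extra = divmod(num_splits, num_jobs)
    start = 0
    for j in range(num_jobs):
        size = base + (1 if j < extra else 0)
        yield start, start + size  # half-open, matching range(start, stop)
        start += size

num_splits = 2000
num_digits = len(str(num_splits))  # 4, so pieces are named 0001 .. 2000
for start, stop in shard_ranges(num_splits, num_jobs=4):
    first = f"{start + 1}".zfill(num_digits)
    last = f"{stop}".zfill(num_digits)
    print(f"--start {start} --stop {stop} -> pieces {first} to {last}")
# e.g. the first job gets pieces 0001 to 0500, the last 1501 to 2000.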

@@ -8,7 +8,7 @@ stop_stage=100
 # Split XL subset to this number of pieces
 # This is to avoid OOM during feature extraction.
-num_splits=1000
+num_splits=2000
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
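Doubling num_splits from 1000 to 2000 roughly halves the number of cuts per piece, which is the point of the change (avoiding OOM during feature extraction). A purely illustrative sketch, with a made-up manifest size:

# Purely illustrative: more splits means smaller pieces, so feature
# extraction holds less data in memory at any one time.
def piece_sizes(num_items: int, num_splits: int):
    base, extra = divmod(num_items, num_splits)
    return [base + (1 if i < extra else 0) for i in range(num_splits)]

n = 1_000_000  # made-up manifest size, only to show the ratio
print(max(piece_sizes(n, 1000)))  # 1000 items per piece
print(max(piece_sizes(n, 2000)))  # 500 items per piece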