mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
Merge pull request #2 from csukuangfj/fix-giga
Split manifests into 2000 pieces.
This commit is contained in:
commit
b8beb00ecc
@ -62,6 +62,20 @@ def get_parser():
|
||||
required=True,
|
||||
help="The number of splits of the XL subset",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--start",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Process pieces starting from this number (inclusive).",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--stop",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="Stop processing pieces until this number (exclusive).",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
@ -73,13 +87,20 @@ def compute_fbank_gigaspeech_splits(args):
|
||||
|
||||
num_digits = len(str(num_splits))
|
||||
|
||||
start = args.start
|
||||
stop = args.stop
|
||||
if stop < start:
|
||||
stop = num_splits
|
||||
|
||||
stop = min(stop, num_splits)
|
||||
|
||||
device = torch.device("cpu")
|
||||
if torch.cuda.is_available():
|
||||
device = torch.device("cuda", 0)
|
||||
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
|
||||
logging.info(f"device: {device}")
|
||||
|
||||
for i in range(num_splits):
|
||||
for i in range(start, stop):
|
||||
idx = f"{i + 1}".zfill(num_digits)
|
||||
logging.info(f"Processing {idx}/{num_splits}")
|
||||
|
||||
|
@ -8,7 +8,7 @@ stop_stage=100
|
||||
|
||||
# Split the XL subset into this number of pieces
|
||||
# This is to avoid OOM during feature extraction.
|
||||
num_splits=1000
|
||||
num_splits=2000
|
||||
|
||||
# We assume dl_dir (download dir) contains the following
|
||||
# directories and files. If not, they will be downloaded
|
||||
|
Loading…
x
Reference in New Issue
Block a user