From 8109c2b913877866cd335e3951a3230d6b3df73e Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Tue, 30 Nov 2021 12:04:15 +0800 Subject: [PATCH] Split manifests into 2000 pieces. --- .../local/compute_fbank_gigaspeech_splits.py | 23 ++++++++++++++++++- egs/gigaspeech/ASR/prepare.sh | 2 +- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py index ff7e9c770..429168e5c 100755 --- a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py +++ b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py @@ -62,6 +62,20 @@ def get_parser(): required=True, help="The number of splits of the XL subset", ) + + parser.add_argument( + "--start", + type=int, + default=0, + help="Process pieces starting from this number (inclusive).", + ) + + parser.add_argument( + "--stop", + type=int, + default=-1, + help="Stop processing pieces until this number (exclusive).", + ) return parser @@ -73,13 +87,20 @@ def compute_fbank_gigaspeech_splits(args): num_digits = len(str(num_splits)) + start = args.start + stop = args.stop + if stop < start: + stop = num_splits + + stop = min(stop, num_splits) + device = torch.device("cpu") if torch.cuda.is_available(): device = torch.device("cuda", 0) extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) logging.info(f"device: {device}") - for i in range(num_splits): + for i in range(start, stop): idx = f"{i + 1}".zfill(num_digits) logging.info(f"Processing {idx}/{num_splits}") diff --git a/egs/gigaspeech/ASR/prepare.sh b/egs/gigaspeech/ASR/prepare.sh index 48a5f880a..b0c027448 100755 --- a/egs/gigaspeech/ASR/prepare.sh +++ b/egs/gigaspeech/ASR/prepare.sh @@ -8,7 +8,7 @@ stop_stage=100 # Split XL subset to this number of pieces # This is to avoid OOM during feature extraction. -num_splits=1000 +num_splits=2000 # We assume dl_dir (download dir) contains the following # directories and files. If not, they will be downloaded