Merge pull request #2 from csukuangfj/fix-giga

Split manifests into 2000 pieces.
Wang, Guanbo 2021-11-30 00:28:58 -05:00 committed by GitHub
commit b8beb00ecc
2 changed files with 23 additions and 2 deletions

@@ -62,6 +62,20 @@ def get_parser():
         required=True,
         help="The number of splits of the XL subset",
     )
+    parser.add_argument(
+        "--start",
+        type=int,
+        default=0,
+        help="Process pieces starting from this number (inclusive).",
+    )
+    parser.add_argument(
+        "--stop",
+        type=int,
+        default=-1,
+        help="Stop processing pieces at this number (exclusive).",
+    )
     return parser
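The two new flags select a half-open range of manifest pieces; the default --stop of -1 means "run to the last piece". Below is a minimal sketch of how the values resolve (the select_range helper is hypothetical, not part of the recipe):

# Hypothetical helper, not in the recipe: shows how --start/--stop resolve
# to a half-open range of piece indices.
def select_range(start: int, stop: int, num_splits: int) -> range:
    if stop < start:  # covers the default stop=-1: go to the end
        stop = num_splits
    stop = min(stop, num_splits)
    return range(start, stop)

# With 2000 pieces, the defaults cover everything:
assert len(select_range(0, -1, 2000)) == 2000
# A second worker could take the upper half only:
assert select_range(1000, 2000, 2000) == range(1000, 2000)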
@ -73,13 +87,20 @@ def compute_fbank_gigaspeech_splits(args):
     num_digits = len(str(num_splits))
+    start = args.start
+    stop = args.stop
+    if stop < start:
+        stop = num_splits
+    stop = min(stop, num_splits)
     device = torch.device("cpu")
     if torch.cuda.is_available():
         device = torch.device("cuda", 0)
     extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
     logging.info(f"device: {device}")
-    for i in range(num_splits):
+    for i in range(start, stop):
         idx = f"{i + 1}".zfill(num_digits)
         logging.info(f"Processing {idx}/{num_splits}")
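Because the loop now runs over range(start, stop), several jobs can cover the 2000 pieces concurrently as long as their ranges do not overlap. The sketch below is illustrative only (shard_ranges is made up for this example); it reuses the zfill naming from the loop above to show which zero-padded piece ids each job would handle.

# Illustrative only: carve num_splits pieces into contiguous, non-overlapping
# (start, stop) pairs to pass as --start/--stop, one pair per parallel job.
def shard_ranges(num_splits: int, num_jobs: int):
    base, extra = divmod(num_splits, num_jobs)
    start = 0
    for j in range(num_jobs):
        size = base + (1 if j < extra else 0)
        yield start, start + size  # half-open, matching range(start, stop)
        start += size

num_splits = 2000
num_digits = len(str(num_splits))  # 4, so pieces are named 0001 .. 2000
for start, stop in shard_ranges(num_splits, num_jobs=4):
    first = f"{start + 1}".zfill(num_digits)
    last = f"{stop}".zfill(num_digits)
    print(f"--start {start} --stop {stop} -> pieces {first} to {last}")
# e.g. the first job gets pieces 0001 to 0500, the last 1501 to 2000.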

@@ -8,7 +8,7 @@ stop_stage=100
 # Split XL subset to this number of pieces
 # This is to avoid OOM during feature extraction.
-num_splits=1000
+num_splits=2000
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded
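Doubling num_splits from 1000 to 2000 roughly halves the number of cuts per piece, which is the point of the change (avoiding OOM during feature extraction). A purely illustrative sketch, with a made-up manifest size:

# Purely illustrative: more splits means smaller pieces, so feature
# extraction holds less data in memory at any one time.
def piece_sizes(num_items: int, num_splits: int):
    base, extra = divmod(num_items, num_splits)
    return [base + (1 if i < extra else 0) for i in range(num_splits)]

n = 1_000_000  # made-up manifest size, only to show the ratio
print(max(piece_sizes(n, 1000)))  # 1000 items per piece
print(max(piece_sizes(n, 2000)))  # 500 items per piece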