From 8109c2b913877866cd335e3951a3230d6b3df73e Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Tue, 30 Nov 2021 12:04:15 +0800
Subject: [PATCH] Split manifests into 2000 pieces.

---
 .../local/compute_fbank_gigaspeech_splits.py  | 23 ++++++++++++++++++-
 egs/gigaspeech/ASR/prepare.sh                 |  2 +-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py
index ff7e9c770..429168e5c 100755
--- a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py
+++ b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py
@@ -62,6 +62,20 @@ def get_parser():
         required=True,
         help="The number of splits of the XL subset",
     )
+
+    parser.add_argument(
+        "--start",
+        type=int,
+        default=0,
+        help="Process pieces starting from this number (inclusive).",
+    )
+
+    parser.add_argument(
+        "--stop",
+        type=int,
+        default=-1,
+        help="Process pieces up to this number (exclusive).",
+    )
     return parser
 
 
@@ -73,13 +87,20 @@ def compute_fbank_gigaspeech_splits(args):
 
     num_digits = len(str(num_splits))
 
+    start = args.start
+    stop = args.stop
+    if stop < start:
+        stop = num_splits
+
+    stop = min(stop, num_splits)
+
     device = torch.device("cpu")
     if torch.cuda.is_available():
         device = torch.device("cuda", 0)
     extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
     logging.info(f"device: {device}")
 
-    for i in range(num_splits):
+    for i in range(start, stop):
         idx = f"{i + 1}".zfill(num_digits)
         logging.info(f"Processing {idx}/{num_splits}")
 
diff --git a/egs/gigaspeech/ASR/prepare.sh b/egs/gigaspeech/ASR/prepare.sh
index 48a5f880a..b0c027448 100755
--- a/egs/gigaspeech/ASR/prepare.sh
+++ b/egs/gigaspeech/ASR/prepare.sh
@@ -8,7 +8,7 @@ stop_stage=100
 
 # Split XL subset to this number of pieces
 # This is to avoid OOM during feature extraction.
-num_splits=1000
+num_splits=2000
 
 # We assume dl_dir (download dir) contains the following
 # directories and files. If not, they will be downloaded