diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py index 6b7fcbaf4..6ce883fab 100755 --- a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py +++ b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py @@ -80,11 +80,11 @@ def get_parser(): def compute_fbank_gigaspeech_splits(args): num_splits = args.num_splits - output_dir = f"data/fbank/XL_split_{num_splits}" + output_dir = f"data/fbank/XL_split" output_dir = Path(output_dir) assert output_dir.exists(), f"{output_dir} does not exist!" - num_digits = len(str(num_splits)) + num_digits = 8 # num_digits is fixed by lhotse split-lazy start = args.start stop = args.stop diff --git a/egs/gigaspeech/ASR/prepare.sh b/egs/gigaspeech/ASR/prepare.sh index fed6f2b15..fd2532741 100755 --- a/egs/gigaspeech/ASR/prepare.sh +++ b/egs/gigaspeech/ASR/prepare.sh @@ -6,9 +6,9 @@ nj=15 stage=0 stop_stage=100 -# Split XL subset to this number of pieces +# Split XL subset to a number of pieces (about 2000) # This is to avoid OOM during feature extraction. -num_splits=2000 +num_per_split=50 # We assume dl_dir (download dir) contains the following # directories and files. If not, they will be downloaded @@ -141,16 +141,17 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then fi if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Split XL subset into ${num_splits} pieces (may take 30 minutes)" - split_dir=data/fbank/XL_split_${num_splits} + log "Stage 5: Split XL subset into pieces (may take 30 minutes)" + split_dir=data/fbank/XL_split if [ ! -f $split_dir/.split_completed ]; then - lhotse split $num_splits ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir + lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir $num_per_split touch $split_dir/.split_completed fi fi if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then log "Stage 6: Compute features for XL" + num_splits=$(find data/fbank/XL_split -name "cuts_XL_raw.*.jsonl.gz" | wc -l) python3 ./local/compute_fbank_gigaspeech_splits.py \ --num-workers 20 \ --batch-duration 600 \ @@ -160,7 +161,7 @@ fi if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then log "Stage 7: Combine features for XL (may take 3 hours)" if [ ! -f data/fbank/cuts_XL.jsonl.gz ]; then - pieces=$(find data/fbank/XL_split_${num_splits} -name "cuts_XL.*.jsonl.gz") + pieces=$(find data/fbank/XL_split -name "cuts_XL.*.jsonl.gz") lhotse combine $pieces data/fbank/cuts_XL.jsonl.gz fi fi