Use split-lazy

This commit is contained in:
Guanbo Wang 2022-04-13 18:08:32 -04:00
parent 6a425ed793
commit e83b703c5e
2 changed files with 9 additions and 8 deletions

View File

@@ -80,11 +80,11 @@ def get_parser():
def compute_fbank_gigaspeech_splits(args):
num_splits = args.num_splits
output_dir = f"data/fbank/XL_split_{num_splits}"
output_dir = f"data/fbank/XL_split"
output_dir = Path(output_dir)
assert output_dir.exists(), f"{output_dir} does not exist!"
num_digits = len(str(num_splits))
num_digits = 8 # num_digits is fixed by lhotse split-lazy
start = args.start
stop = args.stop

View File

@@ -6,9 +6,9 @@ nj=15
stage=0
stop_stage=100
# Split XL subset to this number of pieces
# Split the XL subset into a number of pieces (about 2000)
# This is to avoid OOM during feature extraction.
num_splits=2000
num_per_split=50
# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
@@ -141,16 +141,17 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Split XL subset into ${num_splits} pieces (may take 30 minutes)"
split_dir=data/fbank/XL_split_${num_splits}
log "Stage 5: Split XL subset into pieces (may take 30 minutes)"
split_dir=data/fbank/XL_split
if [ ! -f $split_dir/.split_completed ]; then
lhotse split $num_splits ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir
lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir $num_per_split
touch $split_dir/.split_completed
fi
fi
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Compute features for XL"
num_splits=$(find data/fbank/XL_split -name "cuts_XL_raw.*.jsonl.gz" | wc -l)
python3 ./local/compute_fbank_gigaspeech_splits.py \
--num-workers 20 \
--batch-duration 600 \
@@ -160,7 +161,7 @@ fi
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Stage 7: Combine features for XL (may take 3 hours)"
if [ ! -f data/fbank/cuts_XL.jsonl.gz ]; then
pieces=$(find data/fbank/XL_split_${num_splits} -name "cuts_XL.*.jsonl.gz")
pieces=$(find data/fbank/XL_split -name "cuts_XL.*.jsonl.gz")
lhotse combine $pieces data/fbank/cuts_XL.jsonl.gz
fi
fi