mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-04 22:54:18 +00:00
Use split-lazy
This commit is contained in:
parent
6a425ed793
commit
e83b703c5e
@ -80,11 +80,11 @@ def get_parser():
|
||||
|
||||
def compute_fbank_gigaspeech_splits(args):
|
||||
num_splits = args.num_splits
|
||||
output_dir = f"data/fbank/XL_split_{num_splits}"
|
||||
output_dir = f"data/fbank/XL_split"
|
||||
output_dir = Path(output_dir)
|
||||
assert output_dir.exists(), f"{output_dir} does not exist!"
|
||||
|
||||
num_digits = len(str(num_splits))
|
||||
num_digits = 8 # num_digits is fixed by lhotse split-lazy
|
||||
|
||||
start = args.start
|
||||
stop = args.stop
|
||||
|
@ -6,9 +6,9 @@ nj=15
|
||||
stage=0
|
||||
stop_stage=100
|
||||
|
||||
# Split XL subset to this number of pieces
|
||||
# Split XL subset into a number of pieces (about 2000)
|
||||
# This is to avoid OOM during feature extraction.
|
||||
num_splits=2000
|
||||
num_per_split=50
|
||||
|
||||
# We assume dl_dir (download dir) contains the following
|
||||
# directories and files. If not, they will be downloaded
|
||||
@ -141,16 +141,17 @@ if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||
fi
|
||||
|
||||
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||
log "Stage 5: Split XL subset into ${num_splits} pieces (may take 30 minutes)"
|
||||
split_dir=data/fbank/XL_split_${num_splits}
|
||||
log "Stage 5: Split XL subset into pieces (may take 30 minutes)"
|
||||
split_dir=data/fbank/XL_split
|
||||
if [ ! -f $split_dir/.split_completed ]; then
|
||||
lhotse split $num_splits ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir
|
||||
lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir $num_per_split
|
||||
touch $split_dir/.split_completed
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
||||
log "Stage 6: Compute features for XL"
|
||||
num_splits=$(find data/fbank/XL_split -name "cuts_XL_raw.*.jsonl.gz" | wc -l)
|
||||
python3 ./local/compute_fbank_gigaspeech_splits.py \
|
||||
--num-workers 20 \
|
||||
--batch-duration 600 \
|
||||
@ -160,7 +161,7 @@ fi
|
||||
if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
|
||||
log "Stage 7: Combine features for XL (may take 3 hours)"
|
||||
if [ ! -f data/fbank/cuts_XL.jsonl.gz ]; then
|
||||
pieces=$(find data/fbank/XL_split_${num_splits} -name "cuts_XL.*.jsonl.gz")
|
||||
pieces=$(find data/fbank/XL_split -name "cuts_XL.*.jsonl.gz")
|
||||
lhotse combine $pieces data/fbank/cuts_XL.jsonl.gz
|
||||
fi
|
||||
fi
|
||||
|
Loading…
x
Reference in New Issue
Block a user