mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-09 17:14:20 +00:00
refined support for KeSpeech
This commit is contained in:
parent
d0d7dd3cfc
commit
dd6614f482
@ -31,7 +31,7 @@ torch.set_num_interop_threads(1)
|
||||
|
||||
|
||||
def compute_fbank_kespeech_dev_test():
|
||||
in_out_dir = Path("data/fbank/KeSpeech")
|
||||
in_out_dir = Path("data/fbank/kespeech")
|
||||
# number of workers in dataloader
|
||||
num_workers = 42
|
||||
|
||||
|
@ -93,7 +93,7 @@ def compute_fbank_kespeech_splits(args):
|
||||
subset = args.training_subset
|
||||
subset = str(subset)
|
||||
num_splits = args.num_splits
|
||||
output_dir = f"data/fbank/KeSpeech/{subset}_split_{num_splits}"
|
||||
output_dir = f"data/fbank/kespeech/{subset}_split_{num_splits}"
|
||||
output_dir = Path(output_dir)
|
||||
assert output_dir.exists(), f"{output_dir} does not exist!"
|
||||
|
||||
|
@ -46,8 +46,8 @@ def has_no_oov(
|
||||
|
||||
|
||||
def preprocess_kespeech():
|
||||
src_dir = Path("data/manifests/KeSpeech")
|
||||
output_dir = Path("data/fbank/KeSpeech")
|
||||
src_dir = Path("data/manifests/kespeech")
|
||||
output_dir = Path("data/fbank/kespeech")
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
# Note: By default, we preprocess all sub-parts.
|
||||
|
@ -8,6 +8,7 @@ set -eou pipefail
|
||||
nj=16
|
||||
stage=-1
|
||||
stop_stage=100
|
||||
num_splits=100
|
||||
|
||||
dl_dir=$PWD/download
|
||||
|
||||
@ -248,4 +249,55 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
|
||||
log "Abort! Please run ../../wenetspeech/ASR/prepare.sh --stage 5 --stop-stage 5"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
log "Dataset: KeSpeech"
|
||||
if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
  # Stage 12: build KeSpeech manifests, split the training cuts, and compute
  # fbank features. Each step is guarded by a marker file that is touched
  # after the step succeeds, so re-running the script skips finished work.
  log "Stage 12: Prepare KeSpeech"

  # The corpus must already be present under $dl_dir; stop here instead of
  # letting the later steps fail on a missing directory.
  if [ ! -d "$dl_dir/KeSpeech" ]; then
    log "Abort! Please download KeSpeech first."
    exit 1
  fi

  if [ ! -f data/manifests/.kespeech.done ]; then
    mkdir -p data/manifests
    lhotse prepare kespeech -j $nj "$dl_dir/KeSpeech" data/manifests/kespeech
    touch data/manifests/.kespeech.done
  fi

  if [ ! -f data/fbank/.kespeech.done ]; then
    mkdir -p data/fbank

    log "Preprocess KeSpeech manifest"
    if [ ! -f data/fbank/.kespeech_preprocess_complete ]; then
      python3 ./local/preprocess_kespeech.py
      touch data/fbank/.kespeech_preprocess_complete
    fi

    # Split only when the marker is ABSENT. The fbank step below expects
    # data/fbank/kespeech/<subset>_split_${num_splits} to exist already.
    if [ ! -f data/fbank/.kespeech.train_phase1.split.${num_splits}.done ]; then
      log "Spliting KeSpeech train_phase1"
      lhotse split ${num_splits} \
        data/fbank/kespeech/kespeech-asr_cuts_train_phase1_raw.jsonl.gz \
        data/fbank/kespeech/train_phase1_split_${num_splits}
      touch data/fbank/.kespeech.train_phase1.split.${num_splits}.done
    fi

    if [ ! -f data/fbank/.kespeech.train_phase2.split.${num_splits}.done ]; then
      log "Spliting KeSpeech train_phase2"
      lhotse split ${num_splits} \
        data/fbank/kespeech/kespeech-asr_cuts_train_phase2_raw.jsonl.gz \
        data/fbank/kespeech/train_phase2_split_${num_splits}
      touch data/fbank/.kespeech.train_phase2.split.${num_splits}.done
    fi

    log "Compute KeSpeech fbank for train_phase1"
    ./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --training-subset train_phase1

    log "Compute KeSpeech fbank for train_phase2"
    ./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --training-subset train_phase2

    log "Compute KeSpeech fbank for test/dev"
    ./local/compute_fbank_kespeech_dev_test.py

    # Mark the whole fbank stage as finished only after every step succeeded.
    touch data/fbank/.kespeech.done
  fi
fi
|
Loading…
x
Reference in New Issue
Block a user