From dd6614f48235c2310ff71a0ec35924855eaf12a9 Mon Sep 17 00:00:00 2001
From: zr_jin <60612200+JinZr@users.noreply.github.com>
Date: Wed, 19 Jul 2023 16:50:42 +0800
Subject: [PATCH] refined support for KeSpeech

Use the lowercase `kespeech` directory name under data/manifests and
data/fbank in the KeSpeech preprocess/fbank scripts, and add stage 12
(KeSpeech preparation) to prepare.sh.

Stage 12 exits with status 1 when $dl_dir/KeSpeech is missing, and runs
each `lhotse split` only while its `.done` marker file is absent, so the
split directories exist before compute_fbank_kespeech_splits.py asserts
on them.
---
 .../local/compute_fbank_kespeech_dev_test.py  |  2 +-
 .../local/compute_fbank_kespeech_splits.py    |  2 +-
 .../ASR/local/preprocess_kespeech.py          |  4 +-
 egs/multi_zh-hans/ASR/prepare.sh              | 53 +++++++++++++++++++
 4 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/egs/multi_zh-hans/ASR/local/compute_fbank_kespeech_dev_test.py b/egs/multi_zh-hans/ASR/local/compute_fbank_kespeech_dev_test.py
index 1654a675b..9f043a222 100644
--- a/egs/multi_zh-hans/ASR/local/compute_fbank_kespeech_dev_test.py
+++ b/egs/multi_zh-hans/ASR/local/compute_fbank_kespeech_dev_test.py
@@ -31,7 +31,7 @@ torch.set_num_interop_threads(1)
 
 
 def compute_fbank_kespeech_dev_test():
-    in_out_dir = Path("data/fbank/KeSpeech")
+    in_out_dir = Path("data/fbank/kespeech")
     # number of workers in dataloader
     num_workers = 42
 
diff --git a/egs/multi_zh-hans/ASR/local/compute_fbank_kespeech_splits.py b/egs/multi_zh-hans/ASR/local/compute_fbank_kespeech_splits.py
index 28d0ee173..b8f17f814 100755
--- a/egs/multi_zh-hans/ASR/local/compute_fbank_kespeech_splits.py
+++ b/egs/multi_zh-hans/ASR/local/compute_fbank_kespeech_splits.py
@@ -93,7 +93,7 @@ def compute_fbank_kespeech_splits(args):
     subset = args.training_subset
     subset = str(subset)
     num_splits = args.num_splits
-    output_dir = f"data/fbank/KeSpeech/{subset}_split_{num_splits}"
+    output_dir = f"data/fbank/kespeech/{subset}_split_{num_splits}"
     output_dir = Path(output_dir)
     assert output_dir.exists(), f"{output_dir} does not exist!"
 
diff --git a/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py b/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py
index b4b587b4c..00eed113f 100755
--- a/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py
+++ b/egs/multi_zh-hans/ASR/local/preprocess_kespeech.py
@@ -46,8 +46,8 @@ def has_no_oov(
 
 
 def preprocess_kespeech():
-    src_dir = Path("data/manifests/KeSpeech")
-    output_dir = Path("data/fbank/KeSpeech")
+    src_dir = Path("data/manifests/kespeech")
+    output_dir = Path("data/fbank/kespeech")
     output_dir.mkdir(exist_ok=True)
 
     # Note: By default, we preprocess all sub-parts.
diff --git a/egs/multi_zh-hans/ASR/prepare.sh b/egs/multi_zh-hans/ASR/prepare.sh
index 5d2e16777..26b34ea21 100755
--- a/egs/multi_zh-hans/ASR/prepare.sh
+++ b/egs/multi_zh-hans/ASR/prepare.sh
@@ -8,6 +8,7 @@ set -eou pipefail
 nj=16
 stage=-1
 stop_stage=100
+num_splits=100
 
 dl_dir=$PWD/download
 
@@ -248,4 +249,56 @@ if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then
     log "Abort! Please run ../../wenetspeech/ASR/prepare.sh --stage 5 --stop-stage 5"
     exit 1
   fi
+fi
+
+log "Dataset: KeSpeech"
+if [ $stage -le 12 ] && [ $stop_stage -ge 12 ]; then
+  log "Stage 12: Prepare KeSpeech"
+  if [ ! -d $dl_dir/KeSpeech ]; then
+    log "Abort! Please download KeSpeech first."
+    exit 1
+  fi
+
+  if [ ! -f data/manifests/.kespeech.done ]; then
+    mkdir -p data/manifests
+    lhotse prepare kespeech -j $nj $dl_dir/KeSpeech data/manifests/kespeech
+    touch data/manifests/.kespeech.done
+  fi
+
+  if [ ! -f data/fbank/.kespeech.done ]; then
+    mkdir -p data/fbank
+
+    log "Preprocess KeSpeech manifest"
+    if [ ! -f data/fbank/.kespeech_preprocess_complete ]; then
+      python3 ./local/preprocess_kespeech.py
+      touch data/fbank/.kespeech_preprocess_complete
+    fi
+
+    if [ ! -f data/fbank/.kespeech.train_phase1.split.${num_splits}.done ]; then
+      log "Spliting KeSpeech train_phase1"
+      lhotse split ${num_splits} \
+        data/fbank/kespeech/kespeech-asr_cuts_train_phase1_raw.jsonl.gz \
+        data/fbank/kespeech/train_phase1_split_${num_splits}
+      touch data/fbank/.kespeech.train_phase1.split.${num_splits}.done
+    fi
+
+    if [ ! -f data/fbank/.kespeech.train_phase2.split.${num_splits}.done ]; then
+      log "Spliting KeSpeech train_phase2"
+      lhotse split ${num_splits} \
+        data/fbank/kespeech/kespeech-asr_cuts_train_phase2_raw.jsonl.gz \
+        data/fbank/kespeech/train_phase2_split_${num_splits}
+      touch data/fbank/.kespeech.train_phase2.split.${num_splits}.done
+    fi
+
+    log "Compute KeSpeech fbank for train_phase1"
+    ./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --training-subset train_phase1
+
+    log "Compute KeSpeech fbank for train_phase2"
+    ./local/compute_fbank_kespeech_splits.py --num-splits ${num_splits} --training-subset train_phase2
+
+    log "Compute KeSpeech fbank for test/dev"
+    ./local/compute_fbank_kespeech_dev_test.py
+
+    touch data/fbank/.kespeech.done
+  fi
 fi
\ No newline at end of file