From 2e05663fbbae3cf48ef55c42a973004df0b3ae38 Mon Sep 17 00:00:00 2001
From: Seung Hyun Lee
Date: Tue, 18 Jun 2024 17:54:39 +0900
Subject: [PATCH] Add prepare.sh for KsponSpeech recipe. (#1656)

---
 egs/ksponspeech/ASR/prepare.sh | 162 +++++++++++++++++++++++++++++++++
 1 file changed, 162 insertions(+)
 create mode 100755 egs/ksponspeech/ASR/prepare.sh

diff --git a/egs/ksponspeech/ASR/prepare.sh b/egs/ksponspeech/ASR/prepare.sh
new file mode 100755
index 000000000..2c5cc8b49
--- /dev/null
+++ b/egs/ksponspeech/ASR/prepare.sh
@@ -0,0 +1,162 @@
+#!/usr/bin/env bash
+
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+set -eou pipefail
+
+nj=15
+stage=0
+stop_stage=100
+
+# Note: This script only prepares the minimal requirements needed by
+# transducer training with BPE units.
+#
+# We assume dl_dir (download dir) contains the following
+# directories and files.
+# This script automatically downloads only the musan dataset.
+#
+#  - $dl_dir/KsponSpeech
+#      This script doesn't download the KsponSpeech dataset automatically.
+#      For more details, please visit:
+#      Dataset: https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=123
+#      Paper: https://www.mdpi.com/2076-3417/10/19/6936
+#
+#  - $dl_dir/musan
+#      This directory contains the following directories downloaded from
+#      http://www.openslr.org/17/
+#
+#      - music
+#      - noise
+#      - speech
+
+dl_dir=$PWD/download
+
+# vocab size for sentence piece models.
+# It will generate data/lang_bpe_xxx,
+# data/lang_bpe_yyy if the array contains xxx, yyy
+vocab_sizes=(
+  5000
+)
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+data=$PWD/data
+
+. shared/parse_options.sh || exit 1
+
+mkdir -p $data
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+log "Running prepare.sh"
+
+log "dl_dir: $dl_dir"
+
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Stage 0: Download MUSAN data"
+  # Before you run this script, you must get the KsponSpeech dataset
+  # from https://aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=123
+  # If you have pre-downloaded it to /path/to/KsponSpeech,
+  # you can create a symlink
+  #
+  #   ln -svf /path/to/KsponSpeech $dl_dir/KsponSpeech
+  #
+  # If you have pre-downloaded musan to /path/to/musan,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/musan $dl_dir/musan
+  #
+  if [ ! -d $dl_dir/musan ]; then
+    lhotse download musan $dl_dir
+  fi
+fi
+
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare KsponSpeech manifest"
+  # We assume that you have downloaded the KsponSpeech corpus
+  # to $dl_dir/KsponSpeech
+  mkdir -p $data/manifests
+  if [ ! -e $data/manifests/.ksponspeech.done ]; then
+    lhotse prepare ksponspeech -j $nj $dl_dir/KsponSpeech $data/manifests
+    touch $data/manifests/.ksponspeech.done
+  fi
+fi
+
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Prepare musan manifest"
+  # We assume that you have downloaded the musan corpus
+  # to $dl_dir/musan
+  mkdir -p $data/manifests
+  if [ ! -e $data/manifests/.musan.done ]; then
+    lhotse prepare musan $dl_dir/musan $data/manifests
+    touch $data/manifests/.musan.done
+  fi
+fi
+
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Compute fbank for KsponSpeech"
+  mkdir -p $data/fbank
+  if [ ! -e $data/fbank/.ksponspeech.done ]; then
+    ./local/compute_fbank_ksponspeech.py --data-dir $data
+    touch $data/fbank/.ksponspeech.done
+  fi
+
+  if [ ! -e $data/fbank/.ksponspeech-validated.done ]; then
+    log "Validating data/fbank for KsponSpeech"
+    parts=(
+      train
+      dev
+      eval_clean
+      eval_other
+    )
+    for part in ${parts[@]}; do
+      ./local/validate_manifest.py \
+        $data/fbank/ksponspeech_cuts_${part}.jsonl.gz
+    done
+    touch $data/fbank/.ksponspeech-validated.done
+  fi
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Compute fbank for musan"
+  mkdir -p $data/fbank
+  if [ ! -e $data/fbank/.musan.done ]; then
+    ./local/compute_fbank_musan.py \
+      --src-dir $data/manifests \
+      --output-dir $data/fbank
+    touch $data/fbank/.musan.done
+  fi
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Prepare BPE based lang"
+
+  for vocab_size in ${vocab_sizes[@]}; do
+    lang_dir=$data/lang_bpe_${vocab_size}
+    mkdir -p $lang_dir
+
+    if [ ! -f $lang_dir/transcript_words.txt ]; then
+      log "Generate data for BPE training"
+      files=$(
+        find "$data/fbank" -name "ksponspeech_cuts_*.jsonl.gz"
+      )
+      gunzip -c ${files} | awk -F '"' '{print $30}' > $lang_dir/transcript_words.txt
+    fi
+
+    if [ ! -f $lang_dir/bpe.model ]; then
+      ./local/train_bpe_model.py \
+        --lang-dir $lang_dir \
+        --vocab-size $vocab_size \
+        --transcript $lang_dir/transcript_words.txt
+    fi
+  done
+fi
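
The options defined near the top of the script (nj, stage, stop_stage, together with dl_dir and vocab_sizes) become command-line flags via shared/parse_options.sh, so the recipe can be driven stage by stage. Below is a minimal usage sketch, illustrative rather than part of the patch, assuming the KsponSpeech corpus has already been obtained manually from AI Hub and that the commands are issued from egs/ksponspeech/ASR; the /path/to/KsponSpeech placeholder is hypothetical:

  # Link the manually downloaded corpus into the download dir used by prepare.sh.
  mkdir -p download
  ln -svf /path/to/KsponSpeech download/KsponSpeech

  # Run all stages (0-5) with 15 parallel jobs.
  ./prepare.sh --stage 0 --stop-stage 5 --nj 15

  # Re-run only the fbank computation for KsponSpeech (Stage 3).
  ./prepare.sh --stage 3 --stop-stage 3

Most stages record completion with a hidden .done marker under data/ (e.g. data/manifests/.ksponspeech.done), so a finished stage is skipped on subsequent runs unless its marker file is removed.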