#!/usr/bin/env bash
#
# Data preparation for the LJSpeech ASR recipe.
#
# Stages (select a range with the `stage` / `stop_stage` variables below):
#   0  Check for the LJSpeech corpus; download musan if it is missing
#   1  Resample LJSpeech audio to 16 kHz in place and build the manifests
#   2  Build the musan manifests
#   3  Compute fbank features for LJSpeech and validate the resulting cuts
#   4  Compute fbank features for musan
#
# Every stage drops a hidden ".<name>.done" marker file when it finishes, so
# re-running the script skips work that has already completed.

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
#. ../../../tools/activate_python.sh

set -euo pipefail

nj=15
stage=-1
stop_stage=100

# We assume dl_dir (download dir) contains the following directories.
#
#  - $dl_dir/LJSpeech
#      The LJSpeech corpus (ver 1.1), laid out as described in stage 1:
#      wavs/{train,dev,test}, texts and metadata.csv.
#      It is NOT downloaded automatically by this script.
#
#  - $dl_dir/musan
#      This directory contains the following directories downloaded from
#       http://www.openslr.org/17/
#
#     - music
#     - noise
#     - speech
#      It is downloaded by stage 0 if missing.
dl_dir=/DB/LibriSpeech_tar

# NOTE(review): presumably this lets command-line options override the
# variables defined above -- confirm against shared/parse_options.sh.
. shared/parse_options.sh || exit 1

# vocab size for sentence piece models.
# NOTE(review): no stage in this script reads vocab_sizes; it is kept
# unchanged in case sourced helpers rely on it.
vocab_sizes=(
  5000
  2000
  1000
  500
)

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

# Print a timestamped message tagged with the calling file, line and function.
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Download data"

  # If you have pre-downloaded it to /path/to/LJSpeech,
  # you can create a symlink
  #
  #   ln -sfv /path/to/LJSpeech $dl_dir/LJSpeech
  #
  # The audio directory is "wavs" (see the layout in stage 1).
  # This is only a warning so that the musan download below still runs.
  if [ ! -d "$dl_dir/LJSpeech/wavs" ]; then
    echo "download not supported yet";
  fi

  # If you have pre-downloaded it to /path/to/musan,
  # you can create a symlink
  #
  #   ln -sfv /path/to/musan $dl_dir/
  #
  if [ ! -d "$dl_dir/musan" ]; then
    lhotse download musan "$dl_dir"
  fi
fi

if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare LJSpeech manifest"
  # We assume that you have downloaded the LJSpeech corpus (ver 1.1)
  # to $dl_dir/LJSpeech and arranged it according to
  # data_settings/*_list.txt like below
  # $dl_dir/LJSpeech
  # |-- wavs
  # |    |-- train
  # |    |-- dev
  # |    |-- test
  # |-- texts
  # |-- metadata.csv

  if [ ! -e "$dl_dir/LJSpeech/.LJSpeech.done" ]; then
    for dset in train dev test; do
      wav_dir="$dl_dir/LJSpeech/wavs/$dset"
      log "Resampling LJSpeech $dset set"
      # Fail loudly on a missing split instead of silently resampling nothing.
      if [ ! -d "$wav_dir" ]; then
        log "Missing directory: $wav_dir"
        exit 1
      fi
      # NOTE: files are rewritten in place (16 kHz, signed-integer, volume
      # scaled by 0.9). The marker below is only written after every split
      # has finished, so an interrupted run will process files again.
      for wav_path in "$wav_dir"/*; do
        # An empty directory leaves the glob unexpanded; skip that case.
        [ -e "$wav_path" ] || continue
        wav_name=${wav_path##*/}
        tmp_path="$wav_dir/tmp_$wav_name"
        sox -v 0.9 "$wav_path" -r 16000 -e signed-integer "$tmp_path"
        mv -- "$tmp_path" "$wav_path"
      done
      log "Resampling $dset done"
    done
    python local/prepare_LJSpeech_text.py "$dl_dir/LJSpeech/metadata.csv"
    touch "$dl_dir/LJSpeech/.LJSpeech.done"
  fi

  mkdir -p data/manifests
  if [ ! -e data/manifests/.LJSpeech.done ]; then
    python local/prepare_LJSpeech.py "$dl_dir/LJSpeech"
    touch data/manifests/.LJSpeech.done
  fi
fi

if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
  mkdir -p data/manifests
  if [ ! -e data/manifests/.musan.done ]; then
    lhotse prepare musan "$dl_dir/musan" data/manifests
    touch data/manifests/.musan.done
  fi
fi

# This guard used to repeat the stage-2 bounds, so stage 3 could not be
# selected on its own and always ran together with stage 2.
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Compute fbank for LJSpeech"
  mkdir -p data/fbank
  if [ ! -e data/fbank/.LJSpeech.done ]; then
    ./local/compute_fbank_LJSpeech.py --data-dir "$dl_dir/LJSpeech"
    touch data/fbank/.LJSpeech.done
  fi

  if [ ! -e data/fbank/.LJSpeech-validated.done ]; then
    log "Validating data/fbank for LJSpeech"
    if [ ! -d "$dl_dir/LJSpeech/wavs" ]; then
      log "Missing directory: $dl_dir/LJSpeech/wavs"
      exit 1
    fi
    # One cut manifest is validated per entry found under wavs/.
    for part_path in "$dl_dir"/LJSpeech/wavs/*; do
      [ -e "$part_path" ] || continue
      part=${part_path##*/}
      python3 ./local/validate_manifest.py \
        "data/fbank/LJSpeech_cuts_${part}.jsonl.gz"
    done
    # Record success so validation is not repeated on every run.
    touch data/fbank/.LJSpeech-validated.done
  fi
fi

if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
  mkdir -p data/fbank
  if [ ! -e data/fbank/.musan.done ]; then
    ./local/compute_fbank_musan.py
    touch data/fbank/.musan.done
  fi
fi