#!/usr/bin/env bash

# Data preparation for the LJSpeech recipe.
#
# Stages (select with --stage / --stop-stage, parsed by shared/parse_options.sh):
#   0  download data (only musan is downloaded automatically)
#   1  resample LJSpeech to 16 kHz and prepare its manifests
#   2  prepare musan manifests
#   3  compute and validate fbank features for LJSpeech
#   4  compute fbank features for musan

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

#. ../../../tools/activate_python.sh

set -euo pipefail

nj=15
stage=-1
stop_stage=100

# We assume dl_dir (download dir) contains the following
# directories and files. Only musan is downloaded automatically
# by this script; LJSpeech has to be placed there by hand.
#
#  - $dl_dir/LJSpeech
#      The LJSpeech corpus, laid out as described in Stage 1 below
#      (wavs/{train,dev,test}, texts, metadata.csv).
#
#  - $dl_dir/lm
#      This directory contains the following files downloaded from
#      http://www.openslr.org/resources/11
#
#        - 3-gram.pruned.1e-7.arpa.gz
#        - 3-gram.pruned.1e-7.arpa
#        - 4-gram.arpa.gz
#        - 4-gram.arpa
#        - LJSpeech-vocab.txt
#        - LJSpeech-lexicon.txt
#        - LJSpeech-lm-norm.txt.gz
#
#      NOTE(review): none of the stages below read $dl_dir/lm; this list
#      looks inherited from another recipe - confirm before relying on it.
#
#  - $dl_dir/musan
#      This directory contains the following directories downloaded from
#       http://www.openslr.org/17/
#
#     - music
#     - noise
#     - speech
dl_dir=/DB/LibriSpeech_tar

. shared/parse_options.sh || exit 1

# vocab size for sentence piece models.
# It will generate data/lang_bpe_xxx,
# data/lang_bpe_yyy if the array contains xxx, yyy
vocab_sizes=(
  5000
  2000
  1000
  500
)

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

# Print a timestamped message tagged with the caller's file, line and function.
# Arguments: the message words; they are joined with spaces.
# Outputs:   one line on stdout.
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Download data"

  # If you have pre-downloaded it to /path/to/LJSpeech,
  # you can create a symlink
  #
  #   ln -sfv /path/to/LJSpeech $dl_dir/LJSpeech
  #
  # The corpus directory is "wavs" (as used by Stage 1), not "wav".
  if [ ! -d "$dl_dir/LJSpeech/wavs" ]; then
    echo "download not supported yet"
  fi

  # If you have pre-downloaded it to /path/to/musan,
  # you can create a symlink
  #
  #   ln -sfv /path/to/musan $dl_dir/
  #
  if [ ! -d "$dl_dir/musan" ]; then
    lhotse download musan "$dl_dir"
  fi
fi

if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare LJSpeech manifest"
  # We assume that you have downloaded the LJSpeech corpus (ver 1.1)
  # You need to prepare LJSpeech according to data_settings/*_list.txt like below
  # $dl_dir/LJSpeech
  # |-- wavs
  # |    |-- train
  # |    |-- dev
  # |    |-- test
  # |-- texts
  # |-- metadata.csv
  # to $dl_dir/LJSpeech
  if [ ! -e "$dl_dir/LJSpeech/.LJSpeech.done" ]; then
    # NOTE: the wav files are rewritten in place and the marker is only
    # created after all three sets finish, so an interrupted run will
    # re-process files that were already converted.
    for dset in "train" "dev" "test"; do
      log "Resampling LJSpeech $dset set"
      wav_dir="$dl_dir/LJSpeech/wavs/$dset"
      if [ ! -d "$wav_dir" ]; then
        log "Missing directory: $wav_dir"
        exit 1
      fi
      for wav_path in "$wav_dir"/*; do
        # An unmatched glob stays literal; skip it when the directory is empty.
        [ -e "$wav_path" ] || continue
        wavfile=${wav_path##*/}
        tmp_path="$wav_dir/tmp_$wavfile"
        # Scale volume by 0.9 and convert to 16 kHz signed-integer samples,
        # then replace the original file with the converted one.
        sox -v 0.9 "$wav_path" -r 16000 -e signed-integer "$tmp_path"
        mv -- "$tmp_path" "$wav_path"
      done
      log "Resampling $dset done"
    done
    python local/prepare_LJSpeech_text.py "$dl_dir/LJSpeech/metadata.csv"
    touch "$dl_dir/LJSpeech/.LJSpeech.done"
  fi

  mkdir -p data/manifests
  if [ ! -e data/manifests/.LJSpeech.done ]; then
    python local/prepare_LJSpeech.py "$dl_dir/LJSpeech"
    touch data/manifests/.LJSpeech.done
  fi
fi

if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
  mkdir -p data/manifests
  if [ ! -e data/manifests/.musan.done ]; then
    lhotse prepare musan "$dl_dir/musan" data/manifests
    touch data/manifests/.musan.done
  fi
fi

# This stage was previously guarded by stage number 2, so it could not be
# selected or skipped independently of the musan manifest stage.
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Compute fbank for LJSpeech"
  mkdir -p data/fbank
  if [ ! -e data/fbank/.LJSpeech.done ]; then
    ./local/compute_fbank_LJSpeech.py --data-dir "$dl_dir/LJSpeech"
    touch data/fbank/.LJSpeech.done
  fi

  if [ ! -e data/fbank/.LJSpeech-validated.done ]; then
    log "Validating data/fbank for LJSpeech"
    wavs_root="$dl_dir/LJSpeech/wavs"
    if [ ! -d "$wavs_root" ]; then
      log "Missing directory: $wavs_root"
      exit 1
    fi
    # One cut manifest is validated per entry found under wavs/.
    for part_path in "$wavs_root"/*; do
      [ -e "$part_path" ] || continue
      part=${part_path##*/}
      python3 ./local/validate_manifest.py \
        "data/fbank/LJSpeech_cuts_${part}.jsonl.gz"
    done
    # Record success so validation is not repeated on the next run.
    touch data/fbank/.LJSpeech-validated.done
  fi
fi

if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Compute fbank for musan"
  mkdir -p data/fbank
  if [ ! -e data/fbank/.musan.done ]; then
    ./local/compute_fbank_musan.py
    touch data/fbank/.musan.done
  fi
fi