#!/usr/bin/env bash
#
# Assemble the "data" directory for multi-dataset training by soft-linking
# artifacts that the per-dataset recipes (librispeech, gigaspeech,
# commonvoice, librilight) have already produced. Nothing is computed here:
# every stage checks for a marker/file in the sibling recipe and either
# links the results into ./data or aborts with the command to run first.
#
# Options (parsed by shared/parse_options.sh):
#   --nj N, --stage N, --stop-stage N, --dl-dir DIR

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -euo pipefail

nj=16
stage=-1
stop_stage=100

dl_dir=$PWD/download

. shared/parse_options.sh || exit 1

# vocab size for sentence piece models.
# It will generate data/lang_bpe_xxx,
# data/lang_bpe_yyy if the array contains xxx, yyy
vocab_sizes=(
  # 5000
  # 2000
  # 1000
  500
)

# multidataset list.
# LibriSpeech and musan are required.
# The others are optional.
# NOTE: bash array elements are separated by whitespace only; do not add
# commas, they would become part of the dataset name.
multidataset=(
  "gigaspeech"
  "commonvoice"
  "librilight"
)

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

# Return 0 if dataset name $1 is listed in multidataset, 1 otherwise.
# Uses an exact comparison per element instead of a substring match on the
# flattened array, so "giga" or "gigaspeech2" do not match "gigaspeech".
has_dataset() {
  local wanted=$1
  local name
  # The ${arr[@]+...} guard keeps an empty list from tripping `set -u`
  # on bash versions older than 4.4.
  for name in ${multidataset[@]+"${multidataset[@]}"}; do
    if [[ "$name" == "$wanted" ]]; then
      return 0
    fi
  done
  return 1
}

# Soft-link entries into the current directory.
# Arguments: $1 - source directory (relative to the current directory)
#            $2.. - entry names inside the source directory
# Each target is resolved with realpath so the link stays valid regardless
# of where it is later accessed from; quoting keeps paths with spaces intact.
link_here() {
  local src_dir=$1
  shift
  local name
  for name in "$@"; do
    ln -svf "$(realpath "${src_dir}/${name}")" .
  done
}

log "dl_dir: $dl_dir"

log "Dataset: musan"
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Soft link fbank of musan"
  mkdir -p data/fbank
  if [ -e ../../librispeech/ASR/data/fbank/.musan.done ]; then
    cd data/fbank
    link_here ../../../../librispeech/ASR/data/fbank \
      musan_feats \
      musan_cuts.jsonl.gz
    cd ../..
  else
    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 4 --stop-stage 4"
    exit 1
  fi
fi

log "Dataset: LibriSpeech"
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Soft link fbank of librispeech"
  mkdir -p data/fbank
  if [ -e ../../librispeech/ASR/data/fbank/.librispeech.done ]; then
    cd data/fbank
    link_here ../../../../librispeech/ASR/data/fbank \
      librispeech_cuts_train-all-shuf.jsonl.gz \
      librispeech_cuts_dev-clean.jsonl.gz \
      librispeech_cuts_dev-other.jsonl.gz \
      librispeech_cuts_test-clean.jsonl.gz \
      librispeech_cuts_test-other.jsonl.gz \
      librispeech_feats_train-clean-100 \
      librispeech_feats_train-clean-360 \
      librispeech_feats_train-other-500 \
      librispeech_feats_dev-clean \
      librispeech_feats_dev-other \
      librispeech_feats_test-clean \
      librispeech_feats_test-other
    cd ../..
  else
    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 3 --stop-stage 3"
    exit 1
  fi
fi

if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Soft link phone based lang"
  if [ -e ../../librispeech/ASR/data/lang_phone/L_disambig.pt ]; then
    cd data
    link_here ../../../librispeech/ASR/data lang_phone
    cd ..
  else
    log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 5 --stop-stage 5"
    exit 1
  fi
fi

if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Soft link BPE based lang"
  cd data
  for vocab_size in "${vocab_sizes[@]}"; do
    if [ -e "../../../librispeech/ASR/data/lang_bpe_${vocab_size}/L_disambig.pt" ]; then
      link_here ../../../librispeech/ASR/data "lang_bpe_${vocab_size}"
    else
      log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 6 --stop-stage 6"
      exit 1
    fi
  done
  cd ..
fi

if [ "$stage" -le 5 ] && [ "$stop_stage" -ge 5 ]; then
  log "Stage 5: Soft link G"
  mkdir -p data/lm
  cd data/lm
  # Both the 3-gram and the 4-gram G are required; they are checked in order.
  for g_file in G_3_gram.fst.txt G_4_gram.fst.txt; do
    if [ -f "../../../../librispeech/ASR/data/lm/${g_file}" ]; then
      link_here ../../../../librispeech/ASR/data/lm "${g_file}"
    else
      log "Abort! Please run ../../librispeech/ASR/prepare.sh --stage 8 --stop-stage 8"
      exit 1
    fi
  done
  cd ../..
fi

if [ "$stage" -le 6 ] && [ "$stop_stage" -ge 6 ]; then
  log "Stage 6: Prepare the other datasets"
  # Stage 6 may be run on its own, so make sure the link target exists.
  mkdir -p data/fbank

  # GigaSpeech
  if has_dataset "gigaspeech" && [ ! -f data/fbank/.gigaspeech.done ]; then
    log "Dataset: GigaSpeech"
    cd data/fbank
    if [ -f ../../../../gigaspeech/ASR/data/fbank/XL_split/.split_completed ]; then
      link_here ../../../../gigaspeech/ASR/data/fbank XL_split
    else
      log "Abort! Please run ../../gigaspeech/ASR/prepare.sh --stage 5 --stop-stage 6"
      exit 1
    fi
    # Marker that lets a rerun skip this dataset.
    touch .gigaspeech.done
    cd ../..
  fi

  # CommonVoice
  if has_dataset "commonvoice" && [ ! -f data/fbank/.commonvoice.done ]; then
    log "Dataset: CommonVoice"
    cd data/fbank
    if [ -f ../../../../commonvoice/ASR/data/en/fbank/.cv-en_train.done ]; then
      link_here ../../../../commonvoice/ASR/data/en/fbank \
        cv-en_train_split_1000 \
        cv-en_cuts_train.jsonl.gz
    else
      log "Abort! Please run ../../commonvoice/ASR/prepare.sh --stage 5 --stop-stage 6"
      exit 1
    fi
    touch .commonvoice.done
    cd ../..
  fi

  # LibriLight
  if has_dataset "librilight" && [ ! -f data/fbank/.librilight.done ]; then
    log "Dataset: LibriLight"
    cd data/fbank
    if [ -f ../../../../librilight/ASR/data/fbank/.librilight_train.done ]; then
      link_here ../../../../librilight/ASR/data/fbank librilight_train_split
    else
      log "Abort! Please run ../../librilight/ASR/prepare.sh --stage 5 --stop-stage 6"
      exit 1
    fi
    touch .librilight.done
    cd ../..
  fi
fi