#!/usr/bin/env bash
#
# Compute narrowband fbank features for SwitchBoard, eval2000 and musan.
#
# The work is organised in numbered stages; only stages within
# [stage, stop_stage] are executed. Each stage drops a ".<name>.done" marker
# file under data/fbank_nb/ and is skipped on later runs while that marker
# exists.
#
# The variables defined before "shared/parse_options.sh" is sourced are the
# script's tunables.
# NOTE(review): parse_options.sh is not visible here; presumably it lets each
# of them be overridden as --<name> <value> on the command line - confirm.

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -euo pipefail

nj=15
stage=-1
stop_stage=100

# Number of SwitchBoard splits that are processed in parallel in stage 1.
# It selects both the number of background jobs and the name of the split
# directory (data/fbank_nb/swbd_split<num_splits>). A value exported in the
# environment is honoured; otherwise it defaults to 16, the number of jobs
# this script has always launched.
# NOTE(review): ./local/compute_fbank_swbd_nb.py must agree on the number of
# splits and on the directory it writes to - confirm before changing this.
num_splits=${num_splits:-16}

# We assume dl_dir (download dir) contains the following
# directories and files. Most of them can't be downloaded automatically
# as they are not publicly available and require a license purchased
# from the LDC.
#
#  - $dl_dir/musan
#      This directory contains the following directories downloaded from
#       http://www.openslr.org/17/
#
#     - music
#     - noise
#     - speech
dl_dir=./download

# swbd1_dir="/export/corpora3/LDC/LDC97S62"
swbd1_dir=./download/LDC97S62/

# eval2000_dir contains the following files and directories
# downloaded from LDC website:
#  - LDC2002S09
#      - hub5e_00
#  - LDC2002T43
#      - reference
eval2000_dir="/export/corpora2/LDC/eval2000"

rt03_dir="/export/corpora/LDC/LDC2007S10"
fisher_dir="/export/corpora3/LDC/LDC2004T19"

. shared/parse_options.sh || exit 1

# vocab size for sentence piece models.
# It will generate data/lang_bpe_xxx,
# data/lang_bpe_yyy if the array contains xxx, yyy
vocab_sizes=(
  # 5000
  # 2000
  1000
  500
)

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

# Print a timestamped message to stdout, tagged with the file name, line
# number and function name of the caller.
# Arguments: the message words (joined with spaces).
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "swbd1_dir: $swbd1_dir"
log "eval2000_dir: $eval2000_dir"
log "rt03_dir: $rt03_dir"

if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1 I: Compute narrowband fbank for SwitchBoard"
  if [ ! -e data/fbank_nb/.swbd.done ]; then
    split_dir=data/fbank_nb/swbd_split${num_splits}
    mkdir -p "$split_dir"

    # Launch one feature-extraction job per split and remember every PID:
    # a bare "wait" always returns 0, so failures would go unnoticed.
    pids=()
    for index in $(seq 1 "$num_splits"); do
      ./local/compute_fbank_swbd_nb.py --split-index "${index}" &
      pids+=("$!")
    done

    # Reap all jobs before deciding, so no job is left running in the
    # background when the script aborts.
    failed=0
    for pid in "${pids[@]}"; do
      wait "$pid" || failed=1
    done
    if [ "$failed" -ne 0 ]; then
      log "At least one compute_fbank_swbd_nb.py job failed"
      exit 1
    fi

    # Collect the per-split manifests into an array so that paths are passed
    # to lhotse as separate, intact arguments.
    pieces=()
    while IFS= read -r piece; do
      pieces+=("$piece")
    done < <(find "$split_dir" -name "swbd_cuts_all.*.jsonl.gz")
    if [ "${#pieces[@]}" -eq 0 ]; then
      log "No swbd_cuts_all.*.jsonl.gz files found in $split_dir"
      exit 1
    fi

    lhotse combine "${pieces[@]}" data/fbank_nb/swbd_cuts_all.jsonl.gz
    touch data/fbank_nb/.swbd.done
  fi
fi

if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1 II: Compute narrowband fbank for eval2000"
  if [ ! -e data/fbank_nb/.eval2000.done ]; then
    mkdir -p data/fbank_nb/eval2000/
    ./local/compute_fbank_eval2000_nb.py
    touch data/fbank_nb/.eval2000.done
  fi
fi

if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Compute narrowband fbank for musan"
  mkdir -p data/fbank_nb/
  if [ ! -e data/fbank_nb/.musan.done ]; then
    ./local/compute_fbank_musan_nb.py
    touch data/fbank_nb/.musan.done
  fi
fi