#!/usr/bin/env bash

# Fix the segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
export PYTHONPATH=/star-data/xiaoyu/icefall_libriheavy:$PYTHONPATH

set -eou pipefail

nj=15
stage=-1
stop_stage=100
start=0
stop=-1
num_per_split=2000

. shared/parse_options.sh || exit 1

# Vocabulary sizes for the sentencepiece models.
# If the array contains xxx, yyy, then data/lang_bpe_xxx
# and data/lang_bpe_yyy will be generated.
vocab_sizes=(
  1000
)

mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

manifest_dir=data/manifests
fbank_dir=data/fbank
mkdir -p $manifest_dir

subset="large"

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Split libri-heavy ${subset}"
  if [ $subset == "large" ]; then
    num_per_split=8000
    log "Change num_per_split to ${num_per_split} for the large subset"
  fi
  split_dir=$fbank_dir/libriheavy_${subset}_split
  mkdir -p $split_dir
  if [ ! -e $split_dir/.split_completed ]; then
    lhotse split-lazy $manifest_dir/librilight_cuts_${subset}_raw.jsonl.gz $split_dir $num_per_split
    touch $split_dir/.split_completed
  fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Compute fbank for Libri-heavy ${subset}"
  mkdir -p $fbank_dir
  num_splits=$(find $fbank_dir/libriheavy_${subset}_split -name "librilight_cuts_${subset}_raw.*.jsonl.gz" | wc -l)
  if [ ! -e $fbank_dir/.libriheavy.${subset}.done ]; then
    # Launch 8 background jobs, each handling a consecutive range of 200 splits.
    for i in $(seq 0 1 7); do
      start=$(( i * 200 ))
      end=$(( (i+1) * 200 ))
      ./local/compute_fbank_libriheavy.py \
        --dataset ${subset} \
        --fbank-dir $fbank_dir \
        --num-splits $num_splits \
        --num-workers $nj \
        --start $start \
        --stop $end &
    done
    wait
    touch $fbank_dir/.libriheavy.${subset}.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Combine features for ${subset}"
  if [ ! -f $fbank_dir/librilight_cuts_${subset}.jsonl.gz ]; then
    pieces=$(find $fbank_dir/libriheavy_${subset}_split -name "librilight_cuts_${subset}.*.jsonl.gz")
    lhotse combine $pieces $fbank_dir/librilight_cuts_${subset}.jsonl.gz
  fi
fi
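# Optional sanity check (not part of the original recipe): lhotse manifests
# are JSONL, one cut per line, so the cut count of the combined manifest can
# be compared against the sum of the split pieces, e.g.:
#
#   gunzip -c $fbank_dir/librilight_cuts_${subset}.jsonl.gz | wc -l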
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Prepare BPE model"
  tmp_dir=data/tmp
  mkdir -p $tmp_dir

  if [ ! -f $tmp_dir/transcript_words.txt ]; then
    for part in "small" "medium" "large"; do
      # jq prints each string wrapped in double quotes; strip them.
      gunzip -c $manifest_dir/librilight_cuts_${part}_raw.jsonl.gz \
        | jq '.supervisions[].custom.texts[]' \
        | sed 's/^" *//' \
        | sed 's/"$//' > $tmp_dir/transcript_words_${part}.txt
    done
    cat $tmp_dir/transcript_words_small.txt \
      $tmp_dir/transcript_words_medium.txt \
      $tmp_dir/transcript_words_large.txt > $tmp_dir/transcript_words.txt
  fi

  if [ ! -f $tmp_dir/words.txt ]; then
    cat $tmp_dir/transcript_words.txt | sed 's/ /\n/g' \
      | sort -u | sed '/^$/d' > $tmp_dir/words.txt
    (echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>'; ) |
      cat - $tmp_dir/words.txt | sort | uniq | awk '
      BEGIN {
        print "<eps> 0";
      }
      {
        if ($1 == "<s>") {
          print "<s> is in the vocabulary!" | "cat 1>&2"
          exit 1;
        }
        if ($1 == "</s>") {
          print "</s> is in the vocabulary!" | "cat 1>&2"
          exit 1;
        }
        printf("%s %d\n", $1, NR);
      }
      END {
        printf("#0 %d\n", NR+1);
        printf("<s> %d\n", NR+2);
        printf("</s> %d\n", NR+3);
      }' > $tmp_dir/words || exit 1;
    mv $tmp_dir/words $tmp_dir/words.txt
  fi

  for vocab_size in ${vocab_sizes[@]}; do
    lang_dir=data/lang_bpe_${vocab_size}
    mkdir -p $lang_dir

    cp $tmp_dir/words.txt $lang_dir/words.txt

    pushd $lang_dir
    # The link target is resolved relative to $lang_dir, hence ../tmp
    # rather than ../$tmp_dir (which would resolve to data/data/tmp).
    ln -s ../tmp/transcript_words.txt transcript_words.txt
    popd

    if [ ! -f $lang_dir/bpe.model ]; then
      ./local/train_bpe_model.py \
        --lang-dir $lang_dir \
        --vocab-size $vocab_size \
        --transcript $tmp_dir/transcript_words_medium.txt
    fi

    if [ ! -f $lang_dir/tokens.txt ]; then
      ./local/bpe2tokens.py ${lang_dir}/bpe.model > ${lang_dir}/tokens.txt
    fi
  done
fi
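# Example invocations (the script name prepare.sh is assumed here;
# shared/parse_options.sh is assumed to follow the usual Kaldi convention
# of mapping --foo-bar VALUE to the shell variable $foo_bar):
#
#   ./prepare.sh --stage 1 --stop-stage 1          # split the raw manifest only
#   ./prepare.sh --stage 2 --stop-stage 3 --nj 20  # fbank + combine, 20 workers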