#!/usr/bin/env bash

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# NOTE: this points at a developer-specific icefall checkout; adjust it for
# your own environment.
export PYTHONPATH=/star-data/xiaoyu/icefall_libriheavy:$PYTHONPATH

set -eou pipefail

nj=15
stage=-1
stop_stage=100
start=0
stop=-1
num_per_split=2000
split_per_job=20
char_coverage=0.99

. shared/parse_options.sh || exit 1
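
# Any of the variables above can be overridden on the command line via
# parse_options.sh, e.g. (hypothetical invocation; adjust the script name):
#
#   ./prepare.sh --stage 1 --stop-stage 3 --nj 30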

# Vocab sizes for the SentencePiece models. For each size xxx in the array,
# it will generate data/lang_bpe_xxx_fallback_coverage_${char_coverage}.
vocab_sizes=(
  750
)

mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

manifest_dir=data/manifests
fbank_dir=data/fbank

mkdir -p $manifest_dir

subset="medium"
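
# Stage 1 splits the raw cut manifest lazily into chunks of $num_per_split
# cuts each, so that feature extraction can later run over the chunks in
# parallel.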
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Split libri-heavy ${subset}"

  if [ $subset == "large" ]; then
    num_per_split=8000
    log "Change num_per_split to ${num_per_split} for large"
  fi

  split_dir=$fbank_dir/libriheavy_${subset}_split
  mkdir -p $split_dir
  if [ ! -e $split_dir/.split_completed ]; then
    lhotse split-lazy $manifest_dir/libriheavy_cuts_${subset}_raw.jsonl.gz $split_dir $num_per_split
    touch $split_dir/.split_completed
  fi
fi
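
# Stage 2 extracts fbank features for the splits produced above, spreading
# them over 8 background jobs of $split_per_job splits each, with $nj
# dataloader workers per job.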
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Compute fbank for Libri-heavy ${subset}"
  mkdir -p $fbank_dir
  num_splits=$(find $fbank_dir/libriheavy_${subset}_split -name "libriheavy_cuts_${subset}_raw.*.jsonl.gz" | wc -l)
  if [ $subset == "large" ]; then
    split_per_job=210
    log "Change split_per_job to ${split_per_job} for large"
  elif [ $subset == "medium" ]; then
    split_per_job=100
    log "Change split_per_job to ${split_per_job} for medium"
  fi
  if [ ! -e $fbank_dir/.libriheavy.${subset}.done ]; then
    for i in $(seq 0 1 7); do
      start=$(( i * $split_per_job ))
      end=$(( (i+1) * $split_per_job ))
      ./local/compute_fbank_libriheavy.py \
        --dataset ${subset} \
        --fbank-dir $fbank_dir \
        --num-splits $num_splits \
        --num-workers $nj \
        --start $start \
        --stop $end &
    done
    wait
    touch $fbank_dir/.libriheavy.${subset}.done
  fi
fi
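
# Stage 3 merges the per-split cut manifests (which now reference the
# extracted features) back into a single manifest.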
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Combine features for ${subset}"
  if [ ! -f $fbank_dir/libriheavy_cuts_${subset}.jsonl.gz ]; then
    pieces=$(find $fbank_dir/libriheavy_${subset}_split -name "libriheavy_cuts_${subset}.*.jsonl.gz")
    lhotse combine $pieces $fbank_dir/libriheavy_cuts_${subset}.jsonl.gz
  fi
fi
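
# Stage 4 derives the validation & test sets from the medium subset; the
# filtered manifest is written alongside the combined one.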
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Prepare the validation & test sets"

  ./local/prepare_validation_sets.py \
    --in-manifest $fbank_dir/libriheavy_cuts_medium.jsonl.gz \
    --out-manifest $fbank_dir/libriheavy_cuts_medium_filtered.jsonl.gz
fi
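
# Stage 5 dumps all transcripts to plain text, builds a word table, and
# trains one SentencePiece BPE model (with byte fallback) per vocab size.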
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Prepare BPE model"

  tmp_dir=data/tmp
  mkdir -p $tmp_dir
  if [ ! -f $tmp_dir/transcript_words.txt ]; then
    for part in "small" "medium" "large"; do
      # jq prints each supervision text as a quoted JSON string;
      # the two seds strip the surrounding quotes.
      gunzip -c $manifest_dir/libriheavy_cuts_${part}_raw.jsonl.gz \
        | jq '.supervisions[].custom.texts[]' \
        | sed 's/^"//' | sed 's/"$//' > $tmp_dir/transcript_words_${part}.txt
    done
    cat $tmp_dir/transcript_words_small.txt $tmp_dir/transcript_words_medium.txt \
      $tmp_dir/transcript_words_large.txt > $tmp_dir/transcript_words.txt
  fi
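
  # Build words.txt: a word -> integer-id table in the usual icefall/Kaldi
  # style, with <eps> at id 0 and #0, <s>, </s> appended after the real words.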
  if [ ! -f $tmp_dir/words.txt ]; then
    cat $tmp_dir/transcript_words.txt | sed 's/ /\n/g' \
      | sort -u | sed '/^$/d' > $tmp_dir/words.txt
    (echo '!SIL'; echo '<SPOKEN_NOISE>'; echo '<UNK>';) |
      cat - $tmp_dir/words.txt | sort | uniq | awk '
      BEGIN {
        print "<eps> 0";
      }
      {
        if ($1 == "<s>") {
          print "<s> is in the vocabulary!" | "cat 1>&2"
          exit 1;
        }
        if ($1 == "</s>") {
          print "</s> is in the vocabulary!" | "cat 1>&2"
          exit 1;
        }
        printf("%s %d\n", $1, NR);
      }
      END {
        printf("#0 %d\n", NR+1);
        printf("<s> %d\n", NR+2);
        printf("</s> %d\n", NR+3);
      }' > $tmp_dir/words || exit 1;
    mv $tmp_dir/words $tmp_dir/words.txt
  fi
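
  # Train one SentencePiece model per requested vocab size. With byte
  # fallback enabled, characters not covered by --character-coverage are
  # decomposed into byte pieces instead of being mapped to <unk>.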
  for vocab_size in "${vocab_sizes[@]}"; do
    lang_dir=data/lang_bpe_${vocab_size}_fallback_coverage_${char_coverage}
    mkdir -p $lang_dir
    cp $tmp_dir/words.txt $lang_dir/words.txt
    pushd $lang_dir
    # The link target is resolved relative to $lang_dir, which lives two
    # levels below the script directory, hence the ../../ prefix; -f keeps
    # reruns from failing on an existing link.
    ln -sf ../../$tmp_dir/transcript_words.txt transcript_words.txt
    popd

    if [ ! -f $lang_dir/bpe.model ]; then
      ./local/train_bpe_model.py \
        --lang-dir $lang_dir \
        --vocab-size $vocab_size \
        --byte-fallback True \
        --character-coverage $char_coverage \
        --transcript $tmp_dir/transcript_words_medium.txt
    fi

    if [ ! -f $lang_dir/tokens.txt ]; then
      ./local/bpe2tokens.py ${lang_dir}/bpe.model > ${lang_dir}/tokens.txt
    fi
  done
fi