mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
234 lines
6.6 KiB
Bash
Executable File
234 lines
6.6 KiB
Bash
Executable File
#!/usr/bin/env bash

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# Abort on the first failing command (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (-o pipefail).
set -euo pipefail

# Number of parallel jobs handed to `lhotse prepare` via -j.
nj=15
# Only the stages whose number lies within [stage, stop_stage] are executed.
stage=-1
stop_stage=100
|
|
|
|
# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
#  - $dl_dir/LibriSpeech
#      You can find BOOKS.TXT, test-clean, train-clean-360, etc, inside it.
#      You can download them from https://www.openslr.org/12
#
#  - $dl_dir/lm
#      This directory contains the following files downloaded from
#      http://www.openslr.org/resources/11
#
#        - 3-gram.pruned.1e-7.arpa.gz
#        - 3-gram.pruned.1e-7.arpa
#        - 4-gram.arpa.gz
#        - 4-gram.arpa
#        - librispeech-vocab.txt
#        - librispeech-lexicon.txt
#        - librispeech-lm-norm.txt.gz
#
|
|
# Token passed as --otc-token to the lexicon / lang preparation scripts.
otc_token="<star>"
# Feature type computed in stage 2: ssl or fbank.
feature_type="ssl"

dl_dir=$PWD/download
manifests_dir="data/manifests"
# Intentionally empty here: the default is derived from feature_type further
# down, after the options have been parsed.
feature_dir=""
lang_dir="data/lang"
lm_dir="data/lm"

# Forwarded as --perturb-speed to the fbank feature computation in stage 2.
perturb_speed=false

. ./cmd.sh
. shared/parse_options.sh || exit 1

# NOTE(review): parse_options.sh is not visible here; presumably it lets the
# variables above be overridden from the command line (stage 2 refers to a
# --feature-type option). Deriving the default only now keeps feature_dir in
# sync with an overridden feature_type, while an explicitly supplied
# feature_dir is left untouched.
if [ -z "${feature_dir}" ]; then
  feature_dir="data/${feature_type}"
fi
|
|
|
|
# vocab size for sentence piece models.
# It will generate data/lang_bpe_xxx,
# data/lang_bpe_yyy if the array contains xxx, yyy
vocab_sizes=(
  200
)

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
|
|
|
|
#######################################
# Print a timestamped message tagged with the location of the caller.
# Arguments: $* - message text (echo -e interprets backslash escapes in it)
# Outputs:   "<date> <time> (<file>:<line>:<caller>) <message>" on stdout
#######################################
log() {
  # This function is from espnet
  # Base name of the file the caller lives in (directory part stripped).
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
|
|
|
|
log "dl_dir: ${dl_dir}"

# Stage -1: fetch the language-model files into ${dl_dir}/lm.
if [ "${stage}" -le -1 ] && [ "${stop_stage}" -ge -1 ]; then
  log "Stage -1: Download LM"
  mkdir -p "${dl_dir}/lm"
  # The .done marker records a finished download so that reruns skip it.
  if [ ! -e "${dl_dir}/lm/.done" ]; then
    ./local/download_lm.py --out-dir="${dl_dir}/lm"
    touch "${dl_dir}/lm/.done"
  fi
fi
|
|
|
|
# Stage 0: download the LibriSpeech corpus unless it is already present.
if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  log "Stage 0: Download data"

  # If you have pre-downloaded it to /path/to/LibriSpeech,
  # you can create a symlink
  #
  #   ln -sfv /path/to/LibriSpeech $dl_dir/LibriSpeech
  #
  # The presence of train-clean-100 is used as the "already downloaded" check.
  if [ ! -d "${dl_dir}/LibriSpeech/train-clean-100" ]; then
    lhotse download librispeech --full "${dl_dir}"
  fi
fi
|
|
|
|
# Stage 1: build the lhotse manifests for the dev/test sets and train-clean-100.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  log "Stage 1: Prepare LibriSpeech manifest"
  # We assume that you have downloaded the LibriSpeech corpus
  # to $dl_dir/LibriSpeech
  #
  # The directory and the done-marker follow ${manifests_dir}, the same
  # location the manifests are written to, instead of a hard-coded path.
  mkdir -p "${manifests_dir}"
  if [ ! -e "${manifests_dir}/.librispeech.done" ]; then
    lhotse prepare librispeech -j "${nj}" \
      -p dev-clean \
      -p dev-other \
      -p test-clean \
      -p test-other \
      -p train-clean-100 "${dl_dir}/LibriSpeech" "${manifests_dir}"
    touch "${manifests_dir}/.librispeech.done"
  fi
fi
|
|
|
|
# Stage 2: compute features of the configured type, then validate the
# resulting cut manifests.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  log "Stage 2: Compute ${feature_type} feature for librispeech (train-clean-100)"
  mkdir -p "${feature_dir}"
  if [ ! -e "${feature_dir}/.librispeech.done" ]; then
    if [ "${feature_type}" = ssl ]; then
      ./local/compute_ssl_librispeech.py
    elif [ "${feature_type}" = fbank ]; then
      ./local/compute_fbank_librispeech.py --perturb-speed "${perturb_speed}"
    else
      log "Error: not supported --feature-type '${feature_type}'"
      exit 2
    fi

    # The marker must be the very path tested above (note the "/");
    # otherwise the features are recomputed on every run.
    touch "${feature_dir}/.librispeech.done"
  fi

  if [ ! -e "${feature_dir}/.librispeech-validated.done" ]; then
    # Report the directory actually validated (data/ssl with the defaults).
    log "Validating ${feature_dir} for LibriSpeech"
    parts=(
      train-clean-100
      test-clean
      test-other
      dev-clean
      dev-other
    )
    for part in "${parts[@]}"; do
      python3 ./local/validate_manifest.py \
        "${feature_dir}/librispeech_cuts_${part}.jsonl.gz"
    done
    touch "${feature_dir}/.librispeech-validated.done"
  fi
fi
|
|
|
|
# Stage 3: build ${lang_dir}/lexicon.txt and run the words extraction script.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  log "Stage 3: Prepare words.txt"
  mkdir -p "${lang_dir}"

  # Put three special entries in front of the downloaded lexicon, then sort
  # and de-duplicate the combined list.
  (echo '!SIL SIL'; echo '<SPOKEN_NOISE> SPN'; echo '<UNK> SPN'; ) |
    cat - "${dl_dir}/lm/librispeech-lexicon.txt" |
    sort | uniq > "${lang_dir}/lexicon.txt"

  local/get_words_from_lexicon.py \
    --lang-dir "${lang_dir}" \
    --otc-token "${otc_token}"
fi
|
|
|
|
# Stage 4: for every vocabulary size, train a BPE model and prepare the
# matching lang directory data/lang_bpe_<size>.
if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  log "Stage 4: Prepare BPE based lang"

  for vocab_size in "${vocab_sizes[@]}"; do
    bpe_lang_dir="data/lang_bpe_${vocab_size}"
    mkdir -p "${bpe_lang_dir}"
    # We reuse words.txt from phone based lexicon
    # so that the two can share G.pt later.
    cp "${lang_dir}/words.txt" "${bpe_lang_dir}"

    if [ ! -f "${bpe_lang_dir}/transcript_words.txt" ]; then
      log "Generate data for BPE training"
      # Newline-separated list of all transcript files of the three
      # training subsets.
      files=$(
        find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt"
        find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt"
        find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt"
      )
      # Read the list line by line (instead of word-splitting it) so that
      # paths containing spaces survive. Field 1 of every transcript line
      # is dropped; only the text from field 2 onwards is kept.
      while IFS= read -r f; do
        # An empty list still yields one empty line from the here-string.
        [ -n "${f}" ] || continue
        cut -d " " -f 2- "${f}"
      done <<< "${files}" > "${bpe_lang_dir}/transcript_words.txt"
    fi

    if [ ! -f "${bpe_lang_dir}/bpe.model" ]; then
      ./local/train_bpe_model.py \
        --lang-dir "${bpe_lang_dir}" \
        --vocab-size "${vocab_size}" \
        --transcript "${bpe_lang_dir}/transcript_words.txt"
    fi

    if [ ! -f "${bpe_lang_dir}/L_disambig.pt" ]; then
      ./local/prepare_otc_lang_bpe.py \
        --lang-dir "${bpe_lang_dir}" \
        --otc-token "${otc_token}"

      log "Validating ${bpe_lang_dir}/lexicon.txt"
      ./local/validate_bpe_lexicon.py \
        --lexicon "${bpe_lang_dir}/lexicon.txt" \
        --bpe-model "${bpe_lang_dir}/bpe.model" \
        --otc-token "${otc_token}"
    fi
  done
fi
|
|
|
|
# Stage 5: convert the 3-gram and 4-gram ARPA LMs to text FSTs with kaldilm.
if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
  log "Stage 5: Prepare G"
  # We assume you have installed kaldilm, if not, please install
  # it using: pip install kaldilm

  mkdir -p "${lm_dir}"
  if [ ! -f "${lm_dir}/G_3_gram.fst.txt" ]; then
    # It is used in building HLG
    #
    # The output goes to a temporary file and is renamed only on success:
    # the final file name doubles as the "already done" check above, so a
    # failed conversion must not leave a truncated file under that name.
    python3 -m kaldilm \
      --read-symbol-table="${lang_dir}/words.txt" \
      --disambig-symbol='#0' \
      --max-order=3 \
      "${dl_dir}/lm/3-gram.pruned.1e-7.arpa" > "${lm_dir}/G_3_gram.fst.txt.tmp"
    mv "${lm_dir}/G_3_gram.fst.txt.tmp" "${lm_dir}/G_3_gram.fst.txt"
  fi

  if [ ! -f "${lm_dir}/G_4_gram.fst.txt" ]; then
    # It is used for LM rescoring
    # (written via a temporary file for the same reason as above)
    python3 -m kaldilm \
      --read-symbol-table="${lang_dir}/words.txt" \
      --disambig-symbol='#0' \
      --max-order=4 \
      "${dl_dir}/lm/4-gram.arpa" > "${lm_dir}/G_4_gram.fst.txt.tmp"
    mv "${lm_dir}/G_4_gram.fst.txt.tmp" "${lm_dir}/G_4_gram.fst.txt"
  fi
fi
|
|
|
|
# Stage 6: compile HLG once per BPE lang directory.
if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
  log "Stage 6: Compile HLG"
  # Note If ./local/compile_hlg.py throws OOM,
  # please switch to the following command
  #
  # ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone

  for vocab_size in "${vocab_sizes[@]}"; do
    bpe_lang_dir="data/lang_bpe_${vocab_size}"
    echo "LM DIR: ${lm_dir}"
    ./local/compile_hlg.py \
      --lm-dir "${lm_dir}" \
      --lang-dir "${bpe_lang_dir}"
  done
fi
|