mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 10:02:22 +00:00
127 lines
4.1 KiB
Bash
Executable File
127 lines
4.1 KiB
Bash
Executable File
#!/usr/bin/env bash

# Data preparation for the Emilia corpus (EN and ZH subsets).
# The work is organised in numbered stages; only stages whose number lies
# in [stage, stop_stage] are executed.

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# Abort on command failure (-e), on use of an unset variable (-u) and on a
# failure anywhere inside a pipeline (pipefail).
set -euo pipefail

# First and last stage to run (both inclusive).
stage=0
stop_stage=5

# NOTE(review): sampling_rate is not referenced anywhere else in this
# script; presumably kept for consistency with sibling recipes.
sampling_rate=24000

# Value handed to the --num-jobs option of the lhotse / fbank commands.
nj=32

# Directory where the manually downloaded corpus is expected.
dl_dir="${PWD}/download"

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

# Print a timestamped message to stdout, prefixed with the caller's
# location:  "YYYY-mm-dd HH:MM:SS (file:line:function) message".
# Arguments: the words of the message; they are joined with "$*".
# Backslash escapes in the message are interpreted because of `echo -e`.
# This function is from espnet.
log() {
  local caller_file caller_line caller_func timestamp
  # Index 1 of BASH_SOURCE / FUNCNAME refers to whoever called log();
  # "##*/" strips the directory part so only the file name is shown.
  caller_file=${BASH_SOURCE[1]##*/}
  caller_line=${BASH_LINENO[0]}
  caller_func=${FUNCNAME[1]}
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${timestamp} (${caller_file}:${caller_line}:${caller_func}) $*"
}

log "dl_dir: $dl_dir"

# Stage 0 does not download anything itself: it only verifies that the
# corpus has already been placed under $dl_dir and stops the script with
# a pointer to the download page otherwise.
if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Download data"

  # Your download directory should look like this:
  #
  # download/Amphion___Emilia
  # ├── metafile.yaml
  # ├── raw
  # │   ├── DE
  # │   ├── EN
  # │   ├── FR
  # │   ├── JA
  # │   ├── KO
  # │   ├── openemilia_45batches.tar.gz
  # │   ├── openemilia_all.tar.gz
  # │   └── ZH
  # └── README.md

  # Quoted so that a working directory containing spaces does not break
  # the test ($dl_dir is derived from $PWD).
  if [ ! -d "$dl_dir/Amphion___Emilia/raw" ]; then
    log "Please refer https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset."
    # Bash has no function-call syntax for builtins: `exit(-1)` is a
    # syntax error that aborts the script while this block is being
    # parsed. Exit with a plain non-zero status instead.
    exit 1
  fi
fi

# Stage 1: build the lhotse manifests for the EN and ZH subsets.
# The marker file data/manifests/.emilia.done makes the stage a no-op on
# subsequent runs; delete it to force regeneration.
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare emilia manifests (EN and ZH only)"
  # We assume that you have downloaded the Emilia corpus
  # to $dl_dir/Amphion___Emilia
  # see stage 0 for the directory structure
  mkdir -p data/manifests
  if [ ! -e data/manifests/.emilia.done ]; then
    for lang in en zh; do
      # The corpus path is quoted: $dl_dir is derived from $PWD and would
      # be split into several arguments if it contained whitespace.
      lhotse prepare emilia --lang "${lang}" --num-jobs "${nj}" \
        "${dl_dir}/Amphion___Emilia" data/manifests
    done
    touch data/manifests/.emilia.done
  fi
fi

# Stage 2: split the raw EN/ZH cut manifests into pieces and run the
# cleaning script on them. Each half is guarded by its own marker file so
# that a rerun skips work that has already completed.
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Preprocess Emilia dataset, mainly for cleaning"

  mkdir -p data/manifests/splits_raw
  # The marker is looked up in "splits_raw", the same directory that is
  # created above and in which the marker is touched below. The previous
  # guard tested "split_raw" (missing "s"), a path that is never created,
  # so the split was redone on every invocation.
  if [ ! -e data/manifests/splits_raw/.emilia.split.done ]; then
    # 10000 is passed as the last positional argument of `lhotse
    # split-lazy` (presumably the number of cuts per split; confirm
    # against the lhotse CLI help).
    lhotse split-lazy data/manifests/emilia_cuts_EN.jsonl.gz data/manifests/splits_raw 10000
    lhotse split-lazy data/manifests/emilia_cuts_ZH.jsonl.gz data/manifests/splits_raw 10000
    touch data/manifests/splits_raw/.emilia.split.done
  fi

  mkdir -p data/manifests/splits

  if [ ! -e data/manifests/splits/.emilia.preprocess.done ]; then
    # NOTE(review): the input/output directories are chosen inside
    # local/preprocess_emilia.py; presumably it reads splits_raw and
    # writes to splits. Verify against that script.
    python local/preprocess_emilia.py --subset EN
    python local/preprocess_emilia.py --subset ZH
    touch data/manifests/splits/.emilia.preprocess.done
  fi

fi

# Stage 3: compute fbank features for every split, then merge the
# per-split cut manifests of each language into a single manifest and
# carve a dev set out of its first 1500 lines.
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Extract Fbank for Emilia"
  mkdir -p data/fbank/emilia_splits

  if [ ! -e data/fbank/emilia_splits/.emilia.fbank.done ]; then
    # You can speed up the extraction by distributing splits to multiple machines.
    for subset in EN ZH; do
      fbank_cmd=(
        python local/compute_fbank.py
        --source-dir data/manifests/splits
        --dest-dir data/fbank/emilia_splits
        --dataset emilia
        --subset "${subset}"
        --splits-cuts 1
        --split-begin 0
        --split-end 2000
        --num-jobs "${nj}"
      )
      "${fbank_cmd[@]}"
    done
    touch data/fbank/emilia_splits/.emilia.fbank.done
  fi

  for subset in EN ZH; do
    combined=data/fbank/emilia_cuts_${subset}.jsonl
    # A language whose combined manifest already exists is skipped.
    if [ -e "${combined}.gz" ]; then
      continue
    fi

    log "Combining ${subset} fbank cuts and spliting ${subset} dev set"
    # Concatenate all per-split manifests into one uncompressed file.
    gunzip -c data/fbank/emilia_splits/emilia_cuts_${subset}.*.jsonl.gz > "${combined}"
    # The first 1500 lines become the dev set ...
    head -n 1500 "${combined}" | gzip -c > "data/fbank/emilia_cuts_${subset}_dev.jsonl.gz"
    # ... and are then removed from the combined manifest, which is
    # finally compressed in place (gzip replaces the .jsonl by .jsonl.gz).
    sed -i '1,1500d' "${combined}"
    gzip "${combined}"
  done

fi

# Stage 4: create the token file unless it is already present.
if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  log "Stage 4: Generate token file"
  # Run the generator only when data/tokens_emilia.txt does not exist yet.
  [ -e data/tokens_emilia.txt ] \
    || ./local/prepare_token_file_emilia.py --tokens data/tokens_emilia.txt
fi