icefall/egs/zipvoice/scripts/prepare_emilia.sh
Wei Kang 06539d2b9d
Add Zipvoice (#1964)
* Add ZipVoice - a flow-matching based zero-shot TTS model.
2025-06-17 20:17:12 +08:00

127 lines
4.1 KiB
Bash
Executable File

#!/usr/bin/env bash

# Prepare the Emilia corpus (EN and ZH subsets) for ZipVoice training:
# download check, manifest preparation, cleaning, fbank extraction and
# token-file generation, controlled by the stage/stop_stage range below.

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# Exit on error, treat unset variables as errors, fail pipelines early.
set -eou pipefail

# Run stages in the inclusive range [stage, stop_stage].
stage=0
stop_stage=5

# Target sampling rate in Hz.
# NOTE(review): not referenced in the visible stages — presumably consumed
# by a downstream script or a stage outside this view; confirm before removing.
sampling_rate=24000

# Number of parallel jobs for manifest preparation and fbank extraction.
nj=32

dl_dir=$PWD/download

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
log() {
  # Timestamped logging helper (adapted from espnet).
  # Prefixes the message with the current time and the caller's
  # file name, line number and enclosing function name.
  local caller_file=${BASH_SOURCE[1]##*/}
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo -e "${stamp} (${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"
  # Your download directory should look like this:
  #
  # download/Amphion___Emilia
  # ├── metafile.yaml
  # ├── raw
  # │   ├── DE
  # │   ├── EN
  # │   ├── FR
  # │   ├── JA
  # │   ├── KO
  # │   ├── openemilia_45batches.tar.gz
  # │   ├── openemilia_all.tar.gz
  # │   └── ZH
  # └── README.md
  if [ ! -d "$dl_dir/Amphion___Emilia/raw" ]; then
    log "Please refer https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset."
    # Fix: `exit(-1)` is C syntax, not shell — bash would fail with
    # "syntax error near unexpected token `-1'". Use a plain non-zero status.
    exit 1
  fi
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare emilia manifests (EN and ZH only)"
  # Assumes the Emilia corpus has already been downloaded to
  # $dl_dir/Amphion___Emilia (see stage 0 for the expected layout).
  mkdir -p data/manifests
  # The .emilia.done marker makes this stage idempotent across reruns.
  if [ ! -e data/manifests/.emilia.done ]; then
    for lang in en zh; do
      lhotse prepare emilia --lang ${lang} --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
    done
    touch data/manifests/.emilia.done
  fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Preprocess Emilia dataset, mainly for cleaning"

  mkdir -p data/manifests/splits_raw
  # Fix: the done-marker was previously checked under "split_raw" (missing
  # an "s"), a directory that is never created, so the expensive split-lazy
  # step re-ran on every invocation. Check it where it is actually written.
  if [ ! -e data/manifests/splits_raw/.emilia.split.done ]; then
    # Split the big manifests into chunks of 10000 cuts for parallel processing.
    lhotse split-lazy data/manifests/emilia_cuts_EN.jsonl.gz data/manifests/splits_raw 10000
    lhotse split-lazy data/manifests/emilia_cuts_ZH.jsonl.gz data/manifests/splits_raw 10000
    touch data/manifests/splits_raw/.emilia.split.done
  fi

  mkdir -p data/manifests/splits
  if [ ! -e data/manifests/splits/.emilia.preprocess.done ]; then
    python local/preprocess_emilia.py --subset EN
    python local/preprocess_emilia.py --subset ZH
    touch data/manifests/splits/.emilia.preprocess.done
  fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Extract Fbank for Emilia"

  mkdir -p data/fbank/emilia_splits
  if [ ! -e data/fbank/emilia_splits/.emilia.fbank.done ]; then
    # You can speed up the extraction by distributing splits to multiple machines.
    for part in EN ZH; do
      python local/compute_fbank.py \
        --source-dir data/manifests/splits \
        --dest-dir data/fbank/emilia_splits \
        --dataset emilia \
        --subset ${part} \
        --splits-cuts 1 \
        --split-begin 0 \
        --split-end 2000 \
        --num-jobs ${nj}
    done
    touch data/fbank/emilia_splits/.emilia.fbank.done
  fi

  # Merge the per-split fbank cuts of each subset into a single manifest and
  # carve off the first 1500 cuts as a dev set.
  for part in EN ZH; do
    if [ ! -e data/fbank/emilia_cuts_${part}.jsonl.gz ]; then
      log "Combining ${part} fbank cuts and spliting ${part} dev set"
      gunzip -c data/fbank/emilia_splits/emilia_cuts_${part}.*.jsonl.gz > data/fbank/emilia_cuts_${part}.jsonl
      head -n 1500 data/fbank/emilia_cuts_${part}.jsonl | gzip -c > data/fbank/emilia_cuts_${part}_dev.jsonl.gz
      sed -i '1,1500d' data/fbank/emilia_cuts_${part}.jsonl
      gzip data/fbank/emilia_cuts_${part}.jsonl
    fi
  done
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Generate token file"
  # Only generate the token file once; skip if it already exists.
  [ -e data/tokens_emilia.txt ] || ./local/prepare_token_file_emilia.py --tokens data/tokens_emilia.txt
fi