#!/usr/bin/env bash
#
# Prepare the Emilia corpus (EN + ZH) for TTS training:
#   stage 0: verify that the corpus has been downloaded
#   stage 1: create lhotse manifests
#   stage 2: split the raw manifests and preprocess (cleaning)
#   stage 3: extract fbank features, combine splits, carve out dev sets
#   stage 4: generate the token file

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -euo pipefail

stage=0
stop_stage=5
sampling_rate=24000
nj=32

dl_dir=$PWD/download

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"
  # Your download directory should look like this:
  #
  # download/Amphion___Emilia
  # ├── metafile.yaml
  # ├── raw
  # │   ├── DE
  # │   ├── EN
  # │   ├── FR
  # │   ├── JA
  # │   ├── KO
  # │   ├── openemilia_45batches.tar.gz
  # │   ├── openemilia_all.tar.gz
  # │   └── ZH
  # └── README.md
  if [ ! -d "$dl_dir/Amphion___Emilia/raw" ]; then
    log "Please refer https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset."
    # BUG FIX: the original used 'exit(-1)', which is C syntax and a bash
    # parse error when this branch is reached; 'exit 1' is the shell idiom.
    exit 1
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare emilia manifests (EN and ZH only)"
  # We assume that you have downloaded the Emilia corpus
  # to $dl_dir/Amphion___Emilia
  # see stage 0 for the directory structure
  mkdir -p data/manifests
  if [ ! -e data/manifests/.emilia.done ]; then
    lhotse prepare emilia --lang en --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
    lhotse prepare emilia --lang zh --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
    touch data/manifests/.emilia.done
  fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Preprocess Emilia dataset, mainly for cleaning"
  mkdir -p data/manifests/splits_raw
  # BUG FIX: the done-marker was tested under "split_raw" (missing "s") while
  # it is created under "splits_raw", so the expensive split step re-ran on
  # every invocation; the path now matches the directory created above.
  if [ ! -e data/manifests/splits_raw/.emilia.split.done ]; then
    lhotse split-lazy data/manifests/emilia_cuts_EN.jsonl.gz data/manifests/splits_raw 10000
    lhotse split-lazy data/manifests/emilia_cuts_ZH.jsonl.gz data/manifests/splits_raw 10000
    touch data/manifests/splits_raw/.emilia.split.done
  fi

  mkdir -p data/manifests/splits
  if [ ! -e data/manifests/splits/.emilia.preprocess.done ]; then
    python local/preprocess_emilia.py --subset EN
    python local/preprocess_emilia.py --subset ZH
    touch data/manifests/splits/.emilia.preprocess.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Extract Fbank for Emilia"
  mkdir -p data/fbank/emilia_splits
  if [ ! -e data/fbank/emilia_splits/.emilia.fbank.done ]; then
    # You can speed up the extraction by distributing splits to multiple machines.
    for subset in EN ZH; do
      python local/compute_fbank.py \
        --source-dir data/manifests/splits \
        --dest-dir data/fbank/emilia_splits \
        --dataset emilia \
        --subset ${subset} \
        --splits-cuts 1 \
        --split-begin 0 \
        --split-end 2000 \
        --num-jobs ${nj}
    done
    touch data/fbank/emilia_splits/.emilia.fbank.done
  fi

  if [ ! -e data/fbank/emilia_cuts_EN.jsonl.gz ]; then
    log "Combining EN fbank cuts and spliting EN dev set"
    # Concatenate the per-split gzipped cut manifests, reserve the first 1500
    # cuts as the dev set, then re-compress the remainder as the train set.
    gunzip -c data/fbank/emilia_splits/emilia_cuts_EN.*.jsonl.gz > data/fbank/emilia_cuts_EN.jsonl
    head -n 1500 data/fbank/emilia_cuts_EN.jsonl | gzip -c > data/fbank/emilia_cuts_EN_dev.jsonl.gz
    sed -i '1,1500d' data/fbank/emilia_cuts_EN.jsonl
    gzip data/fbank/emilia_cuts_EN.jsonl
  fi

  if [ ! -e data/fbank/emilia_cuts_ZH.jsonl.gz ]; then
    log "Combining ZH fbank cuts and spliting ZH dev set"
    # Same dev/train carve-out as the EN branch above.
    gunzip -c data/fbank/emilia_splits/emilia_cuts_ZH.*.jsonl.gz > data/fbank/emilia_cuts_ZH.jsonl
    head -n 1500 data/fbank/emilia_cuts_ZH.jsonl | gzip -c > data/fbank/emilia_cuts_ZH_dev.jsonl.gz
    sed -i '1,1500d' data/fbank/emilia_cuts_ZH.jsonl
    gzip data/fbank/emilia_cuts_ZH.jsonl
  fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Generate token file"
  if [ ! -e data/tokens_emilia.txt ]; then
    ./local/prepare_token_file_emilia.py --tokens data/tokens_emilia.txt
  fi
fi