#!/usr/bin/env bash
#
# Data preparation for the HI_MIA wake-word experiment.
#
# Stages (select a range with --stage / --stop-stage):
#   0  download HI_MIA, musan and aishell (skipped when already present)
#   1  prepare HI_MIA and HI_MIA_CW manifests
#   2  prepare musan manifests
#   3  prepare aishell manifests
#   4  compute fbank features for aishell
#   5  compute fbank features for musan
#   6  compute fbank features for HI_MIA / HI_MIA_CW and build the
#      shuffled HI_MIA + aishell training cut set
#
# The variables defined before `shared/parse_options.sh` is sourced are
# the defaults handed to that option parser.

set -euo pipefail

stage=0
stop_stage=6

# HI_MIA and aishell dataset are used in this experiment.
# musan dataset is used for data augmentation.
#
# For aishell dataset downloading and preparation,
# refer to icefall/egs/aishell/ASR/prepare.sh.
#
# For HI_MIA and HI_MIA_CW dataset,
# we assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
# Then these files will be extracted to $dl_dir/HiMia/
#
#  - $dl_dir/train.tar.gz
#    Himia training dataset.
#    From https://www.openslr.org/85
#
#  - $dl_dir/dev.tar.gz
#    Himia development dataset.
#    From https://www.openslr.org/85
#
#  - $dl_dir/test_v2.tar.gz
#    Himia test dataset.
#    From https://www.openslr.org/85
#
#  - $dl_dir/data.tgz
#    Himia confusion words (HI_MIA_CW) test dataset.
#    From https://www.openslr.org/120
#
#  - $dl_dir/resource.tgz
#    Transcripts of the HI_MIA_CW test dataset.
#    From https://www.openslr.org/120

dl_dir=$PWD/download
# Selects which recordings are kept in stage 6. The value is used both as
# a grep pattern and as a file-name suffix, so it carries a leading
# underscore (e.g. "_7_01").
train_set_channel=_7_01
enable_speed_perturb=False

. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

# Print a timestamped message tagged with the caller's file, line and
# function name.
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
  log "Stage 0: Download data"

  # If you have pre-downloaded HI_MIA and HI_MIA_CW dataset to /path/to/himia/,
  # you can create a symlink
  #
  #   ln -sfv /path/to/himia $dl_dir/
  #
  if [ ! -f "${dl_dir}/train.tar.gz" ]; then
    lhotse download himia "${dl_dir}/"
  fi

  # If you have pre-downloaded it to /path/to/musan,
  # you can create a symlink
  #
  #   ln -sfv /path/to/musan $dl_dir/
  #
  if [ ! -d "${dl_dir}/musan" ]; then
    lhotse download musan "${dl_dir}"
  fi

  # If you have pre-downloaded it to /path/to/aishell,
  # you can create a symlink
  #
  #   ln -sfv /path/to/aishell $dl_dir/aishell
  #
  # The directory structure is
  # aishell/
  # |-- data_aishell
  # |   |-- transcript
  # |   `-- wav
  # `-- resource_aishell
  #     |-- lexicon.txt
  #     `-- speaker.info
  if [ ! -d "${dl_dir}/aishell/data_aishell/wav/train" ]; then
    lhotse download aishell "${dl_dir}"
  fi
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  log "Stage 1: Prepare HI_MIA and HI_MIA_CW manifest"
  mkdir -p data/manifests
  # The .done marker makes the stage a no-op on reruns.
  if [ ! -e data/manifests/.himia.done ]; then
    lhotse prepare himia "${dl_dir}/HiMia" data/manifests
    touch data/manifests/.himia.done
  fi
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  log "Stage 2: Prepare musan manifest"
  # We assume that you have downloaded the musan corpus
  # to $dl_dir/musan
  mkdir -p data/manifests
  if [ ! -e data/manifests/.musan.done ]; then
    lhotse prepare musan "${dl_dir}/musan" data/manifests
    touch data/manifests/.musan.done
  fi
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  log "Stage 3: Prepare aishell manifest"
  # We assume that you have downloaded the aishell corpus
  # to $dl_dir/aishell
  if [ ! -f data/manifests/.aishell_manifests.done ]; then
    mkdir -p data/manifests
    lhotse prepare aishell "${dl_dir}/aishell" data/manifests
    touch data/manifests/.aishell_manifests.done
  fi
fi

if [ "${stage}" -le 4 ] && [ "${stop_stage}" -ge 4 ]; then
  log "Stage 4: Compute fbank for aishell"
  if [ ! -f data/fbank/.aishell.done ]; then
    mkdir -p data/fbank
    ./local/compute_fbank_aishell.py \
      --enable-speed-perturb="${enable_speed_perturb}"
    touch data/fbank/.aishell.done
  fi
fi

if [ "${stage}" -le 5 ] && [ "${stop_stage}" -ge 5 ]; then
  log "Stage 5: Compute fbank for musan"
  mkdir -p data/fbank
  if [ ! -e data/fbank/.musan.done ]; then
    ./local/compute_fbank_musan.py
    touch data/fbank/.musan.done
  fi
fi

if [ "${stage}" -le 6 ] && [ "${stop_stage}" -ge 6 ]; then
  log "Stage 6: Compute fbank for HI_MIA and HI_MIA_CW dataset"

  # Format of train_set_channel is "microphone position"_"channel",
  # prefixed with an underscore when passed to this script (e.g. _7_01).
  # Microphones 1 to 6 are arrays with 16 channels each.
  # Microphone 7 only has a single channel.
  # So valid examples of train_set_channel could be:
  # 1_01, ..., 1_16
  # 2_01, ..., 2_16
  # ...
  # 6_01, ..., 6_16
  # 7_01
  #
  # train_set_channel is taken from the default / command line set at the
  # top of this script; it is deliberately not re-assigned here so that
  # --train-set-channel is honoured.
  #
  # NOTE: data/fbank/.himia.done is not channel-specific. Remove it when
  # switching to a different channel, otherwise the fbank step is skipped.

  # Keep only the manifest entries that mention the selected channel.
  # gunzip is a direct pipeline stage so that, with pipefail, a missing or
  # corrupt manifest aborts the script. grep exits non-zero when nothing
  # matches, which also aborts instead of leaving an empty manifest in use.
  for subset in train dev test; do
    for file_type in recordings supervisions; do
      src="data/manifests/himia_${file_type}_${subset}.jsonl.gz"
      dst="data/manifests/himia_${file_type}_${subset}${train_set_channel}.jsonl.gz"
      gunzip -c "${src}" \
        | grep -F -- "${train_set_channel}" \
        | gzip -c > "${dst}"
    done
  done

  mkdir -p data/fbank
  if [ ! -e data/fbank/.himia.done ]; then
    ./local/compute_fbank_himia.py \
      --train-set-channel="${train_set_channel}" \
      --enable-speed-perturb="${enable_speed_perturb}"
    touch data/fbank/.himia.done
  fi

  train_file="data/fbank/cuts_train_himia${train_set_channel}-aishell-shuf.jsonl.gz"
  if [ ! -f "${train_file}" ]; then
    # SingleCutSampler is preferred for this experiment
    # rather than DynamicBucketingSampler.
    # Negative audios (Aishell) tend to be longer than positive ones (HiMia),
    # so with DynamicBucketingSampler a batch may contain only negative
    # samples or only positive samples.
    # So `shuf` the training dataset here and use SingleCutSampler to load data.
    #
    # Lines containing "_sp" are dropped before shuffling.
    # The result is written to a temporary file and renamed only on
    # success, so an interrupted run cannot leave a truncated ${train_file}
    # that the `-f` check above would then treat as complete.
    tmp_train_file="${train_file}.tmp"
    gunzip -c \
      data/fbank/aishell_cuts_train.jsonl.gz \
      "data/fbank/cuts_train${train_set_channel}.jsonl.gz" \
      | grep -v _sp \
      | shuf \
      | gzip -c > "${tmp_train_file}"
    mv -f -- "${tmp_train_file}" "${train_file}"
  fi
fi