icefall/egs/mls_english/ASR/prepare.sh
Bailey Hirota 7d462aa8b4 add fbank
2025-08-05 18:13:51 +09:00

88 lines
2.5 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Data preparation for the MLS English ASR recipe in icefall.

# Exported so that every command launched below inherits it.
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

# Abort on command failure, on use of an unset variable, and on a failure
# anywhere inside a pipeline.
set -euo pipefail

# Range of stages to run (inclusive on both ends).
stage=-1
stop_stage=100

# Vocabulary sizes for which a BPE tokenizer is trained.
vocab_sizes=(500)

# Where the dataset is downloaded to.
dl_dir=$PWD/download

# NOTE(review): presumably lets command-line flags override the variables
# defined above -- confirm against shared/parse_options.sh.
. shared/parse_options.sh || exit 1

# Everything this script generates lives under "data/".
mkdir -p data/{manifests,fbank,audio,lang}
#######################################
# Print a timestamped message tagged with the caller's location.
# Arguments: $* - message text (backslash escapes are interpreted, because
#                 the line is printed with `echo -e`)
# Outputs:   "YYYY-MM-DD HH:MM:SS (file:line:function) message" on stdout
#######################################
log() {
  # BASH_SOURCE[1] is the file of the caller; keep only its base name.
  local fname=${BASH_SOURCE[1]##*/}
  # BASH_LINENO[0] is the line log was called from. LINENO would instead be
  # the line of this echo, which is the same for every message.
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "Starting MLS English data preparation"

# Stage 0: Download corpus
# Clones the dataset repository into $dl_dir/mls_english unless that
# directory already exists. Expansions are quoted because dl_dir is derived
# from $PWD and may contain whitespace.
if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Download MLS English dataset"
  if [ ! -d "$dl_dir/mls_english" ]; then
    git clone https://huggingface.co/datasets/parler-tts/mls_eng \
      "$dl_dir/mls_english" || {
      log "Failed to download MLS English dataset"
      exit 1
    }
  fi
fi
# Stage 1: Compute fbank & emit manifests
# The marker file data/fbank/.mls_eng-fbank.done is created only after the
# feature computation and every manifest validation have succeeded; when it
# exists the whole stage is skipped.
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Compute & validate MLS English fbank"

  # data/manifests, data/fbank and data/audio were created at the top of
  # this script.
  if [ ! -e data/fbank/.mls_eng-fbank.done ]; then
    # dl_dir is derived from $PWD, so quote it to survive whitespace.
    python local/compute_fbank_mls_english.py \
      --manifest-dir data/manifests \
      --audio-dir data/audio \
      --dl-dir "$dl_dir/mls_english" \
      --fbank-dir data/fbank

    # Validate each split's manifest.
    for split in train dev test; do
      python local/validate_manifest.py \
        --manifest "data/manifests/mls_eng_cuts_${split}.jsonl.gz"
    done

    touch data/fbank/.mls_eng-fbank.done
    log "fbank + manifest generation complete."
  else
    log "Skipping: fbank already done (data/fbank/.mls_eng-fbank.done exists)."
  fi
fi
# Stage 2: Prepare transcript for BPE
# The generator only runs when data/lang/transcript.txt is not already there.
if test $stage -le 2 && test $stop_stage -ge 2; then
  log "Stage 2: Generate transcript for BPE"
  [ -f data/lang/transcript.txt ] ||
    ./local/utils/generate_transcript.py --lang-dir data/lang
fi
# Stage 3: Train BPE models
# One model is trained per entry of vocab_sizes, each in its own
# data/lang_bpe_<vocab_size> directory. A directory that already holds
# bpe.model is left alone. Training reads data/lang/transcript.txt.
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  log "Stage 3: Train BPE models"
  for vocab_size in "${vocab_sizes[@]}"; do
    bpe_dir="data/lang_bpe_${vocab_size}"
    mkdir -p "$bpe_dir"
    if [ ! -f "$bpe_dir/bpe.model" ]; then
      ./local/train_bpe_model.py \
        --lang-dir "$bpe_dir" \
        --vocab-size "$vocab_size" \
        --transcript data/lang/transcript.txt
    fi
  done
fi

log "MLS English data preparation completed successfully"