mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 10:02:22 +00:00
* Begin to use multiple datasets. * Finish preparing training datasets. * Minor fixes * Copy files. * Finish training code. * Display losses for gigaspeech and librispeech separately. * Fix decode.py * Make the probability to select a batch from GigaSpeech configurable. * Update results. * Minor fixes.
110 lines
2.9 KiB
Bash
Executable File
110 lines
2.9 KiB
Bash
Executable File
#!/usr/bin/env bash

# Prepare the GigaSpeech corpus: download it, build lhotse manifests and
# preprocess them. Each step is a numbered stage (see below).

# Strict mode: stop on the first failing command, on use of an unset
# variable, and when any command of a pipeline fails.
set -euo pipefail

# Number of parallel jobs passed to `lhotse prepare gigaspeech -j`.
nj=15
# Only stages whose number lies within [stage, stop_stage] are executed.
stage=-1
stop_stage=100

# We assume dl_dir (download dir) contains the following
# directories and files. If not, they will be downloaded
# by this script automatically.
#
#  - $dl_dir/GigaSpeech
#      You can find audio, dict, GigaSpeech.json inside it.
#      You can apply for the download credentials by following
#      https://github.com/SpeechColab/GigaSpeech#download

# Number of hours for GigaSpeech subsets
#      XL 10k hours
#      L 2.5k hours
#      M 1k hours
#      S 250 hours
#      XS 10 hours
#      DEV 12 hours
#      Test 40 hours

dl_dir=$PWD/download

# NOTE(review): presumably lets command-line options such as --stage,
# --stop-stage, --nj and --dl-dir override the defaults set above; confirm
# against shared/parse_options.sh.
. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

#######################################
# Print a timestamped message tagged with the caller's location.
# This function is from espnet.
# Arguments: $* - the message; all arguments are joined with spaces and
#                 backslash escapes are interpreted (echo -e).
# Outputs:   one line on stdout of the form
#            "YYYY-mm-dd HH:MM:SS (<file>:<line>:<caller>) <message>"
#######################################
log() {
  # Basename of the file the caller lives in (strip everything up to the
  # last '/').
  local fname=${BASH_SOURCE[1]##*/}
  # BASH_LINENO[0] is the line from which log was called, FUNCNAME[1] the
  # calling function ("main" when called from the top level of a script).
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

# Stage 0: download the GigaSpeech corpus into $dl_dir/GigaSpeech unless it
# is already there. Requires the download password in $dl_dir/password.
if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Download data"

  # Create the target directory only if nothing exists at that path, so
  # that a user-provided symlink (see below) is left untouched.
  if [ ! -e "$dl_dir/GigaSpeech" ]; then
    mkdir -p "$dl_dir/GigaSpeech"
  fi

  # If you have pre-downloaded it to /path/to/GigaSpeech,
  # you can create a symlink
  #
  #   ln -sfv /path/to/GigaSpeech $dl_dir/GigaSpeech
  #
  # Download only when neither the audio directory nor GigaSpeech.json is
  # present. Both live inside $dl_dir/GigaSpeech.
  if [ ! -d "$dl_dir/GigaSpeech/audio" ] \
    && [ ! -f "$dl_dir/GigaSpeech/GigaSpeech.json" ]; then
    # Check credentials.
    if [ ! -f "$dl_dir/password" ]; then
      echo -n "$0: Please apply for the download credentials by following "
      echo -n "https://github.com/SpeechColab/GigaSpeech#dataset-download"
      echo " and save it to $dl_dir/password."
      exit 1
    fi
    # "|| true": an unreadable file must not abort silently under set -e;
    # it falls through to the "is empty" error below instead.
    PASSWORD=$(cat "$dl_dir/password" 2>/dev/null) || true
    if [ -z "$PASSWORD" ]; then
      echo "$0: Error, $dl_dir/password is empty."
      exit 1
    fi
    # $PASSWORD is deliberately left unquoted: word splitting strips stray
    # surrounding whitespace before hashing. echo appends the newline that
    # the reference checksum was computed with.
    # shellcheck disable=SC2086
    PASSWORD_MD5=$(echo $PASSWORD | md5sum | cut -d ' ' -f 1)
    if [[ "$PASSWORD_MD5" != "dfbf0cde1a3ce23749d8d81e492741b8" ]]; then
      echo "$0: Error, invalid $dl_dir/password."
      exit 1
    fi
    # Download all subsets: XL, L, M, S, XS, DEV and TEST.
    # NOTE(review): the path of the password file, not its content, is
    # passed as the first positional argument; confirm that this is what
    # `lhotse download gigaspeech` expects.
    lhotse download gigaspeech \
      --subset XL \
      --subset L \
      --subset M \
      --subset S \
      --subset XS \
      --subset DEV \
      --subset TEST \
      --host tsinghua \
      "$dl_dir/password" "$dl_dir/GigaSpeech"
  fi
fi

# Stage 1: build lhotse manifests for every GigaSpeech subset and write
# them to data/manifests.
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  log "Stage 1: Prepare GigaSpeech manifest (may take 30 minutes)"
  # We assume that you have downloaded the GigaSpeech corpus
  # to $dl_dir/GigaSpeech
  mkdir -p data/manifests
  lhotse prepare gigaspeech \
    --subset XL \
    --subset L \
    --subset M \
    --subset S \
    --subset XS \
    --subset DEV \
    --subset TEST \
    -j "$nj" \
    "$dl_dir/GigaSpeech" data/manifests
fi

# Stage 2: run local/preprocess_gigaspeech.py once. The marker file
# data/fbank/.preprocess_complete records a successful run, so later
# invocations skip this stage.
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  log "Stage 2: Preprocess GigaSpeech manifest"
  if [ ! -f data/fbank/.preprocess_complete ]; then
    log "It may take 2 hours for this stage"
    python3 ./local/preprocess_gigaspeech.py
    # Make sure the marker's directory exists; touch does not create
    # parent directories. Done after the Python step so its starting
    # conditions are unchanged.
    mkdir -p data/fbank
    touch data/fbank/.preprocess_complete
  fi
fi