diff --git a/egs/librispeech/ASR/prepare_giga_speech.sh b/egs/librispeech/ASR/prepare_giga_speech.sh
index b077aaf3a..5684ccdf8 100755
--- a/egs/librispeech/ASR/prepare_giga_speech.sh
+++ b/egs/librispeech/ASR/prepare_giga_speech.sh
@@ -149,11 +149,3 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
     touch data/fbank/.gigaspeech_XL.done
   fi
 fi
-
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Combine features for XL (may take 15 hours)"
-  if [ ! -f data/fbank/gigaspeech_cuts_XL.jsonl.gz ]; then
-    pieces=$(find data/fbank/gigaspeech_XL_split_${num_splits} -name "gigaspeech_cuts_XL.*.jsonl.gz")
-    lhotse combine $pieces data/fbank/gigaspeech_cuts_XL.jsonl.gz
-  fi
-fi
diff --git a/egs/librispeech/ASR/prepare_multidataset.sh b/egs/librispeech/ASR/prepare_multidataset.sh
index c95b4d039..31f6646c9 100755
--- a/egs/librispeech/ASR/prepare_multidataset.sh
+++ b/egs/librispeech/ASR/prepare_multidataset.sh
@@ -37,10 +37,6 @@ stop_stage=100
 # - noise
 # - speech
 
-# Split all dataset to this number of pieces and mix each dataset pieces
-# into multidataset pieces with shuffling.
-num_splits=1998
-
 dl_dir=$PWD/download
 
 . shared/parse_options.sh || exit 1
@@ -61,6 +57,7 @@ vocab_sizes=(
 multidataset=(
   "gigaspeech",
   "commonvoice",
+  "peoples_speech",
 )
 
 # All files generated by this script are saved in "data".
@@ -319,7 +316,7 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
   # GigaSpeech
   if [[ "${multidataset[@]}" =~ "gigaspeech" ]]; then
     log "Dataset: GigaSpeech"
-    ./prepare_giga_speech.sh --stop_stage 5
+    ./prepare_giga_speech.sh
   fi
 
   # CommonVoice
@@ -327,4 +324,10 @@ if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
     log "Dataset: CommonVoice"
     ./prepare_common_voice.sh
   fi
+
+  # People's Speech
+  if [[ "${multidataset[@]}" =~ "peoples_speech" ]]; then
+    log "Dataset: People's Speech"
+    ./prepare_peoples_speech.sh
+  fi
 fi
diff --git a/egs/librispeech/ASR/prepare_peoples_speech.sh b/egs/librispeech/ASR/prepare_peoples_speech.sh
new file mode 100755
index 000000000..0c7267778
--- /dev/null
+++ b/egs/librispeech/ASR/prepare_peoples_speech.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+
+set -eou pipefail
+
+nj=32
+stage=-1
+stop_stage=100
+
+# Split data/set to a number of pieces
+# This is to avoid OOM during feature extraction.
+num_per_split=4000
+
+# We assume dl_dir (download dir) contains the following
+# directories and files. If not, they will be downloaded
+# by this script automatically.
+#
+#  - $dl_dir/peoples_speech
+#      This directory contains the following files downloaded from
+#       https://huggingface.co/datasets/MLCommons/peoples_speech
+#
+#    - test
+#    - train
+#    - validation
+
+dl_dir=$PWD/download
+
+. shared/parse_options.sh || exit 1
+
+# vocab size for sentence piece models.
+# It will generate data/lang_bpe_xxx,
+# data/lang_bpe_yyy if the array contains xxx, yyy
+vocab_sizes=(
+  # 5000
+  # 2000
+  # 1000
+  500
+)
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+mkdir -p data
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+log "dl_dir: $dl_dir"
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Stage 0: Download data"
+
+  # If you have pre-downloaded it to /path/to/peoples_speech,
+  # you can create a symlink
+  #
+  #   ln -sfv /path/to/peoples_speech $dl_dir/peoples_speech
+  #
+  if [ ! -d $dl_dir/peoples_speech/train ]; then
+    git lfs install
+    git clone https://huggingface.co/datasets/MLCommons/peoples_speech
+  fi
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare People's Speech manifest"
+  # We assume that you have downloaded the People's Speech corpus
+  # to $dl_dir/peoples_speech
+  mkdir -p data/manifests
+  if [ ! -e data/manifests/.peoples_speech.done ]; then
+    lhotse prepare peoples-speech -j $nj $dl_dir/peoples_speech data/manifests
+    touch data/manifests/.peoples_speech.done
+  fi
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Preprocess People's Speech manifest"
+  mkdir -p data/fbank
+  if [ ! -e data/fbank/.preprocess_complete ]; then
+    ./local/preprocess_peoples_speech.py
+    touch data/fbank/.preprocess_complete
+  fi
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Compute fbank for valid and test subsets of People's Speech"
+  if [ ! -e data/fbank/.peoples_speech_valid_test.done ]; then
+    ./local/compute_fbank_peoples_speech_valid_test.py
+    touch data/fbank/.peoples_speech_valid_test.done
+  fi
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Split train subset into pieces"
+  split_dir=data/fbank/peoples_speech_train_split
+  if [ ! -e $split_dir/.peoples_speech_dirty_split.done ]; then
+    lhotse split-lazy ./data/fbank/peoples_speech_cuts_dirty_raw.jsonl.gz $split_dir $num_per_split
+    touch $split_dir/.peoples_speech_dirty_split.done
+  fi
+
+  if [ ! -e $split_dir/.peoples_speech_dirty_sa_split.done ]; then
+    lhotse split-lazy ./data/fbank/peoples_speech_cuts_dirty_sa_raw.jsonl.gz $split_dir $num_per_split
+    touch $split_dir/.peoples_speech_dirty_sa_split.done
+  fi
+
+  if [ ! -e $split_dir/.peoples_speech_clean_split.done ]; then
+    lhotse split-lazy ./data/fbank/peoples_speech_cuts_clean_raw.jsonl.gz $split_dir $num_per_split
+    touch $split_dir/.peoples_speech_clean_split.done
+  fi
+
+  if [ ! -e $split_dir/.peoples_speech_clean_sa_split.done ]; then
+    lhotse split-lazy ./data/fbank/peoples_speech_cuts_clean_sa_raw.jsonl.gz $split_dir $num_per_split
+    touch $split_dir/.peoples_speech_clean_sa_split.done
+  fi
+fi
+
+if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
+  log "Stage 5: Compute features for train subset of People's Speech"
+  if [ ! -e data/fbank/.peoples_speech_train.done ]; then
+    ./local/compute_fbank_peoples_speech_splits.py \
+      --num-workers $nj \
+      --batch-duration 600 \
+      --start 0 \
+      --num-splits 2000
+    touch data/fbank/.peoples_speech_train.done
+  fi
+fi
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py b/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py
index 07c7126fa..434471196 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py
@@ -33,6 +33,10 @@ class MultiDataset:
 
             - librispeech_cuts_train-all-shuf.jsonl.gz
             - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz
+            - peoples_speech_train_split/peoples_speech_cuts_dirty.*.jsonl.gz
+            - peoples_speech_train_split/peoples_speech_cuts_dirty_sa.*.jsonl.gz
+            - peoples_speech_train_split/peoples_speech_cuts_clean.*.jsonl.gz
+            - peoples_speech_train_split/peoples_speech_cuts_clean_sa.*.jsonl.gz
 
           cv_manifest_dir:
             It is expected to contain the following files:
@@ -74,4 +78,34 @@ class MultiDataset:
             self.cv_manifest_dir / f"cv-en_cuts_train.jsonl.gz"
         )
 
-        return CutSet.mux(librispeech_cuts, gigaspeech_cuts, commonvoice_cuts)
+        # People's Speech
+        filenames = glob.glob(
+            f"{self.manifest_dir}/peoples_speech_train_split/peoples_speech_cuts_*.*.jsonl.gz"
+        )
+
+        pattern = re.compile(r"peoples_speech_cuts.([0-9]+).jsonl.gz")
+        idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames)
+        idx_filenames = sorted(idx_filenames, key=lambda x: x[0])
+
+        sorted_filenames = [f[1] for f in idx_filenames]
+
+        logging.info(
+            f"Loading People's Speech {len(sorted_filenames)} splits in lazy mode"
+        )
+
+        peoples_speech_cuts = lhotse.combine(
+            lhotse.load_manifest_lazy(p) for p in sorted_filenames
+        )
+
+        return CutSet.mux(
+            librispeech_cuts,
+            gigaspeech_cuts,
+            commonvoice_cuts,
+            peoples_speech_cuts,
+            weights=[
+                len(librispeech_cuts),
+                len(gigaspeech_cuts),
+                len(commonvoice_cuts),
+                len(peoples_speech_cuts),
+            ],
+        )
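
Note on the split-manifest loading added to multidataset.py above: the regex r"peoples_speech_cuts.([0-9]+).jsonl.gz" expects a numeric index directly after the peoples_speech_cuts prefix, while the docstring lists split files that carry a subset infix (dirty, dirty_sa, clean, clean_sa), e.g. peoples_speech_cuts_clean.0007.jsonl.gz. A minimal sketch of a loader that tolerates that infix is shown below; it is illustrative only, and the helper name load_peoples_speech_train_splits as well as the exact file naming are assumptions based on the Stage 4 split outputs of prepare_peoples_speech.sh, not part of this patch.

    # Sketch only: index the People's Speech split manifests while allowing a
    # subset name between the prefix and the numeric split index.
    import glob
    import logging
    import re

    import lhotse


    def load_peoples_speech_train_splits(manifest_dir: str) -> lhotse.CutSet:
        # Hypothetical helper, assuming files such as
        # peoples_speech_train_split/peoples_speech_cuts_clean.0007.jsonl.gz
        filenames = glob.glob(
            f"{manifest_dir}/peoples_speech_train_split/peoples_speech_cuts_*.*.jsonl.gz"
        )
        # Accept an arbitrary subset infix (dirty, dirty_sa, clean, clean_sa)
        # and sort by the numeric split index.
        pattern = re.compile(r"peoples_speech_cuts_[a-z_]+\.([0-9]+)\.jsonl\.gz")
        sorted_filenames = sorted(
            filenames, key=lambda f: int(pattern.search(f).group(1))
        )

        logging.info(
            f"Loading {len(sorted_filenames)} People's Speech splits in lazy mode"
        )
        # lhotse.combine accepts an iterable of manifests and concatenates
        # them lazily into a single CutSet.
        return lhotse.combine(lhotse.load_manifest_lazy(p) for p in sorted_filenames)

The weights passed to CutSet.mux in the patch are the cut counts of the individual corpora, so during training each source is drawn from roughly in proportion to its size.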