From 5d59f48193ff4aeee72e129b6c8012800e846d7a Mon Sep 17 00:00:00 2001
From: Yifan Yang
Date: Wed, 31 May 2023 18:11:12 +0800
Subject: [PATCH] Add People's Speech to multidataset

---
 egs/librispeech/ASR/prepare_common_voice.sh   | 117 --------------
 egs/librispeech/ASR/prepare_giga_speech.sh    | 151 ------------------
 egs/librispeech/ASR/prepare_multidataset.sh   |  67 ++++----
 egs/librispeech/ASR/prepare_peoples_speech.sh | 127 ---------------
 .../multidataset.py                           |  16 +-
 .../ASR/pruned_transducer_stateless7/train.py |   2 +-
 6 files changed, 44 insertions(+), 436 deletions(-)
 delete mode 100755 egs/librispeech/ASR/prepare_common_voice.sh
 delete mode 100755 egs/librispeech/ASR/prepare_giga_speech.sh
 delete mode 100755 egs/librispeech/ASR/prepare_peoples_speech.sh

diff --git a/egs/librispeech/ASR/prepare_common_voice.sh b/egs/librispeech/ASR/prepare_common_voice.sh
deleted file mode 100755
index 6f9c4fb2f..000000000
--- a/egs/librispeech/ASR/prepare_common_voice.sh
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/usr/bin/env bash
-
-set -eou pipefail
-
-nj=16
-stage=-1
-stop_stage=100
-
-# Split data/${lang}set to this number of pieces
-# This is to avoid OOM during feature extraction.
-num_splits=1000
-
-# We assume dl_dir (download dir) contains the following
-# directories and files. If not, they will be downloaded
-# by this script automatically.
-#
-#  - $dl_dir/$release/$lang
-#      This directory contains the following files downloaded from
-#      https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/${release}/${release}-${lang}.tar.gz
-#
-#      - clips
-#      - dev.tsv
-#      - invalidated.tsv
-#      - other.tsv
-#      - reported.tsv
-#      - test.tsv
-#      - train.tsv
-#      - validated.tsv
-
-dl_dir=$PWD/download
-release=cv-corpus-13.0-2023-03-09
-lang=en
-
-. shared/parse_options.sh || exit 1
-
-# All files generated by this script are saved in "data/${lang}".
-# You can safely remove "data/${lang}" and rerun this script to regenerate it.
-mkdir -p data/${lang}
-
-log() {
-  # This function is from espnet
-  local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
-}
-
-log "dl_dir: $dl_dir"
-
-if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-  log "Stage 0: Download data"
-
-  # If you have pre-downloaded it to /path/to/$release,
-  # you can create a symlink
-  #
-  #   ln -sfv /path/to/$release $dl_dir/$release
-  #
-  if [ ! -d $dl_dir/$release/$lang/clips ]; then
-    lhotse download commonvoice --languages $lang --release $release $dl_dir
-  fi
-fi
-
-if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Prepare CommonVoice manifest"
-  # We assume that you have downloaded the CommonVoice corpus
-  # to $dl_dir/$release
-  mkdir -p data/${lang}/manifests
-  if [ ! -e data/${lang}/manifests/.cv-${lang}.done ]; then
-    lhotse prepare commonvoice --language $lang -j $nj $dl_dir/$release data/${lang}/manifests
-    touch data/${lang}/manifests/.cv-${lang}.done
-  fi
-fi
-
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Preprocess CommonVoice manifest"
-  if [ ! -e data/${lang}/fbank/.preprocess_complete ]; then
-    ./local/preprocess_commonvoice.py --language $lang
-    touch data/${lang}/fbank/.preprocess_complete
-  fi
-fi
-
-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Compute fbank for dev and test subsets of CommonVoice"
-  mkdir -p data/${lang}/fbank
-  if [ ! -e data/${lang}/fbank/.cv-${lang}_dev_test.done ]; then
-    ./local/compute_fbank_commonvoice_dev_test.py --language $lang
-    touch data/${lang}/fbank/.cv-${lang}_dev_test.done
-  fi
-fi
-
-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Split train subset into ${num_splits} pieces"
-  split_dir=data/${lang}/fbank/cv-${lang}_train_split_${num_splits}
-  if [ ! -e $split_dir/.cv-${lang}_train_split.done ]; then
-    lhotse split $num_splits ./data/${lang}/fbank/cv-${lang}_cuts_train_raw.jsonl.gz $split_dir
-    touch $split_dir/.cv-${lang}_train_split.done
-  fi
-fi
-
-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute features for train subset of CommonVoice"
-  if [ ! -e data/${lang}/fbank/.cv-${lang}_train.done ]; then
-    ./local/compute_fbank_commonvoice_splits.py \
-      --num-workers $nj \
-      --batch-duration 600 \
-      --start 0 \
-      --num-splits $num_splits \
-      --language $lang
-    touch data/${lang}/fbank/.cv-${lang}_train.done
-  fi
-fi
-
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Combine features for train"
-  if [ ! -f data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz ]; then
-    pieces=$(find data/${lang}/fbank/cv-${lang}_train_split_${num_splits} -name "cv-${lang}_cuts_train.*.jsonl.gz")
-    lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz
-  fi
-fi
diff --git a/egs/librispeech/ASR/prepare_giga_speech.sh b/egs/librispeech/ASR/prepare_giga_speech.sh
deleted file mode 100755
index 5684ccdf8..000000000
--- a/egs/librispeech/ASR/prepare_giga_speech.sh
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/usr/bin/env bash
-
-set -eou pipefail
-
-nj=15
-stage=-1
-stop_stage=100
-
-# We assume dl_dir (download dir) contains the following
-# directories and files. If not, they will be downloaded
-# by this script automatically.
-#
-#  - $dl_dir/GigaSpeech
-#      You can find audio, dict, GigaSpeech.json inside it.
-#      You can apply for the download credentials by following
-#      https://github.com/SpeechColab/GigaSpeech#download
-
-# Number of hours for GigaSpeech subsets
-# XL    10k hours
-# L     2.5k hours
-# M     1k hours
-# S     250 hours
-# XS    10 hours
-# DEV   12 hours
-# Test  40 hours
-
-# Split XL subset to this number of pieces
-# This is to avoid OOM during feature extraction.
-num_splits=2000
-# We use lazy split from lhotse.
-# The XL subset (10k hours) contains 37956 cuts without speed perturbing.
-# We want to split it into 2000 splits, so each split
-# contains about 37956 / 2000 = 19 cuts. As a result, there will be 1998 splits.
-chunk_size=19 # number of cuts in each split. The last split may contain fewer cuts.
-
-dl_dir=$PWD/download
-
-. shared/parse_options.sh || exit 1
-
-# All files generated by this script are saved in "data".
-# You can safely remove "data" and rerun this script to regenerate it.
-mkdir -p data
-
-log() {
-  # This function is from espnet
-  local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
-}
-
-log "dl_dir: $dl_dir"
-
-if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-  log "Stage 0: Download data"
-
-  [ ! -e $dl_dir/GigaSpeech ] && mkdir -p $dl_dir/GigaSpeech
-
-  # If you have pre-downloaded it to /path/to/GigaSpeech,
-  # you can create a symlink
-  #
-  #   ln -sfv /path/to/GigaSpeech $dl_dir/GigaSpeech
-  #
-  if [ ! -d $dl_dir/GigaSpeech/audio ] && [ ! -f $dl_dir/GigaSpeech.json ]; then
-    # Check credentials.
-    if [ ! -f $dl_dir/password ]; then
-      echo -n "$0: Please apply for the download credentials by following"
-      echo -n "https://github.com/SpeechColab/GigaSpeech#dataset-download"
-      echo " and save it to $dl_dir/password."
-      exit 1;
-    fi
-    PASSWORD=`cat $dl_dir/password 2>/dev/null`
-    if [ -z "$PASSWORD" ]; then
-      echo "$0: Error, $dl_dir/password is empty."
-      exit 1;
-    fi
-    PASSWORD_MD5=`echo $PASSWORD | md5sum | cut -d ' ' -f 1`
-    if [[ $PASSWORD_MD5 != "dfbf0cde1a3ce23749d8d81e492741b8" ]]; then
-      echo "$0: Error, invalid $dl_dir/password."
-      exit 1;
-    fi
-    # Download XL, DEV and TEST sets by default.
-    lhotse download gigaspeech \
-      --subset XL \
-      --subset L \
-      --subset M \
-      --subset S \
-      --subset XS \
-      --subset DEV \
-      --subset TEST \
-      --host tsinghua \
-      $dl_dir/password $dl_dir/GigaSpeech
-  fi
-fi
-
-if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Prepare GigaSpeech manifest (may take 30 minutes)"
-  # We assume that you have downloaded the GigaSpeech corpus
-  # to $dl_dir/GigaSpeech
-  if [ ! -f data/manifests/.gigaspeech.done ]; then
-    mkdir -p data/manifests
-    lhotse prepare gigaspeech \
-      --subset XL \
-      --subset L \
-      --subset M \
-      --subset S \
-      --subset XS \
-      --subset DEV \
-      --subset TEST \
-      -j $nj \
-      $dl_dir/GigaSpeech data/manifests
-    touch data/manifests/.gigaspeech.done
-  fi
-fi
-
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Preprocess GigaSpeech manifest"
-  if [ ! -f data/fbank/.gigaspeech_preprocess.done ]; then
-    log "It may take 2 hours for this stage"
-    ./local/preprocess_gigaspeech.py
-    touch data/fbank/.gigaspeech_preprocess.done
-  fi
-fi
-
-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Compute features for DEV and TEST subsets of GigaSpeech (may take 2 minutes)"
-  if [ ! -f data/fbank/.gigaspeech_dev_test.done ]; then
-    ./local/compute_fbank_gigaspeech_dev_test.py
-    touch data/fbank/.gigaspeech_dev_test.done
-  fi
-fi
-
-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Split XL subset into ${num_splits} pieces"
-  split_dir=data/fbank/gigaspeech_XL_split_${num_splits}
-  if [ ! -f $split_dir/.gigaspeech_XL_split.done ]; then
-    lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size
-    touch $split_dir/.gigaspeech_XL_split.done
-  fi
-fi
-
-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute features for XL"
-  # Note: The script supports --start and --stop options.
-  # You can use several machines to compute the features in parallel.
-  if [ ! -f data/fbank/.gigaspeech_XL.done ]; then
-    ./local/compute_fbank_gigaspeech_splits.py \
-      --num-workers $nj \
-      --batch-duration 600 \
-      --num-splits $num_splits
-    touch data/fbank/.gigaspeech_XL.done
-  fi
-fi
diff --git a/egs/librispeech/ASR/prepare_multidataset.sh b/egs/librispeech/ASR/prepare_multidataset.sh
index 31f6646c9..aac1c9265 100755
--- a/egs/librispeech/ASR/prepare_multidataset.sh
+++ b/egs/librispeech/ASR/prepare_multidataset.sh
@@ -281,28 +281,7 @@ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
 fi
 
 if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
-  log "Stage 8: Compile HLG"
-  ./local/compile_hlg.py --lang-dir data/lang_phone
-
-  # Note If ./local/compile_hlg.py throws OOM,
-  # please switch to the following command
-  #
-  # ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone
-
-  for vocab_size in ${vocab_sizes[@]}; do
-    lang_dir=data/lang_bpe_${vocab_size}
-    ./local/compile_hlg.py --lang-dir $lang_dir
-
-    # Note If ./local/compile_hlg.py throws OOM,
-    # please switch to the following command
-    #
-    # ./local/compile_hlg_using_openfst.py --lang-dir $lang_dir
-  done
-fi
-
-# Compile LG for RNN-T fast_beam_search decoding
-if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
-  log "Stage 9: Compile LG"
+  log "Stage 8: Compile LG"
   ./local/compile_lg.py --lang-dir data/lang_phone
 
   for vocab_size in ${vocab_sizes[@]}; do
@@ -311,23 +290,51 @@ if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
   done
 fi
 
-if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then
-  log "Stage 10: Prepare the other datasets"
+if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then
+  log "Stage 9: Prepare the other datasets"
 
   # GigaSpeech
-  if [[ "${multidataset[@]}" =~ "gigaspeech" ]]; then
+  if [[ "${multidataset[@]}" =~ "gigaspeech" ]] && [ ! -f data/fbank/.gigaspeech.done ]; then
     log "Dataset: GigaSpeech"
-    ./prepare_giga_speech.sh
+    cd data/fbank
+    if [ -f ../../../../gigaspeech/ASR/data/fbank/XL_split/.split_completed ]; then
+      ln -svf $(realpath ../../../../gigaspeech/ASR/data/fbank/XL_split) .
+    else
+      log "Abort! Please run gigaspeech prepare.sh --stage 5 --stop-stage 6"
+      exit 1
+    fi
+
+    touch .gigaspeech.done
+    cd ../..
   fi
 
   # CommonVoice
-  if [[ "${multidataset[@]}" =~ "commonvoice" ]]; then
+  if [[ "${multidataset[@]}" =~ "commonvoice" ]] && [ ! -f data/fbank/.commonvoice.done ]; then
     log "Dataset: CommonVoice"
-    ./prepare_common_voice.sh
+    cd data/fbank
+    if [ -f ../../../../commonvoice/ASR/data/en/fbank/.cv-en_train.done ]; then
+      ln -svf $(realpath ../../../../commonvoice/ASR/data/en/fbank/cv-en_train_split_1000) .
+      ln -svf $(realpath ../../../../commonvoice/ASR/data/en/fbank/cv-en_cuts_train.jsonl.gz) .
+    else
+      log "Abort! Please run commonvoice prepare.sh --stage 5 --stop-stage 6"
+      exit 1
+    fi
+
+    touch .commonvoice.done
+    cd ../..
   fi
 
   # People's Speech
-  if [[ "${multidataset[@]}" =~ "peoples_speech" ]]; then
+  if [[ "${multidataset[@]}" =~ "peoples_speech" ]] && [ ! -f data/fbank/.peoples_speech.done ]; then
     log "Dataset: People's Speech"
-    ./prepare_peoples_speech.sh
+    cd data/fbank
+    if [ -f ../../../../peoples_speech/ASR/data/fbank/.peoples_speech_train.done ]; then
+      ln -svf $(realpath ../../../../peoples_speech/ASR/data/fbank/peoples_speech_train_split) .
+    else
+      log "Abort! Please run peoples_speech prepare.sh --stage 5 --stop-stage 6"
+      exit 1
+    fi
+
+    touch .peoples_speech.done
+    cd ../..
   fi
 fi
diff --git a/egs/librispeech/ASR/prepare_peoples_speech.sh b/egs/librispeech/ASR/prepare_peoples_speech.sh
deleted file mode 100755
index 0c7267778..000000000
--- a/egs/librispeech/ASR/prepare_peoples_speech.sh
+++ /dev/null
@@ -1,127 +0,0 @@
-#!/usr/bin/env bash
-
-set -eou pipefail
-
-nj=32
-stage=-1
-stop_stage=100
-
-# Split data/set to a number of pieces
-# This is to avoid OOM during feature extraction.
-num_per_split=4000
-
-# We assume dl_dir (download dir) contains the following
-# directories and files. If not, they will be downloaded
-# by this script automatically.
-#
-#  - $dl_dir/peoples_speech
-#      This directory contains the following files downloaded from
-#      https://huggingface.co/datasets/MLCommons/peoples_speech
-#
-#      - test
-#      - train
-#      - validation
-
-dl_dir=$PWD/download
-
-. shared/parse_options.sh || exit 1
-
-# vocab size for sentence piece models.
-# It will generate data/lang_bpe_xxx,
-# data/lang_bpe_yyy if the array contains xxx, yyy
-vocab_sizes=(
-  # 5000
-  # 2000
-  # 1000
-  500
-)
-
-# All files generated by this script are saved in "data".
-# You can safely remove "data" and rerun this script to regenerate it.
-mkdir -p data
-
-log() {
-  # This function is from espnet
-  local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
-}
-
-log "dl_dir: $dl_dir"
-
-if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-  log "Stage 0: Download data"
-
-  # If you have pre-downloaded it to /path/to/peoples_speech,
-  # you can create a symlink
-  #
-  #   ln -sfv /path/to/peoples_speech $dl_dir/peoples_speech
-  #
-  if [ ! -d $dl_dir/peoples_speech/train ]; then
-    git lfs install
-    git clone https://huggingface.co/datasets/MLCommons/peoples_speech
-  fi
-fi
-
-if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Prepare People's Speech manifest"
-  # We assume that you have downloaded the People's Speech corpus
-  # to $dl_dir/peoples_speech
-  mkdir -p data/manifests
-  if [ ! -e data/manifests/.peoples_speech.done ]; then
-    lhotse prepare peoples-speech -j $nj $dl_dir/peoples_speech data/manifests
-    touch data/manifests/.peoples_speech.done
-  fi
-fi
-
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Preprocess People's Speech manifest"
-  mkdir -p data/fbank
-  if [ ! -e data/fbank/.preprocess_complete ]; then
-    ./local/preprocess_peoples_speech.py
-    touch data/fbank/.preprocess_complete
-  fi
-fi
-
-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Compute fbank for valid and test subsets of People's Speech"
-  if [ ! -e data/fbank/.peoples_speech_valid_test.done ]; then
-    ./local/compute_fbank_peoples_speech_valid_test.py
-    touch data/fbank/.peoples_speech_valid_test.done
-  fi
-fi
-
-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Split train subset into pieces"
-  split_dir=data/fbank/peoples_speech_train_split
-  if [ ! -e $split_dir/.peoples_speech_dirty_split.done ]; then
-    lhotse split-lazy ./data/fbank/peoples_speech_cuts_dirty_raw.jsonl.gz $split_dir $num_per_split
-    touch $split_dir/.peoples_speech_dirty_split.done
-  fi
-
-  if [ ! -e $split_dir/.peoples_speech_dirty_sa_split.done ]; then
-    lhotse split-lazy ./data/fbank/peoples_speech_cuts_dirty_sa_raw.jsonl.gz $split_dir $num_per_split
-    touch $split_dir/.peoples_speech_dirty_sa_split.done
-  fi
-
-  if [ ! -e $split_dir/.peoples_speech_clean_split.done ]; then
-    lhotse split-lazy ./data/fbank/peoples_speech_cuts_clean_raw.jsonl.gz $split_dir $num_per_split
-    touch $split_dir/.peoples_speech_clean_split.done
-  fi
-
-  if [ ! -e $split_dir/.peoples_speech_clean_sa_split.done ]; then
-    lhotse split-lazy ./data/fbank/peoples_speech_cuts_clean_sa_raw.jsonl.gz $split_dir $num_per_split
-    touch $split_dir/.peoples_speech_clean_sa_split.done
-  fi
-fi
-
-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute features for train subset of People's Speech"
-  if [ ! -e data/fbank/.peoples_speech_train.done ]; then
-    ./local/compute_fbank_peoples_speech_splits.py \
-      --num-workers $nj \
-      --batch-duration 600 \
-      --start 0 \
-      --num-splits 2000
-    touch data/fbank/.peoples_speech_train.done
-  fi
-fi
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py b/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py
index 434471196..6efbb140d 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py
@@ -25,26 +25,22 @@ from lhotse import CutSet, load_manifest_lazy
 
 
 class MultiDataset:
-    def __init__(self, manifest_dir: str, cv_manifest_dir: str):
+    def __init__(self, manifest_dir: str):
         """
         Args:
           manifest_dir:
             It is expected to contain the following files:
 
             - librispeech_cuts_train-all-shuf.jsonl.gz
-            - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz
+            - XL_split_2000/cuts_XL.*.jsonl.gz
+            - cv-en_cuts_train.jsonl.gz
             - peoples_speech_train_split/peoples_speech_cuts_dirty.*.jsonl.gz
             - peoples_speech_train_split/peoples_speech_cuts_dirty_sa.*.jsonl.gz
             - peoples_speech_train_split/peoples_speech_cuts_clean.*.jsonl.gz
             - peoples_speech_train_split/peoples_speech_cuts_clean_sa.*.jsonl.gz
-
-          cv_manifest_dir:
-            It is expected to contain the following files:
-
-            - cv-en_cuts_train.jsonl.gz
         """
         self.manifest_dir = Path(manifest_dir)
-        self.cv_manifest_dir = Path(cv_manifest_dir)
 
     def train_cuts(self) -> CutSet:
         logging.info("About to get multidataset train cuts")
@@ -57,10 +53,10 @@ class MultiDataset:
 
         # GigaSpeech
         filenames = glob.glob(
-            f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz"
+            f"{self.manifest_dir}/XL_split_2000/cuts_XL.*.jsonl.gz"
         )
 
-        pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz")
+        pattern = re.compile(r"cuts_XL.([0-9]+).jsonl.gz")
         idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames)
         idx_filenames = sorted(idx_filenames, key=lambda x: x[0])
 
@@ -75,7 +71,7 @@ class MultiDataset:
         # CommonVoice
         logging.info(f"Loading CommonVoice in lazy mode")
         commonvoice_cuts = load_manifest_lazy(
-            self.cv_manifest_dir / f"cv-en_cuts_train.jsonl.gz"
+            self.manifest_dir / f"cv-en_cuts_train.jsonl.gz"
         )
 
         # People's Speech
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
index 5be8c481d..b7f644bcd 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py
@@ -1043,7 +1043,7 @@ def run(rank, world_size, args):
     librispeech = LibriSpeechAsrDataModule(args)
 
     if params.use_multidataset:
-        multidataset = MultiDataset(params.manifest_dir, params.cv_manifest_dir)
+        multidataset = MultiDataset(params.manifest_dir)
        train_cuts = multidataset.train_cuts()
     else:
         if params.mini_libri:
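
Usage note (a minimal sketch, not applied by this patch): with this change
MultiDataset takes a single manifest directory, and prepare_multidataset.sh
symlinks each corpus's precomputed manifests into data/fbank. Assuming the
recipe's default manifest directory is data/fbank and multidataset.py is
importable (train.py runs with pruned_transducer_stateless7 on sys.path),
the combined training cuts can be built like this:

    # Build the combined training CutSet after ./prepare_multidataset.sh
    # has populated data/fbank with the manifests listed in the docstring.
    from multidataset import MultiDataset

    multidataset = MultiDataset("data/fbank")
    train_cuts = multidataset.train_cuts()  # one lazily-loaded CutSet spanning all corpora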