diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py index 554d7f109..25d6050bb 100755 --- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py +++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py @@ -35,7 +35,7 @@ from filter_cuts import filter_cuts from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached -from icefall.utils import get_executor +from icefall.utils import get_executor, str2bool # Torch's multithreaded behavior needs to be disabled or # it wastes a lot of CPU and slow things down. @@ -61,12 +61,20 @@ def get_args(): help="""Dataset parts to compute fbank. If None, we will use all""", ) + parser.add_argument( + "--perturb-speed", + type=str2bool, + default=True, + help="""Perturb speed with factor 0.9 and 1.1 on train subset.""", + ) + return parser.parse_args() def compute_fbank_librispeech( bpe_model: Optional[str] = None, dataset: Optional[str] = None, + perturb_speed: Optional[bool] = True, ): src_dir = Path("data/manifests") output_dir = Path("data/fbank") @@ -125,9 +133,13 @@ def compute_fbank_librispeech( if "train" in partition: if bpe_model: cut_set = filter_cuts(cut_set, sp) - cut_set = ( - cut_set + cut_set.perturb_speed(0.9) + cut_set.perturb_speed(1.1) - ) + if perturb_speed: + logging.info(f"Doing speed perturb") + cut_set = ( + cut_set + + cut_set.perturb_speed(0.9) + + cut_set.perturb_speed(1.1) + ) cut_set = cut_set.compute_and_store_features( extractor=extractor, storage_path=f"{output_dir}/{prefix}_feats_{partition}", @@ -145,4 +157,8 @@ if __name__ == "__main__": logging.basicConfig(format=formatter, level=logging.INFO) args = get_args() logging.info(vars(args)) - compute_fbank_librispeech(bpe_model=args.bpe_model, dataset=args.dataset) + compute_fbank_librispeech( + bpe_model=args.bpe_model, + dataset=args.dataset, + perturb_speed=args.perturb_speed, + ) diff --git a/egs/librispeech/ASR/prepare_common_voice.sh b/egs/librispeech/ASR/prepare_common_voice.sh new file mode 100755 index 000000000..6f9c4fb2f --- /dev/null +++ b/egs/librispeech/ASR/prepare_common_voice.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash + +set -eou pipefail + +nj=16 +stage=-1 +stop_stage=100 + +# Split data/${lang}set to this number of pieces +# This is to avoid OOM during feature extraction. +num_splits=1000 + +# We assume dl_dir (download dir) contains the following +# directories and files. If not, they will be downloaded +# by this script automatically. +# +# - $dl_dir/$release/$lang +# This directory contains the following files downloaded from +# https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/${release}/${release}-${lang}.tar.gz +# +# - clips +# - dev.tsv +# - invalidated.tsv +# - other.tsv +# - reported.tsv +# - test.tsv +# - train.tsv +# - validated.tsv + +dl_dir=$PWD/download +release=cv-corpus-13.0-2023-03-09 +lang=en + +. shared/parse_options.sh || exit 1 + +# All files generated by this script are saved in "data/${lang}". +# You can safely remove "data/${lang}" and rerun this script to regenerate it. +mkdir -p data/${lang} + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "dl_dir: $dl_dir" + +if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then + log "Stage 0: Download data" + + # If you have pre-downloaded it to /path/to/$release, + # you can create a symlink + # + # ln -sfv /path/to/$release $dl_dir/$release + # + if [ ! -d $dl_dir/$release/$lang/clips ]; then + lhotse download commonvoice --languages $lang --release $release $dl_dir + fi +fi + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then + log "Stage 1: Prepare CommonVoice manifest" + # We assume that you have downloaded the CommonVoice corpus + # to $dl_dir/$release + mkdir -p data/${lang}/manifests + if [ ! -e data/${lang}/manifests/.cv-${lang}.done ]; then + lhotse prepare commonvoice --language $lang -j $nj $dl_dir/$release data/${lang}/manifests + touch data/${lang}/manifests/.cv-${lang}.done + fi +fi + +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then + log "Stage 2: Preprocess CommonVoice manifest" + if [ ! -e data/${lang}/fbank/.preprocess_complete ]; then + ./local/preprocess_commonvoice.py --language $lang + touch data/${lang}/fbank/.preprocess_complete + fi +fi + +if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then + log "Stage 3: Compute fbank for dev and test subsets of CommonVoice" + mkdir -p data/${lang}/fbank + if [ ! -e data/${lang}/fbank/.cv-${lang}_dev_test.done ]; then + ./local/compute_fbank_commonvoice_dev_test.py --language $lang + touch data/${lang}/fbank/.cv-${lang}_dev_test.done + fi +fi + +if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then + log "Stage 4: Split train subset into ${num_splits} pieces" + split_dir=data/${lang}/fbank/cv-${lang}_train_split_${num_splits} + if [ ! -e $split_dir/.cv-${lang}_train_split.done ]; then + lhotse split $num_splits ./data/${lang}/fbank/cv-${lang}_cuts_train_raw.jsonl.gz $split_dir + touch $split_dir/.cv-${lang}_train_split.done + fi +fi + +if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then + log "Stage 5: Compute features for train subset of CommonVoice" + if [ ! -e data/${lang}/fbank/.cv-${lang}_train.done ]; then + ./local/compute_fbank_commonvoice_splits.py \ + --num-workers $nj \ + --batch-duration 600 \ + --start 0 \ + --num-splits $num_splits \ + --language $lang + touch data/${lang}/fbank/.cv-${lang}_train.done + fi +fi + +if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then + log "Stage 6: Combine features for train" + if [ ! -f data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz ]; then + pieces=$(find data/${lang}/fbank/cv-${lang}_train_split_${num_splits} -name "cv-${lang}_cuts_train.*.jsonl.gz") + lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz + fi +fi diff --git a/egs/librispeech/ASR/prepare_giga_speech.sh b/egs/librispeech/ASR/prepare_giga_speech.sh index 6f85ddc29..b077aaf3a 100755 --- a/egs/librispeech/ASR/prepare_giga_speech.sh +++ b/egs/librispeech/ASR/prepare_giga_speech.sh @@ -95,39 +95,45 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then log "Stage 1: Prepare GigaSpeech manifest (may take 30 minutes)" # We assume that you have downloaded the GigaSpeech corpus # to $dl_dir/GigaSpeech - mkdir -p data/manifests - lhotse prepare gigaspeech \ - --subset XL \ - --subset L \ - --subset M \ - --subset S \ - --subset XS \ - --subset DEV \ - --subset TEST \ - -j $nj \ - $dl_dir/GigaSpeech data/manifests + if [ ! -f data/manifests/.gigaspeech.done ]; then + mkdir -p data/manifests + lhotse prepare gigaspeech \ + --subset XL \ + --subset L \ + --subset M \ + --subset S \ + --subset XS \ + --subset DEV \ + --subset TEST \ + -j $nj \ + $dl_dir/GigaSpeech data/manifests + touch data/manifests/.gigaspeech.done + fi fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then log "Stage 2: Preprocess GigaSpeech manifest" - if [ ! -f data/fbank/.preprocess_complete ]; then - log "It may take 2 hours for this stage" - python3 ./local/preprocess_gigaspeech.py - touch data/fbank/.preprocess_complete + if [ ! -f data/fbank/.gigaspeech_preprocess.done ]; then + log "It may take 2 hours for this stage" + ./local/preprocess_gigaspeech.py + touch data/fbank/.gigaspeech_preprocess.done fi fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then log "Stage 3: Compute features for DEV and TEST subsets of GigaSpeech (may take 2 minutes)" - python3 ./local/compute_fbank_gigaspeech_dev_test.py + if [ ! -f data/fbank/.gigaspeech_dev_test.done ]; then + ./local/compute_fbank_gigaspeech_dev_test.py + touch data/fbank/.gigaspeech_dev_test.done + fi fi if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then log "Stage 4: Split XL subset into ${num_splits} pieces" split_dir=data/fbank/gigaspeech_XL_split_${num_splits} - if [ ! -f $split_dir/.split_completed ]; then + if [ ! -f $split_dir/.gigaspeech_XL_split.done ]; then lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size - touch $split_dir/.split_completed + touch $split_dir/.gigaspeech_XL_split.done fi fi @@ -135,8 +141,19 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then log "Stage 5: Compute features for XL" # Note: The script supports --start and --stop options. # You can use several machines to compute the features in parallel. - python3 ./local/compute_fbank_gigaspeech_splits.py \ - --num-workers $nj \ - --batch-duration 600 \ - --num-splits $num_splits + if [ ! -f data/fbank/.gigaspeech_XL.done ]; then + ./local/compute_fbank_gigaspeech_splits.py \ + --num-workers $nj \ + --batch-duration 600 \ + --num-splits $num_splits + touch data/fbank/.gigaspeech_XL.done + fi +fi + +if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then + log "Stage 6: Combine features for XL (may take 15 hours)" + if [ ! -f data/fbank/gigaspeech_cuts_XL.jsonl.gz ]; then + pieces=$(find data/fbank/gigaspeech_XL_split_${num_splits} -name "gigaspeech_cuts_XL.*.jsonl.gz") + lhotse combine $pieces data/fbank/gigaspeech_cuts_XL.jsonl.gz + fi fi diff --git a/egs/librispeech/ASR/prepare_multidataset.sh b/egs/librispeech/ASR/prepare_multidataset.sh new file mode 100755 index 000000000..c068305c0 --- /dev/null +++ b/egs/librispeech/ASR/prepare_multidataset.sh @@ -0,0 +1,373 @@ +#!/usr/bin/env bash + +# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674 +export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + +set -eou pipefail + +nj=16 +stage=-1 +stop_stage=100 + +# We assume dl_dir (download dir) contains the following +# directories and files. If not, they will be downloaded +# by this script automatically. +# +# - $dl_dir/LibriSpeech +# You can find BOOKS.TXT, test-clean, train-clean-360, etc, inside it. +# You can download them from https://www.openslr.org/12 +# +# - $dl_dir/lm +# This directory contains the following files downloaded from +# http://www.openslr.org/resources/11 +# +# - 3-gram.pruned.1e-7.arpa.gz +# - 3-gram.pruned.1e-7.arpa +# - 4-gram.arpa.gz +# - 4-gram.arpa +# - librispeech-vocab.txt +# - librispeech-lexicon.txt +# - librispeech-lm-norm.txt.gz +# +# - $dl_dir/musan +# This directory contains the following directories downloaded from +# http://www.openslr.org/17/ +# +# - music +# - noise +# - speech + +# Split all dataset to this number of pieces and mix each dataset pieces +# into multidataset pieces with shuffling. +num_splits=1998 + +dl_dir=$PWD/download + +. shared/parse_options.sh || exit 1 + +# vocab size for sentence piece models. +# It will generate data/lang_bpe_xxx, +# data/lang_bpe_yyy if the array contains xxx, yyy +vocab_sizes=( + # 5000 + # 2000 + # 1000 + 500 +) + +# multidataset list. +# LibriSpeech and musan are required. +# The others are optional. +multidataset=( + "gigaspeech", + "commonvoice", +) + +# All files generated by this script are saved in "data". +# You can safely remove "data" and rerun this script to regenerate it. +mkdir -p data + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "dl_dir: $dl_dir" + +log "Dataset: LibriSpeech and musan" +if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then + log "Stage -1: Download LM" + mkdir -p $dl_dir/lm + if [ ! -e $dl_dir/lm/.done ]; then + ./local/download_lm.py --out-dir=$dl_dir/lm + touch $dl_dir/lm/.done + fi +fi + +if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then + log "Stage 0: Download data" + + # If you have pre-downloaded it to /path/to/LibriSpeech, + # you can create a symlink + # + # ln -sfv /path/to/LibriSpeech $dl_dir/LibriSpeech + # + if [ ! -d $dl_dir/LibriSpeech/train-other-500 ]; then + lhotse download librispeech --full $dl_dir + fi + + # If you have pre-downloaded it to /path/to/musan, + # you can create a symlink + # + # ln -sfv /path/to/musan $dl_dir/ + # + if [ ! -d $dl_dir/musan ]; then + lhotse download musan $dl_dir + fi +fi + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then + log "Stage 1: Prepare LibriSpeech manifest" + # We assume that you have downloaded the LibriSpeech corpus + # to $dl_dir/LibriSpeech + mkdir -p data/manifests + if [ ! -e data/manifests/.librispeech.done ]; then + lhotse prepare librispeech -j $nj $dl_dir/LibriSpeech data/manifests + touch data/manifests/.librispeech.done + fi +fi + +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then + log "Stage 2: Prepare musan manifest" + # We assume that you have downloaded the musan corpus + # to data/musan + mkdir -p data/manifests + if [ ! -e data/manifests/.musan.done ]; then + lhotse prepare musan $dl_dir/musan data/manifests + touch data/manifests/.musan.done + fi +fi + +if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then + log "Stage 3: Compute fbank for librispeech" + mkdir -p data/fbank + if [ ! -e data/fbank/.librispeech.done ]; then + ./local/compute_fbank_librispeech.py --perturb-speed False + touch data/fbank/.librispeech.done + fi + + if [ ! -f data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz ]; then + cat <(gunzip -c data/fbank/librispeech_cuts_train-clean-100.jsonl.gz) \ + <(gunzip -c data/fbank/librispeech_cuts_train-clean-360.jsonl.gz) \ + <(gunzip -c data/fbank/librispeech_cuts_train-other-500.jsonl.gz) | \ + shuf | gzip -c > data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz + fi + + if [ ! -e data/fbank/.librispeech-validated.done ]; then + log "Validating data/fbank for LibriSpeech" + parts=( + train-clean-100 + train-clean-360 + train-other-500 + test-clean + test-other + dev-clean + dev-other + ) + for part in ${parts[@]}; do + python3 ./local/validate_manifest.py \ + data/fbank/librispeech_cuts_${part}.jsonl.gz + done + touch data/fbank/.librispeech-validated.done + fi +fi + +if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then + log "Stage 4: Compute fbank for musan" + mkdir -p data/fbank + if [ ! -e data/fbank/.musan.done ]; then + ./local/compute_fbank_musan.py + touch data/fbank/.musan.done + fi +fi + +if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then + log "Stage 5: Prepare phone based lang" + lang_dir=data/lang_phone + mkdir -p $lang_dir + + (echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; ) | + cat - $dl_dir/lm/librispeech-lexicon.txt | + sort | uniq > $lang_dir/lexicon.txt + + if [ ! -f $lang_dir/L_disambig.pt ]; then + ./local/prepare_lang.py --lang-dir $lang_dir + fi + + if [ ! -f $lang_dir/L.fst ]; then + log "Converting L.pt to L.fst" + ./shared/convert-k2-to-openfst.py \ + --olabels aux_labels \ + $lang_dir/L.pt \ + $lang_dir/L.fst + fi + + if [ ! -f $lang_dir/L_disambig.fst ]; then + log "Converting L_disambig.pt to L_disambig.fst" + ./shared/convert-k2-to-openfst.py \ + --olabels aux_labels \ + $lang_dir/L_disambig.pt \ + $lang_dir/disambig_L.fst + fi +fi + +if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then + log "Stage 6: Prepare BPE based lang" + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + mkdir -p $lang_dir + # We reuse words.txt from phone based lexicon + # so that the two can share G.pt later. + cp data/lang_phone/words.txt $lang_dir + + if [ ! -f $lang_dir/transcript_words.txt ]; then + log "Generate data for BPE training" + files=$( + find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt" + find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt" + find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt" + ) + for f in ${files[@]}; do + cat $f | cut -d " " -f 2- + done > $lang_dir/transcript_words.txt + fi + + if [ ! -f $lang_dir/bpe.model ]; then + ./local/train_bpe_model.py \ + --lang-dir $lang_dir \ + --vocab-size $vocab_size \ + --transcript $lang_dir/transcript_words.txt + fi + + if [ ! -f $lang_dir/L_disambig.pt ]; then + ./local/prepare_lang_bpe.py --lang-dir $lang_dir + + log "Validating $lang_dir/lexicon.txt" + ./local/validate_bpe_lexicon.py \ + --lexicon $lang_dir/lexicon.txt \ + --bpe-model $lang_dir/bpe.model + fi + + if [ ! -f $lang_dir/L.fst ]; then + log "Converting L.pt to L.fst" + ./shared/convert-k2-to-openfst.py \ + --olabels aux_labels \ + $lang_dir/L.pt \ + $lang_dir/L.fst + fi + + if [ ! -f $lang_dir/L_disambig.fst ]; then + log "Converting L_disambig.pt to L_disambig.fst" + ./shared/convert-k2-to-openfst.py \ + --olabels aux_labels \ + $lang_dir/L_disambig.pt \ + $lang_dir/L_disambig.fst + fi + done +fi + +if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then + log "Stage 7: Prepare G" + # We assume you have install kaldilm, if not, please install + # it using: pip install kaldilm + + mkdir -p data/lm + if [ ! -f data/lm/G_3_gram.fst.txt ]; then + # It is used in building HLG + python3 -m kaldilm \ + --read-symbol-table="data/lang_phone/words.txt" \ + --disambig-symbol='#0' \ + --max-order=3 \ + $dl_dir/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt + fi + + if [ ! -f data/lm/G_4_gram.fst.txt ]; then + # It is used for LM rescoring + python3 -m kaldilm \ + --read-symbol-table="data/lang_phone/words.txt" \ + --disambig-symbol='#0' \ + --max-order=4 \ + $dl_dir/lm/4-gram.arpa > data/lm/G_4_gram.fst.txt + fi +fi + +if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then + log "Stage 8: Compile HLG" + ./local/compile_hlg.py --lang-dir data/lang_phone + + # Note If ./local/compile_hlg.py throws OOM, + # please switch to the following command + # + # ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + ./local/compile_hlg.py --lang-dir $lang_dir + + # Note If ./local/compile_hlg.py throws OOM, + # please switch to the following command + # + # ./local/compile_hlg_using_openfst.py --lang-dir $lang_dir + done +fi + +# Compile LG for RNN-T fast_beam_search decoding +if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then + log "Stage 9: Compile LG" + ./local/compile_lg.py --lang-dir data/lang_phone + + for vocab_size in ${vocab_sizes[@]}; do + lang_dir=data/lang_bpe_${vocab_size} + ./local/compile_lg.py --lang-dir $lang_dir + done +fi + +if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then + log "Stage 10: Prepare the other datasets" + # GigaSpeech + if [[ "${multidataset[@]}" =~ "gigaspeech" ]]; then + log "Dataset: GigaSpeech" + ./prepare_giga_speech.sh --stop_stage 5 + fi + + # CommonVoice + if [[ "${multidataset[@]}" =~ "commonvoice" ]]; then + log "Dataset: CommonVoice" + ./prepare_common_voice.sh + fi +fi + +if [ $stage -le 11 ] && [ $stop_stage -ge 11 ]; then + log "Stage 11: Create multidataset" + split_dir=data/fbank/multidataset_split_${num_splits} + if [ ! -f data/fbank/multidataset_split/.multidataset.done ]; then + mkdir -p $split_dir/multidataset + log "Split LibriSpeech" + if [ ! -f $split_dir/.librispeech_split.done ]; then + lhotse split $num_splits ./data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz $split_dir + touch $split_dir/.librispeech_split.done + fi + + if [[ "${multidataset[@]}" =~ "gigaspeech" ]]; then + log "Split GigaSpeech XL" + if [ ! -f $split_dir/.gigaspeech_XL_split.done ]; then + cd $split_dir + ln -sv ../gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz . + cd ../../.. + touch $split_dir/.gigaspeech_XL_split.done + fi + fi + + if [[ "${multidataset[@]}" =~ "commonvoice" ]]; then + log "Split CommonVoice" + if [ ! -f $split_dir/.cv-en_train_split.done ]; then + lhotse split $num_splits ./data/en/fbank/cv-en_cuts_train.jsonl.gz $split_dir + touch $split_dir/.cv-en_train_split.done + fi + fi + + if [ ! -f $split_dir/.multidataset_mix.done ]; then + log "Mix multidataset" + for ((seq=1; seq<=$num_splits; seq++)); do + fseq=$(printf "%04d" $seq) + gunzip -c $split_dir/*.*${fseq}.jsonl.gz | \ + shuf | gzip -c > $split_dir/multidataset/multidataset_cuts_train.${fseq}.jsonl.gz + done + touch $split_dir/.multidataset_mix.done + fi + + touch data/fbank/multidataset_split/.multidataset.done + fi +fi diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py b/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py index 598434f54..f3bd6284e 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py @@ -30,7 +30,7 @@ class GigaSpeech: """ Args: manifest_dir: - It is expected to contain the following files:: + It is expected to contain the following files: - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz - gigaspeech_cuts_L_raw.jsonl.gz diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py b/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py new file mode 100644 index 000000000..dcb4cd141 --- /dev/null +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py @@ -0,0 +1,53 @@ +# Copyright 2023 Xiaomi Corp. (authors: Yifan Yang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import glob +import logging +import re +from pathlib import Path + +import lhotse +from lhotse import CutSet, load_manifest_lazy + + +class MultiDataset: + def __init__(self, manifest_dir: str): + """ + Args: + manifest_dir: + It is expected to contain the following files: + + - multidataset_split_1998/multidataset/multidataset_cuts_train.*.jsonl.gz + """ + self.manifest_dir = Path(manifest_dir) + + def train_cuts(self) -> CutSet: + logging.info("About to get multidataset train cuts") + + filenames = glob.glob( + f"{self.manifest_dir}/multidataset_split_1998/multidataset/multidataset_cuts_train.*.jsonl.gz" + ) + + pattern = re.compile(r"multidataset_cuts_train.([0-9]+).jsonl.gz") + idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames) + idx_filenames = sorted(idx_filenames, key=lambda x: x[0]) + + sorted_filenames = [f[1] for f in idx_filenames] + + logging.info(f"Loading {len(sorted_filenames)} splits") + + return lhotse.combine(lhotse.load_manifest_lazy(p) for p in sorted_filenames) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py index 792a243e5..01c9500ce 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py @@ -1,8 +1,9 @@ #!/usr/bin/env python3 -# Copyright 2021-2022 Xiaomi Corp. (authors: Fangjun Kuang, +# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang, # Wei Kang, -# Mingshuang Luo,) -# Zengwei Yao) +# Mingshuang Luo, +# Zengwei Yao, +# Yifan Yang) # # See ../../../../LICENSE for clarification regarding multiple authors # @@ -59,6 +60,7 @@ import torch import torch.multiprocessing as mp import torch.nn as nn from asr_datamodule import LibriSpeechAsrDataModule +from multidataset import MultiDataset from decoder import Decoder from joiner import Joiner from lhotse.cut import Cut @@ -374,6 +376,13 @@ def get_parser(): help="Whether to use half precision training.", ) + parser.add_argument( + "--use-multidataset", + type=str2bool, + default=False, + help="Whether to use multidataset to train.", + ) + add_model_arguments(parser) return parser @@ -1043,10 +1052,14 @@ def run(rank, world_size, args): librispeech = LibriSpeechAsrDataModule(args) - if params.full_libri: - train_cuts = librispeech.train_all_shuf_cuts() + if params.use_multidataset: + multidataset = MultiDataset(params.manifest_dir) + train_cuts = multidataset.train_cuts() else: - train_cuts = librispeech.train_clean_100_cuts() + if params.full_libri: + train_cuts = librispeech.train_all_shuf_cuts() + else: + train_cuts = librispeech.train_clean_100_cuts() def remove_short_and_long_utt(c: Cut): # Keep only utterances with duration between 1 second and 20 seconds @@ -1058,9 +1071,6 @@ def run(rank, world_size, args): # an utterance duration distribution for your dataset to select # the threshold if c.duration < 1.0 or c.duration > 20.0: - logging.warning( - f"Exclude cut with ID {c.id} from training. Duration: {c.duration}" - ) return False # In pruned RNN-T, we require that T >= S @@ -1102,7 +1112,7 @@ def run(rank, world_size, args): valid_cuts += librispeech.dev_other_cuts() valid_dl = librispeech.valid_dataloaders(valid_cuts) - if not params.print_diagnostics: + if not params.use_multidataset and not params.print_diagnostics: scan_pessimistic_batches_for_oom( model=model, train_dl=train_dl,