diff --git a/egs/librispeech/ASR/prepare_common_voice.sh b/egs/librispeech/ASR/prepare_common_voice.sh deleted file mode 100755 index 6f9c4fb2f..000000000 --- a/egs/librispeech/ASR/prepare_common_voice.sh +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env bash - -set -eou pipefail - -nj=16 -stage=-1 -stop_stage=100 - -# Split data/${lang}set to this number of pieces -# This is to avoid OOM during feature extraction. -num_splits=1000 - -# We assume dl_dir (download dir) contains the following -# directories and files. If not, they will be downloaded -# by this script automatically. -# -# - $dl_dir/$release/$lang -# This directory contains the following files downloaded from -# https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/${release}/${release}-${lang}.tar.gz -# -# - clips -# - dev.tsv -# - invalidated.tsv -# - other.tsv -# - reported.tsv -# - test.tsv -# - train.tsv -# - validated.tsv - -dl_dir=$PWD/download -release=cv-corpus-13.0-2023-03-09 -lang=en - -. shared/parse_options.sh || exit 1 - -# All files generated by this script are saved in "data/${lang}". -# You can safely remove "data/${lang}" and rerun this script to regenerate it. -mkdir -p data/${lang} - -log() { - # This function is from espnet - local fname=${BASH_SOURCE[1]##*/} - echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" -} - -log "dl_dir: $dl_dir" - -if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then - log "Stage 0: Download data" - - # If you have pre-downloaded it to /path/to/$release, - # you can create a symlink - # - # ln -sfv /path/to/$release $dl_dir/$release - # - if [ ! -d $dl_dir/$release/$lang/clips ]; then - lhotse download commonvoice --languages $lang --release $release $dl_dir - fi -fi - -if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - log "Stage 1: Prepare CommonVoice manifest" - # We assume that you have downloaded the CommonVoice corpus - # to $dl_dir/$release - mkdir -p data/${lang}/manifests - if [ ! -e data/${lang}/manifests/.cv-${lang}.done ]; then - lhotse prepare commonvoice --language $lang -j $nj $dl_dir/$release data/${lang}/manifests - touch data/${lang}/manifests/.cv-${lang}.done - fi -fi - -if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 2: Preprocess CommonVoice manifest" - if [ ! -e data/${lang}/fbank/.preprocess_complete ]; then - ./local/preprocess_commonvoice.py --language $lang - touch data/${lang}/fbank/.preprocess_complete - fi -fi - -if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 3: Compute fbank for dev and test subsets of CommonVoice" - mkdir -p data/${lang}/fbank - if [ ! -e data/${lang}/fbank/.cv-${lang}_dev_test.done ]; then - ./local/compute_fbank_commonvoice_dev_test.py --language $lang - touch data/${lang}/fbank/.cv-${lang}_dev_test.done - fi -fi - -if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Split train subset into ${num_splits} pieces" - split_dir=data/${lang}/fbank/cv-${lang}_train_split_${num_splits} - if [ ! -e $split_dir/.cv-${lang}_train_split.done ]; then - lhotse split $num_splits ./data/${lang}/fbank/cv-${lang}_cuts_train_raw.jsonl.gz $split_dir - touch $split_dir/.cv-${lang}_train_split.done - fi -fi - -if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Compute features for train subset of CommonVoice" - if [ ! 
-e data/${lang}/fbank/.cv-${lang}_train.done ]; then - ./local/compute_fbank_commonvoice_splits.py \ - --num-workers $nj \ - --batch-duration 600 \ - --start 0 \ - --num-splits $num_splits \ - --language $lang - touch data/${lang}/fbank/.cv-${lang}_train.done - fi -fi - -if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then - log "Stage 6: Combine features for train" - if [ ! -f data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz ]; then - pieces=$(find data/${lang}/fbank/cv-${lang}_train_split_${num_splits} -name "cv-${lang}_cuts_train.*.jsonl.gz") - lhotse combine $pieces data/${lang}/fbank/cv-${lang}_cuts_train.jsonl.gz - fi -fi diff --git a/egs/librispeech/ASR/prepare_giga_speech.sh b/egs/librispeech/ASR/prepare_giga_speech.sh deleted file mode 100755 index b077aaf3a..000000000 --- a/egs/librispeech/ASR/prepare_giga_speech.sh +++ /dev/null @@ -1,159 +0,0 @@ -#!/usr/bin/env bash - -set -eou pipefail - -nj=15 -stage=-1 -stop_stage=100 - -# We assume dl_dir (download dir) contains the following -# directories and files. If not, they will be downloaded -# by this script automatically. -# -# - $dl_dir/GigaSpeech -# You can find audio, dict, GigaSpeech.json inside it. -# You can apply for the download credentials by following -# https://github.com/SpeechColab/GigaSpeech#download - -# Number of hours for GigaSpeech subsets -# XL 10k hours -# L 2.5k hours -# M 1k hours -# S 250 hours -# XS 10 hours -# DEV 12 hours -# Test 40 hours - -# Split XL subset to this number of pieces -# This is to avoid OOM during feature extraction. -num_splits=2000 -# We use lazy split from lhotse. -# The XL subset (10k hours) contains 37956 cuts without speed perturbing. -# We want to split it into 2000 splits, so each split -# contains about 37956 / 2000 = 19 cuts. As a result, there will be 1998 splits. -chunk_size=19 # number of cuts in each split. The last split may contain fewer cuts. - -dl_dir=$PWD/download - -. shared/parse_options.sh || exit 1 - -# All files generated by this script are saved in "data". -# You can safely remove "data" and rerun this script to regenerate it. -mkdir -p data - -log() { - # This function is from espnet - local fname=${BASH_SOURCE[1]##*/} - echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" -} - -log "dl_dir: $dl_dir" - -if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then - log "Stage 0: Download data" - - [ ! -e $dl_dir/GigaSpeech ] && mkdir -p $dl_dir/GigaSpeech - - # If you have pre-downloaded it to /path/to/GigaSpeech, - # you can create a symlink - # - # ln -sfv /path/to/GigaSpeech $dl_dir/GigaSpeech - # - if [ ! -d $dl_dir/GigaSpeech/audio ] && [ ! -f $dl_dir/GigaSpeech.json ]; then - # Check credentials. - if [ ! -f $dl_dir/password ]; then - echo -n "$0: Please apply for the download credentials by following" - echo -n "https://github.com/SpeechColab/GigaSpeech#dataset-download" - echo " and save it to $dl_dir/password." - exit 1; - fi - PASSWORD=`cat $dl_dir/password 2>/dev/null` - if [ -z "$PASSWORD" ]; then - echo "$0: Error, $dl_dir/password is empty." - exit 1; - fi - PASSWORD_MD5=`echo $PASSWORD | md5sum | cut -d ' ' -f 1` - if [[ $PASSWORD_MD5 != "dfbf0cde1a3ce23749d8d81e492741b8" ]]; then - echo "$0: Error, invalid $dl_dir/password." - exit 1; - fi - # Download XL, DEV and TEST sets by default. 
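# (Aside: the command below requests every GigaSpeech subset via repeated
# --subset flags, not only XL, DEV and TEST as the comment above says.
# To smoke-test the credentials and the download path first, the same
# invocation can be limited to the small subsets, for example
#
#   lhotse download gigaspeech \
#     --subset XS --subset DEV --subset TEST \
#     --host tsinghua \
#     $dl_dir/password $dl_dir/GigaSpeech
#
# before committing to the full 10k-hour XL download.)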
- lhotse download gigaspeech \ - --subset XL \ - --subset L \ - --subset M \ - --subset S \ - --subset XS \ - --subset DEV \ - --subset TEST \ - --host tsinghua \ - $dl_dir/password $dl_dir/GigaSpeech - fi -fi - -if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - log "Stage 1: Prepare GigaSpeech manifest (may take 30 minutes)" - # We assume that you have downloaded the GigaSpeech corpus - # to $dl_dir/GigaSpeech - if [ ! -f data/manifests/.gigaspeech.done ]; then - mkdir -p data/manifests - lhotse prepare gigaspeech \ - --subset XL \ - --subset L \ - --subset M \ - --subset S \ - --subset XS \ - --subset DEV \ - --subset TEST \ - -j $nj \ - $dl_dir/GigaSpeech data/manifests - touch data/manifests/.gigaspeech.done - fi -fi - -if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 2: Preprocess GigaSpeech manifest" - if [ ! -f data/fbank/.gigaspeech_preprocess.done ]; then - log "It may take 2 hours for this stage" - ./local/preprocess_gigaspeech.py - touch data/fbank/.gigaspeech_preprocess.done - fi -fi - -if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 3: Compute features for DEV and TEST subsets of GigaSpeech (may take 2 minutes)" - if [ ! -f data/fbank/.gigaspeech_dev_test.done ]; then - ./local/compute_fbank_gigaspeech_dev_test.py - touch data/fbank/.gigaspeech_dev_test.done - fi -fi - -if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Split XL subset into ${num_splits} pieces" - split_dir=data/fbank/gigaspeech_XL_split_${num_splits} - if [ ! -f $split_dir/.gigaspeech_XL_split.done ]; then - lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size - touch $split_dir/.gigaspeech_XL_split.done - fi -fi - -if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Compute features for XL" - # Note: The script supports --start and --stop options. - # You can use several machines to compute the features in parallel. - if [ ! -f data/fbank/.gigaspeech_XL.done ]; then - ./local/compute_fbank_gigaspeech_splits.py \ - --num-workers $nj \ - --batch-duration 600 \ - --num-splits $num_splits - touch data/fbank/.gigaspeech_XL.done - fi -fi - -if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then - log "Stage 6: Combine features for XL (may take 15 hours)" - if [ ! -f data/fbank/gigaspeech_cuts_XL.jsonl.gz ]; then - pieces=$(find data/fbank/gigaspeech_XL_split_${num_splits} -name "gigaspeech_cuts_XL.*.jsonl.gz") - lhotse combine $pieces data/fbank/gigaspeech_cuts_XL.jsonl.gz - fi -fi diff --git a/egs/librispeech/ASR/prepare_multidataset.sh b/egs/librispeech/ASR/prepare_multidataset.sh deleted file mode 100755 index c95b4d039..000000000 --- a/egs/librispeech/ASR/prepare_multidataset.sh +++ /dev/null @@ -1,330 +0,0 @@ -#!/usr/bin/env bash - -# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674 -export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - -set -eou pipefail - -nj=16 -stage=-1 -stop_stage=100 - -# We assume dl_dir (download dir) contains the following -# directories and files. If not, they will be downloaded -# by this script automatically. -# -# - $dl_dir/LibriSpeech -# You can find BOOKS.TXT, test-clean, train-clean-360, etc, inside it. 
-# You can download them from https://www.openslr.org/12 -# -# - $dl_dir/lm -# This directory contains the following files downloaded from -# http://www.openslr.org/resources/11 -# -# - 3-gram.pruned.1e-7.arpa.gz -# - 3-gram.pruned.1e-7.arpa -# - 4-gram.arpa.gz -# - 4-gram.arpa -# - librispeech-vocab.txt -# - librispeech-lexicon.txt -# - librispeech-lm-norm.txt.gz -# -# - $dl_dir/musan -# This directory contains the following directories downloaded from -# http://www.openslr.org/17/ -# -# - music -# - noise -# - speech - -# Split all dataset to this number of pieces and mix each dataset pieces -# into multidataset pieces with shuffling. -num_splits=1998 - -dl_dir=$PWD/download - -. shared/parse_options.sh || exit 1 - -# vocab size for sentence piece models. -# It will generate data/lang_bpe_xxx, -# data/lang_bpe_yyy if the array contains xxx, yyy -vocab_sizes=( - # 5000 - # 2000 - # 1000 - 500 -) - -# multidataset list. -# LibriSpeech and musan are required. -# The others are optional. -multidataset=( - "gigaspeech", - "commonvoice", -) - -# All files generated by this script are saved in "data". -# You can safely remove "data" and rerun this script to regenerate it. -mkdir -p data - -log() { - # This function is from espnet - local fname=${BASH_SOURCE[1]##*/} - echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" -} - -log "dl_dir: $dl_dir" - -log "Dataset: LibriSpeech and musan" -if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then - log "Stage -1: Download LM" - mkdir -p $dl_dir/lm - if [ ! -e $dl_dir/lm/.done ]; then - ./local/download_lm.py --out-dir=$dl_dir/lm - touch $dl_dir/lm/.done - fi -fi - -if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then - log "Stage 0: Download data" - - # If you have pre-downloaded it to /path/to/LibriSpeech, - # you can create a symlink - # - # ln -sfv /path/to/LibriSpeech $dl_dir/LibriSpeech - # - if [ ! -d $dl_dir/LibriSpeech/train-other-500 ]; then - lhotse download librispeech --full $dl_dir - fi - - # If you have pre-downloaded it to /path/to/musan, - # you can create a symlink - # - # ln -sfv /path/to/musan $dl_dir/ - # - if [ ! -d $dl_dir/musan ]; then - lhotse download musan $dl_dir - fi -fi - -if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - log "Stage 1: Prepare LibriSpeech manifest" - # We assume that you have downloaded the LibriSpeech corpus - # to $dl_dir/LibriSpeech - mkdir -p data/manifests - if [ ! -e data/manifests/.librispeech.done ]; then - lhotse prepare librispeech -j $nj $dl_dir/LibriSpeech data/manifests - touch data/manifests/.librispeech.done - fi -fi - -if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 2: Prepare musan manifest" - # We assume that you have downloaded the musan corpus - # to data/musan - mkdir -p data/manifests - if [ ! -e data/manifests/.musan.done ]; then - lhotse prepare musan $dl_dir/musan data/manifests - touch data/manifests/.musan.done - fi -fi - -if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 3: Compute fbank for librispeech" - mkdir -p data/fbank - if [ ! -e data/fbank/.librispeech.done ]; then - ./local/compute_fbank_librispeech.py --perturb-speed False - touch data/fbank/.librispeech.done - fi - - if [ ! 
-f data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz ]; then - cat <(gunzip -c data/fbank/librispeech_cuts_train-clean-100.jsonl.gz) \ - <(gunzip -c data/fbank/librispeech_cuts_train-clean-360.jsonl.gz) \ - <(gunzip -c data/fbank/librispeech_cuts_train-other-500.jsonl.gz) | \ - shuf | gzip -c > data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz - fi - - if [ ! -e data/fbank/.librispeech-validated.done ]; then - log "Validating data/fbank for LibriSpeech" - parts=( - train-clean-100 - train-clean-360 - train-other-500 - test-clean - test-other - dev-clean - dev-other - ) - for part in ${parts[@]}; do - python3 ./local/validate_manifest.py \ - data/fbank/librispeech_cuts_${part}.jsonl.gz - done - touch data/fbank/.librispeech-validated.done - fi -fi - -if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Compute fbank for musan" - mkdir -p data/fbank - if [ ! -e data/fbank/.musan.done ]; then - ./local/compute_fbank_musan.py - touch data/fbank/.musan.done - fi -fi - -if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Prepare phone based lang" - lang_dir=data/lang_phone - mkdir -p $lang_dir - - (echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; ) | - cat - $dl_dir/lm/librispeech-lexicon.txt | - sort | uniq > $lang_dir/lexicon.txt - - if [ ! -f $lang_dir/L_disambig.pt ]; then - ./local/prepare_lang.py --lang-dir $lang_dir - fi - - if [ ! -f $lang_dir/L.fst ]; then - log "Converting L.pt to L.fst" - ./shared/convert-k2-to-openfst.py \ - --olabels aux_labels \ - $lang_dir/L.pt \ - $lang_dir/L.fst - fi - - if [ ! -f $lang_dir/L_disambig.fst ]; then - log "Converting L_disambig.pt to L_disambig.fst" - ./shared/convert-k2-to-openfst.py \ - --olabels aux_labels \ - $lang_dir/L_disambig.pt \ - $lang_dir/L_disambig.fst - fi -fi - -if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then - log "Stage 6: Prepare BPE based lang" - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - mkdir -p $lang_dir - # We reuse words.txt from phone based lexicon - # so that the two can share G.pt later. - cp data/lang_phone/words.txt $lang_dir - - if [ ! -f $lang_dir/transcript_words.txt ]; then - log "Generate data for BPE training" - files=$( - find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt" - ) - for f in ${files[@]}; do - cat $f | cut -d " " -f 2- - done > $lang_dir/transcript_words.txt - fi - - if [ ! -f $lang_dir/bpe.model ]; then - ./local/train_bpe_model.py \ - --lang-dir $lang_dir \ - --vocab-size $vocab_size \ - --transcript $lang_dir/transcript_words.txt - fi - - if [ ! -f $lang_dir/L_disambig.pt ]; then - ./local/prepare_lang_bpe.py --lang-dir $lang_dir - - log "Validating $lang_dir/lexicon.txt" - ./local/validate_bpe_lexicon.py \ - --lexicon $lang_dir/lexicon.txt \ - --bpe-model $lang_dir/bpe.model - fi - - if [ ! -f $lang_dir/L.fst ]; then - log "Converting L.pt to L.fst" - ./shared/convert-k2-to-openfst.py \ - --olabels aux_labels \ - $lang_dir/L.pt \ - $lang_dir/L.fst - fi - - if [ ! -f $lang_dir/L_disambig.fst ]; then - log "Converting L_disambig.pt to L_disambig.fst" - ./shared/convert-k2-to-openfst.py \ - --olabels aux_labels \ - $lang_dir/L_disambig.pt \ - $lang_dir/L_disambig.fst - fi - done -fi - -if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then - log "Stage 7: Prepare G" - # We assume you have install kaldilm, if not, please install - # it using: pip install kaldilm - - mkdir -p data/lm - if [ ! 
-f data/lm/G_3_gram.fst.txt ]; then - # It is used in building HLG - python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $dl_dir/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt - fi - - if [ ! -f data/lm/G_4_gram.fst.txt ]; then - # It is used for LM rescoring - python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ - --disambig-symbol='#0' \ - --max-order=4 \ - $dl_dir/lm/4-gram.arpa > data/lm/G_4_gram.fst.txt - fi -fi - -if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then - log "Stage 8: Compile HLG" - ./local/compile_hlg.py --lang-dir data/lang_phone - - # Note If ./local/compile_hlg.py throws OOM, - # please switch to the following command - # - # ./local/compile_hlg_using_openfst.py --lang-dir data/lang_phone - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - ./local/compile_hlg.py --lang-dir $lang_dir - - # Note If ./local/compile_hlg.py throws OOM, - # please switch to the following command - # - # ./local/compile_hlg_using_openfst.py --lang-dir $lang_dir - done -fi - -# Compile LG for RNN-T fast_beam_search decoding -if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then - log "Stage 9: Compile LG" - ./local/compile_lg.py --lang-dir data/lang_phone - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - ./local/compile_lg.py --lang-dir $lang_dir - done -fi - -if [ $stage -le 10 ] && [ $stop_stage -ge 10 ]; then - log "Stage 10: Prepare the other datasets" - # GigaSpeech - if [[ "${multidataset[@]}" =~ "gigaspeech" ]]; then - log "Dataset: GigaSpeech" - ./prepare_giga_speech.sh --stop_stage 5 - fi - - # CommonVoice - if [[ "${multidataset[@]}" =~ "commonvoice" ]]; then - log "Dataset: CommonVoice" - ./prepare_common_voice.sh - fi -fi diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py b/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py deleted file mode 100644 index 07c7126fa..000000000 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright 2023 Xiaomi Corp. (authors: Yifan Yang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
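Before its removal, the MultiDataset helper below was consumed from pruned_transducer_stateless7/train.py (see the train.py hunk later in this patch) roughly as in the following sketch. The directory values are illustrative assumptions based on the docstring and the prepare scripts above, not literals taken from the patch:

    from multidataset import MultiDataset  # the module deleted below

    multidataset = MultiDataset(
        manifest_dir="data/fbank",        # librispeech_cuts_train-all-shuf.jsonl.gz, gigaspeech_XL_split_2000/
        cv_manifest_dir="data/en/fbank",  # cv-en_cuts_train.jsonl.gz
    )
    train_cuts = multidataset.train_cuts()  # CutSet.mux over LibriSpeech, GigaSpeech XL and CommonVoice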
- - -import glob -import logging -import re -from pathlib import Path - -import lhotse -from lhotse import CutSet, load_manifest_lazy - - -class MultiDataset: - def __init__(self, manifest_dir: str, cv_manifest_dir: str): - """ - Args: - manifest_dir: - It is expected to contain the following files: - - - librispeech_cuts_train-all-shuf.jsonl.gz - - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz - - cv_manifest_dir: - It is expected to contain the following files: - - - cv-en_cuts_train.jsonl.gz - """ - self.manifest_dir = Path(manifest_dir) - self.cv_manifest_dir = Path(cv_manifest_dir) - - def train_cuts(self) -> CutSet: - logging.info("About to get multidataset train cuts") - - # LibriSpeech - logging.info(f"Loading LibriSpeech in lazy mode") - librispeech_cuts = load_manifest_lazy( - self.manifest_dir / "librispeech_cuts_train-all-shuf.jsonl.gz" - ) - - # GigaSpeech - filenames = glob.glob( - f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz" - ) - - pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz") - idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames) - idx_filenames = sorted(idx_filenames, key=lambda x: x[0]) - - sorted_filenames = [f[1] for f in idx_filenames] - - logging.info(f"Loading GigaSpeech {len(sorted_filenames)} splits in lazy mode") - - gigaspeech_cuts = lhotse.combine( - lhotse.load_manifest_lazy(p) for p in sorted_filenames - ) - - # CommonVoice - logging.info(f"Loading CommonVoice in lazy mode") - commonvoice_cuts = load_manifest_lazy( - self.cv_manifest_dir / f"cv-en_cuts_train.jsonl.gz" - ) - - return CutSet.mux(librispeech_cuts, gigaspeech_cuts, commonvoice_cuts) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py index 5be8c481d..a67e5174e 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py @@ -66,7 +66,6 @@ from lhotse.cut import Cut from lhotse.dataset.sampling.base import CutSampler from lhotse.utils import fix_random_seed from model import Transducer -from multidataset import MultiDataset from optim import Eden, ScaledAdam from torch import Tensor from torch.cuda.amp import GradScaler @@ -376,13 +375,6 @@ def get_parser(): help="Whether to use half precision training.", ) - parser.add_argument( - "--use-multidataset", - type=str2bool, - default=False, - help="Whether to use multidataset to train.", - ) - add_model_arguments(parser) return parser @@ -1042,16 +1034,12 @@ def run(rank, world_size, args): librispeech = LibriSpeechAsrDataModule(args) - if params.use_multidataset: - multidataset = MultiDataset(params.manifest_dir, params.cv_manifest_dir) - train_cuts = multidataset.train_cuts() + if params.mini_libri: + train_cuts = librispeech.train_clean_5_cuts() + elif params.full_libri: + train_cuts = librispeech.train_all_shuf_cuts() else: - if params.mini_libri: - train_cuts = librispeech.train_clean_5_cuts() - elif params.full_libri: - train_cuts = librispeech.train_all_shuf_cuts() - else: - train_cuts = librispeech.train_clean_100_cuts() + train_cuts = librispeech.train_clean_100_cuts() def remove_short_and_long_utt(c: Cut): # Keep only utterances with duration between 1 second and 20 seconds @@ -1107,7 +1095,7 @@ def run(rank, world_size, args): valid_cuts += librispeech.dev_other_cuts() valid_dl = librispeech.valid_dataloaders(valid_cuts) - if not params.use_multidataset and not params.print_diagnostics: + if not 
params.print_diagnostics: scan_pessimistic_batches_for_oom( model=model, train_dl=train_dl, diff --git a/egs/librispeech/ASR/zipformer/train.py b/egs/librispeech/ASR/zipformer/train.py index 1f0741ba4..9788220c9 100755 --- a/egs/librispeech/ASR/zipformer/train.py +++ b/egs/librispeech/ASR/zipformer/train.py @@ -62,20 +62,20 @@ import torch import torch.multiprocessing as mp import torch.nn as nn from asr_datamodule import LibriSpeechAsrDataModule -from zipformer import Zipformer2 -from scaling import ScheduledFloat from decoder import Decoder from joiner import Joiner -from subsampling import Conv2dSubsampling from lhotse.cut import Cut from lhotse.dataset.sampling.base import CutSampler from lhotse.utils import fix_random_seed from model import Transducer from optim import Eden, ScaledAdam +from scaling import ScheduledFloat +from subsampling import Conv2dSubsampling from torch import Tensor from torch.cuda.amp import GradScaler from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils.tensorboard import SummaryWriter +from zipformer import Zipformer2 from icefall import diagnostics from icefall.checkpoint import load_checkpoint, remove_checkpoints @@ -84,40 +84,38 @@ from icefall.checkpoint import ( save_checkpoint_with_global_batch_idx, update_averaged_model, ) -from icefall.hooks import register_inf_check_hooks from icefall.dist import cleanup_dist, setup_dist from icefall.env import get_env_info +from icefall.hooks import register_inf_check_hooks from icefall.utils import ( AttributeDict, MetricsTracker, + get_parameter_groups_with_lrs, setup_logger, str2bool, - get_parameter_groups_with_lrs ) -LRSchedulerType = Union[ - torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler -] +LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler] -def get_adjusted_batch_count( - params: AttributeDict) -> float: +def get_adjusted_batch_count(params: AttributeDict) -> float: # returns the number of batches we would have used so far if we had used the reference # duration. This is for purposes of set_batch_count(). - return (params.batch_idx_train * (params.max_duration * params.world_size) / - params.ref_duration) + return ( + params.batch_idx_train + * (params.max_duration * params.world_size) + / params.ref_duration + ) -def set_batch_count( - model: Union[nn.Module, DDP], batch_count: float -) -> None: +def set_batch_count(model: Union[nn.Module, DDP], batch_count: float) -> None: if isinstance(model, DDP): # get underlying nn.Module model = model.module for name, module in model.named_modules(): - if hasattr(module, 'batch_count'): + if hasattr(module, "batch_count"): module.batch_count = batch_count - if hasattr(module, 'name'): + if hasattr(module, "name"): module.name = name @@ -154,35 +152,35 @@ def add_model_arguments(parser: argparse.ArgumentParser): "--encoder-dim", type=str, default="192,256,384,512,384,256", - help="Embedding dimension in encoder stacks: a single int or comma-separated list." + help="Embedding dimension in encoder stacks: a single int or comma-separated list.", ) parser.add_argument( "--query-head-dim", type=str, default="32", - help="Query/key dimension per head in encoder stacks: a single int or comma-separated list." + help="Query/key dimension per head in encoder stacks: a single int or comma-separated list.", ) parser.add_argument( "--value-head-dim", type=str, default="12", - help="Value dimension per head in encoder stacks: a single int or comma-separated list." 
+ help="Value dimension per head in encoder stacks: a single int or comma-separated list.", ) parser.add_argument( "--pos-head-dim", type=str, default="4", - help="Positional-encoding dimension per head in encoder stacks: a single int or comma-separated list." + help="Positional-encoding dimension per head in encoder stacks: a single int or comma-separated list.", ) parser.add_argument( "--pos-dim", type=int, default="48", - help="Positional-encoding embedding dimension" + help="Positional-encoding embedding dimension", ) parser.add_argument( @@ -190,7 +188,7 @@ def add_model_arguments(parser: argparse.ArgumentParser): type=str, default="192,192,256,256,256,192", help="Unmasked dimensions in the encoders, relates to augmentation during training. " - "A single int or comma-separated list. Must be <= each corresponding encoder_dim." + "A single int or comma-separated list. Must be <= each corresponding encoder_dim.", ) parser.add_argument( @@ -230,7 +228,7 @@ def add_model_arguments(parser: argparse.ArgumentParser): type=str, default="16,32,64,-1", help="Chunk sizes (at 50Hz frame rate) will be chosen randomly from this list during training. " - " Must be just -1 if --causal=False" + " Must be just -1 if --causal=False", ) parser.add_argument( @@ -239,7 +237,7 @@ def add_model_arguments(parser: argparse.ArgumentParser): default="64,128,256,-1", help="Maximum left-contexts for causal training, measured in frames which will " "be converted to a number of chunks. If splitting into chunks, " - "chunk left-context frames will be chosen randomly from this list; else not relevant." + "chunk left-context frames will be chosen randomly from this list; else not relevant.", ) @@ -313,10 +311,7 @@ def get_parser(): ) parser.add_argument( - "--base-lr", - type=float, - default=0.045, - help="The base learning rate." + "--base-lr", type=float, default=0.045, help="The base learning rate." ) parser.add_argument( @@ -340,15 +335,14 @@ def get_parser(): type=float, default=600, help="Reference batch duration for purposes of adjusting batch counts for setting various " - "schedules inside the model" + "schedules inside the model", ) parser.add_argument( "--context-size", type=int, default=2, - help="The context size in the decoder. 1 means bigram; " - "2 means tri-gram", + help="The context size in the decoder. 
1 means bigram; " "2 means tri-gram", ) parser.add_argument( @@ -371,8 +365,7 @@ def get_parser(): "--am-scale", type=float, default=0.0, - help="The scale to smooth the loss with am (output of encoder network)" - "part.", + help="The scale to smooth the loss with am (output of encoder network)" "part.", ) parser.add_argument( @@ -522,7 +515,7 @@ def get_params() -> AttributeDict: def _to_int_tuple(s: str): - return tuple(map(int, s.split(','))) + return tuple(map(int, s.split(","))) def get_encoder_embed(params: AttributeDict) -> nn.Module: @@ -537,7 +530,7 @@ def get_encoder_embed(params: AttributeDict) -> nn.Module: encoder_embed = Conv2dSubsampling( in_channels=params.feature_dim, out_channels=_to_int_tuple(params.encoder_dim)[0], - dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)) + dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)), ) return encoder_embed @@ -596,7 +589,7 @@ def get_transducer_model(params: AttributeDict) -> nn.Module: encoder=encoder, decoder=decoder, joiner=joiner, - encoder_dim=int(max(params.encoder_dim.split(','))), + encoder_dim=int(max(params.encoder_dim.split(","))), decoder_dim=params.decoder_dim, joiner_dim=params.joiner_dim, vocab_size=params.vocab_size, @@ -745,11 +738,7 @@ def compute_loss( warmup: a floating point value which increases throughout training; values >= 1.0 are fully warmed up and have all modules present. """ - device = ( - model.device - if isinstance(model, DDP) - else next(model.parameters()).device - ) + device = model.device if isinstance(model, DDP) else next(model.parameters()).device feature = batch["inputs"] # at entry, feature is (N, T, C) assert feature.ndim == 3 @@ -779,27 +768,24 @@ def compute_loss( # take down the scale on the simple loss from 1.0 at the start # to params.simple_loss scale by warm_step. simple_loss_scale = ( - s if batch_idx_train >= warm_step + s + if batch_idx_train >= warm_step else 1.0 - (batch_idx_train / warm_step) * (1.0 - s) ) pruned_loss_scale = ( - 1.0 if batch_idx_train >= warm_step + 1.0 + if batch_idx_train >= warm_step else 0.1 + 0.9 * (batch_idx_train / warm_step) ) - loss = ( - simple_loss_scale * simple_loss + - pruned_loss_scale * pruned_loss - ) + loss = simple_loss_scale * simple_loss + pruned_loss_scale * pruned_loss assert loss.requires_grad == is_training info = MetricsTracker() with warnings.catch_warnings(): warnings.simplefilter("ignore") - info["frames"] = ( - (feature_lens // params.subsampling_factor).sum().item() - ) + info["frames"] = (feature_lens // params.subsampling_factor).sum().item() # Note: We use reduction=sum while computing the loss. 
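# (Worked example for the warm-up scaling a few lines above, assuming an
# illustrative simple loss scale s=0.5 and warm_step=2000; both are
# configurable parameters, so the numbers are only for orientation:
#
#     def loss_scales(batch_idx_train, s=0.5, warm_step=2000):
#         simple = s if batch_idx_train >= warm_step else 1.0 - (batch_idx_train / warm_step) * (1.0 - s)
#         pruned = 1.0 if batch_idx_train >= warm_step else 0.1 + 0.9 * (batch_idx_train / warm_step)
#         return simple, pruned
#
#     loss_scales(0)    == (1.0, 0.1)     rely mostly on the simple loss at the start
#     loss_scales(1000) == (0.75, 0.55)   halfway through warm-up
#     loss_scales(2000) == (0.5, 1.0)     fully warmed up)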
info["loss"] = loss.detach().cpu().item() @@ -895,15 +881,17 @@ def train_one_epoch( saved_bad_model = False def save_bad_model(suffix: str = ""): - save_checkpoint_impl(filename=params.exp_dir / f"bad-model{suffix}-{rank}.pt", - model=model, - model_avg=model_avg, - params=params, - optimizer=optimizer, - scheduler=scheduler, - sampler=train_dl.sampler, - scaler=scaler, - rank=0) + save_checkpoint_impl( + filename=params.exp_dir / f"bad-model{suffix}-{rank}.pt", + model=model, + model_avg=model_avg, + params=params, + optimizer=optimizer, + scheduler=scheduler, + sampler=train_dl.sampler, + scaler=scaler, + rank=0, + ) for batch_idx, batch in enumerate(train_dl): if batch_idx % 10 == 0: @@ -988,7 +976,9 @@ def train_one_epoch( logging.warning(f"Grad scale is small: {cur_grad_scale}") if cur_grad_scale < 1.0e-05: save_bad_model() - raise RuntimeError(f"grad_scale is too small, exiting: {cur_grad_scale}") + raise RuntimeError( + f"grad_scale is too small, exiting: {cur_grad_scale}" + ) if batch_idx % params.log_interval == 0: cur_lr = max(scheduler.get_last_lr()) @@ -998,8 +988,8 @@ def train_one_epoch( f"Epoch {params.cur_epoch}, " f"batch {batch_idx}, loss[{loss_info}], " f"tot_loss[{tot_loss}], batch size: {batch_size}, " - f"lr: {cur_lr:.2e}, " + - (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "") + f"lr: {cur_lr:.2e}, " + + (f"grad_scale: {scaler._scale.item()}" if params.use_fp16 else "") ) if tb_writer is not None: @@ -1010,9 +1000,7 @@ def train_one_epoch( loss_info.write_summary( tb_writer, "train/current_", params.batch_idx_train ) - tot_loss.write_summary( - tb_writer, "train/tot_", params.batch_idx_train - ) + tot_loss.write_summary(tb_writer, "train/tot_", params.batch_idx_train) if params.use_fp16: tb_writer.add_scalar( "train/grad_scale", cur_grad_scale, params.batch_idx_train @@ -1029,7 +1017,9 @@ def train_one_epoch( ) model.train() logging.info(f"Epoch {params.cur_epoch}, validation: {valid_info}") - logging.info(f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB") + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) if tb_writer is not None: valid_info.write_summary( tb_writer, "train/valid_", params.batch_idx_train @@ -1103,13 +1093,11 @@ def run(rank, world_size, args): model.to(device) if world_size > 1: logging.info("Using DDP") - model = DDP(model, device_ids=[rank], - find_unused_parameters=True) + model = DDP(model, device_ids=[rank], find_unused_parameters=True) optimizer = ScaledAdam( - get_parameter_groups_with_lrs( - model, lr=params.base_lr, include_names=True), - lr=params.base_lr, # should have no effect + get_parameter_groups_with_lrs(model, lr=params.base_lr, include_names=True), + lr=params.base_lr, # should have no effect clipping_scale=2.0, ) @@ -1129,7 +1117,7 @@ def run(rank, world_size, args): if params.print_diagnostics: opts = diagnostics.TensorDiagnosticOptions( - 2 ** 22 + 2**22 ) # allow 4 megabytes per sub-module diagnostic = diagnostics.attach_diagnostics(model, opts) @@ -1153,9 +1141,9 @@ def run(rank, world_size, args): # an utterance duration distribution for your dataset to select # the threshold if c.duration < 1.0 or c.duration > 20.0: - logging.warning( - f"Exclude cut with ID {c.id} from training. Duration: {c.duration}" - ) + # logging.warning( + # f"Exclude cut with ID {c.id} from training. 
Duration: {c.duration}" + # ) return False # In pruned RNN-T, we require that T >= S @@ -1206,8 +1194,7 @@ def run(rank, world_size, args): params=params, ) - scaler = GradScaler(enabled=params.use_fp16, - init_scale=1.0) + scaler = GradScaler(enabled=params.use_fp16, init_scale=1.0) if checkpoints and "grad_scaler" in checkpoints: logging.info("Loading grad scaler state dict") scaler.load_state_dict(checkpoints["grad_scaler"]) @@ -1328,7 +1315,9 @@ def scan_pessimistic_batches_for_oom( ) display_and_save_batch(batch, params=params, sp=sp) raise - logging.info(f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB") + logging.info( + f"Maximum memory allocated so far is {torch.cuda.max_memory_allocated()//1000000}MB" + ) def main():
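One of the hunks above comments out the per-cut warning in remove_short_and_long_utt while keeping the duration-based filtering itself. A minimal sketch of how such a predicate is applied to a lhotse CutSet in isolation (the manifest path and the explicit .filter() call are illustrative; in train.py the cuts come from the data module):

    from lhotse import load_manifest_lazy

    def remove_short_and_long_utt(c) -> bool:
        # Keep only utterances with a duration between 1 and 20 seconds.
        return 1.0 <= c.duration <= 20.0

    cuts = load_manifest_lazy("data/fbank/librispeech_cuts_train-clean-100.jsonl.gz")
    cuts = cuts.filter(remove_short_and_long_utt)  # lazy; evaluated while iterating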