diff --git a/egs/librispeech/ASR/prepare_multidataset.sh b/egs/librispeech/ASR/prepare_multidataset.sh deleted file mode 100755 index f7183a2ea..000000000 --- a/egs/librispeech/ASR/prepare_multidataset.sh +++ /dev/null @@ -1,340 +0,0 @@ -#!/usr/bin/env bash - -# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674 -export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - -set -eou pipefail - -nj=16 -stage=-1 -stop_stage=100 - -# We assume dl_dir (download dir) contains the following -# directories and files. If not, they will be downloaded -# by this script automatically. -# -# - $dl_dir/LibriSpeech -# You can find BOOKS.TXT, test-clean, train-clean-360, etc, inside it. -# You can download them from https://www.openslr.org/12 -# -# - $dl_dir/lm -# This directory contains the following files downloaded from -# http://www.openslr.org/resources/11 -# -# - 3-gram.pruned.1e-7.arpa.gz -# - 3-gram.pruned.1e-7.arpa -# - 4-gram.arpa.gz -# - 4-gram.arpa -# - librispeech-vocab.txt -# - librispeech-lexicon.txt -# - librispeech-lm-norm.txt.gz -# -# - $dl_dir/musan -# This directory contains the following directories downloaded from -# http://www.openslr.org/17/ -# -# - music -# - noise -# - speech - -dl_dir=$PWD/download - -. shared/parse_options.sh || exit 1 - -# vocab size for sentence piece models. -# It will generate data/lang_bpe_xxx, -# data/lang_bpe_yyy if the array contains xxx, yyy -vocab_sizes=( - # 5000 - # 2000 - # 1000 - 500 -) - -# multidataset list. -# LibriSpeech and musan are required. -# The others are optional. -multidataset=( - "gigaspeech", - "commonvoice", - "peoples_speech", -) - -# All files generated by this script are saved in "data". -# You can safely remove "data" and rerun this script to regenerate it. -mkdir -p data - -log() { - # This function is from espnet - local fname=${BASH_SOURCE[1]##*/} - echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" -} - -log "dl_dir: $dl_dir" - -log "Dataset: LibriSpeech and musan" -if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then - log "Stage -1: Download LM" - mkdir -p $dl_dir/lm - if [ ! -e $dl_dir/lm/.done ]; then - ./local/download_lm.py --out-dir=$dl_dir/lm - touch $dl_dir/lm/.done - fi -fi - -if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then - log "Stage 0: Download data" - - # If you have pre-downloaded it to /path/to/LibriSpeech, - # you can create a symlink - # - # ln -sfv /path/to/LibriSpeech $dl_dir/LibriSpeech - # - if [ ! -d $dl_dir/LibriSpeech/train-other-500 ]; then - lhotse download librispeech --full $dl_dir - fi - - # If you have pre-downloaded it to /path/to/musan, - # you can create a symlink - # - # ln -sfv /path/to/musan $dl_dir/ - # - if [ ! -d $dl_dir/musan ]; then - lhotse download musan $dl_dir - fi -fi - -if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - log "Stage 1: Prepare LibriSpeech manifest" - # We assume that you have downloaded the LibriSpeech corpus - # to $dl_dir/LibriSpeech - mkdir -p data/manifests - if [ ! -e data/manifests/.librispeech.done ]; then - lhotse prepare librispeech -j $nj $dl_dir/LibriSpeech data/manifests - touch data/manifests/.librispeech.done - fi -fi - -if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 2: Prepare musan manifest" - # We assume that you have downloaded the musan corpus - # to data/musan - mkdir -p data/manifests - if [ ! -e data/manifests/.musan.done ]; then - lhotse prepare musan $dl_dir/musan data/manifests - touch data/manifests/.musan.done - fi -fi - -if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 3: Compute fbank for librispeech" - mkdir -p data/fbank - if [ ! -e data/fbank/.librispeech.done ]; then - ./local/compute_fbank_librispeech.py --perturb-speed False - touch data/fbank/.librispeech.done - fi - - if [ ! -f data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz ]; then - cat <(gunzip -c data/fbank/librispeech_cuts_train-clean-100.jsonl.gz) \ - <(gunzip -c data/fbank/librispeech_cuts_train-clean-360.jsonl.gz) \ - <(gunzip -c data/fbank/librispeech_cuts_train-other-500.jsonl.gz) | \ - shuf | gzip -c > data/fbank/librispeech_cuts_train-all-shuf.jsonl.gz - fi - - if [ ! -e data/fbank/.librispeech-validated.done ]; then - log "Validating data/fbank for LibriSpeech" - parts=( - train-clean-100 - train-clean-360 - train-other-500 - test-clean - test-other - dev-clean - dev-other - ) - for part in ${parts[@]}; do - python3 ./local/validate_manifest.py \ - data/fbank/librispeech_cuts_${part}.jsonl.gz - done - touch data/fbank/.librispeech-validated.done - fi -fi - -if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Compute fbank for musan" - mkdir -p data/fbank - if [ ! -e data/fbank/.musan.done ]; then - ./local/compute_fbank_musan.py - touch data/fbank/.musan.done - fi -fi - -if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Prepare phone based lang" - lang_dir=data/lang_phone - mkdir -p $lang_dir - - (echo '!SIL SIL'; echo ' SPN'; echo ' SPN'; ) | - cat - $dl_dir/lm/librispeech-lexicon.txt | - sort | uniq > $lang_dir/lexicon.txt - - if [ ! -f $lang_dir/L_disambig.pt ]; then - ./local/prepare_lang.py --lang-dir $lang_dir - fi - - if [ ! -f $lang_dir/L.fst ]; then - log "Converting L.pt to L.fst" - ./shared/convert-k2-to-openfst.py \ - --olabels aux_labels \ - $lang_dir/L.pt \ - $lang_dir/L.fst - fi - - if [ ! -f $lang_dir/L_disambig.fst ]; then - log "Converting L_disambig.pt to L_disambig.fst" - ./shared/convert-k2-to-openfst.py \ - --olabels aux_labels \ - $lang_dir/L_disambig.pt \ - $lang_dir/L_disambig.fst - fi -fi - -if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then - log "Stage 6: Prepare BPE based lang" - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - mkdir -p $lang_dir - # We reuse words.txt from phone based lexicon - # so that the two can share G.pt later. - cp data/lang_phone/words.txt $lang_dir - - if [ ! -f $lang_dir/transcript_words.txt ]; then - log "Generate data for BPE training" - files=$( - find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt" - ) - for f in ${files[@]}; do - cat $f | cut -d " " -f 2- - done > $lang_dir/transcript_words.txt - fi - - if [ ! -f $lang_dir/bpe.model ]; then - ./local/train_bpe_model.py \ - --lang-dir $lang_dir \ - --vocab-size $vocab_size \ - --transcript $lang_dir/transcript_words.txt - fi - - if [ ! -f $lang_dir/L_disambig.pt ]; then - ./local/prepare_lang_bpe.py --lang-dir $lang_dir - - log "Validating $lang_dir/lexicon.txt" - ./local/validate_bpe_lexicon.py \ - --lexicon $lang_dir/lexicon.txt \ - --bpe-model $lang_dir/bpe.model - fi - - if [ ! -f $lang_dir/L.fst ]; then - log "Converting L.pt to L.fst" - ./shared/convert-k2-to-openfst.py \ - --olabels aux_labels \ - $lang_dir/L.pt \ - $lang_dir/L.fst - fi - - if [ ! -f $lang_dir/L_disambig.fst ]; then - log "Converting L_disambig.pt to L_disambig.fst" - ./shared/convert-k2-to-openfst.py \ - --olabels aux_labels \ - $lang_dir/L_disambig.pt \ - $lang_dir/L_disambig.fst - fi - done -fi - -if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then - log "Stage 7: Prepare G" - # We assume you have install kaldilm, if not, please install - # it using: pip install kaldilm - - mkdir -p data/lm - if [ ! -f data/lm/G_3_gram.fst.txt ]; then - # It is used in building HLG - python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ - --disambig-symbol='#0' \ - --max-order=3 \ - $dl_dir/lm/3-gram.pruned.1e-7.arpa > data/lm/G_3_gram.fst.txt - fi - - if [ ! -f data/lm/G_4_gram.fst.txt ]; then - # It is used for LM rescoring - python3 -m kaldilm \ - --read-symbol-table="data/lang_phone/words.txt" \ - --disambig-symbol='#0' \ - --max-order=4 \ - $dl_dir/lm/4-gram.arpa > data/lm/G_4_gram.fst.txt - fi -fi - -if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then - log "Stage 8: Compile LG" - ./local/compile_lg.py --lang-dir data/lang_phone - - for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - ./local/compile_lg.py --lang-dir $lang_dir - done -fi - -if [ $stage -le 9 ] && [ $stop_stage -ge 9 ]; then - log "Stage 9: Prepare the other datasets" - # GigaSpeech - if [[ "${multidataset[@]}" =~ "gigaspeech" ]] && [ ! -f data/fbank/.gigaspeech.done ]; then - log "Dataset: GigaSpeech" - cd data/fbank - if [ -f ../../../../gigaspeech/ASR/data/fbank/XL_split/.split_completed ]; then - ln -svf $(realpath ../../../../gigaspeech/ASR/data/fbank/XL_split) . - else - log "Abort! Please run ../../gigaspeech/ASR/prepare.sh --stage 5 --stop-stage 6" - exit 1 - fi - - touch .gigaspeech.done - cd ../.. - fi - - # CommonVoice - if [[ "${multidataset[@]}" =~ "commonvoice" ]] && [ ! -f data/fbank/.commonvoice.done ]; then - log "Dataset: CommonVoice" - cd data/fbank - if [ -f ../../../../commonvoice/ASR/data/en/fbank/.cv-en_train.done ]; then - ln -svf $(realpath ../../../../commonvoice/ASR/data/en/fbank/cv-en_train_split_1000) . - ln -svf $(realpath ../../../../commonvoice/ASR/data/en/fbank/cv-en_cuts_train.jsonl.gz) . - else - log "Abort! Please run ../../commonvoice/ASR/prepare.sh --stage 5 --stop-stage 6" - exit 1 - fi - - touch .commonvoice.done - cd ../.. - fi - - # People's Speech - if [[ "${multidataset[@]}" =~ "peoples_speech" ]] && [ ! -f data/fbank/.peoples_speech.done ]; then - log "Dataset: People's Speech" - cd data/fbank - if [ -f ../../../../peoples_speech/ASR/data/fbank/.peoples_speech_train.done ]; then - ln -svf $(realpath ../../../../peoples_speech/ASR/data/fbank/peoples_speech_train_split) . - else - log "Abort! Please run ../../peoples_speech/prepare.sh --stage 5 --stop-stage 6" - exit 1 - fi - - touch .peoples_speech.done - cd ../.. - fi -fi diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py b/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py deleted file mode 100644 index 798aa27ba..000000000 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/multidataset.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2023 Xiaomi Corp. (authors: Yifan Yang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import glob -import logging -import re -from pathlib import Path - -import lhotse -from lhotse import CutSet, load_manifest_lazy - - -class MultiDataset: - def __init__(self, manifest_dir: str): - """ - Args: - manifest_dir: - It is expected to contain the following files: - - - librispeech_cuts_train-all-shuf.jsonl.gz - - XL_split_2000/cuts_XL.*.jsonl.gz - - cv-en_cuts_train.jsonl.gz - - peoples_speech_train_split/peoples_speech_cuts_dirty.*.jsonl.gz - - peoples_speech_train_split/peoples_speech_cuts_dirty_sa.*.jsonl.gz - - peoples_speech_train_split/peoples_speech_cuts_clean.*.jsonl.gz - - peoples_speech_train_split/peoples_speech_cuts_clean_sa.*.jsonl.gz - """ - self.manifest_dir = Path(manifest_dir) - - def train_cuts(self) -> CutSet: - logging.info("About to get multidataset train cuts") - - # LibriSpeech - logging.info("Loading LibriSpeech in lazy mode") - librispeech_cuts = load_manifest_lazy( - self.manifest_dir / "librispeech_cuts_train-all-shuf.jsonl.gz" - ) - - # GigaSpeech - filenames = glob.glob(f"{self.manifest_dir}/XL_split/cuts_XL.*.jsonl.gz") - - pattern = re.compile(r"cuts_XL.([0-9]+).jsonl.gz") - idx_filenames = ((int(pattern.search(f).group(1)), f) for f in filenames) - idx_filenames = sorted(idx_filenames, key=lambda x: x[0]) - - sorted_filenames = [f[1] for f in idx_filenames] - - logging.info(f"Loading GigaSpeech {len(sorted_filenames)} splits in lazy mode") - - gigaspeech_cuts = lhotse.combine( - lhotse.load_manifest_lazy(p) for p in sorted_filenames - ) - - # CommonVoice - logging.info("Loading CommonVoice in lazy mode") - commonvoice_cuts = load_manifest_lazy( - self.manifest_dir / f"cv-en_cuts_train.jsonl.gz" - ) - - # People's Speech - sorted_filenames = sorted( - glob.glob( - f"{self.manifest_dir}/peoples_speech_train_split/peoples_speech_cuts_*[yna].*.jsonl.gz" - ) - ) - - logging.info( - f"Loading People's Speech {len(sorted_filenames)} splits in lazy mode" - ) - - peoples_speech_cuts = lhotse.combine( - lhotse.load_manifest_lazy(p) for p in sorted_filenames - ) - - return CutSet.mux( - librispeech_cuts, - gigaspeech_cuts, - commonvoice_cuts, - peoples_speech_cuts, - weights=[ - len(librispeech_cuts), - len(gigaspeech_cuts), - len(commonvoice_cuts), - len(peoples_speech_cuts), - ], - ) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py index b7f644bcd..a67e5174e 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless7/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless7/train.py @@ -66,7 +66,6 @@ from lhotse.cut import Cut from lhotse.dataset.sampling.base import CutSampler from lhotse.utils import fix_random_seed from model import Transducer -from multidataset import MultiDataset from optim import Eden, ScaledAdam from torch import Tensor from torch.cuda.amp import GradScaler @@ -376,13 +375,6 @@ def get_parser(): help="Whether to use half precision training.", ) - parser.add_argument( - "--use-multidataset", - type=str2bool, - default=False, - help="Whether to use multidataset to train.", - ) - add_model_arguments(parser) return parser @@ -1042,16 +1034,12 @@ def run(rank, world_size, args): librispeech = LibriSpeechAsrDataModule(args) - if params.use_multidataset: - multidataset = MultiDataset(params.manifest_dir) - train_cuts = multidataset.train_cuts() + if params.mini_libri: + train_cuts = librispeech.train_clean_5_cuts() + elif params.full_libri: + train_cuts = librispeech.train_all_shuf_cuts() else: - if params.mini_libri: - train_cuts = librispeech.train_clean_5_cuts() - elif params.full_libri: - train_cuts = librispeech.train_all_shuf_cuts() - else: - train_cuts = librispeech.train_clean_100_cuts() + train_cuts = librispeech.train_clean_100_cuts() def remove_short_and_long_utt(c: Cut): # Keep only utterances with duration between 1 second and 20 seconds @@ -1107,7 +1095,7 @@ def run(rank, world_size, args): valid_cuts += librispeech.dev_other_cuts() valid_dl = librispeech.valid_dataloaders(valid_cuts) - if not params.use_multidataset and not params.print_diagnostics: + if not params.print_diagnostics: scan_pessimistic_batches_for_oom( model=model, train_dl=train_dl, diff --git a/egs/librispeech/ASR/zipformer/multidataset.py b/egs/librispeech/ASR/zipformer/multidataset.py deleted file mode 120000 index 6a4627887..000000000 --- a/egs/librispeech/ASR/zipformer/multidataset.py +++ /dev/null @@ -1 +0,0 @@ -../pruned_transducer_stateless7/multidataset.py \ No newline at end of file diff --git a/egs/librispeech/ASR/zipformer/train.py b/egs/librispeech/ASR/zipformer/train.py index 3724d9181..9788220c9 100755 --- a/egs/librispeech/ASR/zipformer/train.py +++ b/egs/librispeech/ASR/zipformer/train.py @@ -68,7 +68,6 @@ from lhotse.cut import Cut from lhotse.dataset.sampling.base import CutSampler from lhotse.utils import fix_random_seed from model import Transducer -from multidataset import MultiDataset from optim import Eden, ScaledAdam from scaling import ScheduledFloat from subsampling import Conv2dSubsampling @@ -444,13 +443,6 @@ def get_parser(): help="Whether to use half precision training.", ) - parser.add_argument( - "--use-multidataset", - type=str2bool, - default=False, - help="Whether to use multidataset to train.", - ) - add_model_arguments(parser) return parser @@ -1134,14 +1126,10 @@ def run(rank, world_size, args): librispeech = LibriSpeechAsrDataModule(args) - if params.use_multidataset: - multidataset = MultiDataset(params.manifest_dir) - train_cuts = multidataset.train_cuts() - else: - train_cuts = librispeech.train_clean_100_cuts() - if params.full_libri: - train_cuts += librispeech.train_clean_360_cuts() - train_cuts += librispeech.train_other_500_cuts() + train_cuts = librispeech.train_clean_100_cuts() + if params.full_libri: + train_cuts += librispeech.train_clean_360_cuts() + train_cuts += librispeech.train_other_500_cuts() def remove_short_and_long_utt(c: Cut): # Keep only utterances with duration between 1 second and 20 seconds @@ -1197,7 +1185,7 @@ def run(rank, world_size, args): valid_cuts += librispeech.dev_other_cuts() valid_dl = librispeech.valid_dataloaders(valid_cuts) - if not params.use_multidataset and not params.print_diagnostics: + if not params.print_diagnostics: scan_pessimistic_batches_for_oom( model=model, train_dl=train_dl,