From 3eeadd0f3a46fed870f05d009994e2bc8968d691 Mon Sep 17 00:00:00 2001 From: Kinan Martin Date: Fri, 11 Apr 2025 10:29:27 +0900 Subject: [PATCH] update prepare.sh, fix asr_datamodule.py --- .../ASR/local/utils/asr_datamodule.py | 4 +- egs/mls_english/ASR/prepare.sh | 123 +++--- .../ASR/zipformer/asr_datamodule.py | 387 +----------------- 3 files changed, 73 insertions(+), 441 deletions(-) mode change 100644 => 120000 egs/mls_english/ASR/zipformer/asr_datamodule.py diff --git a/egs/mls_english/ASR/local/utils/asr_datamodule.py b/egs/mls_english/ASR/local/utils/asr_datamodule.py index 2308a408d..de928e36b 100644 --- a/egs/mls_english/ASR/local/utils/asr_datamodule.py +++ b/egs/mls_english/ASR/local/utils/asr_datamodule.py @@ -120,7 +120,7 @@ class MLSEnglishHFAsrDataModule: group.add_argument( "--on-the-fly-feats", type=str2bool, - default=True, # Must be True for Lazy HF dataset (?) + default=True, # must be true without lhotse feature prep help="When enabled, use on-the-fly cut mixing and feature " "extraction. Will drop existing precomputed feature manifests " "if available.", @@ -190,7 +190,7 @@ class MLSEnglishHFAsrDataModule: Intended usage inside a training script: ``` mls_english_corpus = MLSEnglishHFAsrDataModule(args) - mls_english_corpus.load_hf_dataset("fr") + mls_english_corpus.load_hf_dataset("parler-tts/mls_eng") train_cuts = mls_english_corpus.train_cuts() train_dataloader = mls_english_corpus.train_dataloaders( train_cuts, sampler_state_dict=sampler_state_dict diff --git a/egs/mls_english/ASR/prepare.sh b/egs/mls_english/ASR/prepare.sh index 8724d3ca0..0484832a3 100644 --- a/egs/mls_english/ASR/prepare.sh +++ b/egs/mls_english/ASR/prepare.sh @@ -9,6 +9,10 @@ nj=15 stage=-1 stop_stage=100 +# vocab_sizes=(500 1000 2000) +vocab_sizes=(2000) + + # We assume dl_dir (download dir) contains the following # directories and files. If not, they will be downloaded # by this script automatically. 
@@ -41,74 +45,87 @@ log "dl_dir: $dl_dir" if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then log "Stage 0: Download data" - # If you have pre-downloaded it to /path/to/ReazonSpeech, + # If you have pre-downloaded it to /path/to/mls_eng, # you can create a symlink # - # ln -sfv /path/to/ReazonSpeech $dl_dir/ReazonSpeech + # ln -sfv /path/to/mls_eng $dl_dir/mls_eng # - if [ ! -d $dl_dir/ReazonSpeech/downloads ]; then - # Download small-v1 by default. - lhotse download reazonspeech --subset small-v1 $dl_dir + if [ ! -d $dl_dir/mls_eng ]; then + git clone https://huggingface.co/datasets/parler-tts/mls_eng $dl_dir/mls_eng fi fi +## Not necessary to create manifest or pre-compute fbank for on-the-fly feature computation ## + +# if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then +# log "Stage 1: Prepare MLS English manifest" +# # We assume that you have downloaded the ReazonSpeech corpus +# # to $dl_dir/ReazonSpeech +# mkdir -p data/manifests +# if [ ! -e data/manifests/.reazonspeech.done ]; then +# lhotse prepare reazonspeech -j $nj $dl_dir/ReazonSpeech data/manifests +# touch data/manifests/.reazonspeech.done +# fi +# fi + +# if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then +# log "Stage 2: Compute ReazonSpeech fbank" +# if [ ! 
-e data/manifests/.reazonspeech-validated.done ]; then +# python local/compute_fbank_reazonspeech.py --manifest-dir data/manifests +# python local/validate_manifest.py --manifest data/manifests/reazonspeech_cuts_train.jsonl.gz +# python local/validate_manifest.py --manifest data/manifests/reazonspeech_cuts_dev.jsonl.gz +# python local/validate_manifest.py --manifest data/manifests/reazonspeech_cuts_test.jsonl.gz +# touch data/manifests/.reazonspeech-validated.done +# fi +# fi + +############################################################################################### + +# if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then +# log "Stage 3: Prepare ReazonSpeech lang_char" +# python local/prepare_lang_char.py data/manifests/reazonspeech_cuts_train.jsonl.gz +# fi + +# if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then +# log "Stage 4: Show manifest statistics" +# python local/display_manifest_statistics.py --manifest-dir data/manifests > data/manifests/manifest_statistics.txt +# cat data/manifests/manifest_statistics.txt +# fi + +mkdir -p data/lang + +lang_dir=data/lang + +log "lang_dir: $lang_dir" + if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - log "Stage 1: Prepare ReazonSpeech manifest" - # We assume that you have downloaded the ReazonSpeech corpus - # to $dl_dir/ReazonSpeech - mkdir -p data/manifests - if [ ! -e data/manifests/.reazonspeech.done ]; then - lhotse prepare reazonspeech -j $nj $dl_dir/ReazonSpeech data/manifests - touch data/manifests/.reazonspeech.done - fi -fi + log "Stage 1: Prepare BPE based lang" -if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 2: Compute ReazonSpeech fbank" - if [ ! 
-e data/manifests/.reazonspeech-validated.done ]; then - python local/compute_fbank_reazonspeech.py --manifest-dir data/manifests - python local/validate_manifest.py --manifest data/manifests/reazonspeech_cuts_train.jsonl.gz - python local/validate_manifest.py --manifest data/manifests/reazonspeech_cuts_dev.jsonl.gz - python local/validate_manifest.py --manifest data/manifests/reazonspeech_cuts_test.jsonl.gz - touch data/manifests/.reazonspeech-validated.done - fi -fi + if [ ! -f $lang_dir/transcript.txt ]; then + log "Generate transcript for BPE training" -if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 3: Prepare ReazonSpeech lang_char" - python local/prepare_lang_char.py data/manifests/reazonspeech_cuts_train.jsonl.gz -fi - -if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Show manifest statistics" - python local/display_manifest_statistics.py --manifest-dir data/manifests > data/manifests/manifest_statistics.txt - cat data/manifests/manifest_statistics.txt -fi - -if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Prepare BPE based lang" + ./local/utils/generate_transcript.py --lang-dir $lang_dir + # files=$( + # find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt" + # find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt" + # find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt" + # ) + # for f in ${files[@]}; do + # cat $f | cut -d " " -f 2- + # done > $lang_dir/transcript_words.txt + fi for vocab_size in ${vocab_sizes[@]}; do - lang_dir=data/lang_bpe_${vocab_size} - mkdir -p $lang_dir + log "Train BPE model with vocab_size: $vocab_size" + bpe_dir=data/lang/bpe_${vocab_size} + mkdir -p $bpe_dir - if [ ! 
-f $lang_dir/transcript_words.txt ]; then - log "Generate data for BPE training" - files=$( - find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt" - find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt" - ) - for f in ${files[@]}; do - cat $f | cut -d " " -f 2- - done > $lang_dir/transcript_words.txt - fi - if [ ! -f $lang_dir/bpe.model ]; then + if [ ! -f $bpe_dir/bpe.model ]; then ./local/train_bpe_model.py \ - --lang-dir $lang_dir \ + --lang-dir $bpe_dir \ --vocab-size $vocab_size \ - --transcript $lang_dir/transcript_words.txt + --transcript $lang_dir/transcript.txt fi done fi \ No newline at end of file diff --git a/egs/mls_english/ASR/zipformer/asr_datamodule.py b/egs/mls_english/ASR/zipformer/asr_datamodule.py deleted file mode 100644 index b783e802e..000000000 --- a/egs/mls_english/ASR/zipformer/asr_datamodule.py +++ /dev/null @@ -1,386 +0,0 @@ -# Copyright 2021 Piotr Żelasko -# Copyright 2022 Xiaomi Corporation (Author: Mingshuang Luo) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import argparse -import inspect -import logging -from functools import lru_cache -from pathlib import Path -from typing import Any, Dict, List, Optional - -from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy -from lhotse.dataset import ( - CutConcatenate, - CutMix, - DynamicBucketingSampler, - K2SpeechRecognitionDataset, - PrecomputedFeatures, - SimpleCutSampler, - SpecAugment, -) -from lhotse.dataset.input_strategies import OnTheFlyFeatures -from lhotse.utils import is_module_available -from torch.utils.data import DataLoader - -from icefall.utils import str2bool - - -class MLSEnglishHFAsrDataModule: - """ - DataModule for k2 ASR experiments. - It assumes there is always one train and valid dataloader, - but there can be multiple test dataloaders (e.g. LibriSpeech test-clean - and test-other). - It contains all the common data pipeline modules used in ASR - experiments, e.g.: - - dynamic batch size, - - bucketing samplers, - - cut concatenation, - - augmentation, - - on-the-fly feature extraction - This class should be derived for specific corpora used in ASR tasks. - """ - - def __init__(self, args: argparse.Namespace): - self.args = args - - @classmethod - def add_arguments(cls, parser: argparse.ArgumentParser): - group = parser.add_argument_group( - title="ASR data related options", - description="These options are used for the preparation of " - "PyTorch DataLoaders from Lhotse CutSet's -- they control the " - "effective batch sizes, sampling strategies, applied data " - "augmentations, etc.", - ) - group.add_argument( - "--manifest-dir", - type=Path, - default=Path("data/manifests"), - help="Path to directory with train/dev/test cuts.", - ) - group.add_argument( - "--max-duration", - type=int, - default=200.0, - help="Maximum pooled recordings duration (seconds) in a " - "single batch. 
You can reduce it if it causes CUDA OOM.", - ) - group.add_argument( - "--bucketing-sampler", - type=str2bool, - default=True, - help="When enabled, the batches will come from buckets of " - "similar duration (saves padding frames).", - ) - group.add_argument( - "--num-buckets", - type=int, - default=30, - help="The number of buckets for the DynamicBucketingSampler" - "(you might want to increase it for larger datasets).", - ) - group.add_argument( - "--concatenate-cuts", - type=str2bool, - default=False, - help="When enabled, utterances (cuts) will be concatenated " - "to minimize the amount of padding.", - ) - group.add_argument( - "--duration-factor", - type=float, - default=1.0, - help="Determines the maximum duration of a concatenated cut " - "relative to the duration of the longest cut in a batch.", - ) - group.add_argument( - "--gap", - type=float, - default=1.0, - help="The amount of padding (in seconds) inserted between " - "concatenated cuts. This padding is filled with noise when " - "noise augmentation is used.", - ) - group.add_argument( - "--on-the-fly-feats", - type=str2bool, - default=True, - help="When enabled, use on-the-fly cut mixing and feature " - "extraction. Will drop existing precomputed feature manifests " - "if available.", - ) - group.add_argument( - "--shuffle", - type=str2bool, - default=True, - help="When enabled (=default), the examples will be " - "shuffled for each epoch.", - ) - group.add_argument( - "--drop-last", - type=str2bool, - default=True, - help="Whether to drop last batch. 
Used by sampler.", - ) - group.add_argument( - "--return-cuts", - type=str2bool, - default=False, - help="When enabled, each batch will have the " - "field: batch['supervisions']['cut'] with the cuts that " - "were used to construct it.", - ) - - group.add_argument( - "--num-workers", - type=int, - default=2, - help="The number of training dataloader workers that " - "collect the batches.", - ) - - group.add_argument( - "--enable-spec-aug", - type=str2bool, - default=True, - help="When enabled, use SpecAugment for training dataset.", - ) - - group.add_argument( - "--spec-aug-time-warp-factor", - type=int, - default=80, - help="Used only when --enable-spec-aug is True. " - "It specifies the factor for time warping in SpecAugment. " - "Larger values mean more warping. " - "A value less than 1 means to disable time warp.", - ) - - group.add_argument( - "--enable-musan", - type=str2bool, - default=False, - help="When enabled, select noise from MUSAN and mix it" - "with training dataset. ", - ) - - def load_hf_dataset( - self, mls_eng_hf_dataset_path: str = "parler-tts/mls_eng", - ): - """ - Method to load HF dataset with datasets.load_dataset - and save it in this DataModule. - - Intended usage inside a training script: - ``` - mls_english_corpus = MLSEnglishHFAsrDataModule(args) - mls_english_corpus.load_hf_dataset("parler-tts/mls_eng") - train_cuts = mls_english_corpus.train_cuts() - train_dataloader = mls_english_corpus.train_dataloaders( - train_cuts, sampler_state_dict=sampler_state_dict - ) - ... 
- for epoch in range(...): - train_one_epoch( - ..., - train_dl=train_dl, - ..., - ) - ``` - """ - if not is_module_available("datasets"): - raise ImportError( - "To process the MLS English HF corpus, please install optional dependency: pip install datasets" - ) - - from datasets import load_dataset - - self.dataset = load_dataset(mls_eng_hf_dataset_path) #, split="test") - - def train_dataloaders( - self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None - ) -> DataLoader: - """ - Args: - cuts_train: - CutSet for training. - sampler_state_dict: - The state dict for the training sampler. - """ - - transforms = [] - input_transforms = [] - - if self.args.enable_spec_aug: - logging.info("Enable SpecAugment") - logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}") - # Set the value of num_frame_masks according to Lhotse's version. - # In different Lhotse's versions, the default of num_frame_masks is - # different. - num_frame_masks = 10 - num_frame_masks_parameter = inspect.signature( - SpecAugment.__init__ - ).parameters["num_frame_masks"] - if num_frame_masks_parameter.default == 1: - num_frame_masks = 2 - logging.info(f"Num frame mask: {num_frame_masks}") - input_transforms.append( - SpecAugment( - time_warp_factor=self.args.spec_aug_time_warp_factor, - num_frame_masks=num_frame_masks, - features_mask_size=27, - num_feature_masks=2, - frames_mask_size=100, - ) - ) - else: - logging.info("Disable SpecAugment") - - logging.info("About to create train dataset") - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - if self.args.on_the_fly_feats: - # NOTE: the PerturbSpeed transform should be added only if we - # remove it from data prep stage. - # Add on-the-fly speed perturbation; since originally it would - # have increased epoch size by 3, we will apply prob 2/3 and use - # 3x more epochs. 
- # Speed perturbation probably should come first before - # concatenation, but in principle the transforms order doesn't have - # to be strict (e.g. could be randomized) - # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa - # Drop feats to be on the safe side. - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))), - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - if self.args.bucketing_sampler: - logging.info("Using DynamicBucketingSampler.") - train_sampler = DynamicBucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - drop_last=self.args.drop_last, - ) - else: - logging.info("Using SimpleCutSampler.") - train_sampler = SimpleCutSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - ) - logging.info("About to create train dataloader") - - if sampler_state_dict is not None: - logging.info("Loading sampler state dict") - train_sampler.load_state_dict(sampler_state_dict) - - train_dl = DataLoader( - train, - sampler=train_sampler, - batch_size=None, - num_workers=self.args.num_workers, - persistent_workers=False, - ) - - return train_dl - - def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: - transforms = [] - if self.args.concatenate_cuts: - transforms = [ - CutConcatenate( - duration_factor=self.args.duration_factor, gap=self.args.gap - ) - ] + transforms - - logging.info("About to create dev dataset") - if self.args.on_the_fly_feats: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))), - return_cuts=self.args.return_cuts, - ) - else: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - return_cuts=self.args.return_cuts, - ) - valid_sampler = DynamicBucketingSampler( - cuts_valid, - 
max_duration=self.args.max_duration, - shuffle=False, - ) - logging.info("About to create dev dataloader") - valid_dl = DataLoader( - validate, - sampler=valid_sampler, - batch_size=None, - num_workers=2, - persistent_workers=False, - ) - - return valid_dl - - def test_dataloaders(self, cuts: CutSet) -> DataLoader: - logging.info("About to create test dataset") - test = K2SpeechRecognitionDataset( - input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) - if self.args.on_the_fly_feats - else PrecomputedFeatures(), - return_cuts=self.args.return_cuts, - ) - sampler = DynamicBucketingSampler( - cuts, - max_duration=self.args.max_duration, - shuffle=False, - ) - test_dl = DataLoader( - test, - batch_size=None, - sampler=sampler, - num_workers=self.args.num_workers, - ) - return test_dl - - @lru_cache() - def train_cuts(self) -> CutSet: - logging.info("About to get train cuts") - cutset = CutSet.from_huggingface_dataset(self.dataset["train"], text_key="transcript") - return cutset - - @lru_cache() - def valid_cuts(self) -> CutSet: - logging.info("About to get dev cuts") - cutset = CutSet.from_huggingface_dataset(self.dataset["dev"], text_key="transcript") - return cutset - - @lru_cache() - def test_cuts(self) -> List[CutSet]: - logging.info("About to get test cuts") - cutset = CutSet.from_huggingface_dataset(self.dataset["test"], text_key="transcript") - return cutset diff --git a/egs/mls_english/ASR/zipformer/asr_datamodule.py b/egs/mls_english/ASR/zipformer/asr_datamodule.py new file mode 120000 index 000000000..d3d1bc74e --- /dev/null +++ b/egs/mls_english/ASR/zipformer/asr_datamodule.py @@ -0,0 +1 @@ +local/utils/asr_datamodule.py \ No newline at end of file