diff --git a/egs/mls_english/ASR/local/utils/asr_datamodule.py b/egs/mls_english/ASR/local/utils/asr_datamodule.py index de928e36b..6a64198d9 100644 --- a/egs/mls_english/ASR/local/utils/asr_datamodule.py +++ b/egs/mls_english/ASR/local/utils/asr_datamodule.py @@ -21,366 +21,230 @@ import inspect import logging from functools import lru_cache from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union -from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy +from lhotse import CutSet, Fbank, FbankConfig from lhotse.dataset import ( CutConcatenate, - CutMix, DynamicBucketingSampler, K2SpeechRecognitionDataset, - PrecomputedFeatures, SimpleCutSampler, SpecAugment, ) from lhotse.dataset.input_strategies import OnTheFlyFeatures -from lhotse.utils import is_module_available from torch.utils.data import DataLoader from icefall.utils import str2bool - class MLSEnglishHFAsrDataModule: """ - DataModule for k2 ASR experiments. - It assumes there is always one train and valid dataloader, - but there can be multiple test dataloaders (e.g. LibriSpeech test-clean - and test-other). - It contains all the common data pipeline modules used in ASR - experiments, e.g.: - - dynamic batch size, - - bucketing samplers, - - cut concatenation, - - augmentation, - - on-the-fly feature extraction - This class should be derived for specific corpora used in ASR tasks. + DataModule for MLS English ASR experiments using HuggingFace dataset. + Handles dataset loading and provides train/valid/test dataloaders with + on-the-fly feature extraction. """ def __init__(self, args: argparse.Namespace): self.args = args + self.dataset = None + # self._validate_args() + + # def _validate_args(self) -> None: + # """Validate configuration arguments.""" + # if self.args.on_the_fly_feats is False: + # raise ValueError("This recipe requires on-the-fly feature extraction") @classmethod - def add_arguments(cls, parser: argparse.ArgumentParser): + def add_arguments(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser: group = parser.add_argument_group( title="ASR data related options", - description="These options are used for the preparation of " - "PyTorch DataLoaders from Lhotse CutSet's -- they control the " - "effective batch sizes, sampling strategies, applied data " - "augmentations, etc.", + description="Options for data loading and processing", ) + + # Dataset configuration group.add_argument( - "--manifest-dir", - type=Path, - default=Path("data/manifests"), - help="Path to directory with train/dev/test cuts.", + "--dataset-path", + type=str, + default="parler-tts/mls_eng", + help="Path to HuggingFace MLS English dataset (name or local path)", ) + + # Sampling and batching group.add_argument( "--max-duration", - type=int, + type=float, default=200.0, - help="Maximum pooled recordings duration (seconds) in a " - "single batch. You can reduce it if it causes CUDA OOM.", + help="Maximum batch duration in seconds", ) group.add_argument( "--bucketing-sampler", type=str2bool, default=True, - help="When enabled, the batches will come from buckets of " - "similar duration (saves padding frames).", + help="Whether to use bucketing sampler", ) group.add_argument( "--num-buckets", type=int, default=30, - help="The number of buckets for the DynamicBucketingSampler" - "(you might want to increase it for larger datasets).", + help="Number of buckets for DynamicBucketingSampler", ) + + # Data augmentation group.add_argument( - "--concatenate-cuts", - type=str2bool, - default=False, - help="When enabled, utterances (cuts) will be concatenated " - "to minimize the amount of padding.", - ) - group.add_argument( - "--duration-factor", - type=float, - default=1.0, - help="Determines the maximum duration of a concatenated cut " - "relative to the duration of the longest cut in a batch.", - ) - group.add_argument( - "--gap", - type=float, - default=1.0, - help="The amount of padding (in seconds) inserted between " - "concatenated cuts. This padding is filled with noise when " - "noise augmentation is used.", - ) - group.add_argument( - "--on-the-fly-feats", - type=str2bool, - default=True, # must be true without lhotse feature prep - help="When enabled, use on-the-fly cut mixing and feature " - "extraction. Will drop existing precomputed feature manifests " - "if available.", - ) - group.add_argument( - "--shuffle", + "--enable-spec-aug", type=str2bool, default=True, - help="When enabled (=default), the examples will be " - "shuffled for each epoch.", + help="Whether to enable SpecAugment", ) group.add_argument( - "--drop-last", - type=str2bool, - default=True, - help="Whether to drop last batch. Used by sampler.", + "--spec-aug-time-warp-factor", + type=int, + default=80, + help="Time warp factor for SpecAugment", + ) + + # Dataloader configuration + group.add_argument( + "--num-workers", + type=int, + default=2, + help="Number of workers for data loading", ) group.add_argument( "--return-cuts", type=str2bool, default=False, - help="When enabled, each batch will have the " - "field: batch['supervisions']['cut'] with the cuts that " - "were used to construct it.", + help="Whether to return cuts in batch", ) group.add_argument( - "--num-workers", - type=int, - default=2, - help="The number of training dataloader workers that " - "collect the batches.", - ) - - group.add_argument( - "--enable-spec-aug", + "--drop-last", type=str2bool, default=True, - help="When enabled, use SpecAugment for training dataset.", + help="Whether to drop last incomplete batch", ) + - group.add_argument( - "--spec-aug-time-warp-factor", - type=int, - default=80, - help="Used only when --enable-spec-aug is True. " - "It specifies the factor for time warping in SpecAugment. " - "Larger values mean more warping. " - "A value less than 1 means to disable time warp.", - ) + return parser - group.add_argument( - "--enable-musan", - type=str2bool, - default=False, - help="When enabled, select noise from MUSAN and mix it" - "with training dataset. ", - ) + def load_dataset(self, dataset_path: Optional[str] = None) -> None: + """Load the HuggingFace dataset.""" + dataset_path = dataset_path or self.args.dataset_path + logging.info(f"Loading MLS English dataset from: {dataset_path}") - def load_hf_dataset( - self, mls_eng_hf_dataset_path: str = "parler-tts/mls_eng", - ): - """ - Method to load HF dataset with datasets.load_dataset - and save it in this DataModule. - - Intended usage inside a training script: - ``` - mls_english_corpus = MLSEnglishHFAsrDataModule(args) - mls_english_corpus.load_hf_dataset("parler-tts/mls_eng") - train_cuts = mls_english_corpus.train_cuts() - train_dataloader = mls_english_corpus.train_dataloaders( - train_cuts, sampler_state_dict=sampler_state_dict - ) - ... - for epoch in range(...): - train_one_epoch( - ..., - train_dl=train_dl, - ..., - ) - ``` - """ - if not is_module_available("datasets"): + try: + from datasets import load_dataset + self.dataset = load_dataset(dataset_path) + logging.info("Dataset loaded successfully") + except ImportError: raise ImportError( - "To process the MLS English HF corpus, please install optional dependency: pip install datasets" + "Please install datasets package: pip install datasets" ) - - from datasets import load_dataset - - self.dataset = load_dataset(mls_eng_hf_dataset_path) #, split="test") - - def train_dataloaders( - self, cuts_train: CutSet, sampler_state_dict: Optional[Dict[str, Any]] = None - ) -> DataLoader: - """ - Args: - cuts_train: - CutSet for training. - sampler_state_dict: - The state dict for the training sampler. - """ + except Exception as e: + raise RuntimeError(f"Failed to load dataset: {e}") + def _create_dataset(self, cuts: CutSet, is_train: bool = False) -> K2SpeechRecognitionDataset: + """Create appropriate dataset with transforms.""" transforms = [] input_transforms = [] - if self.args.enable_spec_aug: - logging.info("Enable SpecAugment") - logging.info(f"Time warp factor: {self.args.spec_aug_time_warp_factor}") - # Set the value of num_frame_masks according to Lhotse's version. - # In different Lhotse's versions, the default of num_frame_masks is - # different. - num_frame_masks = 10 - num_frame_masks_parameter = inspect.signature( - SpecAugment.__init__ - ).parameters["num_frame_masks"] - if num_frame_masks_parameter.default == 1: - num_frame_masks = 2 - logging.info(f"Num frame mask: {num_frame_masks}") - input_transforms.append( - SpecAugment( - time_warp_factor=self.args.spec_aug_time_warp_factor, - num_frame_masks=num_frame_masks, - features_mask_size=27, - num_feature_masks=2, - frames_mask_size=100, - ) - ) - else: - logging.info("Disable SpecAugment") + if is_train and self.args.enable_spec_aug: + input_transforms.append(self._create_spec_augment()) - logging.info("About to create train dataset") - train = K2SpeechRecognitionDataset( + return K2SpeechRecognitionDataset( cut_transforms=transforms, + input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))), input_transforms=input_transforms, return_cuts=self.args.return_cuts, ) - if self.args.on_the_fly_feats: - # NOTE: the PerturbSpeed transform should be added only if we - # remove it from data prep stage. - # Add on-the-fly speed perturbation; since originally it would - # have increased epoch size by 3, we will apply prob 2/3 and use - # 3x more epochs. - # Speed perturbation probably should come first before - # concatenation, but in principle the transforms order doesn't have - # to be strict (e.g. could be randomized) - # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa - # Drop feats to be on the safe side. - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))), - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) + def _create_spec_augment(self) -> SpecAugment: + """Create SpecAugment transform based on config.""" + num_frame_masks = 10 + num_frame_masks_parameter = inspect.signature( + SpecAugment.__init__ + ).parameters["num_frame_masks"] + if num_frame_masks_parameter.default == 1: + num_frame_masks = 2 + return SpecAugment( + time_warp_factor=self.args.spec_aug_time_warp_factor, + num_frame_masks=num_frame_masks, + features_mask_size=27, + num_feature_masks=2, + frames_mask_size=100, + ) + + def _create_sampler(self, cuts: CutSet, shuffle: bool) -> Union[DynamicBucketingSampler, SimpleCutSampler]: + """Create appropriate sampler based on config.""" if self.args.bucketing_sampler: - logging.info("Using DynamicBucketingSampler.") - train_sampler = DynamicBucketingSampler( - cuts_train, + return DynamicBucketingSampler( + cuts, max_duration=self.args.max_duration, - shuffle=self.args.shuffle, + shuffle=shuffle, num_buckets=self.args.num_buckets, drop_last=self.args.drop_last, ) - else: - logging.info("Using SimpleCutSampler.") - train_sampler = SimpleCutSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - ) - logging.info("About to create train dataloader") + return SimpleCutSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=shuffle, + ) - if sampler_state_dict is not None: - logging.info("Loading sampler state dict") - train_sampler.load_state_dict(sampler_state_dict) + def train_dataloader(self, sampler_state_dict: Optional[Dict[str, Any]] = None) -> DataLoader: + """Create train dataloader.""" + cuts = self.train_cuts() + dataset = self._create_dataset(cuts, is_train=True) + sampler = self._create_sampler(cuts, shuffle=True) - train_dl = DataLoader( - train, - sampler=train_sampler, + if sampler_state_dict: + sampler.load_state_dict(sampler_state_dict) + + return DataLoader( + dataset, + sampler=sampler, batch_size=None, num_workers=self.args.num_workers, persistent_workers=False, ) - return train_dl - - def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: - transforms = [] - if self.args.concatenate_cuts: - transforms = [ - CutConcatenate( - duration_factor=self.args.duration_factor, gap=self.args.gap - ) - ] + transforms - - logging.info("About to create dev dataset") - if self.args.on_the_fly_feats: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))), - return_cuts=self.args.return_cuts, - ) - else: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - return_cuts=self.args.return_cuts, - ) - valid_sampler = DynamicBucketingSampler( - cuts_valid, - max_duration=self.args.max_duration, - shuffle=False, - ) - logging.info("About to create dev dataloader") - valid_dl = DataLoader( - validate, - sampler=valid_sampler, + def valid_dataloader(self) -> DataLoader: + """Create validation dataloader.""" + cuts = self.valid_cuts() + return DataLoader( + self._create_dataset(cuts), + sampler=self._create_sampler(cuts, shuffle=False), batch_size=None, num_workers=2, persistent_workers=False, ) - return valid_dl - - def test_dataloaders(self, cuts: CutSet) -> DataLoader: - logging.info("About to create test dataset") - test = K2SpeechRecognitionDataset( - input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) - if self.args.on_the_fly_feats - else PrecomputedFeatures(), - return_cuts=self.args.return_cuts, - ) - sampler = DynamicBucketingSampler( - cuts, - max_duration=self.args.max_duration, - shuffle=False, - ) - test_dl = DataLoader( - test, + def test_dataloader(self) -> DataLoader: + """Create test dataloader.""" + cuts = self.test_cuts() + return DataLoader( + self._create_dataset(cuts), + sampler=self._create_sampler(cuts, shuffle=False), batch_size=None, - sampler=sampler, num_workers=self.args.num_workers, ) - return test_dl @lru_cache() def train_cuts(self) -> CutSet: - logging.info("About to get train cuts") - cutset = CutSet.from_huggingface_dataset(self.dataset["train"], text_key="transcript") - return cutset + return CutSet.from_huggingface_dataset( + self.dataset["train"], + text_key="transcript" + ) @lru_cache() def valid_cuts(self) -> CutSet: - logging.info("About to get dev cuts") - cutset = CutSet.from_huggingface_dataset(self.dataset["dev"], text_key="transcript") - return cutset + return CutSet.from_huggingface_dataset( + self.dataset["dev"], + text_key="transcript" + ) @lru_cache() - def test_cuts(self) -> List[CutSet]: - logging.info("About to get test cuts") - cutset = CutSet.from_huggingface_dataset(self.dataset["test"], text_key="transcript") - return cutset + def test_cuts(self) -> CutSet: + return CutSet.from_huggingface_dataset( + self.dataset["test"], + text_key="transcript" + ) \ No newline at end of file diff --git a/egs/mls_english/ASR/local/utils/generate_transcript.py b/egs/mls_english/ASR/local/utils/generate_transcript.py index 67c64726e..f48093bcb 100644 --- a/egs/mls_english/ASR/local/utils/generate_transcript.py +++ b/egs/mls_english/ASR/local/utils/generate_transcript.py @@ -19,59 +19,71 @@ import argparse import logging from pathlib import Path +from typing import Optional from lhotse import CutSet -from asr_datamodule import MLSEnglishHFAsrDataModule - from tqdm import tqdm def get_args(): parser = argparse.ArgumentParser( + description="Generate transcripts for BPE training from MLS English dataset", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - # parser.add_argument( - # "train_cut", metavar="train-cut", type=Path, help="Path to the train cut" - # ) - + parser.add_argument( + "--dataset-path", + type=str, + default="parler-tts/mls_eng", + help="Path to HuggingFace MLS English dataset (name or local path)", + ) + parser.add_argument( "--lang-dir", type=Path, default=Path("data/lang"), - help=( - "Name of lang dir. " - "If not set, this will default to data/lang" - ), + help="Directory to store output transcripts", + ) + + parser.add_argument( + "--split", + type=str, + default="train", + help="Dataset split to use for generating transcripts (train/dev/test)", ) return parser.parse_args() +def generate_transcript_from_cuts(cuts: CutSet, output_file: Path) -> None: + """Generate transcript text file from Lhotse CutSet.""" + with open(output_file, "w") as f: + for cut in tqdm(cuts, desc="Processing cuts"): + for sup in cut.supervisions: + f.write(f"{sup.text}\n") def main(): args = get_args() logging.basicConfig( - format=("%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"), + format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s", level=logging.INFO, ) + args.lang_dir.mkdir(parents=True, exist_ok=True) + output_file = args.lang_dir / "transcript.txt" - mls_english_corpus = MLSEnglishHFAsrDataModule(args) - mls_english_corpus.load_hf_dataset("/root/datasets/parler-tts--mls_eng") - - train_cuts = mls_english_corpus.train_cuts() - - logging.info(f"Creating transcript from MLS English train cut.") - - def generate_text(train_cuts): - for cut in tqdm(train_cuts): - for sup in cut.supervisions: - yield sup.text + "\n" - - with open(args.lang_dir / "transcript.txt", "w") as file: - file.writelines(generate_text(train_cuts)) - - logging.info("Done.") + logging.info(f"Loading {args.split} split from dataset: {args.dataset_path}") + try: + cuts = CutSet.from_huggingface_dataset( + args.dataset_path, + split=args.split, + text_key="transcript" + ) + except Exception as e: + logging.error(f"Failed to load dataset: {e}") + raise + logging.info(f"Generating transcript to {output_file}") + generate_transcript_from_cuts(cuts, output_file) + logging.info("Transcript generation completed") if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/egs/mls_english/ASR/prepare.sh b/egs/mls_english/ASR/prepare.sh index 0484832a3..27aaa5e37 100644 --- a/egs/mls_english/ASR/prepare.sh +++ b/egs/mls_english/ASR/prepare.sh @@ -1,5 +1,9 @@ #!/usr/bin/env bash +# Prepare script for MLS English ASR recipe in icefall +# This recipe uses on-the-fly feature extraction, so it skips manifest +# and feature generation steps used in other recipes. + # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python @@ -9,118 +13,50 @@ nj=15 stage=-1 stop_stage=100 -# vocab_sizes=(500 1000 2000) -vocab_sizes=(2000) - - -# We assume dl_dir (download dir) contains the following -# directories and files. If not, they will be downloaded -# by this script automatically. -# -# - $dl_dir/ReazonSpeech -# You can find FLAC files in this directory. -# You can download them from https://huggingface.co/datasets/reazon-research/reazonspeech -# -# - $dl_dir/dataset.json -# The metadata of the ReazonSpeech dataset. +# Configuration for BPE tokenizer +vocab_sizes=(2000) # You can add more sizes like (500 1000 2000) for comparison +# Directory where dataset will be downloaded dl_dir=$PWD/download . shared/parse_options.sh || exit 1 # All files generated by this script are saved in "data". -# You can safely remove "data" and rerun this script to regenerate it. mkdir -p data log() { - # This function is from espnet local fname=${BASH_SOURCE[1]##*/} echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" } -log "Running prepare.sh" - -log "dl_dir: $dl_dir" +log "Starting MLS English data preparation" if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then - log "Stage 0: Download data" - - # If you have pre-downloaded it to /path/to/mls_eng, - # you can create a symlink - # - # ln -sfv /path/to/mls_eng $dl_dir/mls_eng - # + log "Stage 0: Download MLS English dataset" if [ ! -d $dl_dir/mls_english ]; then - git clone https://huggingface.co/datasets/parler-tts/mls_eng $dl_dir/mls_eng + if ! git clone https://huggingface.co/datasets/parler-tts/mls_eng $dl_dir/mls_english; then + log "Failed to download MLS English dataset" + exit 1 + fi fi fi -## Not necessary to create manifest or pre-compute fbank for on-the-fly feature computation ## - -# if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then -# log "Stage 1: Prepare MLS English manifest" -# # We assume that you have downloaded the ReazonSpeech corpus -# # to $dl_dir/ReazonSpeech -# mkdir -p data/manifests -# if [ ! -e data/manifests/.reazonspeech.done ]; then -# lhotse prepare reazonspeech -j $nj $dl_dir/ReazonSpeech data/manifests -# touch data/manifests/.reazonspeech.done -# fi -# fi - -# if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then -# log "Stage 2: Compute ReazonSpeech fbank" -# if [ ! -e data/manifests/.reazonspeech-validated.done ]; then -# python local/compute_fbank_reazonspeech.py --manifest-dir data/manifests -# python local/validate_manifest.py --manifest data/manifests/reazonspeech_cuts_train.jsonl.gz -# python local/validate_manifest.py --manifest data/manifests/reazonspeech_cuts_dev.jsonl.gz -# python local/validate_manifest.py --manifest data/manifests/reazonspeech_cuts_test.jsonl.gz -# touch data/manifests/.reazonspeech-validated.done -# fi -# fi - -############################################################################################### - -# if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then -# log "Stage 3: Prepare ReazonSpeech lang_char" -# python local/prepare_lang_char.py data/manifests/reazonspeech_cuts_train.jsonl.gz -# fi - -# if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then -# log "Stage 4: Show manifest statistics" -# python local/display_manifest_statistics.py --manifest-dir data/manifests > data/manifests/manifest_statistics.txt -# cat data/manifests/manifest_statistics.txt -# fi - mkdir -p data/lang - lang_dir=data/lang -log "lang_dir: $lang_dir" - if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - log "Stage 1: Prepare BPE based lang" - + log "Stage 1: Prepare BPE tokenizer" + if [ ! -f $lang_dir/transcript.txt ]; then - log "Generate transcript for BPE training" - + log "Generating transcripts for BPE training" ./local/utils/generate_transcript.py --lang-dir $lang_dir - # files=$( - # find "$dl_dir/LibriSpeech/train-clean-100" -name "*.trans.txt" - # find "$dl_dir/LibriSpeech/train-clean-360" -name "*.trans.txt" - # find "$dl_dir/LibriSpeech/train-other-500" -name "*.trans.txt" - # ) - # for f in ${files[@]}; do - # cat $f | cut -d " " -f 2- - # done > $lang_dir/transcript_words.txt fi for vocab_size in ${vocab_sizes[@]}; do - log "Train BPE model with vocab_size: $vocab_size" + log "Training BPE model with vocab_size=${vocab_size}" bpe_dir=data/lang/bpe_${vocab_size} mkdir -p $bpe_dir - if [ ! -f $bpe_dir/bpe.model ]; then ./local/train_bpe_model.py \ --lang-dir $bpe_dir \ @@ -128,4 +64,6 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then --transcript $lang_dir/transcript.txt fi done -fi \ No newline at end of file +fi + +log "MLS English data preparation completed successfully" \ No newline at end of file diff --git a/egs/mls_english/ASR/zipformer/asr_datamodule.py b/egs/mls_english/ASR/zipformer/asr_datamodule.py index d3d1bc74e..a48591198 120000 --- a/egs/mls_english/ASR/zipformer/asr_datamodule.py +++ b/egs/mls_english/ASR/zipformer/asr_datamodule.py @@ -1 +1 @@ -local/utils/asr_datamodule.py \ No newline at end of file +../local/utils/asr_datamodule.py \ No newline at end of file diff --git a/egs/mls_english/ASR/zipformer/decode.py b/egs/mls_english/ASR/zipformer/decode.py index 1a446804b..088252c59 100755 --- a/egs/mls_english/ASR/zipformer/decode.py +++ b/egs/mls_english/ASR/zipformer/decode.py @@ -1043,13 +1043,13 @@ def main(): # we need cut ids to display recognition results. args.return_cuts = True mls_english_corpus = MLSEnglishHFAsrDataModule(args) - mls_english_corpus.load_hf_dataset("/root/datasets/parler-tts--mls_eng") + mls_english_corpus.load_dataset(args.dataset_path) - # dev_cuts = mls_english_corpus.dev_cuts() - test_cuts = mls_english_corpus.test_cuts() + # # dev_cuts = mls_english_corpus.dev_cuts() + # test_cuts = mls_english_corpus.test_cuts() - # dev_dl = mls_english_corpus.test_dataloaders(dev_cuts) - test_dl = mls_english_corpus.test_dataloaders(test_cuts) + # dev_dl = mls_english_corpus.test_dataloader() + test_dl = mls_english_corpus.test_dataloader() test_sets = ["test"] test_dls = [test_dl] diff --git a/egs/mls_english/ASR/zipformer/train.py b/egs/mls_english/ASR/zipformer/train.py index 72abc25fe..8c61df8f2 100755 --- a/egs/mls_english/ASR/zipformer/train.py +++ b/egs/mls_english/ASR/zipformer/train.py @@ -1215,9 +1215,9 @@ def run(rank, world_size, args): return True mls_english_corpus = MLSEnglishHFAsrDataModule(args) - mls_english_corpus.load_hf_dataset("/root/datasets/parler-tts--mls_eng") + mls_english_corpus.load_dataset(args.dataset_path) - train_cuts = mls_english_corpus.train_cuts() + # train_cuts = mls_english_corpus.train_cuts() # train_cuts = train_cuts.filter(remove_short_and_long_utt) @@ -1228,12 +1228,17 @@ def run(rank, world_size, args): else: sampler_state_dict = None - train_dl = mls_english_corpus.train_dataloaders( - train_cuts, sampler_state_dict=sampler_state_dict + # train_dl = mls_english_corpus.train_dataloaders( + # train_cuts, sampler_state_dict=sampler_state_dict + # ) + train_dl = mls_english_corpus.train_dataloader( + sampler_state_dict=sampler_state_dict ) - valid_cuts = mls_english_corpus.valid_cuts() - valid_dl = mls_english_corpus.valid_dataloaders(valid_cuts) + # valid_cuts = mls_english_corpus.valid_cuts() + # valid_dl = mls_english_corpus.valid_dataloader(valid_cuts) + valid_dl = mls_english_corpus.valid_dataloader() + if not params.print_diagnostics: scan_pessimistic_batches_for_oom(