diff --git a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
index faebff2f6..9850cf251 100755
--- a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
+++ b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py
@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -90,7 +90,7 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
diff --git a/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py b/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py
index d66e5cfca..7d468755d 100644
--- a/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py
+++ b/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py
@@ -25,7 +25,7 @@ for usage.
 """
 
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 
 
 def main():
@@ -37,7 +37,7 @@ def main():
 
     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest_lazy(path)
+        cuts = load_manifest(path)
         cuts.describe()
diff --git a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py
index 728f7e3d0..f1b5bec59 100644
--- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py
@@ -27,13 +27,13 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest_lazy,
+    load_manifest,
     set_caching_enabled,
 )
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -109,7 +109,7 @@ class Aidatatang_200zhAsrDataModule:
             "--num-buckets",
             type=int,
             default=300,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -204,7 +204,7 @@ class Aidatatang_200zhAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
 
@@ -289,12 +289,13 @@ class Aidatatang_200zhAsrDataModule:
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -349,7 +350,7 @@ class Aidatatang_200zhAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -379,7 +380,7 @@ class Aidatatang_200zhAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -400,20 +401,20 @@ class Aidatatang_200zhAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
         )
 
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz"
         )
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
         )
diff --git a/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py b/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py
index 034a2a956..8cdfad71f 100755
--- a/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py
+++ b/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py
@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -90,7 +90,7 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
diff --git a/egs/aishell/ASR/local/compute_fbank_aishell.py b/egs/aishell/ASR/local/compute_fbank_aishell.py
index 685a04e3f..e27e35ec5 100755
--- a/egs/aishell/ASR/local/compute_fbank_aishell.py
+++ b/egs/aishell/ASR/local/compute_fbank_aishell.py
@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -86,7 +86,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
diff --git a/egs/aishell/ASR/local/display_manifest_statistics.py b/egs/aishell/ASR/local/display_manifest_statistics.py
index c478f7331..38506436e 100755
--- a/egs/aishell/ASR/local/display_manifest_statistics.py
+++ b/egs/aishell/ASR/local/display_manifest_statistics.py
@@ -25,7 +25,7 @@ for usage.
 """
 
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 
 
 def main():
@@ -36,7 +36,7 @@ def main():
     # path = "./data/fbank/aidatatang_cuts_test.jsonl.gz"
     # path = "./data/fbank/aidatatang_cuts_dev.jsonl.gz"
 
-    cuts = load_manifest_lazy(path)
+    cuts = load_manifest(path)
     cuts.describe()
diff --git a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
index e1021fda2..4018ea597 100644
--- a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -23,11 +23,11 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List
 
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -93,7 +93,7 @@ class AishellAsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -183,7 +183,7 @@ class AishellAsrDataModule:
 
     def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
 
@@ -268,12 +268,13 @@ class AishellAsrDataModule:
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=self.args.drop_last,
             )
         else:
@@ -318,7 +319,7 @@ class AishellAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -342,7 +343,7 @@ class AishellAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -358,7 +359,7 @@ class AishellAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest_lazy(
+        cuts_train = load_manifest(
             self.args.manifest_dir / "aishell_cuts_train.jsonl.gz"
         )
         return cuts_train
@@ -366,13 +367,13 @@ class AishellAsrDataModule:
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "aishell_cuts_dev.jsonl.gz"
         )
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "aishell_cuts_test.jsonl.gz"
         )
diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py b/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py
index 26d4ee111..4d1d0af60 100644
--- a/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py
+++ b/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py
@@ -18,7 +18,7 @@
 import logging
 from pathlib import Path
 
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 
 
 class AIDatatang200zh:
@@ -37,17 +37,17 @@ class AIDatatang200zh:
     def train_cuts(self) -> CutSet:
         f = self.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
         logging.info(f"About to get train cuts from {f}")
-        cuts_train = load_manifest_lazy(f)
+        cuts_train = load_manifest(f)
         return cuts_train
 
     def valid_cuts(self) -> CutSet:
         f = self.manifest_dir / "aidatatang_cuts_valid.jsonl.gz"
         logging.info(f"About to get valid cuts from {f}")
-        cuts_valid = load_manifest_lazy(f)
+        cuts_valid = load_manifest(f)
         return cuts_valid
 
     def test_cuts(self) -> CutSet:
         f = self.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
         logging.info(f"About to get test cuts from {f}")
-        cuts_test = load_manifest_lazy(f)
+        cuts_test = load_manifest(f)
         return cuts_test
diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py b/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py
index ddeca4d88..52fb330f6 100644
--- a/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py
+++ b/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py
@@ -18,7 +18,7 @@
 import logging
 from pathlib import Path
 
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 
 
 class AIShell:
@@ -37,17 +37,17 @@ class AIShell:
     def train_cuts(self) -> CutSet:
         f = self.manifest_dir / "aishell_cuts_train.jsonl.gz"
         logging.info(f"About to get train cuts from {f}")
-        cuts_train = load_manifest_lazy(f)
+        cuts_train = load_manifest(f)
         return cuts_train
 
     def valid_cuts(self) -> CutSet:
         f = self.manifest_dir / "aishell_cuts_dev.jsonl.gz"
         logging.info(f"About to get valid cuts from {f}")
-        cuts_valid = load_manifest_lazy(f)
+        cuts_valid = load_manifest(f)
         return cuts_valid
 
     def test_cuts(self) -> CutSet:
         f = self.manifest_dir / "aishell_cuts_test.jsonl.gz"
         logging.info(f"About to get test cuts from {f}")
-        cuts_test = load_manifest_lazy(f)
+        cuts_test = load_manifest(f)
         return cuts_test
diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py b/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py
index 838e53658..d1b873d2a 100644
--- a/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py
+++ b/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py
@@ -24,8 +24,8 @@ from typing import Optional
 
 from lhotse import CutSet, Fbank, FbankConfig
 from lhotse.dataset import (
+    BucketingSampler,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     SpecAugment,
 )
@@ -72,7 +72,7 @@ class AsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler "
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
 
@@ -226,12 +226,13 @@ class AsrDataModule:
             return_cuts=self.args.return_cuts,
         )
 
-        logging.info("Using DynamicBucketingSampler.")
-        train_sampler = DynamicBucketingSampler(
+        logging.info("Using BucketingSampler.")
+        train_sampler = BucketingSampler(
             cuts_train,
             max_duration=self.args.max_duration,
             shuffle=self.args.shuffle,
             num_buckets=self.args.num_buckets,
+            bucket_method="equal_duration",
             drop_last=True,
         )
 
@@ -262,7 +263,7 @@ class AsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -286,7 +287,7 @@ class AsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,
diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/train.py b/egs/aishell/ASR/transducer_stateless_modified-2/train.py
index a6c17198f..0975f309a 100755
--- a/egs/aishell/ASR/transducer_stateless_modified-2/train.py
+++ b/egs/aishell/ASR/transducer_stateless_modified-2/train.py
@@ -56,7 +56,7 @@ from asr_datamodule import AsrDataModule
 from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut
 from lhotse.utils import fix_random_seed
 from model import Transducer
@@ -735,7 +735,7 @@ def run(rank, world_size, args):
         train_datatang_cuts = train_datatang_cuts.repeat(times=None)
 
     if args.enable_musan:
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
         )
     else:
diff --git a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
index b3fc8adbb..2ff473c60 100755
--- a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
+++ b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -90,7 +90,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
                 # when an executor is specified, make more partitions
                 num_jobs=cur_num_jobs,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
 
             logging.info("About splitting cuts into smaller chunks")
diff --git a/egs/alimeeting/ASR/local/display_manifest_statistics.py b/egs/alimeeting/ASR/local/display_manifest_statistics.py
index 16cdecc91..f6e7a697f 100644
--- a/egs/alimeeting/ASR/local/display_manifest_statistics.py
+++ b/egs/alimeeting/ASR/local/display_manifest_statistics.py
@@ -25,7 +25,7 @@ for usage.
 """
 
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 
 
 def main():
@@ -37,7 +37,7 @@ def main():
 
     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest_lazy(path)
+        cuts = load_manifest(path)
         cuts.describe()
diff --git a/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py
index 339612afe..1b3475c90 100644
--- a/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py
@@ -27,13 +27,13 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest_lazy,
+    load_manifest,
     set_caching_enabled,
 )
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -109,7 +109,7 @@ class AlimeetingAsrDataModule:
             "--num-buckets",
             type=int,
             default=300,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -204,7 +204,7 @@ class AlimeetingAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
 
@@ -289,13 +289,14 @@ class AlimeetingAsrDataModule:
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
                 buffer_size=30000,
+                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -350,7 +351,7 @@ class AlimeetingAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -380,7 +381,7 @@ class AlimeetingAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -401,20 +402,20 @@ class AlimeetingAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "alimeeting_cuts_train.jsonl.gz"
         )
 
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "alimeeting_cuts_eval.jsonl.gz"
         )
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "alimeeting_cuts_test.jsonl.gz"
         )
diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
index 89a57d6c6..642d9fd32 100755
--- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py
+++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py
@@ -28,7 +28,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -91,7 +91,7 @@ def compute_fbank_librispeech():
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(output_dir / cuts_filename)
diff --git a/egs/librispeech/ASR/local/compute_fbank_musan.py b/egs/librispeech/ASR/local/compute_fbank_musan.py
index bbe5ef653..45f1b2f67 100755
--- a/egs/librispeech/ASR/local/compute_fbank_musan.py
+++ b/egs/librispeech/ASR/local/compute_fbank_musan.py
@@ -28,7 +28,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -67,7 +67,7 @@ def compute_fbank_musan():
         len(dataset_parts),
     )
 
-    musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
+    musan_cuts_path = output_dir / f"musan_cuts.{suffix}"
 
     if musan_cuts_path.is_file():
         logging.info(f"{musan_cuts_path} already exists - skipping")
@@ -92,7 +92,7 @@ def compute_fbank_musan():
                 storage_path=f"{output_dir}/musan_feats",
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
         )
         musan_cuts.to_file(musan_cuts_path)
diff --git a/egs/librispeech/ASR/local/display_manifest_statistics.py b/egs/librispeech/ASR/local/display_manifest_statistics.py
index c3c684235..1e8d78a9b 100755
--- a/egs/librispeech/ASR/local/display_manifest_statistics.py
+++ b/egs/librispeech/ASR/local/display_manifest_statistics.py
@@ -25,7 +25,7 @@ for usage.
 """
 
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 
 
 def main():
@@ -37,7 +37,7 @@ def main():
     # path = "./data/fbank/librispeech_cuts_test-clean.jsonl.gz"
     path = "./data/fbank/librispeech_cuts_test-other.jsonl.gz"
 
-    cuts = load_manifest_lazy(path)
+    cuts = load_manifest(path)
     cuts.describe()
diff --git a/egs/librispeech/ASR/local/validate_manifest.py b/egs/librispeech/ASR/local/validate_manifest.py
index 7c57d629a..4ccc1d353 100755
--- a/egs/librispeech/ASR/local/validate_manifest.py
+++ b/egs/librispeech/ASR/local/validate_manifest.py
@@ -33,7 +33,7 @@ import argparse
 import logging
 from pathlib import Path
 
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut
 
 
@@ -76,7 +76,7 @@ def main():
     logging.info(f"Validating {manifest}")
 
     assert manifest.is_file(), f"{manifest} does not exist"
-    cut_set = load_manifest_lazy(manifest)
+    cut_set = load_manifest(manifest)
     assert isinstance(cut_set, CutSet)
 
     for c in cut_set:
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py b/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py
index b54d1aa39..df1e52202 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py
@@ -22,6 +22,7 @@ from typing import Optional
 
 from lhotse import CutSet, Fbank, FbankConfig
 from lhotse.dataset import (
+    BucketingSampler,
     CutMix,
     DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
@@ -70,7 +71,8 @@ class AsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler. "
+            help="The number of buckets for the BucketingSampler "
+            "and DynamicBucketingSampler."
             "(you might want to increase it for larger datasets).",
         )
 
@@ -150,6 +152,7 @@ class AsrDataModule:
     def train_dataloaders(
         self,
         cuts_train: CutSet,
+        dynamic_bucketing: bool,
         on_the_fly_feats: bool,
         cuts_musan: Optional[CutSet] = None,
     ) -> DataLoader:
@@ -159,6 +162,9 @@ class AsrDataModule:
             Cuts for training.
           cuts_musan:
             If not None, it is the cuts for mixing.
+          dynamic_bucketing:
+            True to use DynamicBucketingSampler;
+            False to use BucketingSampler.
           on_the_fly_feats:
             True to use OnTheFlyFeatures;
             False to use PrecomputedFeatures.
@@ -224,14 +230,25 @@ class AsrDataModule:
             return_cuts=self.args.return_cuts,
         )
 
-        logging.info("Using DynamicBucketingSampler.")
-        train_sampler = DynamicBucketingSampler(
-            cuts_train,
-            max_duration=self.args.max_duration,
-            shuffle=self.args.shuffle,
-            num_buckets=self.args.num_buckets,
-            drop_last=True,
-        )
+        if dynamic_bucketing:
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                shuffle=self.args.shuffle,
+                num_buckets=self.args.num_buckets,
+                drop_last=True,
+            )
+        else:
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                shuffle=self.args.shuffle,
+                num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
+                drop_last=True,
+            )
 
         logging.info("About to create train dataloader")
         train_dl = DataLoader(
@@ -260,12 +277,10 @@ class AsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
-            num_buckets=self.args.num_buckets,
-            drop_last=False,
         )
         logging.info("About to create dev dataloader")
         valid_dl = DataLoader(
@@ -286,12 +301,8 @@ class AsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
-            cuts,
-            max_duration=self.args.max_duration,
-            shuffle=False,
-            num_buckets=self.args.num_buckets,
-            drop_last=True,
+        sampler = BucketingSampler(
+            cuts, max_duration=self.args.max_duration, shuffle=False
         )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py b/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py
index 6dba8e9fe..bf0871949 100644
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py
@@ -18,7 +18,7 @@
 import logging
 from pathlib import Path
 
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 
 
 class LibriSpeech:
@@ -41,34 +41,34 @@ class LibriSpeech:
     def train_clean_100_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
         logging.info(f"About to get train-clean-100 cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def train_clean_360_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
         logging.info(f"About to get train-clean-360 cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def train_other_500_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
         logging.info(f"About to get train-other-500 cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def test_clean_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
         logging.info(f"About to get test-clean cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def test_other_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
         logging.info(f"About to get test-other cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def dev_clean_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
         logging.info(f"About to get dev-clean cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def dev_other_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
         logging.info(f"About to get dev-other cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py
index 37cebd577..1d5cbe7bf 100755
--- a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py
+++ b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py
@@ -66,7 +66,7 @@ from conformer import Conformer
 from decoder import Decoder
 from gigaspeech import GigaSpeech
 from joiner import Joiner
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut
 from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
@@ -968,7 +968,7 @@ def run(rank, world_size, args):
         train_giga_cuts = train_giga_cuts.repeat(times=None)
 
     if args.enable_musan:
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
         )
     else:
@@ -978,12 +978,14 @@ def run(rank, world_size, args):
 
     train_dl = asr_datamodule.train_dataloaders(
         train_cuts,
+        dynamic_bucketing=False,
         on_the_fly_feats=False,
        cuts_musan=cuts_musan,
     )
 
     giga_train_dl = asr_datamodule.train_dataloaders(
         train_giga_cuts,
+        dynamic_bucketing=True,
         on_the_fly_feats=False,
         cuts_musan=cuts_musan,
     )
diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
index 5cca06169..ff524978c 100644
--- a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -24,11 +24,11 @@ from pathlib import Path
 from typing import Any, Dict, Optional
 
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -113,7 +113,7 @@ class LibriSpeechAsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -224,7 +224,7 @@ class LibriSpeechAsrDataModule:
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
             logging.info("About to get Musan cuts")
-            cuts_musan = load_manifest_lazy(
+            cuts_musan = load_manifest(
                 self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
@@ -306,12 +306,13 @@ class LibriSpeechAsrDataModule:
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=self.args.drop_last,
             )
         else:
@@ -366,7 +367,7 @@ class LibriSpeechAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -376,7 +377,7 @@ class LibriSpeechAsrDataModule:
             validate,
             sampler=valid_sampler,
             batch_size=None,
-            num_workers=2,
+            num_workers=self.args.num_workers,
             persistent_workers=False,
         )
 
@@ -390,7 +391,7 @@ class LibriSpeechAsrDataModule:
             else eval(self.args.input_strategy)(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -407,48 +408,48 @@ class LibriSpeechAsrDataModule:
     @lru_cache()
     def train_clean_100_cuts(self) -> CutSet:
         logging.info("About to get train-clean-100 cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
         )
 
     @lru_cache()
     def train_clean_360_cuts(self) -> CutSet:
         logging.info("About to get train-clean-360 cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
         )
 
     @lru_cache()
     def train_other_500_cuts(self) -> CutSet:
         logging.info("About to get train-other-500 cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
         )
 
     @lru_cache()
     def dev_clean_cuts(self) -> CutSet:
         logging.info("About to get dev-clean cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
         )
 
     @lru_cache()
     def dev_other_cuts(self) -> CutSet:
         logging.info("About to get dev-other cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
         )
 
     @lru_cache()
     def test_clean_cuts(self) -> CutSet:
         logging.info("About to get test-clean cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
         )
 
     @lru_cache()
     def test_other_cuts(self) -> CutSet:
         logging.info("About to get test-other cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
         )
diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py
index 3b51ff9bc..ef51a7811 100755
--- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py
+++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py
@@ -28,7 +28,7 @@ from pathlib import Path
 
 from asr_datamodule import AsrDataModule
 from gigaspeech import GigaSpeech
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 from librispeech import LibriSpeech
 
 
@@ -41,7 +41,7 @@ def test_dataset():
     print(args)
 
     if args.enable_musan:
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
         )
     else:
diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py
index 217fdb39a..f91144cee 100755
--- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py
+++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py
@@ -73,7 +73,7 @@ from conformer import Conformer
 from decoder import Decoder
 from gigaspeech import GigaSpeech
 from joiner import Joiner
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut
 from lhotse.utils import fix_random_seed
 from librispeech import LibriSpeech
@@ -775,7 +775,7 @@ def run(rank, world_size, args):
         train_giga_cuts = train_giga_cuts.repeat(times=None)
 
     if args.enable_musan:
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
         )
     else:
@@ -785,13 +785,15 @@ def run(rank, world_size, args):
 
     train_dl = asr_datamodule.train_dataloaders(
         train_cuts,
+        dynamic_bucketing=False,
         on_the_fly_feats=False,
         cuts_musan=cuts_musan,
     )
 
     giga_train_dl = asr_datamodule.train_dataloaders(
         train_giga_cuts,
-        on_the_fly_feats=True,
+        dynamic_bucketing=True,
+        on_the_fly_feats=False,
         cuts_musan=cuts_musan,
     )
diff --git a/egs/spgispeech/ASR/local/compute_fbank_musan.py b/egs/spgispeech/ASR/local/compute_fbank_musan.py
index b56f81906..b88286c41 100755
--- a/egs/spgispeech/ASR/local/compute_fbank_musan.py
+++ b/egs/spgispeech/ASR/local/compute_fbank_musan.py
@@ -27,7 +27,7 @@ import logging
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, combine
+from lhotse import CutSet, LilcomChunkyWriter, combine
 from lhotse.features.kaldifeat import (
     KaldifeatFbank,
     KaldifeatFbankConfig,
@@ -91,7 +91,7 @@ def compute_fbank_musan():
             storage_path=output_dir / "feats_musan",
             batch_duration=500,
             num_workers=4,
-            storage_type=ChunkedLilcomHdf5Writer,
+            storage_type=LilcomChunkyWriter,
         )
     )
diff --git a/egs/spgispeech/ASR/local/compute_fbank_spgispeech.py b/egs/spgispeech/ASR/local/compute_fbank_spgispeech.py
index bda537b4f..b67754e2a 100755
--- a/egs/spgispeech/ASR/local/compute_fbank_spgispeech.py
+++ b/egs/spgispeech/ASR/local/compute_fbank_spgispeech.py
@@ -27,7 +27,7 @@ import logging
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, load_manifest_lazy
+from lhotse import LilcomChunkyWriter, load_manifest_lazy
 from lhotse.features.kaldifeat import (
     KaldifeatFbank,
     KaldifeatFbankConfig,
@@ -118,7 +118,7 @@ def compute_fbank_spgispeech(args):
                 storage_path=output_dir / f"feats_train_{idx}",
                 batch_duration=500,
                 num_workers=4,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cs.to_file(cuts_train_idx_path)
 
@@ -137,7 +137,7 @@ def compute_fbank_spgispeech(args):
                 manifest_path=src_dir / f"cuts_{partition}.jsonl.gz",
                 batch_duration=500,
                 num_workers=4,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
diff --git a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py
index 78351d77c..e324b5025 100755
--- a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py
+++ b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py
@@ -27,7 +27,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -89,7 +89,7 @@ def compute_fbank_tedlium():
                 # when an executor is specified, make more partitions
                 num_jobs=cur_num_jobs,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             # Split long cuts into many short and un-overlapping cuts
             cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
diff --git a/egs/tedlium3/ASR/local/display_manifest_statistics.py b/egs/tedlium3/ASR/local/display_manifest_statistics.py
index 52e152389..ff438e918 100755
--- a/egs/tedlium3/ASR/local/display_manifest_statistics.py
+++ b/egs/tedlium3/ASR/local/display_manifest_statistics.py
@@ -27,7 +27,7 @@ for usage.
 """
 
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 
 
 def main():
@@ -35,7 +35,7 @@ def main():
     path = "./data/fbank/tedlium_cuts_dev.jsonl.gz"
     path = "./data/fbank/tedlium_cuts_test.jsonl.gz"
 
-    cuts = load_manifest_lazy(path)
+    cuts = load_manifest(path)
     cuts.describe()
diff --git a/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py b/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py
index ae22bfd92..52a322f79 100644
--- a/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py
+++ b/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py
@@ -22,11 +22,11 @@ import logging
 from functools import lru_cache
 from pathlib import Path
 
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -92,7 +92,7 @@ class TedLiumAsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -179,7 +179,7 @@ class TedLiumAsrDataModule:
         transforms = []
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
-            cuts_musan = load_manifest_lazy(
+            cuts_musan = load_manifest(
                 self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
@@ -261,12 +261,13 @@ class TedLiumAsrDataModule:
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -310,7 +311,7 @@ class TedLiumAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -334,7 +335,7 @@ class TedLiumAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -351,20 +352,20 @@ class TedLiumAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "tedlium_cuts_train.jsonl.gz"
         )
 
     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "tedlium_cuts_dev.jsonl.gz"
         )
 
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "tedlium_cuts_test.jsonl.gz"
         )
diff --git a/egs/timit/ASR/local/compute_fbank_timit.py b/egs/timit/ASR/local/compute_fbank_timit.py
index 5704c72b6..094769c8c 100644
--- a/egs/timit/ASR/local/compute_fbank_timit.py
+++ b/egs/timit/ASR/local/compute_fbank_timit.py
@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -88,7 +88,7 @@ def compute_fbank_timit():
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(cuts_file)
diff --git a/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py
index 665b5a771..4ef95c97f 100644
--- a/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -23,11 +23,11 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List, Union
 
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -92,7 +92,7 @@ class TimitAsrDataModule(DataModule):
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -154,7 +154,7 @@ class TimitAsrDataModule(DataModule):
         cuts_train = self.train_cuts()
 
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             self.args.feature_dir / "cuts_musan.jsonl.gz"
         )
 
@@ -220,12 +220,13 @@ class TimitAsrDataModule(DataModule):
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -273,7 +274,7 @@ class TimitAsrDataModule(DataModule):
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = SingleCutSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -306,7 +307,7 @@ class TimitAsrDataModule(DataModule):
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = SingleCutSampler(
+        sampler = BucketingSampler(
             cuts_test, max_duration=self.args.max_duration
         )
         logging.debug("About to create test dataloader")
@@ -323,7 +324,7 @@
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest_lazy(
+        cuts_train = load_manifest(
             self.args.feature_dir / "timit_cuts_TRAIN.jsonl.gz"
         )
 
@@ -332,7 +333,7 @@
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest_lazy(
+        cuts_valid = load_manifest(
             self.args.feature_dir / "timit_cuts_DEV.jsonl.gz"
         )
 
@@ -341,7 +342,7 @@
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.debug("About to get test cuts")
-        cuts_test = load_manifest_lazy(
+        cuts_test = load_manifest(
             self.args.feature_dir / "timit_cuts_TEST.jsonl.gz"
         )
diff --git a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_dev_test.py b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_dev_test.py
index 11aaef5c5..9e817e43e 100755
--- a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_dev_test.py
+++ b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_dev_test.py
@@ -24,7 +24,7 @@ from lhotse import (
     CutSet,
     KaldifeatFbank,
     KaldifeatFbankConfig,
-    LilcomHdf5Writer,
+    LilcomChunkyWriter,
 )
 
 # Torch's multithreaded behavior needs to be disabled or
@@ -70,7 +70,7 @@ def compute_fbank_wenetspeech_dev_test():
             storage_path=f"{in_out_dir}/feats_{partition}",
             num_workers=num_workers,
             batch_duration=batch_duration,
-            storage_type=LilcomHdf5Writer,
+            storage_type=LilcomChunkyWriter,
         )
         cut_set = cut_set.trim_to_supervisions(
             keep_overlapping=False, min_duration=None
diff --git a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py
index a828bead9..4622bdb55 100755
--- a/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py
+++ b/egs/wenetspeech/ASR/local/compute_fbank_wenetspeech_splits.py
@@ -23,10 +23,10 @@ from pathlib import Path
 
 import torch
 from lhotse import (
-    ChunkedLilcomHdf5Writer,
     CutSet,
     KaldifeatFbank,
     KaldifeatFbankConfig,
+    LilcomChunkyWriter,
     set_audio_duration_mismatch_tolerance,
     set_caching_enabled,
 )
@@ -135,7 +135,7 @@ def compute_fbank_wenetspeech_splits(args):
             storage_path=f"{output_dir}/feats_{subset}_{idx}",
             num_workers=args.num_workers,
             batch_duration=args.batch_duration,
-            storage_type=ChunkedLilcomHdf5Writer,
+            storage_type=LilcomChunkyWriter,
         )
 
         logging.info("About to split cuts into smaller chunks.")
diff --git a/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
index 6aebc2164..7b1d169d2 100644
--- a/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
@@ -27,10 +27,11 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest_lazy,
+    load_manifest,
     set_caching_enabled,
 )
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
     DynamicBucketingSampler,
@@ -109,8 +110,9 @@ class WenetSpeechAsrDataModule:
             "--num-buckets",
             type=int,
             default=300,
-            help="The number of buckets for the DynamicBucketingSampler"
-            "(you might want to increase it for larger datasets).",
+            help="""The number of buckets for the BucketingSampler and
+            DynamicBucketingSampler
+            (you might want to increase it for larger datasets).""",
         )
         group.add_argument(
             "--concatenate-cuts",
@@ -218,7 +220,7 @@ class WenetSpeechAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
 
@@ -364,23 +366,16 @@ class WenetSpeechAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
-            rank=0,
-            world_size=1,
             shuffle=False,
         )
         logging.info("About to create dev dataloader")
-        from lhotse.dataset.iterable_dataset import IterableDatasetWrapper
-
-        dev_iter_dataset = IterableDatasetWrapper(
-            dataset=validate,
-            sampler=valid_sampler,
-        )
         valid_dl = DataLoader(
-            dev_iter_dataset,
+            validate,
+            sampler=valid_sampler,
             batch_size=None,
             num_workers=self.args.num_workers,
             persistent_workers=False,
@@ -396,22 +391,16 @@ class WenetSpeechAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
-            rank=0,
-            world_size=1,
             shuffle=False,
         )
-        from lhotse.dataset.iterable_dataset import IterableDatasetWrapper
-
-        test_iter_dataset = IterableDatasetWrapper(
-            dataset=test,
-            sampler=sampler,
-        )
         test_dl = DataLoader(
-            test_iter_dataset,
+            test,
             batch_size=None,
+            sampler=sampler,
             num_workers=self.args.num_workers,
         )
         return test_dl
@@ -435,18 +424,16 @@ class WenetSpeechAsrDataModule:
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        return load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
 
     @lru_cache()
     def test_net_cuts(self) -> List[CutSet]:
         logging.info("About to get TEST_NET cuts")
-        return load_manifest_lazy(
-            self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz"
-        )
+        return load_manifest(self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz")
 
     @lru_cache()
     def test_meeting_cuts(self) -> List[CutSet]:
         logging.info("About to get TEST_MEETING cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz"
         )
diff --git a/egs/yesno/ASR/local/compute_fbank_yesno.py b/egs/yesno/ASR/local/compute_fbank_yesno.py
index 0a00a33ef..fb48b6f8e 100755
--- a/egs/yesno/ASR/local/compute_fbank_yesno.py
+++ b/egs/yesno/ASR/local/compute_fbank_yesno.py
@@ -12,7 +12,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -74,7 +74,7 @@ def compute_fbank_yesno():
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 1,  # use one job
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(cuts_file)
diff --git a/egs/yesno/ASR/tdnn/asr_datamodule.py b/egs/yesno/ASR/tdnn/asr_datamodule.py
index 85e5f1358..cec2e7b00 100644
--- a/egs/yesno/ASR/tdnn/asr_datamodule.py
+++ b/egs/yesno/ASR/tdnn/asr_datamodule.py
@@ -20,10 +20,10 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List
 
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -85,7 +85,7 @@ class YesNoAsrDataModule(DataModule):
             "--num-buckets",
             type=int,
             default=10,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -187,12 +187,13 @@ class YesNoAsrDataModule(DataModule):
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -225,7 +226,7 @@ class YesNoAsrDataModule(DataModule):
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts_test,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -242,7 +243,7 @@ class YesNoAsrDataModule(DataModule):
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest_lazy(
+        cuts_train = load_manifest(
             self.args.feature_dir / "yesno_cuts_train.jsonl.gz"
         )
         return cuts_train
@@ -250,7 +251,7 @@ class YesNoAsrDataModule(DataModule):
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        cuts_test = load_manifest_lazy(
+        cuts_test = load_manifest(
             self.args.feature_dir / "yesno_cuts_test.jsonl.gz"
         )
         return cuts_test
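
For reference, the manifest-loading and storage-writer changes above all follow one pattern: `load_manifest` replaces `load_manifest_lazy` for eager loading, and `LilcomChunkyWriter` replaces `ChunkedLilcomHdf5Writer` (or `LilcomHdf5Writer`) as the feature storage backend. A minimal sketch of the resulting lhotse usage; the paths and job count here are illustrative placeholders, not values from any particular recipe:

```python
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, load_manifest

# Eager loading: the whole manifest is read into memory up front,
# unlike the lazy iterator returned by load_manifest_lazy.
cuts: CutSet = load_manifest("data/fbank/example_cuts_train.jsonl.gz")

# Precompute fbank features, stored via the chunky lilcom writer.
cuts = cuts.compute_and_store_features(
    extractor=Fbank(FbankConfig(num_mel_bins=80)),
    storage_path="data/fbank/example_feats_train",  # placeholder path
    num_jobs=15,
    storage_type=LilcomChunkyWriter,
)
cuts.to_file("data/fbank/example_cuts_train.jsonl.gz")
```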
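Likewise, the sampler swap is uniform across the recipes: `BucketingSampler` takes the place of `DynamicBucketingSampler`, with `bucket_method="equal_duration"` added for training. A sketch of the resulting dataloader construction, with placeholder argument values:

```python
from lhotse import load_manifest
from lhotse.dataset import BucketingSampler, K2SpeechRecognitionDataset
from torch.utils.data import DataLoader

cuts_train = load_manifest("data/fbank/example_cuts_train.jsonl.gz")

train_sampler = BucketingSampler(
    cuts_train,
    max_duration=200.0,  # total seconds of audio per batch
    shuffle=True,
    num_buckets=30,
    bucket_method="equal_duration",  # each bucket holds an equal total duration
    drop_last=True,
)
train_dl = DataLoader(
    K2SpeechRecognitionDataset(return_cuts=True),
    sampler=train_sampler,
    batch_size=None,  # batches are formed by the sampler, not the DataLoader
    num_workers=2,
)
```

The same `sampler=...` + `batch_size=None` pattern is what lets the wenetspeech recipe drop its `IterableDatasetWrapper` workaround in the dev/test dataloaders above.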