diff --git a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
index ae24127bf..b3fc8adbb 100755
--- a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
+++ b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py
@@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)

 def compute_fbank_alimeeting(num_mel_bins: int = 80):
-    src_dir = Path("data/manifests/alimeeting")
+    src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())

@@ -52,11 +52,14 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
         "eval",
         "test",
     )
+
+    prefix = "alimeeting"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
         dataset_parts=dataset_parts,
         output_dir=src_dir,
-        prefix="alimeeting",
-        suffix="jsonl.gz",
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None

@@ -64,7 +67,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -83,7 +86,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=cur_num_jobs,
                 executor=ex,
@@ -95,7 +98,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
                 keep_overlapping=False,
                 min_duration=None,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")

 def get_args():
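Note on the layout: after this change every recipe writes its cut manifests as <prefix>_cuts_<partition>.jsonl.gz under data/fbank, with the feature archives sharing the same prefix. A minimal sketch of checking for the files the updated alimeeting script is expected to produce (the loop and print are illustrative, not part of the recipe):

    from pathlib import Path

    # Expected outputs of compute_fbank_alimeeting.py: one cuts manifest
    # per partition, all under the shared "alimeeting" prefix.
    fbank_dir = Path("data/fbank")
    for partition in ("train", "eval", "test"):
        manifest = fbank_dir / f"alimeeting_cuts_{partition}.jsonl.gz"
        print(manifest, "exists" if manifest.is_file() else "missing")
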
""" -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): paths = [ - "./data/fbank/cuts_train.json.gz", - "./data/fbank/cuts_eval.json.gz", - "./data/fbank/cuts_test.json.gz", + "./data/fbank/alimeeting_cuts_train.jsonl.gz", + "./data/fbank/alimeeting_cuts_eval.jsonl.gz", + "./data/fbank/alimeeting_cuts_test.jsonl.gz", ] for path in paths: print(f"Starting display the statistics for {path}") - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() @@ -45,7 +45,7 @@ if __name__ == "__main__": main() """ -Starting display the statistics for ./data/fbank/cuts_train.json.gz +Starting display the statistics for ./data/fbank/alimeeting_cuts_train.jsonl.gz Cuts count: 559092 Total duration (hours): 424.6 Speech duration (hours): 424.6 (100.0%) @@ -61,7 +61,7 @@ min 0.0 99.5% 14.7 99.9% 16.2 max 284.3 -Starting display the statistics for ./data/fbank/cuts_eval.json.gz +Starting display the statistics for ./data/fbank/alimeeting_cuts_eval.jsonl.gz Cuts count: 6457 Total duration (hours): 4.9 Speech duration (hours): 4.9 (100.0%) @@ -77,7 +77,7 @@ min 0.1 99.5% 14.1 99.9% 14.7 max 15.8 -Starting display the statistics for ./data/fbank/cuts_test.json.gz +Starting display the statistics for ./data/fbank/alimeeting_cuts_test.jsonl.gz Cuts count: 16358 Total duration (hours): 12.5 Speech duration (hours): 12.5 (100.0%) diff --git a/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py index bd41a7a1e..339612afe 100644 --- a/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -27,7 +27,7 @@ from lhotse import ( CutSet, Fbank, FbankConfig, - load_manifest, + load_manifest_lazy, set_caching_enabled, ) from lhotse.dataset import ( @@ -204,8 +204,8 @@ class AlimeetingAsrDataModule: The state dict for the training sampler. 
""" logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms = [] @@ -401,14 +401,20 @@ class AlimeetingAsrDataModule: @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - return load_manifest(self.args.manifest_dir / "cuts_train.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "alimeeting_cuts_train.jsonl.gz" + ) @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") - return load_manifest(self.args.manifest_dir / "cuts_eval.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "alimeeting_cuts_eval.jsonl.gz" + ) @lru_cache() def test_cuts(self) -> List[CutSet]: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_test.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "alimeeting_cuts_test.jsonl.gz" + ) diff --git a/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py b/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py index 42b45e492..ec3f75901 100644 --- a/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py +++ b/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py @@ -20,7 +20,7 @@ import logging from functools import lru_cache from pathlib import Path -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( CutConcatenate, CutMix, @@ -189,7 +189,7 @@ class GigaSpeechAsrDataModule: def train_dataloaders(self, cuts_train: CutSet) -> DataLoader: logging.info("About to get Musan cuts") - cuts_musan = load_manifest( + cuts_musan = load_manifest_lazy( self.args.manifest_dir / "musan_cuts.jsonl.gz" ) @@ -362,7 +362,9 @@ class GigaSpeechAsrDataModule: @lru_cache() def dev_cuts(self) -> CutSet: logging.info("About to get dev cuts") - cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz") + cuts_valid = load_manifest_lazy( + self.args.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz" + ) if self.args.small_dev: return cuts_valid.subset(first=1000) else: @@ -371,4 +373,6 @@ class GigaSpeechAsrDataModule: @lru_cache() def test_cuts(self) -> CutSet: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz") + return load_manifest_lazy( + self.args.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz" + ) diff --git a/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py index ee1cdcbcf..19fe7c6a7 100644 --- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -23,7 +23,7 @@ from pathlib import Path from typing import Any, Dict, Optional import torch -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( CutConcatenate, CutMix, @@ -216,7 +216,7 @@ class GigaSpeechAsrDataModule: if self.args.enable_musan: logging.info("Enable MUSAN") logging.info("About to get Musan cuts") - cuts_musan = load_manifest( + cuts_musan = load_manifest_lazy( self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms.append( @@ -405,7 +405,9 @@ class GigaSpeechAsrDataModule: @lru_cache() def dev_cuts(self) -> CutSet: logging.info("About to get dev cuts") - cuts_valid = 
diff --git a/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py b/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py
index 42b45e492..ec3f75901 100644
--- a/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py
+++ b/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py
@@ -20,7 +20,7 @@ import logging
 from functools import lru_cache
 from pathlib import Path

-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
     CutConcatenate,
     CutMix,
@@ -189,7 +189,7 @@ class GigaSpeechAsrDataModule:
     def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
+        cuts_musan = load_manifest_lazy(
             self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )

@@ -362,7 +362,9 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        cuts_valid = load_manifest_lazy(
+            self.args.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz"
+        )
         if self.args.small_dev:
             return cuts_valid.subset(first=1000)
         else:
@@ -371,4 +373,6 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz"
+        )
diff --git a/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
index ee1cdcbcf..19fe7c6a7 100644
--- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
@@ -23,7 +23,7 @@ from pathlib import Path
 from typing import Any, Dict, Optional

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
     CutConcatenate,
     CutMix,
@@ -216,7 +216,7 @@ class GigaSpeechAsrDataModule:
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
             logging.info("About to get Musan cuts")
-            cuts_musan = load_manifest(
+            cuts_musan = load_manifest_lazy(
                 self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
@@ -405,7 +405,9 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        cuts_valid = load_manifest_lazy(
+            self.args.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz"
+        )
         if self.args.small_dev:
             return cuts_valid.subset(first=1000)
         else:
@@ -414,4 +416,4 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
+        return load_manifest_lazy(self.args.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz")
diff --git a/egs/librispeech/ASR/conformer_ctc/ali.py b/egs/librispeech/ASR/conformer_ctc/ali.py
index 42fa2308e..2828e309e 100755
--- a/egs/librispeech/ASR/conformer_ctc/ali.py
+++ b/egs/librispeech/ASR/conformer_ctc/ali.py
@@ -96,14 +96,14 @@ def get_parser():
             - labels_xxx.h5
             - aux_labels_xxx.h5
-            - cuts_xxx.json.gz
+            - librispeech_cuts_xxx.jsonl.gz

         where xxx is the value of `--dataset`. For instance, if
         `--dataset` is `train-clean-100`, it will contain 3 files:

             - `labels_train-clean-100.h5`
             - `aux_labels_train-clean-100.h5`
-            - `cuts_train-clean-100.json.gz`
+            - `librispeech_cuts_train-clean-100.jsonl.gz`

         Note: Both labels_xxx.h5 and aux_labels_xxx.h5 contain framewise
         alignment. The difference is that labels_xxx.h5 contains repeats.
@@ -289,7 +289,9 @@ def main():
     out_labels_ali_filename = out_dir / f"labels_{params.dataset}.h5"
     out_aux_labels_ali_filename = out_dir / f"aux_labels_{params.dataset}.h5"
-    out_manifest_filename = out_dir / f"cuts_{params.dataset}.json.gz"
+    out_manifest_filename = (
+        out_dir / f"librispeech_cuts_{params.dataset}.jsonl.gz"
+    )

     for f in (
         out_labels_ali_filename,
diff --git a/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
index f165f6e60..a674d5527 100644
--- a/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
@@ -22,7 +22,7 @@ from pathlib import Path
 from typing import Any, Dict, Optional

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
     CutConcatenate,
     CutMix,
@@ -176,7 +176,7 @@ class SPGISpeechAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
+        cuts_musan = load_manifest_lazy(
             self.args.manifest_dir / "cuts_musan.jsonl.gz"
         )
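The --small-dev branch above trims the dev set with CutSet.subset. A small sketch of that call, using an illustrative local path:

    from lhotse import load_manifest_lazy

    cuts_valid = load_manifest_lazy("data/fbank/gigaspeech_cuts_DEV.jsonl.gz")

    # Keep only the first 1000 cuts for a faster validation pass.
    small_dev = cuts_valid.subset(first=1000)
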
diff --git a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py
index 14200f34f..78351d77c 100755
--- a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py
+++ b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py
@@ -52,8 +52,13 @@ def compute_fbank_tedlium():
         "test",
     )

+    prefix = "tedlium"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="tedlium", dataset_parts=dataset_parts, output_dir=src_dir
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None

@@ -61,7 +66,7 @@ def compute_fbank_tedlium():
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -80,7 +85,7 @@ def compute_fbank_tedlium():
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=cur_num_jobs,
                 executor=ex,
@@ -88,7 +93,7 @@ def compute_fbank_tedlium():
             )
             # Split long cuts into many short and un-overlapping cuts
             cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")

 if __name__ == "__main__":
diff --git a/egs/tedlium3/ASR/local/display_manifest_statistics.py b/egs/tedlium3/ASR/local/display_manifest_statistics.py
index 972d03b12..52e152389 100755
--- a/egs/tedlium3/ASR/local/display_manifest_statistics.py
+++ b/egs/tedlium3/ASR/local/display_manifest_statistics.py
@@ -27,15 +27,15 @@ for usage.
 """

-from lhotse import load_manifest
+from lhotse import load_manifest_lazy

 def main():
-    path = "./data/fbank/cuts_train.json.gz"
-    path = "./data/fbank/cuts_dev.json.gz"
-    path = "./data/fbank/cuts_test.json.gz"
+    path = "./data/fbank/tedlium_cuts_train.jsonl.gz"
+    path = "./data/fbank/tedlium_cuts_dev.jsonl.gz"
+    path = "./data/fbank/tedlium_cuts_test.jsonl.gz"

-    cuts = load_manifest(path)
+    cuts = load_manifest_lazy(path)
     cuts.describe()
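The TED-LIUM feature script relies on trim_to_supervisions to turn long recording-level cuts into utterance-sized examples before saving the manifest. A sketch of the call (split_into_utterances is a hypothetical wrapper, not part of the recipe):

    from lhotse import CutSet

    def split_into_utterances(cut_set: CutSet) -> CutSet:
        # One output cut per supervision segment; keep_overlapping=False
        # drops overlapping speech from neighbouring segments instead of
        # duplicating it across cuts.
        return cut_set.trim_to_supervisions(keep_overlapping=False)
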
diff --git a/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py b/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py
index a6b986a94..ae22bfd92 100644
--- a/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py
+++ b/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py
@@ -22,11 +22,11 @@ import logging
 from functools import lru_cache
 from pathlib import Path

-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
+    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -92,7 +92,7 @@ class TedLiumAsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the BucketingSampler"
+            help="The number of buckets for the DynamicBucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -179,8 +179,8 @@ class TedLiumAsrDataModule:
         transforms = []
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
-            cuts_musan = load_manifest(
-                self.args.manifest_dir / "cuts_musan.json.gz"
+            cuts_musan = load_manifest_lazy(
+                self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
                 CutMix(
@@ -261,13 +261,12 @@ class TedLiumAsrDataModule:
         )

         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -311,7 +310,7 @@ class TedLiumAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = BucketingSampler(
+        valid_sampler = DynamicBucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -335,8 +334,10 @@ class TedLiumAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration,
+            shuffle=False,
         )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
@@ -350,14 +351,20 @@ class TedLiumAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "tedlium_cuts_train.jsonl.gz"
+        )

     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "tedlium_cuts_dev.jsonl.gz"
+        )

     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "tedlium_cuts_test.jsonl.gz"
+        )
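DynamicBucketingSampler is what makes the lazy manifests trainable: where the old BucketingSampler materialized and sorted the whole CutSet up front (hence its now-removed bucket_method="equal_duration" argument), the dynamic sampler estimates bucket boundaries from cut durations as the manifest streams by. A sketch mirroring the construction above (max_duration is illustrative; num_buckets matches the --num-buckets default):

    from lhotse import load_manifest_lazy
    from lhotse.dataset import DynamicBucketingSampler

    cuts_train = load_manifest_lazy("data/fbank/tedlium_cuts_train.jsonl.gz")

    train_sampler = DynamicBucketingSampler(
        cuts_train,
        max_duration=200.0,  # seconds of audio per batch (illustrative)
        shuffle=True,
        num_buckets=30,
        drop_last=True,
    )
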
diff --git a/egs/timit/ASR/local/compute_fbank_timit.py b/egs/timit/ASR/local/compute_fbank_timit.py
index 8e3cbac4e..094769c8c 100644
--- a/egs/timit/ASR/local/compute_fbank_timit.py
+++ b/egs/timit/ASR/local/compute_fbank_timit.py
@@ -29,7 +29,7 @@ import os
 from pathlib import Path

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor

@@ -53,8 +53,13 @@ def compute_fbank_timit():
         "DEV",
         "TEST",
     )
+    prefix = "timit"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="timit", dataset_parts=dataset_parts, output_dir=src_dir
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None

@@ -62,7 +67,8 @@ def compute_fbank_timit():
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
+            if cuts_file.is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -78,13 +84,13 @@ def compute_fbank_timit():
             )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=LilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(cuts_file)

 if __name__ == "__main__":
diff --git a/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py
index a7029f514..665b5a771 100644
--- a/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -23,11 +23,11 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List, Union

-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
+    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -92,7 +92,7 @@ class TimitAsrDataModule(DataModule):
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the BucketingSampler"
+            help="The number of buckets for the DynamicBucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -154,7 +154,9 @@ class TimitAsrDataModule(DataModule):
         cuts_train = self.train_cuts()

         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz")
+        cuts_musan = load_manifest_lazy(
+            self.args.feature_dir / "cuts_musan.jsonl.gz"
+        )

         logging.info("About to create train dataset")
         transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
@@ -218,13 +220,12 @@ class TimitAsrDataModule(DataModule):
         )

         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -322,20 +323,26 @@ class TimitAsrDataModule(DataModule):
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest(self.args.feature_dir / "cuts_TRAIN.json.gz")
+        cuts_train = load_manifest_lazy(
+            self.args.feature_dir / "timit_cuts_TRAIN.jsonl.gz"
+        )

         return cuts_train

     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest(self.args.feature_dir / "cuts_DEV.json.gz")
+        cuts_valid = load_manifest_lazy(
+            self.args.feature_dir / "timit_cuts_DEV.jsonl.gz"
+        )

         return cuts_valid

     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.debug("About to get test cuts")
-        cuts_test = load_manifest(self.args.feature_dir / "cuts_TEST.json.gz")
+        cuts_test = load_manifest_lazy(
+            self.args.feature_dir / "timit_cuts_TEST.jsonl.gz"
+        )

         return cuts_test
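LilcomChunkyWriter replaces LilcomHdf5Writer as the storage backend: it appends lilcom-compressed feature matrices to a single chunked archive instead of an HDF5 file. A minimal sketch of using it through compute_and_store_features (extract_fbank is a hypothetical wrapper; num_mel_bins=80 follows the alimeeting script above):

    from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter

    def extract_fbank(cut_set: CutSet, storage_path: str) -> CutSet:
        # 80-dim log-mel fbank, compressed with lilcom into one archive.
        return cut_set.compute_and_store_features(
            extractor=Fbank(FbankConfig(num_mel_bins=80)),
            storage_path=storage_path,
            storage_type=LilcomChunkyWriter,
        )
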
diff --git a/egs/wenetspeech/ASR/local/display_manifest_statistics.py b/egs/wenetspeech/ASR/local/display_manifest_statistics.py
index 30dc5a5ec..c41445b8d 100644
--- a/egs/wenetspeech/ASR/local/display_manifest_statistics.py
+++ b/egs/wenetspeech/ASR/local/display_manifest_statistics.py
@@ -26,7 +26,7 @@ for usage.
 """

-from lhotse import load_manifest
+from lhotse import load_manifest_lazy

 def main():
@@ -40,7 +40,7 @@ def main():

     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest(path)
+        cuts = load_manifest_lazy(path)
         cuts.describe()
diff --git a/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
index d2f8d85ce..6aebc2164 100644
--- a/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
+++ b/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py
@@ -27,7 +27,7 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest,
+    load_manifest_lazy,
     set_caching_enabled,
 )
 from lhotse.dataset import (
@@ -218,8 +218,8 @@ class WenetSpeechAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )

         transforms = []
@@ -435,16 +435,18 @@ class WenetSpeechAsrDataModule:
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz")

     @lru_cache()
     def test_net_cuts(self) -> List[CutSet]:
         logging.info("About to get TEST_NET cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz"
+        )

     @lru_cache()
     def test_meeting_cuts(self) -> List[CutSet]:
         logging.info("About to get TEST_MEETING cuts")
-        return load_manifest(
+        return load_manifest_lazy(
             self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz"
         )
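A detail worth noting in these data modules: every cut getter is wrapped in functools.lru_cache, so the manifest is opened once per process even when several dataloader builders ask for the same split. A minimal sketch of the pattern (ExampleDataModule and the path are illustrative):

    from functools import lru_cache

    from lhotse import CutSet, load_manifest_lazy

    class ExampleDataModule:
        @lru_cache()
        def valid_cuts(self) -> CutSet:
            # Opened once; subsequent calls return the cached CutSet.
            return load_manifest_lazy("data/fbank/cuts_DEV.jsonl.gz")
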
diff --git a/egs/yesno/ASR/local/compute_fbank_yesno.py b/egs/yesno/ASR/local/compute_fbank_yesno.py
index 6922ffe10..fb48b6f8e 100755
--- a/egs/yesno/ASR/local/compute_fbank_yesno.py
+++ b/egs/yesno/ASR/local/compute_fbank_yesno.py
@@ -12,7 +12,7 @@ import os
 from pathlib import Path

 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached

 from icefall.utils import get_executor

@@ -37,10 +37,13 @@ def compute_fbank_yesno():
         "train",
         "test",
     )
+    prefix = "yesno"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
         dataset_parts=dataset_parts,
         output_dir=src_dir,
-        prefix="yesno",
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None

@@ -50,7 +53,8 @@ def compute_fbank_yesno():
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
+            if cuts_file.is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -66,13 +70,13 @@ def compute_fbank_yesno():
             )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 1,  # use one job
                 executor=ex,
-                storage_type=LilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(cuts_file)

 if __name__ == "__main__":
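read_manifests_if_cached looks up the previously prepared <prefix>_recordings_<part>.<suffix> and <prefix>_supervisions_<part>.<suffix> files in output_dir and returns them keyed by partition, which is what the loop above iterates over. A sketch of calling it directly, following the yesno parameters:

    from pathlib import Path

    from lhotse.recipes.utils import read_manifests_if_cached

    manifests = read_manifests_if_cached(
        dataset_parts=("train", "test"),
        output_dir=Path("data/manifests"),
        prefix="yesno",
        suffix="jsonl.gz",
    )
    # Each partition entry holds that split's RecordingSet and SupervisionSet.
    recordings = manifests["train"]["recordings"]
    supervisions = manifests["train"]["supervisions"]
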
cuts") - cuts_test = load_manifest(self.args.feature_dir / "cuts_test.json.gz") + cuts_test = load_manifest_lazy( + self.args.feature_dir / "yesno_cuts_test.jsonl.gz" + ) return cuts_test