Revert to use LilcomChunkyWriter and load_manifest.

Fangjun Kuang 2022-06-08 21:13:59 +08:00
parent 5079d99ee2
commit ea741070e0
36 changed files with 206 additions and 196 deletions
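For orientation, the two lhotse call sites this commit reverts, as a minimal sketch (the path, num_jobs, and 80-dim config are illustrative, not taken from any one file):

from lhotse import Fbank, FbankConfig, LilcomChunkyWriter, load_manifest

# load_manifest() reads the whole JSONL manifest into memory, replacing
# the lazy, single-pass iterator returned by load_manifest_lazy().
cuts = load_manifest("data/fbank/musan_cuts.jsonl.gz")

# LilcomChunkyWriter stores lilcom-compressed feature chunks, replacing
# the HDF5-backed ChunkedLilcomHdf5Writer used before this commit.
cuts = cuts.compute_and_store_features(
    extractor=Fbank(FbankConfig(num_mel_bins=80)),
    storage_path="data/fbank/musan_feats",
    num_jobs=4,
    storage_type=LilcomChunkyWriter,
)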

View File

@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -90,7 +90,7 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")

View File

@@ -25,7 +25,7 @@ for usage.
 """
 
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 
 
 def main():
@@ -37,7 +37,7 @@ def main():
     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest_lazy(path)
+        cuts = load_manifest(path)
         cuts.describe()
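The practical difference in these display scripts: load_manifest() returns a fully in-memory CutSet, so whole-set operations such as len() work in addition to describe(). A small sketch with an illustrative path:

from lhotse import load_manifest

cuts = load_manifest("data/fbank/aidatatang_cuts_train.jsonl.gz")  # illustrative
print(len(cuts))  # len() needs an eager, in-memory CutSet
cuts.describe()   # prints cut/supervision duration statistics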

View File

@@ -27,13 +27,13 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest_lazy,
+    load_manifest,
     set_caching_enabled,
 )
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -109,7 +109,7 @@ class Aidatatang_200zhAsrDataModule:
             "--num-buckets",
             type=int,
             default=300,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -204,7 +204,7 @@ class Aidatatang_200zhAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
@@ -289,12 +289,13 @@ class Aidatatang_200zhAsrDataModule:
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -349,7 +350,7 @@ class Aidatatang_200zhAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -379,7 +380,7 @@ class Aidatatang_200zhAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -400,20 +401,20 @@ class Aidatatang_200zhAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
         )
 
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz"
         )
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
         )
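The sampler change follows the same pattern in every datamodule: DynamicBucketingSampler is swapped back for BucketingSampler with bucket_method="equal_duration". A standalone sketch of the reverted construction, assuming a lhotse version that still ships BucketingSampler (the path and max_duration are illustrative):

from lhotse import load_manifest
from lhotse.dataset import BucketingSampler

cuts_train = load_manifest("data/fbank/aidatatang_cuts_train.jsonl.gz")

# BucketingSampler partitions an eagerly loaded CutSet into buckets up
# front; "equal_duration" gives every bucket roughly the same total
# duration instead of the same number of cuts.
train_sampler = BucketingSampler(
    cuts_train,
    max_duration=200.0,  # illustrative; the recipes pass --max-duration
    shuffle=True,
    num_buckets=300,
    bucket_method="equal_duration",
    drop_last=True,
)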

View File

@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -90,7 +90,7 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")

View File

@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -86,7 +86,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")

View File

@@ -25,7 +25,7 @@ for usage.
 """
 
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 
 
 def main():
@@ -36,7 +36,7 @@ def main():
     # path = "./data/fbank/aidatatang_cuts_test.jsonl.gz"
     # path = "./data/fbank/aidatatang_cuts_dev.jsonl.gz"
 
-    cuts = load_manifest_lazy(path)
+    cuts = load_manifest(path)
     cuts.describe()

View File

@@ -23,11 +23,11 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List
 
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -93,7 +93,7 @@ class AishellAsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -183,7 +183,7 @@ class AishellAsrDataModule:
     def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
@@ -268,12 +268,13 @@ class AishellAsrDataModule:
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=self.args.drop_last,
             )
         else:
@@ -318,7 +319,7 @@ class AishellAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -342,7 +343,7 @@ class AishellAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -358,7 +359,7 @@ class AishellAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest_lazy(
+        cuts_train = load_manifest(
             self.args.manifest_dir / "aishell_cuts_train.jsonl.gz"
         )
         return cuts_train
@@ -366,13 +367,13 @@ class AishellAsrDataModule:
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "aishell_cuts_dev.jsonl.gz"
         )
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "aishell_cuts_test.jsonl.gz"
         )

View File

@@ -18,7 +18,7 @@
 import logging
 from pathlib import Path
 
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 
 
 class AIDatatang200zh:
@@ -37,17 +37,17 @@ class AIDatatang200zh:
     def train_cuts(self) -> CutSet:
         f = self.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
         logging.info(f"About to get train cuts from {f}")
-        cuts_train = load_manifest_lazy(f)
+        cuts_train = load_manifest(f)
         return cuts_train
 
     def valid_cuts(self) -> CutSet:
         f = self.manifest_dir / "aidatatang_cuts_valid.jsonl.gz"
         logging.info(f"About to get valid cuts from {f}")
-        cuts_valid = load_manifest_lazy(f)
+        cuts_valid = load_manifest(f)
         return cuts_valid
 
     def test_cuts(self) -> CutSet:
         f = self.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
         logging.info(f"About to get test cuts from {f}")
-        cuts_test = load_manifest_lazy(f)
+        cuts_test = load_manifest(f)
         return cuts_test

View File

@@ -18,7 +18,7 @@
 import logging
 from pathlib import Path
 
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 
 
 class AIShell:
@@ -37,17 +37,17 @@ class AIShell:
     def train_cuts(self) -> CutSet:
         f = self.manifest_dir / "aishell_cuts_train.jsonl.gz"
         logging.info(f"About to get train cuts from {f}")
-        cuts_train = load_manifest_lazy(f)
+        cuts_train = load_manifest(f)
         return cuts_train
 
     def valid_cuts(self) -> CutSet:
         f = self.manifest_dir / "aishell_cuts_dev.jsonl.gz"
         logging.info(f"About to get valid cuts from {f}")
-        cuts_valid = load_manifest_lazy(f)
+        cuts_valid = load_manifest(f)
         return cuts_valid
 
     def test_cuts(self) -> CutSet:
         f = self.manifest_dir / "aishell_cuts_test.jsonl.gz"
         logging.info(f"About to get test cuts from {f}")
-        cuts_test = load_manifest_lazy(f)
+        cuts_test = load_manifest(f)
         return cuts_test

View File

@@ -24,8 +24,8 @@ from typing import Optional
 from lhotse import CutSet, Fbank, FbankConfig
 from lhotse.dataset import (
+    BucketingSampler,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     SpecAugment,
 )
@@ -72,7 +72,7 @@ class AsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler "
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
@@ -226,12 +226,13 @@ class AsrDataModule:
             return_cuts=self.args.return_cuts,
         )
 
-        logging.info("Using DynamicBucketingSampler.")
-        train_sampler = DynamicBucketingSampler(
+        logging.info("Using BucketingSampler.")
+        train_sampler = BucketingSampler(
             cuts_train,
             max_duration=self.args.max_duration,
             shuffle=self.args.shuffle,
             num_buckets=self.args.num_buckets,
+            bucket_method="equal_duration",
             drop_last=True,
         )
@@ -262,7 +263,7 @@ class AsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -286,7 +287,7 @@ class AsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,

View File

@@ -56,7 +56,7 @@ from asr_datamodule import AsrDataModule
 from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut
 from lhotse.utils import fix_random_seed
 from model import Transducer
@@ -735,7 +735,7 @@ def run(rank, world_size, args):
         train_datatang_cuts = train_datatang_cuts.repeat(times=None)
 
     if args.enable_musan:
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
         )
     else:

View File

@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -90,7 +90,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
                 # when an executor is specified, make more partitions
                 num_jobs=cur_num_jobs,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
 
             logging.info("About splitting cuts into smaller chunks")

View File

@@ -25,7 +25,7 @@ for usage.
 """
 
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 
 
 def main():
@@ -37,7 +37,7 @@ def main():
     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest_lazy(path)
+        cuts = load_manifest(path)
         cuts.describe()

View File

@@ -27,13 +27,13 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest_lazy,
+    load_manifest,
     set_caching_enabled,
 )
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -109,7 +109,7 @@ class AlimeetingAsrDataModule:
             "--num-buckets",
             type=int,
             default=300,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -204,7 +204,7 @@ class AlimeetingAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
@@ -289,13 +289,14 @@ class AlimeetingAsrDataModule:
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
                 buffer_size=30000,
+                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -350,7 +351,7 @@ class AlimeetingAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -380,7 +381,7 @@ class AlimeetingAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -401,20 +402,20 @@ class AlimeetingAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "alimeeting_cuts_train.jsonl.gz"
         )
 
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "alimeeting_cuts_eval.jsonl.gz"
         )
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "alimeeting_cuts_test.jsonl.gz"
        )

View File

@@ -28,7 +28,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -91,7 +91,7 @@ def compute_fbank_librispeech():
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(output_dir / cuts_filename)

View File

@@ -28,7 +28,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -67,7 +67,7 @@ def compute_fbank_musan():
         len(dataset_parts),
     )
 
-    musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
+    musan_cuts_path = output_dir / f"musan_cuts.{suffix}"
 
     if musan_cuts_path.is_file():
         logging.info(f"{musan_cuts_path} already exists - skipping")
@@ -92,7 +92,7 @@ def compute_fbank_musan():
                 storage_path=f"{output_dir}/musan_feats",
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
         )
         musan_cuts.to_file(musan_cuts_path)
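For context, this script extracts features for each MUSAN partition and merges the per-partition CutSets with lhotse's combine(); a simplified sketch under illustrative paths and partitions:

from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine

extractor = Fbank(FbankConfig(num_mel_bins=80))
partitions = ["music", "noise"]  # illustrative subset of MUSAN

# One CutSet per partition, features written with LilcomChunkyWriter,
# then merged into a single manifest.
musan_cuts = combine(
    CutSet.from_file(f"data/manifests/musan_cuts_{name}.jsonl.gz")
    .compute_and_store_features(
        extractor=extractor,
        storage_path=f"data/fbank/musan_feats_{name}",
        storage_type=LilcomChunkyWriter,
    )
    for name in partitions
)
musan_cuts.to_file("data/fbank/musan_cuts.jsonl.gz")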

View File

@@ -25,7 +25,7 @@ for usage.
 """
 
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 
 
 def main():
@@ -37,7 +37,7 @@ def main():
     # path = "./data/fbank/librispeech_cuts_test-clean.jsonl.gz"
     path = "./data/fbank/librispeech_cuts_test-other.jsonl.gz"
 
-    cuts = load_manifest_lazy(path)
+    cuts = load_manifest(path)
     cuts.describe()

View File

@@ -33,7 +33,7 @@ import argparse
 import logging
 from pathlib import Path
 
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut
@@ -76,7 +76,7 @@ def main():
     logging.info(f"Validating {manifest}")
     assert manifest.is_file(), f"{manifest} does not exist"
-    cut_set = load_manifest_lazy(manifest)
+    cut_set = load_manifest(manifest)
     assert isinstance(cut_set, CutSet)
 
     for c in cut_set:

View File

@@ -22,6 +22,7 @@ from typing import Optional
 from lhotse import CutSet, Fbank, FbankConfig
 from lhotse.dataset import (
+    BucketingSampler,
     CutMix,
     DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
@@ -70,7 +71,8 @@ class AsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler. "
+            help="The number of buckets for the BucketingSampler "
+            "and DynamicBucketingSampler."
             "(you might want to increase it for larger datasets).",
         )
@@ -150,6 +152,7 @@ class AsrDataModule:
     def train_dataloaders(
         self,
         cuts_train: CutSet,
+        dynamic_bucketing: bool,
         on_the_fly_feats: bool,
         cuts_musan: Optional[CutSet] = None,
     ) -> DataLoader:
@@ -159,6 +162,9 @@ class AsrDataModule:
             Cuts for training.
           cuts_musan:
             If not None, it is the cuts for mixing.
+          dynamic_bucketing:
+            True to use DynamicBucketingSampler;
+            False to use BucketingSampler.
           on_the_fly_feats:
             True to use OnTheFlyFeatures;
             False to use PrecomputedFeatures.
@@ -224,6 +230,7 @@ class AsrDataModule:
                 return_cuts=self.args.return_cuts,
             )
 
+        if dynamic_bucketing:
             logging.info("Using DynamicBucketingSampler.")
             train_sampler = DynamicBucketingSampler(
                 cuts_train,
@@ -232,6 +239,16 @@ class AsrDataModule:
                 num_buckets=self.args.num_buckets,
                 drop_last=True,
             )
+        else:
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
+                cuts_train,
+                max_duration=self.args.max_duration,
+                shuffle=self.args.shuffle,
+                num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
+                drop_last=True,
+            )
 
         logging.info("About to create train dataloader")
         train_dl = DataLoader(
@@ -260,12 +277,10 @@ class AsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
-            num_buckets=self.args.num_buckets,
-            drop_last=False,
         )
         logging.info("About to create dev dataloader")
         valid_dl = DataLoader(
@@ -286,12 +301,8 @@ class AsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
-            cuts,
-            max_duration=self.args.max_duration,
-            shuffle=False,
-            num_buckets=self.args.num_buckets,
-            drop_last=True,
-        )
+        sampler = BucketingSampler(
+            cuts, max_duration=self.args.max_duration, shuffle=False
+        )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
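The new dynamic_bucketing flag boils down to the sampler choice below; a self-contained sketch (make_train_sampler is a hypothetical helper, and the constants are illustrative):

from lhotse import load_manifest
from lhotse.dataset import BucketingSampler, DynamicBucketingSampler

def make_train_sampler(cuts, dynamic_bucketing: bool, max_duration: float = 200.0):
    if dynamic_bucketing:
        # Buckets cuts by duration on the fly; suits lazy or endlessly
        # repeated CutSets because it never needs len(cuts).
        return DynamicBucketingSampler(
            cuts,
            max_duration=max_duration,
            shuffle=True,
            num_buckets=30,
            drop_last=True,
        )
    # Pre-computes equal-duration buckets from an in-memory CutSet.
    return BucketingSampler(
        cuts,
        max_duration=max_duration,
        shuffle=True,
        num_buckets=30,
        bucket_method="equal_duration",
        drop_last=True,
    )

sampler = make_train_sampler(
    load_manifest("data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"),
    dynamic_bucketing=False,
)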

View File

@@ -18,7 +18,7 @@
 import logging
 from pathlib import Path
 
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 
 
 class LibriSpeech:
@@ -41,34 +41,34 @@ class LibriSpeech:
     def train_clean_100_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
         logging.info(f"About to get train-clean-100 cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def train_clean_360_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
         logging.info(f"About to get train-clean-360 cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def train_other_500_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
         logging.info(f"About to get train-other-500 cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def test_clean_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
         logging.info(f"About to get test-clean cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def test_other_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
         logging.info(f"About to get test-other cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def dev_clean_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
         logging.info(f"About to get dev-clean cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)
 
     def dev_other_cuts(self) -> CutSet:
         f = self.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
         logging.info(f"About to get dev-other cuts from {f}")
-        return load_manifest_lazy(f)
+        return load_manifest(f)

View File

@@ -66,7 +66,7 @@ from conformer import Conformer
 from decoder import Decoder
 from gigaspeech import GigaSpeech
 from joiner import Joiner
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut
 from lhotse.dataset.sampling.base import CutSampler
 from lhotse.utils import fix_random_seed
@@ -968,7 +968,7 @@ def run(rank, world_size, args):
         train_giga_cuts = train_giga_cuts.repeat(times=None)
 
     if args.enable_musan:
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
         )
     else:
@@ -978,12 +978,14 @@ def run(rank, world_size, args):
     train_dl = asr_datamodule.train_dataloaders(
         train_cuts,
+        dynamic_bucketing=False,
         on_the_fly_feats=False,
         cuts_musan=cuts_musan,
     )
 
     giga_train_dl = asr_datamodule.train_dataloaders(
         train_giga_cuts,
+        dynamic_bucketing=True,
         on_the_fly_feats=False,
         cuts_musan=cuts_musan,
     )

View File

@@ -24,11 +24,11 @@ from pathlib import Path
 from typing import Any, Dict, Optional
 
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -113,7 +113,7 @@ class LibriSpeechAsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -224,7 +224,7 @@ class LibriSpeechAsrDataModule:
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
             logging.info("About to get Musan cuts")
-            cuts_musan = load_manifest_lazy(
+            cuts_musan = load_manifest(
                 self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
@@ -306,12 +306,13 @@ class LibriSpeechAsrDataModule:
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=self.args.drop_last,
             )
         else:
@@ -366,7 +367,7 @@ class LibriSpeechAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -376,7 +377,7 @@ class LibriSpeechAsrDataModule:
             validate,
             sampler=valid_sampler,
             batch_size=None,
-            num_workers=2,
+            num_workers=self.args.num_workers,
             persistent_workers=False,
         )
@@ -390,7 +391,7 @@ class LibriSpeechAsrDataModule:
             else eval(self.args.input_strategy)(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -407,48 +408,48 @@ class LibriSpeechAsrDataModule:
     @lru_cache()
     def train_clean_100_cuts(self) -> CutSet:
         logging.info("About to get train-clean-100 cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
         )
 
     @lru_cache()
     def train_clean_360_cuts(self) -> CutSet:
         logging.info("About to get train-clean-360 cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
         )
 
     @lru_cache()
     def train_other_500_cuts(self) -> CutSet:
         logging.info("About to get train-other-500 cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
         )
 
     @lru_cache()
     def dev_clean_cuts(self) -> CutSet:
         logging.info("About to get dev-clean cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
         )
 
     @lru_cache()
     def dev_other_cuts(self) -> CutSet:
         logging.info("About to get dev-other cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
         )
 
     @lru_cache()
     def test_clean_cuts(self) -> CutSet:
         logging.info("About to get test-clean cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
         )
 
     @lru_cache()
     def test_other_cuts(self) -> CutSet:
         logging.info("About to get test-other cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
         )

View File

@@ -28,7 +28,7 @@ from pathlib import Path
 from asr_datamodule import AsrDataModule
 from gigaspeech import GigaSpeech
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 from librispeech import LibriSpeech
@@ -41,7 +41,7 @@ def test_dataset():
     print(args)
 
     if args.enable_musan:
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
         )
     else:

View File

@@ -73,7 +73,7 @@ from conformer import Conformer
 from decoder import Decoder
 from gigaspeech import GigaSpeech
 from joiner import Joiner
-from lhotse import CutSet, load_manifest_lazy
+from lhotse import CutSet, load_manifest
 from lhotse.cut import Cut
 from lhotse.utils import fix_random_seed
 from librispeech import LibriSpeech
@@ -775,7 +775,7 @@ def run(rank, world_size, args):
         train_giga_cuts = train_giga_cuts.repeat(times=None)
 
     if args.enable_musan:
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
         )
     else:
@@ -785,13 +785,15 @@ def run(rank, world_size, args):
     train_dl = asr_datamodule.train_dataloaders(
         train_cuts,
+        dynamic_bucketing=False,
         on_the_fly_feats=False,
         cuts_musan=cuts_musan,
     )
 
     giga_train_dl = asr_datamodule.train_dataloaders(
         train_giga_cuts,
-        on_the_fly_feats=True,
+        dynamic_bucketing=True,
+        on_the_fly_feats=False,
         cuts_musan=cuts_musan,
     )

View File

@@ -27,7 +27,7 @@ import logging
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, combine
+from lhotse import CutSet, LilcomChunkyWriter, combine
 from lhotse.features.kaldifeat import (
     KaldifeatFbank,
     KaldifeatFbankConfig,
@@ -91,7 +91,7 @@ def compute_fbank_musan():
             storage_path=output_dir / "feats_musan",
             batch_duration=500,
             num_workers=4,
-            storage_type=ChunkedLilcomHdf5Writer,
+            storage_type=LilcomChunkyWriter,
         )
     )

View File

@@ -27,7 +27,7 @@ import logging
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, load_manifest_lazy
+from lhotse import LilcomChunkyWriter, load_manifest_lazy
 from lhotse.features.kaldifeat import (
     KaldifeatFbank,
     KaldifeatFbankConfig,
@@ -118,7 +118,7 @@ def compute_fbank_spgispeech(args):
             storage_path=output_dir / f"feats_train_{idx}",
             batch_duration=500,
             num_workers=4,
-            storage_type=ChunkedLilcomHdf5Writer,
+            storage_type=LilcomChunkyWriter,
         )
         cs.to_file(cuts_train_idx_path)
@@ -137,7 +137,7 @@ def compute_fbank_spgispeech(args):
             manifest_path=src_dir / f"cuts_{partition}.jsonl.gz",
             batch_duration=500,
             num_workers=4,
-            storage_type=ChunkedLilcomHdf5Writer,
+            storage_type=LilcomChunkyWriter,
         )
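These kaldifeat-based scripts write features through lhotse's batched GPU path; a sketch of the reverted writer in that flow (the path, device, and batch_duration are illustrative, and a CUDA device plus the kaldifeat package are assumed):

from lhotse import CutSet, LilcomChunkyWriter
from lhotse.features.kaldifeat import KaldifeatFbank, KaldifeatFbankConfig

cuts = CutSet.from_file("data/manifests/cuts_train_raw.jsonl.gz")  # illustrative
extractor = KaldifeatFbank(KaldifeatFbankConfig(device="cuda"))

cuts = cuts.compute_and_store_features_batch(
    extractor=extractor,
    storage_path="data/fbank/feats_train",
    batch_duration=500,  # seconds of audio per GPU batch
    num_workers=4,
    storage_type=LilcomChunkyWriter,
)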

View File

@@ -27,7 +27,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -89,7 +89,7 @@ def compute_fbank_tedlium():
                 # when an executor is specified, make more partitions
                 num_jobs=cur_num_jobs,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             # Split long cuts into many short and un-overlapping cuts
             cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)

View File

@@ -27,7 +27,7 @@ for usage.
 """
 
-from lhotse import load_manifest_lazy
+from lhotse import load_manifest
 
 
 def main():
@@ -35,7 +35,7 @@ def main():
     path = "./data/fbank/tedlium_cuts_dev.jsonl.gz"
     path = "./data/fbank/tedlium_cuts_test.jsonl.gz"
 
-    cuts = load_manifest_lazy(path)
+    cuts = load_manifest(path)
     cuts.describe()

View File

@@ -22,11 +22,11 @@ import logging
 from functools import lru_cache
 from pathlib import Path
 
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -92,7 +92,7 @@ class TedLiumAsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -179,7 +179,7 @@ class TedLiumAsrDataModule:
         transforms = []
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
-            cuts_musan = load_manifest_lazy(
+            cuts_musan = load_manifest(
                 self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
@@ -261,12 +261,13 @@ class TedLiumAsrDataModule:
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -310,7 +311,7 @@ class TedLiumAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -334,7 +335,7 @@ class TedLiumAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -351,20 +352,20 @@ class TedLiumAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "tedlium_cuts_train.jsonl.gz"
         )
 
     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "tedlium_cuts_dev.jsonl.gz"
         )
 
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "tedlium_cuts_test.jsonl.gz"
        )

View File

@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -88,7 +88,7 @@ def compute_fbank_timit():
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(cuts_file)

View File

@@ -23,11 +23,11 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List, Union
 
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -92,7 +92,7 @@ class TimitAsrDataModule(DataModule):
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -154,7 +154,7 @@ class TimitAsrDataModule(DataModule):
         cuts_train = self.train_cuts()
 
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             self.args.feature_dir / "cuts_musan.jsonl.gz"
         )
@@ -220,12 +220,13 @@ class TimitAsrDataModule(DataModule):
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -273,7 +274,7 @@ class TimitAsrDataModule(DataModule):
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = SingleCutSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -306,7 +307,7 @@ class TimitAsrDataModule(DataModule):
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = SingleCutSampler(
+        sampler = BucketingSampler(
             cuts_test, max_duration=self.args.max_duration
         )
         logging.debug("About to create test dataloader")
@@ -323,7 +324,7 @@ class TimitAsrDataModule(DataModule):
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest_lazy(
+        cuts_train = load_manifest(
             self.args.feature_dir / "timit_cuts_TRAIN.jsonl.gz"
         )
@@ -332,7 +333,7 @@ class TimitAsrDataModule(DataModule):
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest_lazy(
+        cuts_valid = load_manifest(
             self.args.feature_dir / "timit_cuts_DEV.jsonl.gz"
         )
@@ -341,7 +342,7 @@ class TimitAsrDataModule(DataModule):
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.debug("About to get test cuts")
-        cuts_test = load_manifest_lazy(
+        cuts_test = load_manifest(
             self.args.feature_dir / "timit_cuts_TEST.jsonl.gz"
         )

View File

@@ -24,7 +24,7 @@ from lhotse import (
     CutSet,
     KaldifeatFbank,
     KaldifeatFbankConfig,
-    LilcomHdf5Writer,
+    LilcomChunkyWriter,
 )
 
 # Torch's multithreaded behavior needs to be disabled or
@@ -70,7 +70,7 @@ def compute_fbank_wenetspeech_dev_test():
             storage_path=f"{in_out_dir}/feats_{partition}",
             num_workers=num_workers,
             batch_duration=batch_duration,
-            storage_type=LilcomHdf5Writer,
+            storage_type=LilcomChunkyWriter,
         )
         cut_set = cut_set.trim_to_supervisions(
             keep_overlapping=False, min_duration=None

View File

@@ -23,10 +23,10 @@ from pathlib import Path
 import torch
 from lhotse import (
-    ChunkedLilcomHdf5Writer,
     CutSet,
     KaldifeatFbank,
     KaldifeatFbankConfig,
+    LilcomChunkyWriter,
     set_audio_duration_mismatch_tolerance,
     set_caching_enabled,
 )
@@ -135,7 +135,7 @@ def compute_fbank_wenetspeech_splits(args):
             storage_path=f"{output_dir}/feats_{subset}_{idx}",
             num_workers=args.num_workers,
             batch_duration=args.batch_duration,
-            storage_type=ChunkedLilcomHdf5Writer,
+            storage_type=LilcomChunkyWriter,
         )
 
         logging.info("About to split cuts into smaller chunks.")

View File

@@ -27,10 +27,11 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest_lazy,
+    load_manifest,
     set_caching_enabled,
 )
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
     CutMix,
     DynamicBucketingSampler,
@@ -109,8 +110,9 @@ class WenetSpeechAsrDataModule:
             "--num-buckets",
             type=int,
             default=300,
-            help="The number of buckets for the DynamicBucketingSampler"
-            "(you might want to increase it for larger datasets).",
+            help="""The number of buckets for the BucketingSampler and
+            DynamicBucketingSampler
+            (you might want to increase it for larger datasets).""",
         )
         group.add_argument(
             "--concatenate-cuts",
@@ -218,7 +220,7 @@ class WenetSpeechAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest_lazy(
+        cuts_musan = load_manifest(
             self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
@@ -364,23 +366,16 @@ class WenetSpeechAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = DynamicBucketingSampler(
+        valid_sampler = BucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
-            rank=0,
-            world_size=1,
             shuffle=False,
         )
         logging.info("About to create dev dataloader")
-
-        from lhotse.dataset.iterable_dataset import IterableDatasetWrapper
-
-        dev_iter_dataset = IterableDatasetWrapper(
-            dataset=validate,
-            sampler=valid_sampler,
-        )
         valid_dl = DataLoader(
-            dev_iter_dataset,
+            validate,
+            sampler=valid_sampler,
             batch_size=None,
             num_workers=self.args.num_workers,
             persistent_workers=False,
@@ -396,22 +391,16 @@ class WenetSpeechAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts,
             max_duration=self.args.max_duration,
-            rank=0,
-            world_size=1,
             shuffle=False,
         )
-
-        from lhotse.dataset.iterable_dataset import IterableDatasetWrapper
-
-        test_iter_dataset = IterableDatasetWrapper(
-            dataset=test,
-            sampler=sampler,
-        )
         test_dl = DataLoader(
-            test_iter_dataset,
+            test,
             batch_size=None,
+            sampler=sampler,
             num_workers=self.args.num_workers,
         )
         return test_dl
@@ -435,18 +424,16 @@ class WenetSpeechAsrDataModule:
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        return load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
 
     @lru_cache()
     def test_net_cuts(self) -> List[CutSet]:
         logging.info("About to get TEST_NET cuts")
-        return load_manifest_lazy(
-            self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz"
-        )
+        return load_manifest(self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz")
 
     @lru_cache()
     def test_meeting_cuts(self) -> List[CutSet]:
         logging.info("About to get TEST_MEETING cuts")
-        return load_manifest_lazy(
+        return load_manifest(
             self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz"
         )
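The WenetSpeech dev/test loaders also drop IterableDatasetWrapper: with a map-style K2SpeechRecognitionDataset, the sampler can be handed directly to DataLoader. A sketch of the reverted wiring with an illustrative manifest path:

from lhotse import load_manifest
from lhotse.dataset import BucketingSampler, K2SpeechRecognitionDataset
from torch.utils.data import DataLoader

cuts_valid = load_manifest("data/fbank/cuts_DEV.jsonl.gz")  # illustrative
validate = K2SpeechRecognitionDataset(return_cuts=True)
valid_sampler = BucketingSampler(
    cuts_valid,
    max_duration=200.0,  # illustrative
    shuffle=False,
)
valid_dl = DataLoader(
    validate,
    sampler=valid_sampler,
    batch_size=None,  # the sampler already yields whole batches of cuts
    num_workers=2,
    persistent_workers=False,
)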

View File

@@ -12,7 +12,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -74,7 +74,7 @@ def compute_fbank_yesno():
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 1,  # use one job
                 executor=ex,
-                storage_type=ChunkedLilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
             cut_set.to_file(cuts_file)

View File

@@ -20,10 +20,10 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List
 
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest
 from lhotse.dataset import (
+    BucketingSampler,
     CutConcatenate,
-    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -85,7 +85,7 @@ class YesNoAsrDataModule(DataModule):
             "--num-buckets",
             type=int,
             default=10,
-            help="The number of buckets for the DynamicBucketingSampler"
+            help="The number of buckets for the BucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -187,12 +187,13 @@ class YesNoAsrDataModule(DataModule):
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
+            logging.info("Using BucketingSampler.")
+            train_sampler = BucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
+                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -225,7 +226,7 @@ class YesNoAsrDataModule(DataModule):
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = DynamicBucketingSampler(
+        sampler = BucketingSampler(
             cuts_test,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -242,7 +243,7 @@ class YesNoAsrDataModule(DataModule):
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest_lazy(
+        cuts_train = load_manifest(
             self.args.feature_dir / "yesno_cuts_train.jsonl.gz"
         )
         return cuts_train
@@ -250,7 +251,7 @@ class YesNoAsrDataModule(DataModule):
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        cuts_test = load_manifest_lazy(
+        cuts_test = load_manifest(
             self.args.feature_dir / "yesno_cuts_test.jsonl.gz"
         )
         return cuts_test