Use jsonl for CutSet in the LibriSpeech recipe. (#397)

* Use jsonl for cutsets in the librispeech recipe.

* Use lazy cutset for all recipes.

* More fixes to use lazy CutSet.

* Remove force=True from logging to support Python < 3.8

* Minor fixes.

* Fix style issues.
Fangjun Kuang 2022-06-06 10:19:16 +08:00 committed by GitHub
parent e5884f82e0
commit f1abce72f8
68 changed files with 702 additions and 1098 deletions

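The change this commit applies across recipes is mostly mechanical: cut manifests move from eager JSON (load_manifest / to_json) to lazily opened JSONL (load_manifest_lazy / to_file), with dataset-prefixed file names. A minimal sketch of the new pattern, with an illustrative path (actual names vary per recipe, e.g. aidatatang_cuts_train.jsonl.gz):

# Minimal sketch of the eager-to-lazy switch; the path below is illustrative.
from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/aishell_cuts_dev.jsonl.gz")  # opens lazily
cuts.describe()  # iterates the JSONL stream instead of holding everything in memory

# Writing follows the same convention: to_file() picks the format from the
# ".jsonl.gz" extension, replacing the old cut_set.to_json(...) calls.
# cuts.to_file("data/fbank/aishell_cuts_dev.jsonl.gz")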
View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache

View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -99,7 +101,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -99,7 +101,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -99,7 +101,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -99,7 +101,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -99,7 +101,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -58,6 +58,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -98,7 +100,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -58,6 +58,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -98,7 +100,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -58,6 +58,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -98,7 +100,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)
 def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
-    src_dir = Path("data/manifests/aidatatang_200zh")
+    src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
@@ -52,11 +52,13 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
         "dev",
         "test",
     )
+    prefix = "aidatatang"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="aidatatang",
-        suffix="jsonl.gz",
         dataset_parts=dataset_parts,
         output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
@@ -64,10 +66,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
+            for sup in m["supervisions"]:
+                sup.custom = {"origin": "aidatatang_200zh"}
             cut_set = CutSet.from_manifests(
                 recordings=m["recordings"],
                 supervisions=m["supervisions"],
@@ -80,13 +86,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
             )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
                 storage_type=ChunkedLilcomHdf5Writer,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
 def get_args():

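For reference, the file-naming convention the fbank scripts converge on in this commit is a sketch like the following (prefix/partition values shown are the ones used in this recipe):

# Sketch of the manifest naming convention used above.
prefix = "aidatatang"
suffix = "jsonl.gz"
partition = "train"
cuts_name = f"{prefix}_cuts_{partition}.{suffix}"  # -> aidatatang_cuts_train.jsonl.gz
feats_name = f"{prefix}_feats_{partition}"         # feature storage path stem
print(cuts_name, feats_name)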
View File

@@ -25,19 +25,19 @@ for usage.
 """
-from lhotse import load_manifest
+from lhotse import load_manifest_lazy
 def main():
     paths = [
-        "./data/fbank/cuts_train.json.gz",
-        "./data/fbank/cuts_dev.json.gz",
-        "./data/fbank/cuts_test.json.gz",
+        "./data/fbank/aidatatang_cuts_train.jsonl.gz",
+        "./data/fbank/aidatatang_cuts_dev.jsonl.gz",
+        "./data/fbank/aidatatang_cuts_test.jsonl.gz",
     ]
     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest(path)
+        cuts = load_manifest_lazy(path)
         cuts.describe()
@@ -45,7 +45,7 @@ if __name__ == "__main__":
     main()
 """
-Starting display the statistics for ./data/fbank/cuts_train.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_train.jsonl.gz
 Cuts count: 494715
 Total duration (hours): 422.6
 Speech duration (hours): 422.6 (100.0%)
@@ -61,7 +61,7 @@ min 1.0
 99.5% 8.0
 99.9% 9.5
 max 18.1
-Starting display the statistics for ./data/fbank/cuts_dev.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_dev.jsonl.gz
 Cuts count: 24216
 Total duration (hours): 20.2
 Speech duration (hours): 20.2 (100.0%)
@@ -77,7 +77,7 @@ min 1.2
 99.5% 7.3
 99.9% 8.8
 max 11.3
-Starting display the statistics for ./data/fbank/cuts_test.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_test.jsonl.gz
 Cuts count: 48144
 Total duration (hours): 40.2
 Speech duration (hours): 40.2 (100.0%)

View File

@@ -27,11 +27,10 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest,
+    load_manifest_lazy,
     set_caching_enabled,
 )
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
     DynamicBucketingSampler,
@@ -205,8 +204,8 @@ class Aidatatang_200zhAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
         transforms = []
@@ -290,13 +289,12 @@ class Aidatatang_200zhAsrDataModule:
         )
         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -402,14 +400,20 @@ class Aidatatang_200zhAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
+        )
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz"
+        )
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
+        )

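The datamodules switch from BucketingSampler to DynamicBucketingSampler because a lazily opened CutSet does not know its length up front, while the dynamic sampler buckets on the fly. A rough usage sketch (the values here are illustrative, not the recipe defaults):

from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler

cuts_train = load_manifest_lazy("data/fbank/aidatatang_cuts_train.jsonl.gz")
train_sampler = DynamicBucketingSampler(
    cuts_train,
    max_duration=200.0,  # illustrative; the recipes pass --max-duration here
    shuffle=True,
    num_buckets=30,
    drop_last=True,
)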
View File

@@ -195,9 +195,9 @@ def get_params() -> AttributeDict:
             "best_train_epoch": -1,
             "best_valid_epoch": -1,
             "batch_idx_train": 0,
-            "log_interval": 10,
+            "log_interval": 50,
             "reset_interval": 200,
-            "valid_interval": 3000,
+            "valid_interval": 2000,
             # parameters for k2.ctc_loss
             "beam_size": 10,
             "reduction": "sum",

View File

@@ -0,0 +1,119 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file computes fbank features of the aidatatang_200zh dataset.
It looks for manifests in the directory data/manifests.

The generated fbank features are saved in data/fbank.
"""

import argparse
import logging
import os
from pathlib import Path

import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached

from icefall.utils import get_executor

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    num_jobs = min(15, os.cpu_count())

    dataset_parts = (
        "train",
        "test",
        "dev",
    )
    prefix = "aidatatang"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            for sup in m["supervisions"]:
                sup.custom = {"origin": "aidatatang_200zh"}
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if "train" in partition:
                cut_set = (
                    cut_set
                    + cut_set.perturb_speed(0.9)
                    + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num-mel-bins",
        type=int,
        default=80,
        help="""The number of mel bins for Fbank""",
    )
    return parser.parse_args()


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    args = get_args()
    compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins)

View File

@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 from icefall.utils import get_executor
@@ -52,8 +52,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
         "dev",
         "test",
     )
+    prefix = "aishell"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="aishell", dataset_parts=dataset_parts, output_dir=src_dir
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
@@ -61,7 +66,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -77,13 +82,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
             )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=LilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
 def get_args():

View File

@@ -25,18 +25,18 @@ for usage.
 """
-from lhotse import load_manifest
+from lhotse import load_manifest_lazy
 def main():
-    # path = "./data/fbank/cuts_train.json.gz"
-    # path = "./data/fbank/cuts_test.json.gz"
-    # path = "./data/fbank/cuts_dev.json.gz"
-    # path = "./data/fbank/aidatatang_200zh/cuts_train_raw.jsonl.gz"
-    # path = "./data/fbank/aidatatang_200zh/cuts_test_raw.jsonl.gz"
-    path = "./data/fbank/aidatatang_200zh/cuts_dev_raw.jsonl.gz"
+    # path = "./data/fbank/aishell_cuts_train.jsonl.gz"
+    # path = "./data/fbank/aishell_cuts_test.jsonl.gz"
+    path = "./data/fbank/aishell_cuts_dev.jsonl.gz"
+    # path = "./data/fbank/aidatatang_cuts_train.jsonl.gz"
+    # path = "./data/fbank/aidatatang_cuts_test.jsonl.gz"
+    # path = "./data/fbank/aidatatang_cuts_dev.jsonl.gz"
-    cuts = load_manifest(path)
+    cuts = load_manifest_lazy(path)
     cuts.describe()

View File

@@ -1,71 +0,0 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path

from lhotse import CutSet
from lhotse.recipes.utils import read_manifests_if_cached


def preprocess_aidatatang_200zh():
    src_dir = Path("data/manifests/aidatatang_200zh")
    output_dir = Path("data/fbank/aidatatang_200zh")
    output_dir.mkdir(exist_ok=True, parents=True)

    dataset_parts = (
        "train",
        "test",
        "dev",
    )

    logging.info("Loading manifest")
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir, prefix="aidatatang"
    )
    assert len(manifests) > 0

    for partition, m in manifests.items():
        logging.info(f"Processing {partition}")
        raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
        if raw_cuts_path.is_file():
            logging.info(f"{partition} already exists - skipping")
            continue
        for sup in m["supervisions"]:
            sup.custom = {"origin": "aidatatang_200zh"}
        cut_set = CutSet.from_manifests(
            recordings=m["recordings"],
            supervisions=m["supervisions"],
        )
        logging.info(f"Saving to {raw_cuts_path}")
        cut_set.to_file(raw_cuts_path)


def main():
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    preprocess_aidatatang_200zh()


if __name__ == "__main__":
    main()

View File

@@ -42,18 +42,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   log "Stage 1: Prepare manifest"
   # We assume that you have downloaded the aidatatang_200zh corpus
   # to $dl_dir/aidatatang_200zh
-  if [ ! -f data/manifests/aidatatang_200zh/.manifests.done ]; then
-    mkdir -p data/manifests/aidatatang_200zh
-    lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
-    touch data/manifests/aidatatang_200zh/.manifests.done
+  if [ ! -f data/manifests/.aidatatang_200zh_manifests.done ]; then
+    mkdir -p data/manifests
+    lhotse prepare aidatatang-200zh $dl_dir data/manifests
+    touch data/manifests/.aidatatang_200zh_manifests.done
   fi
 fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Process aidatatang_200zh"
-  if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then
-    mkdir -p data/fbank/aidatatang_200zh
-    lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
-    touch data/fbank/aidatatang_200zh/.fbank.done
+  if [ ! -f data/fbank/.aidatatang_200zh_fbank.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_aidatatang_200zh.py
+    touch data/fbank/.aidatatang_200zh_fbank.done
   fi
 fi

View File

@@ -23,11 +23,11 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
+    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -93,7 +93,7 @@ class AishellAsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the BucketingSampler"
+            help="The number of buckets for the DynamicBucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -133,6 +133,12 @@ class AishellAsrDataModule:
             help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
         )
+        group.add_argument(
+            "--drop-last",
+            type=str2bool,
+            default=True,
+            help="Whether to drop last batch. Used by sampler.",
+        )
         group.add_argument(
             "--return-cuts",
             type=str2bool,
@@ -177,8 +183,8 @@ class AishellAsrDataModule:
     def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
         transforms = []
@@ -262,14 +268,13 @@ class AishellAsrDataModule:
         )
         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
-                drop_last=True,
+                drop_last=self.args.drop_last,
             )
         else:
             logging.info("Using SingleCutSampler.")
@@ -313,7 +318,7 @@ class AishellAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = BucketingSampler(
+        valid_sampler = DynamicBucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -337,8 +342,10 @@ class AishellAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration,
+            shuffle=False,
         )
         test_dl = DataLoader(
             test,
@@ -351,17 +358,21 @@ class AishellAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest(
-            self.args.manifest_dir / "cuts_train.json.gz"
+        cuts_train = load_manifest_lazy(
+            self.args.manifest_dir / "aishell_cuts_train.jsonl.gz"
         )
         return cuts_train
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aishell_cuts_dev.jsonl.gz"
+        )
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aishell_cuts_test.jsonl.gz"
+        )

View File

@@ -15,6 +15,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Usage
+    export CUDA_VISIBLE_DEVICES="0,1,2,3"
+    ./tdnn_lstm_ctc/train.py \
+        --world-size 4 \
+        --num-epochs 20 \
+        --max-duration 300
+"""
 import argparse
 import logging

View File

@@ -110,9 +110,7 @@ class Conformer(Transformer):
         x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
         # Caution: We assume the subsampling factor is 4!
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            lengths = ((x_lens - 1) // 2 - 1) // 2
+        lengths = (((x_lens - 1) >> 1) - 1) >> 1
         assert x.size(0) == lengths.max().item()
         mask = make_pad_mask(lengths)

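The length computation above swaps floor division for bit shifts; for non-negative integer tensors the two are equivalent, which a quick check (illustrative, not part of the commit) confirms:

import torch

x_lens = torch.tensor([100, 37, 16])
lengths_div = ((x_lens - 1) // 2 - 1) // 2
lengths_shift = (((x_lens - 1) >> 1) - 1) >> 1
assert torch.equal(lengths_div, lengths_shift)  # same result, no floor_divide warning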
View File

@@ -21,6 +21,7 @@
 import argparse
 import logging
+import warnings
 from pathlib import Path
 from shutil import copyfile
 from typing import Optional, Tuple
@@ -386,7 +387,11 @@ def compute_loss(
     assert loss.requires_grad == is_training
     info = MetricsTracker()
-    info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (
+            (feature_lens // params.subsampling_factor).sum().item()
+        )
     # Note: We use reduction=sum while computing the loss.
     info["loss"] = loss.detach().cpu().item()

View File

@@ -18,7 +18,7 @@
 import logging
 from pathlib import Path
-from lhotse import CutSet, load_manifest
+from lhotse import CutSet, load_manifest_lazy
 class AIDatatang200zh:
@@ -28,26 +28,26 @@ class AIDatatang200zh:
         manifest_dir:
           It is expected to contain the following files::
-            - cuts_dev_raw.jsonl.gz
-            - cuts_train_raw.jsonl.gz
-            - cuts_test_raw.jsonl.gz
+            - aidatatang_cuts_dev.jsonl.gz
+            - aidatatang_cuts_train.jsonl.gz
+            - aidatatang_cuts_test.jsonl.gz
         """
         self.manifest_dir = Path(manifest_dir)
     def train_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_train_raw.jsonl.gz"
+        f = self.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
         logging.info(f"About to get train cuts from {f}")
-        cuts_train = load_manifest(f)
+        cuts_train = load_manifest_lazy(f)
         return cuts_train
     def valid_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_valid_raw.jsonl.gz"
+        f = self.manifest_dir / "aidatatang_cuts_valid.jsonl.gz"
         logging.info(f"About to get valid cuts from {f}")
-        cuts_valid = load_manifest(f)
+        cuts_valid = load_manifest_lazy(f)
         return cuts_valid
     def test_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_test_raw.jsonl.gz"
+        f = self.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
         logging.info(f"About to get test cuts from {f}")
-        cuts_test = load_manifest(f)
+        cuts_test = load_manifest_lazy(f)
         return cuts_test

View File

@@ -18,7 +18,7 @@
 import logging
 from pathlib import Path
-from lhotse import CutSet, load_manifest
+from lhotse import CutSet, load_manifest_lazy
 class AIShell:
@@ -28,26 +28,26 @@ class AIShell:
         manifest_dir:
           It is expected to contain the following files::
-            - cuts_dev.json.gz
-            - cuts_train.json.gz
-            - cuts_test.json.gz
+            - aishell_cuts_dev.jsonl.gz
+            - aishell_cuts_train.jsonl.gz
+            - aishell_cuts_test.jsonl.gz
         """
         self.manifest_dir = Path(manifest_dir)
     def train_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_train.json.gz"
+        f = self.manifest_dir / "aishell_cuts_train.jsonl.gz"
        logging.info(f"About to get train cuts from {f}")
-        cuts_train = load_manifest(f)
+        cuts_train = load_manifest_lazy(f)
         return cuts_train
     def valid_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_dev.json.gz"
+        f = self.manifest_dir / "aishell_cuts_dev.jsonl.gz"
         logging.info(f"About to get valid cuts from {f}")
-        cuts_valid = load_manifest(f)
+        cuts_valid = load_manifest_lazy(f)
         return cuts_valid
     def test_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_test.json.gz"
+        f = self.manifest_dir / "aishell_cuts_test.jsonl.gz"
         logging.info(f"About to get test cuts from {f}")
-        cuts_test = load_manifest(f)
+        cuts_test = load_manifest_lazy(f)
         return cuts_test

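A hypothetical usage of the AIShell helper shown above, assuming it is imported from the recipe-local aishell.py and that the manifests live under data/fbank:

from aishell import AIShell  # recipe-local module (assumption)

aishell = AIShell(manifest_dir="data/fbank")
train_cuts = aishell.train_cuts()  # lazily opens aishell_cuts_train.jsonl.gz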
View File

@@ -24,7 +24,6 @@ from typing import Optional
 from lhotse import CutSet, Fbank, FbankConfig
 from lhotse.dataset import (
-    BucketingSampler,
     CutMix,
     DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
@@ -73,8 +72,7 @@ class AsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the BucketingSampler "
-            "and DynamicBucketingSampler."
+            help="The number of buckets for the DynamicBucketingSampler "
             "(you might want to increase it for larger datasets).",
         )
@@ -147,7 +145,6 @@ class AsrDataModule:
     def train_dataloaders(
         self,
         cuts_train: CutSet,
-        dynamic_bucketing: bool,
         on_the_fly_feats: bool,
         cuts_musan: Optional[CutSet] = None,
     ) -> DataLoader:
@@ -157,9 +154,6 @@ class AsrDataModule:
             Cuts for training.
           cuts_musan:
             If not None, it is the cuts for mixing.
-          dynamic_bucketing:
-            True to use DynamicBucketingSampler;
-            False to use BucketingSampler.
           on_the_fly_feats:
             True to use OnTheFlyFeatures;
             False to use PrecomputedFeatures.
@@ -232,25 +226,14 @@ class AsrDataModule:
             return_cuts=self.args.return_cuts,
         )
-        if dynamic_bucketing:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
-                cuts_train,
-                max_duration=self.args.max_duration,
-                shuffle=self.args.shuffle,
-                num_buckets=self.args.num_buckets,
-                drop_last=True,
-            )
-        else:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
-                cuts_train,
-                max_duration=self.args.max_duration,
-                shuffle=self.args.shuffle,
-                num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
-                drop_last=True,
-            )
+        logging.info("Using DynamicBucketingSampler.")
+        train_sampler = DynamicBucketingSampler(
+            cuts_train,
+            max_duration=self.args.max_duration,
+            shuffle=self.args.shuffle,
+            num_buckets=self.args.num_buckets,
+            drop_last=True,
+        )
         logging.info("About to create train dataloader")
         train_dl = DataLoader(
@@ -279,7 +262,7 @@ class AsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = BucketingSampler(
+        valid_sampler = DynamicBucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -303,8 +286,10 @@ class AsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration,
+            shuffle=False,
         )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(

View File

@@ -41,6 +41,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2"
 import argparse
 import logging
 import random
+import warnings
 from pathlib import Path
 from shutil import copyfile
 from typing import Optional, Tuple
@@ -55,7 +56,7 @@ from asr_datamodule import AsrDataModule
 from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
-from lhotse import CutSet, load_manifest
+from lhotse import CutSet, load_manifest_lazy
 from lhotse.cut import Cut
 from lhotse.utils import fix_random_seed
 from model import Transducer
@@ -446,7 +447,11 @@ def compute_loss(
     assert loss.requires_grad == is_training
     info = MetricsTracker()
-    info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (
+            (feature_lens // params.subsampling_factor).sum().item()
+        )
     # Note: We use reduction=sum while computing the loss.
     info["loss"] = loss.detach().cpu().item()
@@ -635,20 +640,16 @@ def train_one_epoch(
 def filter_short_and_long_utterances(cuts: CutSet) -> CutSet:
     def remove_short_and_long_utt(c: Cut):
-        # Keep only utterances with duration between 1 second and 12 seconds
+        # Keep only utterances with duration between 1 second and 20 seconds
+        #
+        # Caution: There is a reason to select 12.0 here. Please see
+        # ../local/display_manifest_statistics.py
+        #
+        # You should use ../local/display_manifest_statistics.py to get
+        # an utterance duration distribution for your dataset to select
+        # the threshold
         return 1.0 <= c.duration <= 12.0
-    num_in_total = len(cuts)
     cuts = cuts.filter(remove_short_and_long_utt)
-    num_left = len(cuts)
-    num_removed = num_in_total - num_left
-    removed_percent = num_removed / num_in_total * 100
-    logging.info(f"Before removing short and long utterances: {num_in_total}")
-    logging.info(f"After removing short and long utterances: {num_left}")
-    logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
     return cuts
@@ -728,15 +729,14 @@ def run(rank, world_size, args):
     train_cuts = aishell.train_cuts()
     train_cuts = filter_short_and_long_utterances(train_cuts)
-    datatang = AIDatatang200zh(
-        manifest_dir=f"{args.manifest_dir}/aidatatang_200zh"
-    )
+    datatang = AIDatatang200zh(manifest_dir=args.manifest_dir)
     train_datatang_cuts = datatang.train_cuts()
     train_datatang_cuts = filter_short_and_long_utterances(train_datatang_cuts)
+    train_datatang_cuts = train_datatang_cuts.repeat(times=None)
     if args.enable_musan:
-        cuts_musan = load_manifest(
-            Path(args.manifest_dir) / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
         )
     else:
         cuts_musan = None
@@ -745,22 +745,23 @@ def run(rank, world_size, args):
     train_dl = asr_datamodule.train_dataloaders(
         train_cuts,
-        dynamic_bucketing=False,
         on_the_fly_feats=False,
         cuts_musan=cuts_musan,
     )
     datatang_train_dl = asr_datamodule.train_dataloaders(
         train_datatang_cuts,
-        dynamic_bucketing=True,
-        on_the_fly_feats=True,
+        on_the_fly_feats=False,
         cuts_musan=cuts_musan,
     )
     valid_cuts = aishell.valid_cuts()
     valid_dl = asr_datamodule.valid_dataloaders(valid_cuts)
-    for dl in [train_dl, datatang_train_dl]:
+    for dl in [
+        train_dl,
+        # datatang_train_dl
+    ]:
         scan_pessimistic_batches_for_oom(
             model=model,
             train_dl=dl,

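The training-script changes above drop the len(cuts) bookkeeping and add .repeat(times=None): a lazily opened CutSet cannot be counted cheaply, and the aidatatang cuts are now cycled indefinitely alongside the aishell data. A hedged sketch of that pattern:

from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/aidatatang_cuts_train.jsonl.gz")
cuts = cuts.filter(lambda c: 1.0 <= c.duration <= 12.0)  # stays lazy; applied during iteration
cuts = cuts.repeat(times=None)  # times=None repeats the manifest without end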
View File

@@ -37,6 +37,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2"
 import argparse
 import logging
+import warnings
 from pathlib import Path
 from shutil import copyfile
 from typing import Optional, Tuple
@@ -411,7 +412,11 @@ def compute_loss(
     assert loss.requires_grad == is_training
     info = MetricsTracker()
-    info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (
+            (feature_lens // params.subsampling_factor).sum().item()
+        )
     # Note: We use reduction=sum while computing the loss.
     info["loss"] = loss.detach().cpu().item()

View File

@@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)
 def compute_fbank_alimeeting(num_mel_bins: int = 80):
-    src_dir = Path("data/manifests/alimeeting")
+    src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
@@ -52,11 +52,14 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
         "eval",
         "test",
     )
+    prefix = "alimeeting"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
         dataset_parts=dataset_parts,
         output_dir=src_dir,
-        prefix="alimeeting",
-        suffix="jsonl.gz",
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
@@ -64,7 +67,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -83,7 +86,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=cur_num_jobs,
                 executor=ex,
@@ -95,7 +98,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
                 keep_overlapping=False,
                 min_duration=None,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
 def get_args():

View File

@@ -25,19 +25,19 @@ for usage.
 """
-from lhotse import load_manifest
+from lhotse import load_manifest_lazy
 def main():
     paths = [
-        "./data/fbank/cuts_train.json.gz",
-        "./data/fbank/cuts_eval.json.gz",
-        "./data/fbank/cuts_test.json.gz",
+        "./data/fbank/alimeeting_cuts_train.jsonl.gz",
+        "./data/fbank/alimeeting_cuts_eval.jsonl.gz",
+        "./data/fbank/alimeeting_cuts_test.jsonl.gz",
     ]
     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest(path)
+        cuts = load_manifest_lazy(path)
         cuts.describe()
@@ -45,7 +45,7 @@ if __name__ == "__main__":
     main()
 """
-Starting display the statistics for ./data/fbank/cuts_train.json.gz
+Starting display the statistics for ./data/fbank/alimeeting_cuts_train.jsonl.gz
 Cuts count: 559092
 Total duration (hours): 424.6
 Speech duration (hours): 424.6 (100.0%)
@@ -61,7 +61,7 @@ min 0.0
 99.5% 14.7
 99.9% 16.2
 max 284.3
-Starting display the statistics for ./data/fbank/cuts_eval.json.gz
+Starting display the statistics for ./data/fbank/alimeeting_cuts_eval.jsonl.gz
 Cuts count: 6457
 Total duration (hours): 4.9
 Speech duration (hours): 4.9 (100.0%)
@@ -77,7 +77,7 @@ min 0.1
 99.5% 14.1
 99.9% 14.7
 max 15.8
-Starting display the statistics for ./data/fbank/cuts_test.json.gz
+Starting display the statistics for ./data/fbank/alimeeting_cuts_test.jsonl.gz
 Cuts count: 16358
 Total duration (hours): 12.5
 Speech duration (hours): 12.5 (100.0%)

View File

@@ -27,7 +27,7 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest,
+    load_manifest_lazy,
     set_caching_enabled,
 )
 from lhotse.dataset import (
@@ -204,8 +204,8 @@ class AlimeetingAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
         transforms = []
@@ -401,14 +401,20 @@ class AlimeetingAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "alimeeting_cuts_train.jsonl.gz"
+        )
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_eval.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "alimeeting_cuts_eval.jsonl.gz"
+        )
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "alimeeting_cuts_test.jsonl.gz"
+        )

View File

@@ -20,9 +20,8 @@ import logging
 from functools import lru_cache
 from pathlib import Path
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
     DynamicBucketingSampler,
@@ -190,8 +189,8 @@ class GigaSpeechAsrDataModule:
     def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
         transforms = []
@@ -315,7 +314,7 @@ class GigaSpeechAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = BucketingSampler(
+        valid_sampler = DynamicBucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -339,8 +338,10 @@ class GigaSpeechAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration,
+            shuffle=False,
         )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
@@ -361,7 +362,9 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        cuts_valid = load_manifest_lazy(
+            self.args.manifest_dir / "cuts_DEV.jsonl.gz"
+        )
         if self.args.small_dev:
             return cuts_valid.subset(first=1000)
         else:
@@ -370,4 +373,4 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")

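The GigaSpeech datamodule keeps its --small-dev shortcut; on a lazily loaded manifest, subset(first=N) should simply keep the first N cuts while iterating. Illustrative only:

from lhotse import load_manifest_lazy

cuts_valid = load_manifest_lazy("data/fbank/cuts_DEV.jsonl.gz")
small_dev = cuts_valid.subset(first=1000)  # first 1000 cuts only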
View File

@@ -1,103 +0,0 @@
#!/usr/bin/env python3
# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path

import torch
from lhotse import (
    CutSet,
    KaldifeatFbank,
    KaldifeatFbankConfig,
    combine,
)
from lhotse.recipes.utils import read_manifests_if_cached

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def compute_fbank_musan():
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")

    # number of workers in dataloader
    num_workers = 10

    # number of seconds in a batch
    batch_duration = 600

    dataset_parts = (
        "music",
        "speech",
        "noise",
    )
    manifests = read_manifests_if_cached(
        prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    musan_cuts_path = output_dir / "cuts_musan.json.gz"

    if musan_cuts_path.is_file():
        logging.info(f"{musan_cuts_path} already exists - skipping")
        return

    logging.info("Extracting features for Musan")

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))

    logging.info(f"device: {device}")

    musan_cuts = (
        CutSet.from_manifests(
            recordings=combine(
                part["recordings"] for part in manifests.values()
            )
        )
        .cut_into_windows(10.0)
        .filter(lambda c: c.duration > 5)
        .compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"{output_dir}/feats_musan",
            num_workers=num_workers,
            batch_duration=batch_duration,
        )
    )
    musan_cuts.to_json(musan_cuts_path)


def main():
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    compute_fbank_musan()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1 @@
+../../../librispeech/ASR/local/compute_fbank_musan.py

View File

@@ -23,9 +23,8 @@ from pathlib import Path
 from typing import Any, Dict, Optional
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
     DynamicBucketingSampler,
@@ -217,8 +216,8 @@ class GigaSpeechAsrDataModule:
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
             logging.info("About to get Musan cuts")
-            cuts_musan = load_manifest(
-                self.args.manifest_dir / "cuts_musan.json.gz"
+            cuts_musan = load_manifest_lazy(
+                self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
                 CutMix(
@@ -358,7 +357,7 @@ class GigaSpeechAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = BucketingSampler(
+        valid_sampler = DynamicBucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -382,8 +381,10 @@ class GigaSpeechAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration,
+            shuffle=False,
        )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
@@ -404,7 +405,9 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        cuts_valid = load_manifest_lazy(
+            self.args.manifest_dir / "cuts_DEV.jsonl.gz"
+        )
         if self.args.small_dev:
             return cuts_valid.subset(first=1000)
         else:
@@ -413,4 +416,4 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")

View File

@@ -96,14 +96,14 @@ def get_parser():
             - labels_xxx.h5
             - aux_labels_xxx.h5
-            - cuts_xxx.json.gz
+            - librispeech_cuts_xxx.jsonl.gz
         where xxx is the value of `--dataset`. For instance, if
         `--dataset` is `train-clean-100`, it will contain 3 files:
             - `labels_train-clean-100.h5`
             - `aux_labels_train-clean-100.h5`
-            - `cuts_train-clean-100.json.gz`
+            - `librispeech_cuts_train-clean-100.jsonl.gz`
         Note: Both labels_xxx.h5 and aux_labels_xxx.h5 contain framewise
         alignment. The difference is that labels_xxx.h5 contains repeats.
@@ -289,7 +289,9 @@ def main():
     out_labels_ali_filename = out_dir / f"labels_{params.dataset}.h5"
     out_aux_labels_ali_filename = out_dir / f"aux_labels_{params.dataset}.h5"
-    out_manifest_filename = out_dir / f"cuts_{params.dataset}.json.gz"
+    out_manifest_filename = (
+        out_dir / f"librispeech_cuts_{params.dataset}.jsonl.gz"
+    )
     for f in (
         out_labels_ali_filename,

View File

@ -17,6 +17,17 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./conformer_ctc/train.py \
--exp-dir ./conformer_ctc/exp \
--world-size 4 \
--full-libri 1 \
--max-duration 200 \
--num-epochs 20
"""
import argparse import argparse
import logging import logging
from pathlib import Path from pathlib import Path
@ -29,6 +40,7 @@ import torch.multiprocessing as mp
import torch.nn as nn import torch.nn as nn
from asr_datamodule import LibriSpeechAsrDataModule from asr_datamodule import LibriSpeechAsrDataModule
from conformer import Conformer from conformer import Conformer
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed from lhotse.utils import fix_random_seed
from torch import Tensor from torch import Tensor
from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.parallel import DistributedDataParallel as DDP
@ -676,6 +688,20 @@ def run(rank, world_size, args):
if params.full_libri: if params.full_libri:
train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_clean_360_cuts()
train_cuts += librispeech.train_other_500_cuts() train_cuts += librispeech.train_other_500_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
train_cuts = train_cuts.filter(remove_short_and_long_utt)
train_dl = librispeech.train_dataloaders(train_cuts) train_dl = librispeech.train_dataloaders(train_cuts)
valid_cuts = librispeech.dev_clean_cuts() valid_cuts = librispeech.dev_clean_cuts()

View File

@ -20,11 +20,7 @@ import logging
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import ( from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
CutSet,
KaldifeatFbank,
KaldifeatFbankConfig,
)
# Torch's multithreaded behavior needs to be disabled or # Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down. # it wastes a lot of CPU and slow things down.
@ -51,13 +47,16 @@ def compute_fbank_gigaspeech_dev_test():
logging.info(f"device: {device}") logging.info(f"device: {device}")
prefix = "gigaspeech"
suffix = "jsonl.gz"
for partition in subsets: for partition in subsets:
cuts_path = in_out_dir / f"cuts_{partition}.jsonl.gz" cuts_path = in_out_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_path.is_file(): if cuts_path.is_file():
logging.info(f"{cuts_path} exists - skipping") logging.info(f"{cuts_path} exists - skipping")
continue continue
raw_cuts_path = in_out_dir / f"cuts_{partition}_raw.jsonl.gz" raw_cuts_path = in_out_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
logging.info(f"Loading {raw_cuts_path}") logging.info(f"Loading {raw_cuts_path}")
cut_set = CutSet.from_file(raw_cuts_path) cut_set = CutSet.from_file(raw_cuts_path)
@ -66,7 +65,7 @@ def compute_fbank_gigaspeech_dev_test():
cut_set = cut_set.compute_and_store_features_batch( cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor, extractor=extractor,
storage_path=f"{in_out_dir}/feats_{partition}", storage_path=f"{in_out_dir}/{prefix}_feats_{partition}",
num_workers=num_workers, num_workers=num_workers,
batch_duration=batch_duration, batch_duration=batch_duration,
) )

View File

@ -77,7 +77,7 @@ def get_parser():
def compute_fbank_gigaspeech_splits(args): def compute_fbank_gigaspeech_splits(args):
num_splits = args.num_splits num_splits = args.num_splits
output_dir = f"data/fbank/XL_split_{num_splits}" output_dir = f"data/fbank/gigaspeech_XL_split_{num_splits}"
output_dir = Path(output_dir) output_dir = Path(output_dir)
assert output_dir.exists(), f"{output_dir} does not exist!" assert output_dir.exists(), f"{output_dir} does not exist!"
@ -96,17 +96,19 @@ def compute_fbank_gigaspeech_splits(args):
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
logging.info(f"device: {device}") logging.info(f"device: {device}")
prefix = "gigaspeech"
num_digits = 8 # num_digits is fixed by lhotse split-lazy num_digits = 8 # num_digits is fixed by lhotse split-lazy
for i in range(start, stop): for i in range(start, stop):
idx = f"{i + 1}".zfill(num_digits) idx = f"{i + 1}".zfill(num_digits)
logging.info(f"Processing {idx}/{num_splits}") logging.info(f"Processing {idx}/{num_splits}")
cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz" cuts_path = output_dir / f"{prefix}_cuts_XL.{idx}.jsonl.gz"
if cuts_path.is_file(): if cuts_path.is_file():
logging.info(f"{cuts_path} exists - skipping") logging.info(f"{cuts_path} exists - skipping")
continue continue
raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz" raw_cuts_path = output_dir / f"{prefix}_cuts_XL_raw.{idx}.jsonl.gz"
if not raw_cuts_path.is_file(): if not raw_cuts_path.is_file():
logging.info(f"{raw_cuts_path} does not exist - skipping it") logging.info(f"{raw_cuts_path} does not exist - skipping it")
continue continue
@ -115,13 +117,13 @@ def compute_fbank_gigaspeech_splits(args):
cut_set = CutSet.from_file(raw_cuts_path) cut_set = CutSet.from_file(raw_cuts_path)
logging.info("Computing features") logging.info("Computing features")
if (output_dir / f"feats_XL_{idx}.lca").exists(): if (output_dir / f"{prefix}_feats_XL_{idx}.lca").exists():
logging.info(f"Removing {output_dir}/feats_XL_{idx}.lca") logging.info(f"Removing {output_dir}/{prefix}_feats_XL_{idx}.lca")
os.remove(output_dir / f"feats_XL_{idx}.lca") os.remove(output_dir / f"{prefix}_feats_XL_{idx}.lca")
cut_set = cut_set.compute_and_store_features_batch( cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_XL_{idx}", storage_path=f"{output_dir}/{prefix}_feats_XL_{idx}",
num_workers=args.num_workers, num_workers=args.num_workers,
batch_duration=args.batch_duration, batch_duration=args.batch_duration,
) )
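For the XL splits the recipe relies on the batched, GPU-friendly kaldifeat extractor rather than the multiprocess CPU path. A rough sketch of one split being processed, with the file names, batch_duration, and the final manifest write treated as assumptions rather than exact recipe code:

```python
import torch

from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig

# Use a GPU if available; kaldifeat batches frames on that device.
device = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))

# One of the 8-digit pieces produced by `lhotse split-lazy` (placeholder name).
split_dir = "data/fbank/gigaspeech_XL_split_2000"
cut_set = CutSet.from_file(f"{split_dir}/gigaspeech_cuts_XL_raw.00000001.jsonl.gz")

cut_set = cut_set.compute_and_store_features_batch(
    extractor=extractor,
    storage_path=f"{split_dir}/gigaspeech_feats_XL_00000001",
    num_workers=4,       # dataloading workers feeding the extractor
    batch_duration=600,  # seconds of audio per forward pass; placeholder
)
# Persist the cuts with feature references for this split (assumed step).
cut_set.to_file(f"{split_dir}/gigaspeech_cuts_XL.00000001.jsonl.gz")
```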

View File

@ -28,7 +28,7 @@ import os
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor from icefall.utils import get_executor
@ -56,8 +56,13 @@ def compute_fbank_librispeech():
"train-clean-360", "train-clean-360",
"train-other-500", "train-other-500",
) )
prefix = "librispeech"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
prefix="librispeech", dataset_parts=dataset_parts, output_dir=src_dir dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
) )
assert manifests is not None assert manifests is not None
@ -65,7 +70,8 @@ def compute_fbank_librispeech():
with get_executor() as ex: # Initialize the executor only once. with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items(): for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file(): cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
if (output_dir / cuts_filename).is_file():
logging.info(f"{partition} already exists - skipping.") logging.info(f"{partition} already exists - skipping.")
continue continue
logging.info(f"Processing {partition}") logging.info(f"Processing {partition}")
@ -81,13 +87,13 @@ def compute_fbank_librispeech():
) )
cut_set = cut_set.compute_and_store_features( cut_set = cut_set.compute_and_store_features(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}", storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions # when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80, num_jobs=num_jobs if ex is None else 80,
executor=ex, executor=ex,
storage_type=ChunkedLilcomHdf5Writer, storage_type=LilcomChunkyWriter,
) )
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") cut_set.to_file(output_dir / cuts_filename)
if __name__ == "__main__": if __name__ == "__main__":
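The LibriSpeech feature script now stores features through LilcomChunkyWriter and writes the manifest with to_file, which picks the jsonl.gz format from the file extension. A condensed sketch of one partition under those assumptions (paths and num_jobs are illustrative):

```python
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, load_manifest_lazy


def store_partition(cut_set: CutSet, output_dir: str, partition: str) -> None:
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    cut_set = cut_set.compute_and_store_features(
        extractor=extractor,
        storage_path=f"{output_dir}/librispeech_feats_{partition}",
        num_jobs=15,  # placeholder; the recipe chooses this based on the executor
        storage_type=LilcomChunkyWriter,  # replaces ChunkedLilcomHdf5Writer
    )
    # to_file() writes librispeech_cuts_<partition>.jsonl.gz, matching the
    # names the dataloaders now expect.
    cut_set.to_file(f"{output_dir}/librispeech_cuts_{partition}.jsonl.gz")


# The result can later be reopened without materializing it in memory:
# cuts = load_manifest_lazy("data/fbank/librispeech_cuts_train-clean-100.jsonl.gz")
```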

View File

@ -28,7 +28,7 @@ import os
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor from icefall.utils import get_executor
@ -52,12 +52,22 @@ def compute_fbank_musan():
"speech", "speech",
"noise", "noise",
) )
prefix = "musan"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
) )
assert manifests is not None assert manifests is not None
musan_cuts_path = output_dir / "cuts_musan.json.gz" assert len(manifests) == len(dataset_parts), (
len(manifests),
len(dataset_parts),
)
musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
if musan_cuts_path.is_file(): if musan_cuts_path.is_file():
logging.info(f"{musan_cuts_path} already exists - skipping") logging.info(f"{musan_cuts_path} already exists - skipping")
@ -79,13 +89,13 @@ def compute_fbank_musan():
.filter(lambda c: c.duration > 5) .filter(lambda c: c.duration > 5)
.compute_and_store_features( .compute_and_store_features(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_musan", storage_path=f"{output_dir}/musan_feats",
num_jobs=num_jobs if ex is None else 80, num_jobs=num_jobs if ex is None else 80,
executor=ex, executor=ex,
storage_type=ChunkedLilcomHdf5Writer, storage_type=LilcomChunkyWriter,
) )
) )
musan_cuts.to_json(musan_cuts_path) musan_cuts.to_file(musan_cuts_path)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -25,19 +25,19 @@ for usage.
""" """
from lhotse import load_manifest from lhotse import load_manifest_lazy
def main(): def main():
path = "./data/fbank/cuts_train-clean-100.json.gz" # path = "./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
path = "./data/fbank/cuts_train-clean-360.json.gz" # path = "./data/fbank/librispeech_cuts_train-clean-360.jsonl.gz"
path = "./data/fbank/cuts_train-other-500.json.gz" # path = "./data/fbank/librispeech_cuts_train-other-500.jsonl.gz"
path = "./data/fbank/cuts_dev-clean.json.gz" # path = "./data/fbank/librispeech_cuts_dev-clean.jsonl.gz"
path = "./data/fbank/cuts_dev-other.json.gz" # path = "./data/fbank/librispeech_cuts_dev-other.jsonl.gz"
path = "./data/fbank/cuts_test-clean.json.gz" # path = "./data/fbank/librispeech_cuts_test-clean.jsonl.gz"
path = "./data/fbank/cuts_test-other.json.gz" path = "./data/fbank/librispeech_cuts_test-other.jsonl.gz"
cuts = load_manifest(path) cuts = load_manifest_lazy(path)
cuts.describe() cuts.describe()

View File

@ -58,17 +58,19 @@ def preprocess_giga_speech():
) )
logging.info("Loading manifest (may take 4 minutes)") logging.info("Loading manifest (may take 4 minutes)")
prefix = "gigaspeech"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
dataset_parts=dataset_parts, dataset_parts=dataset_parts,
output_dir=src_dir, output_dir=src_dir,
prefix="gigaspeech", prefix=prefix,
suffix="jsonl.gz", suffix=suffix,
) )
assert manifests is not None assert manifests is not None
for partition, m in manifests.items(): for partition, m in manifests.items():
logging.info(f"Processing {partition}") logging.info(f"Processing {partition}")
raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz" raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
if raw_cuts_path.is_file(): if raw_cuts_path.is_file():
logging.info(f"{partition} already exists - skipping") logging.info(f"{partition} already exists - skipping")
continue continue

View File

@ -25,7 +25,7 @@ We will add more checks later if needed.
Usage example: Usage example:
python3 ./local/validate_manifest.py \ python3 ./local/validate_manifest.py \
./data/fbank/cuts_train-clean-100.json.gz ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
""" """
@ -33,7 +33,7 @@ import argparse
import logging import logging
from pathlib import Path from pathlib import Path
from lhotse import load_manifest, CutSet from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut from lhotse.cut import Cut
@ -76,7 +76,7 @@ def main():
logging.info(f"Validating {manifest}") logging.info(f"Validating {manifest}")
assert manifest.is_file(), f"{manifest} does not exist" assert manifest.is_file(), f"{manifest} does not exist"
cut_set = load_manifest(manifest) cut_set = load_manifest_lazy(manifest)
assert isinstance(cut_set, CutSet) assert isinstance(cut_set, CutSet)
for c in cut_set: for c in cut_set:

View File

@ -40,9 +40,9 @@ dl_dir=$PWD/download
# It will generate data/lang_bpe_xxx, # It will generate data/lang_bpe_xxx,
# data/lang_bpe_yyy if the array contains xxx, yyy # data/lang_bpe_yyy if the array contains xxx, yyy
vocab_sizes=( vocab_sizes=(
5000 # 5000
2000 # 2000
1000 # 1000
500 500
) )
@ -132,7 +132,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
) )
for part in ${parts[@]}; do for part in ${parts[@]}; do
python3 ./local/validate_manifest.py \ python3 ./local/validate_manifest.py \
data/fbank/cuts_${part}.json.gz data/fbank/librispeech_cuts_${part}.jsonl.gz
done done
touch data/fbank/.librispeech-validated.done touch data/fbank/.librispeech-validated.done
fi fi

View File

@ -124,9 +124,9 @@ fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Split XL subset into ${num_splits} pieces" log "Stage 4: Split XL subset into ${num_splits} pieces"
split_dir=data/fbank/XL_split_${num_splits} split_dir=data/fbank/gigaspeech_XL_split_${num_splits}
if [ ! -f $split_dir/.split_completed ]; then if [ ! -f $split_dir/.split_completed ]; then
lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir $chunk_size lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size
touch $split_dir/.split_completed touch $split_dir/.split_completed
fi fi
fi fi

View File

@ -807,28 +807,8 @@ def run(rank, world_size, args):
# the threshold # the threshold
return 1.0 <= c.duration <= 20.0 return 1.0 <= c.duration <= 20.0
num_in_total = len(train_cuts)
train_cuts = train_cuts.filter(remove_short_and_long_utt) train_cuts = train_cuts.filter(remove_short_and_long_utt)
try:
num_left = len(train_cuts)
num_removed = num_in_total - num_left
removed_percent = num_removed / num_in_total * 100
logging.info(
f"Before removing short and long utterances: {num_in_total}"
)
logging.info(f"After removing short and long utterances: {num_left}")
logging.info(
f"Removed {num_removed} utterances ({removed_percent:.5f}%)"
)
except TypeError as e:
# You can ignore this error as previous versions of Lhotse work fine
# for the above code. In recent versions of Lhotse, it uses
# lazy filter, producing cutsets that don't have the __len__ method
logging.info(str(e))
if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
# We only load the sampler's state dict when it loads a checkpoint # We only load the sampler's state dict when it loads a checkpoint
# saved in the middle of an epoch # saved in the middle of an epoch
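The deleted block counted cuts before and after filtering, which assumes an eagerly loaded manifest. With load_manifest_lazy, filter() produces a lazily evaluated CutSet without a usable __len__, which is exactly the TypeError the old code caught. A small sketch of the behaviour, with the manifest path as a placeholder:

```python
from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/librispeech_cuts_train-clean-100.jsonl.gz")
cuts = cuts.filter(lambda c: 1.0 <= c.duration <= 20.0)

try:
    n = len(cuts)  # lazily filtered cut sets do not support len()
except TypeError:
    # Counting now requires a full pass over the manifest, so the
    # before/after statistics logging was dropped rather than forcing it.
    n = sum(1 for _ in cuts)
```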

View File

@ -22,7 +22,6 @@ from typing import Optional
from lhotse import CutSet, Fbank, FbankConfig from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import ( from lhotse.dataset import (
BucketingSampler,
CutMix, CutMix,
DynamicBucketingSampler, DynamicBucketingSampler,
K2SpeechRecognitionDataset, K2SpeechRecognitionDataset,
@ -71,8 +70,7 @@ class AsrDataModule:
"--num-buckets", "--num-buckets",
type=int, type=int,
default=30, default=30,
help="The number of buckets for the BucketingSampler " help="The number of buckets for the DynamicBucketingSampler. "
"and DynamicBucketingSampler."
"(you might want to increase it for larger datasets).", "(you might want to increase it for larger datasets).",
) )
@ -152,7 +150,6 @@ class AsrDataModule:
def train_dataloaders( def train_dataloaders(
self, self,
cuts_train: CutSet, cuts_train: CutSet,
dynamic_bucketing: bool,
on_the_fly_feats: bool, on_the_fly_feats: bool,
cuts_musan: Optional[CutSet] = None, cuts_musan: Optional[CutSet] = None,
) -> DataLoader: ) -> DataLoader:
@ -162,9 +159,6 @@ class AsrDataModule:
Cuts for training. Cuts for training.
cuts_musan: cuts_musan:
If not None, it is the cuts for mixing. If not None, it is the cuts for mixing.
dynamic_bucketing:
True to use DynamicBucketingSampler;
False to use BucketingSampler.
on_the_fly_feats: on_the_fly_feats:
True to use OnTheFlyFeatures; True to use OnTheFlyFeatures;
False to use PrecomputedFeatures. False to use PrecomputedFeatures.
@ -230,25 +224,14 @@ class AsrDataModule:
return_cuts=self.args.return_cuts, return_cuts=self.args.return_cuts,
) )
if dynamic_bucketing: logging.info("Using DynamicBucketingSampler.")
logging.info("Using DynamicBucketingSampler.") train_sampler = DynamicBucketingSampler(
train_sampler = DynamicBucketingSampler( cuts_train,
cuts_train, max_duration=self.args.max_duration,
max_duration=self.args.max_duration, shuffle=self.args.shuffle,
shuffle=self.args.shuffle, num_buckets=self.args.num_buckets,
num_buckets=self.args.num_buckets, drop_last=True,
drop_last=True, )
)
else:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
logging.info("About to create train dataloader") logging.info("About to create train dataloader")
train_dl = DataLoader( train_dl = DataLoader(

View File

@ -22,7 +22,7 @@ import re
from pathlib import Path from pathlib import Path
import lhotse import lhotse
from lhotse import CutSet, load_manifest from lhotse import CutSet, load_manifest_lazy
class GigaSpeech: class GigaSpeech:
@ -32,13 +32,13 @@ class GigaSpeech:
manifest_dir: manifest_dir:
It is expected to contain the following files:: It is expected to contain the following files::
- XL_split_2000/cuts_XL.*.jsonl.gz - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz
- cuts_L_raw.jsonl.gz - gigaspeech_cuts_L_raw.jsonl.gz
- cuts_M_raw.jsonl.gz - gigaspeech_cuts_M_raw.jsonl.gz
- cuts_S_raw.jsonl.gz - gigaspeech_cuts_S_raw.jsonl.gz
- cuts_XS_raw.jsonl.gz - gigaspeech_cuts_XS_raw.jsonl.gz
- cuts_DEV_raw.jsonl.gz - gigaspeech_cuts_DEV_raw.jsonl.gz
- cuts_TEST_raw.jsonl.gz - gigaspeech_cuts_TEST_raw.jsonl.gz
""" """
self.manifest_dir = Path(manifest_dir) self.manifest_dir = Path(manifest_dir)
@ -46,10 +46,12 @@ class GigaSpeech:
logging.info("About to get train-XL cuts") logging.info("About to get train-XL cuts")
filenames = list( filenames = list(
glob.glob(f"{self.manifest_dir}/XL_split_2000/cuts_XL.*.jsonl.gz") glob.glob(
f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz" # noqa
)
) )
pattern = re.compile(r"cuts_XL.([0-9]+).jsonl.gz") pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz")
idx_filenames = [ idx_filenames = [
(int(pattern.search(f).group(1)), f) for f in filenames (int(pattern.search(f).group(1)), f) for f in filenames
] ]
@ -64,31 +66,31 @@ class GigaSpeech:
) )
def train_L_cuts(self) -> CutSet: def train_L_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_L_raw.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_L_raw.jsonl.gz"
logging.info(f"About to get train-L cuts from {f}") logging.info(f"About to get train-L cuts from {f}")
return CutSet.from_jsonl_lazy(f) return CutSet.from_jsonl_lazy(f)
def train_M_cuts(self) -> CutSet: def train_M_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_M_raw.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_M_raw.jsonl.gz"
logging.info(f"About to get train-M cuts from {f}") logging.info(f"About to get train-M cuts from {f}")
return CutSet.from_jsonl_lazy(f) return CutSet.from_jsonl_lazy(f)
def train_S_cuts(self) -> CutSet: def train_S_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_S_raw.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_S_raw.jsonl.gz"
logging.info(f"About to get train-S cuts from {f}") logging.info(f"About to get train-S cuts from {f}")
return CutSet.from_jsonl_lazy(f) return CutSet.from_jsonl_lazy(f)
def train_XS_cuts(self) -> CutSet: def train_XS_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XS_raw.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_XS_raw.jsonl.gz"
logging.info(f"About to get train-XS cuts from {f}") logging.info(f"About to get train-XS cuts from {f}")
return CutSet.from_jsonl_lazy(f) return CutSet.from_jsonl_lazy(f)
def test_cuts(self) -> CutSet: def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_TEST.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz"
logging.info(f"About to get TEST cuts from {f}") logging.info(f"About to get TEST cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def dev_cuts(self) -> CutSet: def dev_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_DEV.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz"
logging.info(f"About to get DEV cuts from {f}") logging.info(f"About to get DEV cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
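train_XL_cuts() globs the renamed split pieces and extracts their 8-digit indices with the regex above; the part of the method outside this hunk presumably sorts the pieces and chains them into one lazy CutSet. A hypothetical helper sketching that idea with lhotse.combine:

```python
import glob
import re

import lhotse


def load_xl_splits(manifest_dir: str) -> lhotse.CutSet:
    filenames = glob.glob(
        f"{manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz"
    )
    pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz")
    # Order the pieces by their zero-padded index so training sees them
    # in a deterministic sequence.
    indexed = sorted((int(pattern.search(f).group(1)), f) for f in filenames)
    # combine() concatenates the lazily opened manifests without
    # materializing them.
    return lhotse.combine(lhotse.load_manifest_lazy(f) for _, f in indexed)
```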

View File

@ -18,7 +18,7 @@
import logging import logging
from pathlib import Path from pathlib import Path
from lhotse import CutSet, load_manifest from lhotse import CutSet, load_manifest_lazy
class LibriSpeech: class LibriSpeech:
@ -28,47 +28,47 @@ class LibriSpeech:
manifest_dir: manifest_dir:
It is expected to contain the following files:: It is expected to contain the following files::
- cuts_dev-clean.json.gz - librispeech_cuts_dev-clean.jsonl.gz
- cuts_dev-other.json.gz - librispeech_cuts_dev-other.jsonl.gz
- cuts_test-clean.json.gz - librispeech_cuts_test-clean.jsonl.gz
- cuts_test-other.json.gz - librispeech_cuts_test-other.jsonl.gz
- cuts_train-clean-100.json.gz - librispeech_cuts_train-clean-100.jsonl.gz
- cuts_train-clean-360.json.gz - librispeech_cuts_train-clean-360.jsonl.gz
- cuts_train-other-500.json.gz - librispeech_cuts_train-other-500.jsonl.gz
""" """
self.manifest_dir = Path(manifest_dir) self.manifest_dir = Path(manifest_dir)
def train_clean_100_cuts(self) -> CutSet: def train_clean_100_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-100.json.gz" f = self.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
logging.info(f"About to get train-clean-100 cuts from {f}") logging.info(f"About to get train-clean-100 cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def train_clean_360_cuts(self) -> CutSet: def train_clean_360_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-360.json.gz" f = self.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
logging.info(f"About to get train-clean-360 cuts from {f}") logging.info(f"About to get train-clean-360 cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def train_other_500_cuts(self) -> CutSet: def train_other_500_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-other-500.json.gz" f = self.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
logging.info(f"About to get train-other-500 cuts from {f}") logging.info(f"About to get train-other-500 cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def test_clean_cuts(self) -> CutSet: def test_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-clean.json.gz" f = self.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
logging.info(f"About to get test-clean cuts from {f}") logging.info(f"About to get test-clean cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def test_other_cuts(self) -> CutSet: def test_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-other.json.gz" f = self.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
logging.info(f"About to get test-other cuts from {f}") logging.info(f"About to get test-other cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def dev_clean_cuts(self) -> CutSet: def dev_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-clean.json.gz" f = self.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
logging.info(f"About to get dev-clean cuts from {f}") logging.info(f"About to get dev-clean cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def dev_other_cuts(self) -> CutSet: def dev_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-other.json.gz" f = self.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
logging.info(f"About to get dev-other cuts from {f}") logging.info(f"About to get dev-other cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)

View File

@ -66,7 +66,7 @@ from conformer import Conformer
from decoder import Decoder from decoder import Decoder
from gigaspeech import GigaSpeech from gigaspeech import GigaSpeech
from joiner import Joiner from joiner import Joiner
from lhotse import CutSet, load_manifest from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut from lhotse.cut import Cut
from lhotse.dataset.sampling.base import CutSampler from lhotse.dataset.sampling.base import CutSampler
from lhotse.utils import fix_random_seed from lhotse.utils import fix_random_seed
@ -968,8 +968,8 @@ def run(rank, world_size, args):
train_giga_cuts = train_giga_cuts.repeat(times=None) train_giga_cuts = train_giga_cuts.repeat(times=None)
if args.enable_musan: if args.enable_musan:
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "cuts_musan.json.gz" Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
) )
else: else:
cuts_musan = None cuts_musan = None
@ -978,14 +978,12 @@ def run(rank, world_size, args):
train_dl = asr_datamodule.train_dataloaders( train_dl = asr_datamodule.train_dataloaders(
train_cuts, train_cuts,
dynamic_bucketing=False,
on_the_fly_feats=False, on_the_fly_feats=False,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )
giga_train_dl = asr_datamodule.train_dataloaders( giga_train_dl = asr_datamodule.train_dataloaders(
train_giga_cuts, train_giga_cuts,
dynamic_bucketing=True,
on_the_fly_feats=False, on_the_fly_feats=False,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )

View File

@ -24,7 +24,7 @@ from pathlib import Path
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
import torch import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
CutConcatenate, CutConcatenate,
CutMix, CutMix,
@ -224,8 +224,8 @@ class LibriSpeechAsrDataModule:
if self.args.enable_musan: if self.args.enable_musan:
logging.info("Enable MUSAN") logging.info("Enable MUSAN")
logging.info("About to get Musan cuts") logging.info("About to get Musan cuts")
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "cuts_musan.json.gz" self.args.manifest_dir / "musan_cuts.jsonl.gz"
) )
transforms.append( transforms.append(
CutMix( CutMix(
@ -407,40 +407,48 @@ class LibriSpeechAsrDataModule:
@lru_cache() @lru_cache()
def train_clean_100_cuts(self) -> CutSet: def train_clean_100_cuts(self) -> CutSet:
logging.info("About to get train-clean-100 cuts") logging.info("About to get train-clean-100 cuts")
return load_manifest( return load_manifest_lazy(
self.args.manifest_dir / "cuts_train-clean-100.json.gz" self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
) )
@lru_cache() @lru_cache()
def train_clean_360_cuts(self) -> CutSet: def train_clean_360_cuts(self) -> CutSet:
logging.info("About to get train-clean-360 cuts") logging.info("About to get train-clean-360 cuts")
return load_manifest( return load_manifest_lazy(
self.args.manifest_dir / "cuts_train-clean-360.json.gz" self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
) )
@lru_cache() @lru_cache()
def train_other_500_cuts(self) -> CutSet: def train_other_500_cuts(self) -> CutSet:
logging.info("About to get train-other-500 cuts") logging.info("About to get train-other-500 cuts")
return load_manifest( return load_manifest_lazy(
self.args.manifest_dir / "cuts_train-other-500.json.gz" self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
) )
@lru_cache() @lru_cache()
def dev_clean_cuts(self) -> CutSet: def dev_clean_cuts(self) -> CutSet:
logging.info("About to get dev-clean cuts") logging.info("About to get dev-clean cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
)
@lru_cache() @lru_cache()
def dev_other_cuts(self) -> CutSet: def dev_other_cuts(self) -> CutSet:
logging.info("About to get dev-other cuts") logging.info("About to get dev-other cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
)
@lru_cache() @lru_cache()
def test_clean_cuts(self) -> CutSet: def test_clean_cuts(self) -> CutSet:
logging.info("About to get test-clean cuts") logging.info("About to get test-clean cuts")
return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
)
@lru_cache() @lru_cache()
def test_other_cuts(self) -> CutSet: def test_other_cuts(self) -> CutSet:
logging.info("About to get test-other cuts") logging.info("About to get test-other cuts")
return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
)
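Only the MUSAN manifest name and the loader change here; the CutMix transform itself keeps the same arguments as elsewhere in the recipe. A minimal sketch, assuming the default data/fbank manifest directory:

```python
from lhotse import load_manifest_lazy
from lhotse.dataset import CutMix

# musan_cuts.jsonl.gz replaces the old cuts_musan.json.gz manifest.
cuts_musan = load_manifest_lazy("data/fbank/musan_cuts.jsonl.gz")

# Mix MUSAN noise into roughly half of the training cuts at 10-20 dB SNR,
# keeping the original cut IDs (same settings as the recipe uses).
transform = CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
```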

View File

@ -16,6 +16,15 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./tdnn_lstm_ctc/train.py \
--world-size 4 \
--full-libri 1 \
--max-duration 300 \
--num-epochs 20
"""
import argparse import argparse
import logging import logging
@ -29,6 +38,7 @@ import torch.multiprocessing as mp
import torch.nn as nn import torch.nn as nn
import torch.optim as optim import torch.optim as optim
from asr_datamodule import LibriSpeechAsrDataModule from asr_datamodule import LibriSpeechAsrDataModule
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed from lhotse.utils import fix_random_seed
from model import TdnnLstm from model import TdnnLstm
from torch import Tensor from torch import Tensor
@ -544,10 +554,25 @@ def run(rank, world_size, args):
if params.full_libri: if params.full_libri:
train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_clean_360_cuts()
train_cuts += librispeech.train_other_500_cuts() train_cuts += librispeech.train_other_500_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
train_cuts = train_cuts.filter(remove_short_and_long_utt)
train_dl = librispeech.train_dataloaders(train_cuts) train_dl = librispeech.train_dataloaders(train_cuts)
valid_cuts = librispeech.dev_clean_cuts() valid_cuts = librispeech.dev_clean_cuts()
valid_cuts += librispeech.dev_other_cuts() valid_cuts += librispeech.dev_other_cuts()
valid_dl = librispeech.valid_dataloaders(valid_cuts) valid_dl = librispeech.valid_dataloaders(valid_cuts)
for epoch in range(params.start_epoch, params.num_epochs): for epoch in range(params.start_epoch, params.num_epochs):

View File

@ -44,8 +44,8 @@ from pathlib import Path
import sentencepiece as spm import sentencepiece as spm
import torch import torch
from alignment import get_word_starting_frames from alignment import get_word_starting_frames
from lhotse import CutSet, load_manifest from lhotse import CutSet, load_manifest_lazy
from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler from lhotse.dataset import DynamicBucketingSampler, K2SpeechRecognitionDataset
from lhotse.dataset.collation import collate_custom_field from lhotse.dataset.collation import collate_custom_field
@ -93,14 +93,15 @@ def main():
sp = spm.SentencePieceProcessor() sp = spm.SentencePieceProcessor()
sp.load(args.bpe_model) sp.load(args.bpe_model)
cuts_json = args.ali_dir / f"cuts_{args.dataset}.json.gz" cuts_jsonl = args.ali_dir / f"librispeech_cuts_{args.dataset}.jsonl.gz"
logging.info(f"Loading {cuts_json}") logging.info(f"Loading {cuts_jsonl}")
cuts = load_manifest(cuts_json) cuts = load_manifest_lazy(cuts_jsonl)
sampler = SingleCutSampler( sampler = DynamicBucketingSampler(
cuts, cuts,
max_duration=30, max_duration=30,
num_buckets=30,
shuffle=False, shuffle=False,
) )

View File

@ -1,333 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang
# Mingshuang Luo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import inspect
import logging
from pathlib import Path
from typing import Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import (
BucketingSampler,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
SpecAugment,
)
from lhotse.dataset.input_strategies import (
OnTheFlyFeatures,
PrecomputedFeatures,
)
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader
from icefall.utils import str2bool
class _SeedWorkers:
def __init__(self, seed: int):
self.seed = seed
def __call__(self, worker_id: int):
fix_random_seed(self.seed + worker_id)
class AsrDataModule:
def __init__(self, args: argparse.Namespace):
self.args = args
@classmethod
def add_arguments(cls, parser: argparse.ArgumentParser):
group = parser.add_argument_group(
title="ASR data related options",
description="These options are used for the preparation of "
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
"effective batch sizes, sampling strategies, applied data "
"augmentations, etc.",
)
group.add_argument(
"--max-duration",
type=int,
default=200.0,
help="Maximum pooled recordings duration (seconds) in a "
"single batch. You can reduce it if it causes CUDA OOM.",
)
group.add_argument(
"--bucketing-sampler",
type=str2bool,
default=True,
help="When enabled, the batches will come from buckets of "
"similar duration (saves padding frames).",
)
group.add_argument(
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler "
"and DynamicBucketingSampler."
"(you might want to increase it for larger datasets).",
)
group.add_argument(
"--shuffle",
type=str2bool,
default=True,
help="When enabled (=default), the examples will be "
"shuffled for each epoch.",
)
group.add_argument(
"--return-cuts",
type=str2bool,
default=True,
help="When enabled, each batch will have the "
"field: batch['supervisions']['cut'] with the cuts that "
"were used to construct it.",
)
group.add_argument(
"--num-workers",
type=int,
default=2,
help="The number of training dataloader workers that "
"collect the batches.",
)
group.add_argument(
"--enable-spec-aug",
type=str2bool,
default=True,
help="When enabled, use SpecAugment for training dataset.",
)
group.add_argument(
"--spec-aug-time-warp-factor",
type=int,
default=80,
help="Used only when --enable-spec-aug is True. "
"It specifies the factor for time warping in SpecAugment. "
"Larger values mean more warping. "
"A value less than 1 means to disable time warp.",
)
group.add_argument(
"--enable-musan",
type=str2bool,
default=True,
help="When enabled, select noise from MUSAN and mix it"
"with training dataset. ",
)
group.add_argument(
"--manifest-dir",
type=Path,
default=Path("data/fbank"),
help="Path to directory with train/valid/test cuts.",
)
group.add_argument(
"--on-the-fly-feats",
type=str2bool,
default=False,
help="When enabled, use on-the-fly cut mixing and feature "
"extraction. Will drop existing precomputed feature manifests "
"if available. Used only in dev/test CutSet",
)
def train_dataloaders(
self,
cuts_train: CutSet,
dynamic_bucketing: bool,
on_the_fly_feats: bool,
cuts_musan: Optional[CutSet] = None,
) -> DataLoader:
"""
Args:
cuts_train:
Cuts for training.
cuts_musan:
If not None, it is the cuts for mixing.
dynamic_bucketing:
True to use DynamicBucketingSampler;
False to use BucketingSampler.
on_the_fly_feats:
True to use OnTheFlyFeatures;
False to use PrecomputedFeatures.
"""
transforms = []
if cuts_musan is not None:
logging.info("Enable MUSAN")
transforms.append(
CutMix(
cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True
)
)
else:
logging.info("Disable MUSAN")
input_transforms = []
if self.args.enable_spec_aug:
logging.info("Enable SpecAugment")
logging.info(
f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
)
# Set the value of num_frame_masks according to Lhotse's version.
# In different Lhotse's versions, the default of num_frame_masks is
# different.
num_frame_masks = 10
num_frame_masks_parameter = inspect.signature(
SpecAugment.__init__
).parameters["num_frame_masks"]
if num_frame_masks_parameter.default == 1:
num_frame_masks = 2
logging.info(f"Num frame mask: {num_frame_masks}")
input_transforms.append(
SpecAugment(
time_warp_factor=self.args.spec_aug_time_warp_factor,
num_frame_masks=num_frame_masks,
features_mask_size=27,
num_feature_masks=2,
frames_mask_size=100,
)
)
else:
logging.info("Disable SpecAugment")
logging.info("About to create train dataset")
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
)
# NOTE: the PerturbSpeed transform should be added only if we
# remove it from data prep stage.
# Add on-the-fly speed perturbation; since originally it would
# have increased epoch size by 3, we will apply prob 2/3 and use
# 3x more epochs.
# Speed perturbation probably should come first before
# concatenation, but in principle the transforms order doesn't have
# to be strict (e.g. could be randomized)
# transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa
# Drop feats to be on the safe side.
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=(
OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
if on_the_fly_feats
else PrecomputedFeatures()
),
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
)
if dynamic_bucketing:
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
drop_last=True,
)
else:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
logging.info("About to create train dataloader")
# 'seed' is derived from the current random state, which will have
# previously been set in the main process.
seed = torch.randint(0, 100000, ()).item()
worker_init_fn = _SeedWorkers(seed)
train_dl = DataLoader(
train,
sampler=train_sampler,
batch_size=None,
num_workers=self.args.num_workers,
persistent_workers=False,
worker_init_fn=worker_init_fn,
)
return train_dl
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
transforms = []
logging.info("About to create dev dataset")
if self.args.on_the_fly_feats:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=OnTheFlyFeatures(
Fbank(FbankConfig(num_mel_bins=80))
),
return_cuts=self.args.return_cuts,
)
else:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.info("About to create dev dataloader")
valid_dl = DataLoader(
validate,
sampler=valid_sampler,
batch_size=None,
num_workers=2,
persistent_workers=False,
)
return valid_dl
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
logging.debug("About to create test dataset")
test = K2SpeechRecognitionDataset(
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
if self.args.on_the_fly_feats
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
test,
batch_size=None,
sampler=sampler,
num_workers=self.args.num_workers,
)
return test_dl

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/asr_datamodule.py

View File

@ -1,75 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
class GigaSpeech:
def __init__(self, manifest_dir: str):
"""
Args:
manifest_dir:
It is expected to contain the following files::
- cuts_XL_raw.jsonl.gz
- cuts_L_raw.jsonl.gz
- cuts_M_raw.jsonl.gz
- cuts_S_raw.jsonl.gz
- cuts_XS_raw.jsonl.gz
- cuts_DEV_raw.jsonl.gz
- cuts_TEST_raw.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_XL_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XL_raw.jsonl.gz"
logging.info(f"About to get train-XL cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_L_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_L_raw.jsonl.gz"
logging.info(f"About to get train-L cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_M_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_M_raw.jsonl.gz"
logging.info(f"About to get train-M cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_S_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_S_raw.jsonl.gz"
logging.info(f"About to get train-S cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_XS_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XS_raw.jsonl.gz"
logging.info(f"About to get train-XS cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_TEST.jsonl.gz"
logging.info(f"About to get TEST cuts from {f}")
return load_manifest(f)
def dev_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_DEV.jsonl.gz"
logging.info(f"About to get DEV cuts from {f}")
return load_manifest(f)

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/gigaspeech.py

View File

@ -1,74 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
class LibriSpeech:
def __init__(self, manifest_dir: str):
"""
Args:
manifest_dir:
It is expected to contain the following files::
- cuts_dev-clean.json.gz
- cuts_dev-other.json.gz
- cuts_test-clean.json.gz
- cuts_test-other.json.gz
- cuts_train-clean-100.json.gz
- cuts_train-clean-360.json.gz
- cuts_train-other-500.json.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_clean_100_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-100.json.gz"
logging.info(f"About to get train-clean-100 cuts from {f}")
return load_manifest(f)
def train_clean_360_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-360.json.gz"
logging.info(f"About to get train-clean-360 cuts from {f}")
return load_manifest(f)
def train_other_500_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-other-500.json.gz"
logging.info(f"About to get train-other-500 cuts from {f}")
return load_manifest(f)
def test_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-clean.json.gz"
logging.info(f"About to get test-clean cuts from {f}")
return load_manifest(f)
def test_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-other.json.gz"
logging.info(f"About to get test-other cuts from {f}")
return load_manifest(f)
def dev_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-clean.json.gz"
logging.info(f"About to get dev-clean cuts from {f}")
return load_manifest(f)
def dev_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-other.json.gz"
logging.info(f"About to get dev-other cuts from {f}")
return load_manifest(f)

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/librispeech.py

View File

@ -28,7 +28,7 @@ from pathlib import Path
from asr_datamodule import AsrDataModule from asr_datamodule import AsrDataModule
from gigaspeech import GigaSpeech from gigaspeech import GigaSpeech
from lhotse import load_manifest from lhotse import load_manifest_lazy
from librispeech import LibriSpeech from librispeech import LibriSpeech
@ -41,8 +41,8 @@ def test_dataset():
print(args) print(args)
if args.enable_musan: if args.enable_musan:
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "cuts_musan.json.gz" Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
) )
else: else:
cuts_musan = None cuts_musan = None
@ -57,14 +57,12 @@ def test_dataset():
libri_train_dl = asr_datamodule.train_dataloaders( libri_train_dl = asr_datamodule.train_dataloaders(
train_clean_100, train_clean_100,
dynamic_bucketing=False,
on_the_fly_feats=False, on_the_fly_feats=False,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )
giga_train_dl = asr_datamodule.train_dataloaders( giga_train_dl = asr_datamodule.train_dataloaders(
train_S, train_S,
dynamic_bucketing=True,
on_the_fly_feats=True, on_the_fly_feats=True,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )

View File

@ -73,7 +73,7 @@ from conformer import Conformer
from decoder import Decoder from decoder import Decoder
from gigaspeech import GigaSpeech from gigaspeech import GigaSpeech
from joiner import Joiner from joiner import Joiner
from lhotse import CutSet, load_manifest from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut from lhotse.cut import Cut
from lhotse.utils import fix_random_seed from lhotse.utils import fix_random_seed
from librispeech import LibriSpeech from librispeech import LibriSpeech
@ -662,19 +662,17 @@ def train_one_epoch(
def filter_short_and_long_utterances(cuts: CutSet) -> CutSet: def filter_short_and_long_utterances(cuts: CutSet) -> CutSet:
def remove_short_and_long_utt(c: Cut): def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds # Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0 return 1.0 <= c.duration <= 20.0
num_in_total = len(cuts)
cuts = cuts.filter(remove_short_and_long_utt) cuts = cuts.filter(remove_short_and_long_utt)
num_left = len(cuts)
num_removed = num_in_total - num_left
removed_percent = num_removed / num_in_total * 100
logging.info(f"Before removing short and long utterances: {num_in_total}")
logging.info(f"After removing short and long utterances: {num_left}")
logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
return cuts return cuts
@ -767,17 +765,18 @@ def run(rank, world_size, args):
# DEV 12 hours # DEV 12 hours
# Test 40 hours # Test 40 hours
if params.full_libri: if params.full_libri:
logging.info("Using the L subset of GigaSpeech (2.5k hours)") logging.info("Using the XL subset of GigaSpeech (10k hours)")
train_giga_cuts = gigaspeech.train_L_cuts() train_giga_cuts = gigaspeech.train_XL_cuts()
else: else:
logging.info("Using the S subset of GigaSpeech (250 hours)") logging.info("Using the S subset of GigaSpeech (250 hours)")
train_giga_cuts = gigaspeech.train_S_cuts() train_giga_cuts = gigaspeech.train_S_cuts()
train_giga_cuts = filter_short_and_long_utterances(train_giga_cuts) train_giga_cuts = filter_short_and_long_utterances(train_giga_cuts)
train_giga_cuts = train_giga_cuts.repeat(times=None)
if args.enable_musan: if args.enable_musan:
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "cuts_musan.json.gz" Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
) )
else: else:
cuts_musan = None cuts_musan = None
@ -786,14 +785,12 @@ def run(rank, world_size, args):
train_dl = asr_datamodule.train_dataloaders( train_dl = asr_datamodule.train_dataloaders(
train_cuts, train_cuts,
dynamic_bucketing=False,
on_the_fly_feats=False, on_the_fly_feats=False,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )
giga_train_dl = asr_datamodule.train_dataloaders( giga_train_dl = asr_datamodule.train_dataloaders(
train_giga_cuts, train_giga_cuts,
dynamic_bucketing=True,
on_the_fly_feats=True, on_the_fly_feats=True,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )
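One functional change in this hunk beyond the renamings: the GigaSpeech training cuts are now wrapped with repeat(times=None), turning the lazy CutSet into an endlessly repeating stream, presumably so the auxiliary GigaSpeech dataloader can be drawn from indefinitely while LibriSpeech defines the epoch. A minimal sketch (path illustrative):

```python
from lhotse import load_manifest_lazy

giga_cuts = load_manifest_lazy("data/fbank/gigaspeech_cuts_S_raw.jsonl.gz")
giga_cuts = giga_cuts.filter(lambda c: 1.0 <= c.duration <= 20.0)

# times=None means "repeat forever": iteration restarts from the beginning
# of the manifest whenever it is exhausted.
giga_cuts = giga_cuts.repeat(times=None)
```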

View File

@ -22,7 +22,7 @@ from pathlib import Path
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
import torch import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import ( from lhotse.dataset import (
CutConcatenate, CutConcatenate,
CutMix, CutMix,
@ -176,7 +176,7 @@ class SPGISpeechAsrDataModule:
The state dict for the training sampler. The state dict for the training sampler.
""" """
logging.info("About to get Musan cuts") logging.info("About to get Musan cuts")
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "cuts_musan.jsonl.gz" self.args.manifest_dir / "cuts_musan.jsonl.gz"
) )

View File

@ -52,8 +52,13 @@ def compute_fbank_tedlium():
"test", "test",
) )
prefix = "tedlium"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
prefix="tedlium", dataset_parts=dataset_parts, output_dir=src_dir dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
) )
assert manifests is not None assert manifests is not None
@ -61,7 +66,7 @@ def compute_fbank_tedlium():
with get_executor() as ex: # Initialize the executor only once. with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items(): for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file(): if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.") logging.info(f"{partition} already exists - skipping.")
continue continue
logging.info(f"Processing {partition}") logging.info(f"Processing {partition}")
@ -80,7 +85,7 @@ def compute_fbank_tedlium():
cut_set = cut_set.compute_and_store_features( cut_set = cut_set.compute_and_store_features(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}", storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions # when an executor is specified, make more partitions
num_jobs=cur_num_jobs, num_jobs=cur_num_jobs,
executor=ex, executor=ex,
@ -88,7 +93,7 @@ def compute_fbank_tedlium():
) )
# Split long cuts into many short and un-overlapping cuts # Split long cuts into many short and un-overlapping cuts
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False) cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -27,15 +27,15 @@ for usage.
""" """
from lhotse import load_manifest from lhotse import load_manifest_lazy
def main(): def main():
path = "./data/fbank/cuts_train.json.gz" path = "./data/fbank/tedlium_cuts_train.jsonl.gz"
path = "./data/fbank/cuts_dev.json.gz" path = "./data/fbank/tedlium_cuts_dev.jsonl.gz"
path = "./data/fbank/cuts_test.json.gz" path = "./data/fbank/tedlium_cuts_test.jsonl.gz"
cuts = load_manifest(path) cuts = load_manifest_lazy(path)
cuts.describe() cuts.describe()

View File

@ -22,11 +22,11 @@ import logging
from functools import lru_cache from functools import lru_cache
from pathlib import Path from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, load_manifest from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import ( from lhotse.dataset import (
BucketingSampler,
CutConcatenate, CutConcatenate,
CutMix, CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset, K2SpeechRecognitionDataset,
PrecomputedFeatures, PrecomputedFeatures,
SingleCutSampler, SingleCutSampler,
@ -92,7 +92,7 @@ class TedLiumAsrDataModule:
"--num-buckets", "--num-buckets",
type=int, type=int,
default=30, default=30,
help="The number of buckets for the BucketingSampler" help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).", "(you might want to increase it for larger datasets).",
) )
group.add_argument( group.add_argument(
@ -179,8 +179,8 @@ class TedLiumAsrDataModule:
transforms = [] transforms = []
if self.args.enable_musan: if self.args.enable_musan:
logging.info("Enable MUSAN") logging.info("Enable MUSAN")
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "cuts_musan.json.gz" self.args.manifest_dir / "musan_cuts.jsonl.gz"
) )
transforms.append( transforms.append(
CutMix( CutMix(
@ -261,13 +261,12 @@ class TedLiumAsrDataModule:
) )
if self.args.bucketing_sampler: if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.") logging.info("Using DynamicBucketingSampler.")
train_sampler = BucketingSampler( train_sampler = DynamicBucketingSampler(
cuts_train, cuts_train,
max_duration=self.args.max_duration, max_duration=self.args.max_duration,
shuffle=self.args.shuffle, shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets, num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True, drop_last=True,
) )
else: else:
@ -311,7 +310,7 @@ class TedLiumAsrDataModule:
cut_transforms=transforms, cut_transforms=transforms,
return_cuts=self.args.return_cuts, return_cuts=self.args.return_cuts,
) )
valid_sampler = BucketingSampler( valid_sampler = DynamicBucketingSampler(
cuts_valid, cuts_valid,
max_duration=self.args.max_duration, max_duration=self.args.max_duration,
shuffle=False, shuffle=False,
@ -335,8 +334,10 @@ class TedLiumAsrDataModule:
else PrecomputedFeatures(), else PrecomputedFeatures(),
return_cuts=self.args.return_cuts, return_cuts=self.args.return_cuts,
) )
sampler = BucketingSampler( sampler = DynamicBucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False cuts,
max_duration=self.args.max_duration,
shuffle=False,
) )
logging.debug("About to create test dataloader") logging.debug("About to create test dataloader")
test_dl = DataLoader( test_dl = DataLoader(
@ -350,14 +351,20 @@ class TedLiumAsrDataModule:
@lru_cache() @lru_cache()
def train_cuts(self) -> CutSet: def train_cuts(self) -> CutSet:
logging.info("About to get train cuts") logging.info("About to get train cuts")
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_train.jsonl.gz"
)
@lru_cache() @lru_cache()
def dev_cuts(self) -> CutSet: def dev_cuts(self) -> CutSet:
logging.info("About to get dev cuts") logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_dev.jsonl.gz"
)
@lru_cache() @lru_cache()
def test_cuts(self) -> CutSet: def test_cuts(self) -> CutSet:
logging.info("About to get test cuts") logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_test.jsonl.gz"
)
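The sampler change in this data module boils down to the following sketch; the path, max_duration and num_buckets values are placeholders standing in for the module's command-line arguments.

from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler

cuts_train = load_manifest_lazy("data/fbank/tedlium_cuts_train.jsonl.gz")  # placeholder path
# DynamicBucketingSampler estimates bucket boundaries while iterating, so it
# accepts lazy CutSets; the old bucket_method="equal_duration" knob is gone.
train_sampler = DynamicBucketingSampler(
    cuts_train,
    max_duration=200.0,  # seconds of audio per batch, placeholder value
    shuffle=True,
    num_buckets=30,
    drop_last=True,
)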

View File

@ -29,7 +29,7 @@ import os
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor from icefall.utils import get_executor
@ -53,8 +53,13 @@ def compute_fbank_timit():
"DEV", "DEV",
"TEST", "TEST",
) )
prefix = "timit"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
prefix="timit", dataset_parts=dataset_parts, output_dir=src_dir dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
) )
assert manifests is not None assert manifests is not None
@ -62,7 +67,8 @@ def compute_fbank_timit():
with get_executor() as ex: # Initialize the executor only once. with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items(): for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file(): cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_file.is_file():
logging.info(f"{partition} already exists - skipping.") logging.info(f"{partition} already exists - skipping.")
continue continue
logging.info(f"Processing {partition}") logging.info(f"Processing {partition}")
@ -78,13 +84,13 @@ def compute_fbank_timit():
) )
cut_set = cut_set.compute_and_store_features( cut_set = cut_set.compute_and_store_features(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}", storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions # when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80, num_jobs=num_jobs if ex is None else 80,
executor=ex, executor=ex,
storage_type=LilcomHdf5Writer, storage_type=LilcomChunkyWriter,
) )
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") cut_set.to_file(cuts_file)
if __name__ == "__main__": if __name__ == "__main__":
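A hedged sketch of the rewritten feature-computation step, wrapped as a function so it stands on its own; the extractor config, job count and paths are illustrative rather than the recipe's exact settings.

from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter

def extract_fbank(cut_set: CutSet, output_dir: Path, partition: str) -> None:
    """Sketch only: 80 mel bins and 4 jobs are placeholder values."""
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    cut_set = cut_set.compute_and_store_features(
        extractor=extractor,
        storage_path=f"{output_dir}/timit_feats_{partition}",
        num_jobs=4,
        storage_type=LilcomChunkyWriter,  # chunky writer replaces LilcomHdf5Writer
    )
    # to_file() picks the serialization format from the suffix (.jsonl.gz here).
    cut_set.to_file(output_dir / f"timit_cuts_{partition}.jsonl.gz")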

View File

@ -23,11 +23,11 @@ from functools import lru_cache
from pathlib import Path from pathlib import Path
from typing import List, Union from typing import List, Union
from lhotse import CutSet, Fbank, FbankConfig, load_manifest from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import ( from lhotse.dataset import (
BucketingSampler,
CutConcatenate, CutConcatenate,
CutMix, CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset, K2SpeechRecognitionDataset,
PrecomputedFeatures, PrecomputedFeatures,
SingleCutSampler, SingleCutSampler,
@ -92,7 +92,7 @@ class TimitAsrDataModule(DataModule):
"--num-buckets", "--num-buckets",
type=int, type=int,
default=30, default=30,
help="The number of buckets for the BucketingSampler" help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).", "(you might want to increase it for larger datasets).",
) )
group.add_argument( group.add_argument(
@ -154,7 +154,9 @@ class TimitAsrDataModule(DataModule):
cuts_train = self.train_cuts() cuts_train = self.train_cuts()
logging.info("About to get Musan cuts") logging.info("About to get Musan cuts")
cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz") cuts_musan = load_manifest_lazy(
self.args.feature_dir / "cuts_musan.jsonl.gz"
)
logging.info("About to create train dataset") logging.info("About to create train dataset")
transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))] transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
@ -218,13 +220,12 @@ class TimitAsrDataModule(DataModule):
) )
if self.args.bucketing_sampler: if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.") logging.info("Using DynamicBucketingSampler.")
train_sampler = BucketingSampler( train_sampler = DynamicBucketingSampler(
cuts_train, cuts_train,
max_duration=self.args.max_duration, max_duration=self.args.max_duration,
shuffle=self.args.shuffle, shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets, num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True, drop_last=True,
) )
else: else:
@ -322,20 +323,26 @@ class TimitAsrDataModule(DataModule):
@lru_cache() @lru_cache()
def train_cuts(self) -> CutSet: def train_cuts(self) -> CutSet:
logging.info("About to get train cuts") logging.info("About to get train cuts")
cuts_train = load_manifest(self.args.feature_dir / "cuts_TRAIN.json.gz") cuts_train = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_TRAIN.jsonl.gz"
)
return cuts_train return cuts_train
@lru_cache() @lru_cache()
def valid_cuts(self) -> CutSet: def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts") logging.info("About to get dev cuts")
cuts_valid = load_manifest(self.args.feature_dir / "cuts_DEV.json.gz") cuts_valid = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_DEV.jsonl.gz"
)
return cuts_valid return cuts_valid
@lru_cache() @lru_cache()
def test_cuts(self) -> CutSet: def test_cuts(self) -> CutSet:
logging.debug("About to get test cuts") logging.debug("About to get test cuts")
cuts_test = load_manifest(self.args.feature_dir / "cuts_TEST.json.gz") cuts_test = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_TEST.jsonl.gz"
)
return cuts_test return cuts_test
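The accessor pattern used throughout these data modules, shown as a standalone sketch with a placeholder directory in place of self.args.feature_dir:

from functools import lru_cache
from pathlib import Path
from lhotse import CutSet, load_manifest_lazy

feature_dir = Path("data/fbank")  # placeholder for self.args.feature_dir

@lru_cache()
def train_cuts() -> CutSet:
    # load_manifest_lazy keeps an open handle to the .jsonl.gz file and yields
    # cuts on demand, so the manifest is not materialised in memory up front.
    return load_manifest_lazy(feature_dir / "timit_cuts_TRAIN.jsonl.gz")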

View File

@ -26,7 +26,7 @@ for usage.
""" """
from lhotse import load_manifest from lhotse import load_manifest_lazy
def main(): def main():
@ -40,7 +40,7 @@ def main():
for path in paths: for path in paths:
print(f"Starting display the statistics for {path}") print(f"Starting display the statistics for {path}")
cuts = load_manifest(path) cuts = load_manifest_lazy(path)
cuts.describe() cuts.describe()
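One practical difference with the lazy loader, sketched below with a placeholder path; is_lazy and to_eager() reflect my reading of the lhotse API rather than anything in this commit and should be checked against the installed version.

from lhotse import load_manifest_lazy

# Placeholder path; substitute any cuts manifest produced by the recipe.
cuts = load_manifest_lazy("./data/fbank/cuts_DEV.jsonl.gz")
print(cuts.is_lazy)           # True: the file is read on demand while iterating
eager_cuts = cuts.to_eager()  # materialise everything if random access is needed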

View File

@ -27,7 +27,7 @@ from lhotse import (
CutSet, CutSet,
Fbank, Fbank,
FbankConfig, FbankConfig,
load_manifest, load_manifest_lazy,
set_caching_enabled, set_caching_enabled,
) )
from lhotse.dataset import ( from lhotse.dataset import (
@ -218,8 +218,8 @@ class WenetSpeechAsrDataModule:
The state dict for the training sampler. The state dict for the training sampler.
""" """
logging.info("About to get Musan cuts") logging.info("About to get Musan cuts")
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "cuts_musan.json.gz" self.args.manifest_dir / "musan_cuts.jsonl.gz"
) )
transforms = [] transforms = []
@ -435,16 +435,18 @@ class WenetSpeechAsrDataModule:
@lru_cache() @lru_cache()
def valid_cuts(self) -> CutSet: def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts") logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz") return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
@lru_cache() @lru_cache()
def test_net_cuts(self) -> List[CutSet]: def test_net_cuts(self) -> List[CutSet]:
logging.info("About to get TEST_NET cuts") logging.info("About to get TEST_NET cuts")
return load_manifest(self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz") return load_manifest_lazy(
self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz"
)
@lru_cache() @lru_cache()
def test_meeting_cuts(self) -> List[CutSet]: def test_meeting_cuts(self) -> List[CutSet]:
logging.info("About to get TEST_MEETING cuts") logging.info("About to get TEST_MEETING cuts")
return load_manifest( return load_manifest_lazy(
self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz" self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz"
) )
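The MUSAN hunk above corresponds roughly to the following; the manifest path is a placeholder, while the probability and SNR values mirror the ones used elsewhere in these recipes.

from lhotse import load_manifest_lazy
from lhotse.dataset import CutMix

cuts_musan = load_manifest_lazy("data/fbank/musan_cuts.jsonl.gz")  # placeholder path
transforms = [
    # Mix MUSAN noise into roughly half of the training cuts at 10-20 dB SNR.
    CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20)),
]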

View File

@ -12,7 +12,7 @@ import os
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor from icefall.utils import get_executor
@ -37,10 +37,13 @@ def compute_fbank_yesno():
"train", "train",
"test", "test",
) )
prefix = "yesno"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
dataset_parts=dataset_parts, dataset_parts=dataset_parts,
output_dir=src_dir, output_dir=src_dir,
prefix="yesno", prefix=prefix,
suffix=suffix,
) )
assert manifests is not None assert manifests is not None
@ -50,7 +53,8 @@ def compute_fbank_yesno():
with get_executor() as ex: # Initialize the executor only once. with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items(): for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file(): cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_file.is_file():
logging.info(f"{partition} already exists - skipping.") logging.info(f"{partition} already exists - skipping.")
continue continue
logging.info(f"Processing {partition}") logging.info(f"Processing {partition}")
@ -66,13 +70,13 @@ def compute_fbank_yesno():
) )
cut_set = cut_set.compute_and_store_features( cut_set = cut_set.compute_and_store_features(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}", storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions # when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 1, # use one job num_jobs=num_jobs if ex is None else 1, # use one job
executor=ex, executor=ex,
storage_type=LilcomHdf5Writer, storage_type=LilcomChunkyWriter,
) )
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") cut_set.to_file(cuts_file)
if __name__ == "__main__": if __name__ == "__main__":
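The skip-if-cached logic that several of these scripts now share, as a standalone sketch; the directory and partition names are placeholders and the feature computation itself is elided.

from pathlib import Path

output_dir = Path("data/fbank")  # placeholder
prefix, suffix = "yesno", "jsonl.gz"
for partition in ("train", "test"):
    cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
    if cuts_file.is_file():
        print(f"{partition} already exists - skipping.")
        continue
    # ... compute and store features for this partition, then finish with
    # cut_set.to_file(cuts_file) so later runs find the manifest and skip it.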

View File

@ -20,18 +20,19 @@ from functools import lru_cache
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
CutConcatenate,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SingleCutSampler,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from icefall.dataset.datamodule import DataModule from icefall.dataset.datamodule import DataModule
from icefall.utils import str2bool from icefall.utils import str2bool
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
class YesNoAsrDataModule(DataModule): class YesNoAsrDataModule(DataModule):
@ -84,7 +85,7 @@ class YesNoAsrDataModule(DataModule):
"--num-buckets", "--num-buckets",
type=int, type=int,
default=10, default=10,
help="The number of buckets for the BucketingSampler" help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).", "(you might want to increase it for larger datasets).",
) )
group.add_argument( group.add_argument(
@ -186,18 +187,17 @@ class YesNoAsrDataModule(DataModule):
) )
if self.args.bucketing_sampler: if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.") logging.info("Using DynamicBucketingSampler.")
train_sampler = BucketingSampler( train_sampler = DynamicBucketingSampler(
cuts_train, cuts_train,
max_duration=self.args.max_duration, max_duration=self.args.max_duration,
shuffle=self.args.shuffle, shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets, num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True, drop_last=True,
) )
else: else:
logging.info("Using SingleCutSampler.") logging.info("Using SingleCutSampler.")
train_sampler = BucketingSampler( train_sampler = SingleCutSampler(
cuts_train, cuts_train,
max_duration=self.args.max_duration, max_duration=self.args.max_duration,
shuffle=self.args.shuffle, shuffle=self.args.shuffle,
@ -225,8 +225,10 @@ class YesNoAsrDataModule(DataModule):
else PrecomputedFeatures(), else PrecomputedFeatures(),
return_cuts=self.args.return_cuts, return_cuts=self.args.return_cuts,
) )
sampler = BucketingSampler( sampler = DynamicBucketingSampler(
cuts_test, max_duration=self.args.max_duration, shuffle=False cuts_test,
max_duration=self.args.max_duration,
shuffle=False,
) )
logging.debug("About to create test dataloader") logging.debug("About to create test dataloader")
test_dl = DataLoader( test_dl = DataLoader(
@ -240,11 +242,15 @@ class YesNoAsrDataModule(DataModule):
@lru_cache() @lru_cache()
def train_cuts(self) -> CutSet: def train_cuts(self) -> CutSet:
logging.info("About to get train cuts") logging.info("About to get train cuts")
cuts_train = load_manifest(self.args.feature_dir / "cuts_train.json.gz") cuts_train = load_manifest_lazy(
self.args.feature_dir / "yesno_cuts_train.jsonl.gz"
)
return cuts_train return cuts_train
@lru_cache() @lru_cache()
def test_cuts(self) -> List[CutSet]: def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts") logging.info("About to get test cuts")
cuts_test = load_manifest(self.args.feature_dir / "cuts_test.json.gz") cuts_test = load_manifest_lazy(
self.args.feature_dir / "yesno_cuts_test.jsonl.gz"
)
return cuts_test return cuts_test
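The sampler selection in this data module also fixes a latent bug: the non-bucketing branch previously constructed a BucketingSampler even though the log message announced SingleCutSampler. A sketch with placeholder values:

from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler, SingleCutSampler

cuts_train = load_manifest_lazy("data/fbank/yesno_cuts_train.jsonl.gz")  # placeholder path
use_bucketing_sampler = True  # stands in for self.args.bucketing_sampler

if use_bucketing_sampler:
    train_sampler = DynamicBucketingSampler(
        cuts_train,
        max_duration=200.0,  # placeholder
        shuffle=True,
        num_buckets=10,
        drop_last=True,
    )
else:
    # The fallback branch now really uses SingleCutSampler,
    # matching the log message printed just above it.
    train_sampler = SingleCutSampler(
        cuts_train,
        max_duration=200.0,  # placeholder
        shuffle=True,
    )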

View File

@ -131,7 +131,6 @@ def setup_logger(
format=formatter, format=formatter,
level=level, level=level,
filemode="w", filemode="w",
force=True,
) )
if use_console: if use_console:
console = logging.StreamHandler() console = logging.StreamHandler()