Use jsonl for CutSet in the LibriSpeech recipe. (#397)

* Use jsonl for cutsets in the librispeech recipe.

* Use lazy cutset for all recipes.

* More fixes to use lazy CutSet.

* Remove force=True from logging to support Python < 3.8

* Minor fixes.

* Fix style issues.
Fangjun Kuang 2022-06-06 10:19:16 +08:00 committed by GitHub
parent e5884f82e0
commit f1abce72f8
68 changed files with 702 additions and 1098 deletions
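
The gist of the migration, before the per-file diffs: cut manifests move from eagerly loaded cuts_*.json.gz files to lazily iterated {prefix}_cuts_*.jsonl.gz files. A minimal sketch of the difference, using only lhotse's public API (the path is one of the manifests produced below):

from lhotse import load_manifest_lazy

# Old style: load_manifest("data/fbank/cuts_train.json.gz") parsed the
# whole gzipped JSON into memory at once.
# New style: open the JSONL file and yield cuts on demand. Lazy cut sets
# generally do not define __len__, which drives several changes below.
cuts = load_manifest_lazy(
    "data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
)
for cut in cuts:  # streams one cut at a time
    print(cut.id)
    break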

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -58,6 +58,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -98,7 +100,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -58,6 +58,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -98,7 +100,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -58,6 +58,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -98,7 +100,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)
def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
src_dir = Path("data/manifests/aidatatang_200zh")
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
num_jobs = min(15, os.cpu_count())
@ -52,11 +52,13 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
"dev",
"test",
)
prefix = "aidatatang"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="aidatatang",
suffix="jsonl.gz",
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -64,10 +66,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
for sup in m["supervisions"]:
sup.custom = {"origin": "aidatatang_200zh"}
cut_set = CutSet.from_manifests(
recordings=m["recordings"],
supervisions=m["supervisions"],
@ -80,13 +86,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=ChunkedLilcomHdf5Writer,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
def get_args():
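
With prefix and suffix threaded through read_manifests_if_cached, all manifests for a recipe share one naming scheme. A short sketch of the filenames this script reads and writes, assuming the usual lhotse layout of {prefix}_{kind}_{part}.{suffix}:

from pathlib import Path

prefix, suffix = "aidatatang", "jsonl.gz"
src_dir, output_dir = Path("data/manifests"), Path("data/fbank")
for partition in ("train", "dev", "test"):
    # Inputs located by read_manifests_if_cached:
    print(src_dir / f"{prefix}_recordings_{partition}.{suffix}")
    print(src_dir / f"{prefix}_supervisions_{partition}.{suffix}")
    # Output written by cut_set.to_file, e.g.
    # data/fbank/aidatatang_cuts_train.jsonl.gz:
    print(output_dir / f"{prefix}_cuts_{partition}.{suffix}")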

View File

@ -25,19 +25,19 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
paths = [
"./data/fbank/cuts_train.json.gz",
"./data/fbank/cuts_dev.json.gz",
"./data/fbank/cuts_test.json.gz",
"./data/fbank/aidatatang_cuts_train.jsonl.gz",
"./data/fbank/aidatatang_cuts_dev.jsonl.gz",
"./data/fbank/aidatatang_cuts_test.jsonl.gz",
]
for path in paths:
print(f"Starting display the statistics for {path}")
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()
@ -45,7 +45,7 @@ if __name__ == "__main__":
main()
"""
Starting display the statistics for ./data/fbank/cuts_train.json.gz
Starting display the statistics for ./data/fbank/aidatatang_cuts_train.jsonl.gz
Cuts count: 494715
Total duration (hours): 422.6
Speech duration (hours): 422.6 (100.0%)
@ -61,7 +61,7 @@ min 1.0
99.5% 8.0
99.9% 9.5
max 18.1
Starting display the statistics for ./data/fbank/cuts_dev.json.gz
Starting display the statistics for ./data/fbank/aidatatang_cuts_dev.jsonl.gz
Cuts count: 24216
Total duration (hours): 20.2
Speech duration (hours): 20.2 (100.0%)
@ -77,7 +77,7 @@ min 1.2
99.5% 7.3
99.9% 8.8
max 11.3
Starting display the statistics for ./data/fbank/cuts_test.json.gz
Starting display the statistics for ./data/fbank/aidatatang_cuts_test.jsonl.gz
Cuts count: 48144
Total duration (hours): 40.2
Speech duration (hours): 40.2 (100.0%)

View File

@ -27,11 +27,10 @@ from lhotse import (
CutSet,
Fbank,
FbankConfig,
load_manifest,
load_manifest_lazy,
set_caching_enabled,
)
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
@ -205,8 +204,8 @@ class Aidatatang_200zhAsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms = []
@ -290,13 +289,12 @@ class Aidatatang_200zhAsrDataModule:
)
if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
else:
@ -402,14 +400,20 @@ class Aidatatang_200zhAsrDataModule:
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
)
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz"
)
@lru_cache()
def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
)

View File

@ -195,9 +195,9 @@ def get_params() -> AttributeDict:
"best_train_epoch": -1,
"best_valid_epoch": -1,
"batch_idx_train": 0,
"log_interval": 10,
"log_interval": 50,
"reset_interval": 200,
"valid_interval": 3000,
"valid_interval": 2000,
# parameters for k2.ctc_loss
"beam_size": 10,
"reduction": "sum",

View File

@ -0,0 +1,119 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file computes fbank features of the aidatatang_200zh dataset.
It looks for manifests in the directory data/manifests.
The generated fbank features are saved in data/fbank.
"""
import argparse
import logging
import os
from pathlib import Path
import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
num_jobs = min(15, os.cpu_count())
dataset_parts = (
"train",
"test",
"dev",
)
prefix = "aidatatang"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
for sup in m["supervisions"]:
sup.custom = {"origin": "aidatatang_200zh"}
cut_set = CutSet.from_manifests(
recordings=m["recordings"],
supervisions=m["supervisions"],
)
if "train" in partition:
cut_set = (
cut_set
+ cut_set.perturb_speed(0.9)
+ cut_set.perturb_speed(1.1)
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=LilcomChunkyWriter,
)
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--num-mel-bins",
type=int,
default=80,
help="""The number of mel bins for Fbank""",
)
return parser.parse_args()
if __name__ == "__main__":
formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO)
args = get_args()
compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins)

View File

@ -29,7 +29,7 @@ import os
from pathlib import Path
import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@ -52,8 +52,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
"dev",
"test",
)
prefix = "aishell"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="aishell", dataset_parts=dataset_parts, output_dir=src_dir
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -61,7 +66,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -77,13 +82,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=LilcomHdf5Writer,
storage_type=LilcomChunkyWriter,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
def get_args():

View File

@ -25,18 +25,18 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
# path = "./data/fbank/cuts_train.json.gz"
# path = "./data/fbank/cuts_test.json.gz"
# path = "./data/fbank/cuts_dev.json.gz"
# path = "./data/fbank/aidatatang_200zh/cuts_train_raw.jsonl.gz"
# path = "./data/fbank/aidatatang_200zh/cuts_test_raw.jsonl.gz"
path = "./data/fbank/aidatatang_200zh/cuts_dev_raw.jsonl.gz"
# path = "./data/fbank/aishell_cuts_train.jsonl.gz"
# path = "./data/fbank/aishell_cuts_test.jsonl.gz"
path = "./data/fbank/aishell_cuts_dev.jsonl.gz"
# path = "./data/fbank/aidatatang_cuts_train.jsonl.gz"
# path = "./data/fbank/aidatatang_cuts_test.jsonl.gz"
# path = "./data/fbank/aidatatang_cuts_dev.jsonl.gz"
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()

View File

@ -1,71 +0,0 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from lhotse import CutSet
from lhotse.recipes.utils import read_manifests_if_cached
def preprocess_aidatatang_200zh():
src_dir = Path("data/manifests/aidatatang_200zh")
output_dir = Path("data/fbank/aidatatang_200zh")
output_dir.mkdir(exist_ok=True, parents=True)
dataset_parts = (
"train",
"test",
"dev",
)
logging.info("Loading manifest")
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts, output_dir=src_dir, prefix="aidatatang"
)
assert len(manifests) > 0
for partition, m in manifests.items():
logging.info(f"Processing {partition}")
raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
if raw_cuts_path.is_file():
logging.info(f"{partition} already exists - skipping")
continue
for sup in m["supervisions"]:
sup.custom = {"origin": "aidatatang_200zh"}
cut_set = CutSet.from_manifests(
recordings=m["recordings"],
supervisions=m["supervisions"],
)
logging.info(f"Saving to {raw_cuts_path}")
cut_set.to_file(raw_cuts_path)
def main():
formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO)
preprocess_aidatatang_200zh()
if __name__ == "__main__":
main()

View File

@ -42,18 +42,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare manifest"
# We assume that you have downloaded the aidatatang_200zh corpus
# to $dl_dir/aidatatang_200zh
if [ ! -f data/manifests/aidatatang_200zh/.manifests.done ]; then
mkdir -p data/manifests/aidatatang_200zh
lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
touch data/manifests/aidatatang_200zh/.manifests.done
if [ ! -f data/manifests/.aidatatang_200zh_manifests.done ]; then
mkdir -p data/manifests
lhotse prepare aidatatang-200zh $dl_dir data/manifests
touch data/manifests/.aidatatang_200zh_manifests.done
fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Process aidatatang_200zh"
if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then
mkdir -p data/fbank/aidatatang_200zh
lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
touch data/fbank/aidatatang_200zh/.fbank.done
if [ ! -f data/fbank/.aidatatang_200zh_fbank.done ]; then
mkdir -p data/fbank
./local/compute_fbank_aidatatang_200zh.py
touch data/fbank/.aidatatang_200zh_fbank.done
fi
fi

View File

@ -23,11 +23,11 @@ from functools import lru_cache
from pathlib import Path
from typing import List
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SingleCutSampler,
@ -93,7 +93,7 @@ class AishellAsrDataModule:
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler"
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
@ -133,6 +133,12 @@ class AishellAsrDataModule:
help="When enabled (=default), the examples will be "
"shuffled for each epoch.",
)
group.add_argument(
"--drop-last",
type=str2bool,
default=True,
help="Whether to drop last batch. Used by sampler.",
)
group.add_argument(
"--return-cuts",
type=str2bool,
@ -177,8 +183,8 @@ class AishellAsrDataModule:
def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms = []
@ -262,14 +268,13 @@ class AishellAsrDataModule:
)
if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
drop_last=self.args.drop_last,
)
else:
logging.info("Using SingleCutSampler.")
@ -313,7 +318,7 @@ class AishellAsrDataModule:
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
@ -337,8 +342,10 @@ class AishellAsrDataModule:
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=False,
)
test_dl = DataLoader(
test,
@ -351,17 +358,21 @@ class AishellAsrDataModule:
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
cuts_train = load_manifest(
self.args.manifest_dir / "cuts_train.json.gz"
cuts_train = load_manifest_lazy(
self.args.manifest_dir / "aishell_cuts_train.jsonl.gz"
)
return cuts_train
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aishell_cuts_dev.jsonl.gz"
)
@lru_cache()
def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aishell_cuts_test.jsonl.gz"
)
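
BucketingSampler sorts the whole cut set up front to build equal-duration buckets, which requires an eager manifest; DynamicBucketingSampler estimates bucket boundaries from a buffer of cuts while streaming, so it works with the lazy manifests introduced here. A minimal sketch using the same arguments as above:

from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler

cuts_train = load_manifest_lazy(
    "data/fbank/aishell_cuts_train.jsonl.gz"
)
train_sampler = DynamicBucketingSampler(
    cuts_train,
    max_duration=200.0,  # seconds of audio per batch
    shuffle=True,
    num_buckets=30,
    drop_last=True,
)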

View File

@ -15,6 +15,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./tdnn_lstm_ctc/train.py \
--world-size 4 \
--num-epochs 20 \
--max-duration 300
"""
import argparse
import logging

View File

@ -110,9 +110,7 @@ class Conformer(Transformer):
x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C)
# Caution: We assume the subsampling factor is 4!
with warnings.catch_warnings():
warnings.simplefilter("ignore")
lengths = ((x_lens - 1) // 2 - 1) // 2
lengths = (((x_lens - 1) >> 1) - 1) >> 1
assert x.size(0) == lengths.max().item()
mask = make_pad_mask(lengths)
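
The two length formulas agree bit for bit for non-negative integer tensors, since floor division by two is a right shift; the shift form merely sidesteps the floor-division deprecation warning that some PyTorch versions emit for // on tensors, so the catch_warnings block can go. A quick check of the equivalence:

import torch

x_lens = torch.arange(7, 40)
old = ((x_lens - 1) // 2 - 1) // 2
new = (((x_lens - 1) >> 1) - 1) >> 1
assert torch.equal(old, new)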

View File

@ -21,6 +21,7 @@
import argparse
import logging
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Optional, Tuple
@ -386,7 +387,11 @@ def compute_loss(
assert loss.requires_grad == is_training
info = MetricsTracker()
info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
info["frames"] = (
(feature_lens // params.subsampling_factor).sum().item()
)
# Note: We use reduction=sum while computing the loss.
info["loss"] = loss.detach().cpu().item()

View File

@ -18,7 +18,7 @@
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
class AIDatatang200zh:
@ -28,26 +28,26 @@ class AIDatatang200zh:
manifest_dir:
It is expected to contain the following files::
- cuts_dev_raw.jsonl.gz
- cuts_train_raw.jsonl.gz
- cuts_test_raw.jsonl.gz
- aidatatang_cuts_dev.jsonl.gz
- aidatatang_cuts_train.jsonl.gz
- aidatatang_cuts_test.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train_raw.jsonl.gz"
f = self.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
logging.info(f"About to get train cuts from {f}")
cuts_train = load_manifest(f)
cuts_train = load_manifest_lazy(f)
return cuts_train
def valid_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_valid_raw.jsonl.gz"
f = self.manifest_dir / "aidatatang_cuts_valid.jsonl.gz"
logging.info(f"About to get valid cuts from {f}")
cuts_valid = load_manifest(f)
cuts_valid = load_manifest_lazy(f)
return cuts_valid
def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test_raw.jsonl.gz"
f = self.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
logging.info(f"About to get test cuts from {f}")
cuts_test = load_manifest(f)
cuts_test = load_manifest_lazy(f)
return cuts_test

View File

@ -18,7 +18,7 @@
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
class AIShell:
@ -28,26 +28,26 @@ class AIShell:
manifest_dir:
It is expected to contain the following files::
- cuts_dev.json.gz
- cuts_train.json.gz
- cuts_test.json.gz
- aishell_cuts_dev.jsonl.gz
- aishell_cuts_train.jsonl.gz
- aishell_cuts_test.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train.json.gz"
f = self.manifest_dir / "aishell_cuts_train.jsonl.gz"
logging.info(f"About to get train cuts from {f}")
cuts_train = load_manifest(f)
cuts_train = load_manifest_lazy(f)
return cuts_train
def valid_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev.json.gz"
f = self.manifest_dir / "aishell_cuts_dev.jsonl.gz"
logging.info(f"About to get valid cuts from {f}")
cuts_valid = load_manifest(f)
cuts_valid = load_manifest_lazy(f)
return cuts_valid
def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test.json.gz"
f = self.manifest_dir / "aishell_cuts_test.jsonl.gz"
logging.info(f"About to get test cuts from {f}")
cuts_test = load_manifest(f)
cuts_test = load_manifest_lazy(f)
return cuts_test

View File

@ -24,7 +24,6 @@ from typing import Optional
from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import (
BucketingSampler,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
@ -73,8 +72,7 @@ class AsrDataModule:
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler "
"and DynamicBucketingSampler."
help="The number of buckets for the DynamicBucketingSampler "
"(you might want to increase it for larger datasets).",
)
@ -147,7 +145,6 @@ class AsrDataModule:
def train_dataloaders(
self,
cuts_train: CutSet,
dynamic_bucketing: bool,
on_the_fly_feats: bool,
cuts_musan: Optional[CutSet] = None,
) -> DataLoader:
@ -157,9 +154,6 @@ class AsrDataModule:
Cuts for training.
cuts_musan:
If not None, it is the cuts for mixing.
dynamic_bucketing:
True to use DynamicBucketingSampler;
False to use BucketingSampler.
on_the_fly_feats:
True to use OnTheFlyFeatures;
False to use PrecomputedFeatures.
@ -232,7 +226,6 @@ class AsrDataModule:
return_cuts=self.args.return_cuts,
)
if dynamic_bucketing:
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
@ -241,16 +234,6 @@ class AsrDataModule:
num_buckets=self.args.num_buckets,
drop_last=True,
)
else:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
logging.info("About to create train dataloader")
train_dl = DataLoader(
@ -279,7 +262,7 @@ class AsrDataModule:
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
@ -303,8 +286,10 @@ class AsrDataModule:
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(

View File

@ -41,6 +41,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2"
import argparse
import logging
import random
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Optional, Tuple
@ -55,7 +56,7 @@ from asr_datamodule import AsrDataModule
from conformer import Conformer
from decoder import Decoder
from joiner import Joiner
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed
from model import Transducer
@ -446,7 +447,11 @@ def compute_loss(
assert loss.requires_grad == is_training
info = MetricsTracker()
info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
info["frames"] = (
(feature_lens // params.subsampling_factor).sum().item()
)
# Note: We use reduction=sum while computing the loss.
info["loss"] = loss.detach().cpu().item()
@ -635,20 +640,16 @@ def train_one_epoch(
def filter_short_and_long_utterances(cuts: CutSet) -> CutSet:
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 12 seconds
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 12.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 12.0
num_in_total = len(cuts)
cuts = cuts.filter(remove_short_and_long_utt)
num_left = len(cuts)
num_removed = num_in_total - num_left
removed_percent = num_removed / num_in_total * 100
logging.info(f"Before removing short and long utterances: {num_in_total}")
logging.info(f"After removing short and long utterances: {num_left}")
logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
return cuts
@ -728,15 +729,14 @@ def run(rank, world_size, args):
train_cuts = aishell.train_cuts()
train_cuts = filter_short_and_long_utterances(train_cuts)
datatang = AIDatatang200zh(
manifest_dir=f"{args.manifest_dir}/aidatatang_200zh"
)
datatang = AIDatatang200zh(manifest_dir=args.manifest_dir)
train_datatang_cuts = datatang.train_cuts()
train_datatang_cuts = filter_short_and_long_utterances(train_datatang_cuts)
train_datatang_cuts = train_datatang_cuts.repeat(times=None)
if args.enable_musan:
cuts_musan = load_manifest(
Path(args.manifest_dir) / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
)
else:
cuts_musan = None
@ -745,22 +745,23 @@ def run(rank, world_size, args):
train_dl = asr_datamodule.train_dataloaders(
train_cuts,
dynamic_bucketing=False,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
datatang_train_dl = asr_datamodule.train_dataloaders(
train_datatang_cuts,
dynamic_bucketing=True,
on_the_fly_feats=True,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
valid_cuts = aishell.valid_cuts()
valid_dl = asr_datamodule.valid_dataloaders(valid_cuts)
for dl in [train_dl, datatang_train_dl]:
for dl in [
train_dl,
# datatang_train_dl
]:
scan_pessimistic_batches_for_oom(
model=model,
train_dl=dl,

View File

@ -37,6 +37,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2"
import argparse
import logging
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Optional, Tuple
@ -411,7 +412,11 @@ def compute_loss(
assert loss.requires_grad == is_training
info = MetricsTracker()
info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
info["frames"] = (
(feature_lens // params.subsampling_factor).sum().item()
)
# Note: We use reduction=sum while computing the loss.
info["loss"] = loss.detach().cpu().item()

View File

@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)
def compute_fbank_alimeeting(num_mel_bins: int = 80):
src_dir = Path("data/manifests/alimeeting")
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
num_jobs = min(15, os.cpu_count())
@ -52,11 +52,14 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
"eval",
"test",
)
prefix = "alimeeting"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix="alimeeting",
suffix="jsonl.gz",
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -64,7 +67,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -83,7 +86,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=cur_num_jobs,
executor=ex,
@ -95,7 +98,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
keep_overlapping=False,
min_duration=None,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
def get_args():

View File

@ -25,19 +25,19 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
paths = [
"./data/fbank/cuts_train.json.gz",
"./data/fbank/cuts_eval.json.gz",
"./data/fbank/cuts_test.json.gz",
"./data/fbank/alimeeting_cuts_train.jsonl.gz",
"./data/fbank/alimeeting_cuts_eval.jsonl.gz",
"./data/fbank/alimeeting_cuts_test.jsonl.gz",
]
for path in paths:
print(f"Starting display the statistics for {path}")
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()
@ -45,7 +45,7 @@ if __name__ == "__main__":
main()
"""
Starting display the statistics for ./data/fbank/cuts_train.json.gz
Starting display the statistics for ./data/fbank/alimeeting_cuts_train.jsonl.gz
Cuts count: 559092
Total duration (hours): 424.6
Speech duration (hours): 424.6 (100.0%)
@ -61,7 +61,7 @@ min 0.0
99.5% 14.7
99.9% 16.2
max 284.3
Starting display the statistics for ./data/fbank/cuts_eval.json.gz
Starting display the statistics for ./data/fbank/alimeeting_cuts_eval.jsonl.gz
Cuts count: 6457
Total duration (hours): 4.9
Speech duration (hours): 4.9 (100.0%)
@ -77,7 +77,7 @@ min 0.1
99.5% 14.1
99.9% 14.7
max 15.8
Starting display the statistics for ./data/fbank/cuts_test.json.gz
Starting display the statistics for ./data/fbank/alimeeting_cuts_test.jsonl.gz
Cuts count: 16358
Total duration (hours): 12.5
Speech duration (hours): 12.5 (100.0%)

View File

@ -27,7 +27,7 @@ from lhotse import (
CutSet,
Fbank,
FbankConfig,
load_manifest,
load_manifest_lazy,
set_caching_enabled,
)
from lhotse.dataset import (
@ -204,8 +204,8 @@ class AlimeetingAsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms = []
@ -401,14 +401,20 @@ class AlimeetingAsrDataModule:
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "alimeeting_cuts_train.jsonl.gz"
)
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_eval.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "alimeeting_cuts_eval.jsonl.gz"
)
@lru_cache()
def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "alimeeting_cuts_test.jsonl.gz"
)

View File

@ -20,9 +20,8 @@ import logging
from functools import lru_cache
from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
@ -190,8 +189,8 @@ class GigaSpeechAsrDataModule:
def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms = []
@ -315,7 +314,7 @@ class GigaSpeechAsrDataModule:
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
@ -339,8 +338,10 @@ class GigaSpeechAsrDataModule:
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
@ -361,7 +362,9 @@ class GigaSpeechAsrDataModule:
@lru_cache()
def dev_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
cuts_valid = load_manifest_lazy(
self.args.manifest_dir / "cuts_DEV.jsonl.gz"
)
if self.args.small_dev:
return cuts_valid.subset(first=1000)
else:
@ -370,4 +373,4 @@ class GigaSpeechAsrDataModule:
@lru_cache()
def test_cuts(self) -> CutSet:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")

View File

@ -1,103 +0,0 @@
#!/usr/bin/env python3
# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
import torch
from lhotse import (
CutSet,
KaldifeatFbank,
KaldifeatFbankConfig,
combine,
)
from lhotse.recipes.utils import read_manifests_if_cached
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
def compute_fbank_musan():
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
# number of workers in dataloader
num_workers = 10
# number of seconds in a batch
batch_duration = 600
dataset_parts = (
"music",
"speech",
"noise",
)
manifests = read_manifests_if_cached(
prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir
)
assert manifests is not None
musan_cuts_path = output_dir / "cuts_musan.json.gz"
if musan_cuts_path.is_file():
logging.info(f"{musan_cuts_path} already exists - skipping")
return
logging.info("Extracting features for Musan")
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
logging.info(f"device: {device}")
musan_cuts = (
CutSet.from_manifests(
recordings=combine(
part["recordings"] for part in manifests.values()
)
)
.cut_into_windows(10.0)
.filter(lambda c: c.duration > 5)
.compute_and_store_features_batch(
extractor=extractor,
storage_path=f"{output_dir}/feats_musan",
num_workers=num_workers,
batch_duration=batch_duration,
)
)
musan_cuts.to_json(musan_cuts_path)
def main():
formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO)
compute_fbank_musan()
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@
../../../librispeech/ASR/local/compute_fbank_musan.py

View File

@ -23,9 +23,8 @@ from pathlib import Path
from typing import Any, Dict, Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
@ -217,8 +216,8 @@ class GigaSpeechAsrDataModule:
if self.args.enable_musan:
logging.info("Enable MUSAN")
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms.append(
CutMix(
@ -358,7 +357,7 @@ class GigaSpeechAsrDataModule:
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
@ -382,8 +381,10 @@ class GigaSpeechAsrDataModule:
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
@ -404,7 +405,9 @@ class GigaSpeechAsrDataModule:
@lru_cache()
def dev_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
cuts_valid = load_manifest_lazy(
self.args.manifest_dir / "cuts_DEV.jsonl.gz"
)
if self.args.small_dev:
return cuts_valid.subset(first=1000)
else:
@ -413,4 +416,4 @@ class GigaSpeechAsrDataModule:
@lru_cache()
def test_cuts(self) -> CutSet:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")

View File

@ -96,14 +96,14 @@ def get_parser():
- labels_xxx.h5
- aux_labels_xxx.h5
- cuts_xxx.json.gz
- librispeech_cuts_xxx.jsonl.gz
where xxx is the value of `--dataset`. For instance, if
`--dataset` is `train-clean-100`, it will contain 3 files:
- `labels_train-clean-100.h5`
- `aux_labels_train-clean-100.h5`
- `cuts_train-clean-100.json.gz`
- `librispeech_cuts_train-clean-100.jsonl.gz`
Note: Both labels_xxx.h5 and aux_labels_xxx.h5 contain framewise
alignment. The difference is that labels_xxx.h5 contains repeats.
@ -289,7 +289,9 @@ def main():
out_labels_ali_filename = out_dir / f"labels_{params.dataset}.h5"
out_aux_labels_ali_filename = out_dir / f"aux_labels_{params.dataset}.h5"
out_manifest_filename = out_dir / f"cuts_{params.dataset}.json.gz"
out_manifest_filename = (
out_dir / f"librispeech_cuts_{params.dataset}.jsonl.gz"
)
for f in (
out_labels_ali_filename,

View File

@ -17,6 +17,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./conformer_ctc/train.py \
--exp-dir ./conformer_ctc/exp \
--world-size 4 \
--full-libri 1 \
--max-duration 200 \
--num-epochs 20
"""
import argparse
import logging
from pathlib import Path
@ -29,6 +40,7 @@ import torch.multiprocessing as mp
import torch.nn as nn
from asr_datamodule import LibriSpeechAsrDataModule
from conformer import Conformer
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed
from torch import Tensor
from torch.nn.parallel import DistributedDataParallel as DDP
@ -676,6 +688,20 @@ def run(rank, world_size, args):
if params.full_libri:
train_cuts += librispeech.train_clean_360_cuts()
train_cuts += librispeech.train_other_500_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
train_cuts = train_cuts.filter(remove_short_and_long_utt)
train_dl = librispeech.train_dataloaders(train_cuts)
valid_cuts = librispeech.dev_clean_cuts()

View File

@ -20,11 +20,7 @@ import logging
from pathlib import Path
import torch
from lhotse import (
CutSet,
KaldifeatFbank,
KaldifeatFbankConfig,
)
from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down.
@ -51,13 +47,16 @@ def compute_fbank_gigaspeech_dev_test():
logging.info(f"device: {device}")
prefix = "gigaspeech"
suffix = "jsonl.gz"
for partition in subsets:
cuts_path = in_out_dir / f"cuts_{partition}.jsonl.gz"
cuts_path = in_out_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_path.is_file():
logging.info(f"{cuts_path} exists - skipping")
continue
raw_cuts_path = in_out_dir / f"cuts_{partition}_raw.jsonl.gz"
raw_cuts_path = in_out_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
logging.info(f"Loading {raw_cuts_path}")
cut_set = CutSet.from_file(raw_cuts_path)
@ -66,7 +65,7 @@ def compute_fbank_gigaspeech_dev_test():
cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor,
storage_path=f"{in_out_dir}/feats_{partition}",
storage_path=f"{in_out_dir}/{prefix}_feats_{partition}",
num_workers=num_workers,
batch_duration=batch_duration,
)

View File

@ -77,7 +77,7 @@ def get_parser():
def compute_fbank_gigaspeech_splits(args):
num_splits = args.num_splits
output_dir = f"data/fbank/XL_split_{num_splits}"
output_dir = f"data/fbank/gigaspeech_XL_split_{num_splits}"
output_dir = Path(output_dir)
assert output_dir.exists(), f"{output_dir} does not exist!"
@ -96,17 +96,19 @@ def compute_fbank_gigaspeech_splits(args):
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
logging.info(f"device: {device}")
prefix = "gigaspeech"
num_digits = 8 # num_digits is fixed by lhotse split-lazy
for i in range(start, stop):
idx = f"{i + 1}".zfill(num_digits)
logging.info(f"Processing {idx}/{num_splits}")
cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz"
cuts_path = output_dir / f"{prefix}_cuts_XL.{idx}.jsonl.gz"
if cuts_path.is_file():
logging.info(f"{cuts_path} exists - skipping")
continue
raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz"
raw_cuts_path = output_dir / f"{prefix}_cuts_XL_raw.{idx}.jsonl.gz"
if not raw_cuts_path.is_file():
logging.info(f"{raw_cuts_path} does not exist - skipping it")
continue
@ -115,13 +117,13 @@ def compute_fbank_gigaspeech_splits(args):
cut_set = CutSet.from_file(raw_cuts_path)
logging.info("Computing features")
if (output_dir / f"feats_XL_{idx}.lca").exists():
logging.info(f"Removing {output_dir}/feats_XL_{idx}.lca")
os.remove(output_dir / f"feats_XL_{idx}.lca")
if (output_dir / f"{prefix}_feats_XL_{idx}.lca").exists():
logging.info(f"Removing {output_dir}/{prefix}_feats_XL_{idx}.lca")
os.remove(output_dir / f"{prefix}_feats_XL_{idx}.lca")
cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor,
storage_path=f"{output_dir}/feats_XL_{idx}",
storage_path=f"{output_dir}/{prefix}_feats_XL_{idx}",
num_workers=args.num_workers,
batch_duration=args.batch_duration,
)
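
lhotse split-lazy writes pieces with 1-based, zero-padded 8-digit indices inserted before the extension, which is why num_digits is hard-coded above. A sketch of the piece names the loop expects:

prefix = "gigaspeech"
num_digits = 8  # fixed by `lhotse split-lazy`
for i in range(2):
    idx = f"{i + 1}".zfill(num_digits)
    print(f"{prefix}_cuts_XL_raw.{idx}.jsonl.gz")
# gigaspeech_cuts_XL_raw.00000001.jsonl.gz
# gigaspeech_cuts_XL_raw.00000002.jsonl.gz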

View File

@ -28,7 +28,7 @@ import os
from pathlib import Path
import torch
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@ -56,8 +56,13 @@ def compute_fbank_librispeech():
"train-clean-360",
"train-other-500",
)
prefix = "librispeech"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="librispeech", dataset_parts=dataset_parts, output_dir=src_dir
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -65,7 +70,8 @@ def compute_fbank_librispeech():
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
if (output_dir / cuts_filename).is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -81,13 +87,13 @@ def compute_fbank_librispeech():
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=ChunkedLilcomHdf5Writer,
storage_type=LilcomChunkyWriter,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(output_dir / cuts_filename)
if __name__ == "__main__":

View File

@ -28,7 +28,7 @@ import os
from pathlib import Path
import torch
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@ -52,12 +52,22 @@ def compute_fbank_musan():
"speech",
"noise",
)
prefix = "musan"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
musan_cuts_path = output_dir / "cuts_musan.json.gz"
assert len(manifests) == len(dataset_parts), (
len(manifests),
len(dataset_parts),
)
musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
if musan_cuts_path.is_file():
logging.info(f"{musan_cuts_path} already exists - skipping")
@ -79,13 +89,13 @@ def compute_fbank_musan():
.filter(lambda c: c.duration > 5)
.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_musan",
storage_path=f"{output_dir}/musan_feats",
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=ChunkedLilcomHdf5Writer,
storage_type=LilcomChunkyWriter,
)
)
musan_cuts.to_json(musan_cuts_path)
musan_cuts.to_file(musan_cuts_path)
if __name__ == "__main__":
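
The renamed musan_cuts.jsonl.gz is what the data modules above hand to CutMix for noise augmentation. An illustrative sketch (the prob and snr values here are typical choices, not taken from this commit):

from lhotse import load_manifest_lazy
from lhotse.dataset import CutMix

cuts_musan = load_manifest_lazy("data/fbank/musan_cuts.jsonl.gz")
# Mix MUSAN noise into roughly half the cuts at 10-20 dB SNR.
transform = CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))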

View File

@ -25,19 +25,19 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
path = "./data/fbank/cuts_train-clean-100.json.gz"
path = "./data/fbank/cuts_train-clean-360.json.gz"
path = "./data/fbank/cuts_train-other-500.json.gz"
path = "./data/fbank/cuts_dev-clean.json.gz"
path = "./data/fbank/cuts_dev-other.json.gz"
path = "./data/fbank/cuts_test-clean.json.gz"
path = "./data/fbank/cuts_test-other.json.gz"
# path = "./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
# path = "./data/fbank/librispeech_cuts_train-clean-360.jsonl.gz"
# path = "./data/fbank/librispeech_cuts_train-other-500.jsonl.gz"
# path = "./data/fbank/librispeech_cuts_dev-clean.jsonl.gz"
# path = "./data/fbank/librispeech_cuts_dev-other.jsonl.gz"
# path = "./data/fbank/librispeech_cuts_test-clean.jsonl.gz"
path = "./data/fbank/librispeech_cuts_test-other.jsonl.gz"
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()

View File

@ -58,17 +58,19 @@ def preprocess_giga_speech():
)
logging.info("Loading manifest (may take 4 minutes)")
prefix = "gigaspeech"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix="gigaspeech",
suffix="jsonl.gz",
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
for partition, m in manifests.items():
logging.info(f"Processing {partition}")
raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
if raw_cuts_path.is_file():
logging.info(f"{partition} already exists - skipping")
continue

View File

@ -25,7 +25,7 @@ We will add more checks later if needed.
Usage example:
python3 ./local/validate_manifest.py \
./data/fbank/cuts_train-clean-100.json.gz
./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
"""
@ -33,7 +33,7 @@ import argparse
import logging
from pathlib import Path
from lhotse import load_manifest, CutSet
from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut
@ -76,7 +76,7 @@ def main():
logging.info(f"Validating {manifest}")
assert manifest.is_file(), f"{manifest} does not exist"
cut_set = load_manifest(manifest)
cut_set = load_manifest_lazy(manifest)
assert isinstance(cut_set, CutSet)
for c in cut_set:

View File

@ -40,9 +40,9 @@ dl_dir=$PWD/download
# It will generate data/lang_bpe_xxx,
# data/lang_bpe_yyy if the array contains xxx, yyy
vocab_sizes=(
5000
2000
1000
# 5000
# 2000
# 1000
500
)
@ -132,7 +132,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
)
for part in ${parts[@]}; do
python3 ./local/validate_manifest.py \
data/fbank/cuts_${part}.json.gz
data/fbank/librispeech_cuts_${part}.jsonl.gz
done
touch data/fbank/.librispeech-validated.done
fi

View File

@ -124,9 +124,9 @@ fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Split XL subset into ${num_splits} pieces"
split_dir=data/fbank/XL_split_${num_splits}
split_dir=data/fbank/gigaspeech_XL_split_${num_splits}
if [ ! -f $split_dir/.split_completed ]; then
lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir $chunk_size
lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size
touch $split_dir/.split_completed
fi
fi

View File

@ -807,28 +807,8 @@ def run(rank, world_size, args):
# the threshold
return 1.0 <= c.duration <= 20.0
num_in_total = len(train_cuts)
train_cuts = train_cuts.filter(remove_short_and_long_utt)
try:
num_left = len(train_cuts)
num_removed = num_in_total - num_left
removed_percent = num_removed / num_in_total * 100
logging.info(
f"Before removing short and long utterances: {num_in_total}"
)
logging.info(f"After removing short and long utterances: {num_left}")
logging.info(
f"Removed {num_removed} utterances ({removed_percent:.5f}%)"
)
except TypeError as e:
# You can ignore this error as previous versions of Lhotse work fine
# for the above code. In recent versions of Lhotse, it uses
# lazy filter, producing cutsets that don't have the __len__ method
logging.info(str(e))
if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
# We only load the sampler's state dict when it loads a checkpoint
# saved in the middle of an epoch
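
The deleted bookkeeping depended on len(train_cuts), which lazy cut sets no longer provide: filter() returns a lazily evaluated CutSet, so the kept/removed counts cannot be known without consuming the iterator. A small sketch of the behavior this commit works around:

from lhotse import load_manifest_lazy

cuts = load_manifest_lazy(
    "data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
)
cuts = cuts.filter(lambda c: 1.0 <= c.duration <= 20.0)
try:
    n = len(cuts)
except TypeError:
    # Lazy cut sets define no __len__; iterate them, or call
    # cuts.to_eager() if a count is really needed.
    pass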

View File

@ -22,7 +22,6 @@ from typing import Optional
from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import (
BucketingSampler,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
@ -71,8 +70,7 @@ class AsrDataModule:
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler "
"and DynamicBucketingSampler."
help="The number of buckets for the DynamicBucketingSampler. "
"(you might want to increase it for larger datasets).",
)
@ -152,7 +150,6 @@ class AsrDataModule:
def train_dataloaders(
self,
cuts_train: CutSet,
dynamic_bucketing: bool,
on_the_fly_feats: bool,
cuts_musan: Optional[CutSet] = None,
) -> DataLoader:
@ -162,9 +159,6 @@ class AsrDataModule:
Cuts for training.
cuts_musan:
If not None, it is the cuts for mixing.
dynamic_bucketing:
True to use DynamicBucketingSampler;
False to use BucketingSampler.
on_the_fly_feats:
True to use OnTheFlyFeatures;
False to use PrecomputedFeatures.
@ -230,7 +224,6 @@ class AsrDataModule:
return_cuts=self.args.return_cuts,
)
if dynamic_bucketing:
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
@ -239,16 +232,6 @@ class AsrDataModule:
num_buckets=self.args.num_buckets,
drop_last=True,
)
else:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
logging.info("About to create train dataloader")
train_dl = DataLoader(

View File

@ -22,7 +22,7 @@ import re
from pathlib import Path
import lhotse
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
class GigaSpeech:
@ -32,13 +32,13 @@ class GigaSpeech:
manifest_dir:
It is expected to contain the following files::
- XL_split_2000/cuts_XL.*.jsonl.gz
- cuts_L_raw.jsonl.gz
- cuts_M_raw.jsonl.gz
- cuts_S_raw.jsonl.gz
- cuts_XS_raw.jsonl.gz
- cuts_DEV_raw.jsonl.gz
- cuts_TEST_raw.jsonl.gz
- gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz
- gigaspeech_cuts_L_raw.jsonl.gz
- gigaspeech_cuts_M_raw.jsonl.gz
- gigaspeech_cuts_S_raw.jsonl.gz
- gigaspeech_cuts_XS_raw.jsonl.gz
- gigaspeech_cuts_DEV_raw.jsonl.gz
- gigaspeech_cuts_TEST_raw.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
@ -46,10 +46,12 @@ class GigaSpeech:
logging.info("About to get train-XL cuts")
filenames = list(
glob.glob(f"{self.manifest_dir}/XL_split_2000/cuts_XL.*.jsonl.gz")
glob.glob(
f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz" # noqa
)
)
pattern = re.compile(r"cuts_XL.([0-9]+).jsonl.gz")
pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz")
idx_filenames = [
(int(pattern.search(f).group(1)), f) for f in filenames
]
@ -64,31 +66,31 @@ class GigaSpeech:
)
def train_L_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_L_raw.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_L_raw.jsonl.gz"
logging.info(f"About to get train-L cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_M_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_M_raw.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_M_raw.jsonl.gz"
logging.info(f"About to get train-M cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_S_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_S_raw.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_S_raw.jsonl.gz"
logging.info(f"About to get train-S cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_XS_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XS_raw.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_XS_raw.jsonl.gz"
logging.info(f"About to get train-XS cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_TEST.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz"
logging.info(f"About to get TEST cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def dev_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_DEV.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz"
logging.info(f"About to get DEV cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
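
Both helpers used above return a lazily opened CutSet; the difference is only in dispatch. A hedged sketch (paths are illustrative):

from lhotse import CutSet, load_manifest_lazy

# from_jsonl_lazy assumes the file contains cuts; load_manifest_lazy
# infers the manifest type from the file contents. Both stream from
# disk rather than materializing the manifest.
train_cuts = CutSet.from_jsonl_lazy("data/fbank/gigaspeech_cuts_S_raw.jsonl.gz")
dev_cuts = load_manifest_lazy("data/fbank/gigaspeech_cuts_DEV.jsonl.gz")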

View File

@ -18,7 +18,7 @@
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
class LibriSpeech:
@ -28,47 +28,47 @@ class LibriSpeech:
manifest_dir:
It is expected to contain the following files::
- cuts_dev-clean.json.gz
- cuts_dev-other.json.gz
- cuts_test-clean.json.gz
- cuts_test-other.json.gz
- cuts_train-clean-100.json.gz
- cuts_train-clean-360.json.gz
- cuts_train-other-500.json.gz
- librispeech_cuts_dev-clean.jsonl.gz
- librispeech_cuts_dev-other.jsonl.gz
- librispeech_cuts_test-clean.jsonl.gz
- librispeech_cuts_test-other.jsonl.gz
- librispeech_cuts_train-clean-100.jsonl.gz
- librispeech_cuts_train-clean-360.jsonl.gz
- librispeech_cuts_train-other-500.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_clean_100_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-100.json.gz"
f = self.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
logging.info(f"About to get train-clean-100 cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def train_clean_360_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-360.json.gz"
f = self.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
logging.info(f"About to get train-clean-360 cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def train_other_500_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-other-500.json.gz"
f = self.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
logging.info(f"About to get train-other-500 cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def test_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-clean.json.gz"
f = self.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
logging.info(f"About to get test-clean cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def test_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-other.json.gz"
f = self.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
logging.info(f"About to get test-other cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def dev_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-clean.json.gz"
f = self.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
logging.info(f"About to get dev-clean cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def dev_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-other.json.gz"
f = self.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
logging.info(f"About to get dev-other cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)

View File

@ -66,7 +66,7 @@ from conformer import Conformer
from decoder import Decoder
from gigaspeech import GigaSpeech
from joiner import Joiner
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut
from lhotse.dataset.sampling.base import CutSampler
from lhotse.utils import fix_random_seed
@ -968,8 +968,8 @@ def run(rank, world_size, args):
train_giga_cuts = train_giga_cuts.repeat(times=None)
if args.enable_musan:
cuts_musan = load_manifest(
Path(args.manifest_dir) / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
)
else:
cuts_musan = None
@ -978,14 +978,12 @@ def run(rank, world_size, args):
train_dl = asr_datamodule.train_dataloaders(
train_cuts,
dynamic_bucketing=False,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
giga_train_dl = asr_datamodule.train_dataloaders(
train_giga_cuts,
dynamic_bucketing=True,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
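
The MUSAN manifest loaded above is handed to the data module, which feeds it to a CutMix transform. A minimal sketch of that hand-off, mirroring the CutMix call used elsewhere in this commit:

from lhotse import load_manifest_lazy
from lhotse.dataset import CutMix

cuts_musan = load_manifest_lazy("data/fbank/musan_cuts.jsonl.gz")
# Mix MUSAN noise into half of the training cuts at 10-20 dB SNR.
transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)]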

View File

@ -24,7 +24,7 @@ from pathlib import Path
from typing import Any, Dict, Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
CutConcatenate,
CutMix,
@ -224,8 +224,8 @@ class LibriSpeechAsrDataModule:
if self.args.enable_musan:
logging.info("Enable MUSAN")
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms.append(
CutMix(
@ -407,40 +407,48 @@ class LibriSpeechAsrDataModule:
@lru_cache()
def train_clean_100_cuts(self) -> CutSet:
logging.info("About to get train-clean-100 cuts")
return load_manifest(
self.args.manifest_dir / "cuts_train-clean-100.json.gz"
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
)
@lru_cache()
def train_clean_360_cuts(self) -> CutSet:
logging.info("About to get train-clean-360 cuts")
return load_manifest(
self.args.manifest_dir / "cuts_train-clean-360.json.gz"
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
)
@lru_cache()
def train_other_500_cuts(self) -> CutSet:
logging.info("About to get train-other-500 cuts")
return load_manifest(
self.args.manifest_dir / "cuts_train-other-500.json.gz"
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
)
@lru_cache()
def dev_clean_cuts(self) -> CutSet:
logging.info("About to get dev-clean cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
)
@lru_cache()
def dev_other_cuts(self) -> CutSet:
logging.info("About to get dev-other cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
)
@lru_cache()
def test_clean_cuts(self) -> CutSet:
logging.info("About to get test-clean cuts")
return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
)
@lru_cache()
def test_other_cuts(self) -> CutSet:
logging.info("About to get test-other cuts")
return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
)
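
Each accessor pairs @lru_cache with load_manifest_lazy, so a manifest is opened at most once per process yet never fully parsed. A minimal sketch of the pattern outside the class (the path is illustrative):

from functools import lru_cache

from lhotse import CutSet, load_manifest_lazy

@lru_cache()
def dev_clean_cuts() -> CutSet:
    # Repeated calls return the same lazily opened CutSet object.
    return load_manifest_lazy("data/fbank/librispeech_cuts_dev-clean.jsonl.gz")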

View File

@ -16,6 +16,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./tdnn_lstm_ctc/train.py \
--world-size 4 \
--full-libri 1 \
--max-duration 300 \
--num-epochs 20
"""
import argparse
import logging
@ -29,6 +38,7 @@ import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from asr_datamodule import LibriSpeechAsrDataModule
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed
from model import TdnnLstm
from torch import Tensor
@ -544,10 +554,25 @@ def run(rank, world_size, args):
if params.full_libri:
train_cuts += librispeech.train_clean_360_cuts()
train_cuts += librispeech.train_other_500_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
train_cuts = train_cuts.filter(remove_short_and_long_utt)
train_dl = librispeech.train_dataloaders(train_cuts)
valid_cuts = librispeech.dev_clean_cuts()
valid_cuts += librispeech.dev_other_cuts()
valid_dl = librispeech.valid_dataloaders(valid_cuts)
for epoch in range(params.start_epoch, params.num_epochs):
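
The 1-20 second thresholds come from the duration distribution of the corpus. A minimal sketch of inspecting that distribution, following what ../local/display_manifest_statistics.py does (the path is illustrative):

from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/librispeech_cuts_train-clean-100.jsonl.gz")
# Prints duration statistics (min/max/percentiles) used to pick the
# filtering thresholds above.
cuts.describe()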

View File

@ -44,8 +44,8 @@ from pathlib import Path
import sentencepiece as spm
import torch
from alignment import get_word_starting_frames
from lhotse import CutSet, load_manifest
from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler
from lhotse import CutSet, load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler, K2SpeechRecognitionDataset
from lhotse.dataset.collation import collate_custom_field
@ -93,14 +93,15 @@ def main():
sp = spm.SentencePieceProcessor()
sp.load(args.bpe_model)
cuts_json = args.ali_dir / f"cuts_{args.dataset}.json.gz"
cuts_jsonl = args.ali_dir / f"librispeech_cuts_{args.dataset}.jsonl.gz"
logging.info(f"Loading {cuts_json}")
cuts = load_manifest(cuts_json)
logging.info(f"Loading {cuts_jsonl}")
cuts = load_manifest_lazy(cuts_jsonl)
sampler = SingleCutSampler(
sampler = DynamicBucketingSampler(
cuts,
max_duration=30,
num_buckets=30,
shuffle=False,
)
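
A hedged sketch of how the sampler above is consumed, mirroring the lhotse dataloading pattern used throughout these recipes; sampler and K2SpeechRecognitionDataset refer to the objects imported and constructed above:

from torch.utils.data import DataLoader

dataset = K2SpeechRecognitionDataset(return_cuts=True)
# batch_size=None because the sampler already yields whole batches
# of cuts rather than individual indexes.
dl = DataLoader(dataset, sampler=sampler, batch_size=None, num_workers=2)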

View File

@ -1,333 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang,
# Mingshuang Luo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import inspect
import logging
from pathlib import Path
from typing import Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import (
BucketingSampler,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
SpecAugment,
)
from lhotse.dataset.input_strategies import (
OnTheFlyFeatures,
PrecomputedFeatures,
)
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader
from icefall.utils import str2bool
class _SeedWorkers:
def __init__(self, seed: int):
self.seed = seed
def __call__(self, worker_id: int):
fix_random_seed(self.seed + worker_id)
class AsrDataModule:
def __init__(self, args: argparse.Namespace):
self.args = args
@classmethod
def add_arguments(cls, parser: argparse.ArgumentParser):
group = parser.add_argument_group(
title="ASR data related options",
description="These options are used for the preparation of "
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
"effective batch sizes, sampling strategies, applied data "
"augmentations, etc.",
)
group.add_argument(
"--max-duration",
type=int,
default=200.0,
help="Maximum pooled recordings duration (seconds) in a "
"single batch. You can reduce it if it causes CUDA OOM.",
)
group.add_argument(
"--bucketing-sampler",
type=str2bool,
default=True,
help="When enabled, the batches will come from buckets of "
"similar duration (saves padding frames).",
)
group.add_argument(
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler "
"and DynamicBucketingSampler."
"(you might want to increase it for larger datasets).",
)
group.add_argument(
"--shuffle",
type=str2bool,
default=True,
help="When enabled (=default), the examples will be "
"shuffled for each epoch.",
)
group.add_argument(
"--return-cuts",
type=str2bool,
default=True,
help="When enabled, each batch will have the "
"field: batch['supervisions']['cut'] with the cuts that "
"were used to construct it.",
)
group.add_argument(
"--num-workers",
type=int,
default=2,
help="The number of training dataloader workers that "
"collect the batches.",
)
group.add_argument(
"--enable-spec-aug",
type=str2bool,
default=True,
help="When enabled, use SpecAugment for training dataset.",
)
group.add_argument(
"--spec-aug-time-warp-factor",
type=int,
default=80,
help="Used only when --enable-spec-aug is True. "
"It specifies the factor for time warping in SpecAugment. "
"Larger values mean more warping. "
"A value less than 1 means to disable time warp.",
)
group.add_argument(
"--enable-musan",
type=str2bool,
default=True,
help="When enabled, select noise from MUSAN and mix it"
"with training dataset. ",
)
group.add_argument(
"--manifest-dir",
type=Path,
default=Path("data/fbank"),
help="Path to directory with train/valid/test cuts.",
)
group.add_argument(
"--on-the-fly-feats",
type=str2bool,
default=False,
help="When enabled, use on-the-fly cut mixing and feature "
"extraction. Will drop existing precomputed feature manifests "
"if available. Used only in dev/test CutSet",
)
def train_dataloaders(
self,
cuts_train: CutSet,
dynamic_bucketing: bool,
on_the_fly_feats: bool,
cuts_musan: Optional[CutSet] = None,
) -> DataLoader:
"""
Args:
cuts_train:
Cuts for training.
cuts_musan:
If not None, it is the cuts for mixing.
dynamic_bucketing:
True to use DynamicBucketingSampler;
False to use BucketingSampler.
on_the_fly_feats:
True to use OnTheFlyFeatures;
False to use PrecomputedFeatures.
"""
transforms = []
if cuts_musan is not None:
logging.info("Enable MUSAN")
transforms.append(
CutMix(
cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True
)
)
else:
logging.info("Disable MUSAN")
input_transforms = []
if self.args.enable_spec_aug:
logging.info("Enable SpecAugment")
logging.info(
f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
)
# Set the value of num_frame_masks according to Lhotse's version.
# In different Lhotse's versions, the default of num_frame_masks is
# different.
num_frame_masks = 10
num_frame_masks_parameter = inspect.signature(
SpecAugment.__init__
).parameters["num_frame_masks"]
if num_frame_masks_parameter.default == 1:
num_frame_masks = 2
logging.info(f"Num frame mask: {num_frame_masks}")
input_transforms.append(
SpecAugment(
time_warp_factor=self.args.spec_aug_time_warp_factor,
num_frame_masks=num_frame_masks,
features_mask_size=27,
num_feature_masks=2,
frames_mask_size=100,
)
)
else:
logging.info("Disable SpecAugment")
logging.info("About to create train dataset")
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
)
# NOTE: the PerturbSpeed transform should be added only if we
# remove it from data prep stage.
# Add on-the-fly speed perturbation; since originally it would
# have increased epoch size by 3, we will apply prob 2/3 and use
# 3x more epochs.
# Speed perturbation probably should come first before
# concatenation, but in principle the transforms order doesn't have
# to be strict (e.g. could be randomized)
# transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa
# Drop feats to be on the safe side.
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=(
OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
if on_the_fly_feats
else PrecomputedFeatures()
),
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
)
if dynamic_bucketing:
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
drop_last=True,
)
else:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
logging.info("About to create train dataloader")
# 'seed' is derived from the current random state, which will have
# previously been set in the main process.
seed = torch.randint(0, 100000, ()).item()
worker_init_fn = _SeedWorkers(seed)
train_dl = DataLoader(
train,
sampler=train_sampler,
batch_size=None,
num_workers=self.args.num_workers,
persistent_workers=False,
worker_init_fn=worker_init_fn,
)
return train_dl
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
transforms = []
logging.info("About to create dev dataset")
if self.args.on_the_fly_feats:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=OnTheFlyFeatures(
Fbank(FbankConfig(num_mel_bins=80))
),
return_cuts=self.args.return_cuts,
)
else:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.info("About to create dev dataloader")
valid_dl = DataLoader(
validate,
sampler=valid_sampler,
batch_size=None,
num_workers=2,
persistent_workers=False,
)
return valid_dl
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
logging.debug("About to create test dataset")
test = K2SpeechRecognitionDataset(
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
if self.args.on_the_fly_feats
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
test,
batch_size=None,
sampler=sampler,
num_workers=self.args.num_workers,
)
return test_dl

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/asr_datamodule.py

View File

@ -1,75 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
class GigaSpeech:
def __init__(self, manifest_dir: str):
"""
Args:
manifest_dir:
It is expected to contain the following files::
- cuts_XL_raw.jsonl.gz
- cuts_L_raw.jsonl.gz
- cuts_M_raw.jsonl.gz
- cuts_S_raw.jsonl.gz
- cuts_XS_raw.jsonl.gz
- cuts_DEV_raw.jsonl.gz
- cuts_TEST_raw.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_XL_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XL_raw.jsonl.gz"
logging.info(f"About to get train-XL cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_L_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_L_raw.jsonl.gz"
logging.info(f"About to get train-L cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_M_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_M_raw.jsonl.gz"
logging.info(f"About to get train-M cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_S_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_S_raw.jsonl.gz"
logging.info(f"About to get train-S cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_XS_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XS_raw.jsonl.gz"
logging.info(f"About to get train-XS cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_TEST.jsonl.gz"
logging.info(f"About to get TEST cuts from {f}")
return load_manifest(f)
def dev_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_DEV.jsonl.gz"
logging.info(f"About to get DEV cuts from {f}")
return load_manifest(f)

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/gigaspeech.py

View File

@ -1,74 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
class LibriSpeech:
def __init__(self, manifest_dir: str):
"""
Args:
manifest_dir:
It is expected to contain the following files::
- cuts_dev-clean.json.gz
- cuts_dev-other.json.gz
- cuts_test-clean.json.gz
- cuts_test-other.json.gz
- cuts_train-clean-100.json.gz
- cuts_train-clean-360.json.gz
- cuts_train-other-500.json.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_clean_100_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-100.json.gz"
logging.info(f"About to get train-clean-100 cuts from {f}")
return load_manifest(f)
def train_clean_360_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-360.json.gz"
logging.info(f"About to get train-clean-360 cuts from {f}")
return load_manifest(f)
def train_other_500_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-other-500.json.gz"
logging.info(f"About to get train-other-500 cuts from {f}")
return load_manifest(f)
def test_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-clean.json.gz"
logging.info(f"About to get test-clean cuts from {f}")
return load_manifest(f)
def test_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-other.json.gz"
logging.info(f"About to get test-other cuts from {f}")
return load_manifest(f)
def dev_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-clean.json.gz"
logging.info(f"About to get dev-clean cuts from {f}")
return load_manifest(f)
def dev_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-other.json.gz"
logging.info(f"About to get dev-other cuts from {f}")
return load_manifest(f)

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/librispeech.py

View File

@ -28,7 +28,7 @@ from pathlib import Path
from asr_datamodule import AsrDataModule
from gigaspeech import GigaSpeech
from lhotse import load_manifest
from lhotse import load_manifest_lazy
from librispeech import LibriSpeech
@ -41,8 +41,8 @@ def test_dataset():
print(args)
if args.enable_musan:
cuts_musan = load_manifest(
Path(args.manifest_dir) / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
)
else:
cuts_musan = None
@ -57,14 +57,12 @@ def test_dataset():
libri_train_dl = asr_datamodule.train_dataloaders(
train_clean_100,
dynamic_bucketing=False,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
giga_train_dl = asr_datamodule.train_dataloaders(
train_S,
dynamic_bucketing=True,
on_the_fly_feats=True,
cuts_musan=cuts_musan,
)

View File

@ -73,7 +73,7 @@ from conformer import Conformer
from decoder import Decoder
from gigaspeech import GigaSpeech
from joiner import Joiner
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed
from librispeech import LibriSpeech
@ -662,19 +662,17 @@ def train_one_epoch(
def filter_short_and_long_utterances(cuts: CutSet) -> CutSet:
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
num_in_total = len(cuts)
cuts = cuts.filter(remove_short_and_long_utt)
num_left = len(cuts)
num_removed = num_in_total - num_left
removed_percent = num_removed / num_in_total * 100
logging.info(f"Before removing short and long utterances: {num_in_total}")
logging.info(f"After removing short and long utterances: {num_left}")
logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
return cuts
@ -767,17 +765,18 @@ def run(rank, world_size, args):
# DEV 12 hours
# Test 40 hours
if params.full_libri:
logging.info("Using the L subset of GigaSpeech (2.5k hours)")
train_giga_cuts = gigaspeech.train_L_cuts()
logging.info("Using the XL subset of GigaSpeech (10k hours)")
train_giga_cuts = gigaspeech.train_XL_cuts()
else:
logging.info("Using the S subset of GigaSpeech (250 hours)")
train_giga_cuts = gigaspeech.train_S_cuts()
train_giga_cuts = filter_short_and_long_utterances(train_giga_cuts)
train_giga_cuts = train_giga_cuts.repeat(times=None)
if args.enable_musan:
cuts_musan = load_manifest(
Path(args.manifest_dir) / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
)
else:
cuts_musan = None
@ -786,14 +785,12 @@ def run(rank, world_size, args):
train_dl = asr_datamodule.train_dataloaders(
train_cuts,
dynamic_bucketing=False,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
giga_train_dl = asr_datamodule.train_dataloaders(
train_giga_cuts,
dynamic_bucketing=True,
on_the_fly_feats=True,
cuts_musan=cuts_musan,
)

View File

@ -22,7 +22,7 @@ from pathlib import Path
from typing import Any, Dict, Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
CutConcatenate,
CutMix,
@ -176,7 +176,7 @@ class SPGISpeechAsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "cuts_musan.jsonl.gz"
)

View File

@ -52,8 +52,13 @@ def compute_fbank_tedlium():
"test",
)
prefix = "tedlium"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="tedlium", dataset_parts=dataset_parts, output_dir=src_dir
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -61,7 +66,7 @@ def compute_fbank_tedlium():
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -80,7 +85,7 @@ def compute_fbank_tedlium():
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=cur_num_jobs,
executor=ex,
@ -88,7 +93,7 @@ def compute_fbank_tedlium():
)
# Split long cuts into many short and un-overlapping cuts
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
if __name__ == "__main__":
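
With prefix and suffix supplied, read_manifests_if_cached looks for per-part files following lhotse's naming convention. A hedged sketch of the call and the filenames it expects (the output directory is illustrative):

from lhotse.recipes.utils import read_manifests_if_cached

# Expects e.g. data/manifests/tedlium_recordings_train.jsonl.gz and
# data/manifests/tedlium_supervisions_train.jsonl.gz for each part.
manifests = read_manifests_if_cached(
    dataset_parts=("train", "dev", "test"),
    output_dir="data/manifests",
    prefix="tedlium",
    suffix="jsonl.gz",
)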

View File

@ -27,15 +27,15 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
path = "./data/fbank/cuts_train.json.gz"
path = "./data/fbank/cuts_dev.json.gz"
path = "./data/fbank/cuts_test.json.gz"
path = "./data/fbank/tedlium_cuts_train.jsonl.gz"
path = "./data/fbank/tedlium_cuts_dev.jsonl.gz"
path = "./data/fbank/tedlium_cuts_test.jsonl.gz"
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()

View File

@ -22,11 +22,11 @@ import logging
from functools import lru_cache
from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SingleCutSampler,
@ -92,7 +92,7 @@ class TedLiumAsrDataModule:
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler"
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
@ -179,8 +179,8 @@ class TedLiumAsrDataModule:
transforms = []
if self.args.enable_musan:
logging.info("Enable MUSAN")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms.append(
CutMix(
@ -261,13 +261,12 @@ class TedLiumAsrDataModule:
)
if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
else:
@ -311,7 +310,7 @@ class TedLiumAsrDataModule:
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
@ -335,8 +334,10 @@ class TedLiumAsrDataModule:
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
@ -350,14 +351,20 @@ class TedLiumAsrDataModule:
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_train.jsonl.gz"
)
@lru_cache()
def dev_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_dev.jsonl.gz"
)
@lru_cache()
def test_cuts(self) -> CutSet:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_test.jsonl.gz"
)

View File

@ -29,7 +29,7 @@ import os
from pathlib import Path
import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@ -53,8 +53,13 @@ def compute_fbank_timit():
"DEV",
"TEST",
)
prefix = "timit"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="timit", dataset_parts=dataset_parts, output_dir=src_dir
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -62,7 +67,8 @@ def compute_fbank_timit():
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_file.is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -78,13 +84,13 @@ def compute_fbank_timit():
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=LilcomHdf5Writer,
storage_type=LilcomChunkyWriter,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(cuts_file)
if __name__ == "__main__":
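
LilcomChunkyWriter replaces LilcomHdf5Writer here: features for a partition go into a single chunked lilcom archive, which suits sequential reads from a lazy cutset and avoids the HDF5 dependency. A hedged sketch of the storage swap in isolation (the paths and the 80-bin config are assumptions):

from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter

cut_set = CutSet.from_file("data/manifests/timit_cuts_TRAIN.jsonl.gz")  # illustrative
extractor = Fbank(FbankConfig(num_mel_bins=80))  # 80 bins is an assumption
cut_set = cut_set.compute_and_store_features(
    extractor=extractor,
    storage_path="data/fbank/timit_feats_TRAIN",
    storage_type=LilcomChunkyWriter,
)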

View File

@ -23,11 +23,11 @@ from functools import lru_cache
from pathlib import Path
from typing import List, Union
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SingleCutSampler,
@ -92,7 +92,7 @@ class TimitAsrDataModule(DataModule):
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler"
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
@ -154,7 +154,9 @@ class TimitAsrDataModule(DataModule):
cuts_train = self.train_cuts()
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz")
cuts_musan = load_manifest_lazy(
self.args.feature_dir / "cuts_musan.jsonl.gz"
)
logging.info("About to create train dataset")
transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
@ -218,13 +220,12 @@ class TimitAsrDataModule(DataModule):
)
if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
else:
@ -322,20 +323,26 @@ class TimitAsrDataModule(DataModule):
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
cuts_train = load_manifest(self.args.feature_dir / "cuts_TRAIN.json.gz")
cuts_train = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_TRAIN.jsonl.gz"
)
return cuts_train
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
cuts_valid = load_manifest(self.args.feature_dir / "cuts_DEV.json.gz")
cuts_valid = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_DEV.jsonl.gz"
)
return cuts_valid
@lru_cache()
def test_cuts(self) -> CutSet:
logging.debug("About to get test cuts")
cuts_test = load_manifest(self.args.feature_dir / "cuts_TEST.json.gz")
cuts_test = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_TEST.jsonl.gz"
)
return cuts_test

View File

@ -26,7 +26,7 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
@ -40,7 +40,7 @@ def main():
for path in paths:
print(f"Starting display the statistics for {path}")
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()

View File

@ -27,7 +27,7 @@ from lhotse import (
CutSet,
Fbank,
FbankConfig,
load_manifest,
load_manifest_lazy,
set_caching_enabled,
)
from lhotse.dataset import (
@ -218,8 +218,8 @@ class WenetSpeechAsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms = []
@ -435,16 +435,18 @@ class WenetSpeechAsrDataModule:
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
@lru_cache()
def test_net_cuts(self) -> List[CutSet]:
logging.info("About to get TEST_NET cuts")
return load_manifest(self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz")
return load_manifest_lazy(
self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz"
)
@lru_cache()
def test_meeting_cuts(self) -> List[CutSet]:
logging.info("About to get TEST_MEETING cuts")
return load_manifest(
return load_manifest_lazy(
self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz"
)

View File

@ -12,7 +12,7 @@ import os
from pathlib import Path
import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@ -37,10 +37,13 @@ def compute_fbank_yesno():
"train",
"test",
)
prefix = "yesno"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix="yesno",
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -50,7 +53,8 @@ def compute_fbank_yesno():
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_file.is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -66,13 +70,13 @@ def compute_fbank_yesno():
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 1, # use one job
executor=ex,
storage_type=LilcomHdf5Writer,
storage_type=LilcomChunkyWriter,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(cuts_file)
if __name__ == "__main__":

View File

@ -20,18 +20,19 @@ from functools import lru_cache
from pathlib import Path
from typing import List
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
CutConcatenate,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SingleCutSampler,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from torch.utils.data import DataLoader
from icefall.dataset.datamodule import DataModule
from icefall.utils import str2bool
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
class YesNoAsrDataModule(DataModule):
@ -84,7 +85,7 @@ class YesNoAsrDataModule(DataModule):
"--num-buckets",
type=int,
default=10,
help="The number of buckets for the BucketingSampler"
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
@ -186,18 +187,17 @@ class YesNoAsrDataModule(DataModule):
)
if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
else:
logging.info("Using SingleCutSampler.")
train_sampler = BucketingSampler(
train_sampler = SingleCutSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
@ -225,8 +225,10 @@ class YesNoAsrDataModule(DataModule):
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts_test, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts_test,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
@ -240,11 +242,15 @@ class YesNoAsrDataModule(DataModule):
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
cuts_train = load_manifest(self.args.feature_dir / "cuts_train.json.gz")
cuts_train = load_manifest_lazy(
self.args.feature_dir / "yesno_cuts_train.jsonl.gz"
)
return cuts_train
@lru_cache()
def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts")
cuts_test = load_manifest(self.args.feature_dir / "cuts_test.json.gz")
cuts_test = load_manifest_lazy(
self.args.feature_dir / "yesno_cuts_test.jsonl.gz"
)
return cuts_test

View File

@ -131,7 +131,6 @@ def setup_logger(
format=formatter,
level=level,
filemode="w",
force=True,
)
if use_console:
console = logging.StreamHandler()
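
force=True was only added to logging.basicConfig in Python 3.8, hence its removal here. If the reset behavior were ever needed on older Pythons, a hedged sketch of an equivalent (not part of this commit):

import logging

# Emulate basicConfig(force=True) on Python < 3.8 by detaching any
# handlers installed by earlier configuration calls.
root = logging.getLogger()
for handler in list(root.handlers):
    root.removeHandler(handler)
logging.basicConfig(
    filename="train.log",  # illustrative
    format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
    level=logging.INFO,
    filemode="w",
)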