diff --git a/.github/workflows/run-gigaspeech-2022-05-13.yml b/.github/workflows/run-gigaspeech-2022-05-13.yml index d250b72b0..dc33751d3 100644 --- a/.github/workflows/run-gigaspeech-2022-05-13.yml +++ b/.github/workflows/run-gigaspeech-2022-05-13.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache diff --git a/.github/workflows/run-librispeech-2022-03-12.yml b/.github/workflows/run-librispeech-2022-03-12.yml index b18b84378..291f2bc71 100644 --- a/.github/workflows/run-librispeech-2022-03-12.yml +++ b/.github/workflows/run-librispeech-2022-03-12.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -99,7 +101,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-librispeech-2022-04-29.yml b/.github/workflows/run-librispeech-2022-04-29.yml index 6c8188b48..b04718f86 100644 --- a/.github/workflows/run-librispeech-2022-04-29.yml +++ b/.github/workflows/run-librispeech-2022-04-29.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -99,7 +101,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-librispeech-2022-05-13.yml b/.github/workflows/run-librispeech-2022-05-13.yml index 2290e18d4..bb3d74e55 100644 --- a/.github/workflows/run-librispeech-2022-05-13.yml +++ b/.github/workflows/run-librispeech-2022-05-13.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -99,7 +101,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml index 512f1b334..47976fc2c 100644 --- a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml +++ b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -99,7 +101,7 @@ jobs: with: path: | 
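Note on the CI changes above: `pip install --no-binary protobuf protobuf` forces a from-source (pure-Python) protobuf build to sidestep binary-wheel incompatibilities, and the `-v2` suffix on the cache key invalidates previously cached fbank features so they are regenerated under the new manifest naming introduced below. A quick way to check which protobuf backend actually got installed — a sketch, not part of the patch:

```python
# Not part of the patch: verify the protobuf backend after reinstalling
# with --no-binary (api_implementation is internal but long-standing).
from google.protobuf.internal import api_implementation

# "python" for the pure-Python source build; "cpp" or "upb" for binary wheels.
print(api_implementation.Type())
```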
~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml index 3864f4aa3..e05b04bee 100644 --- a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml +++ b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -99,7 +101,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml index f77d9e658..348a68095 100644 --- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml +++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml @@ -58,6 +58,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -98,7 +100,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml index ddfa62073..d1369c2b1 100644 --- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml +++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml @@ -58,6 +58,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -98,7 +100,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-pretrained-transducer-stateless.yml b/.github/workflows/run-pretrained-transducer-stateless.yml index cdea78a88..78c1ca059 100644 --- a/.github/workflows/run-pretrained-transducer-stateless.yml +++ b/.github/workflows/run-pretrained-transducer-stateless.yml @@ -58,6 +58,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -98,7 +100,7 @@ 
jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py index 5d00edeca..faebff2f6 100755 --- a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py +++ b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py @@ -43,7 +43,7 @@ torch.set_num_interop_threads(1) def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80): - src_dir = Path("data/manifests/aidatatang_200zh") + src_dir = Path("data/manifests") output_dir = Path("data/fbank") num_jobs = min(15, os.cpu_count()) @@ -52,11 +52,13 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80): "dev", "test", ) + prefix = "aidatatang" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="aidatatang", - suffix="jsonl.gz", dataset_parts=dataset_parts, output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -64,10 +66,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80): with get_executor() as ex: # Initialize the executor only once. for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") + + for sup in m["supervisions"]: + sup.custom = {"origin": "aidatatang_200zh"} + cut_set = CutSet.from_manifests( recordings=m["recordings"], supervisions=m["supervisions"], @@ -80,13 +86,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80): ) cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=num_jobs if ex is None else 80, executor=ex, storage_type=ChunkedLilcomHdf5Writer, ) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + + cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}") def get_args(): diff --git a/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py b/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py index 2352785ac..d66e5cfca 100644 --- a/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py +++ b/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py @@ -25,19 +25,19 @@ for usage. 
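The loop above tags every supervision with its corpus of origin. This matters later in the patch, when aidatatang_200zh cuts are mixed into aishell training and downstream code may need to tell the corpora apart. A minimal sketch of reading the tag back, using the manifest path this script writes:

```python
# Sketch: read back the origin tag written by compute_fbank_aidatatang_200zh.py.
from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/aidatatang_cuts_train.jsonl.gz")
for cut in cuts:
    # Every supervision was tagged above with sup.custom = {"origin": ...}.
    print(cut.supervisions[0].custom["origin"])  # -> "aidatatang_200zh"
    break
```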
""" -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): paths = [ - "./data/fbank/cuts_train.json.gz", - "./data/fbank/cuts_dev.json.gz", - "./data/fbank/cuts_test.json.gz", + "./data/fbank/aidatatang_cuts_train.jsonl.gz", + "./data/fbank/aidatatang_cuts_dev.jsonl.gz", + "./data/fbank/aidatatang_cuts_test.jsonl.gz", ] for path in paths: print(f"Starting display the statistics for {path}") - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() @@ -45,7 +45,7 @@ if __name__ == "__main__": main() """ -Starting display the statistics for ./data/fbank/cuts_train.json.gz +Starting display the statistics for ./data/fbank/aidatatang_cuts_train.jsonl.gz Cuts count: 494715 Total duration (hours): 422.6 Speech duration (hours): 422.6 (100.0%) @@ -61,7 +61,7 @@ min 1.0 99.5% 8.0 99.9% 9.5 max 18.1 -Starting display the statistics for ./data/fbank/cuts_dev.json.gz +Starting display the statistics for ./data/fbank/aidatatang_cuts_dev.jsonl.gz Cuts count: 24216 Total duration (hours): 20.2 Speech duration (hours): 20.2 (100.0%) @@ -77,7 +77,7 @@ min 1.2 99.5% 7.3 99.9% 8.8 max 11.3 -Starting display the statistics for ./data/fbank/cuts_test.json.gz +Starting display the statistics for ./data/fbank/aidatatang_cuts_test.jsonl.gz Cuts count: 48144 Total duration (hours): 40.2 Speech duration (hours): 40.2 (100.0%) diff --git a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py index 447a011cb..728f7e3d0 100644 --- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -27,11 +27,10 @@ from lhotse import ( CutSet, Fbank, FbankConfig, - load_manifest, + load_manifest_lazy, set_caching_enabled, ) from lhotse.dataset import ( - BucketingSampler, CutConcatenate, CutMix, DynamicBucketingSampler, @@ -205,8 +204,8 @@ class Aidatatang_200zhAsrDataModule: The state dict for the training sampler. 
""" logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms = [] @@ -290,13 +289,12 @@ class Aidatatang_200zhAsrDataModule: ) if self.args.bucketing_sampler: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( cuts_train, max_duration=self.args.max_duration, shuffle=self.args.shuffle, num_buckets=self.args.num_buckets, - bucket_method="equal_duration", drop_last=True, ) else: @@ -402,14 +400,20 @@ class Aidatatang_200zhAsrDataModule: @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - return load_manifest(self.args.manifest_dir / "cuts_train.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz" + ) @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz" + ) @lru_cache() def test_cuts(self) -> List[CutSet]: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_test.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz" + ) diff --git a/egs/aishell/ASR/conformer_ctc/train.py b/egs/aishell/ASR/conformer_ctc/train.py index 369ad310f..a228cc1fe 100755 --- a/egs/aishell/ASR/conformer_ctc/train.py +++ b/egs/aishell/ASR/conformer_ctc/train.py @@ -195,9 +195,9 @@ def get_params() -> AttributeDict: "best_train_epoch": -1, "best_valid_epoch": -1, "batch_idx_train": 0, - "log_interval": 10, + "log_interval": 50, "reset_interval": 200, - "valid_interval": 3000, + "valid_interval": 2000, # parameters for k2.ctc_loss "beam_size": 10, "reduction": "sum", diff --git a/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py b/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py new file mode 100755 index 000000000..8cdfad71f --- /dev/null +++ b/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file computes fbank features of the aidatatang_200zh dataset. +It looks for manifests in the directory data/manifests. + +The generated fbank features are saved in data/fbank. +""" + +import argparse +import logging +import os +from pathlib import Path + +import torch +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter +from lhotse.recipes.utils import read_manifests_if_cached + +from icefall.utils import get_executor + +# Torch's multithreaded behavior needs to be disabled or +# it wastes a lot of CPU and slow things down. 
+# Do this outside of main() in case it needs to take effect +# even when we are not invoking the main (e.g. when spawning subprocesses). +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + + +def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + num_jobs = min(15, os.cpu_count()) + + dataset_parts = ( + "train", + "test", + "dev", + ) + prefix = "aidatatang" + suffix = "jsonl.gz" + manifests = read_manifests_if_cached( + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, + ) + assert manifests is not None + + extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins)) + + with get_executor() as ex: # Initialize the executor only once. + for partition, m in manifests.items(): + if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file(): + logging.info(f"{partition} already exists - skipping.") + continue + logging.info(f"Processing {partition}") + + for sup in m["supervisions"]: + sup.custom = {"origin": "aidatatang_200zh"} + + cut_set = CutSet.from_manifests( + recordings=m["recordings"], + supervisions=m["supervisions"], + ) + if "train" in partition: + cut_set = ( + cut_set + + cut_set.perturb_speed(0.9) + + cut_set.perturb_speed(1.1) + ) + cut_set = cut_set.compute_and_store_features( + extractor=extractor, + storage_path=f"{output_dir}/{prefix}_feats_{partition}", + # when an executor is specified, make more partitions + num_jobs=num_jobs if ex is None else 80, + executor=ex, + storage_type=LilcomChunkyWriter, + ) + + cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}") + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--num-mel-bins", + type=int, + default=80, + help="""The number of mel bins for Fbank""", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + formatter = ( + "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + ) + + logging.basicConfig(format=formatter, level=logging.INFO) + + args = get_args() + compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins) diff --git a/egs/aishell/ASR/local/compute_fbank_aishell.py b/egs/aishell/ASR/local/compute_fbank_aishell.py index 70dee81d8..e27e35ec5 100755 --- a/egs/aishell/ASR/local/compute_fbank_aishell.py +++ b/egs/aishell/ASR/local/compute_fbank_aishell.py @@ -29,7 +29,7 @@ import os from pathlib import Path import torch -from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -52,8 +52,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80): "dev", "test", ) + prefix = "aishell" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="aishell", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -61,7 +66,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80): with get_executor() as ex: # Initialize the executor only once. 
for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") @@ -77,13 +82,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80): ) cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=num_jobs if ex is None else 80, executor=ex, - storage_type=LilcomHdf5Writer, + storage_type=LilcomChunkyWriter, ) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}") def get_args(): diff --git a/egs/aishell/ASR/local/display_manifest_statistics.py b/egs/aishell/ASR/local/display_manifest_statistics.py index 0ae731a1d..c478f7331 100755 --- a/egs/aishell/ASR/local/display_manifest_statistics.py +++ b/egs/aishell/ASR/local/display_manifest_statistics.py @@ -25,18 +25,18 @@ for usage. """ -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): - # path = "./data/fbank/cuts_train.json.gz" - # path = "./data/fbank/cuts_test.json.gz" - # path = "./data/fbank/cuts_dev.json.gz" - # path = "./data/fbank/aidatatang_200zh/cuts_train_raw.jsonl.gz" - # path = "./data/fbank/aidatatang_200zh/cuts_test_raw.jsonl.gz" - path = "./data/fbank/aidatatang_200zh/cuts_dev_raw.jsonl.gz" + # path = "./data/fbank/aishell_cuts_train.jsonl.gz" + # path = "./data/fbank/aishell_cuts_test.jsonl.gz" + path = "./data/fbank/aishell_cuts_dev.jsonl.gz" + # path = "./data/fbank/aidatatang_cuts_train.jsonl.gz" + # path = "./data/fbank/aidatatang_cuts_test.jsonl.gz" + # path = "./data/fbank/aidatatang_cuts_dev.jsonl.gz" - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() diff --git a/egs/aishell/ASR/local/process_aidatatang_200zh.py b/egs/aishell/ASR/local/process_aidatatang_200zh.py deleted file mode 100755 index ac2b86927..000000000 --- a/egs/aishell/ASR/local/process_aidatatang_200zh.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2022 Xiaomi Corp. (Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
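`LilcomChunkyWriter` replaces the HDF5-based writers throughout this patch (it produces the `.lca` files handled in the GigaSpeech split script below). Since each cut records its own storage backend and path, reading features back is unchanged; a sketch, assuming the aishell manifests written above:

```python
# Sketch: features written with LilcomChunkyWriter load transparently,
# because the storage type and path are recorded per cut in the manifest.
from lhotse import CutSet

cuts = CutSet.from_file("data/fbank/aishell_cuts_dev.jsonl.gz")
cut = next(iter(cuts))
feats = cut.load_features()  # numpy array of shape (num_frames, num_mel_bins)
print(feats.shape)
```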
- -import logging -from pathlib import Path - -from lhotse import CutSet -from lhotse.recipes.utils import read_manifests_if_cached - - -def preprocess_aidatatang_200zh(): - src_dir = Path("data/manifests/aidatatang_200zh") - output_dir = Path("data/fbank/aidatatang_200zh") - output_dir.mkdir(exist_ok=True, parents=True) - - dataset_parts = ( - "train", - "test", - "dev", - ) - - logging.info("Loading manifest") - manifests = read_manifests_if_cached( - dataset_parts=dataset_parts, output_dir=src_dir, prefix="aidatatang" - ) - assert len(manifests) > 0 - - for partition, m in manifests.items(): - logging.info(f"Processing {partition}") - raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz" - if raw_cuts_path.is_file(): - logging.info(f"{partition} already exists - skipping") - continue - - for sup in m["supervisions"]: - sup.custom = {"origin": "aidatatang_200zh"} - - cut_set = CutSet.from_manifests( - recordings=m["recordings"], - supervisions=m["supervisions"], - ) - - logging.info(f"Saving to {raw_cuts_path}") - cut_set.to_file(raw_cuts_path) - - -def main(): - formatter = ( - "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - ) - logging.basicConfig(format=formatter, level=logging.INFO) - - preprocess_aidatatang_200zh() - - -if __name__ == "__main__": - main() diff --git a/egs/aishell/ASR/prepare_aidatatang_200zh.sh b/egs/aishell/ASR/prepare_aidatatang_200zh.sh index 60b2060ec..f1d4d18a7 100755 --- a/egs/aishell/ASR/prepare_aidatatang_200zh.sh +++ b/egs/aishell/ASR/prepare_aidatatang_200zh.sh @@ -42,18 +42,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then log "Stage 1: Prepare manifest" # We assume that you have downloaded the aidatatang_200zh corpus # to $dl_dir/aidatatang_200zh - if [ ! -f data/manifests/aidatatang_200zh/.manifests.done ]; then - mkdir -p data/manifests/aidatatang_200zh - lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh - touch data/manifests/aidatatang_200zh/.manifests.done + if [ ! -f data/manifests/.aidatatang_200zh_manifests.done ]; then + mkdir -p data/manifests + lhotse prepare aidatatang-200zh $dl_dir data/manifests + touch data/manifests/.aidatatang_200zh_manifests.done fi fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then log "Stage 2: Process aidatatang_200zh" - if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then - mkdir -p data/fbank/aidatatang_200zh - lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh - touch data/fbank/aidatatang_200zh/.fbank.done + if [ ! 
-f data/fbank/.aidatatang_200zh_fbank.done ]; then + mkdir -p data/fbank + ./local/compute_fbank_aidatatang_200zh.py + touch data/fbank/.aidatatang_200zh_fbank.done fi fi diff --git a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py index 507db2933..e1021fda2 100644 --- a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py +++ b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py @@ -23,11 +23,11 @@ from functools import lru_cache from pathlib import Path from typing import List -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( - BucketingSampler, CutConcatenate, CutMix, + DynamicBucketingSampler, K2SpeechRecognitionDataset, PrecomputedFeatures, SingleCutSampler, @@ -93,7 +93,7 @@ class AishellAsrDataModule: "--num-buckets", type=int, default=30, - help="The number of buckets for the BucketingSampler" + help="The number of buckets for the DynamicBucketingSampler" "(you might want to increase it for larger datasets).", ) group.add_argument( @@ -133,6 +133,12 @@ class AishellAsrDataModule: help="When enabled (=default), the examples will be " "shuffled for each epoch.", ) + group.add_argument( + "--drop-last", + type=str2bool, + default=True, + help="Whether to drop last batch. Used by sampler.", + ) group.add_argument( "--return-cuts", type=str2bool, @@ -177,8 +183,8 @@ class AishellAsrDataModule: def train_dataloaders(self, cuts_train: CutSet) -> DataLoader: logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms = [] @@ -262,14 +268,13 @@ class AishellAsrDataModule: ) if self.args.bucketing_sampler: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( cuts_train, max_duration=self.args.max_duration, shuffle=self.args.shuffle, num_buckets=self.args.num_buckets, - bucket_method="equal_duration", - drop_last=True, + drop_last=self.args.drop_last, ) else: logging.info("Using SingleCutSampler.") @@ -313,7 +318,7 @@ class AishellAsrDataModule: cut_transforms=transforms, return_cuts=self.args.return_cuts, ) - valid_sampler = BucketingSampler( + valid_sampler = DynamicBucketingSampler( cuts_valid, max_duration=self.args.max_duration, shuffle=False, @@ -337,8 +342,10 @@ class AishellAsrDataModule: else PrecomputedFeatures(), return_cuts=self.args.return_cuts, ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, ) test_dl = DataLoader( test, @@ -351,17 +358,21 @@ class AishellAsrDataModule: @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - cuts_train = load_manifest( - self.args.manifest_dir / "cuts_train.json.gz" + cuts_train = load_manifest_lazy( + self.args.manifest_dir / "aishell_cuts_train.jsonl.gz" ) return cuts_train @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aishell_cuts_dev.jsonl.gz" + ) @lru_cache() def test_cuts(self) -> List[CutSet]: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / 
"cuts_test.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aishell_cuts_test.jsonl.gz" + ) diff --git a/egs/aishell/ASR/tdnn_lstm_ctc/train.py b/egs/aishell/ASR/tdnn_lstm_ctc/train.py index 3327cdb79..7619b0551 100755 --- a/egs/aishell/ASR/tdnn_lstm_ctc/train.py +++ b/egs/aishell/ASR/tdnn_lstm_ctc/train.py @@ -15,6 +15,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Usage + export CUDA_VISIBLE_DEVICES="0,1,2,3" + ./tdnn_lstm_ctc/train.py \ + --world-size 4 \ + --num-epochs 20 \ + --max-duration 300 +""" import argparse import logging diff --git a/egs/aishell/ASR/transducer_stateless/conformer.py b/egs/aishell/ASR/transducer_stateless/conformer.py index 149df92ab..7e8dc4ace 100644 --- a/egs/aishell/ASR/transducer_stateless/conformer.py +++ b/egs/aishell/ASR/transducer_stateless/conformer.py @@ -110,9 +110,7 @@ class Conformer(Transformer): x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) # Caution: We assume the subsampling factor is 4! - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - lengths = ((x_lens - 1) // 2 - 1) // 2 + lengths = (((x_lens - 1) >> 1) - 1) >> 1 assert x.size(0) == lengths.max().item() mask = make_pad_mask(lengths) diff --git a/egs/aishell/ASR/transducer_stateless/train.py b/egs/aishell/ASR/transducer_stateless/train.py index f615c78f4..21128318b 100755 --- a/egs/aishell/ASR/transducer_stateless/train.py +++ b/egs/aishell/ASR/transducer_stateless/train.py @@ -21,6 +21,7 @@ import argparse import logging +import warnings from pathlib import Path from shutil import copyfile from typing import Optional, Tuple @@ -386,7 +387,11 @@ def compute_loss( assert loss.requires_grad == is_training info = MetricsTracker() - info["frames"] = (feature_lens // params.subsampling_factor).sum().item() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + info["frames"] = ( + (feature_lens // params.subsampling_factor).sum().item() + ) # Note: We use reduction=sum while computing the loss. 
info["loss"] = loss.detach().cpu().item() diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py b/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py index 84ca64c89..26d4ee111 100644 --- a/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py +++ b/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py @@ -18,7 +18,7 @@ import logging from pathlib import Path -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy class AIDatatang200zh: @@ -28,26 +28,26 @@ class AIDatatang200zh: manifest_dir: It is expected to contain the following files:: - - cuts_dev_raw.jsonl.gz - - cuts_train_raw.jsonl.gz - - cuts_test_raw.jsonl.gz + - aidatatang_cuts_dev.jsonl.gz + - aidatatang_cuts_train.jsonl.gz + - aidatatang_cuts_test.jsonl.gz """ self.manifest_dir = Path(manifest_dir) def train_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train_raw.jsonl.gz" + f = self.manifest_dir / "aidatatang_cuts_train.jsonl.gz" logging.info(f"About to get train cuts from {f}") - cuts_train = load_manifest(f) + cuts_train = load_manifest_lazy(f) return cuts_train def valid_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_valid_raw.jsonl.gz" + f = self.manifest_dir / "aidatatang_cuts_valid.jsonl.gz" logging.info(f"About to get valid cuts from {f}") - cuts_valid = load_manifest(f) + cuts_valid = load_manifest_lazy(f) return cuts_valid def test_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test_raw.jsonl.gz" + f = self.manifest_dir / "aidatatang_cuts_test.jsonl.gz" logging.info(f"About to get test cuts from {f}") - cuts_test = load_manifest(f) + cuts_test = load_manifest_lazy(f) return cuts_test diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py b/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py index 94d1da066..ddeca4d88 100644 --- a/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py +++ b/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py @@ -18,7 +18,7 @@ import logging from pathlib import Path -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy class AIShell: @@ -28,26 +28,26 @@ class AIShell: manifest_dir: It is expected to contain the following files:: - - cuts_dev.json.gz - - cuts_train.json.gz - - cuts_test.json.gz + - aishell_cuts_dev.jsonl.gz + - aishell_cuts_train.jsonl.gz + - aishell_cuts_test.jsonl.gz """ self.manifest_dir = Path(manifest_dir) def train_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train.json.gz" + f = self.manifest_dir / "aishell_cuts_train.jsonl.gz" logging.info(f"About to get train cuts from {f}") - cuts_train = load_manifest(f) + cuts_train = load_manifest_lazy(f) return cuts_train def valid_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_dev.json.gz" + f = self.manifest_dir / "aishell_cuts_dev.jsonl.gz" logging.info(f"About to get valid cuts from {f}") - cuts_valid = load_manifest(f) + cuts_valid = load_manifest_lazy(f) return cuts_valid def test_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test.json.gz" + f = self.manifest_dir / "aishell_cuts_test.jsonl.gz" logging.info(f"About to get test cuts from {f}") - cuts_test = load_manifest(f) + cuts_test = load_manifest_lazy(f) return cuts_test diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py b/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py index 20eb8155c..838e53658 100644 --- a/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py +++ 
b/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py @@ -24,7 +24,6 @@ from typing import Optional from lhotse import CutSet, Fbank, FbankConfig from lhotse.dataset import ( - BucketingSampler, CutMix, DynamicBucketingSampler, K2SpeechRecognitionDataset, @@ -73,8 +72,7 @@ class AsrDataModule: "--num-buckets", type=int, default=30, - help="The number of buckets for the BucketingSampler " - "and DynamicBucketingSampler." + help="The number of buckets for the DynamicBucketingSampler " "(you might want to increase it for larger datasets).", ) @@ -147,7 +145,6 @@ class AsrDataModule: def train_dataloaders( self, cuts_train: CutSet, - dynamic_bucketing: bool, on_the_fly_feats: bool, cuts_musan: Optional[CutSet] = None, ) -> DataLoader: @@ -157,9 +154,6 @@ class AsrDataModule: Cuts for training. cuts_musan: If not None, it is the cuts for mixing. - dynamic_bucketing: - True to use DynamicBucketingSampler; - False to use BucketingSampler. on_the_fly_feats: True to use OnTheFlyFeatures; False to use PrecomputedFeatures. @@ -232,25 +226,14 @@ class AsrDataModule: return_cuts=self.args.return_cuts, ) - if dynamic_bucketing: - logging.info("Using DynamicBucketingSampler.") - train_sampler = DynamicBucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - drop_last=True, - ) - else: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - bucket_method="equal_duration", - drop_last=True, - ) + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( + cuts_train, + max_duration=self.args.max_duration, + shuffle=self.args.shuffle, + num_buckets=self.args.num_buckets, + drop_last=True, + ) logging.info("About to create train dataloader") train_dl = DataLoader( @@ -279,7 +262,7 @@ class AsrDataModule: cut_transforms=transforms, return_cuts=self.args.return_cuts, ) - valid_sampler = BucketingSampler( + valid_sampler = DynamicBucketingSampler( cuts_valid, max_duration=self.args.max_duration, shuffle=False, @@ -303,8 +286,10 @@ class AsrDataModule: else PrecomputedFeatures(), return_cuts=self.args.return_cuts, ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, ) logging.debug("About to create test dataloader") test_dl = DataLoader( diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/train.py b/egs/aishell/ASR/transducer_stateless_modified-2/train.py index 53d4e455f..a6c17198f 100755 --- a/egs/aishell/ASR/transducer_stateless_modified-2/train.py +++ b/egs/aishell/ASR/transducer_stateless_modified-2/train.py @@ -41,6 +41,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2" import argparse import logging import random +import warnings from pathlib import Path from shutil import copyfile from typing import Optional, Tuple @@ -55,7 +56,7 @@ from asr_datamodule import AsrDataModule from conformer import Conformer from decoder import Decoder from joiner import Joiner -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy from lhotse.cut import Cut from lhotse.utils import fix_random_seed from model import Transducer @@ -446,7 +447,11 @@ def compute_loss( assert loss.requires_grad == is_training info = MetricsTracker() - info["frames"] = (feature_lens // 
params.subsampling_factor).sum().item() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + info["frames"] = ( + (feature_lens // params.subsampling_factor).sum().item() + ) # Note: We use reduction=sum while computing the loss. info["loss"] = loss.detach().cpu().item() @@ -635,20 +640,16 @@ def train_one_epoch( def filter_short_and_long_utterances(cuts: CutSet) -> CutSet: def remove_short_and_long_utt(c: Cut): - # Keep only utterances with duration between 1 second and 12 seconds + # Keep only utterances with duration between 1 second and 12 seconds + # + # Caution: There is a reason to select 12.0 here. Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold return 1.0 <= c.duration <= 12.0 - num_in_total = len(cuts) - cuts = cuts.filter(remove_short_and_long_utt) - - num_left = len(cuts) - num_removed = num_in_total - num_left - removed_percent = num_removed / num_in_total * 100 - - logging.info(f"Before removing short and long utterances: {num_in_total}") - logging.info(f"After removing short and long utterances: {num_left}") - logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)") - return cuts @@ -728,15 +729,14 @@ def run(rank, world_size, args): train_cuts = aishell.train_cuts() train_cuts = filter_short_and_long_utterances(train_cuts) - datatang = AIDatatang200zh( - manifest_dir=f"{args.manifest_dir}/aidatatang_200zh" - ) + datatang = AIDatatang200zh(manifest_dir=args.manifest_dir) train_datatang_cuts = datatang.train_cuts() train_datatang_cuts = filter_short_and_long_utterances(train_datatang_cuts) + train_datatang_cuts = train_datatang_cuts.repeat(times=None) if args.enable_musan: - cuts_musan = load_manifest( - Path(args.manifest_dir) / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + Path(args.manifest_dir) / "musan_cuts.jsonl.gz" ) else: cuts_musan = None @@ -745,22 +745,23 @@ train_dl = asr_datamodule.train_dataloaders( train_cuts, - dynamic_bucketing=False, on_the_fly_feats=False, cuts_musan=cuts_musan, ) datatang_train_dl = asr_datamodule.train_dataloaders( train_datatang_cuts, - dynamic_bucketing=True, - on_the_fly_feats=True, + on_the_fly_feats=False, cuts_musan=cuts_musan, ) valid_cuts = aishell.valid_cuts() valid_dl = asr_datamodule.valid_dataloaders(valid_cuts) - for dl in [train_dl, datatang_train_dl]: + for dl in [ + train_dl, + # datatang_train_dl + ]: scan_pessimistic_batches_for_oom( model=model, train_dl=dl, diff --git a/egs/aishell/ASR/transducer_stateless_modified/train.py b/egs/aishell/ASR/transducer_stateless_modified/train.py index 524854b73..dcbc874a0 100755 --- a/egs/aishell/ASR/transducer_stateless_modified/train.py +++ b/egs/aishell/ASR/transducer_stateless_modified/train.py @@ -37,6 +37,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2" import argparse import logging +import warnings from pathlib import Path from shutil import copyfile from typing import Optional, Tuple @@ -411,7 +412,11 @@ def compute_loss( assert loss.requires_grad == is_training info = MetricsTracker() - info["frames"] = (feature_lens // params.subsampling_factor).sum().item() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + info["frames"] = ( + (feature_lens // params.subsampling_factor).sum().item() + ) # Note: We use reduction=sum while computing the loss.
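Two of the changes above follow from the lazy-manifest migration: the before/after counting in `filter_short_and_long_utterances` is removed because a lazily filtered `CutSet` has no cheap `len()`, and `repeat(times=None)` turns the aidatatang cuts into an endless stream so the auxiliary corpus never runs out while aishell drives the epoch length. A minimal sketch:

```python
# Sketch: lazy filtering plus infinite repetition of the auxiliary corpus.
from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/aidatatang_cuts_train.jsonl.gz")
cuts = cuts.filter(lambda c: 1.0 <= c.duration <= 12.0)  # applied on the fly
endless = cuts.repeat(times=None)  # cycles through the manifest forever
```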
info["loss"] = loss.detach().cpu().item() diff --git a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py index ae24127bf..b3fc8adbb 100755 --- a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py +++ b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py @@ -43,7 +43,7 @@ torch.set_num_interop_threads(1) def compute_fbank_alimeeting(num_mel_bins: int = 80): - src_dir = Path("data/manifests/alimeeting") + src_dir = Path("data/manifests") output_dir = Path("data/fbank") num_jobs = min(15, os.cpu_count()) @@ -52,11 +52,14 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80): "eval", "test", ) + + prefix = "alimeeting" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( dataset_parts=dataset_parts, output_dir=src_dir, - prefix="alimeeting", - suffix="jsonl.gz", + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -64,7 +67,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80): with get_executor() as ex: # Initialize the executor only once. for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") @@ -83,7 +86,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80): cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=cur_num_jobs, executor=ex, @@ -95,7 +98,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80): keep_overlapping=False, min_duration=None, ) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}") def get_args(): diff --git a/egs/alimeeting/ASR/local/display_manifest_statistics.py b/egs/alimeeting/ASR/local/display_manifest_statistics.py index 7f7aa094d..16cdecc91 100644 --- a/egs/alimeeting/ASR/local/display_manifest_statistics.py +++ b/egs/alimeeting/ASR/local/display_manifest_statistics.py @@ -25,19 +25,19 @@ for usage. 
""" -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): paths = [ - "./data/fbank/cuts_train.json.gz", - "./data/fbank/cuts_eval.json.gz", - "./data/fbank/cuts_test.json.gz", + "./data/fbank/alimeeting_cuts_train.jsonl.gz", + "./data/fbank/alimeeting_cuts_eval.jsonl.gz", + "./data/fbank/alimeeting_cuts_test.jsonl.gz", ] for path in paths: print(f"Starting display the statistics for {path}") - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() @@ -45,7 +45,7 @@ if __name__ == "__main__": main() """ -Starting display the statistics for ./data/fbank/cuts_train.json.gz +Starting display the statistics for ./data/fbank/alimeeting_cuts_train.jsonl.gz Cuts count: 559092 Total duration (hours): 424.6 Speech duration (hours): 424.6 (100.0%) @@ -61,7 +61,7 @@ min 0.0 99.5% 14.7 99.9% 16.2 max 284.3 -Starting display the statistics for ./data/fbank/cuts_eval.json.gz +Starting display the statistics for ./data/fbank/alimeeting_cuts_eval.jsonl.gz Cuts count: 6457 Total duration (hours): 4.9 Speech duration (hours): 4.9 (100.0%) @@ -77,7 +77,7 @@ min 0.1 99.5% 14.1 99.9% 14.7 max 15.8 -Starting display the statistics for ./data/fbank/cuts_test.json.gz +Starting display the statistics for ./data/fbank/alimeeting_cuts_test.jsonl.gz Cuts count: 16358 Total duration (hours): 12.5 Speech duration (hours): 12.5 (100.0%) diff --git a/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py index bd41a7a1e..339612afe 100644 --- a/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -27,7 +27,7 @@ from lhotse import ( CutSet, Fbank, FbankConfig, - load_manifest, + load_manifest_lazy, set_caching_enabled, ) from lhotse.dataset import ( @@ -204,8 +204,8 @@ class AlimeetingAsrDataModule: The state dict for the training sampler. 
""" logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms = [] @@ -401,14 +401,20 @@ class AlimeetingAsrDataModule: @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - return load_manifest(self.args.manifest_dir / "cuts_train.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "alimeeting_cuts_train.jsonl.gz" + ) @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") - return load_manifest(self.args.manifest_dir / "cuts_eval.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "alimeeting_cuts_eval.jsonl.gz" + ) @lru_cache() def test_cuts(self) -> List[CutSet]: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_test.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "alimeeting_cuts_test.jsonl.gz" + ) diff --git a/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py b/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py index ab958fa68..62b43146a 100644 --- a/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py +++ b/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py @@ -20,9 +20,8 @@ import logging from functools import lru_cache from pathlib import Path -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( - BucketingSampler, CutConcatenate, CutMix, DynamicBucketingSampler, @@ -190,8 +189,8 @@ class GigaSpeechAsrDataModule: def train_dataloaders(self, cuts_train: CutSet) -> DataLoader: logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms = [] @@ -315,7 +314,7 @@ class GigaSpeechAsrDataModule: cut_transforms=transforms, return_cuts=self.args.return_cuts, ) - valid_sampler = BucketingSampler( + valid_sampler = DynamicBucketingSampler( cuts_valid, max_duration=self.args.max_duration, shuffle=False, @@ -339,8 +338,10 @@ class GigaSpeechAsrDataModule: else PrecomputedFeatures(), return_cuts=self.args.return_cuts, ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, ) logging.debug("About to create test dataloader") test_dl = DataLoader( @@ -361,7 +362,9 @@ class GigaSpeechAsrDataModule: @lru_cache() def dev_cuts(self) -> CutSet: logging.info("About to get dev cuts") - cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz") + cuts_valid = load_manifest_lazy( + self.args.manifest_dir / "cuts_DEV.jsonl.gz" + ) if self.args.small_dev: return cuts_valid.subset(first=1000) else: @@ -370,4 +373,4 @@ class GigaSpeechAsrDataModule: @lru_cache() def test_cuts(self) -> CutSet: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz") + return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz") diff --git a/egs/gigaspeech/ASR/local/compute_fbank_musan.py b/egs/gigaspeech/ASR/local/compute_fbank_musan.py deleted file mode 100755 index 562872993..000000000 --- a/egs/gigaspeech/ASR/local/compute_fbank_musan.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Johns Hopkins University (Piotr Żelasko) -# 
Copyright 2021 Xiaomi Corp. (Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from pathlib import Path - -import torch -from lhotse import ( - CutSet, - KaldifeatFbank, - KaldifeatFbankConfig, - combine, -) -from lhotse.recipes.utils import read_manifests_if_cached - -# Torch's multithreaded behavior needs to be disabled or -# it wastes a lot of CPU and slow things down. -# Do this outside of main() in case it needs to take effect -# even when we are not invoking the main (e.g. when spawning subprocesses). -torch.set_num_threads(1) -torch.set_num_interop_threads(1) - - -def compute_fbank_musan(): - src_dir = Path("data/manifests") - output_dir = Path("data/fbank") - - # number of workers in dataloader - num_workers = 10 - - # number of seconds in a batch - batch_duration = 600 - - dataset_parts = ( - "music", - "speech", - "noise", - ) - - manifests = read_manifests_if_cached( - prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir - ) - assert manifests is not None - - musan_cuts_path = output_dir / "cuts_musan.json.gz" - - if musan_cuts_path.is_file(): - logging.info(f"{musan_cuts_path} already exists - skipping") - return - - logging.info("Extracting features for Musan") - - device = torch.device("cpu") - if torch.cuda.is_available(): - device = torch.device("cuda", 0) - extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) - - logging.info(f"device: {device}") - - musan_cuts = ( - CutSet.from_manifests( - recordings=combine( - part["recordings"] for part in manifests.values() - ) - ) - .cut_into_windows(10.0) - .filter(lambda c: c.duration > 5) - .compute_and_store_features_batch( - extractor=extractor, - storage_path=f"{output_dir}/feats_musan", - num_workers=num_workers, - batch_duration=batch_duration, - ) - ) - musan_cuts.to_json(musan_cuts_path) - - -def main(): - formatter = ( - "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - ) - logging.basicConfig(format=formatter, level=logging.INFO) - - compute_fbank_musan() - - -if __name__ == "__main__": - main() diff --git a/egs/gigaspeech/ASR/local/compute_fbank_musan.py b/egs/gigaspeech/ASR/local/compute_fbank_musan.py new file mode 120000 index 000000000..5833f2484 --- /dev/null +++ b/egs/gigaspeech/ASR/local/compute_fbank_musan.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/compute_fbank_musan.py \ No newline at end of file diff --git a/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py index ff3d3b07a..19fe7c6a7 100644 --- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -23,9 +23,8 @@ from pathlib import Path from typing import Any, Dict, Optional import torch -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( - 
BucketingSampler, CutConcatenate, CutMix, DynamicBucketingSampler, @@ -217,8 +216,8 @@ class GigaSpeechAsrDataModule: if self.args.enable_musan: logging.info("Enable MUSAN") logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms.append( CutMix( @@ -358,7 +357,7 @@ class GigaSpeechAsrDataModule: cut_transforms=transforms, return_cuts=self.args.return_cuts, ) - valid_sampler = BucketingSampler( + valid_sampler = DynamicBucketingSampler( cuts_valid, max_duration=self.args.max_duration, shuffle=False, @@ -382,8 +381,10 @@ class GigaSpeechAsrDataModule: else PrecomputedFeatures(), return_cuts=self.args.return_cuts, ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, ) logging.debug("About to create test dataloader") test_dl = DataLoader( @@ -404,7 +405,9 @@ class GigaSpeechAsrDataModule: @lru_cache() def dev_cuts(self) -> CutSet: logging.info("About to get dev cuts") - cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz") + cuts_valid = load_manifest_lazy( + self.args.manifest_dir / "cuts_DEV.jsonl.gz" + ) if self.args.small_dev: return cuts_valid.subset(first=1000) else: @@ -413,4 +416,4 @@ class GigaSpeechAsrDataModule: @lru_cache() def test_cuts(self) -> CutSet: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz") + return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz") diff --git a/egs/librispeech/ASR/conformer_ctc/ali.py b/egs/librispeech/ASR/conformer_ctc/ali.py index 42fa2308e..2828e309e 100755 --- a/egs/librispeech/ASR/conformer_ctc/ali.py +++ b/egs/librispeech/ASR/conformer_ctc/ali.py @@ -96,14 +96,14 @@ def get_parser(): - labels_xxx.h5 - aux_labels_xxx.h5 - - cuts_xxx.json.gz + - librispeech_cuts_xxx.jsonl.gz where xxx is the value of `--dataset`. For instance, if `--dataset` is `train-clean-100`, it will contain 3 files: - `labels_train-clean-100.h5` - `aux_labels_train-clean-100.h5` - - `cuts_train-clean-100.json.gz` + - `librispeech_cuts_train-clean-100.jsonl.gz` Note: Both labels_xxx.h5 and aux_labels_xxx.h5 contain framewise alignment. The difference is that labels_xxx.h5 contains repeats. @@ -289,7 +289,9 @@ def main(): out_labels_ali_filename = out_dir / f"labels_{params.dataset}.h5" out_aux_labels_ali_filename = out_dir / f"aux_labels_{params.dataset}.h5" - out_manifest_filename = out_dir / f"cuts_{params.dataset}.json.gz" + out_manifest_filename = ( + out_dir / f"librispeech_cuts_{params.dataset}.jsonl.gz" + ) for f in ( out_labels_ali_filename, diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py index b81bd6330..fc8fc8863 100755 --- a/egs/librispeech/ASR/conformer_ctc/train.py +++ b/egs/librispeech/ASR/conformer_ctc/train.py @@ -17,6 +17,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
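`BucketingSampler` with `bucket_method="equal_duration"` needs the full manifest in memory to assign buckets up front, so every datamodule in this patch switches to `DynamicBucketingSampler`, which estimates duration buckets on the fly and therefore works with lazy `CutSet`s. A minimal sketch mirroring the constructor calls above:

```python
# Sketch: DynamicBucketingSampler usage matching the datamodule changes.
from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler

cuts = load_manifest_lazy("data/fbank/aishell_cuts_train.jsonl.gz")
sampler = DynamicBucketingSampler(
    cuts,
    max_duration=200.0,  # total seconds of audio per mini-batch
    shuffle=True,
    num_buckets=30,
    drop_last=True,
)
for batch in sampler:  # each item is a CutSet forming one mini-batch
    break
```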
+""" +Usage: + export CUDA_VISIBLE_DEVICES="0,1,2,3" + ./conformer_ctc/train.py \ + --exp-dir ./conformer_ctc/exp \ + --world-size 4 \ + --full-libri 1 \ + --max-duration 200 \ + --num-epochs 20 +""" + import argparse import logging from pathlib import Path @@ -29,6 +40,7 @@ import torch.multiprocessing as mp import torch.nn as nn from asr_datamodule import LibriSpeechAsrDataModule from conformer import Conformer +from lhotse.cut import Cut from lhotse.utils import fix_random_seed from torch import Tensor from torch.nn.parallel import DistributedDataParallel as DDP @@ -676,6 +688,20 @@ def run(rank, world_size, args): if params.full_libri: train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_other_500_cuts() + + def remove_short_and_long_utt(c: Cut): + # Keep only utterances with duration between 1 second and 20 seconds + # + # Caution: There is a reason to select 20.0 here. Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold + return 1.0 <= c.duration <= 20.0 + + train_cuts = train_cuts.filter(remove_short_and_long_utt) + train_dl = librispeech.train_dataloaders(train_cuts) valid_cuts = librispeech.dev_clean_cuts() diff --git a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py index 9f1039893..68d93d2c5 100644 --- a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py +++ b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py @@ -20,11 +20,7 @@ import logging from pathlib import Path import torch -from lhotse import ( - CutSet, - KaldifeatFbank, - KaldifeatFbankConfig, -) +from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig # Torch's multithreaded behavior needs to be disabled or # it wastes a lot of CPU and slow things down. @@ -51,13 +47,16 @@ def compute_fbank_gigaspeech_dev_test(): logging.info(f"device: {device}") + prefix = "gigaspeech" + suffix = "jsonl.gz" + for partition in subsets: - cuts_path = in_out_dir / f"cuts_{partition}.jsonl.gz" + cuts_path = in_out_dir / f"{prefix}_cuts_{partition}.{suffix}" if cuts_path.is_file(): logging.info(f"{cuts_path} exists - skipping") continue - raw_cuts_path = in_out_dir / f"cuts_{partition}_raw.jsonl.gz" + raw_cuts_path = in_out_dir / f"{prefix}_cuts_{partition}_raw.{suffix}" logging.info(f"Loading {raw_cuts_path}") cut_set = CutSet.from_file(raw_cuts_path) @@ -66,7 +65,7 @@ def compute_fbank_gigaspeech_dev_test(): cut_set = cut_set.compute_and_store_features_batch( extractor=extractor, - storage_path=f"{in_out_dir}/feats_{partition}", + storage_path=f"{in_out_dir}/{prefix}_feats_{partition}", num_workers=num_workers, batch_duration=batch_duration, ) diff --git a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py index a7ed2467d..f826f064e 100644 --- a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py +++ b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py @@ -77,7 +77,7 @@ def get_parser(): def compute_fbank_gigaspeech_splits(args): num_splits = args.num_splits - output_dir = f"data/fbank/XL_split_{num_splits}" + output_dir = f"data/fbank/gigaspeech_XL_split_{num_splits}" output_dir = Path(output_dir) assert output_dir.exists(), f"{output_dir} does not exist!" 
@@ -96,17 +96,19 @@ def compute_fbank_gigaspeech_splits(args): extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) logging.info(f"device: {device}") + prefix = "gigaspeech" + num_digits = 8 # num_digits is fixed by lhotse split-lazy for i in range(start, stop): idx = f"{i + 1}".zfill(num_digits) logging.info(f"Processing {idx}/{num_splits}") - cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz" + cuts_path = output_dir / f"{prefix}_cuts_XL.{idx}.jsonl.gz" if cuts_path.is_file(): logging.info(f"{cuts_path} exists - skipping") continue - raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz" + raw_cuts_path = output_dir / f"{prefix}_cuts_XL_raw.{idx}.jsonl.gz" if not raw_cuts_path.is_file(): logging.info(f"{raw_cuts_path} does not exist - skipping it") continue @@ -115,13 +117,13 @@ def compute_fbank_gigaspeech_splits(args): cut_set = CutSet.from_file(raw_cuts_path) logging.info("Computing features") - if (output_dir / f"feats_XL_{idx}.lca").exists(): - logging.info(f"Removing {output_dir}/feats_XL_{idx}.lca") - os.remove(output_dir / f"feats_XL_{idx}.lca") + if (output_dir / f"{prefix}_feats_XL_{idx}.lca").exists(): + logging.info(f"Removing {output_dir}/{prefix}_feats_XL_{idx}.lca") + os.remove(output_dir / f"{prefix}_feats_XL_{idx}.lca") cut_set = cut_set.compute_and_store_features_batch( extractor=extractor, - storage_path=f"{output_dir}/feats_XL_{idx}", + storage_path=f"{output_dir}/{prefix}_feats_XL_{idx}", num_workers=args.num_workers, batch_duration=args.batch_duration, ) diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py index 92f4f6ab7..642d9fd32 100755 --- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py +++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py @@ -28,7 +28,7 @@ import os from pathlib import Path import torch -from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -56,8 +56,13 @@ def compute_fbank_librispeech(): "train-clean-360", "train-other-500", ) + prefix = "librispeech" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="librispeech", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -65,7 +70,8 @@ def compute_fbank_librispeech(): with get_executor() as ex: # Initialize the executor only once. 
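As the comment notes, `lhotse split-lazy` zero-pads split indices to eight digits; a small sketch of the filenames the renaming above produces (num_splits=2000 as used elsewhere in the recipe):

    num_digits = 8  # fixed by `lhotse split-lazy`
    prefix = "gigaspeech"
    for i in range(2):
        idx = f"{i + 1}".zfill(num_digits)
        print(f"data/fbank/{prefix}_XL_split_2000/{prefix}_cuts_XL.{idx}.jsonl.gz")
    # -> ..._cuts_XL.00000001.jsonl.gz, ..._cuts_XL.00000002.jsonl.gz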
for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + cuts_filename = f"{prefix}_cuts_{partition}.{suffix}" + if (output_dir / cuts_filename).is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") @@ -81,13 +87,13 @@ def compute_fbank_librispeech(): ) cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=num_jobs if ex is None else 80, executor=ex, - storage_type=ChunkedLilcomHdf5Writer, + storage_type=LilcomChunkyWriter, ) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + cut_set.to_file(output_dir / cuts_filename) if __name__ == "__main__": diff --git a/egs/librispeech/ASR/local/compute_fbank_musan.py b/egs/librispeech/ASR/local/compute_fbank_musan.py index 368bea4e8..fef372129 100755 --- a/egs/librispeech/ASR/local/compute_fbank_musan.py +++ b/egs/librispeech/ASR/local/compute_fbank_musan.py @@ -28,7 +28,7 @@ import os from pathlib import Path import torch -from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -52,12 +52,22 @@ def compute_fbank_musan(): "speech", "noise", ) + prefix = "musan" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None - musan_cuts_path = output_dir / "cuts_musan.json.gz" + assert len(manifests) == len(dataset_parts), ( + len(manifests), + len(dataset_parts), + ) + + musan_cuts_path = output_dir / "musan_cuts.jsonl.gz" if musan_cuts_path.is_file(): logging.info(f"{musan_cuts_path} already exists - skipping") @@ -79,13 +89,13 @@ def compute_fbank_musan(): .filter(lambda c: c.duration > 5) .compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_musan", + storage_path=f"{output_dir}/musan_feats", num_jobs=num_jobs if ex is None else 80, executor=ex, - storage_type=ChunkedLilcomHdf5Writer, + storage_type=LilcomChunkyWriter, ) ) - musan_cuts.to_json(musan_cuts_path) + musan_cuts.to_file(musan_cuts_path) if __name__ == "__main__": diff --git a/egs/librispeech/ASR/local/display_manifest_statistics.py b/egs/librispeech/ASR/local/display_manifest_statistics.py index 15bd206fa..c3c684235 100755 --- a/egs/librispeech/ASR/local/display_manifest_statistics.py +++ b/egs/librispeech/ASR/local/display_manifest_statistics.py @@ -25,19 +25,19 @@ for usage. 
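The musan_cuts.jsonl.gz written above feeds the CutMix transform used across the datamodules; a sketch of loading it lazily and building the transform, with parameters mirroring the recipes':

    from lhotse import load_manifest_lazy
    from lhotse.dataset import CutMix

    cuts_musan = load_manifest_lazy("data/fbank/musan_cuts.jsonl.gz")
    # Mix MUSAN noise into 50% of training cuts at 10-20 dB SNR,
    # keeping the original cut ids so supervisions still line up.
    transform = CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)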
""" -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): - path = "./data/fbank/cuts_train-clean-100.json.gz" - path = "./data/fbank/cuts_train-clean-360.json.gz" - path = "./data/fbank/cuts_train-other-500.json.gz" - path = "./data/fbank/cuts_dev-clean.json.gz" - path = "./data/fbank/cuts_dev-other.json.gz" - path = "./data/fbank/cuts_test-clean.json.gz" - path = "./data/fbank/cuts_test-other.json.gz" + # path = "./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz" + # path = "./data/fbank/librispeech_cuts_train-clean-360.jsonl.gz" + # path = "./data/fbank/librispeech_cuts_train-other-500.jsonl.gz" + # path = "./data/fbank/librispeech_cuts_dev-clean.jsonl.gz" + # path = "./data/fbank/librispeech_cuts_dev-other.jsonl.gz" + # path = "./data/fbank/librispeech_cuts_test-clean.jsonl.gz" + path = "./data/fbank/librispeech_cuts_test-other.jsonl.gz" - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() diff --git a/egs/librispeech/ASR/local/preprocess_gigaspeech.py b/egs/librispeech/ASR/local/preprocess_gigaspeech.py index cd1345904..0f4ae820b 100644 --- a/egs/librispeech/ASR/local/preprocess_gigaspeech.py +++ b/egs/librispeech/ASR/local/preprocess_gigaspeech.py @@ -58,17 +58,19 @@ def preprocess_giga_speech(): ) logging.info("Loading manifest (may take 4 minutes)") + prefix = "gigaspeech" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( dataset_parts=dataset_parts, output_dir=src_dir, - prefix="gigaspeech", - suffix="jsonl.gz", + prefix=prefix, + suffix=suffix, ) assert manifests is not None for partition, m in manifests.items(): logging.info(f"Processing {partition}") - raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz" + raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.{suffix}" if raw_cuts_path.is_file(): logging.info(f"{partition} already exists - skipping") continue diff --git a/egs/librispeech/ASR/local/validate_manifest.py b/egs/librispeech/ASR/local/validate_manifest.py index 8d3d4c7ce..7c57d629a 100755 --- a/egs/librispeech/ASR/local/validate_manifest.py +++ b/egs/librispeech/ASR/local/validate_manifest.py @@ -25,7 +25,7 @@ We will add more checks later if needed. 
Usage example: python3 ./local/validate_manifest.py \ - ./data/fbank/cuts_train-clean-100.json.gz + ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz """ @@ -33,7 +33,7 @@ import argparse import logging from pathlib import Path -from lhotse import load_manifest, CutSet +from lhotse import CutSet, load_manifest_lazy from lhotse.cut import Cut @@ -76,7 +76,7 @@ def main(): logging.info(f"Validating {manifest}") assert manifest.is_file(), f"{manifest} does not exist" - cut_set = load_manifest(manifest) + cut_set = load_manifest_lazy(manifest) assert isinstance(cut_set, CutSet) for c in cut_set: diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh index 8cfb046c8..17a638502 100755 --- a/egs/librispeech/ASR/prepare.sh +++ b/egs/librispeech/ASR/prepare.sh @@ -40,9 +40,9 @@ dl_dir=$PWD/download # It will generate data/lang_bpe_xxx, # data/lang_bpe_yyy if the array contains xxx, yyy vocab_sizes=( - 5000 - 2000 - 1000 + # 5000 + # 2000 + # 1000 500 ) @@ -132,7 +132,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then ) for part in ${parts[@]}; do python3 ./local/validate_manifest.py \ - data/fbank/cuts_${part}.json.gz + data/fbank/librispeech_cuts_${part}.jsonl.gz done touch data/fbank/.librispeech-validated.done fi diff --git a/egs/librispeech/ASR/prepare_giga_speech.sh b/egs/librispeech/ASR/prepare_giga_speech.sh index 26b921eab..6f85ddc29 100755 --- a/egs/librispeech/ASR/prepare_giga_speech.sh +++ b/egs/librispeech/ASR/prepare_giga_speech.sh @@ -124,9 +124,9 @@ fi if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then log "Stage 4: Split XL subset into ${num_splits} pieces" - split_dir=data/fbank/XL_split_${num_splits} + split_dir=data/fbank/gigaspeech_XL_split_${num_splits} if [ ! -f $split_dir/.split_completed ]; then - lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir $chunk_size + lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size touch $split_dir/.split_completed fi fi diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/train.py b/egs/librispeech/ASR/pruned_transducer_stateless/train.py index c360d025a..e6795330f 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless/train.py @@ -807,28 +807,8 @@ def run(rank, world_size, args): # the threshold return 1.0 <= c.duration <= 20.0 - num_in_total = len(train_cuts) - train_cuts = train_cuts.filter(remove_short_and_long_utt) - try: - num_left = len(train_cuts) - num_removed = num_in_total - num_left - removed_percent = num_removed / num_in_total * 100 - - logging.info( - f"Before removing short and long utterances: {num_in_total}" - ) - logging.info(f"After removing short and long utterances: {num_left}") - logging.info( - f"Removed {num_removed} utterances ({removed_percent:.5f}%)" - ) - except TypeError as e: - # You can ignore this error as previous versions of Lhotse work fine - # for the above code. 
In recent versions of Lhotse, it uses - # lazy filter, producing cutsets that don't have the __len__ method - logging.info(str(e)) - if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: # We only load the sampler's state dict when it loads a checkpoint # saved in the middle of an epoch diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py b/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py index 8828285aa..b54d1aa39 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py @@ -22,7 +22,6 @@ from typing import Optional from lhotse import CutSet, Fbank, FbankConfig from lhotse.dataset import ( - BucketingSampler, CutMix, DynamicBucketingSampler, K2SpeechRecognitionDataset, @@ -71,8 +70,7 @@ class AsrDataModule: "--num-buckets", type=int, default=30, - help="The number of buckets for the BucketingSampler " - "and DynamicBucketingSampler." + help="The number of buckets for the DynamicBucketingSampler. " "(you might want to increase it for larger datasets).", ) @@ -152,7 +150,6 @@ class AsrDataModule: def train_dataloaders( self, cuts_train: CutSet, - dynamic_bucketing: bool, on_the_fly_feats: bool, cuts_musan: Optional[CutSet] = None, ) -> DataLoader: @@ -162,9 +159,6 @@ class AsrDataModule: Cuts for training. cuts_musan: If not None, it is the cuts for mixing. - dynamic_bucketing: - True to use DynamicBucketingSampler; - False to use BucketingSampler. on_the_fly_feats: True to use OnTheFlyFeatures; False to use PrecomputedFeatures. @@ -230,25 +224,14 @@ class AsrDataModule: return_cuts=self.args.return_cuts, ) - if dynamic_bucketing: - logging.info("Using DynamicBucketingSampler.") - train_sampler = DynamicBucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - drop_last=True, - ) - else: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - bucket_method="equal_duration", - drop_last=True, - ) + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( + cuts_train, + max_duration=self.args.max_duration, + shuffle=self.args.shuffle, + num_buckets=self.args.num_buckets, + drop_last=True, + ) logging.info("About to create train dataloader") train_dl = DataLoader( diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py b/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py index 3f8bf3ba9..36f32c6b3 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py @@ -22,7 +22,7 @@ import re from pathlib import Path import lhotse -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy class GigaSpeech: @@ -32,13 +32,13 @@ class GigaSpeech: manifest_dir: It is expected to contain the following files:: - - XL_split_2000/cuts_XL.*.jsonl.gz - - cuts_L_raw.jsonl.gz - - cuts_M_raw.jsonl.gz - - cuts_S_raw.jsonl.gz - - cuts_XS_raw.jsonl.gz - - cuts_DEV_raw.jsonl.gz - - cuts_TEST_raw.jsonl.gz + - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz + - gigaspeech_cuts_L_raw.jsonl.gz + - gigaspeech_cuts_M_raw.jsonl.gz + - gigaspeech_cuts_S_raw.jsonl.gz + - gigaspeech_cuts_XS_raw.jsonl.gz + - gigaspeech_cuts_DEV_raw.jsonl.gz + - 
gigaspeech_cuts_TEST_raw.jsonl.gz """ self.manifest_dir = Path(manifest_dir) @@ -46,10 +46,12 @@ class GigaSpeech: logging.info("About to get train-XL cuts") filenames = list( - glob.glob(f"{self.manifest_dir}/XL_split_2000/cuts_XL.*.jsonl.gz") + glob.glob( + f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz" # noqa + ) ) - pattern = re.compile(r"cuts_XL.([0-9]+).jsonl.gz") + pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz") idx_filenames = [ (int(pattern.search(f).group(1)), f) for f in filenames ] @@ -64,31 +66,31 @@ class GigaSpeech: ) def train_L_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_L_raw.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_L_raw.jsonl.gz" logging.info(f"About to get train-L cuts from {f}") return CutSet.from_jsonl_lazy(f) def train_M_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_M_raw.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_M_raw.jsonl.gz" logging.info(f"About to get train-M cuts from {f}") return CutSet.from_jsonl_lazy(f) def train_S_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_S_raw.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_S_raw.jsonl.gz" logging.info(f"About to get train-S cuts from {f}") return CutSet.from_jsonl_lazy(f) def train_XS_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_XS_raw.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_XS_raw.jsonl.gz" logging.info(f"About to get train-XS cuts from {f}") return CutSet.from_jsonl_lazy(f) def test_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_TEST.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz" logging.info(f"About to get TEST cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def dev_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_DEV.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz" logging.info(f"About to get DEV cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py b/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py index 00b7c8334..6dba8e9fe 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py @@ -18,7 +18,7 @@ import logging from pathlib import Path -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy class LibriSpeech: @@ -28,47 +28,47 @@ class LibriSpeech: manifest_dir: It is expected to contain the following files:: - - cuts_dev-clean.json.gz - - cuts_dev-other.json.gz - - cuts_test-clean.json.gz - - cuts_test-other.json.gz - - cuts_train-clean-100.json.gz - - cuts_train-clean-360.json.gz - - cuts_train-other-500.json.gz + - librispeech_cuts_dev-clean.jsonl.gz + - librispeech_cuts_dev-other.jsonl.gz + - librispeech_cuts_test-clean.jsonl.gz + - librispeech_cuts_test-other.jsonl.gz + - librispeech_cuts_train-clean-100.jsonl.gz + - librispeech_cuts_train-clean-360.jsonl.gz + - librispeech_cuts_train-other-500.jsonl.gz """ self.manifest_dir = Path(manifest_dir) def train_clean_100_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-clean-100.json.gz" + f = self.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz" logging.info(f"About to get train-clean-100 cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def train_clean_360_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-clean-360.json.gz" + f = self.manifest_dir / 
"librispeech_cuts_train-clean-360.jsonl.gz" logging.info(f"About to get train-clean-360 cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def train_other_500_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-other-500.json.gz" + f = self.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz" logging.info(f"About to get train-other-500 cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def test_clean_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test-clean.json.gz" + f = self.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz" logging.info(f"About to get test-clean cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def test_other_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test-other.json.gz" + f = self.manifest_dir / "librispeech_cuts_test-other.jsonl.gz" logging.info(f"About to get test-other cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def dev_clean_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_dev-clean.json.gz" + f = self.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz" logging.info(f"About to get dev-clean cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def dev_other_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_dev-other.json.gz" + f = self.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz" logging.info(f"About to get dev-other cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py index a2a5519f1..37cebd577 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py @@ -66,7 +66,7 @@ from conformer import Conformer from decoder import Decoder from gigaspeech import GigaSpeech from joiner import Joiner -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy from lhotse.cut import Cut from lhotse.dataset.sampling.base import CutSampler from lhotse.utils import fix_random_seed @@ -968,8 +968,8 @@ def run(rank, world_size, args): train_giga_cuts = train_giga_cuts.repeat(times=None) if args.enable_musan: - cuts_musan = load_manifest( - Path(args.manifest_dir) / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + Path(args.manifest_dir) / "musan_cuts.jsonl.gz" ) else: cuts_musan = None @@ -978,14 +978,12 @@ def run(rank, world_size, args): train_dl = asr_datamodule.train_dataloaders( train_cuts, - dynamic_bucketing=False, on_the_fly_feats=False, cuts_musan=cuts_musan, ) giga_train_dl = asr_datamodule.train_dataloaders( train_giga_cuts, - dynamic_bucketing=True, on_the_fly_feats=False, cuts_musan=cuts_musan, ) diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py index 7628c8274..5cca06169 100644 --- a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py +++ b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py @@ -24,7 +24,7 @@ from pathlib import Path from typing import Any, Dict, Optional import torch -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures CutConcatenate, CutMix, @@ -224,8 +224,8 @@ class LibriSpeechAsrDataModule: if self.args.enable_musan: logging.info("Enable MUSAN") logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - 
self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms.append( CutMix( @@ -407,40 +407,48 @@ class LibriSpeechAsrDataModule: @lru_cache() def train_clean_100_cuts(self) -> CutSet: logging.info("About to get train-clean-100 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-clean-100.json.gz" + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz" ) @lru_cache() def train_clean_360_cuts(self) -> CutSet: logging.info("About to get train-clean-360 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-clean-360.json.gz" + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz" ) @lru_cache() def train_other_500_cuts(self) -> CutSet: logging.info("About to get train-other-500 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-other-500.json.gz" + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz" ) @lru_cache() def dev_clean_cuts(self) -> CutSet: logging.info("About to get dev-clean cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz" + ) @lru_cache() def dev_other_cuts(self) -> CutSet: logging.info("About to get dev-other cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz" + ) @lru_cache() def test_clean_cuts(self) -> CutSet: logging.info("About to get test-clean cuts") - return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz" + ) @lru_cache() def test_other_cuts(self) -> CutSet: logging.info("About to get test-other cuts") - return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz" + ) diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py index 8597525ba..827e3ae1f 100755 --- a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py +++ b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py @@ -16,6 +16,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Usage: + export CUDA_VISIBLE_DEVICES="0,1,2,3" + ./tdnn_lstm_ctc/train.py \ + --world-size 4 \ + --full-libri 1 \ + --max-duration 300 \ + --num-epochs 20 +""" import argparse import logging @@ -29,6 +38,7 @@ import torch.multiprocessing as mp import torch.nn as nn import torch.optim as optim from asr_datamodule import LibriSpeechAsrDataModule +from lhotse.cut import Cut from lhotse.utils import fix_random_seed from model import TdnnLstm from torch import Tensor @@ -544,10 +554,25 @@ def run(rank, world_size, args): if params.full_libri: train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_other_500_cuts() + + def remove_short_and_long_utt(c: Cut): + # Keep only utterances with duration between 1 second and 20 seconds + # + # Caution: There is a reason to select 20.0 here. 
Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold + return 1.0 <= c.duration <= 20.0 + + train_cuts = train_cuts.filter(remove_short_and_long_utt) + train_dl = librispeech.train_dataloaders(train_cuts) valid_cuts = librispeech.dev_clean_cuts() valid_cuts += librispeech.dev_other_cuts() + valid_dl = librispeech.valid_dataloaders(valid_cuts) for epoch in range(params.start_epoch, params.num_epochs): diff --git a/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py b/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py index 99d5b3788..b00fc34f1 100755 --- a/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py +++ b/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py @@ -44,8 +44,8 @@ from pathlib import Path import sentencepiece as spm import torch from alignment import get_word_starting_frames -from lhotse import CutSet, load_manifest -from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler +from lhotse import CutSet, load_manifest_lazy +from lhotse.dataset import DynamicBucketingSampler, K2SpeechRecognitionDataset from lhotse.dataset.collation import collate_custom_field @@ -93,14 +93,15 @@ def main(): sp = spm.SentencePieceProcessor() sp.load(args.bpe_model) - cuts_json = args.ali_dir / f"cuts_{args.dataset}.json.gz" + cuts_jsonl = args.ali_dir / f"librispeech_cuts_{args.dataset}.jsonl.gz" - logging.info(f"Loading {cuts_json}") - cuts = load_manifest(cuts_json) + logging.info(f"Loading {cuts_jsonl}") + cuts = load_manifest_lazy(cuts_jsonl) - sampler = SingleCutSampler( + sampler = DynamicBucketingSampler( cuts, max_duration=30, + num_buckets=30, shuffle=False, ) diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py deleted file mode 100644 index c6cf739fb..000000000 --- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py +++ /dev/null @@ -1,333 +0,0 @@ -# Copyright 2021 Piotr Żelasko -# 2022 Xiaomi Corp. (authors: Fangjun Kuang -# Mingshuang Luo) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
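This mirrors the filter added in conformer_ctc/train.py; the before/after utterance counting deleted from pruned_transducer_stateless/train.py above is gone for the same reason: .filter() is now lazy in recent Lhotse and the result has no __len__. A sketch of the failure mode:

    from lhotse import load_manifest_lazy

    cuts = load_manifest_lazy(
        "data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
    )
    cuts = cuts.filter(lambda c: 1.0 <= c.duration <= 20.0)

    try:
        print(len(cuts))
    except TypeError:
        # Lazy filtering yields a CutSet without __len__;
        # count by iterating if the number is really needed.
        print(sum(1 for _ in cuts))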
- -import argparse -import inspect -import logging -from pathlib import Path -from typing import Optional - -import torch -from lhotse import CutSet, Fbank, FbankConfig -from lhotse.dataset import ( - BucketingSampler, - CutMix, - DynamicBucketingSampler, - K2SpeechRecognitionDataset, - SpecAugment, -) -from lhotse.dataset.input_strategies import ( - OnTheFlyFeatures, - PrecomputedFeatures, -) -from lhotse.utils import fix_random_seed -from torch.utils.data import DataLoader - -from icefall.utils import str2bool - - -class _SeedWorkers: - def __init__(self, seed: int): - self.seed = seed - - def __call__(self, worker_id: int): - fix_random_seed(self.seed + worker_id) - - -class AsrDataModule: - def __init__(self, args: argparse.Namespace): - self.args = args - - @classmethod - def add_arguments(cls, parser: argparse.ArgumentParser): - group = parser.add_argument_group( - title="ASR data related options", - description="These options are used for the preparation of " - "PyTorch DataLoaders from Lhotse CutSet's -- they control the " - "effective batch sizes, sampling strategies, applied data " - "augmentations, etc.", - ) - - group.add_argument( - "--max-duration", - type=int, - default=200.0, - help="Maximum pooled recordings duration (seconds) in a " - "single batch. You can reduce it if it causes CUDA OOM.", - ) - - group.add_argument( - "--bucketing-sampler", - type=str2bool, - default=True, - help="When enabled, the batches will come from buckets of " - "similar duration (saves padding frames).", - ) - - group.add_argument( - "--num-buckets", - type=int, - default=30, - help="The number of buckets for the BucketingSampler " - "and DynamicBucketingSampler." - "(you might want to increase it for larger datasets).", - ) - - group.add_argument( - "--shuffle", - type=str2bool, - default=True, - help="When enabled (=default), the examples will be " - "shuffled for each epoch.", - ) - - group.add_argument( - "--return-cuts", - type=str2bool, - default=True, - help="When enabled, each batch will have the " - "field: batch['supervisions']['cut'] with the cuts that " - "were used to construct it.", - ) - - group.add_argument( - "--num-workers", - type=int, - default=2, - help="The number of training dataloader workers that " - "collect the batches.", - ) - - group.add_argument( - "--enable-spec-aug", - type=str2bool, - default=True, - help="When enabled, use SpecAugment for training dataset.", - ) - - group.add_argument( - "--spec-aug-time-warp-factor", - type=int, - default=80, - help="Used only when --enable-spec-aug is True. " - "It specifies the factor for time warping in SpecAugment. " - "Larger values mean more warping. " - "A value less than 1 means to disable time warp.", - ) - - group.add_argument( - "--enable-musan", - type=str2bool, - default=True, - help="When enabled, select noise from MUSAN and mix it" - "with training dataset. ", - ) - - group.add_argument( - "--manifest-dir", - type=Path, - default=Path("data/fbank"), - help="Path to directory with train/valid/test cuts.", - ) - - group.add_argument( - "--on-the-fly-feats", - type=str2bool, - default=False, - help="When enabled, use on-the-fly cut mixing and feature " - "extraction. Will drop existing precomputed feature manifests " - "if available. Used only in dev/test CutSet", - ) - - def train_dataloaders( - self, - cuts_train: CutSet, - dynamic_bucketing: bool, - on_the_fly_feats: bool, - cuts_musan: Optional[CutSet] = None, - ) -> DataLoader: - """ - Args: - cuts_train: - Cuts for training. 
- cuts_musan: - If not None, it is the cuts for mixing. - dynamic_bucketing: - True to use DynamicBucketingSampler; - False to use BucketingSampler. - on_the_fly_feats: - True to use OnTheFlyFeatures; - False to use PrecomputedFeatures. - """ - transforms = [] - if cuts_musan is not None: - logging.info("Enable MUSAN") - transforms.append( - CutMix( - cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True - ) - ) - else: - logging.info("Disable MUSAN") - - input_transforms = [] - - if self.args.enable_spec_aug: - logging.info("Enable SpecAugment") - logging.info( - f"Time warp factor: {self.args.spec_aug_time_warp_factor}" - ) - # Set the value of num_frame_masks according to Lhotse's version. - # In different Lhotse's versions, the default of num_frame_masks is - # different. - num_frame_masks = 10 - num_frame_masks_parameter = inspect.signature( - SpecAugment.__init__ - ).parameters["num_frame_masks"] - if num_frame_masks_parameter.default == 1: - num_frame_masks = 2 - logging.info(f"Num frame mask: {num_frame_masks}") - input_transforms.append( - SpecAugment( - time_warp_factor=self.args.spec_aug_time_warp_factor, - num_frame_masks=num_frame_masks, - features_mask_size=27, - num_feature_masks=2, - frames_mask_size=100, - ) - ) - else: - logging.info("Disable SpecAugment") - - logging.info("About to create train dataset") - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - # NOTE: the PerturbSpeed transform should be added only if we - # remove it from data prep stage. - # Add on-the-fly speed perturbation; since originally it would - # have increased epoch size by 3, we will apply prob 2/3 and use - # 3x more epochs. - # Speed perturbation probably should come first before - # concatenation, but in principle the transforms order doesn't have - # to be strict (e.g. could be randomized) - # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa - # Drop feats to be on the safe side. - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=( - OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) - if on_the_fly_feats - else PrecomputedFeatures() - ), - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - if dynamic_bucketing: - logging.info("Using DynamicBucketingSampler.") - train_sampler = DynamicBucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - drop_last=True, - ) - else: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - bucket_method="equal_duration", - drop_last=True, - ) - - logging.info("About to create train dataloader") - - # 'seed' is derived from the current random state, which will have - # previously been set in the main process. 
- seed = torch.randint(0, 100000, ()).item() - worker_init_fn = _SeedWorkers(seed) - - train_dl = DataLoader( - train, - sampler=train_sampler, - batch_size=None, - num_workers=self.args.num_workers, - persistent_workers=False, - worker_init_fn=worker_init_fn, - ) - return train_dl - - def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: - transforms = [] - - logging.info("About to create dev dataset") - if self.args.on_the_fly_feats: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures( - Fbank(FbankConfig(num_mel_bins=80)) - ), - return_cuts=self.args.return_cuts, - ) - else: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - return_cuts=self.args.return_cuts, - ) - valid_sampler = BucketingSampler( - cuts_valid, - max_duration=self.args.max_duration, - shuffle=False, - ) - logging.info("About to create dev dataloader") - valid_dl = DataLoader( - validate, - sampler=valid_sampler, - batch_size=None, - num_workers=2, - persistent_workers=False, - ) - - return valid_dl - - def test_dataloaders(self, cuts: CutSet) -> DataLoader: - logging.debug("About to create test dataset") - test = K2SpeechRecognitionDataset( - input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) - if self.args.on_the_fly_feats - else PrecomputedFeatures(), - return_cuts=self.args.return_cuts, - ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False - ) - logging.debug("About to create test dataloader") - test_dl = DataLoader( - test, - batch_size=None, - sampler=sampler, - num_workers=self.args.num_workers, - ) - return test_dl diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py new file mode 120000 index 000000000..3ba9ada4f --- /dev/null +++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py @@ -0,0 +1 @@ +../pruned_transducer_stateless3/asr_datamodule.py \ No newline at end of file diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py deleted file mode 100644 index 286771d7d..000000000 --- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright 2021 Piotr Żelasko -# 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
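With the BucketingSampler branch gone, every recipe converges on a single sampler type; a minimal sketch of the shared configuration (max_duration and num_buckets mirror the defaults above):

    from lhotse import load_manifest_lazy
    from lhotse.dataset import DynamicBucketingSampler

    cuts_train = load_manifest_lazy(
        "data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
    )
    train_sampler = DynamicBucketingSampler(
        cuts_train,
        max_duration=200.0,  # total seconds of audio per batch
        shuffle=True,
        num_buckets=30,      # the --num-buckets default
        drop_last=True,
    )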
- - -import logging -from pathlib import Path - -from lhotse import CutSet, load_manifest - - -class GigaSpeech: - def __init__(self, manifest_dir: str): - """ - Args: - manifest_dir: - It is expected to contain the following files:: - - - cuts_XL_raw.jsonl.gz - - cuts_L_raw.jsonl.gz - - cuts_M_raw.jsonl.gz - - cuts_S_raw.jsonl.gz - - cuts_XS_raw.jsonl.gz - - cuts_DEV_raw.jsonl.gz - - cuts_TEST_raw.jsonl.gz - """ - self.manifest_dir = Path(manifest_dir) - - def train_XL_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_XL_raw.jsonl.gz" - logging.info(f"About to get train-XL cuts from {f}") - return CutSet.from_jsonl_lazy(f) - - def train_L_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_L_raw.jsonl.gz" - logging.info(f"About to get train-L cuts from {f}") - return CutSet.from_jsonl_lazy(f) - - def train_M_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_M_raw.jsonl.gz" - logging.info(f"About to get train-M cuts from {f}") - return CutSet.from_jsonl_lazy(f) - - def train_S_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_S_raw.jsonl.gz" - logging.info(f"About to get train-S cuts from {f}") - return CutSet.from_jsonl_lazy(f) - - def train_XS_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_XS_raw.jsonl.gz" - logging.info(f"About to get train-XS cuts from {f}") - return CutSet.from_jsonl_lazy(f) - - def test_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_TEST.jsonl.gz" - logging.info(f"About to get TEST cuts from {f}") - return load_manifest(f) - - def dev_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_DEV.jsonl.gz" - logging.info(f"About to get DEV cuts from {f}") - return load_manifest(f) diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py new file mode 120000 index 000000000..5242c652a --- /dev/null +++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py @@ -0,0 +1 @@ +../pruned_transducer_stateless3/gigaspeech.py \ No newline at end of file diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py deleted file mode 100644 index 00b7c8334..000000000 --- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2021 Piotr Żelasko -# 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
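The deleted GigaSpeech helper above survives via the symlink to pruned_transducer_stateless3/gigaspeech.py, including its lazy raw-cut loading; combined with the .repeat(times=None) call added in train.py below, the XL subset can be streamed indefinitely. A sketch, assuming the renamed raw manifest:

    from lhotse import CutSet

    cuts_xl = CutSet.from_jsonl_lazy(
        "data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz"
    )
    # times=None cycles the lazy cut set forever, which suits
    # interleaving GigaSpeech with LibriSpeech during training.
    cuts_xl = cuts_xl.repeat(times=None)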
- -import logging -from pathlib import Path - -from lhotse import CutSet, load_manifest - - -class LibriSpeech: - def __init__(self, manifest_dir: str): - """ - Args: - manifest_dir: - It is expected to contain the following files:: - - - cuts_dev-clean.json.gz - - cuts_dev-other.json.gz - - cuts_test-clean.json.gz - - cuts_test-other.json.gz - - cuts_train-clean-100.json.gz - - cuts_train-clean-360.json.gz - - cuts_train-other-500.json.gz - """ - self.manifest_dir = Path(manifest_dir) - - def train_clean_100_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-clean-100.json.gz" - logging.info(f"About to get train-clean-100 cuts from {f}") - return load_manifest(f) - - def train_clean_360_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-clean-360.json.gz" - logging.info(f"About to get train-clean-360 cuts from {f}") - return load_manifest(f) - - def train_other_500_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-other-500.json.gz" - logging.info(f"About to get train-other-500 cuts from {f}") - return load_manifest(f) - - def test_clean_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test-clean.json.gz" - logging.info(f"About to get test-clean cuts from {f}") - return load_manifest(f) - - def test_other_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test-other.json.gz" - logging.info(f"About to get test-other cuts from {f}") - return load_manifest(f) - - def dev_clean_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_dev-clean.json.gz" - logging.info(f"About to get dev-clean cuts from {f}") - return load_manifest(f) - - def dev_other_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_dev-other.json.gz" - logging.info(f"About to get dev-other cuts from {f}") - return load_manifest(f) diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py new file mode 120000 index 000000000..b76723bf5 --- /dev/null +++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py @@ -0,0 +1 @@ +../pruned_transducer_stateless3/librispeech.py \ No newline at end of file diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py index e1833b841..3b51ff9bc 100755 --- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py +++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py @@ -28,7 +28,7 @@ from pathlib import Path from asr_datamodule import AsrDataModule from gigaspeech import GigaSpeech -from lhotse import load_manifest +from lhotse import load_manifest_lazy from librispeech import LibriSpeech @@ -41,8 +41,8 @@ def test_dataset(): print(args) if args.enable_musan: - cuts_musan = load_manifest( - Path(args.manifest_dir) / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + Path(args.manifest_dir) / "musan_cuts.jsonl.gz" ) else: cuts_musan = None @@ -57,14 +57,12 @@ def test_dataset(): libri_train_dl = asr_datamodule.train_dataloaders( train_clean_100, - dynamic_bucketing=False, on_the_fly_feats=False, cuts_musan=cuts_musan, ) giga_train_dl = asr_datamodule.train_dataloaders( train_S, - dynamic_bucketing=True, on_the_fly_feats=True, cuts_musan=cuts_musan, ) diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py index 5572d3f4c..217fdb39a 100755 --- 
a/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py +++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py @@ -73,7 +73,7 @@ from conformer import Conformer from decoder import Decoder from gigaspeech import GigaSpeech from joiner import Joiner -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy from lhotse.cut import Cut from lhotse.utils import fix_random_seed from librispeech import LibriSpeech @@ -662,19 +662,17 @@ def train_one_epoch( def filter_short_and_long_utterances(cuts: CutSet) -> CutSet: def remove_short_and_long_utt(c: Cut): # Keep only utterances with duration between 1 second and 20 seconds + # + # Caution: There is a reason to select 20.0 here. Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold return 1.0 <= c.duration <= 20.0 - num_in_total = len(cuts) cuts = cuts.filter(remove_short_and_long_utt) - num_left = len(cuts) - num_removed = num_in_total - num_left - removed_percent = num_removed / num_in_total * 100 - - logging.info(f"Before removing short and long utterances: {num_in_total}") - logging.info(f"After removing short and long utterances: {num_left}") - logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)") - return cuts @@ -767,17 +765,18 @@ def run(rank, world_size, args): # DEV 12 hours # Test 40 hours if params.full_libri: - logging.info("Using the L subset of GigaSpeech (2.5k hours)") - train_giga_cuts = gigaspeech.train_L_cuts() + logging.info("Using the XL subset of GigaSpeech (10k hours)") + train_giga_cuts = gigaspeech.train_XL_cuts() else: logging.info("Using the S subset of GigaSpeech (250 hours)") train_giga_cuts = gigaspeech.train_S_cuts() train_giga_cuts = filter_short_and_long_utterances(train_giga_cuts) + train_giga_cuts = train_giga_cuts.repeat(times=None) if args.enable_musan: - cuts_musan = load_manifest( - Path(args.manifest_dir) / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + Path(args.manifest_dir) / "musan_cuts.jsonl.gz" ) else: cuts_musan = None @@ -786,14 +785,12 @@ def run(rank, world_size, args): train_dl = asr_datamodule.train_dataloaders( train_cuts, - dynamic_bucketing=False, on_the_fly_feats=False, cuts_musan=cuts_musan, ) giga_train_dl = asr_datamodule.train_dataloaders( train_giga_cuts, - dynamic_bucketing=True, on_the_fly_feats=True, cuts_musan=cuts_musan, ) diff --git a/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py index f165f6e60..a674d5527 100644 --- a/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -22,7 +22,7 @@ from pathlib import Path from typing import Any, Dict, Optional import torch -from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( CutConcatenate, CutMix, @@ -176,7 +176,7 @@ class SPGISpeechAsrDataModule: The state dict for the training sampler. 
""" logging.info("About to get Musan cuts") - cuts_musan = load_manifest( + cuts_musan = load_manifest_lazy( self.args.manifest_dir / "cuts_musan.jsonl.gz" ) diff --git a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py index 14200f34f..78351d77c 100755 --- a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py +++ b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py @@ -52,8 +52,13 @@ def compute_fbank_tedlium(): "test", ) + prefix = "tedlium" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="tedlium", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -61,7 +66,7 @@ def compute_fbank_tedlium(): with get_executor() as ex: # Initialize the executor only once. for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") @@ -80,7 +85,7 @@ def compute_fbank_tedlium(): cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=cur_num_jobs, executor=ex, @@ -88,7 +93,7 @@ def compute_fbank_tedlium(): ) # Split long cuts into many short and un-overlapping cuts cut_set = cut_set.trim_to_supervisions(keep_overlapping=False) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}") if __name__ == "__main__": diff --git a/egs/tedlium3/ASR/local/display_manifest_statistics.py b/egs/tedlium3/ASR/local/display_manifest_statistics.py index 972d03b12..52e152389 100755 --- a/egs/tedlium3/ASR/local/display_manifest_statistics.py +++ b/egs/tedlium3/ASR/local/display_manifest_statistics.py @@ -27,15 +27,15 @@ for usage. 
""" -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): - path = "./data/fbank/cuts_train.json.gz" - path = "./data/fbank/cuts_dev.json.gz" - path = "./data/fbank/cuts_test.json.gz" + path = "./data/fbank/tedlium_cuts_train.jsonl.gz" + path = "./data/fbank/tedlium_cuts_dev.jsonl.gz" + path = "./data/fbank/tedlium_cuts_test.jsonl.gz" - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() diff --git a/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py b/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py index a6b986a94..ae22bfd92 100644 --- a/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py +++ b/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py @@ -22,11 +22,11 @@ import logging from functools import lru_cache from pathlib import Path -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( - BucketingSampler, CutConcatenate, CutMix, + DynamicBucketingSampler, K2SpeechRecognitionDataset, PrecomputedFeatures, SingleCutSampler, @@ -92,7 +92,7 @@ class TedLiumAsrDataModule: "--num-buckets", type=int, default=30, - help="The number of buckets for the BucketingSampler" + help="The number of buckets for the DynamicBucketingSampler" "(you might want to increase it for larger datasets).", ) group.add_argument( @@ -179,8 +179,8 @@ class TedLiumAsrDataModule: transforms = [] if self.args.enable_musan: logging.info("Enable MUSAN") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms.append( CutMix( @@ -261,13 +261,12 @@ class TedLiumAsrDataModule: ) if self.args.bucketing_sampler: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( cuts_train, max_duration=self.args.max_duration, shuffle=self.args.shuffle, num_buckets=self.args.num_buckets, - bucket_method="equal_duration", drop_last=True, ) else: @@ -311,7 +310,7 @@ class TedLiumAsrDataModule: cut_transforms=transforms, return_cuts=self.args.return_cuts, ) - valid_sampler = BucketingSampler( + valid_sampler = DynamicBucketingSampler( cuts_valid, max_duration=self.args.max_duration, shuffle=False, @@ -335,8 +334,10 @@ class TedLiumAsrDataModule: else PrecomputedFeatures(), return_cuts=self.args.return_cuts, ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, ) logging.debug("About to create test dataloader") test_dl = DataLoader( @@ -350,14 +351,20 @@ class TedLiumAsrDataModule: @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - return load_manifest(self.args.manifest_dir / "cuts_train.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "tedlium_cuts_train.jsonl.gz" + ) @lru_cache() def dev_cuts(self) -> CutSet: logging.info("About to get dev cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "tedlium_cuts_dev.jsonl.gz" + ) @lru_cache() def test_cuts(self) -> CutSet: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_test.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "tedlium_cuts_test.jsonl.gz" + ) diff --git 
a/egs/timit/ASR/local/compute_fbank_timit.py b/egs/timit/ASR/local/compute_fbank_timit.py index 8e3cbac4e..094769c8c 100644 --- a/egs/timit/ASR/local/compute_fbank_timit.py +++ b/egs/timit/ASR/local/compute_fbank_timit.py @@ -29,7 +29,7 @@ import os from pathlib import Path import torch -from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -53,8 +53,13 @@ def compute_fbank_timit(): "DEV", "TEST", ) + prefix = "timit" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="timit", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -62,7 +67,8 @@ def compute_fbank_timit(): with get_executor() as ex: # Initialize the executor only once. for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}" + if cuts_file.is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") @@ -78,13 +84,13 @@ def compute_fbank_timit(): ) cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=num_jobs if ex is None else 80, executor=ex, - storage_type=LilcomHdf5Writer, + storage_type=LilcomChunkyWriter, ) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + cut_set.to_file(cuts_file) if __name__ == "__main__": diff --git a/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py index a7029f514..665b5a771 100644 --- a/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py +++ b/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py @@ -23,11 +23,11 @@ from functools import lru_cache from pathlib import Path from typing import List, Union -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( - BucketingSampler, CutConcatenate, CutMix, + DynamicBucketingSampler, K2SpeechRecognitionDataset, PrecomputedFeatures, SingleCutSampler, @@ -92,7 +92,7 @@ class TimitAsrDataModule(DataModule): "--num-buckets", type=int, default=30, - help="The number of buckets for the BucketingSampler" + help="The number of buckets for the DynamicBucketingSampler" "(you might want to increase it for larger datasets).", ) group.add_argument( @@ -154,7 +154,9 @@ class TimitAsrDataModule(DataModule): cuts_train = self.train_cuts() logging.info("About to get Musan cuts") - cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz") + cuts_musan = load_manifest_lazy( + self.args.feature_dir / "cuts_musan.jsonl.gz" + ) logging.info("About to create train dataset") transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))] @@ -218,13 +220,12 @@ class TimitAsrDataModule(DataModule): ) if self.args.bucketing_sampler: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( cuts_train, max_duration=self.args.max_duration, shuffle=self.args.shuffle, num_buckets=self.args.num_buckets, - bucket_method="equal_duration", drop_last=True, ) else: @@ -322,20 
+323,26 @@ class TimitAsrDataModule(DataModule): @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - cuts_train = load_manifest(self.args.feature_dir / "cuts_TRAIN.json.gz") + cuts_train = load_manifest_lazy( + self.args.feature_dir / "timit_cuts_TRAIN.jsonl.gz" + ) return cuts_train @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") - cuts_valid = load_manifest(self.args.feature_dir / "cuts_DEV.json.gz") + cuts_valid = load_manifest_lazy( + self.args.feature_dir / "timit_cuts_DEV.jsonl.gz" + ) return cuts_valid @lru_cache() def test_cuts(self) -> CutSet: logging.debug("About to get test cuts") - cuts_test = load_manifest(self.args.feature_dir / "cuts_TEST.json.gz") + cuts_test = load_manifest_lazy( + self.args.feature_dir / "timit_cuts_TEST.jsonl.gz" + ) return cuts_test diff --git a/egs/wenetspeech/ASR/local/display_manifest_statistics.py b/egs/wenetspeech/ASR/local/display_manifest_statistics.py index 30dc5a5ec..c41445b8d 100644 --- a/egs/wenetspeech/ASR/local/display_manifest_statistics.py +++ b/egs/wenetspeech/ASR/local/display_manifest_statistics.py @@ -26,7 +26,7 @@ for usage. """ -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): @@ -40,7 +40,7 @@ def main(): for path in paths: print(f"Starting display the statistics for {path}") - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() diff --git a/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py index d2f8d85ce..6aebc2164 100644 --- a/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -27,7 +27,7 @@ from lhotse import ( CutSet, Fbank, FbankConfig, - load_manifest, + load_manifest_lazy, set_caching_enabled, ) from lhotse.dataset import ( @@ -218,8 +218,8 @@ class WenetSpeechAsrDataModule: The state dict for the training sampler. 
""" logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms = [] @@ -435,16 +435,18 @@ class WenetSpeechAsrDataModule: @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") - return load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz") + return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz") @lru_cache() def test_net_cuts(self) -> List[CutSet]: logging.info("About to get TEST_NET cuts") - return load_manifest(self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz") + return load_manifest_lazy( + self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz" + ) @lru_cache() def test_meeting_cuts(self) -> List[CutSet]: logging.info("About to get TEST_MEETING cuts") - return load_manifest( + return load_manifest_lazy( self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz" ) diff --git a/egs/yesno/ASR/local/compute_fbank_yesno.py b/egs/yesno/ASR/local/compute_fbank_yesno.py index 6922ffe10..fb48b6f8e 100755 --- a/egs/yesno/ASR/local/compute_fbank_yesno.py +++ b/egs/yesno/ASR/local/compute_fbank_yesno.py @@ -12,7 +12,7 @@ import os from pathlib import Path import torch -from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -37,10 +37,13 @@ def compute_fbank_yesno(): "train", "test", ) + prefix = "yesno" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( dataset_parts=dataset_parts, output_dir=src_dir, - prefix="yesno", + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -50,7 +53,8 @@ def compute_fbank_yesno(): with get_executor() as ex: # Initialize the executor only once. 
diff --git a/egs/yesno/ASR/tdnn/asr_datamodule.py b/egs/yesno/ASR/tdnn/asr_datamodule.py
index 0a5a42089..85e5f1358 100644
--- a/egs/yesno/ASR/tdnn/asr_datamodule.py
+++ b/egs/yesno/ASR/tdnn/asr_datamodule.py
@@ -20,18 +20,19 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List
 
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse.dataset import (
+    CutConcatenate,
+    DynamicBucketingSampler,
+    K2SpeechRecognitionDataset,
+    PrecomputedFeatures,
+    SingleCutSampler,
+)
+from lhotse.dataset.input_strategies import OnTheFlyFeatures
 from torch.utils.data import DataLoader
 
 from icefall.dataset.datamodule import DataModule
 from icefall.utils import str2bool
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
-from lhotse.dataset import (
-    BucketingSampler,
-    CutConcatenate,
-    K2SpeechRecognitionDataset,
-    PrecomputedFeatures,
-)
-from lhotse.dataset.input_strategies import OnTheFlyFeatures
 
 
 class YesNoAsrDataModule(DataModule):
@@ -84,7 +85,7 @@ class YesNoAsrDataModule(DataModule):
             "--num-buckets",
             type=int,
             default=10,
-            help="The number of buckets for the BucketingSampler"
+            help="The number of buckets for the DynamicBucketingSampler"
             "(you might want to increase it for larger datasets).",
        )
         group.add_argument(
@@ -186,18 +187,17 @@ class YesNoAsrDataModule(DataModule):
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
             logging.info("Using SingleCutSampler.")
-            train_sampler = BucketingSampler(
+            train_sampler = SingleCutSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
@@ -225,8 +225,10 @@ class YesNoAsrDataModule(DataModule):
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts_test, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts_test,
+            max_duration=self.args.max_duration,
+            shuffle=False,
         )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
@@ -240,11 +242,15 @@ class YesNoAsrDataModule(DataModule):
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest(self.args.feature_dir / "cuts_train.json.gz")
+        cuts_train = load_manifest_lazy(
+            self.args.feature_dir / "yesno_cuts_train.jsonl.gz"
+        )
 
         return cuts_train
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        cuts_test = load_manifest(self.args.feature_dir / "cuts_test.json.gz")
+        cuts_test = load_manifest_lazy(
+            self.args.feature_dir / "yesno_cuts_test.jsonl.gz"
+        )
 
         return cuts_test
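Two things happen in the sampler hunks above: the else branch, which logged "Using SingleCutSampler." but built a BucketingSampler, now builds the sampler it advertises, and BucketingSampler is replaced everywhere by DynamicBucketingSampler, which batches by total duration while streaming a lazy CutSet. A sketch with illustrative settings rather than the recipes' tuned values:

# Sketch: duration-constrained dynamic bucketing over a lazy manifest.
# The path, max_duration, and num_buckets below are illustrative.
from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler

cuts_train = load_manifest_lazy("data/fbank/yesno_cuts_train.jsonl.gz")
train_sampler = DynamicBucketingSampler(
    cuts_train,
    max_duration=200.0,  # seconds of audio per batch
    shuffle=True,
    num_buckets=10,
    drop_last=True,
)
# Note: the old bucket_method="equal_duration" knob is gone; the dynamic
# sampler estimates bucket boundaries from a sample of cut durations.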
cuts") - cuts_test = load_manifest(self.args.feature_dir / "cuts_test.json.gz") + cuts_test = load_manifest_lazy( + self.args.feature_dir / "yesno_cuts_test.jsonl.gz" + ) return cuts_test diff --git a/icefall/utils.py b/icefall/utils.py index c9045006d..b38574f0c 100644 --- a/icefall/utils.py +++ b/icefall/utils.py @@ -131,7 +131,6 @@ def setup_logger( format=formatter, level=level, filemode="w", - force=True, ) if use_console: console = logging.StreamHandler()