diff --git a/.github/workflows/run-gigaspeech-2022-05-13.yml b/.github/workflows/run-gigaspeech-2022-05-13.yml index d250b72b0..dc33751d3 100644 --- a/.github/workflows/run-gigaspeech-2022-05-13.yml +++ b/.github/workflows/run-gigaspeech-2022-05-13.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache diff --git a/.github/workflows/run-librispeech-2022-03-12.yml b/.github/workflows/run-librispeech-2022-03-12.yml index b18b84378..291f2bc71 100644 --- a/.github/workflows/run-librispeech-2022-03-12.yml +++ b/.github/workflows/run-librispeech-2022-03-12.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -99,7 +101,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-librispeech-2022-04-29.yml b/.github/workflows/run-librispeech-2022-04-29.yml index 6c8188b48..b04718f86 100644 --- a/.github/workflows/run-librispeech-2022-04-29.yml +++ b/.github/workflows/run-librispeech-2022-04-29.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -99,7 +101,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-librispeech-2022-05-13.yml b/.github/workflows/run-librispeech-2022-05-13.yml index 2290e18d4..bb3d74e55 100644 --- a/.github/workflows/run-librispeech-2022-05-13.yml +++ b/.github/workflows/run-librispeech-2022-05-13.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -99,7 +101,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml index 512f1b334..47976fc2c 100644 --- a/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml +++ b/.github/workflows/run-librispeech-pruned-transducer-stateless3-2022-05-13.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -99,7 +101,7 @@ jobs: with: path: | 
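Note on the CI changes above: `pip install --no-binary protobuf protobuf` forces a from-source (pure-Python) protobuf build to sidestep binary-wheel incompatibilities, and the `-v2` suffix on the cache key invalidates previously cached fbank features so they are regenerated under the new manifest naming introduced below. A quick way to check which protobuf backend actually got installed — a sketch, not part of the patch:

```python
# Not part of the patch: verify the protobuf backend after reinstalling
# with --no-binary (api_implementation is internal but long-standing).
from google.protobuf.internal import api_implementation

# "python" for the pure-Python source build; "cpp" or "upb" for binary wheels.
print(api_implementation.Type())
```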
~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml index 3864f4aa3..e05b04bee 100644 --- a/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml +++ b/.github/workflows/run-librispeech-transducer-stateless2-2022-04-19.yml @@ -59,6 +59,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -99,7 +101,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml index f77d9e658..348a68095 100644 --- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml +++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-100h.yml @@ -58,6 +58,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -98,7 +100,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml index ddfa62073..d1369c2b1 100644 --- a/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml +++ b/.github/workflows/run-pretrained-transducer-stateless-librispeech-multi-datasets.yml @@ -58,6 +58,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -98,7 +100,7 @@ jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/.github/workflows/run-pretrained-transducer-stateless.yml b/.github/workflows/run-pretrained-transducer-stateless.yml index cdea78a88..78c1ca059 100644 --- a/.github/workflows/run-pretrained-transducer-stateless.yml +++ b/.github/workflows/run-pretrained-transducer-stateless.yml @@ -58,6 +58,8 @@ jobs: - name: Install Python dependencies run: | grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install + pip uninstall -y protobuf + pip install --no-binary protobuf protobuf - name: Cache kaldifeat id: my-cache @@ -98,7 +100,7 @@ 
jobs: with: path: | ~/tmp/fbank-libri - key: cache-libri-fbank-test-clean-and-test-other + key: cache-libri-fbank-test-clean-and-test-other-v2 - name: Compute fbank for LibriSpeech test-clean and test-other if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true' diff --git a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py index 5d00edeca..faebff2f6 100755 --- a/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py +++ b/egs/aidatatang_200zh/ASR/local/compute_fbank_aidatatang_200zh.py @@ -43,7 +43,7 @@ torch.set_num_interop_threads(1) def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80): - src_dir = Path("data/manifests/aidatatang_200zh") + src_dir = Path("data/manifests") output_dir = Path("data/fbank") num_jobs = min(15, os.cpu_count()) @@ -52,11 +52,13 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80): "dev", "test", ) + prefix = "aidatatang" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="aidatatang", - suffix="jsonl.gz", dataset_parts=dataset_parts, output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -64,10 +66,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80): with get_executor() as ex: # Initialize the executor only once. for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") + + for sup in m["supervisions"]: + sup.custom = {"origin": "aidatatang_200zh"} + cut_set = CutSet.from_manifests( recordings=m["recordings"], supervisions=m["supervisions"], @@ -80,13 +86,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80): ) cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=num_jobs if ex is None else 80, executor=ex, storage_type=ChunkedLilcomHdf5Writer, ) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + + cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}") def get_args(): diff --git a/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py b/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py index 2352785ac..d66e5cfca 100644 --- a/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py +++ b/egs/aidatatang_200zh/ASR/local/display_manifest_statistics.py @@ -25,19 +25,19 @@ for usage. 
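The loop above tags every supervision with its corpus of origin. This matters later in the patch, when aidatatang_200zh cuts are mixed into aishell training and downstream code may need to tell the corpora apart. A minimal sketch of reading the tag back, using the manifest path this script writes:

```python
# Sketch: read back the origin tag written by compute_fbank_aidatatang_200zh.py.
from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/aidatatang_cuts_train.jsonl.gz")
for cut in cuts:
    # Every supervision was tagged above with sup.custom = {"origin": ...}.
    print(cut.supervisions[0].custom["origin"])  # -> "aidatatang_200zh"
    break
```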
""" -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): paths = [ - "./data/fbank/cuts_train.json.gz", - "./data/fbank/cuts_dev.json.gz", - "./data/fbank/cuts_test.json.gz", + "./data/fbank/aidatatang_cuts_train.jsonl.gz", + "./data/fbank/aidatatang_cuts_dev.jsonl.gz", + "./data/fbank/aidatatang_cuts_test.jsonl.gz", ] for path in paths: print(f"Starting display the statistics for {path}") - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() @@ -45,7 +45,7 @@ if __name__ == "__main__": main() """ -Starting display the statistics for ./data/fbank/cuts_train.json.gz +Starting display the statistics for ./data/fbank/aidatatang_cuts_train.jsonl.gz Cuts count: 494715 Total duration (hours): 422.6 Speech duration (hours): 422.6 (100.0%) @@ -61,7 +61,7 @@ min 1.0 99.5% 8.0 99.9% 9.5 max 18.1 -Starting display the statistics for ./data/fbank/cuts_dev.json.gz +Starting display the statistics for ./data/fbank/aidatatang_cuts_dev.jsonl.gz Cuts count: 24216 Total duration (hours): 20.2 Speech duration (hours): 20.2 (100.0%) @@ -77,7 +77,7 @@ min 1.2 99.5% 7.3 99.9% 8.8 max 11.3 -Starting display the statistics for ./data/fbank/cuts_test.json.gz +Starting display the statistics for ./data/fbank/aidatatang_cuts_test.jsonl.gz Cuts count: 48144 Total duration (hours): 40.2 Speech duration (hours): 40.2 (100.0%) diff --git a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py index 447a011cb..728f7e3d0 100644 --- a/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/aidatatang_200zh/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -27,11 +27,10 @@ from lhotse import ( CutSet, Fbank, FbankConfig, - load_manifest, + load_manifest_lazy, set_caching_enabled, ) from lhotse.dataset import ( - BucketingSampler, CutConcatenate, CutMix, DynamicBucketingSampler, @@ -205,8 +204,8 @@ class Aidatatang_200zhAsrDataModule: The state dict for the training sampler. 
""" logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms = [] @@ -290,13 +289,12 @@ class Aidatatang_200zhAsrDataModule: ) if self.args.bucketing_sampler: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( cuts_train, max_duration=self.args.max_duration, shuffle=self.args.shuffle, num_buckets=self.args.num_buckets, - bucket_method="equal_duration", drop_last=True, ) else: @@ -402,14 +400,20 @@ class Aidatatang_200zhAsrDataModule: @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - return load_manifest(self.args.manifest_dir / "cuts_train.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz" + ) @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz" + ) @lru_cache() def test_cuts(self) -> List[CutSet]: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_test.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz" + ) diff --git a/egs/aishell/ASR/conformer_ctc/train.py b/egs/aishell/ASR/conformer_ctc/train.py index 369ad310f..a228cc1fe 100755 --- a/egs/aishell/ASR/conformer_ctc/train.py +++ b/egs/aishell/ASR/conformer_ctc/train.py @@ -195,9 +195,9 @@ def get_params() -> AttributeDict: "best_train_epoch": -1, "best_valid_epoch": -1, "batch_idx_train": 0, - "log_interval": 10, + "log_interval": 50, "reset_interval": 200, - "valid_interval": 3000, + "valid_interval": 2000, # parameters for k2.ctc_loss "beam_size": 10, "reduction": "sum", diff --git a/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py b/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py new file mode 100755 index 000000000..8cdfad71f --- /dev/null +++ b/egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +This file computes fbank features of the aidatatang_200zh dataset. +It looks for manifests in the directory data/manifests. + +The generated fbank features are saved in data/fbank. +""" + +import argparse +import logging +import os +from pathlib import Path + +import torch +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter +from lhotse.recipes.utils import read_manifests_if_cached + +from icefall.utils import get_executor + +# Torch's multithreaded behavior needs to be disabled or +# it wastes a lot of CPU and slow things down. 
+# Do this outside of main() in case it needs to take effect +# even when we are not invoking the main (e.g. when spawning subprocesses). +torch.set_num_threads(1) +torch.set_num_interop_threads(1) + + +def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80): + src_dir = Path("data/manifests") + output_dir = Path("data/fbank") + num_jobs = min(15, os.cpu_count()) + + dataset_parts = ( + "train", + "test", + "dev", + ) + prefix = "aidatatang" + suffix = "jsonl.gz" + manifests = read_manifests_if_cached( + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, + ) + assert manifests is not None + + extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins)) + + with get_executor() as ex: # Initialize the executor only once. + for partition, m in manifests.items(): + if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file(): + logging.info(f"{partition} already exists - skipping.") + continue + logging.info(f"Processing {partition}") + + for sup in m["supervisions"]: + sup.custom = {"origin": "aidatatang_200zh"} + + cut_set = CutSet.from_manifests( + recordings=m["recordings"], + supervisions=m["supervisions"], + ) + if "train" in partition: + cut_set = ( + cut_set + + cut_set.perturb_speed(0.9) + + cut_set.perturb_speed(1.1) + ) + cut_set = cut_set.compute_and_store_features( + extractor=extractor, + storage_path=f"{output_dir}/{prefix}_feats_{partition}", + # when an executor is specified, make more partitions + num_jobs=num_jobs if ex is None else 80, + executor=ex, + storage_type=LilcomChunkyWriter, + ) + + cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}") + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--num-mel-bins", + type=int, + default=80, + help="""The number of mel bins for Fbank""", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + formatter = ( + "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + ) + + logging.basicConfig(format=formatter, level=logging.INFO) + + args = get_args() + compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins) diff --git a/egs/aishell/ASR/local/compute_fbank_aishell.py b/egs/aishell/ASR/local/compute_fbank_aishell.py index 70dee81d8..e27e35ec5 100755 --- a/egs/aishell/ASR/local/compute_fbank_aishell.py +++ b/egs/aishell/ASR/local/compute_fbank_aishell.py @@ -29,7 +29,7 @@ import os from pathlib import Path import torch -from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -52,8 +52,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80): "dev", "test", ) + prefix = "aishell" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="aishell", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -61,7 +66,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80): with get_executor() as ex: # Initialize the executor only once. 
for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") @@ -77,13 +82,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80): ) cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=num_jobs if ex is None else 80, executor=ex, - storage_type=LilcomHdf5Writer, + storage_type=LilcomChunkyWriter, ) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}") def get_args(): diff --git a/egs/aishell/ASR/local/display_manifest_statistics.py b/egs/aishell/ASR/local/display_manifest_statistics.py index 0ae731a1d..c478f7331 100755 --- a/egs/aishell/ASR/local/display_manifest_statistics.py +++ b/egs/aishell/ASR/local/display_manifest_statistics.py @@ -25,18 +25,18 @@ for usage. """ -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): - # path = "./data/fbank/cuts_train.json.gz" - # path = "./data/fbank/cuts_test.json.gz" - # path = "./data/fbank/cuts_dev.json.gz" - # path = "./data/fbank/aidatatang_200zh/cuts_train_raw.jsonl.gz" - # path = "./data/fbank/aidatatang_200zh/cuts_test_raw.jsonl.gz" - path = "./data/fbank/aidatatang_200zh/cuts_dev_raw.jsonl.gz" + # path = "./data/fbank/aishell_cuts_train.jsonl.gz" + # path = "./data/fbank/aishell_cuts_test.jsonl.gz" + path = "./data/fbank/aishell_cuts_dev.jsonl.gz" + # path = "./data/fbank/aidatatang_cuts_train.jsonl.gz" + # path = "./data/fbank/aidatatang_cuts_test.jsonl.gz" + # path = "./data/fbank/aidatatang_cuts_dev.jsonl.gz" - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() diff --git a/egs/aishell/ASR/local/process_aidatatang_200zh.py b/egs/aishell/ASR/local/process_aidatatang_200zh.py deleted file mode 100755 index ac2b86927..000000000 --- a/egs/aishell/ASR/local/process_aidatatang_200zh.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2022 Xiaomi Corp. (Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
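`LilcomChunkyWriter` replaces the HDF5-based writers throughout this patch (it produces the `.lca` files handled in the GigaSpeech split script below). Since each cut records its own storage backend and path, reading features back is unchanged; a sketch, assuming the aishell manifests written above:

```python
# Sketch: features written with LilcomChunkyWriter load transparently,
# because the storage type and path are recorded per cut in the manifest.
from lhotse import CutSet

cuts = CutSet.from_file("data/fbank/aishell_cuts_dev.jsonl.gz")
cut = next(iter(cuts))
feats = cut.load_features()  # numpy array of shape (num_frames, num_mel_bins)
print(feats.shape)
```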
- -import logging -from pathlib import Path - -from lhotse import CutSet -from lhotse.recipes.utils import read_manifests_if_cached - - -def preprocess_aidatatang_200zh(): - src_dir = Path("data/manifests/aidatatang_200zh") - output_dir = Path("data/fbank/aidatatang_200zh") - output_dir.mkdir(exist_ok=True, parents=True) - - dataset_parts = ( - "train", - "test", - "dev", - ) - - logging.info("Loading manifest") - manifests = read_manifests_if_cached( - dataset_parts=dataset_parts, output_dir=src_dir, prefix="aidatatang" - ) - assert len(manifests) > 0 - - for partition, m in manifests.items(): - logging.info(f"Processing {partition}") - raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz" - if raw_cuts_path.is_file(): - logging.info(f"{partition} already exists - skipping") - continue - - for sup in m["supervisions"]: - sup.custom = {"origin": "aidatatang_200zh"} - - cut_set = CutSet.from_manifests( - recordings=m["recordings"], - supervisions=m["supervisions"], - ) - - logging.info(f"Saving to {raw_cuts_path}") - cut_set.to_file(raw_cuts_path) - - -def main(): - formatter = ( - "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - ) - logging.basicConfig(format=formatter, level=logging.INFO) - - preprocess_aidatatang_200zh() - - -if __name__ == "__main__": - main() diff --git a/egs/aishell/ASR/prepare_aidatatang_200zh.sh b/egs/aishell/ASR/prepare_aidatatang_200zh.sh index 60b2060ec..f1d4d18a7 100755 --- a/egs/aishell/ASR/prepare_aidatatang_200zh.sh +++ b/egs/aishell/ASR/prepare_aidatatang_200zh.sh @@ -42,18 +42,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then log "Stage 1: Prepare manifest" # We assume that you have downloaded the aidatatang_200zh corpus # to $dl_dir/aidatatang_200zh - if [ ! -f data/manifests/aidatatang_200zh/.manifests.done ]; then - mkdir -p data/manifests/aidatatang_200zh - lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh - touch data/manifests/aidatatang_200zh/.manifests.done + if [ ! -f data/manifests/.aidatatang_200zh_manifests.done ]; then + mkdir -p data/manifests + lhotse prepare aidatatang-200zh $dl_dir data/manifests + touch data/manifests/.aidatatang_200zh_manifests.done fi fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then log "Stage 2: Process aidatatang_200zh" - if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then - mkdir -p data/fbank/aidatatang_200zh - lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh - touch data/fbank/aidatatang_200zh/.fbank.done + if [ ! 
-f data/fbank/.aidatatang_200zh_fbank.done ]; then + mkdir -p data/fbank + ./local/compute_fbank_aidatatang_200zh.py + touch data/fbank/.aidatatang_200zh_fbank.done fi fi diff --git a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py index 507db2933..e1021fda2 100644 --- a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py +++ b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py @@ -23,11 +23,11 @@ from functools import lru_cache from pathlib import Path from typing import List -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( - BucketingSampler, CutConcatenate, CutMix, + DynamicBucketingSampler, K2SpeechRecognitionDataset, PrecomputedFeatures, SingleCutSampler, @@ -93,7 +93,7 @@ class AishellAsrDataModule: "--num-buckets", type=int, default=30, - help="The number of buckets for the BucketingSampler" + help="The number of buckets for the DynamicBucketingSampler" "(you might want to increase it for larger datasets).", ) group.add_argument( @@ -133,6 +133,12 @@ class AishellAsrDataModule: help="When enabled (=default), the examples will be " "shuffled for each epoch.", ) + group.add_argument( + "--drop-last", + type=str2bool, + default=True, + help="Whether to drop last batch. Used by sampler.", + ) group.add_argument( "--return-cuts", type=str2bool, @@ -177,8 +183,8 @@ class AishellAsrDataModule: def train_dataloaders(self, cuts_train: CutSet) -> DataLoader: logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms = [] @@ -262,14 +268,13 @@ class AishellAsrDataModule: ) if self.args.bucketing_sampler: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( cuts_train, max_duration=self.args.max_duration, shuffle=self.args.shuffle, num_buckets=self.args.num_buckets, - bucket_method="equal_duration", - drop_last=True, + drop_last=self.args.drop_last, ) else: logging.info("Using SingleCutSampler.") @@ -313,7 +318,7 @@ class AishellAsrDataModule: cut_transforms=transforms, return_cuts=self.args.return_cuts, ) - valid_sampler = BucketingSampler( + valid_sampler = DynamicBucketingSampler( cuts_valid, max_duration=self.args.max_duration, shuffle=False, @@ -337,8 +342,10 @@ class AishellAsrDataModule: else PrecomputedFeatures(), return_cuts=self.args.return_cuts, ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, ) test_dl = DataLoader( test, @@ -351,17 +358,21 @@ class AishellAsrDataModule: @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - cuts_train = load_manifest( - self.args.manifest_dir / "cuts_train.json.gz" + cuts_train = load_manifest_lazy( + self.args.manifest_dir / "aishell_cuts_train.jsonl.gz" ) return cuts_train @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aishell_cuts_dev.jsonl.gz" + ) @lru_cache() def test_cuts(self) -> List[CutSet]: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / 
"cuts_test.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "aishell_cuts_test.jsonl.gz" + ) diff --git a/egs/aishell/ASR/tdnn_lstm_ctc/train.py b/egs/aishell/ASR/tdnn_lstm_ctc/train.py index 3327cdb79..7619b0551 100755 --- a/egs/aishell/ASR/tdnn_lstm_ctc/train.py +++ b/egs/aishell/ASR/tdnn_lstm_ctc/train.py @@ -15,6 +15,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Usage + export CUDA_VISIBLE_DEVICES="0,1,2,3" + ./tdnn_lstm_ctc/train.py \ + --world-size 4 \ + --num-epochs 20 \ + --max-duration 300 +""" import argparse import logging diff --git a/egs/aishell/ASR/transducer_stateless/conformer.py b/egs/aishell/ASR/transducer_stateless/conformer.py index 149df92ab..7e8dc4ace 100644 --- a/egs/aishell/ASR/transducer_stateless/conformer.py +++ b/egs/aishell/ASR/transducer_stateless/conformer.py @@ -110,9 +110,7 @@ class Conformer(Transformer): x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C) # Caution: We assume the subsampling factor is 4! - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - lengths = ((x_lens - 1) // 2 - 1) // 2 + lengths = (((x_lens - 1) >> 1) - 1) >> 1 assert x.size(0) == lengths.max().item() mask = make_pad_mask(lengths) diff --git a/egs/aishell/ASR/transducer_stateless/train.py b/egs/aishell/ASR/transducer_stateless/train.py index f615c78f4..21128318b 100755 --- a/egs/aishell/ASR/transducer_stateless/train.py +++ b/egs/aishell/ASR/transducer_stateless/train.py @@ -21,6 +21,7 @@ import argparse import logging +import warnings from pathlib import Path from shutil import copyfile from typing import Optional, Tuple @@ -386,7 +387,11 @@ def compute_loss( assert loss.requires_grad == is_training info = MetricsTracker() - info["frames"] = (feature_lens // params.subsampling_factor).sum().item() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + info["frames"] = ( + (feature_lens // params.subsampling_factor).sum().item() + ) # Note: We use reduction=sum while computing the loss. 
info["loss"] = loss.detach().cpu().item() diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py b/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py index 84ca64c89..26d4ee111 100644 --- a/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py +++ b/egs/aishell/ASR/transducer_stateless_modified-2/aidatatang_200zh.py @@ -18,7 +18,7 @@ import logging from pathlib import Path -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy class AIDatatang200zh: @@ -28,26 +28,26 @@ class AIDatatang200zh: manifest_dir: It is expected to contain the following files:: - - cuts_dev_raw.jsonl.gz - - cuts_train_raw.jsonl.gz - - cuts_test_raw.jsonl.gz + - aidatatang_cuts_dev.jsonl.gz + - aidatatang_cuts_train.jsonl.gz + - aidatatang_cuts_test.jsonl.gz """ self.manifest_dir = Path(manifest_dir) def train_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train_raw.jsonl.gz" + f = self.manifest_dir / "aidatatang_cuts_train.jsonl.gz" logging.info(f"About to get train cuts from {f}") - cuts_train = load_manifest(f) + cuts_train = load_manifest_lazy(f) return cuts_train def valid_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_valid_raw.jsonl.gz" + f = self.manifest_dir / "aidatatang_cuts_valid.jsonl.gz" logging.info(f"About to get valid cuts from {f}") - cuts_valid = load_manifest(f) + cuts_valid = load_manifest_lazy(f) return cuts_valid def test_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test_raw.jsonl.gz" + f = self.manifest_dir / "aidatatang_cuts_test.jsonl.gz" logging.info(f"About to get test cuts from {f}") - cuts_test = load_manifest(f) + cuts_test = load_manifest_lazy(f) return cuts_test diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py b/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py index 94d1da066..ddeca4d88 100644 --- a/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py +++ b/egs/aishell/ASR/transducer_stateless_modified-2/aishell.py @@ -18,7 +18,7 @@ import logging from pathlib import Path -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy class AIShell: @@ -28,26 +28,26 @@ class AIShell: manifest_dir: It is expected to contain the following files:: - - cuts_dev.json.gz - - cuts_train.json.gz - - cuts_test.json.gz + - aishell_cuts_dev.jsonl.gz + - aishell_cuts_train.jsonl.gz + - aishell_cuts_test.jsonl.gz """ self.manifest_dir = Path(manifest_dir) def train_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train.json.gz" + f = self.manifest_dir / "aishell_cuts_train.jsonl.gz" logging.info(f"About to get train cuts from {f}") - cuts_train = load_manifest(f) + cuts_train = load_manifest_lazy(f) return cuts_train def valid_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_dev.json.gz" + f = self.manifest_dir / "aishell_cuts_dev.jsonl.gz" logging.info(f"About to get valid cuts from {f}") - cuts_valid = load_manifest(f) + cuts_valid = load_manifest_lazy(f) return cuts_valid def test_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test.json.gz" + f = self.manifest_dir / "aishell_cuts_test.jsonl.gz" logging.info(f"About to get test cuts from {f}") - cuts_test = load_manifest(f) + cuts_test = load_manifest_lazy(f) return cuts_test diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py b/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py index 20eb8155c..838e53658 100644 --- a/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py +++ 
b/egs/aishell/ASR/transducer_stateless_modified-2/asr_datamodule.py @@ -24,7 +24,6 @@ from typing import Optional from lhotse import CutSet, Fbank, FbankConfig from lhotse.dataset import ( - BucketingSampler, CutMix, DynamicBucketingSampler, K2SpeechRecognitionDataset, @@ -73,8 +72,7 @@ class AsrDataModule: "--num-buckets", type=int, default=30, - help="The number of buckets for the BucketingSampler " - "and DynamicBucketingSampler." + help="The number of buckets for the DynamicBucketingSampler " "(you might want to increase it for larger datasets).", ) @@ -147,7 +145,6 @@ class AsrDataModule: def train_dataloaders( self, cuts_train: CutSet, - dynamic_bucketing: bool, on_the_fly_feats: bool, cuts_musan: Optional[CutSet] = None, ) -> DataLoader: @@ -157,9 +154,6 @@ class AsrDataModule: Cuts for training. cuts_musan: If not None, it is the cuts for mixing. - dynamic_bucketing: - True to use DynamicBucketingSampler; - False to use BucketingSampler. on_the_fly_feats: True to use OnTheFlyFeatures; False to use PrecomputedFeatures. @@ -232,25 +226,14 @@ class AsrDataModule: return_cuts=self.args.return_cuts, ) - if dynamic_bucketing: - logging.info("Using DynamicBucketingSampler.") - train_sampler = DynamicBucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - drop_last=True, - ) - else: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - bucket_method="equal_duration", - drop_last=True, - ) + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( + cuts_train, + max_duration=self.args.max_duration, + shuffle=self.args.shuffle, + num_buckets=self.args.num_buckets, + drop_last=True, + ) logging.info("About to create train dataloader") train_dl = DataLoader( @@ -279,7 +262,7 @@ class AsrDataModule: cut_transforms=transforms, return_cuts=self.args.return_cuts, ) - valid_sampler = BucketingSampler( + valid_sampler = DynamicBucketingSampler( cuts_valid, max_duration=self.args.max_duration, shuffle=False, @@ -303,8 +286,10 @@ class AsrDataModule: else PrecomputedFeatures(), return_cuts=self.args.return_cuts, ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, ) logging.debug("About to create test dataloader") test_dl = DataLoader( diff --git a/egs/aishell/ASR/transducer_stateless_modified-2/train.py b/egs/aishell/ASR/transducer_stateless_modified-2/train.py index 53d4e455f..a6c17198f 100755 --- a/egs/aishell/ASR/transducer_stateless_modified-2/train.py +++ b/egs/aishell/ASR/transducer_stateless_modified-2/train.py @@ -41,6 +41,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2" import argparse import logging import random +import warnings from pathlib import Path from shutil import copyfile from typing import Optional, Tuple @@ -55,7 +56,7 @@ from asr_datamodule import AsrDataModule from conformer import Conformer from decoder import Decoder from joiner import Joiner -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy from lhotse.cut import Cut from lhotse.utils import fix_random_seed from model import Transducer @@ -446,7 +447,11 @@ def compute_loss( assert loss.requires_grad == is_training info = MetricsTracker() - info["frames"] = (feature_lens // 
params.subsampling_factor).sum().item() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + info["frames"] = ( + (feature_lens // params.subsampling_factor).sum().item() + ) # Note: We use reduction=sum while computing the loss. info["loss"] = loss.detach().cpu().item() @@ -635,20 +640,16 @@ def train_one_epoch( def filter_short_and_long_utterances(cuts: CutSet) -> CutSet: def remove_short_and_long_utt(c: Cut): - # Keep only utterances with duration between 1 second and 12 seconds + # Keep only utterances with duration between 1 second and 12 seconds + # + # Caution: There is a reason to select 12.0 here. Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold return 1.0 <= c.duration <= 12.0 - num_in_total = len(cuts) - cuts = cuts.filter(remove_short_and_long_utt) - - num_left = len(cuts) - num_removed = num_in_total - num_left - removed_percent = num_removed / num_in_total * 100 - - logging.info(f"Before removing short and long utterances: {num_in_total}") - logging.info(f"After removing short and long utterances: {num_left}") - logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)") - return cuts @@ -728,15 +729,14 @@ def run(rank, world_size, args): train_cuts = aishell.train_cuts() train_cuts = filter_short_and_long_utterances(train_cuts) - datatang = AIDatatang200zh( - manifest_dir=f"{args.manifest_dir}/aidatatang_200zh" - ) + datatang = AIDatatang200zh(manifest_dir=args.manifest_dir) train_datatang_cuts = datatang.train_cuts() train_datatang_cuts = filter_short_and_long_utterances(train_datatang_cuts) + train_datatang_cuts = train_datatang_cuts.repeat(times=None) if args.enable_musan: - cuts_musan = load_manifest( - Path(args.manifest_dir) / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + Path(args.manifest_dir) / "musan_cuts.jsonl.gz" ) else: cuts_musan = None @@ -745,22 +745,23 @@ train_dl = asr_datamodule.train_dataloaders( train_cuts, - dynamic_bucketing=False, on_the_fly_feats=False, cuts_musan=cuts_musan, ) datatang_train_dl = asr_datamodule.train_dataloaders( train_datatang_cuts, - dynamic_bucketing=True, - on_the_fly_feats=True, + on_the_fly_feats=False, cuts_musan=cuts_musan, ) valid_cuts = aishell.valid_cuts() valid_dl = asr_datamodule.valid_dataloaders(valid_cuts) - for dl in [train_dl, datatang_train_dl]: + for dl in [ + train_dl, + # datatang_train_dl + ]: scan_pessimistic_batches_for_oom( model=model, train_dl=dl, diff --git a/egs/aishell/ASR/transducer_stateless_modified/train.py b/egs/aishell/ASR/transducer_stateless_modified/train.py index 524854b73..dcbc874a0 100755 --- a/egs/aishell/ASR/transducer_stateless_modified/train.py +++ b/egs/aishell/ASR/transducer_stateless_modified/train.py @@ -37,6 +37,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2" import argparse import logging +import warnings from pathlib import Path from shutil import copyfile from typing import Optional, Tuple @@ -411,7 +412,11 @@ def compute_loss( assert loss.requires_grad == is_training info = MetricsTracker() - info["frames"] = (feature_lens // params.subsampling_factor).sum().item() + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + info["frames"] = ( + (feature_lens // params.subsampling_factor).sum().item() + ) # Note: We use reduction=sum while computing the loss.
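Two of the changes above follow from the lazy-manifest migration: the before/after counting in `filter_short_and_long_utterances` is removed because a lazily filtered `CutSet` has no cheap `len()`, and `repeat(times=None)` turns the aidatatang cuts into an endless stream so the auxiliary corpus never runs out while aishell drives the epoch length. A minimal sketch:

```python
# Sketch: lazy filtering plus infinite repetition of the auxiliary corpus.
from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/aidatatang_cuts_train.jsonl.gz")
cuts = cuts.filter(lambda c: 1.0 <= c.duration <= 12.0)  # applied on the fly
endless = cuts.repeat(times=None)  # cycles through the manifest forever
```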
info["loss"] = loss.detach().cpu().item() diff --git a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py index ae24127bf..b3fc8adbb 100755 --- a/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py +++ b/egs/alimeeting/ASR/local/compute_fbank_alimeeting.py @@ -43,7 +43,7 @@ torch.set_num_interop_threads(1) def compute_fbank_alimeeting(num_mel_bins: int = 80): - src_dir = Path("data/manifests/alimeeting") + src_dir = Path("data/manifests") output_dir = Path("data/fbank") num_jobs = min(15, os.cpu_count()) @@ -52,11 +52,14 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80): "eval", "test", ) + + prefix = "alimeeting" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( dataset_parts=dataset_parts, output_dir=src_dir, - prefix="alimeeting", - suffix="jsonl.gz", + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -64,7 +67,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80): with get_executor() as ex: # Initialize the executor only once. for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") @@ -83,7 +86,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80): cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=cur_num_jobs, executor=ex, @@ -95,7 +98,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80): keep_overlapping=False, min_duration=None, ) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}") def get_args(): diff --git a/egs/alimeeting/ASR/local/display_manifest_statistics.py b/egs/alimeeting/ASR/local/display_manifest_statistics.py index 7f7aa094d..16cdecc91 100644 --- a/egs/alimeeting/ASR/local/display_manifest_statistics.py +++ b/egs/alimeeting/ASR/local/display_manifest_statistics.py @@ -25,19 +25,19 @@ for usage. 
""" -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): paths = [ - "./data/fbank/cuts_train.json.gz", - "./data/fbank/cuts_eval.json.gz", - "./data/fbank/cuts_test.json.gz", + "./data/fbank/alimeeting_cuts_train.jsonl.gz", + "./data/fbank/alimeeting_cuts_eval.jsonl.gz", + "./data/fbank/alimeeting_cuts_test.jsonl.gz", ] for path in paths: print(f"Starting display the statistics for {path}") - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() @@ -45,7 +45,7 @@ if __name__ == "__main__": main() """ -Starting display the statistics for ./data/fbank/cuts_train.json.gz +Starting display the statistics for ./data/fbank/alimeeting_cuts_train.jsonl.gz Cuts count: 559092 Total duration (hours): 424.6 Speech duration (hours): 424.6 (100.0%) @@ -61,7 +61,7 @@ min 0.0 99.5% 14.7 99.9% 16.2 max 284.3 -Starting display the statistics for ./data/fbank/cuts_eval.json.gz +Starting display the statistics for ./data/fbank/alimeeting_cuts_eval.jsonl.gz Cuts count: 6457 Total duration (hours): 4.9 Speech duration (hours): 4.9 (100.0%) @@ -77,7 +77,7 @@ min 0.1 99.5% 14.1 99.9% 14.7 max 15.8 -Starting display the statistics for ./data/fbank/cuts_test.json.gz +Starting display the statistics for ./data/fbank/alimeeting_cuts_test.jsonl.gz Cuts count: 16358 Total duration (hours): 12.5 Speech duration (hours): 12.5 (100.0%) diff --git a/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py index bd41a7a1e..339612afe 100644 --- a/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/alimeeting/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -27,7 +27,7 @@ from lhotse import ( CutSet, Fbank, FbankConfig, - load_manifest, + load_manifest_lazy, set_caching_enabled, ) from lhotse.dataset import ( @@ -204,8 +204,8 @@ class AlimeetingAsrDataModule: The state dict for the training sampler. 
""" logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms = [] @@ -401,14 +401,20 @@ class AlimeetingAsrDataModule: @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - return load_manifest(self.args.manifest_dir / "cuts_train.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "alimeeting_cuts_train.jsonl.gz" + ) @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") - return load_manifest(self.args.manifest_dir / "cuts_eval.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "alimeeting_cuts_eval.jsonl.gz" + ) @lru_cache() def test_cuts(self) -> List[CutSet]: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_test.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "alimeeting_cuts_test.jsonl.gz" + ) diff --git a/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py b/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py index ab958fa68..62b43146a 100644 --- a/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py +++ b/egs/gigaspeech/ASR/conformer_ctc/asr_datamodule.py @@ -20,9 +20,8 @@ import logging from functools import lru_cache from pathlib import Path -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( - BucketingSampler, CutConcatenate, CutMix, DynamicBucketingSampler, @@ -190,8 +189,8 @@ class GigaSpeechAsrDataModule: def train_dataloaders(self, cuts_train: CutSet) -> DataLoader: logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms = [] @@ -315,7 +314,7 @@ class GigaSpeechAsrDataModule: cut_transforms=transforms, return_cuts=self.args.return_cuts, ) - valid_sampler = BucketingSampler( + valid_sampler = DynamicBucketingSampler( cuts_valid, max_duration=self.args.max_duration, shuffle=False, @@ -339,8 +338,10 @@ class GigaSpeechAsrDataModule: else PrecomputedFeatures(), return_cuts=self.args.return_cuts, ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, ) logging.debug("About to create test dataloader") test_dl = DataLoader( @@ -361,7 +362,9 @@ class GigaSpeechAsrDataModule: @lru_cache() def dev_cuts(self) -> CutSet: logging.info("About to get dev cuts") - cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz") + cuts_valid = load_manifest_lazy( + self.args.manifest_dir / "cuts_DEV.jsonl.gz" + ) if self.args.small_dev: return cuts_valid.subset(first=1000) else: @@ -370,4 +373,4 @@ class GigaSpeechAsrDataModule: @lru_cache() def test_cuts(self) -> CutSet: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz") + return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz") diff --git a/egs/gigaspeech/ASR/local/compute_fbank_musan.py b/egs/gigaspeech/ASR/local/compute_fbank_musan.py deleted file mode 100755 index 562872993..000000000 --- a/egs/gigaspeech/ASR/local/compute_fbank_musan.py +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021 Johns Hopkins University (Piotr Żelasko) -# 
Copyright 2021 Xiaomi Corp. (Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from pathlib import Path - -import torch -from lhotse import ( - CutSet, - KaldifeatFbank, - KaldifeatFbankConfig, - combine, -) -from lhotse.recipes.utils import read_manifests_if_cached - -# Torch's multithreaded behavior needs to be disabled or -# it wastes a lot of CPU and slow things down. -# Do this outside of main() in case it needs to take effect -# even when we are not invoking the main (e.g. when spawning subprocesses). -torch.set_num_threads(1) -torch.set_num_interop_threads(1) - - -def compute_fbank_musan(): - src_dir = Path("data/manifests") - output_dir = Path("data/fbank") - - # number of workers in dataloader - num_workers = 10 - - # number of seconds in a batch - batch_duration = 600 - - dataset_parts = ( - "music", - "speech", - "noise", - ) - - manifests = read_manifests_if_cached( - prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir - ) - assert manifests is not None - - musan_cuts_path = output_dir / "cuts_musan.json.gz" - - if musan_cuts_path.is_file(): - logging.info(f"{musan_cuts_path} already exists - skipping") - return - - logging.info("Extracting features for Musan") - - device = torch.device("cpu") - if torch.cuda.is_available(): - device = torch.device("cuda", 0) - extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) - - logging.info(f"device: {device}") - - musan_cuts = ( - CutSet.from_manifests( - recordings=combine( - part["recordings"] for part in manifests.values() - ) - ) - .cut_into_windows(10.0) - .filter(lambda c: c.duration > 5) - .compute_and_store_features_batch( - extractor=extractor, - storage_path=f"{output_dir}/feats_musan", - num_workers=num_workers, - batch_duration=batch_duration, - ) - ) - musan_cuts.to_json(musan_cuts_path) - - -def main(): - formatter = ( - "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - ) - logging.basicConfig(format=formatter, level=logging.INFO) - - compute_fbank_musan() - - -if __name__ == "__main__": - main() diff --git a/egs/gigaspeech/ASR/local/compute_fbank_musan.py b/egs/gigaspeech/ASR/local/compute_fbank_musan.py new file mode 120000 index 000000000..5833f2484 --- /dev/null +++ b/egs/gigaspeech/ASR/local/compute_fbank_musan.py @@ -0,0 +1 @@ +../../../librispeech/ASR/local/compute_fbank_musan.py \ No newline at end of file diff --git a/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py index ff3d3b07a..19fe7c6a7 100644 --- a/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/gigaspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -23,9 +23,8 @@ from pathlib import Path from typing import Any, Dict, Optional import torch -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( - 
BucketingSampler, CutConcatenate, CutMix, DynamicBucketingSampler, @@ -217,8 +216,8 @@ class GigaSpeechAsrDataModule: if self.args.enable_musan: logging.info("Enable MUSAN") logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms.append( CutMix( @@ -358,7 +357,7 @@ class GigaSpeechAsrDataModule: cut_transforms=transforms, return_cuts=self.args.return_cuts, ) - valid_sampler = BucketingSampler( + valid_sampler = DynamicBucketingSampler( cuts_valid, max_duration=self.args.max_duration, shuffle=False, @@ -382,8 +381,10 @@ class GigaSpeechAsrDataModule: else PrecomputedFeatures(), return_cuts=self.args.return_cuts, ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, ) logging.debug("About to create test dataloader") test_dl = DataLoader( @@ -404,7 +405,9 @@ class GigaSpeechAsrDataModule: @lru_cache() def dev_cuts(self) -> CutSet: logging.info("About to get dev cuts") - cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz") + cuts_valid = load_manifest_lazy( + self.args.manifest_dir / "cuts_DEV.jsonl.gz" + ) if self.args.small_dev: return cuts_valid.subset(first=1000) else: @@ -413,4 +416,4 @@ class GigaSpeechAsrDataModule: @lru_cache() def test_cuts(self) -> CutSet: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz") + return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz") diff --git a/egs/librispeech/ASR/conformer_ctc/ali.py b/egs/librispeech/ASR/conformer_ctc/ali.py index 42fa2308e..2828e309e 100755 --- a/egs/librispeech/ASR/conformer_ctc/ali.py +++ b/egs/librispeech/ASR/conformer_ctc/ali.py @@ -96,14 +96,14 @@ def get_parser(): - labels_xxx.h5 - aux_labels_xxx.h5 - - cuts_xxx.json.gz + - librispeech_cuts_xxx.jsonl.gz where xxx is the value of `--dataset`. For instance, if `--dataset` is `train-clean-100`, it will contain 3 files: - `labels_train-clean-100.h5` - `aux_labels_train-clean-100.h5` - - `cuts_train-clean-100.json.gz` + - `librispeech_cuts_train-clean-100.jsonl.gz` Note: Both labels_xxx.h5 and aux_labels_xxx.h5 contain framewise alignment. The difference is that labels_xxx.h5 contains repeats. @@ -289,7 +289,9 @@ def main(): out_labels_ali_filename = out_dir / f"labels_{params.dataset}.h5" out_aux_labels_ali_filename = out_dir / f"aux_labels_{params.dataset}.h5" - out_manifest_filename = out_dir / f"cuts_{params.dataset}.json.gz" + out_manifest_filename = ( + out_dir / f"librispeech_cuts_{params.dataset}.jsonl.gz" + ) for f in ( out_labels_ali_filename, diff --git a/egs/librispeech/ASR/conformer_ctc/train.py b/egs/librispeech/ASR/conformer_ctc/train.py index b81bd6330..fc8fc8863 100755 --- a/egs/librispeech/ASR/conformer_ctc/train.py +++ b/egs/librispeech/ASR/conformer_ctc/train.py @@ -17,6 +17,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
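`BucketingSampler` with `bucket_method="equal_duration"` needs the full manifest in memory to assign buckets up front, so every datamodule in this patch switches to `DynamicBucketingSampler`, which estimates duration buckets on the fly and therefore works with lazy `CutSet`s. A minimal sketch mirroring the constructor calls above:

```python
# Sketch: DynamicBucketingSampler usage matching the datamodule changes.
from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler

cuts = load_manifest_lazy("data/fbank/aishell_cuts_train.jsonl.gz")
sampler = DynamicBucketingSampler(
    cuts,
    max_duration=200.0,  # total seconds of audio per mini-batch
    shuffle=True,
    num_buckets=30,
    drop_last=True,
)
for batch in sampler:  # each item is a CutSet forming one mini-batch
    break
```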
+""" +Usage: + export CUDA_VISIBLE_DEVICES="0,1,2,3" + ./conformer_ctc/train.py \ + --exp-dir ./conformer_ctc/exp \ + --world-size 4 \ + --full-libri 1 \ + --max-duration 200 \ + --num-epochs 20 +""" + import argparse import logging from pathlib import Path @@ -29,6 +40,7 @@ import torch.multiprocessing as mp import torch.nn as nn from asr_datamodule import LibriSpeechAsrDataModule from conformer import Conformer +from lhotse.cut import Cut from lhotse.utils import fix_random_seed from torch import Tensor from torch.nn.parallel import DistributedDataParallel as DDP @@ -676,6 +688,20 @@ def run(rank, world_size, args): if params.full_libri: train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_other_500_cuts() + + def remove_short_and_long_utt(c: Cut): + # Keep only utterances with duration between 1 second and 20 seconds + # + # Caution: There is a reason to select 20.0 here. Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold + return 1.0 <= c.duration <= 20.0 + + train_cuts = train_cuts.filter(remove_short_and_long_utt) + train_dl = librispeech.train_dataloaders(train_cuts) valid_cuts = librispeech.dev_clean_cuts() diff --git a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py index 9f1039893..68d93d2c5 100644 --- a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py +++ b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_dev_test.py @@ -20,11 +20,7 @@ import logging from pathlib import Path import torch -from lhotse import ( - CutSet, - KaldifeatFbank, - KaldifeatFbankConfig, -) +from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig # Torch's multithreaded behavior needs to be disabled or # it wastes a lot of CPU and slow things down. @@ -51,13 +47,16 @@ def compute_fbank_gigaspeech_dev_test(): logging.info(f"device: {device}") + prefix = "gigaspeech" + suffix = "jsonl.gz" + for partition in subsets: - cuts_path = in_out_dir / f"cuts_{partition}.jsonl.gz" + cuts_path = in_out_dir / f"{prefix}_cuts_{partition}.{suffix}" if cuts_path.is_file(): logging.info(f"{cuts_path} exists - skipping") continue - raw_cuts_path = in_out_dir / f"cuts_{partition}_raw.jsonl.gz" + raw_cuts_path = in_out_dir / f"{prefix}_cuts_{partition}_raw.{suffix}" logging.info(f"Loading {raw_cuts_path}") cut_set = CutSet.from_file(raw_cuts_path) @@ -66,7 +65,7 @@ def compute_fbank_gigaspeech_dev_test(): cut_set = cut_set.compute_and_store_features_batch( extractor=extractor, - storage_path=f"{in_out_dir}/feats_{partition}", + storage_path=f"{in_out_dir}/{prefix}_feats_{partition}", num_workers=num_workers, batch_duration=batch_duration, ) diff --git a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py index a7ed2467d..f826f064e 100644 --- a/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py +++ b/egs/librispeech/ASR/local/compute_fbank_gigaspeech_splits.py @@ -77,7 +77,7 @@ def get_parser(): def compute_fbank_gigaspeech_splits(args): num_splits = args.num_splits - output_dir = f"data/fbank/XL_split_{num_splits}" + output_dir = f"data/fbank/gigaspeech_XL_split_{num_splits}" output_dir = Path(output_dir) assert output_dir.exists(), f"{output_dir} does not exist!" 
@@ -96,17 +96,19 @@ def compute_fbank_gigaspeech_splits(args): extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) logging.info(f"device: {device}") + prefix = "gigaspeech" + num_digits = 8 # num_digits is fixed by lhotse split-lazy for i in range(start, stop): idx = f"{i + 1}".zfill(num_digits) logging.info(f"Processing {idx}/{num_splits}") - cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz" + cuts_path = output_dir / f"{prefix}_cuts_XL.{idx}.jsonl.gz" if cuts_path.is_file(): logging.info(f"{cuts_path} exists - skipping") continue - raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz" + raw_cuts_path = output_dir / f"{prefix}_cuts_XL_raw.{idx}.jsonl.gz" if not raw_cuts_path.is_file(): logging.info(f"{raw_cuts_path} does not exist - skipping it") continue @@ -115,13 +117,13 @@ def compute_fbank_gigaspeech_splits(args): cut_set = CutSet.from_file(raw_cuts_path) logging.info("Computing features") - if (output_dir / f"feats_XL_{idx}.lca").exists(): - logging.info(f"Removing {output_dir}/feats_XL_{idx}.lca") - os.remove(output_dir / f"feats_XL_{idx}.lca") + if (output_dir / f"{prefix}_feats_XL_{idx}.lca").exists(): + logging.info(f"Removing {output_dir}/{prefix}_feats_XL_{idx}.lca") + os.remove(output_dir / f"{prefix}_feats_XL_{idx}.lca") cut_set = cut_set.compute_and_store_features_batch( extractor=extractor, - storage_path=f"{output_dir}/feats_XL_{idx}", + storage_path=f"{output_dir}/{prefix}_feats_XL_{idx}", num_workers=args.num_workers, batch_duration=args.batch_duration, ) diff --git a/egs/librispeech/ASR/local/compute_fbank_librispeech.py b/egs/librispeech/ASR/local/compute_fbank_librispeech.py index 92f4f6ab7..642d9fd32 100755 --- a/egs/librispeech/ASR/local/compute_fbank_librispeech.py +++ b/egs/librispeech/ASR/local/compute_fbank_librispeech.py @@ -28,7 +28,7 @@ import os from pathlib import Path import torch -from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -56,8 +56,13 @@ def compute_fbank_librispeech(): "train-clean-360", "train-other-500", ) + prefix = "librispeech" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="librispeech", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -65,7 +70,8 @@ def compute_fbank_librispeech(): with get_executor() as ex: # Initialize the executor only once. 
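As the comment notes, `lhotse split-lazy` zero-pads split indices to eight digits; a small sketch of the filenames the renaming above produces (num_splits=2000 as used elsewhere in the recipe):

    num_digits = 8  # fixed by `lhotse split-lazy`
    prefix = "gigaspeech"
    for i in range(2):
        idx = f"{i + 1}".zfill(num_digits)
        print(f"data/fbank/{prefix}_XL_split_2000/{prefix}_cuts_XL.{idx}.jsonl.gz")
    # -> ..._cuts_XL.00000001.jsonl.gz, ..._cuts_XL.00000002.jsonl.gz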
for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + cuts_filename = f"{prefix}_cuts_{partition}.{suffix}" + if (output_dir / cuts_filename).is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") @@ -81,13 +87,13 @@ def compute_fbank_librispeech(): ) cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=num_jobs if ex is None else 80, executor=ex, - storage_type=ChunkedLilcomHdf5Writer, + storage_type=LilcomChunkyWriter, ) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + cut_set.to_file(output_dir / cuts_filename) if __name__ == "__main__": diff --git a/egs/librispeech/ASR/local/compute_fbank_musan.py b/egs/librispeech/ASR/local/compute_fbank_musan.py index 368bea4e8..fef372129 100755 --- a/egs/librispeech/ASR/local/compute_fbank_musan.py +++ b/egs/librispeech/ASR/local/compute_fbank_musan.py @@ -28,7 +28,7 @@ import os from pathlib import Path import torch -from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -52,12 +52,22 @@ def compute_fbank_musan(): "speech", "noise", ) + prefix = "musan" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None - musan_cuts_path = output_dir / "cuts_musan.json.gz" + assert len(manifests) == len(dataset_parts), ( + len(manifests), + len(dataset_parts), + ) + + musan_cuts_path = output_dir / "musan_cuts.jsonl.gz" if musan_cuts_path.is_file(): logging.info(f"{musan_cuts_path} already exists - skipping") @@ -79,13 +89,13 @@ def compute_fbank_musan(): .filter(lambda c: c.duration > 5) .compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_musan", + storage_path=f"{output_dir}/musan_feats", num_jobs=num_jobs if ex is None else 80, executor=ex, - storage_type=ChunkedLilcomHdf5Writer, + storage_type=LilcomChunkyWriter, ) ) - musan_cuts.to_json(musan_cuts_path) + musan_cuts.to_file(musan_cuts_path) if __name__ == "__main__": diff --git a/egs/librispeech/ASR/local/display_manifest_statistics.py b/egs/librispeech/ASR/local/display_manifest_statistics.py index 15bd206fa..c3c684235 100755 --- a/egs/librispeech/ASR/local/display_manifest_statistics.py +++ b/egs/librispeech/ASR/local/display_manifest_statistics.py @@ -25,19 +25,19 @@ for usage. 
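The musan_cuts.jsonl.gz written above feeds the CutMix transform used across the datamodules; a sketch of loading it lazily and building the transform, with parameters mirroring the recipes':

    from lhotse import load_manifest_lazy
    from lhotse.dataset import CutMix

    cuts_musan = load_manifest_lazy("data/fbank/musan_cuts.jsonl.gz")
    # Mix MUSAN noise into 50% of training cuts at 10-20 dB SNR,
    # keeping the original cut ids so supervisions still line up.
    transform = CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)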
""" -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): - path = "./data/fbank/cuts_train-clean-100.json.gz" - path = "./data/fbank/cuts_train-clean-360.json.gz" - path = "./data/fbank/cuts_train-other-500.json.gz" - path = "./data/fbank/cuts_dev-clean.json.gz" - path = "./data/fbank/cuts_dev-other.json.gz" - path = "./data/fbank/cuts_test-clean.json.gz" - path = "./data/fbank/cuts_test-other.json.gz" + # path = "./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz" + # path = "./data/fbank/librispeech_cuts_train-clean-360.jsonl.gz" + # path = "./data/fbank/librispeech_cuts_train-other-500.jsonl.gz" + # path = "./data/fbank/librispeech_cuts_dev-clean.jsonl.gz" + # path = "./data/fbank/librispeech_cuts_dev-other.jsonl.gz" + # path = "./data/fbank/librispeech_cuts_test-clean.jsonl.gz" + path = "./data/fbank/librispeech_cuts_test-other.jsonl.gz" - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() diff --git a/egs/librispeech/ASR/local/preprocess_gigaspeech.py b/egs/librispeech/ASR/local/preprocess_gigaspeech.py index cd1345904..0f4ae820b 100644 --- a/egs/librispeech/ASR/local/preprocess_gigaspeech.py +++ b/egs/librispeech/ASR/local/preprocess_gigaspeech.py @@ -58,17 +58,19 @@ def preprocess_giga_speech(): ) logging.info("Loading manifest (may take 4 minutes)") + prefix = "gigaspeech" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( dataset_parts=dataset_parts, output_dir=src_dir, - prefix="gigaspeech", - suffix="jsonl.gz", + prefix=prefix, + suffix=suffix, ) assert manifests is not None for partition, m in manifests.items(): logging.info(f"Processing {partition}") - raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz" + raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.{suffix}" if raw_cuts_path.is_file(): logging.info(f"{partition} already exists - skipping") continue diff --git a/egs/librispeech/ASR/local/validate_manifest.py b/egs/librispeech/ASR/local/validate_manifest.py index 8d3d4c7ce..7c57d629a 100755 --- a/egs/librispeech/ASR/local/validate_manifest.py +++ b/egs/librispeech/ASR/local/validate_manifest.py @@ -25,7 +25,7 @@ We will add more checks later if needed. 
Usage example: python3 ./local/validate_manifest.py \ - ./data/fbank/cuts_train-clean-100.json.gz + ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz """ @@ -33,7 +33,7 @@ import argparse import logging from pathlib import Path -from lhotse import load_manifest, CutSet +from lhotse import CutSet, load_manifest_lazy from lhotse.cut import Cut @@ -76,7 +76,7 @@ def main(): logging.info(f"Validating {manifest}") assert manifest.is_file(), f"{manifest} does not exist" - cut_set = load_manifest(manifest) + cut_set = load_manifest_lazy(manifest) assert isinstance(cut_set, CutSet) for c in cut_set: diff --git a/egs/librispeech/ASR/prepare.sh b/egs/librispeech/ASR/prepare.sh index 8cfb046c8..17a638502 100755 --- a/egs/librispeech/ASR/prepare.sh +++ b/egs/librispeech/ASR/prepare.sh @@ -40,9 +40,9 @@ dl_dir=$PWD/download # It will generate data/lang_bpe_xxx, # data/lang_bpe_yyy if the array contains xxx, yyy vocab_sizes=( - 5000 - 2000 - 1000 + # 5000 + # 2000 + # 1000 500 ) @@ -132,7 +132,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then ) for part in ${parts[@]}; do python3 ./local/validate_manifest.py \ - data/fbank/cuts_${part}.json.gz + data/fbank/librispeech_cuts_${part}.jsonl.gz done touch data/fbank/.librispeech-validated.done fi diff --git a/egs/librispeech/ASR/prepare_giga_speech.sh b/egs/librispeech/ASR/prepare_giga_speech.sh index 26b921eab..6f85ddc29 100755 --- a/egs/librispeech/ASR/prepare_giga_speech.sh +++ b/egs/librispeech/ASR/prepare_giga_speech.sh @@ -124,9 +124,9 @@ fi if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then log "Stage 4: Split XL subset into ${num_splits} pieces" - split_dir=data/fbank/XL_split_${num_splits} + split_dir=data/fbank/gigaspeech_XL_split_${num_splits} if [ ! -f $split_dir/.split_completed ]; then - lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir $chunk_size + lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size touch $split_dir/.split_completed fi fi diff --git a/egs/librispeech/ASR/pruned_transducer_stateless/train.py b/egs/librispeech/ASR/pruned_transducer_stateless/train.py index c360d025a..e6795330f 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless/train.py @@ -807,28 +807,8 @@ def run(rank, world_size, args): # the threshold return 1.0 <= c.duration <= 20.0 - num_in_total = len(train_cuts) - train_cuts = train_cuts.filter(remove_short_and_long_utt) - try: - num_left = len(train_cuts) - num_removed = num_in_total - num_left - removed_percent = num_removed / num_in_total * 100 - - logging.info( - f"Before removing short and long utterances: {num_in_total}" - ) - logging.info(f"After removing short and long utterances: {num_left}") - logging.info( - f"Removed {num_removed} utterances ({removed_percent:.5f}%)" - ) - except TypeError as e: - # You can ignore this error as previous versions of Lhotse work fine - # for the above code. 
In recent versions of Lhotse, it uses - # lazy filter, producing cutsets that don't have the __len__ method - logging.info(str(e)) - if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: # We only load the sampler's state dict when it loads a checkpoint # saved in the middle of an epoch diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py b/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py index 8828285aa..b54d1aa39 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/asr_datamodule.py @@ -22,7 +22,6 @@ from typing import Optional from lhotse import CutSet, Fbank, FbankConfig from lhotse.dataset import ( - BucketingSampler, CutMix, DynamicBucketingSampler, K2SpeechRecognitionDataset, @@ -71,8 +70,7 @@ class AsrDataModule: "--num-buckets", type=int, default=30, - help="The number of buckets for the BucketingSampler " - "and DynamicBucketingSampler." + help="The number of buckets for the DynamicBucketingSampler. " "(you might want to increase it for larger datasets).", ) @@ -152,7 +150,6 @@ class AsrDataModule: def train_dataloaders( self, cuts_train: CutSet, - dynamic_bucketing: bool, on_the_fly_feats: bool, cuts_musan: Optional[CutSet] = None, ) -> DataLoader: @@ -162,9 +159,6 @@ class AsrDataModule: Cuts for training. cuts_musan: If not None, it is the cuts for mixing. - dynamic_bucketing: - True to use DynamicBucketingSampler; - False to use BucketingSampler. on_the_fly_feats: True to use OnTheFlyFeatures; False to use PrecomputedFeatures. @@ -230,25 +224,14 @@ class AsrDataModule: return_cuts=self.args.return_cuts, ) - if dynamic_bucketing: - logging.info("Using DynamicBucketingSampler.") - train_sampler = DynamicBucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - drop_last=True, - ) - else: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - bucket_method="equal_duration", - drop_last=True, - ) + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( + cuts_train, + max_duration=self.args.max_duration, + shuffle=self.args.shuffle, + num_buckets=self.args.num_buckets, + drop_last=True, + ) logging.info("About to create train dataloader") train_dl = DataLoader( diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py b/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py index 3f8bf3ba9..36f32c6b3 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/gigaspeech.py @@ -22,7 +22,7 @@ import re from pathlib import Path import lhotse -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy class GigaSpeech: @@ -32,13 +32,13 @@ class GigaSpeech: manifest_dir: It is expected to contain the following files:: - - XL_split_2000/cuts_XL.*.jsonl.gz - - cuts_L_raw.jsonl.gz - - cuts_M_raw.jsonl.gz - - cuts_S_raw.jsonl.gz - - cuts_XS_raw.jsonl.gz - - cuts_DEV_raw.jsonl.gz - - cuts_TEST_raw.jsonl.gz + - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz + - gigaspeech_cuts_L_raw.jsonl.gz + - gigaspeech_cuts_M_raw.jsonl.gz + - gigaspeech_cuts_S_raw.jsonl.gz + - gigaspeech_cuts_XS_raw.jsonl.gz + - gigaspeech_cuts_DEV_raw.jsonl.gz + - 
gigaspeech_cuts_TEST_raw.jsonl.gz """ self.manifest_dir = Path(manifest_dir) @@ -46,10 +46,12 @@ class GigaSpeech: logging.info("About to get train-XL cuts") filenames = list( - glob.glob(f"{self.manifest_dir}/XL_split_2000/cuts_XL.*.jsonl.gz") + glob.glob( + f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz" # noqa + ) ) - pattern = re.compile(r"cuts_XL.([0-9]+).jsonl.gz") + pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz") idx_filenames = [ (int(pattern.search(f).group(1)), f) for f in filenames ] @@ -64,31 +66,31 @@ class GigaSpeech: ) def train_L_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_L_raw.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_L_raw.jsonl.gz" logging.info(f"About to get train-L cuts from {f}") return CutSet.from_jsonl_lazy(f) def train_M_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_M_raw.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_M_raw.jsonl.gz" logging.info(f"About to get train-M cuts from {f}") return CutSet.from_jsonl_lazy(f) def train_S_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_S_raw.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_S_raw.jsonl.gz" logging.info(f"About to get train-S cuts from {f}") return CutSet.from_jsonl_lazy(f) def train_XS_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_XS_raw.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_XS_raw.jsonl.gz" logging.info(f"About to get train-XS cuts from {f}") return CutSet.from_jsonl_lazy(f) def test_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_TEST.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz" logging.info(f"About to get TEST cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def dev_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_DEV.jsonl.gz" + f = self.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz" logging.info(f"About to get DEV cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py b/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py index 00b7c8334..6dba8e9fe 100644 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/librispeech.py @@ -18,7 +18,7 @@ import logging from pathlib import Path -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy class LibriSpeech: @@ -28,47 +28,47 @@ class LibriSpeech: manifest_dir: It is expected to contain the following files:: - - cuts_dev-clean.json.gz - - cuts_dev-other.json.gz - - cuts_test-clean.json.gz - - cuts_test-other.json.gz - - cuts_train-clean-100.json.gz - - cuts_train-clean-360.json.gz - - cuts_train-other-500.json.gz + - librispeech_cuts_dev-clean.jsonl.gz + - librispeech_cuts_dev-other.jsonl.gz + - librispeech_cuts_test-clean.jsonl.gz + - librispeech_cuts_test-other.jsonl.gz + - librispeech_cuts_train-clean-100.jsonl.gz + - librispeech_cuts_train-clean-360.jsonl.gz + - librispeech_cuts_train-other-500.jsonl.gz """ self.manifest_dir = Path(manifest_dir) def train_clean_100_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-clean-100.json.gz" + f = self.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz" logging.info(f"About to get train-clean-100 cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def train_clean_360_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-clean-360.json.gz" + f = self.manifest_dir / 
"librispeech_cuts_train-clean-360.jsonl.gz" logging.info(f"About to get train-clean-360 cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def train_other_500_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-other-500.json.gz" + f = self.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz" logging.info(f"About to get train-other-500 cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def test_clean_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test-clean.json.gz" + f = self.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz" logging.info(f"About to get test-clean cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def test_other_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test-other.json.gz" + f = self.manifest_dir / "librispeech_cuts_test-other.jsonl.gz" logging.info(f"About to get test-other cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def dev_clean_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_dev-clean.json.gz" + f = self.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz" logging.info(f"About to get dev-clean cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) def dev_other_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_dev-other.json.gz" + f = self.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz" logging.info(f"About to get dev-other cuts from {f}") - return load_manifest(f) + return load_manifest_lazy(f) diff --git a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py index a2a5519f1..37cebd577 100755 --- a/egs/librispeech/ASR/pruned_transducer_stateless3/train.py +++ b/egs/librispeech/ASR/pruned_transducer_stateless3/train.py @@ -66,7 +66,7 @@ from conformer import Conformer from decoder import Decoder from gigaspeech import GigaSpeech from joiner import Joiner -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy from lhotse.cut import Cut from lhotse.dataset.sampling.base import CutSampler from lhotse.utils import fix_random_seed @@ -968,8 +968,8 @@ def run(rank, world_size, args): train_giga_cuts = train_giga_cuts.repeat(times=None) if args.enable_musan: - cuts_musan = load_manifest( - Path(args.manifest_dir) / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + Path(args.manifest_dir) / "musan_cuts.jsonl.gz" ) else: cuts_musan = None @@ -978,14 +978,12 @@ def run(rank, world_size, args): train_dl = asr_datamodule.train_dataloaders( train_cuts, - dynamic_bucketing=False, on_the_fly_feats=False, cuts_musan=cuts_musan, ) giga_train_dl = asr_datamodule.train_dataloaders( train_giga_cuts, - dynamic_bucketing=True, on_the_fly_feats=False, cuts_musan=cuts_musan, ) diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py index 7628c8274..5cca06169 100644 --- a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py +++ b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py @@ -24,7 +24,7 @@ from pathlib import Path from typing import Any, Dict, Optional import torch -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures CutConcatenate, CutMix, @@ -224,8 +224,8 @@ class LibriSpeechAsrDataModule: if self.args.enable_musan: logging.info("Enable MUSAN") logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - 
self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms.append( CutMix( @@ -407,40 +407,48 @@ class LibriSpeechAsrDataModule: @lru_cache() def train_clean_100_cuts(self) -> CutSet: logging.info("About to get train-clean-100 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-clean-100.json.gz" + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz" ) @lru_cache() def train_clean_360_cuts(self) -> CutSet: logging.info("About to get train-clean-360 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-clean-360.json.gz" + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz" ) @lru_cache() def train_other_500_cuts(self) -> CutSet: logging.info("About to get train-other-500 cuts") - return load_manifest( - self.args.manifest_dir / "cuts_train-other-500.json.gz" + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz" ) @lru_cache() def dev_clean_cuts(self) -> CutSet: logging.info("About to get dev-clean cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz" + ) @lru_cache() def dev_other_cuts(self) -> CutSet: logging.info("About to get dev-other cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz" + ) @lru_cache() def test_clean_cuts(self) -> CutSet: logging.info("About to get test-clean cuts") - return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz" + ) @lru_cache() def test_other_cuts(self) -> CutSet: logging.info("About to get test-other cuts") - return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz" + ) diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py index 8597525ba..827e3ae1f 100755 --- a/egs/librispeech/ASR/tdnn_lstm_ctc/train.py +++ b/egs/librispeech/ASR/tdnn_lstm_ctc/train.py @@ -16,6 +16,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +""" +Usage: + export CUDA_VISIBLE_DEVICES="0,1,2,3" + ./tdnn_lstm_ctc/train.py \ + --world-size 4 \ + --full-libri 1 \ + --max-duration 300 \ + --num-epochs 20 +""" import argparse import logging @@ -29,6 +38,7 @@ import torch.multiprocessing as mp import torch.nn as nn import torch.optim as optim from asr_datamodule import LibriSpeechAsrDataModule +from lhotse.cut import Cut from lhotse.utils import fix_random_seed from model import TdnnLstm from torch import Tensor @@ -544,10 +554,25 @@ def run(rank, world_size, args): if params.full_libri: train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_other_500_cuts() + + def remove_short_and_long_utt(c: Cut): + # Keep only utterances with duration between 1 second and 20 seconds + # + # Caution: There is a reason to select 20.0 here. 
Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold + return 1.0 <= c.duration <= 20.0 + + train_cuts = train_cuts.filter(remove_short_and_long_utt) + train_dl = librispeech.train_dataloaders(train_cuts) valid_cuts = librispeech.dev_clean_cuts() valid_cuts += librispeech.dev_other_cuts() + valid_dl = librispeech.valid_dataloaders(valid_cuts) for epoch in range(params.start_epoch, params.num_epochs): diff --git a/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py b/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py index 99d5b3788..b00fc34f1 100755 --- a/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py +++ b/egs/librispeech/ASR/transducer_stateless/test_compute_ali.py @@ -44,8 +44,8 @@ from pathlib import Path import sentencepiece as spm import torch from alignment import get_word_starting_frames -from lhotse import CutSet, load_manifest -from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler +from lhotse import CutSet, load_manifest_lazy +from lhotse.dataset import DynamicBucketingSampler, K2SpeechRecognitionDataset from lhotse.dataset.collation import collate_custom_field @@ -93,14 +93,15 @@ def main(): sp = spm.SentencePieceProcessor() sp.load(args.bpe_model) - cuts_json = args.ali_dir / f"cuts_{args.dataset}.json.gz" + cuts_jsonl = args.ali_dir / f"librispeech_cuts_{args.dataset}.jsonl.gz" - logging.info(f"Loading {cuts_json}") - cuts = load_manifest(cuts_json) + logging.info(f"Loading {cuts_jsonl}") + cuts = load_manifest_lazy(cuts_jsonl) - sampler = SingleCutSampler( + sampler = DynamicBucketingSampler( cuts, max_duration=30, + num_buckets=30, shuffle=False, ) diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py deleted file mode 100644 index c6cf739fb..000000000 --- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py +++ /dev/null @@ -1,333 +0,0 @@ -# Copyright 2021 Piotr Żelasko -# 2022 Xiaomi Corp. (authors: Fangjun Kuang -# Mingshuang Luo) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
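This mirrors the filter added in conformer_ctc/train.py; the before/after utterance counting deleted from pruned_transducer_stateless/train.py above is gone for the same reason: .filter() is now lazy in recent Lhotse and the result has no __len__. A sketch of the failure mode:

    from lhotse import load_manifest_lazy

    cuts = load_manifest_lazy(
        "data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
    )
    cuts = cuts.filter(lambda c: 1.0 <= c.duration <= 20.0)

    try:
        print(len(cuts))
    except TypeError:
        # Lazy filtering yields a CutSet without __len__;
        # count by iterating if the number is really needed.
        print(sum(1 for _ in cuts))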
- -import argparse -import inspect -import logging -from pathlib import Path -from typing import Optional - -import torch -from lhotse import CutSet, Fbank, FbankConfig -from lhotse.dataset import ( - BucketingSampler, - CutMix, - DynamicBucketingSampler, - K2SpeechRecognitionDataset, - SpecAugment, -) -from lhotse.dataset.input_strategies import ( - OnTheFlyFeatures, - PrecomputedFeatures, -) -from lhotse.utils import fix_random_seed -from torch.utils.data import DataLoader - -from icefall.utils import str2bool - - -class _SeedWorkers: - def __init__(self, seed: int): - self.seed = seed - - def __call__(self, worker_id: int): - fix_random_seed(self.seed + worker_id) - - -class AsrDataModule: - def __init__(self, args: argparse.Namespace): - self.args = args - - @classmethod - def add_arguments(cls, parser: argparse.ArgumentParser): - group = parser.add_argument_group( - title="ASR data related options", - description="These options are used for the preparation of " - "PyTorch DataLoaders from Lhotse CutSet's -- they control the " - "effective batch sizes, sampling strategies, applied data " - "augmentations, etc.", - ) - - group.add_argument( - "--max-duration", - type=int, - default=200.0, - help="Maximum pooled recordings duration (seconds) in a " - "single batch. You can reduce it if it causes CUDA OOM.", - ) - - group.add_argument( - "--bucketing-sampler", - type=str2bool, - default=True, - help="When enabled, the batches will come from buckets of " - "similar duration (saves padding frames).", - ) - - group.add_argument( - "--num-buckets", - type=int, - default=30, - help="The number of buckets for the BucketingSampler " - "and DynamicBucketingSampler." - "(you might want to increase it for larger datasets).", - ) - - group.add_argument( - "--shuffle", - type=str2bool, - default=True, - help="When enabled (=default), the examples will be " - "shuffled for each epoch.", - ) - - group.add_argument( - "--return-cuts", - type=str2bool, - default=True, - help="When enabled, each batch will have the " - "field: batch['supervisions']['cut'] with the cuts that " - "were used to construct it.", - ) - - group.add_argument( - "--num-workers", - type=int, - default=2, - help="The number of training dataloader workers that " - "collect the batches.", - ) - - group.add_argument( - "--enable-spec-aug", - type=str2bool, - default=True, - help="When enabled, use SpecAugment for training dataset.", - ) - - group.add_argument( - "--spec-aug-time-warp-factor", - type=int, - default=80, - help="Used only when --enable-spec-aug is True. " - "It specifies the factor for time warping in SpecAugment. " - "Larger values mean more warping. " - "A value less than 1 means to disable time warp.", - ) - - group.add_argument( - "--enable-musan", - type=str2bool, - default=True, - help="When enabled, select noise from MUSAN and mix it" - "with training dataset. ", - ) - - group.add_argument( - "--manifest-dir", - type=Path, - default=Path("data/fbank"), - help="Path to directory with train/valid/test cuts.", - ) - - group.add_argument( - "--on-the-fly-feats", - type=str2bool, - default=False, - help="When enabled, use on-the-fly cut mixing and feature " - "extraction. Will drop existing precomputed feature manifests " - "if available. Used only in dev/test CutSet", - ) - - def train_dataloaders( - self, - cuts_train: CutSet, - dynamic_bucketing: bool, - on_the_fly_feats: bool, - cuts_musan: Optional[CutSet] = None, - ) -> DataLoader: - """ - Args: - cuts_train: - Cuts for training. 
- cuts_musan: - If not None, it is the cuts for mixing. - dynamic_bucketing: - True to use DynamicBucketingSampler; - False to use BucketingSampler. - on_the_fly_feats: - True to use OnTheFlyFeatures; - False to use PrecomputedFeatures. - """ - transforms = [] - if cuts_musan is not None: - logging.info("Enable MUSAN") - transforms.append( - CutMix( - cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True - ) - ) - else: - logging.info("Disable MUSAN") - - input_transforms = [] - - if self.args.enable_spec_aug: - logging.info("Enable SpecAugment") - logging.info( - f"Time warp factor: {self.args.spec_aug_time_warp_factor}" - ) - # Set the value of num_frame_masks according to Lhotse's version. - # In different Lhotse's versions, the default of num_frame_masks is - # different. - num_frame_masks = 10 - num_frame_masks_parameter = inspect.signature( - SpecAugment.__init__ - ).parameters["num_frame_masks"] - if num_frame_masks_parameter.default == 1: - num_frame_masks = 2 - logging.info(f"Num frame mask: {num_frame_masks}") - input_transforms.append( - SpecAugment( - time_warp_factor=self.args.spec_aug_time_warp_factor, - num_frame_masks=num_frame_masks, - features_mask_size=27, - num_feature_masks=2, - frames_mask_size=100, - ) - ) - else: - logging.info("Disable SpecAugment") - - logging.info("About to create train dataset") - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - # NOTE: the PerturbSpeed transform should be added only if we - # remove it from data prep stage. - # Add on-the-fly speed perturbation; since originally it would - # have increased epoch size by 3, we will apply prob 2/3 and use - # 3x more epochs. - # Speed perturbation probably should come first before - # concatenation, but in principle the transforms order doesn't have - # to be strict (e.g. could be randomized) - # transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa - # Drop feats to be on the safe side. - train = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=( - OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) - if on_the_fly_feats - else PrecomputedFeatures() - ), - input_transforms=input_transforms, - return_cuts=self.args.return_cuts, - ) - - if dynamic_bucketing: - logging.info("Using DynamicBucketingSampler.") - train_sampler = DynamicBucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - drop_last=True, - ) - else: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( - cuts_train, - max_duration=self.args.max_duration, - shuffle=self.args.shuffle, - num_buckets=self.args.num_buckets, - bucket_method="equal_duration", - drop_last=True, - ) - - logging.info("About to create train dataloader") - - # 'seed' is derived from the current random state, which will have - # previously been set in the main process. 
- seed = torch.randint(0, 100000, ()).item() - worker_init_fn = _SeedWorkers(seed) - - train_dl = DataLoader( - train, - sampler=train_sampler, - batch_size=None, - num_workers=self.args.num_workers, - persistent_workers=False, - worker_init_fn=worker_init_fn, - ) - return train_dl - - def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader: - transforms = [] - - logging.info("About to create dev dataset") - if self.args.on_the_fly_feats: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - input_strategy=OnTheFlyFeatures( - Fbank(FbankConfig(num_mel_bins=80)) - ), - return_cuts=self.args.return_cuts, - ) - else: - validate = K2SpeechRecognitionDataset( - cut_transforms=transforms, - return_cuts=self.args.return_cuts, - ) - valid_sampler = BucketingSampler( - cuts_valid, - max_duration=self.args.max_duration, - shuffle=False, - ) - logging.info("About to create dev dataloader") - valid_dl = DataLoader( - validate, - sampler=valid_sampler, - batch_size=None, - num_workers=2, - persistent_workers=False, - ) - - return valid_dl - - def test_dataloaders(self, cuts: CutSet) -> DataLoader: - logging.debug("About to create test dataset") - test = K2SpeechRecognitionDataset( - input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80))) - if self.args.on_the_fly_feats - else PrecomputedFeatures(), - return_cuts=self.args.return_cuts, - ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False - ) - logging.debug("About to create test dataloader") - test_dl = DataLoader( - test, - batch_size=None, - sampler=sampler, - num_workers=self.args.num_workers, - ) - return test_dl diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py new file mode 120000 index 000000000..3ba9ada4f --- /dev/null +++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/asr_datamodule.py @@ -0,0 +1 @@ +../pruned_transducer_stateless3/asr_datamodule.py \ No newline at end of file diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py deleted file mode 100644 index 286771d7d..000000000 --- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright 2021 Piotr Żelasko -# 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
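With the BucketingSampler branch gone, every recipe converges on a single sampler type; a minimal sketch of the shared configuration (max_duration and num_buckets mirror the defaults above):

    from lhotse import load_manifest_lazy
    from lhotse.dataset import DynamicBucketingSampler

    cuts_train = load_manifest_lazy(
        "data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
    )
    train_sampler = DynamicBucketingSampler(
        cuts_train,
        max_duration=200.0,  # total seconds of audio per batch
        shuffle=True,
        num_buckets=30,      # the --num-buckets default
        drop_last=True,
    )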
- - -import logging -from pathlib import Path - -from lhotse import CutSet, load_manifest - - -class GigaSpeech: - def __init__(self, manifest_dir: str): - """ - Args: - manifest_dir: - It is expected to contain the following files:: - - - cuts_XL_raw.jsonl.gz - - cuts_L_raw.jsonl.gz - - cuts_M_raw.jsonl.gz - - cuts_S_raw.jsonl.gz - - cuts_XS_raw.jsonl.gz - - cuts_DEV_raw.jsonl.gz - - cuts_TEST_raw.jsonl.gz - """ - self.manifest_dir = Path(manifest_dir) - - def train_XL_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_XL_raw.jsonl.gz" - logging.info(f"About to get train-XL cuts from {f}") - return CutSet.from_jsonl_lazy(f) - - def train_L_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_L_raw.jsonl.gz" - logging.info(f"About to get train-L cuts from {f}") - return CutSet.from_jsonl_lazy(f) - - def train_M_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_M_raw.jsonl.gz" - logging.info(f"About to get train-M cuts from {f}") - return CutSet.from_jsonl_lazy(f) - - def train_S_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_S_raw.jsonl.gz" - logging.info(f"About to get train-S cuts from {f}") - return CutSet.from_jsonl_lazy(f) - - def train_XS_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_XS_raw.jsonl.gz" - logging.info(f"About to get train-XS cuts from {f}") - return CutSet.from_jsonl_lazy(f) - - def test_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_TEST.jsonl.gz" - logging.info(f"About to get TEST cuts from {f}") - return load_manifest(f) - - def dev_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_DEV.jsonl.gz" - logging.info(f"About to get DEV cuts from {f}") - return load_manifest(f) diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py new file mode 120000 index 000000000..5242c652a --- /dev/null +++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/gigaspeech.py @@ -0,0 +1 @@ +../pruned_transducer_stateless3/gigaspeech.py \ No newline at end of file diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py deleted file mode 100644 index 00b7c8334..000000000 --- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2021 Piotr Żelasko -# 2022 Xiaomi Corp. (authors: Fangjun Kuang) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
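The deleted GigaSpeech helper above survives via the symlink to pruned_transducer_stateless3/gigaspeech.py, including its lazy raw-cut loading; combined with the .repeat(times=None) call added in train.py below, the XL subset can be streamed indefinitely. A sketch, assuming the renamed raw manifest:

    from lhotse import CutSet

    cuts_xl = CutSet.from_jsonl_lazy(
        "data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz"
    )
    # times=None cycles the lazy cut set forever, which suits
    # interleaving GigaSpeech with LibriSpeech during training.
    cuts_xl = cuts_xl.repeat(times=None)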
- -import logging -from pathlib import Path - -from lhotse import CutSet, load_manifest - - -class LibriSpeech: - def __init__(self, manifest_dir: str): - """ - Args: - manifest_dir: - It is expected to contain the following files:: - - - cuts_dev-clean.json.gz - - cuts_dev-other.json.gz - - cuts_test-clean.json.gz - - cuts_test-other.json.gz - - cuts_train-clean-100.json.gz - - cuts_train-clean-360.json.gz - - cuts_train-other-500.json.gz - """ - self.manifest_dir = Path(manifest_dir) - - def train_clean_100_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-clean-100.json.gz" - logging.info(f"About to get train-clean-100 cuts from {f}") - return load_manifest(f) - - def train_clean_360_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-clean-360.json.gz" - logging.info(f"About to get train-clean-360 cuts from {f}") - return load_manifest(f) - - def train_other_500_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_train-other-500.json.gz" - logging.info(f"About to get train-other-500 cuts from {f}") - return load_manifest(f) - - def test_clean_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test-clean.json.gz" - logging.info(f"About to get test-clean cuts from {f}") - return load_manifest(f) - - def test_other_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_test-other.json.gz" - logging.info(f"About to get test-other cuts from {f}") - return load_manifest(f) - - def dev_clean_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_dev-clean.json.gz" - logging.info(f"About to get dev-clean cuts from {f}") - return load_manifest(f) - - def dev_other_cuts(self) -> CutSet: - f = self.manifest_dir / "cuts_dev-other.json.gz" - logging.info(f"About to get dev-other cuts from {f}") - return load_manifest(f) diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py new file mode 120000 index 000000000..b76723bf5 --- /dev/null +++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/librispeech.py @@ -0,0 +1 @@ +../pruned_transducer_stateless3/librispeech.py \ No newline at end of file diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py index e1833b841..3b51ff9bc 100755 --- a/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py +++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/test_asr_datamodule.py @@ -28,7 +28,7 @@ from pathlib import Path from asr_datamodule import AsrDataModule from gigaspeech import GigaSpeech -from lhotse import load_manifest +from lhotse import load_manifest_lazy from librispeech import LibriSpeech @@ -41,8 +41,8 @@ def test_dataset(): print(args) if args.enable_musan: - cuts_musan = load_manifest( - Path(args.manifest_dir) / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + Path(args.manifest_dir) / "musan_cuts.jsonl.gz" ) else: cuts_musan = None @@ -57,14 +57,12 @@ def test_dataset(): libri_train_dl = asr_datamodule.train_dataloaders( train_clean_100, - dynamic_bucketing=False, on_the_fly_feats=False, cuts_musan=cuts_musan, ) giga_train_dl = asr_datamodule.train_dataloaders( train_S, - dynamic_bucketing=True, on_the_fly_feats=True, cuts_musan=cuts_musan, ) diff --git a/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py b/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py index 5572d3f4c..217fdb39a 100755 --- 
a/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py +++ b/egs/librispeech/ASR/transducer_stateless_multi_datasets/train.py @@ -73,7 +73,7 @@ from conformer import Conformer from decoder import Decoder from gigaspeech import GigaSpeech from joiner import Joiner -from lhotse import CutSet, load_manifest +from lhotse import CutSet, load_manifest_lazy from lhotse.cut import Cut from lhotse.utils import fix_random_seed from librispeech import LibriSpeech @@ -662,19 +662,17 @@ def train_one_epoch( def filter_short_and_long_utterances(cuts: CutSet) -> CutSet: def remove_short_and_long_utt(c: Cut): # Keep only utterances with duration between 1 second and 20 seconds + # + # Caution: There is a reason to select 20.0 here. Please see + # ../local/display_manifest_statistics.py + # + # You should use ../local/display_manifest_statistics.py to get + # an utterance duration distribution for your dataset to select + # the threshold return 1.0 <= c.duration <= 20.0 - num_in_total = len(cuts) cuts = cuts.filter(remove_short_and_long_utt) - num_left = len(cuts) - num_removed = num_in_total - num_left - removed_percent = num_removed / num_in_total * 100 - - logging.info(f"Before removing short and long utterances: {num_in_total}") - logging.info(f"After removing short and long utterances: {num_left}") - logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)") - return cuts @@ -767,17 +765,18 @@ def run(rank, world_size, args): # DEV 12 hours # Test 40 hours if params.full_libri: - logging.info("Using the L subset of GigaSpeech (2.5k hours)") - train_giga_cuts = gigaspeech.train_L_cuts() + logging.info("Using the XL subset of GigaSpeech (10k hours)") + train_giga_cuts = gigaspeech.train_XL_cuts() else: logging.info("Using the S subset of GigaSpeech (250 hours)") train_giga_cuts = gigaspeech.train_S_cuts() train_giga_cuts = filter_short_and_long_utterances(train_giga_cuts) + train_giga_cuts = train_giga_cuts.repeat(times=None) if args.enable_musan: - cuts_musan = load_manifest( - Path(args.manifest_dir) / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + Path(args.manifest_dir) / "musan_cuts.jsonl.gz" ) else: cuts_musan = None @@ -786,14 +785,12 @@ def run(rank, world_size, args): train_dl = asr_datamodule.train_dataloaders( train_cuts, - dynamic_bucketing=False, on_the_fly_feats=False, cuts_musan=cuts_musan, ) giga_train_dl = asr_datamodule.train_dataloaders( train_giga_cuts, - dynamic_bucketing=True, on_the_fly_feats=True, cuts_musan=cuts_musan, ) diff --git a/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py index f165f6e60..a674d5527 100644 --- a/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/spgispeech/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -22,7 +22,7 @@ from pathlib import Path from typing import Any, Dict, Optional import torch -from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( CutConcatenate, CutMix, @@ -176,7 +176,7 @@ class SPGISpeechAsrDataModule: The state dict for the training sampler. 
""" logging.info("About to get Musan cuts") - cuts_musan = load_manifest( + cuts_musan = load_manifest_lazy( self.args.manifest_dir / "cuts_musan.jsonl.gz" ) diff --git a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py index 14200f34f..78351d77c 100755 --- a/egs/tedlium3/ASR/local/compute_fbank_tedlium.py +++ b/egs/tedlium3/ASR/local/compute_fbank_tedlium.py @@ -52,8 +52,13 @@ def compute_fbank_tedlium(): "test", ) + prefix = "tedlium" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="tedlium", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -61,7 +66,7 @@ def compute_fbank_tedlium(): with get_executor() as ex: # Initialize the executor only once. for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") @@ -80,7 +85,7 @@ def compute_fbank_tedlium(): cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=cur_num_jobs, executor=ex, @@ -88,7 +93,7 @@ def compute_fbank_tedlium(): ) # Split long cuts into many short and un-overlapping cuts cut_set = cut_set.trim_to_supervisions(keep_overlapping=False) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}") if __name__ == "__main__": diff --git a/egs/tedlium3/ASR/local/display_manifest_statistics.py b/egs/tedlium3/ASR/local/display_manifest_statistics.py index 972d03b12..52e152389 100755 --- a/egs/tedlium3/ASR/local/display_manifest_statistics.py +++ b/egs/tedlium3/ASR/local/display_manifest_statistics.py @@ -27,15 +27,15 @@ for usage. 
""" -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): - path = "./data/fbank/cuts_train.json.gz" - path = "./data/fbank/cuts_dev.json.gz" - path = "./data/fbank/cuts_test.json.gz" + path = "./data/fbank/tedlium_cuts_train.jsonl.gz" + path = "./data/fbank/tedlium_cuts_dev.jsonl.gz" + path = "./data/fbank/tedlium_cuts_test.jsonl.gz" - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() diff --git a/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py b/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py index a6b986a94..ae22bfd92 100644 --- a/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py +++ b/egs/tedlium3/ASR/transducer_stateless/asr_datamodule.py @@ -22,11 +22,11 @@ import logging from functools import lru_cache from pathlib import Path -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( - BucketingSampler, CutConcatenate, CutMix, + DynamicBucketingSampler, K2SpeechRecognitionDataset, PrecomputedFeatures, SingleCutSampler, @@ -92,7 +92,7 @@ class TedLiumAsrDataModule: "--num-buckets", type=int, default=30, - help="The number of buckets for the BucketingSampler" + help="The number of buckets for the DynamicBucketingSampler" "(you might want to increase it for larger datasets).", ) group.add_argument( @@ -179,8 +179,8 @@ class TedLiumAsrDataModule: transforms = [] if self.args.enable_musan: logging.info("Enable MUSAN") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms.append( CutMix( @@ -261,13 +261,12 @@ class TedLiumAsrDataModule: ) if self.args.bucketing_sampler: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( cuts_train, max_duration=self.args.max_duration, shuffle=self.args.shuffle, num_buckets=self.args.num_buckets, - bucket_method="equal_duration", drop_last=True, ) else: @@ -311,7 +310,7 @@ class TedLiumAsrDataModule: cut_transforms=transforms, return_cuts=self.args.return_cuts, ) - valid_sampler = BucketingSampler( + valid_sampler = DynamicBucketingSampler( cuts_valid, max_duration=self.args.max_duration, shuffle=False, @@ -335,8 +334,10 @@ class TedLiumAsrDataModule: else PrecomputedFeatures(), return_cuts=self.args.return_cuts, ) - sampler = BucketingSampler( - cuts, max_duration=self.args.max_duration, shuffle=False + sampler = DynamicBucketingSampler( + cuts, + max_duration=self.args.max_duration, + shuffle=False, ) logging.debug("About to create test dataloader") test_dl = DataLoader( @@ -350,14 +351,20 @@ class TedLiumAsrDataModule: @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - return load_manifest(self.args.manifest_dir / "cuts_train.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "tedlium_cuts_train.jsonl.gz" + ) @lru_cache() def dev_cuts(self) -> CutSet: logging.info("About to get dev cuts") - return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "tedlium_cuts_dev.jsonl.gz" + ) @lru_cache() def test_cuts(self) -> CutSet: logging.info("About to get test cuts") - return load_manifest(self.args.manifest_dir / "cuts_test.json.gz") + return load_manifest_lazy( + self.args.manifest_dir / "tedlium_cuts_test.jsonl.gz" + ) diff --git 
a/egs/timit/ASR/local/compute_fbank_timit.py b/egs/timit/ASR/local/compute_fbank_timit.py index 8e3cbac4e..094769c8c 100644 --- a/egs/timit/ASR/local/compute_fbank_timit.py +++ b/egs/timit/ASR/local/compute_fbank_timit.py @@ -29,7 +29,7 @@ import os from pathlib import Path import torch -from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -53,8 +53,13 @@ def compute_fbank_timit(): "DEV", "TEST", ) + prefix = "timit" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( - prefix="timit", dataset_parts=dataset_parts, output_dir=src_dir + dataset_parts=dataset_parts, + output_dir=src_dir, + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -62,7 +67,8 @@ def compute_fbank_timit(): with get_executor() as ex: # Initialize the executor only once. for partition, m in manifests.items(): - if (output_dir / f"cuts_{partition}.json.gz").is_file(): + cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}" + if cuts_file.is_file(): logging.info(f"{partition} already exists - skipping.") continue logging.info(f"Processing {partition}") @@ -78,13 +84,13 @@ def compute_fbank_timit(): ) cut_set = cut_set.compute_and_store_features( extractor=extractor, - storage_path=f"{output_dir}/feats_{partition}", + storage_path=f"{output_dir}/{prefix}_feats_{partition}", # when an executor is specified, make more partitions num_jobs=num_jobs if ex is None else 80, executor=ex, - storage_type=LilcomHdf5Writer, + storage_type=LilcomChunkyWriter, ) - cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") + cut_set.to_file(cuts_file) if __name__ == "__main__": diff --git a/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py index a7029f514..665b5a771 100644 --- a/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py +++ b/egs/timit/ASR/tdnn_lstm_ctc/asr_datamodule.py @@ -23,11 +23,11 @@ from functools import lru_cache from pathlib import Path from typing import List, Union -from lhotse import CutSet, Fbank, FbankConfig, load_manifest +from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy from lhotse.dataset import ( - BucketingSampler, CutConcatenate, CutMix, + DynamicBucketingSampler, K2SpeechRecognitionDataset, PrecomputedFeatures, SingleCutSampler, @@ -92,7 +92,7 @@ class TimitAsrDataModule(DataModule): "--num-buckets", type=int, default=30, - help="The number of buckets for the BucketingSampler" + help="The number of buckets for the DynamicBucketingSampler" "(you might want to increase it for larger datasets).", ) group.add_argument( @@ -154,7 +154,9 @@ class TimitAsrDataModule(DataModule): cuts_train = self.train_cuts() logging.info("About to get Musan cuts") - cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz") + cuts_musan = load_manifest_lazy( + self.args.feature_dir / "cuts_musan.jsonl.gz" + ) logging.info("About to create train dataset") transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))] @@ -218,13 +220,12 @@ class TimitAsrDataModule(DataModule): ) if self.args.bucketing_sampler: - logging.info("Using BucketingSampler.") - train_sampler = BucketingSampler( + logging.info("Using DynamicBucketingSampler.") + train_sampler = DynamicBucketingSampler( cuts_train, max_duration=self.args.max_duration, shuffle=self.args.shuffle, num_buckets=self.args.num_buckets, - bucket_method="equal_duration", drop_last=True, ) else: @@ -322,20 
+323,26 @@ class TimitAsrDataModule(DataModule): @lru_cache() def train_cuts(self) -> CutSet: logging.info("About to get train cuts") - cuts_train = load_manifest(self.args.feature_dir / "cuts_TRAIN.json.gz") + cuts_train = load_manifest_lazy( + self.args.feature_dir / "timit_cuts_TRAIN.jsonl.gz" + ) return cuts_train @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") - cuts_valid = load_manifest(self.args.feature_dir / "cuts_DEV.json.gz") + cuts_valid = load_manifest_lazy( + self.args.feature_dir / "timit_cuts_DEV.jsonl.gz" + ) return cuts_valid @lru_cache() def test_cuts(self) -> CutSet: logging.debug("About to get test cuts") - cuts_test = load_manifest(self.args.feature_dir / "cuts_TEST.json.gz") + cuts_test = load_manifest_lazy( + self.args.feature_dir / "timit_cuts_TEST.jsonl.gz" + ) return cuts_test diff --git a/egs/wenetspeech/ASR/local/display_manifest_statistics.py b/egs/wenetspeech/ASR/local/display_manifest_statistics.py index 30dc5a5ec..c41445b8d 100644 --- a/egs/wenetspeech/ASR/local/display_manifest_statistics.py +++ b/egs/wenetspeech/ASR/local/display_manifest_statistics.py @@ -26,7 +26,7 @@ for usage. """ -from lhotse import load_manifest +from lhotse import load_manifest_lazy def main(): @@ -40,7 +40,7 @@ def main(): for path in paths: print(f"Starting display the statistics for {path}") - cuts = load_manifest(path) + cuts = load_manifest_lazy(path) cuts.describe() diff --git a/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py b/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py index d2f8d85ce..6aebc2164 100644 --- a/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py +++ b/egs/wenetspeech/ASR/pruned_transducer_stateless2/asr_datamodule.py @@ -27,7 +27,7 @@ from lhotse import ( CutSet, Fbank, FbankConfig, - load_manifest, + load_manifest_lazy, set_caching_enabled, ) from lhotse.dataset import ( @@ -218,8 +218,8 @@ class WenetSpeechAsrDataModule: The state dict for the training sampler. 
""" logging.info("About to get Musan cuts") - cuts_musan = load_manifest( - self.args.manifest_dir / "cuts_musan.json.gz" + cuts_musan = load_manifest_lazy( + self.args.manifest_dir / "musan_cuts.jsonl.gz" ) transforms = [] @@ -435,16 +435,18 @@ class WenetSpeechAsrDataModule: @lru_cache() def valid_cuts(self) -> CutSet: logging.info("About to get dev cuts") - return load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz") + return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz") @lru_cache() def test_net_cuts(self) -> List[CutSet]: logging.info("About to get TEST_NET cuts") - return load_manifest(self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz") + return load_manifest_lazy( + self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz" + ) @lru_cache() def test_meeting_cuts(self) -> List[CutSet]: logging.info("About to get TEST_MEETING cuts") - return load_manifest( + return load_manifest_lazy( self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz" ) diff --git a/egs/yesno/ASR/local/compute_fbank_yesno.py b/egs/yesno/ASR/local/compute_fbank_yesno.py index 6922ffe10..fb48b6f8e 100755 --- a/egs/yesno/ASR/local/compute_fbank_yesno.py +++ b/egs/yesno/ASR/local/compute_fbank_yesno.py @@ -12,7 +12,7 @@ import os from pathlib import Path import torch -from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer +from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter from lhotse.recipes.utils import read_manifests_if_cached from icefall.utils import get_executor @@ -37,10 +37,13 @@ def compute_fbank_yesno(): "train", "test", ) + prefix = "yesno" + suffix = "jsonl.gz" manifests = read_manifests_if_cached( dataset_parts=dataset_parts, output_dir=src_dir, - prefix="yesno", + prefix=prefix, + suffix=suffix, ) assert manifests is not None @@ -50,7 +53,8 @@ def compute_fbank_yesno(): with get_executor() as ex: # Initialize the executor only once. 
diff --git a/egs/yesno/ASR/tdnn/asr_datamodule.py b/egs/yesno/ASR/tdnn/asr_datamodule.py
index 0a5a42089..85e5f1358 100644
--- a/egs/yesno/ASR/tdnn/asr_datamodule.py
+++ b/egs/yesno/ASR/tdnn/asr_datamodule.py
@@ -20,18 +20,19 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List
 
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse.dataset import (
+    CutConcatenate,
+    DynamicBucketingSampler,
+    K2SpeechRecognitionDataset,
+    PrecomputedFeatures,
+    SingleCutSampler,
+)
+from lhotse.dataset.input_strategies import OnTheFlyFeatures
 from torch.utils.data import DataLoader
 
 from icefall.dataset.datamodule import DataModule
 from icefall.utils import str2bool
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
-from lhotse.dataset import (
-    BucketingSampler,
-    CutConcatenate,
-    K2SpeechRecognitionDataset,
-    PrecomputedFeatures,
-)
-from lhotse.dataset.input_strategies import OnTheFlyFeatures
 
 
 class YesNoAsrDataModule(DataModule):
@@ -84,7 +85,7 @@ class YesNoAsrDataModule(DataModule):
             "--num-buckets",
             type=int,
             default=10,
-            help="The number of buckets for the BucketingSampler"
+            help="The number of buckets for the DynamicBucketingSampler"
             "(you might want to increase it for larger datasets).",
        )
         group.add_argument(
@@ -186,18 +187,17 @@ class YesNoAsrDataModule(DataModule):
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
             logging.info("Using SingleCutSampler.")
-            train_sampler = BucketingSampler(
+            train_sampler = SingleCutSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
@@ -225,8 +225,10 @@ class YesNoAsrDataModule(DataModule):
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts_test, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts_test,
+            max_duration=self.args.max_duration,
+            shuffle=False,
         )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
@@ -240,11 +242,15 @@ class YesNoAsrDataModule(DataModule):
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest(self.args.feature_dir / "cuts_train.json.gz")
+        cuts_train = load_manifest_lazy(
+            self.args.feature_dir / "yesno_cuts_train.jsonl.gz"
+        )
 
         return cuts_train
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        cuts_test = load_manifest(self.args.feature_dir / "cuts_test.json.gz")
+        cuts_test = load_manifest_lazy(
+            self.args.feature_dir / "yesno_cuts_test.jsonl.gz"
+        )
 
         return cuts_test
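Two things happen in the sampler hunks above: the else branch, which logged "Using SingleCutSampler." but built a BucketingSampler, now builds the sampler it advertises, and BucketingSampler is replaced everywhere by DynamicBucketingSampler, which batches by total duration while streaming a lazy CutSet. A sketch with illustrative settings rather than the recipes' tuned values:

# Sketch: duration-constrained dynamic bucketing over a lazy manifest.
# The path, max_duration, and num_buckets below are illustrative.
from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler

cuts_train = load_manifest_lazy("data/fbank/yesno_cuts_train.jsonl.gz")
train_sampler = DynamicBucketingSampler(
    cuts_train,
    max_duration=200.0,  # seconds of audio per batch
    shuffle=True,
    num_buckets=10,
    drop_last=True,
)
# Note: the old bucket_method="equal_duration" knob is gone; the dynamic
# sampler estimates bucket boundaries from a sample of cut durations.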
cuts") - cuts_test = load_manifest(self.args.feature_dir / "cuts_test.json.gz") + cuts_test = load_manifest_lazy( + self.args.feature_dir / "yesno_cuts_test.jsonl.gz" + ) return cuts_test diff --git a/icefall/utils.py b/icefall/utils.py index c9045006d..b38574f0c 100644 --- a/icefall/utils.py +++ b/icefall/utils.py @@ -131,7 +131,6 @@ def setup_logger( format=formatter, level=level, filemode="w", - force=True, ) if use_console: console = logging.StreamHandler()