Use jsonl for CutSet in the LibriSpeech recipe. (#397)

* Use jsonl for cutsets in the librispeech recipe.

* Use lazy cutset for all recipes.

* More fixes to use lazy CutSet.

* Remove force=True from logging to support Python < 3.8

* Minor fixes.

* Fix style issues.
Fangjun Kuang 2022-06-06 10:19:16 +08:00 committed by GitHub
parent e5884f82e0
commit f1abce72f8
68 changed files with 702 additions and 1098 deletions

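The change this commit applies across recipes is mostly mechanical: cut manifests move from eager JSON (load_manifest / to_json) to lazily opened JSONL (load_manifest_lazy / to_file), with dataset-prefixed file names. A minimal sketch of the new pattern, with an illustrative path (actual names vary per recipe, e.g. aidatatang_cuts_train.jsonl.gz):

# Minimal sketch of the eager-to-lazy switch; the path below is illustrative.
from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/aishell_cuts_dev.jsonl.gz")  # opens lazily
cuts.describe()  # iterates the JSONL stream instead of holding everything in memory

# Writing follows the same convention: to_file() picks the format from the
# ".jsonl.gz" extension, replacing the old cut_set.to_json(...) calls.
# cuts.to_file("data/fbank/aishell_cuts_dev.jsonl.gz")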
View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache

View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -99,7 +101,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -99,7 +101,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -99,7 +101,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -99,7 +101,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -59,6 +59,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -99,7 +101,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -58,6 +58,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -98,7 +100,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -58,6 +58,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -98,7 +100,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -58,6 +58,8 @@ jobs:
 - name: Install Python dependencies
   run: |
     grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
+    pip uninstall -y protobuf
+    pip install --no-binary protobuf protobuf
 - name: Cache kaldifeat
   id: my-cache
@@ -98,7 +100,7 @@ jobs:
   with:
     path: |
       ~/tmp/fbank-libri
-    key: cache-libri-fbank-test-clean-and-test-other
+    key: cache-libri-fbank-test-clean-and-test-other-v2
 - name: Compute fbank for LibriSpeech test-clean and test-other
   if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)
 def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
-    src_dir = Path("data/manifests/aidatatang_200zh")
+    src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
@@ -52,11 +52,13 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
         "dev",
         "test",
     )
+    prefix = "aidatatang"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="aidatatang",
-        suffix="jsonl.gz",
         dataset_parts=dataset_parts,
         output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
@@ -64,10 +66,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
+            for sup in m["supervisions"]:
+                sup.custom = {"origin": "aidatatang_200zh"}
             cut_set = CutSet.from_manifests(
                 recordings=m["recordings"],
                 supervisions=m["supervisions"],
@@ -80,13 +86,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
             )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
                 storage_type=ChunkedLilcomHdf5Writer,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
 def get_args():

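For reference, the file-naming convention the fbank scripts converge on in this commit is a sketch like the following (prefix/partition values shown are the ones used in this recipe):

# Sketch of the manifest naming convention used above.
prefix = "aidatatang"
suffix = "jsonl.gz"
partition = "train"
cuts_name = f"{prefix}_cuts_{partition}.{suffix}"  # -> aidatatang_cuts_train.jsonl.gz
feats_name = f"{prefix}_feats_{partition}"         # feature storage path stem
print(cuts_name, feats_name)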
View File

@@ -25,19 +25,19 @@ for usage.
 """
-from lhotse import load_manifest
+from lhotse import load_manifest_lazy
 def main():
     paths = [
-        "./data/fbank/cuts_train.json.gz",
-        "./data/fbank/cuts_dev.json.gz",
-        "./data/fbank/cuts_test.json.gz",
+        "./data/fbank/aidatatang_cuts_train.jsonl.gz",
+        "./data/fbank/aidatatang_cuts_dev.jsonl.gz",
+        "./data/fbank/aidatatang_cuts_test.jsonl.gz",
     ]
     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest(path)
+        cuts = load_manifest_lazy(path)
         cuts.describe()
@@ -45,7 +45,7 @@ if __name__ == "__main__":
     main()
 """
-Starting display the statistics for ./data/fbank/cuts_train.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_train.jsonl.gz
 Cuts count: 494715
 Total duration (hours): 422.6
 Speech duration (hours): 422.6 (100.0%)
@@ -61,7 +61,7 @@ min 1.0
 99.5% 8.0
 99.9% 9.5
 max 18.1
-Starting display the statistics for ./data/fbank/cuts_dev.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_dev.jsonl.gz
 Cuts count: 24216
 Total duration (hours): 20.2
 Speech duration (hours): 20.2 (100.0%)
@@ -77,7 +77,7 @@ min 1.2
 99.5% 7.3
 99.9% 8.8
 max 11.3
-Starting display the statistics for ./data/fbank/cuts_test.json.gz
+Starting display the statistics for ./data/fbank/aidatatang_cuts_test.jsonl.gz
 Cuts count: 48144
 Total duration (hours): 40.2
 Speech duration (hours): 40.2 (100.0%)

View File

@@ -27,11 +27,10 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest,
+    load_manifest_lazy,
     set_caching_enabled,
 )
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
     DynamicBucketingSampler,
@@ -205,8 +204,8 @@ class Aidatatang_200zhAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
         transforms = []
@@ -290,13 +289,12 @@ class Aidatatang_200zhAsrDataModule:
         )
         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -402,14 +400,20 @@ class Aidatatang_200zhAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
+        )
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz"
+        )
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
+        )

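The datamodules switch from BucketingSampler to DynamicBucketingSampler because a lazily opened CutSet does not know its length up front, while the dynamic sampler buckets on the fly. A rough usage sketch (the values here are illustrative, not the recipe defaults):

from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler

cuts_train = load_manifest_lazy("data/fbank/aidatatang_cuts_train.jsonl.gz")
train_sampler = DynamicBucketingSampler(
    cuts_train,
    max_duration=200.0,  # illustrative; the recipes pass --max-duration here
    shuffle=True,
    num_buckets=30,
    drop_last=True,
)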
View File

@@ -195,9 +195,9 @@ def get_params() -> AttributeDict:
             "best_train_epoch": -1,
             "best_valid_epoch": -1,
             "batch_idx_train": 0,
-            "log_interval": 10,
+            "log_interval": 50,
             "reset_interval": 200,
-            "valid_interval": 3000,
+            "valid_interval": 2000,
             # parameters for k2.ctc_loss
             "beam_size": 10,
             "reduction": "sum",

View File

@@ -0,0 +1,119 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file computes fbank features of the aidatatang_200zh dataset.
It looks for manifests in the directory data/manifests.

The generated fbank features are saved in data/fbank.
"""

import argparse
import logging
import os
from pathlib import Path

import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached

from icefall.utils import get_executor

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")
    num_jobs = min(15, os.cpu_count())

    dataset_parts = (
        "train",
        "test",
        "dev",
    )
    prefix = "aidatatang"
    suffix = "jsonl.gz"
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None

    extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                logging.info(f"{partition} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            for sup in m["supervisions"]:
                sup.custom = {"origin": "aidatatang_200zh"}
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if "train" in partition:
                cut_set = (
                    cut_set
                    + cut_set.perturb_speed(0.9)
                    + cut_set.perturb_speed(1.1)
                )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num-mel-bins",
        type=int,
        default=80,
        help="""The number of mel bins for Fbank""",
    )
    return parser.parse_args()


if __name__ == "__main__":
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    args = get_args()
    compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins)

View File

@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 from icefall.utils import get_executor
@@ -52,8 +52,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
         "dev",
         "test",
     )
+    prefix = "aishell"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="aishell", dataset_parts=dataset_parts, output_dir=src_dir
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
@@ -61,7 +66,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -77,13 +82,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
             )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=LilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
 def get_args():

View File

@@ -25,18 +25,18 @@ for usage.
 """
-from lhotse import load_manifest
+from lhotse import load_manifest_lazy
 def main():
-    # path = "./data/fbank/cuts_train.json.gz"
-    # path = "./data/fbank/cuts_test.json.gz"
-    # path = "./data/fbank/cuts_dev.json.gz"
-    # path = "./data/fbank/aidatatang_200zh/cuts_train_raw.jsonl.gz"
-    # path = "./data/fbank/aidatatang_200zh/cuts_test_raw.jsonl.gz"
-    path = "./data/fbank/aidatatang_200zh/cuts_dev_raw.jsonl.gz"
+    # path = "./data/fbank/aishell_cuts_train.jsonl.gz"
+    # path = "./data/fbank/aishell_cuts_test.jsonl.gz"
+    path = "./data/fbank/aishell_cuts_dev.jsonl.gz"
+    # path = "./data/fbank/aidatatang_cuts_train.jsonl.gz"
+    # path = "./data/fbank/aidatatang_cuts_test.jsonl.gz"
+    # path = "./data/fbank/aidatatang_cuts_dev.jsonl.gz"
-    cuts = load_manifest(path)
+    cuts = load_manifest_lazy(path)
     cuts.describe()

View File

@@ -1,71 +0,0 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path

from lhotse import CutSet
from lhotse.recipes.utils import read_manifests_if_cached


def preprocess_aidatatang_200zh():
    src_dir = Path("data/manifests/aidatatang_200zh")
    output_dir = Path("data/fbank/aidatatang_200zh")
    output_dir.mkdir(exist_ok=True, parents=True)

    dataset_parts = (
        "train",
        "test",
        "dev",
    )

    logging.info("Loading manifest")
    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts, output_dir=src_dir, prefix="aidatatang"
    )
    assert len(manifests) > 0

    for partition, m in manifests.items():
        logging.info(f"Processing {partition}")
        raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
        if raw_cuts_path.is_file():
            logging.info(f"{partition} already exists - skipping")
            continue
        for sup in m["supervisions"]:
            sup.custom = {"origin": "aidatatang_200zh"}
        cut_set = CutSet.from_manifests(
            recordings=m["recordings"],
            supervisions=m["supervisions"],
        )
        logging.info(f"Saving to {raw_cuts_path}")
        cut_set.to_file(raw_cuts_path)


def main():
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    preprocess_aidatatang_200zh()


if __name__ == "__main__":
    main()

View File

@@ -42,18 +42,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
   log "Stage 1: Prepare manifest"
   # We assume that you have downloaded the aidatatang_200zh corpus
   # to $dl_dir/aidatatang_200zh
-  if [ ! -f data/manifests/aidatatang_200zh/.manifests.done ]; then
-    mkdir -p data/manifests/aidatatang_200zh
-    lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
-    touch data/manifests/aidatatang_200zh/.manifests.done
+  if [ ! -f data/manifests/.aidatatang_200zh_manifests.done ]; then
+    mkdir -p data/manifests
+    lhotse prepare aidatatang-200zh $dl_dir data/manifests
+    touch data/manifests/.aidatatang_200zh_manifests.done
   fi
 fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Process aidatatang_200zh"
-  if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then
-    mkdir -p data/fbank/aidatatang_200zh
-    lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
-    touch data/fbank/aidatatang_200zh/.fbank.done
+  if [ ! -f data/fbank/.aidatatang_200zh_fbank.done ]; then
+    mkdir -p data/fbank
+    ./local/compute_fbank_aidatatang_200zh.py
+    touch data/fbank/.aidatatang_200zh_fbank.done
   fi
 fi

View File

@@ -23,11 +23,11 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
+    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -93,7 +93,7 @@ class AishellAsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the BucketingSampler"
+            help="The number of buckets for the DynamicBucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -133,6 +133,12 @@ class AishellAsrDataModule:
             help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
         )
+        group.add_argument(
+            "--drop-last",
+            type=str2bool,
+            default=True,
+            help="Whether to drop last batch. Used by sampler.",
+        )
         group.add_argument(
             "--return-cuts",
             type=str2bool,
@@ -177,8 +183,8 @@ class AishellAsrDataModule:
     def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
         transforms = []
@@ -262,14 +268,13 @@ class AishellAsrDataModule:
         )
         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
-                drop_last=True,
+                drop_last=self.args.drop_last,
             )
         else:
             logging.info("Using SingleCutSampler.")
@@ -313,7 +318,7 @@ class AishellAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = BucketingSampler(
+        valid_sampler = DynamicBucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -337,8 +342,10 @@ class AishellAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration,
+            shuffle=False,
         )
         test_dl = DataLoader(
             test,
@@ -351,17 +358,21 @@ class AishellAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest(
-            self.args.manifest_dir / "cuts_train.json.gz"
+        cuts_train = load_manifest_lazy(
+            self.args.manifest_dir / "aishell_cuts_train.jsonl.gz"
         )
         return cuts_train
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aishell_cuts_dev.jsonl.gz"
+        )
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "aishell_cuts_test.jsonl.gz"
+        )

View File

@@ -15,6 +15,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Usage
+    export CUDA_VISIBLE_DEVICES="0,1,2,3"
+    ./tdnn_lstm_ctc/train.py \
+        --world-size 4 \
+        --num-epochs 20 \
+        --max-duration 300
+"""
 import argparse
 import logging

View File

@@ -110,9 +110,7 @@ class Conformer(Transformer):
         x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)
         # Caution: We assume the subsampling factor is 4!
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            lengths = ((x_lens - 1) // 2 - 1) // 2
+        lengths = (((x_lens - 1) >> 1) - 1) >> 1
         assert x.size(0) == lengths.max().item()
         mask = make_pad_mask(lengths)

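The length computation above swaps floor division for bit shifts; for non-negative integer tensors the two are equivalent, which a quick check (illustrative, not part of the commit) confirms:

import torch

x_lens = torch.tensor([100, 37, 16])
lengths_div = ((x_lens - 1) // 2 - 1) // 2
lengths_shift = (((x_lens - 1) >> 1) - 1) >> 1
assert torch.equal(lengths_div, lengths_shift)  # same result, no floor_divide warning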
View File

@@ -21,6 +21,7 @@
 import argparse
 import logging
+import warnings
 from pathlib import Path
 from shutil import copyfile
 from typing import Optional, Tuple
@@ -386,7 +387,11 @@ def compute_loss(
     assert loss.requires_grad == is_training
     info = MetricsTracker()
-    info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (
+            (feature_lens // params.subsampling_factor).sum().item()
+        )
     # Note: We use reduction=sum while computing the loss.
     info["loss"] = loss.detach().cpu().item()

View File

@@ -18,7 +18,7 @@
 import logging
 from pathlib import Path
-from lhotse import CutSet, load_manifest
+from lhotse import CutSet, load_manifest_lazy
 class AIDatatang200zh:
@@ -28,26 +28,26 @@ class AIDatatang200zh:
         manifest_dir:
           It is expected to contain the following files::
-            - cuts_dev_raw.jsonl.gz
-            - cuts_train_raw.jsonl.gz
-            - cuts_test_raw.jsonl.gz
+            - aidatatang_cuts_dev.jsonl.gz
+            - aidatatang_cuts_train.jsonl.gz
+            - aidatatang_cuts_test.jsonl.gz
         """
         self.manifest_dir = Path(manifest_dir)
     def train_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_train_raw.jsonl.gz"
+        f = self.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
         logging.info(f"About to get train cuts from {f}")
-        cuts_train = load_manifest(f)
+        cuts_train = load_manifest_lazy(f)
         return cuts_train
     def valid_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_valid_raw.jsonl.gz"
+        f = self.manifest_dir / "aidatatang_cuts_valid.jsonl.gz"
         logging.info(f"About to get valid cuts from {f}")
-        cuts_valid = load_manifest(f)
+        cuts_valid = load_manifest_lazy(f)
         return cuts_valid
     def test_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_test_raw.jsonl.gz"
+        f = self.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
         logging.info(f"About to get test cuts from {f}")
-        cuts_test = load_manifest(f)
+        cuts_test = load_manifest_lazy(f)
         return cuts_test

View File

@@ -18,7 +18,7 @@
 import logging
 from pathlib import Path
-from lhotse import CutSet, load_manifest
+from lhotse import CutSet, load_manifest_lazy
 class AIShell:
@@ -28,26 +28,26 @@ class AIShell:
         manifest_dir:
           It is expected to contain the following files::
-            - cuts_dev.json.gz
-            - cuts_train.json.gz
-            - cuts_test.json.gz
+            - aishell_cuts_dev.jsonl.gz
+            - aishell_cuts_train.jsonl.gz
+            - aishell_cuts_test.jsonl.gz
         """
         self.manifest_dir = Path(manifest_dir)
     def train_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_train.json.gz"
+        f = self.manifest_dir / "aishell_cuts_train.jsonl.gz"
        logging.info(f"About to get train cuts from {f}")
-        cuts_train = load_manifest(f)
+        cuts_train = load_manifest_lazy(f)
         return cuts_train
     def valid_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_dev.json.gz"
+        f = self.manifest_dir / "aishell_cuts_dev.jsonl.gz"
         logging.info(f"About to get valid cuts from {f}")
-        cuts_valid = load_manifest(f)
+        cuts_valid = load_manifest_lazy(f)
         return cuts_valid
     def test_cuts(self) -> CutSet:
-        f = self.manifest_dir / "cuts_test.json.gz"
+        f = self.manifest_dir / "aishell_cuts_test.jsonl.gz"
         logging.info(f"About to get test cuts from {f}")
-        cuts_test = load_manifest(f)
+        cuts_test = load_manifest_lazy(f)
         return cuts_test

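A hypothetical usage of the AIShell helper shown above, assuming it is imported from the recipe-local aishell.py and that the manifests live under data/fbank:

from aishell import AIShell  # recipe-local module (assumption)

aishell = AIShell(manifest_dir="data/fbank")
train_cuts = aishell.train_cuts()  # lazily opens aishell_cuts_train.jsonl.gz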
View File

@@ -24,7 +24,6 @@ from typing import Optional
 from lhotse import CutSet, Fbank, FbankConfig
 from lhotse.dataset import (
-    BucketingSampler,
     CutMix,
     DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
@@ -73,8 +72,7 @@ class AsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the BucketingSampler "
-            "and DynamicBucketingSampler."
+            help="The number of buckets for the DynamicBucketingSampler "
             "(you might want to increase it for larger datasets).",
         )
@@ -147,7 +145,6 @@ class AsrDataModule:
     def train_dataloaders(
         self,
         cuts_train: CutSet,
-        dynamic_bucketing: bool,
         on_the_fly_feats: bool,
         cuts_musan: Optional[CutSet] = None,
     ) -> DataLoader:
@@ -157,9 +154,6 @@ class AsrDataModule:
             Cuts for training.
           cuts_musan:
             If not None, it is the cuts for mixing.
-          dynamic_bucketing:
-            True to use DynamicBucketingSampler;
-            False to use BucketingSampler.
           on_the_fly_feats:
             True to use OnTheFlyFeatures;
             False to use PrecomputedFeatures.
@@ -232,25 +226,14 @@ class AsrDataModule:
             return_cuts=self.args.return_cuts,
         )
-        if dynamic_bucketing:
-            logging.info("Using DynamicBucketingSampler.")
-            train_sampler = DynamicBucketingSampler(
-                cuts_train,
-                max_duration=self.args.max_duration,
-                shuffle=self.args.shuffle,
-                num_buckets=self.args.num_buckets,
-                drop_last=True,
-            )
-        else:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
-                cuts_train,
-                max_duration=self.args.max_duration,
-                shuffle=self.args.shuffle,
-                num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
-                drop_last=True,
-            )
+        logging.info("Using DynamicBucketingSampler.")
+        train_sampler = DynamicBucketingSampler(
+            cuts_train,
+            max_duration=self.args.max_duration,
+            shuffle=self.args.shuffle,
+            num_buckets=self.args.num_buckets,
+            drop_last=True,
+        )
         logging.info("About to create train dataloader")
         train_dl = DataLoader(
@@ -279,7 +262,7 @@ class AsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = BucketingSampler(
+        valid_sampler = DynamicBucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -303,8 +286,10 @@ class AsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration,
+            shuffle=False,
         )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(

View File

@@ -41,6 +41,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2"
 import argparse
 import logging
 import random
+import warnings
 from pathlib import Path
 from shutil import copyfile
 from typing import Optional, Tuple
@@ -55,7 +56,7 @@ from asr_datamodule import AsrDataModule
 from conformer import Conformer
 from decoder import Decoder
 from joiner import Joiner
-from lhotse import CutSet, load_manifest
+from lhotse import CutSet, load_manifest_lazy
 from lhotse.cut import Cut
 from lhotse.utils import fix_random_seed
 from model import Transducer
@@ -446,7 +447,11 @@ def compute_loss(
     assert loss.requires_grad == is_training
     info = MetricsTracker()
-    info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (
+            (feature_lens // params.subsampling_factor).sum().item()
+        )
     # Note: We use reduction=sum while computing the loss.
     info["loss"] = loss.detach().cpu().item()
@@ -635,20 +640,16 @@ def train_one_epoch(
 def filter_short_and_long_utterances(cuts: CutSet) -> CutSet:
     def remove_short_and_long_utt(c: Cut):
-        # Keep only utterances with duration between 1 second and 12 seconds
+        # Keep only utterances with duration between 1 second and 20 seconds
+        #
+        # Caution: There is a reason to select 12.0 here. Please see
+        # ../local/display_manifest_statistics.py
+        #
+        # You should use ../local/display_manifest_statistics.py to get
+        # an utterance duration distribution for your dataset to select
+        # the threshold
         return 1.0 <= c.duration <= 12.0
-    num_in_total = len(cuts)
     cuts = cuts.filter(remove_short_and_long_utt)
-    num_left = len(cuts)
-    num_removed = num_in_total - num_left
-    removed_percent = num_removed / num_in_total * 100
-    logging.info(f"Before removing short and long utterances: {num_in_total}")
-    logging.info(f"After removing short and long utterances: {num_left}")
-    logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
     return cuts
@@ -728,15 +729,14 @@ def run(rank, world_size, args):
     train_cuts = aishell.train_cuts()
     train_cuts = filter_short_and_long_utterances(train_cuts)
-    datatang = AIDatatang200zh(
-        manifest_dir=f"{args.manifest_dir}/aidatatang_200zh"
-    )
+    datatang = AIDatatang200zh(manifest_dir=args.manifest_dir)
     train_datatang_cuts = datatang.train_cuts()
     train_datatang_cuts = filter_short_and_long_utterances(train_datatang_cuts)
+    train_datatang_cuts = train_datatang_cuts.repeat(times=None)
     if args.enable_musan:
-        cuts_musan = load_manifest(
-            Path(args.manifest_dir) / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
         )
     else:
         cuts_musan = None
@@ -745,22 +745,23 @@ def run(rank, world_size, args):
     train_dl = asr_datamodule.train_dataloaders(
         train_cuts,
-        dynamic_bucketing=False,
         on_the_fly_feats=False,
         cuts_musan=cuts_musan,
     )
     datatang_train_dl = asr_datamodule.train_dataloaders(
         train_datatang_cuts,
-        dynamic_bucketing=True,
-        on_the_fly_feats=True,
+        on_the_fly_feats=False,
         cuts_musan=cuts_musan,
     )
     valid_cuts = aishell.valid_cuts()
     valid_dl = asr_datamodule.valid_dataloaders(valid_cuts)
-    for dl in [train_dl, datatang_train_dl]:
+    for dl in [
+        train_dl,
+        # datatang_train_dl
+    ]:
         scan_pessimistic_batches_for_oom(
             model=model,
             train_dl=dl,

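The training-script changes above drop the len(cuts) bookkeeping and add .repeat(times=None): a lazily opened CutSet cannot be counted cheaply, and the aidatatang cuts are now cycled indefinitely alongside the aishell data. A hedged sketch of that pattern:

from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/aidatatang_cuts_train.jsonl.gz")
cuts = cuts.filter(lambda c: 1.0 <= c.duration <= 12.0)  # stays lazy; applied during iteration
cuts = cuts.repeat(times=None)  # times=None repeats the manifest without end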
View File

@@ -37,6 +37,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2"
 import argparse
 import logging
+import warnings
 from pathlib import Path
 from shutil import copyfile
 from typing import Optional, Tuple
@@ -411,7 +412,11 @@ def compute_loss(
     assert loss.requires_grad == is_training
     info = MetricsTracker()
-    info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        info["frames"] = (
+            (feature_lens // params.subsampling_factor).sum().item()
+        )
     # Note: We use reduction=sum while computing the loss.
     info["loss"] = loss.detach().cpu().item()

View File

@@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)
 def compute_fbank_alimeeting(num_mel_bins: int = 80):
-    src_dir = Path("data/manifests/alimeeting")
+    src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
@@ -52,11 +52,14 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
         "eval",
         "test",
     )
+    prefix = "alimeeting"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
         dataset_parts=dataset_parts,
         output_dir=src_dir,
-        prefix="alimeeting",
-        suffix="jsonl.gz",
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
@@ -64,7 +67,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -83,7 +86,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=cur_num_jobs,
                 executor=ex,
@@ -95,7 +98,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
                 keep_overlapping=False,
                 min_duration=None,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
 def get_args():

View File

@@ -25,19 +25,19 @@ for usage.
 """
-from lhotse import load_manifest
+from lhotse import load_manifest_lazy
 def main():
     paths = [
-        "./data/fbank/cuts_train.json.gz",
-        "./data/fbank/cuts_eval.json.gz",
-        "./data/fbank/cuts_test.json.gz",
+        "./data/fbank/alimeeting_cuts_train.jsonl.gz",
+        "./data/fbank/alimeeting_cuts_eval.jsonl.gz",
+        "./data/fbank/alimeeting_cuts_test.jsonl.gz",
     ]
     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest(path)
+        cuts = load_manifest_lazy(path)
         cuts.describe()
@@ -45,7 +45,7 @@ if __name__ == "__main__":
     main()
 """
-Starting display the statistics for ./data/fbank/cuts_train.json.gz
+Starting display the statistics for ./data/fbank/alimeeting_cuts_train.jsonl.gz
 Cuts count: 559092
 Total duration (hours): 424.6
 Speech duration (hours): 424.6 (100.0%)
@@ -61,7 +61,7 @@ min 0.0
 99.5% 14.7
 99.9% 16.2
 max 284.3
-Starting display the statistics for ./data/fbank/cuts_eval.json.gz
+Starting display the statistics for ./data/fbank/alimeeting_cuts_eval.jsonl.gz
 Cuts count: 6457
 Total duration (hours): 4.9
 Speech duration (hours): 4.9 (100.0%)
@@ -77,7 +77,7 @@ min 0.1
 99.5% 14.1
 99.9% 14.7
 max 15.8
-Starting display the statistics for ./data/fbank/cuts_test.json.gz
+Starting display the statistics for ./data/fbank/alimeeting_cuts_test.jsonl.gz
 Cuts count: 16358
 Total duration (hours): 12.5
 Speech duration (hours): 12.5 (100.0%)

View File

@@ -27,7 +27,7 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest,
+    load_manifest_lazy,
     set_caching_enabled,
 )
 from lhotse.dataset import (
@@ -204,8 +204,8 @@ class AlimeetingAsrDataModule:
             The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
         transforms = []
@@ -401,14 +401,20 @@ class AlimeetingAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "alimeeting_cuts_train.jsonl.gz"
+        )
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_eval.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "alimeeting_cuts_eval.jsonl.gz"
+        )
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "alimeeting_cuts_test.jsonl.gz"
+        )

View File

@@ -20,9 +20,8 @@ import logging
 from functools import lru_cache
 from pathlib import Path
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
     DynamicBucketingSampler,
@@ -190,8 +189,8 @@ class GigaSpeechAsrDataModule:
     def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
         transforms = []
@@ -315,7 +314,7 @@ class GigaSpeechAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = BucketingSampler(
+        valid_sampler = DynamicBucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -339,8 +338,10 @@ class GigaSpeechAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration,
+            shuffle=False,
         )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
@@ -361,7 +362,9 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        cuts_valid = load_manifest_lazy(
+            self.args.manifest_dir / "cuts_DEV.jsonl.gz"
+        )
         if self.args.small_dev:
             return cuts_valid.subset(first=1000)
         else:
@@ -370,4 +373,4 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")

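The GigaSpeech datamodule keeps its --small-dev shortcut; on a lazily loaded manifest, subset(first=N) should simply keep the first N cuts while iterating. Illustrative only:

from lhotse import load_manifest_lazy

cuts_valid = load_manifest_lazy("data/fbank/cuts_DEV.jsonl.gz")
small_dev = cuts_valid.subset(first=1000)  # first 1000 cuts only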
View File

@@ -1,103 +0,0 @@
#!/usr/bin/env python3
# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path

import torch
from lhotse import (
    CutSet,
    KaldifeatFbank,
    KaldifeatFbankConfig,
    combine,
)
from lhotse.recipes.utils import read_manifests_if_cached

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def compute_fbank_musan():
    src_dir = Path("data/manifests")
    output_dir = Path("data/fbank")

    # number of workers in dataloader
    num_workers = 10

    # number of seconds in a batch
    batch_duration = 600

    dataset_parts = (
        "music",
        "speech",
        "noise",
    )
    manifests = read_manifests_if_cached(
        prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir
    )
    assert manifests is not None

    musan_cuts_path = output_dir / "cuts_musan.json.gz"

    if musan_cuts_path.is_file():
        logging.info(f"{musan_cuts_path} already exists - skipping")
        return

    logging.info("Extracting features for Musan")

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))

    logging.info(f"device: {device}")

    musan_cuts = (
        CutSet.from_manifests(
            recordings=combine(
                part["recordings"] for part in manifests.values()
            )
        )
        .cut_into_windows(10.0)
        .filter(lambda c: c.duration > 5)
        .compute_and_store_features_batch(
            extractor=extractor,
            storage_path=f"{output_dir}/feats_musan",
            num_workers=num_workers,
            batch_duration=batch_duration,
        )
    )
    musan_cuts.to_json(musan_cuts_path)


def main():
    formatter = (
        "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    )
    logging.basicConfig(format=formatter, level=logging.INFO)
    compute_fbank_musan()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1 @@
+../../../librispeech/ASR/local/compute_fbank_musan.py

View File

@@ -23,9 +23,8 @@ from pathlib import Path
 from typing import Any, Dict, Optional
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
     DynamicBucketingSampler,
@@ -217,8 +216,8 @@ class GigaSpeechAsrDataModule:
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
             logging.info("About to get Musan cuts")
-            cuts_musan = load_manifest(
-                self.args.manifest_dir / "cuts_musan.json.gz"
+            cuts_musan = load_manifest_lazy(
+                self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
                 CutMix(
@@ -358,7 +357,7 @@ class GigaSpeechAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = BucketingSampler(
+        valid_sampler = DynamicBucketingSampler(
             cuts_valid,
             max_duration=self.args.max_duration,
             shuffle=False,
@@ -382,8 +381,10 @@ class GigaSpeechAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration,
+            shuffle=False,
        )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
@@ -404,7 +405,9 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        cuts_valid = load_manifest_lazy(
+            self.args.manifest_dir / "cuts_DEV.jsonl.gz"
+        )
         if self.args.small_dev:
             return cuts_valid.subset(first=1000)
         else:
@@ -413,4 +416,4 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")

View File

@@ -96,14 +96,14 @@ def get_parser():
             - labels_xxx.h5
             - aux_labels_xxx.h5
-            - cuts_xxx.json.gz
+            - librispeech_cuts_xxx.jsonl.gz
         where xxx is the value of `--dataset`. For instance, if
         `--dataset` is `train-clean-100`, it will contain 3 files:
             - `labels_train-clean-100.h5`
             - `aux_labels_train-clean-100.h5`
-            - `cuts_train-clean-100.json.gz`
+            - `librispeech_cuts_train-clean-100.jsonl.gz`
         Note: Both labels_xxx.h5 and aux_labels_xxx.h5 contain framewise
         alignment. The difference is that labels_xxx.h5 contains repeats.
@@ -289,7 +289,9 @@ def main():
     out_labels_ali_filename = out_dir / f"labels_{params.dataset}.h5"
     out_aux_labels_ali_filename = out_dir / f"aux_labels_{params.dataset}.h5"
-    out_manifest_filename = out_dir / f"cuts_{params.dataset}.json.gz"
+    out_manifest_filename = (
+        out_dir / f"librispeech_cuts_{params.dataset}.jsonl.gz"
+    )
     for f in (
         out_labels_ali_filename,

View File

@ -17,6 +17,17 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./conformer_ctc/train.py \
--exp-dir ./conformer_ctc/exp \
--world-size 4 \
--full-libri 1 \
--max-duration 200 \
--num-epochs 20
"""
import argparse import argparse
import logging import logging
from pathlib import Path from pathlib import Path
@ -29,6 +40,7 @@ import torch.multiprocessing as mp
import torch.nn as nn import torch.nn as nn
from asr_datamodule import LibriSpeechAsrDataModule from asr_datamodule import LibriSpeechAsrDataModule
from conformer import Conformer from conformer import Conformer
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed from lhotse.utils import fix_random_seed
from torch import Tensor from torch import Tensor
from torch.nn.parallel import DistributedDataParallel as DDP from torch.nn.parallel import DistributedDataParallel as DDP
@ -676,6 +688,20 @@ def run(rank, world_size, args):
if params.full_libri: if params.full_libri:
train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_clean_360_cuts()
train_cuts += librispeech.train_other_500_cuts() train_cuts += librispeech.train_other_500_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
train_cuts = train_cuts.filter(remove_short_and_long_utt)
train_dl = librispeech.train_dataloaders(train_cuts) train_dl = librispeech.train_dataloaders(train_cuts)
valid_cuts = librispeech.dev_clean_cuts() valid_cuts = librispeech.dev_clean_cuts()

View File

@ -20,11 +20,7 @@ import logging
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import ( from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
CutSet,
KaldifeatFbank,
KaldifeatFbankConfig,
)
# Torch's multithreaded behavior needs to be disabled or # Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down. # it wastes a lot of CPU and slow things down.
@ -51,13 +47,16 @@ def compute_fbank_gigaspeech_dev_test():
logging.info(f"device: {device}") logging.info(f"device: {device}")
prefix = "gigaspeech"
suffix = "jsonl.gz"
for partition in subsets: for partition in subsets:
cuts_path = in_out_dir / f"cuts_{partition}.jsonl.gz" cuts_path = in_out_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_path.is_file(): if cuts_path.is_file():
logging.info(f"{cuts_path} exists - skipping") logging.info(f"{cuts_path} exists - skipping")
continue continue
raw_cuts_path = in_out_dir / f"cuts_{partition}_raw.jsonl.gz" raw_cuts_path = in_out_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
logging.info(f"Loading {raw_cuts_path}") logging.info(f"Loading {raw_cuts_path}")
cut_set = CutSet.from_file(raw_cuts_path) cut_set = CutSet.from_file(raw_cuts_path)
@ -66,7 +65,7 @@ def compute_fbank_gigaspeech_dev_test():
cut_set = cut_set.compute_and_store_features_batch( cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor, extractor=extractor,
storage_path=f"{in_out_dir}/feats_{partition}", storage_path=f"{in_out_dir}/{prefix}_feats_{partition}",
num_workers=num_workers, num_workers=num_workers,
batch_duration=batch_duration, batch_duration=batch_duration,
) )

View File

@ -77,7 +77,7 @@ def get_parser():
def compute_fbank_gigaspeech_splits(args): def compute_fbank_gigaspeech_splits(args):
num_splits = args.num_splits num_splits = args.num_splits
output_dir = f"data/fbank/XL_split_{num_splits}" output_dir = f"data/fbank/gigaspeech_XL_split_{num_splits}"
output_dir = Path(output_dir) output_dir = Path(output_dir)
assert output_dir.exists(), f"{output_dir} does not exist!" assert output_dir.exists(), f"{output_dir} does not exist!"
@ -96,17 +96,19 @@ def compute_fbank_gigaspeech_splits(args):
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device)) extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
logging.info(f"device: {device}") logging.info(f"device: {device}")
prefix = "gigaspeech"
num_digits = 8 # num_digits is fixed by lhotse split-lazy num_digits = 8 # num_digits is fixed by lhotse split-lazy
for i in range(start, stop): for i in range(start, stop):
idx = f"{i + 1}".zfill(num_digits) idx = f"{i + 1}".zfill(num_digits)
logging.info(f"Processing {idx}/{num_splits}") logging.info(f"Processing {idx}/{num_splits}")
cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz" cuts_path = output_dir / f"{prefix}_cuts_XL.{idx}.jsonl.gz"
if cuts_path.is_file(): if cuts_path.is_file():
logging.info(f"{cuts_path} exists - skipping") logging.info(f"{cuts_path} exists - skipping")
continue continue
raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz" raw_cuts_path = output_dir / f"{prefix}_cuts_XL_raw.{idx}.jsonl.gz"
if not raw_cuts_path.is_file(): if not raw_cuts_path.is_file():
logging.info(f"{raw_cuts_path} does not exist - skipping it") logging.info(f"{raw_cuts_path} does not exist - skipping it")
continue continue
@ -115,13 +117,13 @@ def compute_fbank_gigaspeech_splits(args):
cut_set = CutSet.from_file(raw_cuts_path) cut_set = CutSet.from_file(raw_cuts_path)
logging.info("Computing features") logging.info("Computing features")
if (output_dir / f"feats_XL_{idx}.lca").exists(): if (output_dir / f"{prefix}_feats_XL_{idx}.lca").exists():
logging.info(f"Removing {output_dir}/feats_XL_{idx}.lca") logging.info(f"Removing {output_dir}/{prefix}_feats_XL_{idx}.lca")
os.remove(output_dir / f"feats_XL_{idx}.lca") os.remove(output_dir / f"{prefix}_feats_XL_{idx}.lca")
cut_set = cut_set.compute_and_store_features_batch( cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_XL_{idx}", storage_path=f"{output_dir}/{prefix}_feats_XL_{idx}",
num_workers=args.num_workers, num_workers=args.num_workers,
batch_duration=args.batch_duration, batch_duration=args.batch_duration,
) )
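For the XL splits the recipe relies on the batched, GPU-friendly kaldifeat extractor rather than the multiprocess CPU path. A rough sketch of one split being processed, with the file names, batch_duration, and the final manifest write treated as assumptions rather than exact recipe code:

```python
import torch

from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig

# Use a GPU if available; kaldifeat batches frames on that device.
device = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))

# One of the 8-digit pieces produced by `lhotse split-lazy` (placeholder name).
split_dir = "data/fbank/gigaspeech_XL_split_2000"
cut_set = CutSet.from_file(f"{split_dir}/gigaspeech_cuts_XL_raw.00000001.jsonl.gz")

cut_set = cut_set.compute_and_store_features_batch(
    extractor=extractor,
    storage_path=f"{split_dir}/gigaspeech_feats_XL_00000001",
    num_workers=4,       # dataloading workers feeding the extractor
    batch_duration=600,  # seconds of audio per forward pass; placeholder
)
# Persist the cuts with feature references for this split (assumed step).
cut_set.to_file(f"{split_dir}/gigaspeech_cuts_XL.00000001.jsonl.gz")
```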

View File

@ -28,7 +28,7 @@ import os
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor from icefall.utils import get_executor
@ -56,8 +56,13 @@ def compute_fbank_librispeech():
"train-clean-360", "train-clean-360",
"train-other-500", "train-other-500",
) )
prefix = "librispeech"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
prefix="librispeech", dataset_parts=dataset_parts, output_dir=src_dir dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
) )
assert manifests is not None assert manifests is not None
@ -65,7 +70,8 @@ def compute_fbank_librispeech():
with get_executor() as ex: # Initialize the executor only once. with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items(): for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file(): cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
if (output_dir / cuts_filename).is_file():
logging.info(f"{partition} already exists - skipping.") logging.info(f"{partition} already exists - skipping.")
continue continue
logging.info(f"Processing {partition}") logging.info(f"Processing {partition}")
@ -81,13 +87,13 @@ def compute_fbank_librispeech():
) )
cut_set = cut_set.compute_and_store_features( cut_set = cut_set.compute_and_store_features(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}", storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions # when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80, num_jobs=num_jobs if ex is None else 80,
executor=ex, executor=ex,
storage_type=ChunkedLilcomHdf5Writer, storage_type=LilcomChunkyWriter,
) )
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") cut_set.to_file(output_dir / cuts_filename)
if __name__ == "__main__": if __name__ == "__main__":
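The LibriSpeech feature script now stores features through LilcomChunkyWriter and writes the manifest with to_file, which picks the jsonl.gz format from the file extension. A condensed sketch of one partition under those assumptions (paths and num_jobs are illustrative):

```python
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, load_manifest_lazy


def store_partition(cut_set: CutSet, output_dir: str, partition: str) -> None:
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    cut_set = cut_set.compute_and_store_features(
        extractor=extractor,
        storage_path=f"{output_dir}/librispeech_feats_{partition}",
        num_jobs=15,  # placeholder; the recipe chooses this based on the executor
        storage_type=LilcomChunkyWriter,  # replaces ChunkedLilcomHdf5Writer
    )
    # to_file() writes librispeech_cuts_<partition>.jsonl.gz, matching the
    # names the dataloaders now expect.
    cut_set.to_file(f"{output_dir}/librispeech_cuts_{partition}.jsonl.gz")


# The result can later be reopened without materializing it in memory:
# cuts = load_manifest_lazy("data/fbank/librispeech_cuts_train-clean-100.jsonl.gz")
```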

View File

@ -28,7 +28,7 @@ import os
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor from icefall.utils import get_executor
@ -52,12 +52,22 @@ def compute_fbank_musan():
"speech", "speech",
"noise", "noise",
) )
prefix = "musan"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
) )
assert manifests is not None assert manifests is not None
musan_cuts_path = output_dir / "cuts_musan.json.gz" assert len(manifests) == len(dataset_parts), (
len(manifests),
len(dataset_parts),
)
musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
if musan_cuts_path.is_file(): if musan_cuts_path.is_file():
logging.info(f"{musan_cuts_path} already exists - skipping") logging.info(f"{musan_cuts_path} already exists - skipping")
@ -79,13 +89,13 @@ def compute_fbank_musan():
.filter(lambda c: c.duration > 5) .filter(lambda c: c.duration > 5)
.compute_and_store_features( .compute_and_store_features(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_musan", storage_path=f"{output_dir}/musan_feats",
num_jobs=num_jobs if ex is None else 80, num_jobs=num_jobs if ex is None else 80,
executor=ex, executor=ex,
storage_type=ChunkedLilcomHdf5Writer, storage_type=LilcomChunkyWriter,
) )
) )
musan_cuts.to_json(musan_cuts_path) musan_cuts.to_file(musan_cuts_path)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -25,19 +25,19 @@ for usage.
""" """
from lhotse import load_manifest from lhotse import load_manifest_lazy
def main(): def main():
path = "./data/fbank/cuts_train-clean-100.json.gz" # path = "./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
path = "./data/fbank/cuts_train-clean-360.json.gz" # path = "./data/fbank/librispeech_cuts_train-clean-360.jsonl.gz"
path = "./data/fbank/cuts_train-other-500.json.gz" # path = "./data/fbank/librispeech_cuts_train-other-500.jsonl.gz"
path = "./data/fbank/cuts_dev-clean.json.gz" # path = "./data/fbank/librispeech_cuts_dev-clean.jsonl.gz"
path = "./data/fbank/cuts_dev-other.json.gz" # path = "./data/fbank/librispeech_cuts_dev-other.jsonl.gz"
path = "./data/fbank/cuts_test-clean.json.gz" # path = "./data/fbank/librispeech_cuts_test-clean.jsonl.gz"
path = "./data/fbank/cuts_test-other.json.gz" path = "./data/fbank/librispeech_cuts_test-other.jsonl.gz"
cuts = load_manifest(path) cuts = load_manifest_lazy(path)
cuts.describe() cuts.describe()

View File

@ -58,17 +58,19 @@ def preprocess_giga_speech():
) )
logging.info("Loading manifest (may take 4 minutes)") logging.info("Loading manifest (may take 4 minutes)")
prefix = "gigaspeech"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
dataset_parts=dataset_parts, dataset_parts=dataset_parts,
output_dir=src_dir, output_dir=src_dir,
prefix="gigaspeech", prefix=prefix,
suffix="jsonl.gz", suffix=suffix,
) )
assert manifests is not None assert manifests is not None
for partition, m in manifests.items(): for partition, m in manifests.items():
logging.info(f"Processing {partition}") logging.info(f"Processing {partition}")
raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz" raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
if raw_cuts_path.is_file(): if raw_cuts_path.is_file():
logging.info(f"{partition} already exists - skipping") logging.info(f"{partition} already exists - skipping")
continue continue

View File

@ -25,7 +25,7 @@ We will add more checks later if needed.
Usage example: Usage example:
python3 ./local/validate_manifest.py \ python3 ./local/validate_manifest.py \
./data/fbank/cuts_train-clean-100.json.gz ./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
""" """
@ -33,7 +33,7 @@ import argparse
import logging import logging
from pathlib import Path from pathlib import Path
from lhotse import load_manifest, CutSet from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut from lhotse.cut import Cut
@ -76,7 +76,7 @@ def main():
logging.info(f"Validating {manifest}") logging.info(f"Validating {manifest}")
assert manifest.is_file(), f"{manifest} does not exist" assert manifest.is_file(), f"{manifest} does not exist"
cut_set = load_manifest(manifest) cut_set = load_manifest_lazy(manifest)
assert isinstance(cut_set, CutSet) assert isinstance(cut_set, CutSet)
for c in cut_set: for c in cut_set:

View File

@ -40,9 +40,9 @@ dl_dir=$PWD/download
# It will generate data/lang_bpe_xxx, # It will generate data/lang_bpe_xxx,
# data/lang_bpe_yyy if the array contains xxx, yyy # data/lang_bpe_yyy if the array contains xxx, yyy
vocab_sizes=( vocab_sizes=(
5000 # 5000
2000 # 2000
1000 # 1000
500 500
) )
@ -132,7 +132,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
) )
for part in ${parts[@]}; do for part in ${parts[@]}; do
python3 ./local/validate_manifest.py \ python3 ./local/validate_manifest.py \
data/fbank/cuts_${part}.json.gz data/fbank/librispeech_cuts_${part}.jsonl.gz
done done
touch data/fbank/.librispeech-validated.done touch data/fbank/.librispeech-validated.done
fi fi

View File

@ -124,9 +124,9 @@ fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Split XL subset into ${num_splits} pieces" log "Stage 4: Split XL subset into ${num_splits} pieces"
split_dir=data/fbank/XL_split_${num_splits} split_dir=data/fbank/gigaspeech_XL_split_${num_splits}
if [ ! -f $split_dir/.split_completed ]; then if [ ! -f $split_dir/.split_completed ]; then
lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir $chunk_size lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size
touch $split_dir/.split_completed touch $split_dir/.split_completed
fi fi
fi fi

View File

@ -807,28 +807,8 @@ def run(rank, world_size, args):
# the threshold # the threshold
return 1.0 <= c.duration <= 20.0 return 1.0 <= c.duration <= 20.0
num_in_total = len(train_cuts)
train_cuts = train_cuts.filter(remove_short_and_long_utt) train_cuts = train_cuts.filter(remove_short_and_long_utt)
try:
num_left = len(train_cuts)
num_removed = num_in_total - num_left
removed_percent = num_removed / num_in_total * 100
logging.info(
f"Before removing short and long utterances: {num_in_total}"
)
logging.info(f"After removing short and long utterances: {num_left}")
logging.info(
f"Removed {num_removed} utterances ({removed_percent:.5f}%)"
)
except TypeError as e:
# You can ignore this error as previous versions of Lhotse work fine
# for the above code. In recent versions of Lhotse, it uses
# lazy filter, producing cutsets that don't have the __len__ method
logging.info(str(e))
if params.start_batch > 0 and checkpoints and "sampler" in checkpoints: if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
# We only load the sampler's state dict when it loads a checkpoint # We only load the sampler's state dict when it loads a checkpoint
# saved in the middle of an epoch # saved in the middle of an epoch
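The deleted block counted cuts before and after filtering, which assumes an eagerly loaded manifest. With load_manifest_lazy, filter() produces a lazily evaluated CutSet without a usable __len__, which is exactly the TypeError the old code caught. A small sketch of the behaviour, with the manifest path as a placeholder:

```python
from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/librispeech_cuts_train-clean-100.jsonl.gz")
cuts = cuts.filter(lambda c: 1.0 <= c.duration <= 20.0)

try:
    n = len(cuts)  # lazily filtered cut sets do not support len()
except TypeError:
    # Counting now requires a full pass over the manifest, so the
    # before/after statistics logging was dropped rather than forcing it.
    n = sum(1 for _ in cuts)
```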

View File

@ -22,7 +22,6 @@ from typing import Optional
from lhotse import CutSet, Fbank, FbankConfig from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import ( from lhotse.dataset import (
BucketingSampler,
CutMix, CutMix,
DynamicBucketingSampler, DynamicBucketingSampler,
K2SpeechRecognitionDataset, K2SpeechRecognitionDataset,
@ -71,8 +70,7 @@ class AsrDataModule:
"--num-buckets", "--num-buckets",
type=int, type=int,
default=30, default=30,
help="The number of buckets for the BucketingSampler " help="The number of buckets for the DynamicBucketingSampler. "
"and DynamicBucketingSampler."
"(you might want to increase it for larger datasets).", "(you might want to increase it for larger datasets).",
) )
@ -152,7 +150,6 @@ class AsrDataModule:
def train_dataloaders( def train_dataloaders(
self, self,
cuts_train: CutSet, cuts_train: CutSet,
dynamic_bucketing: bool,
on_the_fly_feats: bool, on_the_fly_feats: bool,
cuts_musan: Optional[CutSet] = None, cuts_musan: Optional[CutSet] = None,
) -> DataLoader: ) -> DataLoader:
@ -162,9 +159,6 @@ class AsrDataModule:
Cuts for training. Cuts for training.
cuts_musan: cuts_musan:
If not None, it is the cuts for mixing. If not None, it is the cuts for mixing.
dynamic_bucketing:
True to use DynamicBucketingSampler;
False to use BucketingSampler.
on_the_fly_feats: on_the_fly_feats:
True to use OnTheFlyFeatures; True to use OnTheFlyFeatures;
False to use PrecomputedFeatures. False to use PrecomputedFeatures.
@ -230,25 +224,14 @@ class AsrDataModule:
return_cuts=self.args.return_cuts, return_cuts=self.args.return_cuts,
) )
if dynamic_bucketing: logging.info("Using DynamicBucketingSampler.")
logging.info("Using DynamicBucketingSampler.") train_sampler = DynamicBucketingSampler(
train_sampler = DynamicBucketingSampler( cuts_train,
cuts_train, max_duration=self.args.max_duration,
max_duration=self.args.max_duration, shuffle=self.args.shuffle,
shuffle=self.args.shuffle, num_buckets=self.args.num_buckets,
num_buckets=self.args.num_buckets, drop_last=True,
drop_last=True, )
)
else:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
logging.info("About to create train dataloader") logging.info("About to create train dataloader")
train_dl = DataLoader( train_dl = DataLoader(

View File

@ -22,7 +22,7 @@ import re
from pathlib import Path from pathlib import Path
import lhotse import lhotse
from lhotse import CutSet, load_manifest from lhotse import CutSet, load_manifest_lazy
class GigaSpeech: class GigaSpeech:
@ -32,13 +32,13 @@ class GigaSpeech:
manifest_dir: manifest_dir:
It is expected to contain the following files:: It is expected to contain the following files::
- XL_split_2000/cuts_XL.*.jsonl.gz - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz
- cuts_L_raw.jsonl.gz - gigaspeech_cuts_L_raw.jsonl.gz
- cuts_M_raw.jsonl.gz - gigaspeech_cuts_M_raw.jsonl.gz
- cuts_S_raw.jsonl.gz - gigaspeech_cuts_S_raw.jsonl.gz
- cuts_XS_raw.jsonl.gz - gigaspeech_cuts_XS_raw.jsonl.gz
- cuts_DEV_raw.jsonl.gz - gigaspeech_cuts_DEV_raw.jsonl.gz
- cuts_TEST_raw.jsonl.gz - gigaspeech_cuts_TEST_raw.jsonl.gz
""" """
self.manifest_dir = Path(manifest_dir) self.manifest_dir = Path(manifest_dir)
@ -46,10 +46,12 @@ class GigaSpeech:
logging.info("About to get train-XL cuts") logging.info("About to get train-XL cuts")
filenames = list( filenames = list(
glob.glob(f"{self.manifest_dir}/XL_split_2000/cuts_XL.*.jsonl.gz") glob.glob(
f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz" # noqa
)
) )
pattern = re.compile(r"cuts_XL.([0-9]+).jsonl.gz") pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz")
idx_filenames = [ idx_filenames = [
(int(pattern.search(f).group(1)), f) for f in filenames (int(pattern.search(f).group(1)), f) for f in filenames
] ]
@ -64,31 +66,31 @@ class GigaSpeech:
) )
def train_L_cuts(self) -> CutSet: def train_L_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_L_raw.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_L_raw.jsonl.gz"
logging.info(f"About to get train-L cuts from {f}") logging.info(f"About to get train-L cuts from {f}")
return CutSet.from_jsonl_lazy(f) return CutSet.from_jsonl_lazy(f)
def train_M_cuts(self) -> CutSet: def train_M_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_M_raw.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_M_raw.jsonl.gz"
logging.info(f"About to get train-M cuts from {f}") logging.info(f"About to get train-M cuts from {f}")
return CutSet.from_jsonl_lazy(f) return CutSet.from_jsonl_lazy(f)
def train_S_cuts(self) -> CutSet: def train_S_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_S_raw.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_S_raw.jsonl.gz"
logging.info(f"About to get train-S cuts from {f}") logging.info(f"About to get train-S cuts from {f}")
return CutSet.from_jsonl_lazy(f) return CutSet.from_jsonl_lazy(f)
def train_XS_cuts(self) -> CutSet: def train_XS_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XS_raw.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_XS_raw.jsonl.gz"
logging.info(f"About to get train-XS cuts from {f}") logging.info(f"About to get train-XS cuts from {f}")
return CutSet.from_jsonl_lazy(f) return CutSet.from_jsonl_lazy(f)
def test_cuts(self) -> CutSet: def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_TEST.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz"
logging.info(f"About to get TEST cuts from {f}") logging.info(f"About to get TEST cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def dev_cuts(self) -> CutSet: def dev_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_DEV.jsonl.gz" f = self.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz"
logging.info(f"About to get DEV cuts from {f}") logging.info(f"About to get DEV cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
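train_XL_cuts() globs the renamed split pieces and extracts their 8-digit indices with the regex above; the part of the method outside this hunk presumably sorts the pieces and chains them into one lazy CutSet. A hypothetical helper sketching that idea with lhotse.combine:

```python
import glob
import re

import lhotse


def load_xl_splits(manifest_dir: str) -> lhotse.CutSet:
    filenames = glob.glob(
        f"{manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz"
    )
    pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz")
    # Order the pieces by their zero-padded index so training sees them
    # in a deterministic sequence.
    indexed = sorted((int(pattern.search(f).group(1)), f) for f in filenames)
    # combine() concatenates the lazily opened manifests without
    # materializing them.
    return lhotse.combine(lhotse.load_manifest_lazy(f) for _, f in indexed)
```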

View File

@ -18,7 +18,7 @@
import logging import logging
from pathlib import Path from pathlib import Path
from lhotse import CutSet, load_manifest from lhotse import CutSet, load_manifest_lazy
class LibriSpeech: class LibriSpeech:
@ -28,47 +28,47 @@ class LibriSpeech:
manifest_dir: manifest_dir:
It is expected to contain the following files:: It is expected to contain the following files::
- cuts_dev-clean.json.gz - librispeech_cuts_dev-clean.jsonl.gz
- cuts_dev-other.json.gz - librispeech_cuts_dev-other.jsonl.gz
- cuts_test-clean.json.gz - librispeech_cuts_test-clean.jsonl.gz
- cuts_test-other.json.gz - librispeech_cuts_test-other.jsonl.gz
- cuts_train-clean-100.json.gz - librispeech_cuts_train-clean-100.jsonl.gz
- cuts_train-clean-360.json.gz - librispeech_cuts_train-clean-360.jsonl.gz
- cuts_train-other-500.json.gz - librispeech_cuts_train-other-500.jsonl.gz
""" """
self.manifest_dir = Path(manifest_dir) self.manifest_dir = Path(manifest_dir)
def train_clean_100_cuts(self) -> CutSet: def train_clean_100_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-100.json.gz" f = self.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
logging.info(f"About to get train-clean-100 cuts from {f}") logging.info(f"About to get train-clean-100 cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def train_clean_360_cuts(self) -> CutSet: def train_clean_360_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-360.json.gz" f = self.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
logging.info(f"About to get train-clean-360 cuts from {f}") logging.info(f"About to get train-clean-360 cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def train_other_500_cuts(self) -> CutSet: def train_other_500_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-other-500.json.gz" f = self.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
logging.info(f"About to get train-other-500 cuts from {f}") logging.info(f"About to get train-other-500 cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def test_clean_cuts(self) -> CutSet: def test_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-clean.json.gz" f = self.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
logging.info(f"About to get test-clean cuts from {f}") logging.info(f"About to get test-clean cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def test_other_cuts(self) -> CutSet: def test_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-other.json.gz" f = self.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
logging.info(f"About to get test-other cuts from {f}") logging.info(f"About to get test-other cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def dev_clean_cuts(self) -> CutSet: def dev_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-clean.json.gz" f = self.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
logging.info(f"About to get dev-clean cuts from {f}") logging.info(f"About to get dev-clean cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)
def dev_other_cuts(self) -> CutSet: def dev_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-other.json.gz" f = self.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
logging.info(f"About to get dev-other cuts from {f}") logging.info(f"About to get dev-other cuts from {f}")
return load_manifest(f) return load_manifest_lazy(f)

View File

@ -66,7 +66,7 @@ from conformer import Conformer
from decoder import Decoder from decoder import Decoder
from gigaspeech import GigaSpeech from gigaspeech import GigaSpeech
from joiner import Joiner from joiner import Joiner
from lhotse import CutSet, load_manifest from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut from lhotse.cut import Cut
from lhotse.dataset.sampling.base import CutSampler from lhotse.dataset.sampling.base import CutSampler
from lhotse.utils import fix_random_seed from lhotse.utils import fix_random_seed
@ -968,8 +968,8 @@ def run(rank, world_size, args):
train_giga_cuts = train_giga_cuts.repeat(times=None) train_giga_cuts = train_giga_cuts.repeat(times=None)
if args.enable_musan: if args.enable_musan:
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "cuts_musan.json.gz" Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
) )
else: else:
cuts_musan = None cuts_musan = None
@ -978,14 +978,12 @@ def run(rank, world_size, args):
train_dl = asr_datamodule.train_dataloaders( train_dl = asr_datamodule.train_dataloaders(
train_cuts, train_cuts,
dynamic_bucketing=False,
on_the_fly_feats=False, on_the_fly_feats=False,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )
giga_train_dl = asr_datamodule.train_dataloaders( giga_train_dl = asr_datamodule.train_dataloaders(
train_giga_cuts, train_giga_cuts,
dynamic_bucketing=True,
on_the_fly_feats=False, on_the_fly_feats=False,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )

View File

@ -24,7 +24,7 @@ from pathlib import Path
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
import torch import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
CutConcatenate, CutConcatenate,
CutMix, CutMix,
@ -224,8 +224,8 @@ class LibriSpeechAsrDataModule:
if self.args.enable_musan: if self.args.enable_musan:
logging.info("Enable MUSAN") logging.info("Enable MUSAN")
logging.info("About to get Musan cuts") logging.info("About to get Musan cuts")
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "cuts_musan.json.gz" self.args.manifest_dir / "musan_cuts.jsonl.gz"
) )
transforms.append( transforms.append(
CutMix( CutMix(
@ -407,40 +407,48 @@ class LibriSpeechAsrDataModule:
@lru_cache() @lru_cache()
def train_clean_100_cuts(self) -> CutSet: def train_clean_100_cuts(self) -> CutSet:
logging.info("About to get train-clean-100 cuts") logging.info("About to get train-clean-100 cuts")
return load_manifest( return load_manifest_lazy(
self.args.manifest_dir / "cuts_train-clean-100.json.gz" self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
) )
@lru_cache() @lru_cache()
def train_clean_360_cuts(self) -> CutSet: def train_clean_360_cuts(self) -> CutSet:
logging.info("About to get train-clean-360 cuts") logging.info("About to get train-clean-360 cuts")
return load_manifest( return load_manifest_lazy(
self.args.manifest_dir / "cuts_train-clean-360.json.gz" self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
) )
@lru_cache() @lru_cache()
def train_other_500_cuts(self) -> CutSet: def train_other_500_cuts(self) -> CutSet:
logging.info("About to get train-other-500 cuts") logging.info("About to get train-other-500 cuts")
return load_manifest( return load_manifest_lazy(
self.args.manifest_dir / "cuts_train-other-500.json.gz" self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
) )
@lru_cache() @lru_cache()
def dev_clean_cuts(self) -> CutSet: def dev_clean_cuts(self) -> CutSet:
logging.info("About to get dev-clean cuts") logging.info("About to get dev-clean cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
)
@lru_cache() @lru_cache()
def dev_other_cuts(self) -> CutSet: def dev_other_cuts(self) -> CutSet:
logging.info("About to get dev-other cuts") logging.info("About to get dev-other cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
)
@lru_cache() @lru_cache()
def test_clean_cuts(self) -> CutSet: def test_clean_cuts(self) -> CutSet:
logging.info("About to get test-clean cuts") logging.info("About to get test-clean cuts")
return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
)
@lru_cache() @lru_cache()
def test_other_cuts(self) -> CutSet: def test_other_cuts(self) -> CutSet:
logging.info("About to get test-other cuts") logging.info("About to get test-other cuts")
return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
)
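Only the MUSAN manifest name and the loader change here; the CutMix transform itself keeps the same arguments as elsewhere in the recipe. A minimal sketch, assuming the default data/fbank manifest directory:

```python
from lhotse import load_manifest_lazy
from lhotse.dataset import CutMix

# musan_cuts.jsonl.gz replaces the old cuts_musan.json.gz manifest.
cuts_musan = load_manifest_lazy("data/fbank/musan_cuts.jsonl.gz")

# Mix MUSAN noise into roughly half of the training cuts at 10-20 dB SNR,
# keeping the original cut IDs (same settings as the recipe uses).
transform = CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)
```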

View File

@ -16,6 +16,15 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./tdnn_lstm_ctc/train.py \
--world-size 4 \
--full-libri 1 \
--max-duration 300 \
--num-epochs 20
"""
import argparse import argparse
import logging import logging
@ -29,6 +38,7 @@ import torch.multiprocessing as mp
import torch.nn as nn import torch.nn as nn
import torch.optim as optim import torch.optim as optim
from asr_datamodule import LibriSpeechAsrDataModule from asr_datamodule import LibriSpeechAsrDataModule
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed from lhotse.utils import fix_random_seed
from model import TdnnLstm from model import TdnnLstm
from torch import Tensor from torch import Tensor
@ -544,10 +554,25 @@ def run(rank, world_size, args):
if params.full_libri: if params.full_libri:
train_cuts += librispeech.train_clean_360_cuts() train_cuts += librispeech.train_clean_360_cuts()
train_cuts += librispeech.train_other_500_cuts() train_cuts += librispeech.train_other_500_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
train_cuts = train_cuts.filter(remove_short_and_long_utt)
train_dl = librispeech.train_dataloaders(train_cuts) train_dl = librispeech.train_dataloaders(train_cuts)
valid_cuts = librispeech.dev_clean_cuts() valid_cuts = librispeech.dev_clean_cuts()
valid_cuts += librispeech.dev_other_cuts() valid_cuts += librispeech.dev_other_cuts()
valid_dl = librispeech.valid_dataloaders(valid_cuts) valid_dl = librispeech.valid_dataloaders(valid_cuts)
for epoch in range(params.start_epoch, params.num_epochs): for epoch in range(params.start_epoch, params.num_epochs):

View File

@ -44,8 +44,8 @@ from pathlib import Path
import sentencepiece as spm import sentencepiece as spm
import torch import torch
from alignment import get_word_starting_frames from alignment import get_word_starting_frames
from lhotse import CutSet, load_manifest from lhotse import CutSet, load_manifest_lazy
from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler from lhotse.dataset import DynamicBucketingSampler, K2SpeechRecognitionDataset
from lhotse.dataset.collation import collate_custom_field from lhotse.dataset.collation import collate_custom_field
@ -93,14 +93,15 @@ def main():
sp = spm.SentencePieceProcessor() sp = spm.SentencePieceProcessor()
sp.load(args.bpe_model) sp.load(args.bpe_model)
cuts_json = args.ali_dir / f"cuts_{args.dataset}.json.gz" cuts_jsonl = args.ali_dir / f"librispeech_cuts_{args.dataset}.jsonl.gz"
logging.info(f"Loading {cuts_json}") logging.info(f"Loading {cuts_jsonl}")
cuts = load_manifest(cuts_json) cuts = load_manifest_lazy(cuts_jsonl)
sampler = SingleCutSampler( sampler = DynamicBucketingSampler(
cuts, cuts,
max_duration=30, max_duration=30,
num_buckets=30,
shuffle=False, shuffle=False,
) )

View File

@ -1,333 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang
# Mingshuang Luo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import inspect
import logging
from pathlib import Path
from typing import Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import (
BucketingSampler,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
SpecAugment,
)
from lhotse.dataset.input_strategies import (
OnTheFlyFeatures,
PrecomputedFeatures,
)
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader
from icefall.utils import str2bool
class _SeedWorkers:
def __init__(self, seed: int):
self.seed = seed
def __call__(self, worker_id: int):
fix_random_seed(self.seed + worker_id)
class AsrDataModule:
def __init__(self, args: argparse.Namespace):
self.args = args
@classmethod
def add_arguments(cls, parser: argparse.ArgumentParser):
group = parser.add_argument_group(
title="ASR data related options",
description="These options are used for the preparation of "
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
"effective batch sizes, sampling strategies, applied data "
"augmentations, etc.",
)
group.add_argument(
"--max-duration",
type=int,
default=200.0,
help="Maximum pooled recordings duration (seconds) in a "
"single batch. You can reduce it if it causes CUDA OOM.",
)
group.add_argument(
"--bucketing-sampler",
type=str2bool,
default=True,
help="When enabled, the batches will come from buckets of "
"similar duration (saves padding frames).",
)
group.add_argument(
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler "
"and DynamicBucketingSampler."
"(you might want to increase it for larger datasets).",
)
group.add_argument(
"--shuffle",
type=str2bool,
default=True,
help="When enabled (=default), the examples will be "
"shuffled for each epoch.",
)
group.add_argument(
"--return-cuts",
type=str2bool,
default=True,
help="When enabled, each batch will have the "
"field: batch['supervisions']['cut'] with the cuts that "
"were used to construct it.",
)
group.add_argument(
"--num-workers",
type=int,
default=2,
help="The number of training dataloader workers that "
"collect the batches.",
)
group.add_argument(
"--enable-spec-aug",
type=str2bool,
default=True,
help="When enabled, use SpecAugment for training dataset.",
)
group.add_argument(
"--spec-aug-time-warp-factor",
type=int,
default=80,
help="Used only when --enable-spec-aug is True. "
"It specifies the factor for time warping in SpecAugment. "
"Larger values mean more warping. "
"A value less than 1 means to disable time warp.",
)
group.add_argument(
"--enable-musan",
type=str2bool,
default=True,
help="When enabled, select noise from MUSAN and mix it"
"with training dataset. ",
)
group.add_argument(
"--manifest-dir",
type=Path,
default=Path("data/fbank"),
help="Path to directory with train/valid/test cuts.",
)
group.add_argument(
"--on-the-fly-feats",
type=str2bool,
default=False,
help="When enabled, use on-the-fly cut mixing and feature "
"extraction. Will drop existing precomputed feature manifests "
"if available. Used only in dev/test CutSet",
)
def train_dataloaders(
self,
cuts_train: CutSet,
dynamic_bucketing: bool,
on_the_fly_feats: bool,
cuts_musan: Optional[CutSet] = None,
) -> DataLoader:
"""
Args:
cuts_train:
Cuts for training.
cuts_musan:
If not None, it is the cuts for mixing.
dynamic_bucketing:
True to use DynamicBucketingSampler;
False to use BucketingSampler.
on_the_fly_feats:
True to use OnTheFlyFeatures;
False to use PrecomputedFeatures.
"""
transforms = []
if cuts_musan is not None:
logging.info("Enable MUSAN")
transforms.append(
CutMix(
cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True
)
)
else:
logging.info("Disable MUSAN")
input_transforms = []
if self.args.enable_spec_aug:
logging.info("Enable SpecAugment")
logging.info(
f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
)
# Set the value of num_frame_masks according to Lhotse's version.
# In different Lhotse's versions, the default of num_frame_masks is
# different.
num_frame_masks = 10
num_frame_masks_parameter = inspect.signature(
SpecAugment.__init__
).parameters["num_frame_masks"]
if num_frame_masks_parameter.default == 1:
num_frame_masks = 2
logging.info(f"Num frame mask: {num_frame_masks}")
input_transforms.append(
SpecAugment(
time_warp_factor=self.args.spec_aug_time_warp_factor,
num_frame_masks=num_frame_masks,
features_mask_size=27,
num_feature_masks=2,
frames_mask_size=100,
)
)
else:
logging.info("Disable SpecAugment")
logging.info("About to create train dataset")
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
)
# NOTE: the PerturbSpeed transform should be added only if we
# remove it from data prep stage.
# Add on-the-fly speed perturbation; since originally it would
# have increased epoch size by 3, we will apply prob 2/3 and use
# 3x more epochs.
# Speed perturbation probably should come first before
# concatenation, but in principle the transforms order doesn't have
# to be strict (e.g. could be randomized)
# transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa
# Drop feats to be on the safe side.
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=(
OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
if on_the_fly_feats
else PrecomputedFeatures()
),
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
)
if dynamic_bucketing:
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
drop_last=True,
)
else:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
logging.info("About to create train dataloader")
# 'seed' is derived from the current random state, which will have
# previously been set in the main process.
seed = torch.randint(0, 100000, ()).item()
worker_init_fn = _SeedWorkers(seed)
train_dl = DataLoader(
train,
sampler=train_sampler,
batch_size=None,
num_workers=self.args.num_workers,
persistent_workers=False,
worker_init_fn=worker_init_fn,
)
return train_dl
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
transforms = []
logging.info("About to create dev dataset")
if self.args.on_the_fly_feats:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=OnTheFlyFeatures(
Fbank(FbankConfig(num_mel_bins=80))
),
return_cuts=self.args.return_cuts,
)
else:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.info("About to create dev dataloader")
valid_dl = DataLoader(
validate,
sampler=valid_sampler,
batch_size=None,
num_workers=2,
persistent_workers=False,
)
return valid_dl
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
logging.debug("About to create test dataset")
test = K2SpeechRecognitionDataset(
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
if self.args.on_the_fly_feats
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
test,
batch_size=None,
sampler=sampler,
num_workers=self.args.num_workers,
)
return test_dl

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/asr_datamodule.py

View File

@ -1,75 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
class GigaSpeech:
def __init__(self, manifest_dir: str):
"""
Args:
manifest_dir:
It is expected to contain the following files::
- cuts_XL_raw.jsonl.gz
- cuts_L_raw.jsonl.gz
- cuts_M_raw.jsonl.gz
- cuts_S_raw.jsonl.gz
- cuts_XS_raw.jsonl.gz
- cuts_DEV_raw.jsonl.gz
- cuts_TEST_raw.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_XL_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XL_raw.jsonl.gz"
logging.info(f"About to get train-XL cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_L_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_L_raw.jsonl.gz"
logging.info(f"About to get train-L cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_M_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_M_raw.jsonl.gz"
logging.info(f"About to get train-M cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_S_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_S_raw.jsonl.gz"
logging.info(f"About to get train-S cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_XS_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XS_raw.jsonl.gz"
logging.info(f"About to get train-XS cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_TEST.jsonl.gz"
logging.info(f"About to get TEST cuts from {f}")
return load_manifest(f)
def dev_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_DEV.jsonl.gz"
logging.info(f"About to get DEV cuts from {f}")
return load_manifest(f)

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/gigaspeech.py

View File

@ -1,74 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
class LibriSpeech:
def __init__(self, manifest_dir: str):
"""
Args:
manifest_dir:
It is expected to contain the following files::
- cuts_dev-clean.json.gz
- cuts_dev-other.json.gz
- cuts_test-clean.json.gz
- cuts_test-other.json.gz
- cuts_train-clean-100.json.gz
- cuts_train-clean-360.json.gz
- cuts_train-other-500.json.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_clean_100_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-100.json.gz"
logging.info(f"About to get train-clean-100 cuts from {f}")
return load_manifest(f)
def train_clean_360_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-360.json.gz"
logging.info(f"About to get train-clean-360 cuts from {f}")
return load_manifest(f)
def train_other_500_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-other-500.json.gz"
logging.info(f"About to get train-other-500 cuts from {f}")
return load_manifest(f)
def test_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-clean.json.gz"
logging.info(f"About to get test-clean cuts from {f}")
return load_manifest(f)
def test_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-other.json.gz"
logging.info(f"About to get test-other cuts from {f}")
return load_manifest(f)
def dev_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-clean.json.gz"
logging.info(f"About to get dev-clean cuts from {f}")
return load_manifest(f)
def dev_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-other.json.gz"
logging.info(f"About to get dev-other cuts from {f}")
return load_manifest(f)

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/librispeech.py

View File

@ -28,7 +28,7 @@ from pathlib import Path
from asr_datamodule import AsrDataModule from asr_datamodule import AsrDataModule
from gigaspeech import GigaSpeech from gigaspeech import GigaSpeech
from lhotse import load_manifest from lhotse import load_manifest_lazy
from librispeech import LibriSpeech from librispeech import LibriSpeech
@ -41,8 +41,8 @@ def test_dataset():
print(args) print(args)
if args.enable_musan: if args.enable_musan:
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "cuts_musan.json.gz" Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
) )
else: else:
cuts_musan = None cuts_musan = None
@ -57,14 +57,12 @@ def test_dataset():
libri_train_dl = asr_datamodule.train_dataloaders( libri_train_dl = asr_datamodule.train_dataloaders(
train_clean_100, train_clean_100,
dynamic_bucketing=False,
on_the_fly_feats=False, on_the_fly_feats=False,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )
giga_train_dl = asr_datamodule.train_dataloaders( giga_train_dl = asr_datamodule.train_dataloaders(
train_S, train_S,
dynamic_bucketing=True,
on_the_fly_feats=True, on_the_fly_feats=True,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )

View File

@ -73,7 +73,7 @@ from conformer import Conformer
from decoder import Decoder from decoder import Decoder
from gigaspeech import GigaSpeech from gigaspeech import GigaSpeech
from joiner import Joiner from joiner import Joiner
from lhotse import CutSet, load_manifest from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut from lhotse.cut import Cut
from lhotse.utils import fix_random_seed from lhotse.utils import fix_random_seed
from librispeech import LibriSpeech from librispeech import LibriSpeech
@ -662,19 +662,17 @@ def train_one_epoch(
def filter_short_and_long_utterances(cuts: CutSet) -> CutSet: def filter_short_and_long_utterances(cuts: CutSet) -> CutSet:
def remove_short_and_long_utt(c: Cut): def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds # Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0 return 1.0 <= c.duration <= 20.0
num_in_total = len(cuts)
cuts = cuts.filter(remove_short_and_long_utt) cuts = cuts.filter(remove_short_and_long_utt)
num_left = len(cuts)
num_removed = num_in_total - num_left
removed_percent = num_removed / num_in_total * 100
logging.info(f"Before removing short and long utterances: {num_in_total}")
logging.info(f"After removing short and long utterances: {num_left}")
logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
return cuts return cuts
@ -767,17 +765,18 @@ def run(rank, world_size, args):
# DEV 12 hours # DEV 12 hours
# Test 40 hours # Test 40 hours
if params.full_libri: if params.full_libri:
logging.info("Using the L subset of GigaSpeech (2.5k hours)") logging.info("Using the XL subset of GigaSpeech (10k hours)")
train_giga_cuts = gigaspeech.train_L_cuts() train_giga_cuts = gigaspeech.train_XL_cuts()
else: else:
logging.info("Using the S subset of GigaSpeech (250 hours)") logging.info("Using the S subset of GigaSpeech (250 hours)")
train_giga_cuts = gigaspeech.train_S_cuts() train_giga_cuts = gigaspeech.train_S_cuts()
train_giga_cuts = filter_short_and_long_utterances(train_giga_cuts) train_giga_cuts = filter_short_and_long_utterances(train_giga_cuts)
train_giga_cuts = train_giga_cuts.repeat(times=None)
if args.enable_musan: if args.enable_musan:
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "cuts_musan.json.gz" Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
) )
else: else:
cuts_musan = None cuts_musan = None
@ -786,14 +785,12 @@ def run(rank, world_size, args):
train_dl = asr_datamodule.train_dataloaders( train_dl = asr_datamodule.train_dataloaders(
train_cuts, train_cuts,
dynamic_bucketing=False,
on_the_fly_feats=False, on_the_fly_feats=False,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )
giga_train_dl = asr_datamodule.train_dataloaders( giga_train_dl = asr_datamodule.train_dataloaders(
train_giga_cuts, train_giga_cuts,
dynamic_bucketing=True,
on_the_fly_feats=True, on_the_fly_feats=True,
cuts_musan=cuts_musan, cuts_musan=cuts_musan,
) )
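One functional change in this hunk beyond the renamings: the GigaSpeech training cuts are now wrapped with repeat(times=None), turning the lazy CutSet into an endlessly repeating stream, presumably so the auxiliary GigaSpeech dataloader can be drawn from indefinitely while LibriSpeech defines the epoch. A minimal sketch (path illustrative):

```python
from lhotse import load_manifest_lazy

giga_cuts = load_manifest_lazy("data/fbank/gigaspeech_cuts_S_raw.jsonl.gz")
giga_cuts = giga_cuts.filter(lambda c: 1.0 <= c.duration <= 20.0)

# times=None means "repeat forever": iteration restarts from the beginning
# of the manifest whenever it is exhausted.
giga_cuts = giga_cuts.repeat(times=None)
```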

View File

@ -22,7 +22,7 @@ from pathlib import Path
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
import torch import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import ( from lhotse.dataset import (
CutConcatenate, CutConcatenate,
CutMix, CutMix,
@ -176,7 +176,7 @@ class SPGISpeechAsrDataModule:
The state dict for the training sampler. The state dict for the training sampler.
""" """
logging.info("About to get Musan cuts") logging.info("About to get Musan cuts")
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "cuts_musan.jsonl.gz" self.args.manifest_dir / "cuts_musan.jsonl.gz"
) )

View File

@ -52,8 +52,13 @@ def compute_fbank_tedlium():
"test", "test",
) )
prefix = "tedlium"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
prefix="tedlium", dataset_parts=dataset_parts, output_dir=src_dir dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
) )
assert manifests is not None assert manifests is not None
@ -61,7 +66,7 @@ def compute_fbank_tedlium():
with get_executor() as ex: # Initialize the executor only once. with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items(): for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file(): if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.") logging.info(f"{partition} already exists - skipping.")
continue continue
logging.info(f"Processing {partition}") logging.info(f"Processing {partition}")
@ -80,7 +85,7 @@ def compute_fbank_tedlium():
cut_set = cut_set.compute_and_store_features( cut_set = cut_set.compute_and_store_features(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}", storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions # when an executor is specified, make more partitions
num_jobs=cur_num_jobs, num_jobs=cur_num_jobs,
executor=ex, executor=ex,
@ -88,7 +93,7 @@ def compute_fbank_tedlium():
) )
# Split long cuts into many short and un-overlapping cuts # Split long cuts into many short and un-overlapping cuts
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False) cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -27,15 +27,15 @@ for usage.
""" """
from lhotse import load_manifest from lhotse import load_manifest_lazy
def main(): def main():
path = "./data/fbank/cuts_train.json.gz" path = "./data/fbank/tedlium_cuts_train.jsonl.gz"
path = "./data/fbank/cuts_dev.json.gz" path = "./data/fbank/tedlium_cuts_dev.jsonl.gz"
path = "./data/fbank/cuts_test.json.gz" path = "./data/fbank/tedlium_cuts_test.jsonl.gz"
cuts = load_manifest(path) cuts = load_manifest_lazy(path)
cuts.describe() cuts.describe()

View File

@ -22,11 +22,11 @@ import logging
from functools import lru_cache from functools import lru_cache
from pathlib import Path from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, load_manifest from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import ( from lhotse.dataset import (
BucketingSampler,
CutConcatenate, CutConcatenate,
CutMix, CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset, K2SpeechRecognitionDataset,
PrecomputedFeatures, PrecomputedFeatures,
SingleCutSampler, SingleCutSampler,
@ -92,7 +92,7 @@ class TedLiumAsrDataModule:
"--num-buckets", "--num-buckets",
type=int, type=int,
default=30, default=30,
help="The number of buckets for the BucketingSampler" help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).", "(you might want to increase it for larger datasets).",
) )
group.add_argument( group.add_argument(
@ -179,8 +179,8 @@ class TedLiumAsrDataModule:
transforms = [] transforms = []
if self.args.enable_musan: if self.args.enable_musan:
logging.info("Enable MUSAN") logging.info("Enable MUSAN")
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "cuts_musan.json.gz" self.args.manifest_dir / "musan_cuts.jsonl.gz"
) )
transforms.append( transforms.append(
CutMix( CutMix(
@ -261,13 +261,12 @@ class TedLiumAsrDataModule:
) )
if self.args.bucketing_sampler: if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.") logging.info("Using DynamicBucketingSampler.")
train_sampler = BucketingSampler( train_sampler = DynamicBucketingSampler(
cuts_train, cuts_train,
max_duration=self.args.max_duration, max_duration=self.args.max_duration,
shuffle=self.args.shuffle, shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets, num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True, drop_last=True,
) )
else: else:
@ -311,7 +310,7 @@ class TedLiumAsrDataModule:
cut_transforms=transforms, cut_transforms=transforms,
return_cuts=self.args.return_cuts, return_cuts=self.args.return_cuts,
) )
valid_sampler = BucketingSampler( valid_sampler = DynamicBucketingSampler(
cuts_valid, cuts_valid,
max_duration=self.args.max_duration, max_duration=self.args.max_duration,
shuffle=False, shuffle=False,
@ -335,8 +334,10 @@ class TedLiumAsrDataModule:
else PrecomputedFeatures(), else PrecomputedFeatures(),
return_cuts=self.args.return_cuts, return_cuts=self.args.return_cuts,
) )
sampler = BucketingSampler( sampler = DynamicBucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False cuts,
max_duration=self.args.max_duration,
shuffle=False,
) )
logging.debug("About to create test dataloader") logging.debug("About to create test dataloader")
test_dl = DataLoader( test_dl = DataLoader(
@ -350,14 +351,20 @@ class TedLiumAsrDataModule:
@lru_cache() @lru_cache()
def train_cuts(self) -> CutSet: def train_cuts(self) -> CutSet:
logging.info("About to get train cuts") logging.info("About to get train cuts")
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_train.jsonl.gz"
)
@lru_cache() @lru_cache()
def dev_cuts(self) -> CutSet: def dev_cuts(self) -> CutSet:
logging.info("About to get dev cuts") logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_dev.jsonl.gz"
)
@lru_cache() @lru_cache()
def test_cuts(self) -> CutSet: def test_cuts(self) -> CutSet:
logging.info("About to get test cuts") logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz") return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_test.jsonl.gz"
)
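The sampler change in this data module boils down to the following sketch; the path, max_duration and num_buckets values are placeholders standing in for the module's command-line arguments.

from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler

cuts_train = load_manifest_lazy("data/fbank/tedlium_cuts_train.jsonl.gz")  # placeholder path
# DynamicBucketingSampler estimates bucket boundaries while iterating, so it
# accepts lazy CutSets; the old bucket_method="equal_duration" knob is gone.
train_sampler = DynamicBucketingSampler(
    cuts_train,
    max_duration=200.0,  # seconds of audio per batch, placeholder value
    shuffle=True,
    num_buckets=30,
    drop_last=True,
)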

View File

@ -29,7 +29,7 @@ import os
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor from icefall.utils import get_executor
@ -53,8 +53,13 @@ def compute_fbank_timit():
"DEV", "DEV",
"TEST", "TEST",
) )
prefix = "timit"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
prefix="timit", dataset_parts=dataset_parts, output_dir=src_dir dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
) )
assert manifests is not None assert manifests is not None
@ -62,7 +67,8 @@ def compute_fbank_timit():
with get_executor() as ex: # Initialize the executor only once. with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items(): for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file(): cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_file.is_file():
logging.info(f"{partition} already exists - skipping.") logging.info(f"{partition} already exists - skipping.")
continue continue
logging.info(f"Processing {partition}") logging.info(f"Processing {partition}")
@ -78,13 +84,13 @@ def compute_fbank_timit():
) )
cut_set = cut_set.compute_and_store_features( cut_set = cut_set.compute_and_store_features(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}", storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions # when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80, num_jobs=num_jobs if ex is None else 80,
executor=ex, executor=ex,
storage_type=LilcomHdf5Writer, storage_type=LilcomChunkyWriter,
) )
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") cut_set.to_file(cuts_file)
if __name__ == "__main__": if __name__ == "__main__":
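A hedged sketch of the rewritten feature-computation step, wrapped as a function so it stands on its own; the extractor config, job count and paths are illustrative rather than the recipe's exact settings.

from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter

def extract_fbank(cut_set: CutSet, output_dir: Path, partition: str) -> None:
    """Sketch only: 80 mel bins and 4 jobs are placeholder values."""
    extractor = Fbank(FbankConfig(num_mel_bins=80))
    cut_set = cut_set.compute_and_store_features(
        extractor=extractor,
        storage_path=f"{output_dir}/timit_feats_{partition}",
        num_jobs=4,
        storage_type=LilcomChunkyWriter,  # chunky writer replaces LilcomHdf5Writer
    )
    # to_file() picks the serialization format from the suffix (.jsonl.gz here).
    cut_set.to_file(output_dir / f"timit_cuts_{partition}.jsonl.gz")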

View File

@ -23,11 +23,11 @@ from functools import lru_cache
from pathlib import Path from pathlib import Path
from typing import List, Union from typing import List, Union
from lhotse import CutSet, Fbank, FbankConfig, load_manifest from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import ( from lhotse.dataset import (
BucketingSampler,
CutConcatenate, CutConcatenate,
CutMix, CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset, K2SpeechRecognitionDataset,
PrecomputedFeatures, PrecomputedFeatures,
SingleCutSampler, SingleCutSampler,
@ -92,7 +92,7 @@ class TimitAsrDataModule(DataModule):
"--num-buckets", "--num-buckets",
type=int, type=int,
default=30, default=30,
help="The number of buckets for the BucketingSampler" help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).", "(you might want to increase it for larger datasets).",
) )
group.add_argument( group.add_argument(
@ -154,7 +154,9 @@ class TimitAsrDataModule(DataModule):
cuts_train = self.train_cuts() cuts_train = self.train_cuts()
logging.info("About to get Musan cuts") logging.info("About to get Musan cuts")
cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz") cuts_musan = load_manifest_lazy(
self.args.feature_dir / "cuts_musan.jsonl.gz"
)
logging.info("About to create train dataset") logging.info("About to create train dataset")
transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))] transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
@ -218,13 +220,12 @@ class TimitAsrDataModule(DataModule):
) )
if self.args.bucketing_sampler: if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.") logging.info("Using DynamicBucketingSampler.")
train_sampler = BucketingSampler( train_sampler = DynamicBucketingSampler(
cuts_train, cuts_train,
max_duration=self.args.max_duration, max_duration=self.args.max_duration,
shuffle=self.args.shuffle, shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets, num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True, drop_last=True,
) )
else: else:
@ -322,20 +323,26 @@ class TimitAsrDataModule(DataModule):
@lru_cache() @lru_cache()
def train_cuts(self) -> CutSet: def train_cuts(self) -> CutSet:
logging.info("About to get train cuts") logging.info("About to get train cuts")
cuts_train = load_manifest(self.args.feature_dir / "cuts_TRAIN.json.gz") cuts_train = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_TRAIN.jsonl.gz"
)
return cuts_train return cuts_train
@lru_cache() @lru_cache()
def valid_cuts(self) -> CutSet: def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts") logging.info("About to get dev cuts")
cuts_valid = load_manifest(self.args.feature_dir / "cuts_DEV.json.gz") cuts_valid = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_DEV.jsonl.gz"
)
return cuts_valid return cuts_valid
@lru_cache() @lru_cache()
def test_cuts(self) -> CutSet: def test_cuts(self) -> CutSet:
logging.debug("About to get test cuts") logging.debug("About to get test cuts")
cuts_test = load_manifest(self.args.feature_dir / "cuts_TEST.json.gz") cuts_test = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_TEST.jsonl.gz"
)
return cuts_test return cuts_test
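The accessor pattern used throughout these data modules, shown as a standalone sketch with a placeholder directory in place of self.args.feature_dir:

from functools import lru_cache
from pathlib import Path
from lhotse import CutSet, load_manifest_lazy

feature_dir = Path("data/fbank")  # placeholder for self.args.feature_dir

@lru_cache()
def train_cuts() -> CutSet:
    # load_manifest_lazy keeps an open handle to the .jsonl.gz file and yields
    # cuts on demand, so the manifest is not materialised in memory up front.
    return load_manifest_lazy(feature_dir / "timit_cuts_TRAIN.jsonl.gz")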

View File

@ -26,7 +26,7 @@ for usage.
""" """
from lhotse import load_manifest from lhotse import load_manifest_lazy
def main(): def main():
@ -40,7 +40,7 @@ def main():
for path in paths: for path in paths:
print(f"Starting display the statistics for {path}") print(f"Starting display the statistics for {path}")
cuts = load_manifest(path) cuts = load_manifest_lazy(path)
cuts.describe() cuts.describe()
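One practical difference with the lazy loader, sketched below with a placeholder path; is_lazy and to_eager() reflect my reading of the lhotse API rather than anything in this commit and should be checked against the installed version.

from lhotse import load_manifest_lazy

# Placeholder path; substitute any cuts manifest produced by the recipe.
cuts = load_manifest_lazy("./data/fbank/cuts_DEV.jsonl.gz")
print(cuts.is_lazy)           # True: the file is read on demand while iterating
eager_cuts = cuts.to_eager()  # materialise everything if random access is needed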

View File

@ -27,7 +27,7 @@ from lhotse import (
CutSet, CutSet,
Fbank, Fbank,
FbankConfig, FbankConfig,
load_manifest, load_manifest_lazy,
set_caching_enabled, set_caching_enabled,
) )
from lhotse.dataset import ( from lhotse.dataset import (
@ -218,8 +218,8 @@ class WenetSpeechAsrDataModule:
The state dict for the training sampler. The state dict for the training sampler.
""" """
logging.info("About to get Musan cuts") logging.info("About to get Musan cuts")
cuts_musan = load_manifest( cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "cuts_musan.json.gz" self.args.manifest_dir / "musan_cuts.jsonl.gz"
) )
transforms = [] transforms = []
@ -435,16 +435,18 @@ class WenetSpeechAsrDataModule:
@lru_cache() @lru_cache()
def valid_cuts(self) -> CutSet: def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts") logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz") return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
@lru_cache() @lru_cache()
def test_net_cuts(self) -> List[CutSet]: def test_net_cuts(self) -> List[CutSet]:
logging.info("About to get TEST_NET cuts") logging.info("About to get TEST_NET cuts")
return load_manifest(self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz") return load_manifest_lazy(
self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz"
)
@lru_cache() @lru_cache()
def test_meeting_cuts(self) -> List[CutSet]: def test_meeting_cuts(self) -> List[CutSet]:
logging.info("About to get TEST_MEETING cuts") logging.info("About to get TEST_MEETING cuts")
return load_manifest( return load_manifest_lazy(
self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz" self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz"
) )
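The MUSAN hunk above corresponds roughly to the following; the manifest path is a placeholder, while the probability and SNR values mirror the ones used elsewhere in these recipes.

from lhotse import load_manifest_lazy
from lhotse.dataset import CutMix

cuts_musan = load_manifest_lazy("data/fbank/musan_cuts.jsonl.gz")  # placeholder path
transforms = [
    # Mix MUSAN noise into roughly half of the training cuts at 10-20 dB SNR.
    CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20)),
]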

View File

@ -12,7 +12,7 @@ import os
from pathlib import Path from pathlib import Path
import torch import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor from icefall.utils import get_executor
@ -37,10 +37,13 @@ def compute_fbank_yesno():
"train", "train",
"test", "test",
) )
prefix = "yesno"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached( manifests = read_manifests_if_cached(
dataset_parts=dataset_parts, dataset_parts=dataset_parts,
output_dir=src_dir, output_dir=src_dir,
prefix="yesno", prefix=prefix,
suffix=suffix,
) )
assert manifests is not None assert manifests is not None
@ -50,7 +53,8 @@ def compute_fbank_yesno():
with get_executor() as ex: # Initialize the executor only once. with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items(): for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file(): cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_file.is_file():
logging.info(f"{partition} already exists - skipping.") logging.info(f"{partition} already exists - skipping.")
continue continue
logging.info(f"Processing {partition}") logging.info(f"Processing {partition}")
@ -66,13 +70,13 @@ def compute_fbank_yesno():
) )
cut_set = cut_set.compute_and_store_features( cut_set = cut_set.compute_and_store_features(
extractor=extractor, extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}", storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions # when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 1, # use one job num_jobs=num_jobs if ex is None else 1, # use one job
executor=ex, executor=ex,
storage_type=LilcomHdf5Writer, storage_type=LilcomChunkyWriter,
) )
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz") cut_set.to_file(cuts_file)
if __name__ == "__main__": if __name__ == "__main__":
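The skip-if-cached logic that several of these scripts now share, as a standalone sketch; the directory and partition names are placeholders and the feature computation itself is elided.

from pathlib import Path

output_dir = Path("data/fbank")  # placeholder
prefix, suffix = "yesno", "jsonl.gz"
for partition in ("train", "test"):
    cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
    if cuts_file.is_file():
        print(f"{partition} already exists - skipping.")
        continue
    # ... compute and store features for this partition, then finish with
    # cut_set.to_file(cuts_file) so later runs find the manifest and skip it.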

View File

@ -20,18 +20,19 @@ from functools import lru_cache
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
CutConcatenate,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SingleCutSampler,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from icefall.dataset.datamodule import DataModule from icefall.dataset.datamodule import DataModule
from icefall.utils import str2bool from icefall.utils import str2bool
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
class YesNoAsrDataModule(DataModule): class YesNoAsrDataModule(DataModule):
@ -84,7 +85,7 @@ class YesNoAsrDataModule(DataModule):
"--num-buckets", "--num-buckets",
type=int, type=int,
default=10, default=10,
help="The number of buckets for the BucketingSampler" help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).", "(you might want to increase it for larger datasets).",
) )
group.add_argument( group.add_argument(
@ -186,18 +187,17 @@ class YesNoAsrDataModule(DataModule):
) )
if self.args.bucketing_sampler: if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.") logging.info("Using DynamicBucketingSampler.")
train_sampler = BucketingSampler( train_sampler = DynamicBucketingSampler(
cuts_train, cuts_train,
max_duration=self.args.max_duration, max_duration=self.args.max_duration,
shuffle=self.args.shuffle, shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets, num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True, drop_last=True,
) )
else: else:
logging.info("Using SingleCutSampler.") logging.info("Using SingleCutSampler.")
train_sampler = BucketingSampler( train_sampler = SingleCutSampler(
cuts_train, cuts_train,
max_duration=self.args.max_duration, max_duration=self.args.max_duration,
shuffle=self.args.shuffle, shuffle=self.args.shuffle,
@ -225,8 +225,10 @@ class YesNoAsrDataModule(DataModule):
else PrecomputedFeatures(), else PrecomputedFeatures(),
return_cuts=self.args.return_cuts, return_cuts=self.args.return_cuts,
) )
sampler = BucketingSampler( sampler = DynamicBucketingSampler(
cuts_test, max_duration=self.args.max_duration, shuffle=False cuts_test,
max_duration=self.args.max_duration,
shuffle=False,
) )
logging.debug("About to create test dataloader") logging.debug("About to create test dataloader")
test_dl = DataLoader( test_dl = DataLoader(
@ -240,11 +242,15 @@ class YesNoAsrDataModule(DataModule):
@lru_cache() @lru_cache()
def train_cuts(self) -> CutSet: def train_cuts(self) -> CutSet:
logging.info("About to get train cuts") logging.info("About to get train cuts")
cuts_train = load_manifest(self.args.feature_dir / "cuts_train.json.gz") cuts_train = load_manifest_lazy(
self.args.feature_dir / "yesno_cuts_train.jsonl.gz"
)
return cuts_train return cuts_train
@lru_cache() @lru_cache()
def test_cuts(self) -> List[CutSet]: def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts") logging.info("About to get test cuts")
cuts_test = load_manifest(self.args.feature_dir / "cuts_test.json.gz") cuts_test = load_manifest_lazy(
self.args.feature_dir / "yesno_cuts_test.jsonl.gz"
)
return cuts_test return cuts_test
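The sampler selection in this data module also fixes a latent bug: the non-bucketing branch previously constructed a BucketingSampler even though the log message announced SingleCutSampler. A sketch with placeholder values:

from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler, SingleCutSampler

cuts_train = load_manifest_lazy("data/fbank/yesno_cuts_train.jsonl.gz")  # placeholder path
use_bucketing_sampler = True  # stands in for self.args.bucketing_sampler

if use_bucketing_sampler:
    train_sampler = DynamicBucketingSampler(
        cuts_train,
        max_duration=200.0,  # placeholder
        shuffle=True,
        num_buckets=10,
        drop_last=True,
    )
else:
    # The fallback branch now really uses SingleCutSampler,
    # matching the log message printed just above it.
    train_sampler = SingleCutSampler(
        cuts_train,
        max_duration=200.0,  # placeholder
        shuffle=True,
    )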

View File

@ -131,7 +131,6 @@ def setup_logger(
format=formatter, format=formatter,
level=level, level=level,
filemode="w", filemode="w",
force=True,
) )
if use_console: if use_console:
console = logging.StreamHandler() console = logging.StreamHandler()