Use jsonl for CutSet in the LibriSpeech recipe. (#397)

* Use jsonl for cutsets in the librispeech recipe.

* Use lazy cutset for all recipes.

* More fixes to use lazy CutSet.

* Remove force=True from logging to support Python < 3.8

* Minor fixes.

* Fix style issues.
Fangjun Kuang 2022-06-06 10:19:16 +08:00 committed by GitHub
parent e5884f82e0
commit f1abce72f8
68 changed files with 702 additions and 1098 deletions
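
The gist of the migration, before the per-file diffs: cut manifests move from eagerly loaded cuts_*.json.gz files to lazily iterated {prefix}_cuts_*.jsonl.gz files. A minimal sketch of the difference, using only lhotse's public API (the path is one of the manifests produced below):

from lhotse import load_manifest_lazy

# Old style: load_manifest("data/fbank/cuts_train.json.gz") parsed the
# whole gzipped JSON into memory at once.
# New style: open the JSONL file and yield cuts on demand. Lazy cut sets
# generally do not define __len__, which drives several changes below.
cuts = load_manifest_lazy(
    "data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
)
for cut in cuts:  # streams one cut at a time
    print(cut.id)
    break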

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -59,6 +59,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -99,7 +101,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -58,6 +58,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -98,7 +100,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -58,6 +58,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -98,7 +100,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -58,6 +58,8 @@ jobs:
- name: Install Python dependencies
run: |
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
pip uninstall -y protobuf
pip install --no-binary protobuf protobuf
- name: Cache kaldifeat
id: my-cache
@ -98,7 +100,7 @@ jobs:
with:
path: |
~/tmp/fbank-libri
key: cache-libri-fbank-test-clean-and-test-other
key: cache-libri-fbank-test-clean-and-test-other-v2
- name: Compute fbank for LibriSpeech test-clean and test-other
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'

View File

@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)
def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
src_dir = Path("data/manifests/aidatatang_200zh")
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
num_jobs = min(15, os.cpu_count())
@ -52,11 +52,13 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
"dev",
"test",
)
prefix = "aidatatang"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="aidatatang",
suffix="jsonl.gz",
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -64,10 +66,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
for sup in m["supervisions"]:
sup.custom = {"origin": "aidatatang_200zh"}
cut_set = CutSet.from_manifests(
recordings=m["recordings"],
supervisions=m["supervisions"],
@ -80,13 +86,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=ChunkedLilcomHdf5Writer,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
def get_args():
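
With prefix and suffix threaded through read_manifests_if_cached, all manifests for a recipe share one naming scheme. A short sketch of the filenames this script reads and writes, assuming the usual lhotse layout of {prefix}_{kind}_{part}.{suffix}:

from pathlib import Path

prefix, suffix = "aidatatang", "jsonl.gz"
src_dir, output_dir = Path("data/manifests"), Path("data/fbank")
for partition in ("train", "dev", "test"):
    # Inputs located by read_manifests_if_cached:
    print(src_dir / f"{prefix}_recordings_{partition}.{suffix}")
    print(src_dir / f"{prefix}_supervisions_{partition}.{suffix}")
    # Output written by cut_set.to_file, e.g.
    # data/fbank/aidatatang_cuts_train.jsonl.gz:
    print(output_dir / f"{prefix}_cuts_{partition}.{suffix}")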

View File

@ -25,19 +25,19 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
paths = [
"./data/fbank/cuts_train.json.gz",
"./data/fbank/cuts_dev.json.gz",
"./data/fbank/cuts_test.json.gz",
"./data/fbank/aidatatang_cuts_train.jsonl.gz",
"./data/fbank/aidatatang_cuts_dev.jsonl.gz",
"./data/fbank/aidatatang_cuts_test.jsonl.gz",
]
for path in paths:
print(f"Starting display the statistics for {path}")
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()
@ -45,7 +45,7 @@ if __name__ == "__main__":
main()
"""
Starting display the statistics for ./data/fbank/cuts_train.json.gz
Starting display the statistics for ./data/fbank/aidatatang_cuts_train.jsonl.gz
Cuts count: 494715
Total duration (hours): 422.6
Speech duration (hours): 422.6 (100.0%)
@ -61,7 +61,7 @@ min 1.0
99.5% 8.0
99.9% 9.5
max 18.1
Starting display the statistics for ./data/fbank/cuts_dev.json.gz
Starting display the statistics for ./data/fbank/aidatatang_cuts_dev.jsonl.gz
Cuts count: 24216
Total duration (hours): 20.2
Speech duration (hours): 20.2 (100.0%)
@ -77,7 +77,7 @@ min 1.2
99.5% 7.3
99.9% 8.8
max 11.3
Starting display the statistics for ./data/fbank/cuts_test.json.gz
Starting display the statistics for ./data/fbank/aidatatang_cuts_test.jsonl.gz
Cuts count: 48144
Total duration (hours): 40.2
Speech duration (hours): 40.2 (100.0%)

View File

@ -27,11 +27,10 @@ from lhotse import (
CutSet,
Fbank,
FbankConfig,
load_manifest,
load_manifest_lazy,
set_caching_enabled,
)
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
@ -205,8 +204,8 @@ class Aidatatang_200zhAsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms = []
@ -290,13 +289,12 @@ class Aidatatang_200zhAsrDataModule:
)
if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
else:
@ -402,14 +400,20 @@ class Aidatatang_200zhAsrDataModule:
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
)
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz"
)
@lru_cache()
def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
)

View File

@ -195,9 +195,9 @@ def get_params() -> AttributeDict:
"best_train_epoch": -1,
"best_valid_epoch": -1,
"batch_idx_train": 0,
"log_interval": 10,
"log_interval": 50,
"reset_interval": 200,
"valid_interval": 3000,
"valid_interval": 2000,
# parameters for k2.ctc_loss
"beam_size": 10,
"reduction": "sum",

View File

@ -0,0 +1,119 @@
#!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file computes fbank features of the aidatatang_200zh dataset.
It looks for manifests in the directory data/manifests.
The generated fbank features are saved in data/fbank.
"""
import argparse
import logging
import os
from pathlib import Path
import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
num_jobs = min(15, os.cpu_count())
dataset_parts = (
"train",
"test",
"dev",
)
prefix = "aidatatang"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
for sup in m["supervisions"]:
sup.custom = {"origin": "aidatatang_200zh"}
cut_set = CutSet.from_manifests(
recordings=m["recordings"],
supervisions=m["supervisions"],
)
if "train" in partition:
cut_set = (
cut_set
+ cut_set.perturb_speed(0.9)
+ cut_set.perturb_speed(1.1)
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=LilcomChunkyWriter,
)
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--num-mel-bins",
type=int,
default=80,
help="""The number of mel bins for Fbank""",
)
return parser.parse_args()
if __name__ == "__main__":
formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO)
args = get_args()
compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins)

View File

@ -29,7 +29,7 @@ import os
from pathlib import Path
import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@ -52,8 +52,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
"dev",
"test",
)
prefix = "aishell"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="aishell", dataset_parts=dataset_parts, output_dir=src_dir
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -61,7 +66,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -77,13 +82,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=LilcomHdf5Writer,
storage_type=LilcomChunkyWriter,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
def get_args():

View File

@ -25,18 +25,18 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
# path = "./data/fbank/cuts_train.json.gz"
# path = "./data/fbank/cuts_test.json.gz"
# path = "./data/fbank/cuts_dev.json.gz"
# path = "./data/fbank/aidatatang_200zh/cuts_train_raw.jsonl.gz"
# path = "./data/fbank/aidatatang_200zh/cuts_test_raw.jsonl.gz"
path = "./data/fbank/aidatatang_200zh/cuts_dev_raw.jsonl.gz"
# path = "./data/fbank/aishell_cuts_train.jsonl.gz"
# path = "./data/fbank/aishell_cuts_test.jsonl.gz"
path = "./data/fbank/aishell_cuts_dev.jsonl.gz"
# path = "./data/fbank/aidatatang_cuts_train.jsonl.gz"
# path = "./data/fbank/aidatatang_cuts_test.jsonl.gz"
# path = "./data/fbank/aidatatang_cuts_dev.jsonl.gz"
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()

View File

@ -1,71 +0,0 @@
#!/usr/bin/env python3
# Copyright 2022 Xiaomi Corp. (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from lhotse import CutSet
from lhotse.recipes.utils import read_manifests_if_cached
def preprocess_aidatatang_200zh():
src_dir = Path("data/manifests/aidatatang_200zh")
output_dir = Path("data/fbank/aidatatang_200zh")
output_dir.mkdir(exist_ok=True, parents=True)
dataset_parts = (
"train",
"test",
"dev",
)
logging.info("Loading manifest")
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts, output_dir=src_dir, prefix="aidatatang"
)
assert len(manifests) > 0
for partition, m in manifests.items():
logging.info(f"Processing {partition}")
raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
if raw_cuts_path.is_file():
logging.info(f"{partition} already exists - skipping")
continue
for sup in m["supervisions"]:
sup.custom = {"origin": "aidatatang_200zh"}
cut_set = CutSet.from_manifests(
recordings=m["recordings"],
supervisions=m["supervisions"],
)
logging.info(f"Saving to {raw_cuts_path}")
cut_set.to_file(raw_cuts_path)
def main():
formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO)
preprocess_aidatatang_200zh()
if __name__ == "__main__":
main()

View File

@ -42,18 +42,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare manifest"
# We assume that you have downloaded the aidatatang_200zh corpus
# to $dl_dir/aidatatang_200zh
if [ ! -f data/manifests/aidatatang_200zh/.manifests.done ]; then
mkdir -p data/manifests/aidatatang_200zh
lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
touch data/manifests/aidatatang_200zh/.manifests.done
if [ ! -f data/manifests/.aidatatang_200zh_manifests.done ]; then
mkdir -p data/manifests
lhotse prepare aidatatang-200zh $dl_dir data/manifests
touch data/manifests/.aidatatang_200zh_manifests.done
fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Process aidatatang_200zh"
if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then
mkdir -p data/fbank/aidatatang_200zh
lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
touch data/fbank/aidatatang_200zh/.fbank.done
if [ ! -f data/fbank/.aidatatang_200zh_fbank.done ]; then
mkdir -p data/fbank
./local/compute_fbank_aidatatang_200zh.py
touch data/fbank/.aidatatang_200zh_fbank.done
fi
fi

View File

@ -23,11 +23,11 @@ from functools import lru_cache
from pathlib import Path
from typing import List
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SingleCutSampler,
@ -93,7 +93,7 @@ class AishellAsrDataModule:
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler"
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
@ -133,6 +133,12 @@ class AishellAsrDataModule:
help="When enabled (=default), the examples will be "
"shuffled for each epoch.",
)
group.add_argument(
"--drop-last",
type=str2bool,
default=True,
help="Whether to drop last batch. Used by sampler.",
)
group.add_argument(
"--return-cuts",
type=str2bool,
@ -177,8 +183,8 @@ class AishellAsrDataModule:
def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms = []
@ -262,14 +268,13 @@ class AishellAsrDataModule:
)
if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
drop_last=self.args.drop_last,
)
else:
logging.info("Using SingleCutSampler.")
@ -313,7 +318,7 @@ class AishellAsrDataModule:
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
@ -337,8 +342,10 @@ class AishellAsrDataModule:
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=False,
)
test_dl = DataLoader(
test,
@ -351,17 +358,21 @@ class AishellAsrDataModule:
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
cuts_train = load_manifest(
self.args.manifest_dir / "cuts_train.json.gz"
cuts_train = load_manifest_lazy(
self.args.manifest_dir / "aishell_cuts_train.jsonl.gz"
)
return cuts_train
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aishell_cuts_dev.jsonl.gz"
)
@lru_cache()
def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "aishell_cuts_test.jsonl.gz"
)
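
BucketingSampler sorts the whole cut set up front to build equal-duration buckets, which requires an eager manifest; DynamicBucketingSampler estimates bucket boundaries from a buffer of cuts while streaming, so it works with the lazy manifests introduced here. A minimal sketch using the same arguments as above:

from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler

cuts_train = load_manifest_lazy(
    "data/fbank/aishell_cuts_train.jsonl.gz"
)
train_sampler = DynamicBucketingSampler(
    cuts_train,
    max_duration=200.0,  # seconds of audio per batch
    shuffle=True,
    num_buckets=30,
    drop_last=True,
)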

View File

@ -15,6 +15,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./tdnn_lstm_ctc/train.py \
--world-size 4 \
--num-epochs 20 \
--max-duration 300
"""
import argparse
import logging

View File

@ -110,9 +110,7 @@ class Conformer(Transformer):
x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C)
# Caution: We assume the subsampling factor is 4!
with warnings.catch_warnings():
warnings.simplefilter("ignore")
lengths = ((x_lens - 1) // 2 - 1) // 2
lengths = (((x_lens - 1) >> 1) - 1) >> 1
assert x.size(0) == lengths.max().item()
mask = make_pad_mask(lengths)
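
The two length formulas agree bit for bit for non-negative integer tensors, since floor division by two is a right shift; the shift form merely sidesteps the floor-division deprecation warning that some PyTorch versions emit for // on tensors, so the catch_warnings block can go. A quick check of the equivalence:

import torch

x_lens = torch.arange(7, 40)
old = ((x_lens - 1) // 2 - 1) // 2
new = (((x_lens - 1) >> 1) - 1) >> 1
assert torch.equal(old, new)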

View File

@ -21,6 +21,7 @@
import argparse
import logging
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Optional, Tuple
@ -386,7 +387,11 @@ def compute_loss(
assert loss.requires_grad == is_training
info = MetricsTracker()
info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
info["frames"] = (
(feature_lens // params.subsampling_factor).sum().item()
)
# Note: We use reduction=sum while computing the loss.
info["loss"] = loss.detach().cpu().item()

View File

@ -18,7 +18,7 @@
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
class AIDatatang200zh:
@ -28,26 +28,26 @@ class AIDatatang200zh:
manifest_dir:
It is expected to contain the following files::
- cuts_dev_raw.jsonl.gz
- cuts_train_raw.jsonl.gz
- cuts_test_raw.jsonl.gz
- aidatatang_cuts_dev.jsonl.gz
- aidatatang_cuts_train.jsonl.gz
- aidatatang_cuts_test.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train_raw.jsonl.gz"
f = self.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
logging.info(f"About to get train cuts from {f}")
cuts_train = load_manifest(f)
cuts_train = load_manifest_lazy(f)
return cuts_train
def valid_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_valid_raw.jsonl.gz"
f = self.manifest_dir / "aidatatang_cuts_valid.jsonl.gz"
logging.info(f"About to get valid cuts from {f}")
cuts_valid = load_manifest(f)
cuts_valid = load_manifest_lazy(f)
return cuts_valid
def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test_raw.jsonl.gz"
f = self.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
logging.info(f"About to get test cuts from {f}")
cuts_test = load_manifest(f)
cuts_test = load_manifest_lazy(f)
return cuts_test

View File

@ -18,7 +18,7 @@
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
class AIShell:
@ -28,26 +28,26 @@ class AIShell:
manifest_dir:
It is expected to contain the following files::
- cuts_dev.json.gz
- cuts_train.json.gz
- cuts_test.json.gz
- aishell_cuts_dev.jsonl.gz
- aishell_cuts_train.jsonl.gz
- aishell_cuts_test.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train.json.gz"
f = self.manifest_dir / "aishell_cuts_train.jsonl.gz"
logging.info(f"About to get train cuts from {f}")
cuts_train = load_manifest(f)
cuts_train = load_manifest_lazy(f)
return cuts_train
def valid_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev.json.gz"
f = self.manifest_dir / "aishell_cuts_dev.jsonl.gz"
logging.info(f"About to get valid cuts from {f}")
cuts_valid = load_manifest(f)
cuts_valid = load_manifest_lazy(f)
return cuts_valid
def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test.json.gz"
f = self.manifest_dir / "aishell_cuts_test.jsonl.gz"
logging.info(f"About to get test cuts from {f}")
cuts_test = load_manifest(f)
cuts_test = load_manifest_lazy(f)
return cuts_test

View File

@ -24,7 +24,6 @@ from typing import Optional
from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import (
BucketingSampler,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
@ -73,8 +72,7 @@ class AsrDataModule:
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler "
"and DynamicBucketingSampler."
help="The number of buckets for the DynamicBucketingSampler "
"(you might want to increase it for larger datasets).",
)
@ -147,7 +145,6 @@ class AsrDataModule:
def train_dataloaders(
self,
cuts_train: CutSet,
dynamic_bucketing: bool,
on_the_fly_feats: bool,
cuts_musan: Optional[CutSet] = None,
) -> DataLoader:
@ -157,9 +154,6 @@ class AsrDataModule:
Cuts for training.
cuts_musan:
If not None, it is the cuts for mixing.
dynamic_bucketing:
True to use DynamicBucketingSampler;
False to use BucketingSampler.
on_the_fly_feats:
True to use OnTheFlyFeatures;
False to use PrecomputedFeatures.
@ -232,7 +226,6 @@ class AsrDataModule:
return_cuts=self.args.return_cuts,
)
if dynamic_bucketing:
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
@ -241,16 +234,6 @@ class AsrDataModule:
num_buckets=self.args.num_buckets,
drop_last=True,
)
else:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
logging.info("About to create train dataloader")
train_dl = DataLoader(
@ -279,7 +262,7 @@ class AsrDataModule:
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
@ -303,8 +286,10 @@ class AsrDataModule:
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(

View File

@ -41,6 +41,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2"
import argparse
import logging
import random
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Optional, Tuple
@ -55,7 +56,7 @@ from asr_datamodule import AsrDataModule
from conformer import Conformer
from decoder import Decoder
from joiner import Joiner
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed
from model import Transducer
@ -446,7 +447,11 @@ def compute_loss(
assert loss.requires_grad == is_training
info = MetricsTracker()
info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
info["frames"] = (
(feature_lens // params.subsampling_factor).sum().item()
)
# Note: We use reduction=sum while computing the loss.
info["loss"] = loss.detach().cpu().item()
@ -635,20 +640,16 @@ def train_one_epoch(
def filter_short_and_long_utterances(cuts: CutSet) -> CutSet:
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 12 seconds
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 12.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 12.0
num_in_total = len(cuts)
cuts = cuts.filter(remove_short_and_long_utt)
num_left = len(cuts)
num_removed = num_in_total - num_left
removed_percent = num_removed / num_in_total * 100
logging.info(f"Before removing short and long utterances: {num_in_total}")
logging.info(f"After removing short and long utterances: {num_left}")
logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
return cuts
@ -728,15 +729,14 @@ def run(rank, world_size, args):
train_cuts = aishell.train_cuts()
train_cuts = filter_short_and_long_utterances(train_cuts)
datatang = AIDatatang200zh(
manifest_dir=f"{args.manifest_dir}/aidatatang_200zh"
)
datatang = AIDatatang200zh(manifest_dir=args.manifest_dir)
train_datatang_cuts = datatang.train_cuts()
train_datatang_cuts = filter_short_and_long_utterances(train_datatang_cuts)
train_datatang_cuts = train_datatang_cuts.repeat(times=None)
if args.enable_musan:
cuts_musan = load_manifest(
Path(args.manifest_dir) / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
)
else:
cuts_musan = None
@ -745,22 +745,23 @@ def run(rank, world_size, args):
train_dl = asr_datamodule.train_dataloaders(
train_cuts,
dynamic_bucketing=False,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
datatang_train_dl = asr_datamodule.train_dataloaders(
train_datatang_cuts,
dynamic_bucketing=True,
on_the_fly_feats=True,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
valid_cuts = aishell.valid_cuts()
valid_dl = asr_datamodule.valid_dataloaders(valid_cuts)
for dl in [train_dl, datatang_train_dl]:
for dl in [
train_dl,
# datatang_train_dl
]:
scan_pessimistic_batches_for_oom(
model=model,
train_dl=dl,

View File

@ -37,6 +37,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2"
import argparse
import logging
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Optional, Tuple
@ -411,7 +412,11 @@ def compute_loss(
assert loss.requires_grad == is_training
info = MetricsTracker()
info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
info["frames"] = (
(feature_lens // params.subsampling_factor).sum().item()
)
# Note: We use reduction=sum while computing the loss.
info["loss"] = loss.detach().cpu().item()

View File

@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)
def compute_fbank_alimeeting(num_mel_bins: int = 80):
src_dir = Path("data/manifests/alimeeting")
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
num_jobs = min(15, os.cpu_count())
@ -52,11 +52,14 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
"eval",
"test",
)
prefix = "alimeeting"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix="alimeeting",
suffix="jsonl.gz",
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -64,7 +67,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -83,7 +86,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=cur_num_jobs,
executor=ex,
@ -95,7 +98,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
keep_overlapping=False,
min_duration=None,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
def get_args():

View File

@ -25,19 +25,19 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
paths = [
"./data/fbank/cuts_train.json.gz",
"./data/fbank/cuts_eval.json.gz",
"./data/fbank/cuts_test.json.gz",
"./data/fbank/alimeeting_cuts_train.jsonl.gz",
"./data/fbank/alimeeting_cuts_eval.jsonl.gz",
"./data/fbank/alimeeting_cuts_test.jsonl.gz",
]
for path in paths:
print(f"Starting display the statistics for {path}")
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()
@ -45,7 +45,7 @@ if __name__ == "__main__":
main()
"""
Starting display the statistics for ./data/fbank/cuts_train.json.gz
Starting display the statistics for ./data/fbank/alimeeting_cuts_train.jsonl.gz
Cuts count: 559092
Total duration (hours): 424.6
Speech duration (hours): 424.6 (100.0%)
@ -61,7 +61,7 @@ min 0.0
99.5% 14.7
99.9% 16.2
max 284.3
Starting display the statistics for ./data/fbank/cuts_eval.json.gz
Starting display the statistics for ./data/fbank/alimeeting_cuts_eval.jsonl.gz
Cuts count: 6457
Total duration (hours): 4.9
Speech duration (hours): 4.9 (100.0%)
@ -77,7 +77,7 @@ min 0.1
99.5% 14.1
99.9% 14.7
max 15.8
Starting display the statistics for ./data/fbank/cuts_test.json.gz
Starting display the statistics for ./data/fbank/alimeeting_cuts_test.jsonl.gz
Cuts count: 16358
Total duration (hours): 12.5
Speech duration (hours): 12.5 (100.0%)

View File

@ -27,7 +27,7 @@ from lhotse import (
CutSet,
Fbank,
FbankConfig,
load_manifest,
load_manifest_lazy,
set_caching_enabled,
)
from lhotse.dataset import (
@ -204,8 +204,8 @@ class AlimeetingAsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms = []
@ -401,14 +401,20 @@ class AlimeetingAsrDataModule:
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "alimeeting_cuts_train.jsonl.gz"
)
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_eval.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "alimeeting_cuts_eval.jsonl.gz"
)
@lru_cache()
def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "alimeeting_cuts_test.jsonl.gz"
)

View File

@ -20,9 +20,8 @@ import logging
from functools import lru_cache
from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
@ -190,8 +189,8 @@ class GigaSpeechAsrDataModule:
def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms = []
@ -315,7 +314,7 @@ class GigaSpeechAsrDataModule:
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
@ -339,8 +338,10 @@ class GigaSpeechAsrDataModule:
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
@ -361,7 +362,9 @@ class GigaSpeechAsrDataModule:
@lru_cache()
def dev_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
cuts_valid = load_manifest_lazy(
self.args.manifest_dir / "cuts_DEV.jsonl.gz"
)
if self.args.small_dev:
return cuts_valid.subset(first=1000)
else:
@ -370,4 +373,4 @@ class GigaSpeechAsrDataModule:
@lru_cache()
def test_cuts(self) -> CutSet:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")

View File

@ -1,103 +0,0 @@
#!/usr/bin/env python3
# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
import torch
from lhotse import (
CutSet,
KaldifeatFbank,
KaldifeatFbankConfig,
combine,
)
from lhotse.recipes.utils import read_manifests_if_cached
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
def compute_fbank_musan():
src_dir = Path("data/manifests")
output_dir = Path("data/fbank")
# number of workers in dataloader
num_workers = 10
# number of seconds in a batch
batch_duration = 600
dataset_parts = (
"music",
"speech",
"noise",
)
manifests = read_manifests_if_cached(
prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir
)
assert manifests is not None
musan_cuts_path = output_dir / "cuts_musan.json.gz"
if musan_cuts_path.is_file():
logging.info(f"{musan_cuts_path} already exists - skipping")
return
logging.info("Extracting features for Musan")
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
logging.info(f"device: {device}")
musan_cuts = (
CutSet.from_manifests(
recordings=combine(
part["recordings"] for part in manifests.values()
)
)
.cut_into_windows(10.0)
.filter(lambda c: c.duration > 5)
.compute_and_store_features_batch(
extractor=extractor,
storage_path=f"{output_dir}/feats_musan",
num_workers=num_workers,
batch_duration=batch_duration,
)
)
musan_cuts.to_json(musan_cuts_path)
def main():
formatter = (
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
)
logging.basicConfig(format=formatter, level=logging.INFO)
compute_fbank_musan()
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@
../../../librispeech/ASR/local/compute_fbank_musan.py

View File

@ -23,9 +23,8 @@ from pathlib import Path
from typing import Any, Dict, Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
@ -217,8 +216,8 @@ class GigaSpeechAsrDataModule:
if self.args.enable_musan:
logging.info("Enable MUSAN")
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms.append(
CutMix(
@ -358,7 +357,7 @@ class GigaSpeechAsrDataModule:
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
@ -382,8 +381,10 @@ class GigaSpeechAsrDataModule:
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
@ -404,7 +405,9 @@ class GigaSpeechAsrDataModule:
@lru_cache()
def dev_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
cuts_valid = load_manifest_lazy(
self.args.manifest_dir / "cuts_DEV.jsonl.gz"
)
if self.args.small_dev:
return cuts_valid.subset(first=1000)
else:
@ -413,4 +416,4 @@ class GigaSpeechAsrDataModule:
@lru_cache()
def test_cuts(self) -> CutSet:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")

View File

@ -96,14 +96,14 @@ def get_parser():
- labels_xxx.h5
- aux_labels_xxx.h5
- cuts_xxx.json.gz
- librispeech_cuts_xxx.jsonl.gz
where xxx is the value of `--dataset`. For instance, if
`--dataset` is `train-clean-100`, it will contain 3 files:
- `labels_train-clean-100.h5`
- `aux_labels_train-clean-100.h5`
- `cuts_train-clean-100.json.gz`
- `librispeech_cuts_train-clean-100.jsonl.gz`
Note: Both labels_xxx.h5 and aux_labels_xxx.h5 contain framewise
alignment. The difference is that labels_xxx.h5 contains repeats.
@ -289,7 +289,9 @@ def main():
out_labels_ali_filename = out_dir / f"labels_{params.dataset}.h5"
out_aux_labels_ali_filename = out_dir / f"aux_labels_{params.dataset}.h5"
out_manifest_filename = out_dir / f"cuts_{params.dataset}.json.gz"
out_manifest_filename = (
out_dir / f"librispeech_cuts_{params.dataset}.jsonl.gz"
)
for f in (
out_labels_ali_filename,

View File

@ -17,6 +17,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./conformer_ctc/train.py \
--exp-dir ./conformer_ctc/exp \
--world-size 4 \
--full-libri 1 \
--max-duration 200 \
--num-epochs 20
"""
import argparse
import logging
from pathlib import Path
@ -29,6 +40,7 @@ import torch.multiprocessing as mp
import torch.nn as nn
from asr_datamodule import LibriSpeechAsrDataModule
from conformer import Conformer
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed
from torch import Tensor
from torch.nn.parallel import DistributedDataParallel as DDP
@ -676,6 +688,20 @@ def run(rank, world_size, args):
if params.full_libri:
train_cuts += librispeech.train_clean_360_cuts()
train_cuts += librispeech.train_other_500_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
train_cuts = train_cuts.filter(remove_short_and_long_utt)
train_dl = librispeech.train_dataloaders(train_cuts)
valid_cuts = librispeech.dev_clean_cuts()

View File

@ -20,11 +20,7 @@ import logging
from pathlib import Path
import torch
from lhotse import (
CutSet,
KaldifeatFbank,
KaldifeatFbankConfig,
)
from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down.
@ -51,13 +47,16 @@ def compute_fbank_gigaspeech_dev_test():
logging.info(f"device: {device}")
prefix = "gigaspeech"
suffix = "jsonl.gz"
for partition in subsets:
cuts_path = in_out_dir / f"cuts_{partition}.jsonl.gz"
cuts_path = in_out_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_path.is_file():
logging.info(f"{cuts_path} exists - skipping")
continue
raw_cuts_path = in_out_dir / f"cuts_{partition}_raw.jsonl.gz"
raw_cuts_path = in_out_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
logging.info(f"Loading {raw_cuts_path}")
cut_set = CutSet.from_file(raw_cuts_path)
@ -66,7 +65,7 @@ def compute_fbank_gigaspeech_dev_test():
cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor,
storage_path=f"{in_out_dir}/feats_{partition}",
storage_path=f"{in_out_dir}/{prefix}_feats_{partition}",
num_workers=num_workers,
batch_duration=batch_duration,
)

View File

@ -77,7 +77,7 @@ def get_parser():
def compute_fbank_gigaspeech_splits(args):
num_splits = args.num_splits
output_dir = f"data/fbank/XL_split_{num_splits}"
output_dir = f"data/fbank/gigaspeech_XL_split_{num_splits}"
output_dir = Path(output_dir)
assert output_dir.exists(), f"{output_dir} does not exist!"
@ -96,17 +96,19 @@ def compute_fbank_gigaspeech_splits(args):
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
logging.info(f"device: {device}")
prefix = "gigaspeech"
num_digits = 8 # num_digits is fixed by lhotse split-lazy
for i in range(start, stop):
idx = f"{i + 1}".zfill(num_digits)
logging.info(f"Processing {idx}/{num_splits}")
cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz"
cuts_path = output_dir / f"{prefix}_cuts_XL.{idx}.jsonl.gz"
if cuts_path.is_file():
logging.info(f"{cuts_path} exists - skipping")
continue
raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz"
raw_cuts_path = output_dir / f"{prefix}_cuts_XL_raw.{idx}.jsonl.gz"
if not raw_cuts_path.is_file():
logging.info(f"{raw_cuts_path} does not exist - skipping it")
continue
@ -115,13 +117,13 @@ def compute_fbank_gigaspeech_splits(args):
cut_set = CutSet.from_file(raw_cuts_path)
logging.info("Computing features")
if (output_dir / f"feats_XL_{idx}.lca").exists():
logging.info(f"Removing {output_dir}/feats_XL_{idx}.lca")
os.remove(output_dir / f"feats_XL_{idx}.lca")
if (output_dir / f"{prefix}_feats_XL_{idx}.lca").exists():
logging.info(f"Removing {output_dir}/{prefix}_feats_XL_{idx}.lca")
os.remove(output_dir / f"{prefix}_feats_XL_{idx}.lca")
cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor,
storage_path=f"{output_dir}/feats_XL_{idx}",
storage_path=f"{output_dir}/{prefix}_feats_XL_{idx}",
num_workers=args.num_workers,
batch_duration=args.batch_duration,
)
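
lhotse split-lazy writes pieces with 1-based, zero-padded 8-digit indices inserted before the extension, which is why num_digits is hard-coded above. A sketch of the piece names the loop expects:

prefix = "gigaspeech"
num_digits = 8  # fixed by `lhotse split-lazy`
for i in range(2):
    idx = f"{i + 1}".zfill(num_digits)
    print(f"{prefix}_cuts_XL_raw.{idx}.jsonl.gz")
# gigaspeech_cuts_XL_raw.00000001.jsonl.gz
# gigaspeech_cuts_XL_raw.00000002.jsonl.gz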

View File

@ -28,7 +28,7 @@ import os
from pathlib import Path
import torch
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@ -56,8 +56,13 @@ def compute_fbank_librispeech():
"train-clean-360",
"train-other-500",
)
prefix = "librispeech"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="librispeech", dataset_parts=dataset_parts, output_dir=src_dir
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -65,7 +70,8 @@ def compute_fbank_librispeech():
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
if (output_dir / cuts_filename).is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -81,13 +87,13 @@ def compute_fbank_librispeech():
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=ChunkedLilcomHdf5Writer,
storage_type=LilcomChunkyWriter,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(output_dir / cuts_filename)
if __name__ == "__main__":

View File

@ -28,7 +28,7 @@ import os
from pathlib import Path
import torch
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@ -52,12 +52,22 @@ def compute_fbank_musan():
"speech",
"noise",
)
prefix = "musan"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
musan_cuts_path = output_dir / "cuts_musan.json.gz"
assert len(manifests) == len(dataset_parts), (
len(manifests),
len(dataset_parts),
)
musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
if musan_cuts_path.is_file():
logging.info(f"{musan_cuts_path} already exists - skipping")
@ -79,13 +89,13 @@ def compute_fbank_musan():
.filter(lambda c: c.duration > 5)
.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_musan",
storage_path=f"{output_dir}/musan_feats",
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=ChunkedLilcomHdf5Writer,
storage_type=LilcomChunkyWriter,
)
)
musan_cuts.to_json(musan_cuts_path)
musan_cuts.to_file(musan_cuts_path)
if __name__ == "__main__":
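
The renamed musan_cuts.jsonl.gz is what the data modules above hand to CutMix for noise augmentation. An illustrative sketch (the prob and snr values here are typical choices, not taken from this commit):

from lhotse import load_manifest_lazy
from lhotse.dataset import CutMix

cuts_musan = load_manifest_lazy("data/fbank/musan_cuts.jsonl.gz")
# Mix MUSAN noise into roughly half the cuts at 10-20 dB SNR.
transform = CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))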

View File

@ -25,19 +25,19 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
path = "./data/fbank/cuts_train-clean-100.json.gz"
path = "./data/fbank/cuts_train-clean-360.json.gz"
path = "./data/fbank/cuts_train-other-500.json.gz"
path = "./data/fbank/cuts_dev-clean.json.gz"
path = "./data/fbank/cuts_dev-other.json.gz"
path = "./data/fbank/cuts_test-clean.json.gz"
path = "./data/fbank/cuts_test-other.json.gz"
# path = "./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
# path = "./data/fbank/librispeech_cuts_train-clean-360.jsonl.gz"
# path = "./data/fbank/librispeech_cuts_train-other-500.jsonl.gz"
# path = "./data/fbank/librispeech_cuts_dev-clean.jsonl.gz"
# path = "./data/fbank/librispeech_cuts_dev-other.jsonl.gz"
# path = "./data/fbank/librispeech_cuts_test-clean.jsonl.gz"
path = "./data/fbank/librispeech_cuts_test-other.jsonl.gz"
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()

View File

@ -58,17 +58,19 @@ def preprocess_giga_speech():
)
logging.info("Loading manifest (may take 4 minutes)")
prefix = "gigaspeech"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix="gigaspeech",
suffix="jsonl.gz",
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
for partition, m in manifests.items():
logging.info(f"Processing {partition}")
raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
if raw_cuts_path.is_file():
logging.info(f"{partition} already exists - skipping")
continue

View File

@ -25,7 +25,7 @@ We will add more checks later if needed.
Usage example:
python3 ./local/validate_manifest.py \
./data/fbank/cuts_train-clean-100.json.gz
./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
"""
@ -33,7 +33,7 @@ import argparse
import logging
from pathlib import Path
from lhotse import load_manifest, CutSet
from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut
@ -76,7 +76,7 @@ def main():
logging.info(f"Validating {manifest}")
assert manifest.is_file(), f"{manifest} does not exist"
cut_set = load_manifest(manifest)
cut_set = load_manifest_lazy(manifest)
assert isinstance(cut_set, CutSet)
for c in cut_set:

View File

@ -40,9 +40,9 @@ dl_dir=$PWD/download
# It will generate data/lang_bpe_xxx,
# data/lang_bpe_yyy if the array contains xxx, yyy
vocab_sizes=(
5000
2000
1000
# 5000
# 2000
# 1000
500
)
@ -132,7 +132,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
)
for part in ${parts[@]}; do
python3 ./local/validate_manifest.py \
data/fbank/cuts_${part}.json.gz
data/fbank/librispeech_cuts_${part}.jsonl.gz
done
touch data/fbank/.librispeech-validated.done
fi

View File

@ -124,9 +124,9 @@ fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Split XL subset into ${num_splits} pieces"
split_dir=data/fbank/XL_split_${num_splits}
split_dir=data/fbank/gigaspeech_XL_split_${num_splits}
if [ ! -f $split_dir/.split_completed ]; then
lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir $chunk_size
lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size
touch $split_dir/.split_completed
fi
fi

View File

@ -807,28 +807,8 @@ def run(rank, world_size, args):
# the threshold
return 1.0 <= c.duration <= 20.0
num_in_total = len(train_cuts)
train_cuts = train_cuts.filter(remove_short_and_long_utt)
try:
num_left = len(train_cuts)
num_removed = num_in_total - num_left
removed_percent = num_removed / num_in_total * 100
logging.info(
f"Before removing short and long utterances: {num_in_total}"
)
logging.info(f"After removing short and long utterances: {num_left}")
logging.info(
f"Removed {num_removed} utterances ({removed_percent:.5f}%)"
)
except TypeError as e:
# You can ignore this error as previous versions of Lhotse work fine
# for the above code. In recent versions of Lhotse, it uses
# lazy filter, producing cutsets that don't have the __len__ method
logging.info(str(e))
if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
# We only load the sampler's state dict when it loads a checkpoint
# saved in the middle of an epoch
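
The deleted bookkeeping depended on len(train_cuts), which lazy cut sets no longer provide: filter() returns a lazily evaluated CutSet, so the kept/removed counts cannot be known without consuming the iterator. A small sketch of the behavior this commit works around:

from lhotse import load_manifest_lazy

cuts = load_manifest_lazy(
    "data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
)
cuts = cuts.filter(lambda c: 1.0 <= c.duration <= 20.0)
try:
    n = len(cuts)
except TypeError:
    # Lazy cut sets define no __len__; iterate them, or call
    # cuts.to_eager() if a count is really needed.
    pass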

View File

@ -22,7 +22,6 @@ from typing import Optional
from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import (
BucketingSampler,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
@ -71,8 +70,7 @@ class AsrDataModule:
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler "
"and DynamicBucketingSampler."
help="The number of buckets for the DynamicBucketingSampler. "
"(you might want to increase it for larger datasets).",
)
@ -152,7 +150,6 @@ class AsrDataModule:
def train_dataloaders(
self,
cuts_train: CutSet,
dynamic_bucketing: bool,
on_the_fly_feats: bool,
cuts_musan: Optional[CutSet] = None,
) -> DataLoader:
@ -162,9 +159,6 @@ class AsrDataModule:
Cuts for training.
cuts_musan:
If not None, it is the cuts for mixing.
dynamic_bucketing:
True to use DynamicBucketingSampler;
False to use BucketingSampler.
on_the_fly_feats:
True to use OnTheFlyFeatures;
False to use PrecomputedFeatures.
@ -230,7 +224,6 @@ class AsrDataModule:
return_cuts=self.args.return_cuts,
)
if dynamic_bucketing:
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
@ -239,16 +232,6 @@ class AsrDataModule:
num_buckets=self.args.num_buckets,
drop_last=True,
)
else:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
logging.info("About to create train dataloader")
train_dl = DataLoader(

View File

@ -22,7 +22,7 @@ import re
from pathlib import Path
import lhotse
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
class GigaSpeech:
@ -32,13 +32,13 @@ class GigaSpeech:
manifest_dir:
It is expected to contain the following files::
- XL_split_2000/cuts_XL.*.jsonl.gz
- cuts_L_raw.jsonl.gz
- cuts_M_raw.jsonl.gz
- cuts_S_raw.jsonl.gz
- cuts_XS_raw.jsonl.gz
- cuts_DEV_raw.jsonl.gz
- cuts_TEST_raw.jsonl.gz
- gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz
- gigaspeech_cuts_L_raw.jsonl.gz
- gigaspeech_cuts_M_raw.jsonl.gz
- gigaspeech_cuts_S_raw.jsonl.gz
- gigaspeech_cuts_XS_raw.jsonl.gz
- gigaspeech_cuts_DEV_raw.jsonl.gz
- gigaspeech_cuts_TEST_raw.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
@ -46,10 +46,12 @@ class GigaSpeech:
logging.info("About to get train-XL cuts")
filenames = list(
glob.glob(f"{self.manifest_dir}/XL_split_2000/cuts_XL.*.jsonl.gz")
glob.glob(
f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz" # noqa
)
)
pattern = re.compile(r"cuts_XL.([0-9]+).jsonl.gz")
pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz")
idx_filenames = [
(int(pattern.search(f).group(1)), f) for f in filenames
]
@ -64,31 +66,31 @@ class GigaSpeech:
)
def train_L_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_L_raw.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_L_raw.jsonl.gz"
logging.info(f"About to get train-L cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_M_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_M_raw.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_M_raw.jsonl.gz"
logging.info(f"About to get train-M cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_S_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_S_raw.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_S_raw.jsonl.gz"
logging.info(f"About to get train-S cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_XS_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XS_raw.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_XS_raw.jsonl.gz"
logging.info(f"About to get train-XS cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_TEST.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz"
logging.info(f"About to get TEST cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def dev_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_DEV.jsonl.gz"
f = self.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz"
logging.info(f"About to get DEV cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
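
Both helpers used above return a lazily opened CutSet; the difference is only in dispatch. A hedged sketch (paths are illustrative):

from lhotse import CutSet, load_manifest_lazy

# from_jsonl_lazy assumes the file contains cuts; load_manifest_lazy
# infers the manifest type from the file contents. Both stream from
# disk rather than materializing the manifest.
train_cuts = CutSet.from_jsonl_lazy("data/fbank/gigaspeech_cuts_S_raw.jsonl.gz")
dev_cuts = load_manifest_lazy("data/fbank/gigaspeech_cuts_DEV.jsonl.gz")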

View File

@ -18,7 +18,7 @@
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
class LibriSpeech:
@ -28,47 +28,47 @@ class LibriSpeech:
manifest_dir:
It is expected to contain the following files::
- cuts_dev-clean.json.gz
- cuts_dev-other.json.gz
- cuts_test-clean.json.gz
- cuts_test-other.json.gz
- cuts_train-clean-100.json.gz
- cuts_train-clean-360.json.gz
- cuts_train-other-500.json.gz
- librispeech_cuts_dev-clean.jsonl.gz
- librispeech_cuts_dev-other.jsonl.gz
- librispeech_cuts_test-clean.jsonl.gz
- librispeech_cuts_test-other.jsonl.gz
- librispeech_cuts_train-clean-100.jsonl.gz
- librispeech_cuts_train-clean-360.jsonl.gz
- librispeech_cuts_train-other-500.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_clean_100_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-100.json.gz"
f = self.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
logging.info(f"About to get train-clean-100 cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def train_clean_360_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-360.json.gz"
f = self.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
logging.info(f"About to get train-clean-360 cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def train_other_500_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-other-500.json.gz"
f = self.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
logging.info(f"About to get train-other-500 cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def test_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-clean.json.gz"
f = self.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
logging.info(f"About to get test-clean cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def test_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-other.json.gz"
f = self.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
logging.info(f"About to get test-other cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def dev_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-clean.json.gz"
f = self.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
logging.info(f"About to get dev-clean cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)
def dev_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-other.json.gz"
f = self.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
logging.info(f"About to get dev-other cuts from {f}")
return load_manifest(f)
return load_manifest_lazy(f)

View File

@ -66,7 +66,7 @@ from conformer import Conformer
from decoder import Decoder
from gigaspeech import GigaSpeech
from joiner import Joiner
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut
from lhotse.dataset.sampling.base import CutSampler
from lhotse.utils import fix_random_seed
@ -968,8 +968,8 @@ def run(rank, world_size, args):
train_giga_cuts = train_giga_cuts.repeat(times=None)
if args.enable_musan:
cuts_musan = load_manifest(
Path(args.manifest_dir) / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
)
else:
cuts_musan = None
@ -978,14 +978,12 @@ def run(rank, world_size, args):
train_dl = asr_datamodule.train_dataloaders(
train_cuts,
dynamic_bucketing=False,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
giga_train_dl = asr_datamodule.train_dataloaders(
train_giga_cuts,
dynamic_bucketing=True,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
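
The MUSAN manifest loaded above is handed to the data module, which feeds it to a CutMix transform. A minimal sketch of that hand-off, mirroring the CutMix call used elsewhere in this commit:

from lhotse import load_manifest_lazy
from lhotse.dataset import CutMix

cuts_musan = load_manifest_lazy("data/fbank/musan_cuts.jsonl.gz")
# Mix MUSAN noise into half of the training cuts at 10-20 dB SNR.
transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True)]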

View File

@ -24,7 +24,7 @@ from pathlib import Path
from typing import Any, Dict, Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
CutConcatenate,
CutMix,
@ -224,8 +224,8 @@ class LibriSpeechAsrDataModule:
if self.args.enable_musan:
logging.info("Enable MUSAN")
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms.append(
CutMix(
@ -407,40 +407,48 @@ class LibriSpeechAsrDataModule:
@lru_cache()
def train_clean_100_cuts(self) -> CutSet:
logging.info("About to get train-clean-100 cuts")
return load_manifest(
self.args.manifest_dir / "cuts_train-clean-100.json.gz"
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
)
@lru_cache()
def train_clean_360_cuts(self) -> CutSet:
logging.info("About to get train-clean-360 cuts")
return load_manifest(
self.args.manifest_dir / "cuts_train-clean-360.json.gz"
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
)
@lru_cache()
def train_other_500_cuts(self) -> CutSet:
logging.info("About to get train-other-500 cuts")
return load_manifest(
self.args.manifest_dir / "cuts_train-other-500.json.gz"
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
)
@lru_cache()
def dev_clean_cuts(self) -> CutSet:
logging.info("About to get dev-clean cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
)
@lru_cache()
def dev_other_cuts(self) -> CutSet:
logging.info("About to get dev-other cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
)
@lru_cache()
def test_clean_cuts(self) -> CutSet:
logging.info("About to get test-clean cuts")
return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
)
@lru_cache()
def test_other_cuts(self) -> CutSet:
logging.info("About to get test-other cuts")
return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
)
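
Each accessor pairs @lru_cache with load_manifest_lazy, so a manifest is opened at most once per process yet never fully parsed. A minimal sketch of the pattern outside the class (the path is illustrative):

from functools import lru_cache

from lhotse import CutSet, load_manifest_lazy

@lru_cache()
def dev_clean_cuts() -> CutSet:
    # Repeated calls return the same lazily opened CutSet object.
    return load_manifest_lazy("data/fbank/librispeech_cuts_dev-clean.jsonl.gz")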

View File

@ -16,6 +16,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Usage:
export CUDA_VISIBLE_DEVICES="0,1,2,3"
./tdnn_lstm_ctc/train.py \
--world-size 4 \
--full-libri 1 \
--max-duration 300 \
--num-epochs 20
"""
import argparse
import logging
@ -29,6 +38,7 @@ import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from asr_datamodule import LibriSpeechAsrDataModule
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed
from model import TdnnLstm
from torch import Tensor
@ -544,10 +554,25 @@ def run(rank, world_size, args):
if params.full_libri:
train_cuts += librispeech.train_clean_360_cuts()
train_cuts += librispeech.train_other_500_cuts()
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
train_cuts = train_cuts.filter(remove_short_and_long_utt)
train_dl = librispeech.train_dataloaders(train_cuts)
valid_cuts = librispeech.dev_clean_cuts()
valid_cuts += librispeech.dev_other_cuts()
valid_dl = librispeech.valid_dataloaders(valid_cuts)
for epoch in range(params.start_epoch, params.num_epochs):
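
The 1-20 second thresholds come from the duration distribution of the corpus. A minimal sketch of inspecting that distribution, following what ../local/display_manifest_statistics.py does (the path is illustrative):

from lhotse import load_manifest_lazy

cuts = load_manifest_lazy("data/fbank/librispeech_cuts_train-clean-100.jsonl.gz")
# Prints duration statistics (min/max/percentiles) used to pick the
# filtering thresholds above.
cuts.describe()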

View File

@ -44,8 +44,8 @@ from pathlib import Path
import sentencepiece as spm
import torch
from alignment import get_word_starting_frames
from lhotse import CutSet, load_manifest
from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler
from lhotse import CutSet, load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler, K2SpeechRecognitionDataset
from lhotse.dataset.collation import collate_custom_field
@ -93,14 +93,15 @@ def main():
sp = spm.SentencePieceProcessor()
sp.load(args.bpe_model)
cuts_json = args.ali_dir / f"cuts_{args.dataset}.json.gz"
cuts_jsonl = args.ali_dir / f"librispeech_cuts_{args.dataset}.jsonl.gz"
logging.info(f"Loading {cuts_json}")
cuts = load_manifest(cuts_json)
logging.info(f"Loading {cuts_jsonl}")
cuts = load_manifest_lazy(cuts_jsonl)
sampler = SingleCutSampler(
sampler = DynamicBucketingSampler(
cuts,
max_duration=30,
num_buckets=30,
shuffle=False,
)
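
A hedged sketch of how the sampler above is consumed, mirroring the lhotse dataloading pattern used throughout these recipes; sampler and K2SpeechRecognitionDataset refer to the objects imported and constructed above:

from torch.utils.data import DataLoader

dataset = K2SpeechRecognitionDataset(return_cuts=True)
# batch_size=None because the sampler already yields whole batches
# of cuts rather than individual indexes.
dl = DataLoader(dataset, sampler=sampler, batch_size=None, num_workers=2)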

View File

@ -1,333 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang,
# Mingshuang Luo)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import inspect
import logging
from pathlib import Path
from typing import Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig
from lhotse.dataset import (
BucketingSampler,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
SpecAugment,
)
from lhotse.dataset.input_strategies import (
OnTheFlyFeatures,
PrecomputedFeatures,
)
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader
from icefall.utils import str2bool
class _SeedWorkers:
def __init__(self, seed: int):
self.seed = seed
def __call__(self, worker_id: int):
fix_random_seed(self.seed + worker_id)
class AsrDataModule:
def __init__(self, args: argparse.Namespace):
self.args = args
@classmethod
def add_arguments(cls, parser: argparse.ArgumentParser):
group = parser.add_argument_group(
title="ASR data related options",
description="These options are used for the preparation of "
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
"effective batch sizes, sampling strategies, applied data "
"augmentations, etc.",
)
group.add_argument(
"--max-duration",
type=int,
default=200.0,
help="Maximum pooled recordings duration (seconds) in a "
"single batch. You can reduce it if it causes CUDA OOM.",
)
group.add_argument(
"--bucketing-sampler",
type=str2bool,
default=True,
help="When enabled, the batches will come from buckets of "
"similar duration (saves padding frames).",
)
group.add_argument(
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler "
"and DynamicBucketingSampler."
"(you might want to increase it for larger datasets).",
)
group.add_argument(
"--shuffle",
type=str2bool,
default=True,
help="When enabled (=default), the examples will be "
"shuffled for each epoch.",
)
group.add_argument(
"--return-cuts",
type=str2bool,
default=True,
help="When enabled, each batch will have the "
"field: batch['supervisions']['cut'] with the cuts that "
"were used to construct it.",
)
group.add_argument(
"--num-workers",
type=int,
default=2,
help="The number of training dataloader workers that "
"collect the batches.",
)
group.add_argument(
"--enable-spec-aug",
type=str2bool,
default=True,
help="When enabled, use SpecAugment for training dataset.",
)
group.add_argument(
"--spec-aug-time-warp-factor",
type=int,
default=80,
help="Used only when --enable-spec-aug is True. "
"It specifies the factor for time warping in SpecAugment. "
"Larger values mean more warping. "
"A value less than 1 means to disable time warp.",
)
group.add_argument(
"--enable-musan",
type=str2bool,
default=True,
help="When enabled, select noise from MUSAN and mix it"
"with training dataset. ",
)
group.add_argument(
"--manifest-dir",
type=Path,
default=Path("data/fbank"),
help="Path to directory with train/valid/test cuts.",
)
group.add_argument(
"--on-the-fly-feats",
type=str2bool,
default=False,
help="When enabled, use on-the-fly cut mixing and feature "
"extraction. Will drop existing precomputed feature manifests "
"if available. Used only in dev/test CutSet",
)
def train_dataloaders(
self,
cuts_train: CutSet,
dynamic_bucketing: bool,
on_the_fly_feats: bool,
cuts_musan: Optional[CutSet] = None,
) -> DataLoader:
"""
Args:
cuts_train:
Cuts for training.
cuts_musan:
If not None, it is the cuts for mixing.
dynamic_bucketing:
True to use DynamicBucketingSampler;
False to use BucketingSampler.
on_the_fly_feats:
True to use OnTheFlyFeatures;
False to use PrecomputedFeatures.
"""
transforms = []
if cuts_musan is not None:
logging.info("Enable MUSAN")
transforms.append(
CutMix(
cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True
)
)
else:
logging.info("Disable MUSAN")
input_transforms = []
if self.args.enable_spec_aug:
logging.info("Enable SpecAugment")
logging.info(
f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
)
# Set the value of num_frame_masks according to Lhotse's version.
# In different Lhotse's versions, the default of num_frame_masks is
# different.
num_frame_masks = 10
num_frame_masks_parameter = inspect.signature(
SpecAugment.__init__
).parameters["num_frame_masks"]
if num_frame_masks_parameter.default == 1:
num_frame_masks = 2
logging.info(f"Num frame mask: {num_frame_masks}")
input_transforms.append(
SpecAugment(
time_warp_factor=self.args.spec_aug_time_warp_factor,
num_frame_masks=num_frame_masks,
features_mask_size=27,
num_feature_masks=2,
frames_mask_size=100,
)
)
else:
logging.info("Disable SpecAugment")
logging.info("About to create train dataset")
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
)
# NOTE: the PerturbSpeed transform should be added only if we
# remove it from data prep stage.
# Add on-the-fly speed perturbation; since originally it would
# have increased epoch size by 3, we will apply prob 2/3 and use
# 3x more epochs.
# Speed perturbation probably should come first before
# concatenation, but in principle the transforms order doesn't have
# to be strict (e.g. could be randomized)
# transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa
# Drop feats to be on the safe side.
train = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=(
OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
if on_the_fly_feats
else PrecomputedFeatures()
),
input_transforms=input_transforms,
return_cuts=self.args.return_cuts,
)
if dynamic_bucketing:
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
drop_last=True,
)
else:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
logging.info("About to create train dataloader")
# 'seed' is derived from the current random state, which will have
# previously been set in the main process.
seed = torch.randint(0, 100000, ()).item()
worker_init_fn = _SeedWorkers(seed)
train_dl = DataLoader(
train,
sampler=train_sampler,
batch_size=None,
num_workers=self.args.num_workers,
persistent_workers=False,
worker_init_fn=worker_init_fn,
)
return train_dl
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
transforms = []
logging.info("About to create dev dataset")
if self.args.on_the_fly_feats:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
input_strategy=OnTheFlyFeatures(
Fbank(FbankConfig(num_mel_bins=80))
),
return_cuts=self.args.return_cuts,
)
else:
validate = K2SpeechRecognitionDataset(
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.info("About to create dev dataloader")
valid_dl = DataLoader(
validate,
sampler=valid_sampler,
batch_size=None,
num_workers=2,
persistent_workers=False,
)
return valid_dl
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
logging.debug("About to create test dataset")
test = K2SpeechRecognitionDataset(
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
if self.args.on_the_fly_feats
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
test,
batch_size=None,
sampler=sampler,
num_workers=self.args.num_workers,
)
return test_dl

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/asr_datamodule.py

View File

@ -1,75 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
class GigaSpeech:
def __init__(self, manifest_dir: str):
"""
Args:
manifest_dir:
It is expected to contain the following files::
- cuts_XL_raw.jsonl.gz
- cuts_L_raw.jsonl.gz
- cuts_M_raw.jsonl.gz
- cuts_S_raw.jsonl.gz
- cuts_XS_raw.jsonl.gz
- cuts_DEV_raw.jsonl.gz
- cuts_TEST_raw.jsonl.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_XL_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XL_raw.jsonl.gz"
logging.info(f"About to get train-XL cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_L_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_L_raw.jsonl.gz"
logging.info(f"About to get train-L cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_M_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_M_raw.jsonl.gz"
logging.info(f"About to get train-M cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_S_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_S_raw.jsonl.gz"
logging.info(f"About to get train-S cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def train_XS_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_XS_raw.jsonl.gz"
logging.info(f"About to get train-XS cuts from {f}")
return CutSet.from_jsonl_lazy(f)
def test_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_TEST.jsonl.gz"
logging.info(f"About to get TEST cuts from {f}")
return load_manifest(f)
def dev_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_DEV.jsonl.gz"
logging.info(f"About to get DEV cuts from {f}")
return load_manifest(f)

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/gigaspeech.py

View File

@ -1,74 +0,0 @@
# Copyright 2021 Piotr Żelasko
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
class LibriSpeech:
def __init__(self, manifest_dir: str):
"""
Args:
manifest_dir:
It is expected to contain the following files::
- cuts_dev-clean.json.gz
- cuts_dev-other.json.gz
- cuts_test-clean.json.gz
- cuts_test-other.json.gz
- cuts_train-clean-100.json.gz
- cuts_train-clean-360.json.gz
- cuts_train-other-500.json.gz
"""
self.manifest_dir = Path(manifest_dir)
def train_clean_100_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-100.json.gz"
logging.info(f"About to get train-clean-100 cuts from {f}")
return load_manifest(f)
def train_clean_360_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-clean-360.json.gz"
logging.info(f"About to get train-clean-360 cuts from {f}")
return load_manifest(f)
def train_other_500_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_train-other-500.json.gz"
logging.info(f"About to get train-other-500 cuts from {f}")
return load_manifest(f)
def test_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-clean.json.gz"
logging.info(f"About to get test-clean cuts from {f}")
return load_manifest(f)
def test_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_test-other.json.gz"
logging.info(f"About to get test-other cuts from {f}")
return load_manifest(f)
def dev_clean_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-clean.json.gz"
logging.info(f"About to get dev-clean cuts from {f}")
return load_manifest(f)
def dev_other_cuts(self) -> CutSet:
f = self.manifest_dir / "cuts_dev-other.json.gz"
logging.info(f"About to get dev-other cuts from {f}")
return load_manifest(f)

View File

@ -0,0 +1 @@
../pruned_transducer_stateless3/librispeech.py

View File

@ -28,7 +28,7 @@ from pathlib import Path
from asr_datamodule import AsrDataModule
from gigaspeech import GigaSpeech
from lhotse import load_manifest
from lhotse import load_manifest_lazy
from librispeech import LibriSpeech
@ -41,8 +41,8 @@ def test_dataset():
print(args)
if args.enable_musan:
cuts_musan = load_manifest(
Path(args.manifest_dir) / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
)
else:
cuts_musan = None
@ -57,14 +57,12 @@ def test_dataset():
libri_train_dl = asr_datamodule.train_dataloaders(
train_clean_100,
dynamic_bucketing=False,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
giga_train_dl = asr_datamodule.train_dataloaders(
train_S,
dynamic_bucketing=True,
on_the_fly_feats=True,
cuts_musan=cuts_musan,
)

View File

@ -73,7 +73,7 @@ from conformer import Conformer
from decoder import Decoder
from gigaspeech import GigaSpeech
from joiner import Joiner
from lhotse import CutSet, load_manifest
from lhotse import CutSet, load_manifest_lazy
from lhotse.cut import Cut
from lhotse.utils import fix_random_seed
from librispeech import LibriSpeech
@ -662,19 +662,17 @@ def train_one_epoch(
def filter_short_and_long_utterances(cuts: CutSet) -> CutSet:
def remove_short_and_long_utt(c: Cut):
# Keep only utterances with duration between 1 second and 20 seconds
#
# Caution: There is a reason to select 20.0 here. Please see
# ../local/display_manifest_statistics.py
#
# You should use ../local/display_manifest_statistics.py to get
# an utterance duration distribution for your dataset to select
# the threshold
return 1.0 <= c.duration <= 20.0
num_in_total = len(cuts)
cuts = cuts.filter(remove_short_and_long_utt)
num_left = len(cuts)
num_removed = num_in_total - num_left
removed_percent = num_removed / num_in_total * 100
logging.info(f"Before removing short and long utterances: {num_in_total}")
logging.info(f"After removing short and long utterances: {num_left}")
logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
return cuts
@ -767,17 +765,18 @@ def run(rank, world_size, args):
# DEV 12 hours
# Test 40 hours
if params.full_libri:
logging.info("Using the L subset of GigaSpeech (2.5k hours)")
train_giga_cuts = gigaspeech.train_L_cuts()
logging.info("Using the XL subset of GigaSpeech (10k hours)")
train_giga_cuts = gigaspeech.train_XL_cuts()
else:
logging.info("Using the S subset of GigaSpeech (250 hours)")
train_giga_cuts = gigaspeech.train_S_cuts()
train_giga_cuts = filter_short_and_long_utterances(train_giga_cuts)
train_giga_cuts = train_giga_cuts.repeat(times=None)
if args.enable_musan:
cuts_musan = load_manifest(
Path(args.manifest_dir) / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
)
else:
cuts_musan = None
@ -786,14 +785,12 @@ def run(rank, world_size, args):
train_dl = asr_datamodule.train_dataloaders(
train_cuts,
dynamic_bucketing=False,
on_the_fly_feats=False,
cuts_musan=cuts_musan,
)
giga_train_dl = asr_datamodule.train_dataloaders(
train_giga_cuts,
dynamic_bucketing=True,
on_the_fly_feats=True,
cuts_musan=cuts_musan,
)

View File

@ -22,7 +22,7 @@ from pathlib import Path
from typing import Any, Dict, Optional
import torch
from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
CutConcatenate,
CutMix,
@ -176,7 +176,7 @@ class SPGISpeechAsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "cuts_musan.jsonl.gz"
)

View File

@ -52,8 +52,13 @@ def compute_fbank_tedlium():
"test",
)
prefix = "tedlium"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="tedlium", dataset_parts=dataset_parts, output_dir=src_dir
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -61,7 +66,7 @@ def compute_fbank_tedlium():
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -80,7 +85,7 @@ def compute_fbank_tedlium():
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=cur_num_jobs,
executor=ex,
@ -88,7 +93,7 @@ def compute_fbank_tedlium():
)
# Split long cuts into many short and un-overlapping cuts
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
if __name__ == "__main__":
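
With prefix and suffix supplied, read_manifests_if_cached looks for per-part files following lhotse's naming convention. A hedged sketch of the call and the filenames it expects (the output directory is illustrative):

from lhotse.recipes.utils import read_manifests_if_cached

# Expects e.g. data/manifests/tedlium_recordings_train.jsonl.gz and
# data/manifests/tedlium_supervisions_train.jsonl.gz for each part.
manifests = read_manifests_if_cached(
    dataset_parts=("train", "dev", "test"),
    output_dir="data/manifests",
    prefix="tedlium",
    suffix="jsonl.gz",
)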

View File

@ -27,15 +27,15 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
path = "./data/fbank/cuts_train.json.gz"
path = "./data/fbank/cuts_dev.json.gz"
path = "./data/fbank/cuts_test.json.gz"
path = "./data/fbank/tedlium_cuts_train.jsonl.gz"
path = "./data/fbank/tedlium_cuts_dev.jsonl.gz"
path = "./data/fbank/tedlium_cuts_test.jsonl.gz"
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()

View File

@ -22,11 +22,11 @@ import logging
from functools import lru_cache
from pathlib import Path
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SingleCutSampler,
@ -92,7 +92,7 @@ class TedLiumAsrDataModule:
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler"
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
@ -179,8 +179,8 @@ class TedLiumAsrDataModule:
transforms = []
if self.args.enable_musan:
logging.info("Enable MUSAN")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms.append(
CutMix(
@ -261,13 +261,12 @@ class TedLiumAsrDataModule:
)
if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
else:
@ -311,7 +310,7 @@ class TedLiumAsrDataModule:
cut_transforms=transforms,
return_cuts=self.args.return_cuts,
)
valid_sampler = BucketingSampler(
valid_sampler = DynamicBucketingSampler(
cuts_valid,
max_duration=self.args.max_duration,
shuffle=False,
@ -335,8 +334,10 @@ class TedLiumAsrDataModule:
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
@ -350,14 +351,20 @@ class TedLiumAsrDataModule:
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_train.jsonl.gz"
)
@lru_cache()
def dev_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_dev.jsonl.gz"
)
@lru_cache()
def test_cuts(self) -> CutSet:
logging.info("About to get test cuts")
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
return load_manifest_lazy(
self.args.manifest_dir / "tedlium_cuts_test.jsonl.gz"
)

View File

@ -29,7 +29,7 @@ import os
from pathlib import Path
import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@ -53,8 +53,13 @@ def compute_fbank_timit():
"DEV",
"TEST",
)
prefix = "timit"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
prefix="timit", dataset_parts=dataset_parts, output_dir=src_dir
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -62,7 +67,8 @@ def compute_fbank_timit():
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_file.is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -78,13 +84,13 @@ def compute_fbank_timit():
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=LilcomHdf5Writer,
storage_type=LilcomChunkyWriter,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(cuts_file)
if __name__ == "__main__":
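
LilcomChunkyWriter replaces LilcomHdf5Writer here: features for a partition go into a single chunked lilcom archive, which suits sequential reads from a lazy cutset and avoids the HDF5 dependency. A hedged sketch of the storage swap in isolation (the paths and the 80-bin config are assumptions):

from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter

cut_set = CutSet.from_file("data/manifests/timit_cuts_TRAIN.jsonl.gz")  # illustrative
extractor = Fbank(FbankConfig(num_mel_bins=80))  # 80 bins is an assumption
cut_set = cut_set.compute_and_store_features(
    extractor=extractor,
    storage_path="data/fbank/timit_feats_TRAIN",
    storage_type=LilcomChunkyWriter,
)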

View File

@ -23,11 +23,11 @@ from functools import lru_cache
from pathlib import Path
from typing import List, Union
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
CutMix,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SingleCutSampler,
@ -92,7 +92,7 @@ class TimitAsrDataModule(DataModule):
"--num-buckets",
type=int,
default=30,
help="The number of buckets for the BucketingSampler"
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
@ -154,7 +154,9 @@ class TimitAsrDataModule(DataModule):
cuts_train = self.train_cuts()
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz")
cuts_musan = load_manifest_lazy(
self.args.feature_dir / "cuts_musan.jsonl.gz"
)
logging.info("About to create train dataset")
transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
@ -218,13 +220,12 @@ class TimitAsrDataModule(DataModule):
)
if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
else:
@ -322,20 +323,26 @@ class TimitAsrDataModule(DataModule):
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
cuts_train = load_manifest(self.args.feature_dir / "cuts_TRAIN.json.gz")
cuts_train = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_TRAIN.jsonl.gz"
)
return cuts_train
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
cuts_valid = load_manifest(self.args.feature_dir / "cuts_DEV.json.gz")
cuts_valid = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_DEV.jsonl.gz"
)
return cuts_valid
@lru_cache()
def test_cuts(self) -> CutSet:
logging.debug("About to get test cuts")
cuts_test = load_manifest(self.args.feature_dir / "cuts_TEST.json.gz")
cuts_test = load_manifest_lazy(
self.args.feature_dir / "timit_cuts_TEST.jsonl.gz"
)
return cuts_test

View File

@ -26,7 +26,7 @@ for usage.
"""
from lhotse import load_manifest
from lhotse import load_manifest_lazy
def main():
@ -40,7 +40,7 @@ def main():
for path in paths:
print(f"Starting display the statistics for {path}")
cuts = load_manifest(path)
cuts = load_manifest_lazy(path)
cuts.describe()

View File

@ -27,7 +27,7 @@ from lhotse import (
CutSet,
Fbank,
FbankConfig,
load_manifest,
load_manifest_lazy,
set_caching_enabled,
)
from lhotse.dataset import (
@ -218,8 +218,8 @@ class WenetSpeechAsrDataModule:
The state dict for the training sampler.
"""
logging.info("About to get Musan cuts")
cuts_musan = load_manifest(
self.args.manifest_dir / "cuts_musan.json.gz"
cuts_musan = load_manifest_lazy(
self.args.manifest_dir / "musan_cuts.jsonl.gz"
)
transforms = []
@ -435,16 +435,18 @@ class WenetSpeechAsrDataModule:
@lru_cache()
def valid_cuts(self) -> CutSet:
logging.info("About to get dev cuts")
return load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
@lru_cache()
def test_net_cuts(self) -> List[CutSet]:
logging.info("About to get TEST_NET cuts")
return load_manifest(self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz")
return load_manifest_lazy(
self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz"
)
@lru_cache()
def test_meeting_cuts(self) -> List[CutSet]:
logging.info("About to get TEST_MEETING cuts")
return load_manifest(
return load_manifest_lazy(
self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz"
)

View File

@ -12,7 +12,7 @@ import os
from pathlib import Path
import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
@ -37,10 +37,13 @@ def compute_fbank_yesno():
"train",
"test",
)
prefix = "yesno"
suffix = "jsonl.gz"
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix="yesno",
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
@ -50,7 +53,8 @@ def compute_fbank_yesno():
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
if (output_dir / f"cuts_{partition}.json.gz").is_file():
cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
if cuts_file.is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
@ -66,13 +70,13 @@ def compute_fbank_yesno():
)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/feats_{partition}",
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 1, # use one job
executor=ex,
storage_type=LilcomHdf5Writer,
storage_type=LilcomChunkyWriter,
)
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
cut_set.to_file(cuts_file)
if __name__ == "__main__":

View File

@ -20,18 +20,19 @@ from functools import lru_cache
from pathlib import Path
from typing import List
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
from lhotse.dataset import (
CutConcatenate,
DynamicBucketingSampler,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
SingleCutSampler,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
from torch.utils.data import DataLoader
from icefall.dataset.datamodule import DataModule
from icefall.utils import str2bool
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
from lhotse.dataset import (
BucketingSampler,
CutConcatenate,
K2SpeechRecognitionDataset,
PrecomputedFeatures,
)
from lhotse.dataset.input_strategies import OnTheFlyFeatures
class YesNoAsrDataModule(DataModule):
@ -84,7 +85,7 @@ class YesNoAsrDataModule(DataModule):
"--num-buckets",
type=int,
default=10,
help="The number of buckets for the BucketingSampler"
help="The number of buckets for the DynamicBucketingSampler"
"(you might want to increase it for larger datasets).",
)
group.add_argument(
@ -186,18 +187,17 @@ class YesNoAsrDataModule(DataModule):
)
if self.args.bucketing_sampler:
logging.info("Using BucketingSampler.")
train_sampler = BucketingSampler(
logging.info("Using DynamicBucketingSampler.")
train_sampler = DynamicBucketingSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
num_buckets=self.args.num_buckets,
bucket_method="equal_duration",
drop_last=True,
)
else:
logging.info("Using SingleCutSampler.")
train_sampler = BucketingSampler(
train_sampler = SingleCutSampler(
cuts_train,
max_duration=self.args.max_duration,
shuffle=self.args.shuffle,
@ -225,8 +225,10 @@ class YesNoAsrDataModule(DataModule):
else PrecomputedFeatures(),
return_cuts=self.args.return_cuts,
)
sampler = BucketingSampler(
cuts_test, max_duration=self.args.max_duration, shuffle=False
sampler = DynamicBucketingSampler(
cuts_test,
max_duration=self.args.max_duration,
shuffle=False,
)
logging.debug("About to create test dataloader")
test_dl = DataLoader(
@ -240,11 +242,15 @@ class YesNoAsrDataModule(DataModule):
@lru_cache()
def train_cuts(self) -> CutSet:
logging.info("About to get train cuts")
cuts_train = load_manifest(self.args.feature_dir / "cuts_train.json.gz")
cuts_train = load_manifest_lazy(
self.args.feature_dir / "yesno_cuts_train.jsonl.gz"
)
return cuts_train
@lru_cache()
def test_cuts(self) -> List[CutSet]:
logging.info("About to get test cuts")
cuts_test = load_manifest(self.args.feature_dir / "cuts_test.json.gz")
cuts_test = load_manifest_lazy(
self.args.feature_dir / "yesno_cuts_test.jsonl.gz"
)
return cuts_test

View File

@ -131,7 +131,6 @@ def setup_logger(
format=formatter,
level=level,
filemode="w",
force=True,
)
if use_console:
console = logging.StreamHandler()
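
force=True was only added to logging.basicConfig in Python 3.8, hence its removal here. If the reset behavior were ever needed on older Pythons, a hedged sketch of an equivalent (not part of this commit):

import logging

# Emulate basicConfig(force=True) on Python < 3.8 by detaching any
# handlers installed by earlier configuration calls.
root = logging.getLogger()
for handler in list(root.handlers):
    root.removeHandler(handler)
logging.basicConfig(
    filename="train.log",  # illustrative
    format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
    level=logging.INFO,
    filemode="w",
)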