Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-09-09 17:14:20 +00:00)
More fixes to use lazy CutSet.

This commit is contained in:
parent 0040ff2157
commit 113818fd00
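The diff below migrates several icefall recipes from eager manifest loading (lhotse's load_manifest on *.json.gz files) to lazy loading (load_manifest_lazy on *_cuts_*.jsonl.gz files), which is also why the feature-computation scripts switch from cut_set.to_json() to cut_set.to_file() with JSONL output names. As a minimal sketch of the pattern the recipes move to (not part of the commit itself; the manifest path is just an example taken from the changed files):

from lhotse import load_manifest_lazy

# Lazily open a JSONL cuts manifest: cuts are read on demand while iterating,
# instead of materializing the whole CutSet in memory up front.
cuts = load_manifest_lazy("data/fbank/alimeeting_cuts_train.jsonl.gz")
cuts.describe()  # prints count/duration statistics, as in display_manifest_statistics.py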
@@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)
 
 
 def compute_fbank_alimeeting(num_mel_bins: int = 80):
-    src_dir = Path("data/manifests/alimeeting")
+    src_dir = Path("data/manifests")
     output_dir = Path("data/fbank")
     num_jobs = min(15, os.cpu_count())
 
@@ -52,11 +52,14 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
         "eval",
         "test",
     )
+
+    prefix = "alimeeting"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
         dataset_parts=dataset_parts,
         output_dir=src_dir,
-        prefix="alimeeting",
-        suffix="jsonl.gz",
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
 
@@ -64,7 +67,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
 
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -83,7 +86,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
 
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=cur_num_jobs,
                 executor=ex,
@@ -95,7 +98,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
                 keep_overlapping=False,
                 min_duration=None,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
 
 
 def get_args():
@@ -25,19 +25,19 @@ for usage.
 """
 
 
-from lhotse import load_manifest
+from lhotse import load_manifest_lazy
 
 
 def main():
     paths = [
-        "./data/fbank/cuts_train.json.gz",
-        "./data/fbank/cuts_eval.json.gz",
-        "./data/fbank/cuts_test.json.gz",
+        "./data/fbank/alimeeting_cuts_train.jsonl.gz",
+        "./data/fbank/alimeeting_cuts_eval.jsonl.gz",
+        "./data/fbank/alimeeting_cuts_test.jsonl.gz",
     ]
 
     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest(path)
+        cuts = load_manifest_lazy(path)
         cuts.describe()
 
 
@@ -45,7 +45,7 @@ if __name__ == "__main__":
     main()
 
 """
-Starting display the statistics for ./data/fbank/cuts_train.json.gz
+Starting display the statistics for ./data/fbank/alimeeting_cuts_train.jsonl.gz
 Cuts count: 559092
 Total duration (hours): 424.6
 Speech duration (hours): 424.6 (100.0%)
@@ -61,7 +61,7 @@ min 0.0
 99.5% 14.7
 99.9% 16.2
 max 284.3
-Starting display the statistics for ./data/fbank/cuts_eval.json.gz
+Starting display the statistics for ./data/fbank/alimeeting_cuts_eval.jsonl.gz
 Cuts count: 6457
 Total duration (hours): 4.9
 Speech duration (hours): 4.9 (100.0%)
@@ -77,7 +77,7 @@ min 0.1
 99.5% 14.1
 99.9% 14.7
 max 15.8
-Starting display the statistics for ./data/fbank/cuts_test.json.gz
+Starting display the statistics for ./data/fbank/alimeeting_cuts_test.jsonl.gz
 Cuts count: 16358
 Total duration (hours): 12.5
 Speech duration (hours): 12.5 (100.0%)
@@ -27,7 +27,7 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest,
+    load_manifest_lazy,
     set_caching_enabled,
 )
 from lhotse.dataset import (
@@ -204,8 +204,8 @@ class AlimeetingAsrDataModule:
           The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
 
         transforms = []
@@ -401,14 +401,20 @@ class AlimeetingAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "alimeeting_cuts_train.jsonl.gz"
+        )
 
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_eval.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "alimeeting_cuts_eval.jsonl.gz"
+        )
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "alimeeting_cuts_test.jsonl.gz"
+        )
@@ -20,7 +20,7 @@ import logging
 from functools import lru_cache
 from pathlib import Path
 
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
     CutConcatenate,
     CutMix,
@@ -189,7 +189,7 @@ class GigaSpeechAsrDataModule:
 
     def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
+        cuts_musan = load_manifest_lazy(
             self.args.manifest_dir / "musan_cuts.jsonl.gz"
         )
 
@@ -362,7 +362,9 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        cuts_valid = load_manifest_lazy(
+            self.args.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz"
+        )
         if self.args.small_dev:
             return cuts_valid.subset(first=1000)
         else:
@@ -371,4 +373,6 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz"
+        )
@@ -23,7 +23,7 @@ from pathlib import Path
 from typing import Any, Dict, Optional
 
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
     CutConcatenate,
     CutMix,
@@ -216,7 +216,7 @@ class GigaSpeechAsrDataModule:
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
             logging.info("About to get Musan cuts")
-            cuts_musan = load_manifest(
+            cuts_musan = load_manifest_lazy(
                 self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
@@ -405,7 +405,9 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        cuts_valid = load_manifest_lazy(
+            self.args.manifest_dir / "cuts_DEV.jsonl.gz"
+        )
         if self.args.small_dev:
             return cuts_valid.subset(first=1000)
         else:
@@ -414,4 +416,4 @@ class GigaSpeechAsrDataModule:
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
@@ -96,14 +96,14 @@ def get_parser():
 
             - labels_xxx.h5
             - aux_labels_xxx.h5
-            - cuts_xxx.json.gz
+            - librispeech_cuts_xxx.jsonl.gz
 
         where xxx is the value of `--dataset`. For instance, if
         `--dataset` is `train-clean-100`, it will contain 3 files:
 
             - `labels_train-clean-100.h5`
             - `aux_labels_train-clean-100.h5`
-            - `cuts_train-clean-100.json.gz`
+            - `librispeech_cuts_train-clean-100.jsonl.gz`
 
         Note: Both labels_xxx.h5 and aux_labels_xxx.h5 contain framewise
         alignment. The difference is that labels_xxx.h5 contains repeats.
@@ -289,7 +289,9 @@ def main():
 
     out_labels_ali_filename = out_dir / f"labels_{params.dataset}.h5"
     out_aux_labels_ali_filename = out_dir / f"aux_labels_{params.dataset}.h5"
-    out_manifest_filename = out_dir / f"cuts_{params.dataset}.json.gz"
+    out_manifest_filename = (
+        out_dir / f"librispeech_cuts_{params.dataset}.jsonl.gz"
+    )
 
     for f in (
         out_labels_ali_filename,
@@ -22,7 +22,7 @@ from pathlib import Path
 from typing import Any, Dict, Optional
 
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
     CutConcatenate,
     CutMix,
@@ -176,7 +176,7 @@ class SPGISpeechAsrDataModule:
           The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
+        cuts_musan = load_manifest_lazy(
             self.args.manifest_dir / "cuts_musan.jsonl.gz"
         )
 
@@ -52,8 +52,13 @@ def compute_fbank_tedlium():
         "test",
     )
 
+    prefix = "tedlium"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="tedlium", dataset_parts=dataset_parts, output_dir=src_dir
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
 
@@ -61,7 +66,7 @@ def compute_fbank_tedlium():
 
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -80,7 +85,7 @@ def compute_fbank_tedlium():
 
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=cur_num_jobs,
                 executor=ex,
@@ -88,7 +93,7 @@ def compute_fbank_tedlium():
             )
             # Split long cuts into many short and un-overlapping cuts
             cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
 
 
 if __name__ == "__main__":
@@ -27,15 +27,15 @@ for usage.
 """
 
 
-from lhotse import load_manifest
+from lhotse import load_manifest_lazy
 
 
 def main():
-    path = "./data/fbank/cuts_train.json.gz"
-    path = "./data/fbank/cuts_dev.json.gz"
-    path = "./data/fbank/cuts_test.json.gz"
+    path = "./data/fbank/tedlium_cuts_train.jsonl.gz"
+    path = "./data/fbank/tedlium_cuts_dev.jsonl.gz"
+    path = "./data/fbank/tedlium_cuts_test.jsonl.gz"
 
-    cuts = load_manifest(path)
+    cuts = load_manifest_lazy(path)
     cuts.describe()
 
 
@@ -22,11 +22,11 @@ import logging
 from functools import lru_cache
 from pathlib import Path
 
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
+    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -92,7 +92,7 @@ class TedLiumAsrDataModule:
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the BucketingSampler"
+            help="The number of buckets for the DynamicBucketingSampler"
             "(you might want to increase it for larger datasets).",
         )
         group.add_argument(
@@ -179,8 +179,8 @@ class TedLiumAsrDataModule:
         transforms = []
         if self.args.enable_musan:
             logging.info("Enable MUSAN")
-            cuts_musan = load_manifest(
-                self.args.manifest_dir / "cuts_musan.json.gz"
+            cuts_musan = load_manifest_lazy(
+                self.args.manifest_dir / "musan_cuts.jsonl.gz"
             )
             transforms.append(
                 CutMix(
@@ -261,13 +261,12 @@ class TedLiumAsrDataModule:
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -311,7 +310,7 @@ class TedLiumAsrDataModule:
             cut_transforms=transforms,
             return_cuts=self.args.return_cuts,
         )
-        valid_sampler = BucketingSampler(
+        valid_sampler = DynamicBucketingSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            shuffle=False,
@@ -335,8 +334,10 @@ class TedLiumAsrDataModule:
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts,
+            max_duration=self.args.max_duration,
+            shuffle=False,
         )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
@@ -350,14 +351,20 @@ class TedLiumAsrDataModule:
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "tedlium_cuts_train.jsonl.gz"
+        )
 
     @lru_cache()
     def dev_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "tedlium_cuts_dev.jsonl.gz"
+        )
 
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "tedlium_cuts_test.jsonl.gz"
+        )
@@ -29,7 +29,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -53,8 +53,13 @@ def compute_fbank_timit():
         "DEV",
         "TEST",
     )
+    prefix = "timit"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
-        prefix="timit", dataset_parts=dataset_parts, output_dir=src_dir
+        dataset_parts=dataset_parts,
+        output_dir=src_dir,
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
 
@@ -62,7 +67,8 @@ def compute_fbank_timit():
 
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
+            if cuts_file.is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -78,13 +84,13 @@ def compute_fbank_timit():
             )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 80,
                 executor=ex,
-                storage_type=LilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(cuts_file)
 
 
 if __name__ == "__main__":
@@ -23,11 +23,11 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List, Union
 
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
 from lhotse.dataset import (
-    BucketingSampler,
     CutConcatenate,
     CutMix,
+    DynamicBucketingSampler,
     K2SpeechRecognitionDataset,
     PrecomputedFeatures,
     SingleCutSampler,
@@ -92,7 +92,7 @@ class TimitAsrDataModule(DataModule):
             "--num-buckets",
             type=int,
             default=30,
-            help="The number of buckets for the BucketingSampler"
+            help="The number of buckets for the DynamicBucketingSampler"
             "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
@@ -154,7 +154,9 @@ class TimitAsrDataModule(DataModule):
         cuts_train = self.train_cuts()
 
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz")
+        cuts_musan = load_manifest_lazy(
+            self.args.feature_dir / "cuts_musan.jsonl.gz"
+        )
 
         logging.info("About to create train dataset")
         transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
@@ -218,13 +220,12 @@ class TimitAsrDataModule(DataModule):
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
@@ -322,20 +323,26 @@ class TimitAsrDataModule(DataModule):
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest(self.args.feature_dir / "cuts_TRAIN.json.gz")
+        cuts_train = load_manifest_lazy(
+            self.args.feature_dir / "timit_cuts_TRAIN.jsonl.gz"
+        )
 
         return cuts_train
 
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        cuts_valid = load_manifest(self.args.feature_dir / "cuts_DEV.json.gz")
+        cuts_valid = load_manifest_lazy(
+            self.args.feature_dir / "timit_cuts_DEV.jsonl.gz"
+        )
 
         return cuts_valid
 
     @lru_cache()
     def test_cuts(self) -> CutSet:
         logging.debug("About to get test cuts")
-        cuts_test = load_manifest(self.args.feature_dir / "cuts_TEST.json.gz")
+        cuts_test = load_manifest_lazy(
+            self.args.feature_dir / "timit_cuts_TEST.jsonl.gz"
+        )
 
         return cuts_test
@@ -26,7 +26,7 @@ for usage.
 """
 
 
-from lhotse import load_manifest
+from lhotse import load_manifest_lazy
 
 
 def main():
@@ -40,7 +40,7 @@ def main():
 
     for path in paths:
         print(f"Starting display the statistics for {path}")
-        cuts = load_manifest(path)
+        cuts = load_manifest_lazy(path)
         cuts.describe()
 
 
@@ -27,7 +27,7 @@ from lhotse import (
     CutSet,
     Fbank,
     FbankConfig,
-    load_manifest,
+    load_manifest_lazy,
     set_caching_enabled,
 )
 from lhotse.dataset import (
@@ -218,8 +218,8 @@ class WenetSpeechAsrDataModule:
           The state dict for the training sampler.
         """
         logging.info("About to get Musan cuts")
-        cuts_musan = load_manifest(
-            self.args.manifest_dir / "cuts_musan.json.gz"
+        cuts_musan = load_manifest_lazy(
+            self.args.manifest_dir / "musan_cuts.jsonl.gz"
        )
 
        transforms = []
@@ -435,16 +435,18 @@ class WenetSpeechAsrDataModule:
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
+        return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
 
     @lru_cache()
     def test_net_cuts(self) -> List[CutSet]:
         logging.info("About to get TEST_NET cuts")
-        return load_manifest(self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz")
+        return load_manifest_lazy(
+            self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz"
+        )
 
     @lru_cache()
     def test_meeting_cuts(self) -> List[CutSet]:
         logging.info("About to get TEST_MEETING cuts")
-        return load_manifest(
+        return load_manifest_lazy(
             self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz"
         )
@@ -12,7 +12,7 @@ import os
 from pathlib import Path
 
 import torch
-from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
+from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 
 from icefall.utils import get_executor
@@ -37,10 +37,13 @@ def compute_fbank_yesno():
         "train",
         "test",
     )
+    prefix = "yesno"
+    suffix = "jsonl.gz"
     manifests = read_manifests_if_cached(
         dataset_parts=dataset_parts,
         output_dir=src_dir,
-        prefix="yesno",
+        prefix=prefix,
+        suffix=suffix,
     )
     assert manifests is not None
 
@@ -50,7 +53,8 @@ def compute_fbank_yesno():
 
     with get_executor() as ex:  # Initialize the executor only once.
         for partition, m in manifests.items():
-            if (output_dir / f"cuts_{partition}.json.gz").is_file():
+            cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
+            if cuts_file.is_file():
                 logging.info(f"{partition} already exists - skipping.")
                 continue
             logging.info(f"Processing {partition}")
@@ -66,13 +70,13 @@ def compute_fbank_yesno():
             )
             cut_set = cut_set.compute_and_store_features(
                 extractor=extractor,
-                storage_path=f"{output_dir}/feats_{partition}",
+                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                 # when an executor is specified, make more partitions
                 num_jobs=num_jobs if ex is None else 1,  # use one job
                 executor=ex,
-                storage_type=LilcomHdf5Writer,
+                storage_type=LilcomChunkyWriter,
             )
-            cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
+            cut_set.to_file(cuts_file)
 
 
 if __name__ == "__main__":
@@ -20,18 +20,19 @@ from functools import lru_cache
 from pathlib import Path
 from typing import List
 
+from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
+from lhotse.dataset import (
+    CutConcatenate,
+    DynamicBucketingSampler,
+    K2SpeechRecognitionDataset,
+    PrecomputedFeatures,
+    SingleCutSampler,
+)
+from lhotse.dataset.input_strategies import OnTheFlyFeatures
 from torch.utils.data import DataLoader
 
 from icefall.dataset.datamodule import DataModule
 from icefall.utils import str2bool
-from lhotse import CutSet, Fbank, FbankConfig, load_manifest
-from lhotse.dataset import (
-    BucketingSampler,
-    CutConcatenate,
-    K2SpeechRecognitionDataset,
-    PrecomputedFeatures,
-)
-from lhotse.dataset.input_strategies import OnTheFlyFeatures
 
 
 class YesNoAsrDataModule(DataModule):
@@ -84,7 +85,7 @@ class YesNoAsrDataModule(DataModule):
             "--num-buckets",
             type=int,
             default=10,
-            help="The number of buckets for the BucketingSampler"
+            help="The number of buckets for the DynamicBucketingSampler"
             "(you might want to increase it for larger datasets).",
        )
        group.add_argument(
@@ -186,18 +187,17 @@ class YesNoAsrDataModule(DataModule):
         )
 
         if self.args.bucketing_sampler:
-            logging.info("Using BucketingSampler.")
-            train_sampler = BucketingSampler(
+            logging.info("Using DynamicBucketingSampler.")
+            train_sampler = DynamicBucketingSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
                 num_buckets=self.args.num_buckets,
-                bucket_method="equal_duration",
                 drop_last=True,
             )
         else:
             logging.info("Using SingleCutSampler.")
-            train_sampler = BucketingSampler(
+            train_sampler = SingleCutSampler(
                 cuts_train,
                 max_duration=self.args.max_duration,
                 shuffle=self.args.shuffle,
@@ -225,8 +225,10 @@ class YesNoAsrDataModule(DataModule):
             else PrecomputedFeatures(),
             return_cuts=self.args.return_cuts,
         )
-        sampler = BucketingSampler(
-            cuts_test, max_duration=self.args.max_duration, shuffle=False
+        sampler = DynamicBucketingSampler(
+            cuts_test,
+            max_duration=self.args.max_duration,
+            shuffle=False,
         )
         logging.debug("About to create test dataloader")
         test_dl = DataLoader(
@@ -240,11 +242,15 @@ class YesNoAsrDataModule(DataModule):
     @lru_cache()
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
-        cuts_train = load_manifest(self.args.feature_dir / "cuts_train.json.gz")
+        cuts_train = load_manifest_lazy(
+            self.args.feature_dir / "yesno_cuts_train.jsonl.gz"
+        )
         return cuts_train
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        cuts_test = load_manifest(self.args.feature_dir / "cuts_test.json.gz")
+        cuts_test = load_manifest_lazy(
+            self.args.feature_dir / "yesno_cuts_test.jsonl.gz"
+        )
         return cuts_test
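Beyond the manifest loading, the recipes above also replace lhotse's BucketingSampler with DynamicBucketingSampler (dropping the bucket_method="equal_duration" argument, which the dynamic sampler does not take) and LilcomHdf5Writer with LilcomChunkyWriter for feature storage. A rough sketch of how the updated data modules wire the sampler into a dataloader, with illustrative values rather than the exact settings of any one recipe:

from lhotse import load_manifest_lazy
from lhotse.dataset import DynamicBucketingSampler, K2SpeechRecognitionDataset
from torch.utils.data import DataLoader

# Lazy CutSet loaded from a JSONL manifest (path is an example from the diff).
cuts_train = load_manifest_lazy("data/fbank/tedlium_cuts_train.jsonl.gz")

# DynamicBucketingSampler estimates duration buckets on the fly, so it can
# work directly with a lazily loaded CutSet.
train_sampler = DynamicBucketingSampler(
    cuts_train,
    max_duration=200.0,  # illustrative value
    shuffle=True,
    num_buckets=30,
    drop_last=True,
)

train_dl = DataLoader(
    K2SpeechRecognitionDataset(return_cuts=True),
    sampler=train_sampler,
    batch_size=None,  # the sampler already forms batches of cuts
    num_workers=2,
)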