mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
Use jsonl for CutSet in the LibriSpeech recipe. (#397)
* Use jsonl for cutsets in the librispeech recipe. * Use lazy cutset for all recipes. * More fixes to use lazy CutSet. * Remove force=True from logging to support Python < 3.8 * Minor fixes. * Fix style issues.
This commit is contained in:
parent
e5884f82e0
commit
f1abce72f8
@ -59,6 +59,8 @@ jobs:
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
|
@ -59,6 +59,8 @@ jobs:
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -99,7 +101,7 @@ jobs:
|
||||
with:
|
||||
path: |
|
||||
~/tmp/fbank-libri
|
||||
key: cache-libri-fbank-test-clean-and-test-other
|
||||
key: cache-libri-fbank-test-clean-and-test-other-v2
|
||||
|
||||
- name: Compute fbank for LibriSpeech test-clean and test-other
|
||||
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
|
||||
|
@ -59,6 +59,8 @@ jobs:
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -99,7 +101,7 @@ jobs:
|
||||
with:
|
||||
path: |
|
||||
~/tmp/fbank-libri
|
||||
key: cache-libri-fbank-test-clean-and-test-other
|
||||
key: cache-libri-fbank-test-clean-and-test-other-v2
|
||||
|
||||
- name: Compute fbank for LibriSpeech test-clean and test-other
|
||||
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
|
||||
|
@ -59,6 +59,8 @@ jobs:
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -99,7 +101,7 @@ jobs:
|
||||
with:
|
||||
path: |
|
||||
~/tmp/fbank-libri
|
||||
key: cache-libri-fbank-test-clean-and-test-other
|
||||
key: cache-libri-fbank-test-clean-and-test-other-v2
|
||||
|
||||
- name: Compute fbank for LibriSpeech test-clean and test-other
|
||||
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
|
||||
|
@ -59,6 +59,8 @@ jobs:
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -99,7 +101,7 @@ jobs:
|
||||
with:
|
||||
path: |
|
||||
~/tmp/fbank-libri
|
||||
key: cache-libri-fbank-test-clean-and-test-other
|
||||
key: cache-libri-fbank-test-clean-and-test-other-v2
|
||||
|
||||
- name: Compute fbank for LibriSpeech test-clean and test-other
|
||||
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
|
||||
|
@ -59,6 +59,8 @@ jobs:
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -99,7 +101,7 @@ jobs:
|
||||
with:
|
||||
path: |
|
||||
~/tmp/fbank-libri
|
||||
key: cache-libri-fbank-test-clean-and-test-other
|
||||
key: cache-libri-fbank-test-clean-and-test-other-v2
|
||||
|
||||
- name: Compute fbank for LibriSpeech test-clean and test-other
|
||||
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
|
||||
|
@ -58,6 +58,8 @@ jobs:
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -98,7 +100,7 @@ jobs:
|
||||
with:
|
||||
path: |
|
||||
~/tmp/fbank-libri
|
||||
key: cache-libri-fbank-test-clean-and-test-other
|
||||
key: cache-libri-fbank-test-clean-and-test-other-v2
|
||||
|
||||
- name: Compute fbank for LibriSpeech test-clean and test-other
|
||||
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
|
||||
|
@ -58,6 +58,8 @@ jobs:
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -98,7 +100,7 @@ jobs:
|
||||
with:
|
||||
path: |
|
||||
~/tmp/fbank-libri
|
||||
key: cache-libri-fbank-test-clean-and-test-other
|
||||
key: cache-libri-fbank-test-clean-and-test-other-v2
|
||||
|
||||
- name: Compute fbank for LibriSpeech test-clean and test-other
|
||||
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
|
||||
|
@ -58,6 +58,8 @@ jobs:
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
grep -v '^#' ./requirements-ci.txt | xargs -n 1 -L 1 pip install
|
||||
pip uninstall -y protobuf
|
||||
pip install --no-binary protobuf protobuf
|
||||
|
||||
- name: Cache kaldifeat
|
||||
id: my-cache
|
||||
@ -98,7 +100,7 @@ jobs:
|
||||
with:
|
||||
path: |
|
||||
~/tmp/fbank-libri
|
||||
key: cache-libri-fbank-test-clean-and-test-other
|
||||
key: cache-libri-fbank-test-clean-and-test-other-v2
|
||||
|
||||
- name: Compute fbank for LibriSpeech test-clean and test-other
|
||||
if: steps.libri-test-clean-and-test-other-fbank.outputs.cache-hit != 'true'
|
||||
|
@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)
|
||||
|
||||
|
||||
def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
|
||||
src_dir = Path("data/manifests/aidatatang_200zh")
|
||||
src_dir = Path("data/manifests")
|
||||
output_dir = Path("data/fbank")
|
||||
num_jobs = min(15, os.cpu_count())
|
||||
|
||||
@ -52,11 +52,13 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
|
||||
"dev",
|
||||
"test",
|
||||
)
|
||||
prefix = "aidatatang"
|
||||
suffix = "jsonl.gz"
|
||||
manifests = read_manifests_if_cached(
|
||||
prefix="aidatatang",
|
||||
suffix="jsonl.gz",
|
||||
dataset_parts=dataset_parts,
|
||||
output_dir=src_dir,
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
)
|
||||
assert manifests is not None
|
||||
|
||||
@ -64,10 +66,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
|
||||
|
||||
with get_executor() as ex: # Initialize the executor only once.
|
||||
for partition, m in manifests.items():
|
||||
if (output_dir / f"cuts_{partition}.json.gz").is_file():
|
||||
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
|
||||
logging.info(f"{partition} already exists - skipping.")
|
||||
continue
|
||||
logging.info(f"Processing {partition}")
|
||||
|
||||
for sup in m["supervisions"]:
|
||||
sup.custom = {"origin": "aidatatang_200zh"}
|
||||
|
||||
cut_set = CutSet.from_manifests(
|
||||
recordings=m["recordings"],
|
||||
supervisions=m["supervisions"],
|
||||
@ -80,13 +86,14 @@ def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
|
||||
)
|
||||
cut_set = cut_set.compute_and_store_features(
|
||||
extractor=extractor,
|
||||
storage_path=f"{output_dir}/feats_{partition}",
|
||||
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
|
||||
# when an executor is specified, make more partitions
|
||||
num_jobs=num_jobs if ex is None else 80,
|
||||
executor=ex,
|
||||
storage_type=ChunkedLilcomHdf5Writer,
|
||||
)
|
||||
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
|
||||
|
||||
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
|
||||
|
||||
|
||||
def get_args():
|
||||
|
@ -25,19 +25,19 @@ for usage.
|
||||
"""
|
||||
|
||||
|
||||
from lhotse import load_manifest
|
||||
from lhotse import load_manifest_lazy
|
||||
|
||||
|
||||
def main():
|
||||
paths = [
|
||||
"./data/fbank/cuts_train.json.gz",
|
||||
"./data/fbank/cuts_dev.json.gz",
|
||||
"./data/fbank/cuts_test.json.gz",
|
||||
"./data/fbank/aidatatang_cuts_train.jsonl.gz",
|
||||
"./data/fbank/aidatatang_cuts_dev.jsonl.gz",
|
||||
"./data/fbank/aidatatang_cuts_test.jsonl.gz",
|
||||
]
|
||||
|
||||
for path in paths:
|
||||
print(f"Starting display the statistics for {path}")
|
||||
cuts = load_manifest(path)
|
||||
cuts = load_manifest_lazy(path)
|
||||
cuts.describe()
|
||||
|
||||
|
||||
@ -45,7 +45,7 @@ if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
"""
|
||||
Starting display the statistics for ./data/fbank/cuts_train.json.gz
|
||||
Starting display the statistics for ./data/fbank/aidatatang_cuts_train.jsonl.gz
|
||||
Cuts count: 494715
|
||||
Total duration (hours): 422.6
|
||||
Speech duration (hours): 422.6 (100.0%)
|
||||
@ -61,7 +61,7 @@ min 1.0
|
||||
99.5% 8.0
|
||||
99.9% 9.5
|
||||
max 18.1
|
||||
Starting display the statistics for ./data/fbank/cuts_dev.json.gz
|
||||
Starting display the statistics for ./data/fbank/aidatatang_cuts_dev.jsonl.gz
|
||||
Cuts count: 24216
|
||||
Total duration (hours): 20.2
|
||||
Speech duration (hours): 20.2 (100.0%)
|
||||
@ -77,7 +77,7 @@ min 1.2
|
||||
99.5% 7.3
|
||||
99.9% 8.8
|
||||
max 11.3
|
||||
Starting display the statistics for ./data/fbank/cuts_test.json.gz
|
||||
Starting display the statistics for ./data/fbank/aidatatang_cuts_test.jsonl.gz
|
||||
Cuts count: 48144
|
||||
Total duration (hours): 40.2
|
||||
Speech duration (hours): 40.2 (100.0%)
|
||||
|
@ -27,11 +27,10 @@ from lhotse import (
|
||||
CutSet,
|
||||
Fbank,
|
||||
FbankConfig,
|
||||
load_manifest,
|
||||
load_manifest_lazy,
|
||||
set_caching_enabled,
|
||||
)
|
||||
from lhotse.dataset import (
|
||||
BucketingSampler,
|
||||
CutConcatenate,
|
||||
CutMix,
|
||||
DynamicBucketingSampler,
|
||||
@ -205,8 +204,8 @@ class Aidatatang_200zhAsrDataModule:
|
||||
The state dict for the training sampler.
|
||||
"""
|
||||
logging.info("About to get Musan cuts")
|
||||
cuts_musan = load_manifest(
|
||||
self.args.manifest_dir / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
self.args.manifest_dir / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
|
||||
transforms = []
|
||||
@ -290,13 +289,12 @@ class Aidatatang_200zhAsrDataModule:
|
||||
)
|
||||
|
||||
if self.args.bucketing_sampler:
|
||||
logging.info("Using BucketingSampler.")
|
||||
train_sampler = BucketingSampler(
|
||||
logging.info("Using DynamicBucketingSampler.")
|
||||
train_sampler = DynamicBucketingSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
num_buckets=self.args.num_buckets,
|
||||
bucket_method="equal_duration",
|
||||
drop_last=True,
|
||||
)
|
||||
else:
|
||||
@ -402,14 +400,20 @@ class Aidatatang_200zhAsrDataModule:
|
||||
@lru_cache()
|
||||
def train_cuts(self) -> CutSet:
|
||||
logging.info("About to get train cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def valid_cuts(self) -> CutSet:
|
||||
logging.info("About to get dev cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "aidatatang_cuts_dev.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def test_cuts(self) -> List[CutSet]:
|
||||
logging.info("About to get test cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
|
||||
)
|
||||
|
@ -195,9 +195,9 @@ def get_params() -> AttributeDict:
|
||||
"best_train_epoch": -1,
|
||||
"best_valid_epoch": -1,
|
||||
"batch_idx_train": 0,
|
||||
"log_interval": 10,
|
||||
"log_interval": 50,
|
||||
"reset_interval": 200,
|
||||
"valid_interval": 3000,
|
||||
"valid_interval": 2000,
|
||||
# parameters for k2.ctc_loss
|
||||
"beam_size": 10,
|
||||
"reduction": "sum",
|
||||
|
119
egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py
Executable file
119
egs/aishell/ASR/local/compute_fbank_aidatatang_200zh.py
Executable file
@ -0,0 +1,119 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
"""
|
||||
This file computes fbank features of the aidatatang_200zh dataset.
|
||||
It looks for manifests in the directory data/manifests.
|
||||
|
||||
The generated fbank features are saved in data/fbank.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
from icefall.utils import get_executor
|
||||
|
||||
# Torch's multithreaded behavior needs to be disabled or
|
||||
# it wastes a lot of CPU and slow things down.
|
||||
# Do this outside of main() in case it needs to take effect
|
||||
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||
torch.set_num_threads(1)
|
||||
torch.set_num_interop_threads(1)
|
||||
|
||||
|
||||
def compute_fbank_aidatatang_200zh(num_mel_bins: int = 80):
|
||||
src_dir = Path("data/manifests")
|
||||
output_dir = Path("data/fbank")
|
||||
num_jobs = min(15, os.cpu_count())
|
||||
|
||||
dataset_parts = (
|
||||
"train",
|
||||
"test",
|
||||
"dev",
|
||||
)
|
||||
prefix = "aidatatang"
|
||||
suffix = "jsonl.gz"
|
||||
manifests = read_manifests_if_cached(
|
||||
dataset_parts=dataset_parts,
|
||||
output_dir=src_dir,
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
)
|
||||
assert manifests is not None
|
||||
|
||||
extractor = Fbank(FbankConfig(num_mel_bins=num_mel_bins))
|
||||
|
||||
with get_executor() as ex: # Initialize the executor only once.
|
||||
for partition, m in manifests.items():
|
||||
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
|
||||
logging.info(f"{partition} already exists - skipping.")
|
||||
continue
|
||||
logging.info(f"Processing {partition}")
|
||||
|
||||
for sup in m["supervisions"]:
|
||||
sup.custom = {"origin": "aidatatang_200zh"}
|
||||
|
||||
cut_set = CutSet.from_manifests(
|
||||
recordings=m["recordings"],
|
||||
supervisions=m["supervisions"],
|
||||
)
|
||||
if "train" in partition:
|
||||
cut_set = (
|
||||
cut_set
|
||||
+ cut_set.perturb_speed(0.9)
|
||||
+ cut_set.perturb_speed(1.1)
|
||||
)
|
||||
cut_set = cut_set.compute_and_store_features(
|
||||
extractor=extractor,
|
||||
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
|
||||
# when an executor is specified, make more partitions
|
||||
num_jobs=num_jobs if ex is None else 80,
|
||||
executor=ex,
|
||||
storage_type=LilcomChunkyWriter,
|
||||
)
|
||||
|
||||
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--num-mel-bins",
|
||||
type=int,
|
||||
default=80,
|
||||
help="""The number of mel bins for Fbank""",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
formatter = (
|
||||
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||
)
|
||||
|
||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||
|
||||
args = get_args()
|
||||
compute_fbank_aidatatang_200zh(num_mel_bins=args.num_mel_bins)
|
@ -29,7 +29,7 @@ import os
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
from icefall.utils import get_executor
|
||||
@ -52,8 +52,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
|
||||
"dev",
|
||||
"test",
|
||||
)
|
||||
prefix = "aishell"
|
||||
suffix = "jsonl.gz"
|
||||
manifests = read_manifests_if_cached(
|
||||
prefix="aishell", dataset_parts=dataset_parts, output_dir=src_dir
|
||||
dataset_parts=dataset_parts,
|
||||
output_dir=src_dir,
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
)
|
||||
assert manifests is not None
|
||||
|
||||
@ -61,7 +66,7 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
|
||||
|
||||
with get_executor() as ex: # Initialize the executor only once.
|
||||
for partition, m in manifests.items():
|
||||
if (output_dir / f"cuts_{partition}.json.gz").is_file():
|
||||
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
|
||||
logging.info(f"{partition} already exists - skipping.")
|
||||
continue
|
||||
logging.info(f"Processing {partition}")
|
||||
@ -77,13 +82,13 @@ def compute_fbank_aishell(num_mel_bins: int = 80):
|
||||
)
|
||||
cut_set = cut_set.compute_and_store_features(
|
||||
extractor=extractor,
|
||||
storage_path=f"{output_dir}/feats_{partition}",
|
||||
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
|
||||
# when an executor is specified, make more partitions
|
||||
num_jobs=num_jobs if ex is None else 80,
|
||||
executor=ex,
|
||||
storage_type=LilcomHdf5Writer,
|
||||
storage_type=LilcomChunkyWriter,
|
||||
)
|
||||
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
|
||||
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
|
||||
|
||||
|
||||
def get_args():
|
||||
|
@ -25,18 +25,18 @@ for usage.
|
||||
"""
|
||||
|
||||
|
||||
from lhotse import load_manifest
|
||||
from lhotse import load_manifest_lazy
|
||||
|
||||
|
||||
def main():
|
||||
# path = "./data/fbank/cuts_train.json.gz"
|
||||
# path = "./data/fbank/cuts_test.json.gz"
|
||||
# path = "./data/fbank/cuts_dev.json.gz"
|
||||
# path = "./data/fbank/aidatatang_200zh/cuts_train_raw.jsonl.gz"
|
||||
# path = "./data/fbank/aidatatang_200zh/cuts_test_raw.jsonl.gz"
|
||||
path = "./data/fbank/aidatatang_200zh/cuts_dev_raw.jsonl.gz"
|
||||
# path = "./data/fbank/aishell_cuts_train.jsonl.gz"
|
||||
# path = "./data/fbank/aishell_cuts_test.jsonl.gz"
|
||||
path = "./data/fbank/aishell_cuts_dev.jsonl.gz"
|
||||
# path = "./data/fbank/aidatatang_cuts_train.jsonl.gz"
|
||||
# path = "./data/fbank/aidatatang_cuts_test.jsonl.gz"
|
||||
# path = "./data/fbank/aidatatang_cuts_dev.jsonl.gz"
|
||||
|
||||
cuts = load_manifest(path)
|
||||
cuts = load_manifest_lazy(path)
|
||||
cuts.describe()
|
||||
|
||||
|
||||
|
@ -1,71 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2022 Xiaomi Corp. (Fangjun Kuang)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
|
||||
def preprocess_aidatatang_200zh():
|
||||
src_dir = Path("data/manifests/aidatatang_200zh")
|
||||
output_dir = Path("data/fbank/aidatatang_200zh")
|
||||
output_dir.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
dataset_parts = (
|
||||
"train",
|
||||
"test",
|
||||
"dev",
|
||||
)
|
||||
|
||||
logging.info("Loading manifest")
|
||||
manifests = read_manifests_if_cached(
|
||||
dataset_parts=dataset_parts, output_dir=src_dir, prefix="aidatatang"
|
||||
)
|
||||
assert len(manifests) > 0
|
||||
|
||||
for partition, m in manifests.items():
|
||||
logging.info(f"Processing {partition}")
|
||||
raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
|
||||
if raw_cuts_path.is_file():
|
||||
logging.info(f"{partition} already exists - skipping")
|
||||
continue
|
||||
|
||||
for sup in m["supervisions"]:
|
||||
sup.custom = {"origin": "aidatatang_200zh"}
|
||||
|
||||
cut_set = CutSet.from_manifests(
|
||||
recordings=m["recordings"],
|
||||
supervisions=m["supervisions"],
|
||||
)
|
||||
|
||||
logging.info(f"Saving to {raw_cuts_path}")
|
||||
cut_set.to_file(raw_cuts_path)
|
||||
|
||||
|
||||
def main():
|
||||
formatter = (
|
||||
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||
)
|
||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||
|
||||
preprocess_aidatatang_200zh()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -42,18 +42,18 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
||||
log "Stage 1: Prepare manifest"
|
||||
# We assume that you have downloaded the aidatatang_200zh corpus
|
||||
# to $dl_dir/aidatatang_200zh
|
||||
if [ ! -f data/manifests/aidatatang_200zh/.manifests.done ]; then
|
||||
mkdir -p data/manifests/aidatatang_200zh
|
||||
lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
|
||||
touch data/manifests/aidatatang_200zh/.manifests.done
|
||||
if [ ! -f data/manifests/.aidatatang_200zh_manifests.done ]; then
|
||||
mkdir -p data/manifests
|
||||
lhotse prepare aidatatang-200zh $dl_dir data/manifests
|
||||
touch data/manifests/.aidatatang_200zh_manifests.done
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||
log "Stage 2: Process aidatatang_200zh"
|
||||
if [ ! -f data/fbank/aidatatang_200zh/.fbank.done ]; then
|
||||
mkdir -p data/fbank/aidatatang_200zh
|
||||
lhotse prepare aidatatang-200zh $dl_dir data/manifests/aidatatang_200zh
|
||||
touch data/fbank/aidatatang_200zh/.fbank.done
|
||||
if [ ! -f data/fbank/.aidatatang_200zh_fbank.done ]; then
|
||||
mkdir -p data/fbank
|
||||
./local/compute_fbank_aidatatang_200zh.py
|
||||
touch data/fbank/.aidatatang_200zh_fbank.done
|
||||
fi
|
||||
fi
|
||||
|
@ -23,11 +23,11 @@ from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
|
||||
from lhotse.dataset import (
|
||||
BucketingSampler,
|
||||
CutConcatenate,
|
||||
CutMix,
|
||||
DynamicBucketingSampler,
|
||||
K2SpeechRecognitionDataset,
|
||||
PrecomputedFeatures,
|
||||
SingleCutSampler,
|
||||
@ -93,7 +93,7 @@ class AishellAsrDataModule:
|
||||
"--num-buckets",
|
||||
type=int,
|
||||
default=30,
|
||||
help="The number of buckets for the BucketingSampler"
|
||||
help="The number of buckets for the DynamicBucketingSampler"
|
||||
"(you might want to increase it for larger datasets).",
|
||||
)
|
||||
group.add_argument(
|
||||
@ -133,6 +133,12 @@ class AishellAsrDataModule:
|
||||
help="When enabled (=default), the examples will be "
|
||||
"shuffled for each epoch.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--drop-last",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="Whether to drop last batch. Used by sampler.",
|
||||
)
|
||||
group.add_argument(
|
||||
"--return-cuts",
|
||||
type=str2bool,
|
||||
@ -177,8 +183,8 @@ class AishellAsrDataModule:
|
||||
|
||||
def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
|
||||
logging.info("About to get Musan cuts")
|
||||
cuts_musan = load_manifest(
|
||||
self.args.manifest_dir / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
self.args.manifest_dir / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
|
||||
transforms = []
|
||||
@ -262,14 +268,13 @@ class AishellAsrDataModule:
|
||||
)
|
||||
|
||||
if self.args.bucketing_sampler:
|
||||
logging.info("Using BucketingSampler.")
|
||||
train_sampler = BucketingSampler(
|
||||
logging.info("Using DynamicBucketingSampler.")
|
||||
train_sampler = DynamicBucketingSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
num_buckets=self.args.num_buckets,
|
||||
bucket_method="equal_duration",
|
||||
drop_last=True,
|
||||
drop_last=self.args.drop_last,
|
||||
)
|
||||
else:
|
||||
logging.info("Using SingleCutSampler.")
|
||||
@ -313,7 +318,7 @@ class AishellAsrDataModule:
|
||||
cut_transforms=transforms,
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
valid_sampler = BucketingSampler(
|
||||
valid_sampler = DynamicBucketingSampler(
|
||||
cuts_valid,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
@ -337,8 +342,10 @@ class AishellAsrDataModule:
|
||||
else PrecomputedFeatures(),
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
sampler = BucketingSampler(
|
||||
cuts, max_duration=self.args.max_duration, shuffle=False
|
||||
sampler = DynamicBucketingSampler(
|
||||
cuts,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
)
|
||||
test_dl = DataLoader(
|
||||
test,
|
||||
@ -351,17 +358,21 @@ class AishellAsrDataModule:
|
||||
@lru_cache()
|
||||
def train_cuts(self) -> CutSet:
|
||||
logging.info("About to get train cuts")
|
||||
cuts_train = load_manifest(
|
||||
self.args.manifest_dir / "cuts_train.json.gz"
|
||||
cuts_train = load_manifest_lazy(
|
||||
self.args.manifest_dir / "aishell_cuts_train.jsonl.gz"
|
||||
)
|
||||
return cuts_train
|
||||
|
||||
@lru_cache()
|
||||
def valid_cuts(self) -> CutSet:
|
||||
logging.info("About to get dev cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "aishell_cuts_dev.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def test_cuts(self) -> List[CutSet]:
|
||||
logging.info("About to get test cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "aishell_cuts_test.jsonl.gz"
|
||||
)
|
||||
|
@ -15,6 +15,14 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Usage
|
||||
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
||||
./tdnn_lstm_ctc/train.py \
|
||||
--world-size 4 \
|
||||
--num-epochs 20 \
|
||||
--max-duration 300
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
@ -110,9 +110,7 @@ class Conformer(Transformer):
|
||||
x = x.permute(1, 0, 2) # (N, T, C) -> (T, N, C)
|
||||
|
||||
# Caution: We assume the subsampling factor is 4!
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
lengths = ((x_lens - 1) // 2 - 1) // 2
|
||||
lengths = (((x_lens - 1) >> 1) - 1) >> 1
|
||||
assert x.size(0) == lengths.max().item()
|
||||
mask = make_pad_mask(lengths)
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from shutil import copyfile
|
||||
from typing import Optional, Tuple
|
||||
@ -386,7 +387,11 @@ def compute_loss(
|
||||
assert loss.requires_grad == is_training
|
||||
|
||||
info = MetricsTracker()
|
||||
info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
info["frames"] = (
|
||||
(feature_lens // params.subsampling_factor).sum().item()
|
||||
)
|
||||
|
||||
# Note: We use reduction=sum while computing the loss.
|
||||
info["loss"] = loss.detach().cpu().item()
|
||||
|
@ -18,7 +18,7 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet, load_manifest
|
||||
from lhotse import CutSet, load_manifest_lazy
|
||||
|
||||
|
||||
class AIDatatang200zh:
|
||||
@ -28,26 +28,26 @@ class AIDatatang200zh:
|
||||
manifest_dir:
|
||||
It is expected to contain the following files::
|
||||
|
||||
- cuts_dev_raw.jsonl.gz
|
||||
- cuts_train_raw.jsonl.gz
|
||||
- cuts_test_raw.jsonl.gz
|
||||
- aidatatang_cuts_dev.jsonl.gz
|
||||
- aidatatang_cuts_train.jsonl.gz
|
||||
- aidatatang_cuts_test.jsonl.gz
|
||||
"""
|
||||
self.manifest_dir = Path(manifest_dir)
|
||||
|
||||
def train_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_train_raw.jsonl.gz"
|
||||
f = self.manifest_dir / "aidatatang_cuts_train.jsonl.gz"
|
||||
logging.info(f"About to get train cuts from {f}")
|
||||
cuts_train = load_manifest(f)
|
||||
cuts_train = load_manifest_lazy(f)
|
||||
return cuts_train
|
||||
|
||||
def valid_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_valid_raw.jsonl.gz"
|
||||
f = self.manifest_dir / "aidatatang_cuts_valid.jsonl.gz"
|
||||
logging.info(f"About to get valid cuts from {f}")
|
||||
cuts_valid = load_manifest(f)
|
||||
cuts_valid = load_manifest_lazy(f)
|
||||
return cuts_valid
|
||||
|
||||
def test_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_test_raw.jsonl.gz"
|
||||
f = self.manifest_dir / "aidatatang_cuts_test.jsonl.gz"
|
||||
logging.info(f"About to get test cuts from {f}")
|
||||
cuts_test = load_manifest(f)
|
||||
cuts_test = load_manifest_lazy(f)
|
||||
return cuts_test
|
||||
|
@ -18,7 +18,7 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet, load_manifest
|
||||
from lhotse import CutSet, load_manifest_lazy
|
||||
|
||||
|
||||
class AIShell:
|
||||
@ -28,26 +28,26 @@ class AIShell:
|
||||
manifest_dir:
|
||||
It is expected to contain the following files::
|
||||
|
||||
- cuts_dev.json.gz
|
||||
- cuts_train.json.gz
|
||||
- cuts_test.json.gz
|
||||
- aishell_cuts_dev.jsonl.gz
|
||||
- aishell_cuts_train.jsonl.gz
|
||||
- aishell_cuts_test.jsonl.gz
|
||||
"""
|
||||
self.manifest_dir = Path(manifest_dir)
|
||||
|
||||
def train_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_train.json.gz"
|
||||
f = self.manifest_dir / "aishell_cuts_train.jsonl.gz"
|
||||
logging.info(f"About to get train cuts from {f}")
|
||||
cuts_train = load_manifest(f)
|
||||
cuts_train = load_manifest_lazy(f)
|
||||
return cuts_train
|
||||
|
||||
def valid_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_dev.json.gz"
|
||||
f = self.manifest_dir / "aishell_cuts_dev.jsonl.gz"
|
||||
logging.info(f"About to get valid cuts from {f}")
|
||||
cuts_valid = load_manifest(f)
|
||||
cuts_valid = load_manifest_lazy(f)
|
||||
return cuts_valid
|
||||
|
||||
def test_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_test.json.gz"
|
||||
f = self.manifest_dir / "aishell_cuts_test.jsonl.gz"
|
||||
logging.info(f"About to get test cuts from {f}")
|
||||
cuts_test = load_manifest(f)
|
||||
cuts_test = load_manifest_lazy(f)
|
||||
return cuts_test
|
||||
|
@ -24,7 +24,6 @@ from typing import Optional
|
||||
|
||||
from lhotse import CutSet, Fbank, FbankConfig
|
||||
from lhotse.dataset import (
|
||||
BucketingSampler,
|
||||
CutMix,
|
||||
DynamicBucketingSampler,
|
||||
K2SpeechRecognitionDataset,
|
||||
@ -73,8 +72,7 @@ class AsrDataModule:
|
||||
"--num-buckets",
|
||||
type=int,
|
||||
default=30,
|
||||
help="The number of buckets for the BucketingSampler "
|
||||
"and DynamicBucketingSampler."
|
||||
help="The number of buckets for the DynamicBucketingSampler "
|
||||
"(you might want to increase it for larger datasets).",
|
||||
)
|
||||
|
||||
@ -147,7 +145,6 @@ class AsrDataModule:
|
||||
def train_dataloaders(
|
||||
self,
|
||||
cuts_train: CutSet,
|
||||
dynamic_bucketing: bool,
|
||||
on_the_fly_feats: bool,
|
||||
cuts_musan: Optional[CutSet] = None,
|
||||
) -> DataLoader:
|
||||
@ -157,9 +154,6 @@ class AsrDataModule:
|
||||
Cuts for training.
|
||||
cuts_musan:
|
||||
If not None, it is the cuts for mixing.
|
||||
dynamic_bucketing:
|
||||
True to use DynamicBucketingSampler;
|
||||
False to use BucketingSampler.
|
||||
on_the_fly_feats:
|
||||
True to use OnTheFlyFeatures;
|
||||
False to use PrecomputedFeatures.
|
||||
@ -232,7 +226,6 @@ class AsrDataModule:
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
|
||||
if dynamic_bucketing:
|
||||
logging.info("Using DynamicBucketingSampler.")
|
||||
train_sampler = DynamicBucketingSampler(
|
||||
cuts_train,
|
||||
@ -241,16 +234,6 @@ class AsrDataModule:
|
||||
num_buckets=self.args.num_buckets,
|
||||
drop_last=True,
|
||||
)
|
||||
else:
|
||||
logging.info("Using BucketingSampler.")
|
||||
train_sampler = BucketingSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
num_buckets=self.args.num_buckets,
|
||||
bucket_method="equal_duration",
|
||||
drop_last=True,
|
||||
)
|
||||
|
||||
logging.info("About to create train dataloader")
|
||||
train_dl = DataLoader(
|
||||
@ -279,7 +262,7 @@ class AsrDataModule:
|
||||
cut_transforms=transforms,
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
valid_sampler = BucketingSampler(
|
||||
valid_sampler = DynamicBucketingSampler(
|
||||
cuts_valid,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
@ -303,8 +286,10 @@ class AsrDataModule:
|
||||
else PrecomputedFeatures(),
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
sampler = BucketingSampler(
|
||||
cuts, max_duration=self.args.max_duration, shuffle=False
|
||||
sampler = DynamicBucketingSampler(
|
||||
cuts,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
)
|
||||
logging.debug("About to create test dataloader")
|
||||
test_dl = DataLoader(
|
||||
|
@ -41,6 +41,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2"
|
||||
import argparse
|
||||
import logging
|
||||
import random
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from shutil import copyfile
|
||||
from typing import Optional, Tuple
|
||||
@ -55,7 +56,7 @@ from asr_datamodule import AsrDataModule
|
||||
from conformer import Conformer
|
||||
from decoder import Decoder
|
||||
from joiner import Joiner
|
||||
from lhotse import CutSet, load_manifest
|
||||
from lhotse import CutSet, load_manifest_lazy
|
||||
from lhotse.cut import Cut
|
||||
from lhotse.utils import fix_random_seed
|
||||
from model import Transducer
|
||||
@ -446,7 +447,11 @@ def compute_loss(
|
||||
assert loss.requires_grad == is_training
|
||||
|
||||
info = MetricsTracker()
|
||||
info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
info["frames"] = (
|
||||
(feature_lens // params.subsampling_factor).sum().item()
|
||||
)
|
||||
|
||||
# Note: We use reduction=sum while computing the loss.
|
||||
info["loss"] = loss.detach().cpu().item()
|
||||
@ -635,20 +640,16 @@ def train_one_epoch(
|
||||
|
||||
def filter_short_and_long_utterances(cuts: CutSet) -> CutSet:
|
||||
def remove_short_and_long_utt(c: Cut):
|
||||
# Keep only utterances with duration between 1 second and 12 seconds
|
||||
# Keep only utterances with duration between 1 second and 20 seconds
|
||||
#
|
||||
# Caution: There is a reason to select 12.0 here. Please see
|
||||
# ../local/display_manifest_statistics.py
|
||||
#
|
||||
# You should use ../local/display_manifest_statistics.py to get
|
||||
# an utterance duration distribution for your dataset to select
|
||||
# the threshold
|
||||
return 1.0 <= c.duration <= 12.0
|
||||
|
||||
num_in_total = len(cuts)
|
||||
cuts = cuts.filter(remove_short_and_long_utt)
|
||||
|
||||
num_left = len(cuts)
|
||||
num_removed = num_in_total - num_left
|
||||
removed_percent = num_removed / num_in_total * 100
|
||||
|
||||
logging.info(f"Before removing short and long utterances: {num_in_total}")
|
||||
logging.info(f"After removing short and long utterances: {num_left}")
|
||||
logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
|
||||
|
||||
return cuts
|
||||
|
||||
|
||||
@ -728,15 +729,14 @@ def run(rank, world_size, args):
|
||||
train_cuts = aishell.train_cuts()
|
||||
train_cuts = filter_short_and_long_utterances(train_cuts)
|
||||
|
||||
datatang = AIDatatang200zh(
|
||||
manifest_dir=f"{args.manifest_dir}/aidatatang_200zh"
|
||||
)
|
||||
datatang = AIDatatang200zh(manifest_dir=args.manifest_dir)
|
||||
train_datatang_cuts = datatang.train_cuts()
|
||||
train_datatang_cuts = filter_short_and_long_utterances(train_datatang_cuts)
|
||||
train_datatang_cuts = train_datatang_cuts.repeat(times=None)
|
||||
|
||||
if args.enable_musan:
|
||||
cuts_musan = load_manifest(
|
||||
Path(args.manifest_dir) / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
else:
|
||||
cuts_musan = None
|
||||
@ -745,22 +745,23 @@ def run(rank, world_size, args):
|
||||
|
||||
train_dl = asr_datamodule.train_dataloaders(
|
||||
train_cuts,
|
||||
dynamic_bucketing=False,
|
||||
on_the_fly_feats=False,
|
||||
cuts_musan=cuts_musan,
|
||||
)
|
||||
|
||||
datatang_train_dl = asr_datamodule.train_dataloaders(
|
||||
train_datatang_cuts,
|
||||
dynamic_bucketing=True,
|
||||
on_the_fly_feats=True,
|
||||
on_the_fly_feats=False,
|
||||
cuts_musan=cuts_musan,
|
||||
)
|
||||
|
||||
valid_cuts = aishell.valid_cuts()
|
||||
valid_dl = asr_datamodule.valid_dataloaders(valid_cuts)
|
||||
|
||||
for dl in [train_dl, datatang_train_dl]:
|
||||
for dl in [
|
||||
train_dl,
|
||||
# datatang_train_dl
|
||||
]:
|
||||
scan_pessimistic_batches_for_oom(
|
||||
model=model,
|
||||
train_dl=dl,
|
||||
|
@ -37,6 +37,7 @@ export CUDA_VISIBLE_DEVICES="0,1,2"
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from shutil import copyfile
|
||||
from typing import Optional, Tuple
|
||||
@ -411,7 +412,11 @@ def compute_loss(
|
||||
assert loss.requires_grad == is_training
|
||||
|
||||
info = MetricsTracker()
|
||||
info["frames"] = (feature_lens // params.subsampling_factor).sum().item()
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
info["frames"] = (
|
||||
(feature_lens // params.subsampling_factor).sum().item()
|
||||
)
|
||||
|
||||
# Note: We use reduction=sum while computing the loss.
|
||||
info["loss"] = loss.detach().cpu().item()
|
||||
|
@ -43,7 +43,7 @@ torch.set_num_interop_threads(1)
|
||||
|
||||
|
||||
def compute_fbank_alimeeting(num_mel_bins: int = 80):
|
||||
src_dir = Path("data/manifests/alimeeting")
|
||||
src_dir = Path("data/manifests")
|
||||
output_dir = Path("data/fbank")
|
||||
num_jobs = min(15, os.cpu_count())
|
||||
|
||||
@ -52,11 +52,14 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
|
||||
"eval",
|
||||
"test",
|
||||
)
|
||||
|
||||
prefix = "alimeeting"
|
||||
suffix = "jsonl.gz"
|
||||
manifests = read_manifests_if_cached(
|
||||
dataset_parts=dataset_parts,
|
||||
output_dir=src_dir,
|
||||
prefix="alimeeting",
|
||||
suffix="jsonl.gz",
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
)
|
||||
assert manifests is not None
|
||||
|
||||
@ -64,7 +67,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
|
||||
|
||||
with get_executor() as ex: # Initialize the executor only once.
|
||||
for partition, m in manifests.items():
|
||||
if (output_dir / f"cuts_{partition}.json.gz").is_file():
|
||||
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
|
||||
logging.info(f"{partition} already exists - skipping.")
|
||||
continue
|
||||
logging.info(f"Processing {partition}")
|
||||
@ -83,7 +86,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
|
||||
|
||||
cut_set = cut_set.compute_and_store_features(
|
||||
extractor=extractor,
|
||||
storage_path=f"{output_dir}/feats_{partition}",
|
||||
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
|
||||
# when an executor is specified, make more partitions
|
||||
num_jobs=cur_num_jobs,
|
||||
executor=ex,
|
||||
@ -95,7 +98,7 @@ def compute_fbank_alimeeting(num_mel_bins: int = 80):
|
||||
keep_overlapping=False,
|
||||
min_duration=None,
|
||||
)
|
||||
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
|
||||
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
|
||||
|
||||
|
||||
def get_args():
|
||||
|
@ -25,19 +25,19 @@ for usage.
|
||||
"""
|
||||
|
||||
|
||||
from lhotse import load_manifest
|
||||
from lhotse import load_manifest_lazy
|
||||
|
||||
|
||||
def main():
|
||||
paths = [
|
||||
"./data/fbank/cuts_train.json.gz",
|
||||
"./data/fbank/cuts_eval.json.gz",
|
||||
"./data/fbank/cuts_test.json.gz",
|
||||
"./data/fbank/alimeeting_cuts_train.jsonl.gz",
|
||||
"./data/fbank/alimeeting_cuts_eval.jsonl.gz",
|
||||
"./data/fbank/alimeeting_cuts_test.jsonl.gz",
|
||||
]
|
||||
|
||||
for path in paths:
|
||||
print(f"Starting display the statistics for {path}")
|
||||
cuts = load_manifest(path)
|
||||
cuts = load_manifest_lazy(path)
|
||||
cuts.describe()
|
||||
|
||||
|
||||
@ -45,7 +45,7 @@ if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
"""
|
||||
Starting display the statistics for ./data/fbank/cuts_train.json.gz
|
||||
Starting display the statistics for ./data/fbank/alimeeting_cuts_train.jsonl.gz
|
||||
Cuts count: 559092
|
||||
Total duration (hours): 424.6
|
||||
Speech duration (hours): 424.6 (100.0%)
|
||||
@ -61,7 +61,7 @@ min 0.0
|
||||
99.5% 14.7
|
||||
99.9% 16.2
|
||||
max 284.3
|
||||
Starting display the statistics for ./data/fbank/cuts_eval.json.gz
|
||||
Starting display the statistics for ./data/fbank/alimeeting_cuts_eval.jsonl.gz
|
||||
Cuts count: 6457
|
||||
Total duration (hours): 4.9
|
||||
Speech duration (hours): 4.9 (100.0%)
|
||||
@ -77,7 +77,7 @@ min 0.1
|
||||
99.5% 14.1
|
||||
99.9% 14.7
|
||||
max 15.8
|
||||
Starting display the statistics for ./data/fbank/cuts_test.json.gz
|
||||
Starting display the statistics for ./data/fbank/alimeeting_cuts_test.jsonl.gz
|
||||
Cuts count: 16358
|
||||
Total duration (hours): 12.5
|
||||
Speech duration (hours): 12.5 (100.0%)
|
||||
|
@ -27,7 +27,7 @@ from lhotse import (
|
||||
CutSet,
|
||||
Fbank,
|
||||
FbankConfig,
|
||||
load_manifest,
|
||||
load_manifest_lazy,
|
||||
set_caching_enabled,
|
||||
)
|
||||
from lhotse.dataset import (
|
||||
@ -204,8 +204,8 @@ class AlimeetingAsrDataModule:
|
||||
The state dict for the training sampler.
|
||||
"""
|
||||
logging.info("About to get Musan cuts")
|
||||
cuts_musan = load_manifest(
|
||||
self.args.manifest_dir / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
self.args.manifest_dir / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
|
||||
transforms = []
|
||||
@ -401,14 +401,20 @@ class AlimeetingAsrDataModule:
|
||||
@lru_cache()
|
||||
def train_cuts(self) -> CutSet:
|
||||
logging.info("About to get train cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "alimeeting_cuts_train.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def valid_cuts(self) -> CutSet:
|
||||
logging.info("About to get dev cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_eval.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "alimeeting_cuts_eval.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def test_cuts(self) -> List[CutSet]:
|
||||
logging.info("About to get test cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "alimeeting_cuts_test.jsonl.gz"
|
||||
)
|
||||
|
@ -20,9 +20,8 @@ import logging
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
|
||||
from lhotse.dataset import (
|
||||
BucketingSampler,
|
||||
CutConcatenate,
|
||||
CutMix,
|
||||
DynamicBucketingSampler,
|
||||
@ -190,8 +189,8 @@ class GigaSpeechAsrDataModule:
|
||||
|
||||
def train_dataloaders(self, cuts_train: CutSet) -> DataLoader:
|
||||
logging.info("About to get Musan cuts")
|
||||
cuts_musan = load_manifest(
|
||||
self.args.manifest_dir / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
self.args.manifest_dir / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
|
||||
transforms = []
|
||||
@ -315,7 +314,7 @@ class GigaSpeechAsrDataModule:
|
||||
cut_transforms=transforms,
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
valid_sampler = BucketingSampler(
|
||||
valid_sampler = DynamicBucketingSampler(
|
||||
cuts_valid,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
@ -339,8 +338,10 @@ class GigaSpeechAsrDataModule:
|
||||
else PrecomputedFeatures(),
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
sampler = BucketingSampler(
|
||||
cuts, max_duration=self.args.max_duration, shuffle=False
|
||||
sampler = DynamicBucketingSampler(
|
||||
cuts,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
)
|
||||
logging.debug("About to create test dataloader")
|
||||
test_dl = DataLoader(
|
||||
@ -361,7 +362,9 @@ class GigaSpeechAsrDataModule:
|
||||
@lru_cache()
|
||||
def dev_cuts(self) -> CutSet:
|
||||
logging.info("About to get dev cuts")
|
||||
cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
|
||||
cuts_valid = load_manifest_lazy(
|
||||
self.args.manifest_dir / "cuts_DEV.jsonl.gz"
|
||||
)
|
||||
if self.args.small_dev:
|
||||
return cuts_valid.subset(first=1000)
|
||||
else:
|
||||
@ -370,4 +373,4 @@ class GigaSpeechAsrDataModule:
|
||||
@lru_cache()
|
||||
def test_cuts(self) -> CutSet:
|
||||
logging.info("About to get test cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
|
||||
return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
|
||||
|
@ -1,103 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2021 Johns Hopkins University (Piotr Żelasko)
|
||||
# Copyright 2021 Xiaomi Corp. (Fangjun Kuang)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from lhotse import (
|
||||
CutSet,
|
||||
KaldifeatFbank,
|
||||
KaldifeatFbankConfig,
|
||||
combine,
|
||||
)
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
# Torch's multithreaded behavior needs to be disabled or
|
||||
# it wastes a lot of CPU and slow things down.
|
||||
# Do this outside of main() in case it needs to take effect
|
||||
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||
torch.set_num_threads(1)
|
||||
torch.set_num_interop_threads(1)
|
||||
|
||||
|
||||
def compute_fbank_musan():
|
||||
src_dir = Path("data/manifests")
|
||||
output_dir = Path("data/fbank")
|
||||
|
||||
# number of workers in dataloader
|
||||
num_workers = 10
|
||||
|
||||
# number of seconds in a batch
|
||||
batch_duration = 600
|
||||
|
||||
dataset_parts = (
|
||||
"music",
|
||||
"speech",
|
||||
"noise",
|
||||
)
|
||||
|
||||
manifests = read_manifests_if_cached(
|
||||
prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir
|
||||
)
|
||||
assert manifests is not None
|
||||
|
||||
musan_cuts_path = output_dir / "cuts_musan.json.gz"
|
||||
|
||||
if musan_cuts_path.is_file():
|
||||
logging.info(f"{musan_cuts_path} already exists - skipping")
|
||||
return
|
||||
|
||||
logging.info("Extracting features for Musan")
|
||||
|
||||
device = torch.device("cpu")
|
||||
if torch.cuda.is_available():
|
||||
device = torch.device("cuda", 0)
|
||||
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
|
||||
|
||||
logging.info(f"device: {device}")
|
||||
|
||||
musan_cuts = (
|
||||
CutSet.from_manifests(
|
||||
recordings=combine(
|
||||
part["recordings"] for part in manifests.values()
|
||||
)
|
||||
)
|
||||
.cut_into_windows(10.0)
|
||||
.filter(lambda c: c.duration > 5)
|
||||
.compute_and_store_features_batch(
|
||||
extractor=extractor,
|
||||
storage_path=f"{output_dir}/feats_musan",
|
||||
num_workers=num_workers,
|
||||
batch_duration=batch_duration,
|
||||
)
|
||||
)
|
||||
musan_cuts.to_json(musan_cuts_path)
|
||||
|
||||
|
||||
def main():
|
||||
formatter = (
|
||||
"%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||
)
|
||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||
|
||||
compute_fbank_musan()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
1
egs/gigaspeech/ASR/local/compute_fbank_musan.py
Symbolic link
1
egs/gigaspeech/ASR/local/compute_fbank_musan.py
Symbolic link
@ -0,0 +1 @@
|
||||
../../../librispeech/ASR/local/compute_fbank_musan.py
|
@ -23,9 +23,8 @@ from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
|
||||
from lhotse.dataset import (
|
||||
BucketingSampler,
|
||||
CutConcatenate,
|
||||
CutMix,
|
||||
DynamicBucketingSampler,
|
||||
@ -217,8 +216,8 @@ class GigaSpeechAsrDataModule:
|
||||
if self.args.enable_musan:
|
||||
logging.info("Enable MUSAN")
|
||||
logging.info("About to get Musan cuts")
|
||||
cuts_musan = load_manifest(
|
||||
self.args.manifest_dir / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
self.args.manifest_dir / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
transforms.append(
|
||||
CutMix(
|
||||
@ -358,7 +357,7 @@ class GigaSpeechAsrDataModule:
|
||||
cut_transforms=transforms,
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
valid_sampler = BucketingSampler(
|
||||
valid_sampler = DynamicBucketingSampler(
|
||||
cuts_valid,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
@ -382,8 +381,10 @@ class GigaSpeechAsrDataModule:
|
||||
else PrecomputedFeatures(),
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
sampler = BucketingSampler(
|
||||
cuts, max_duration=self.args.max_duration, shuffle=False
|
||||
sampler = DynamicBucketingSampler(
|
||||
cuts,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
)
|
||||
logging.debug("About to create test dataloader")
|
||||
test_dl = DataLoader(
|
||||
@ -404,7 +405,9 @@ class GigaSpeechAsrDataModule:
|
||||
@lru_cache()
|
||||
def dev_cuts(self) -> CutSet:
|
||||
logging.info("About to get dev cuts")
|
||||
cuts_valid = load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
|
||||
cuts_valid = load_manifest_lazy(
|
||||
self.args.manifest_dir / "cuts_DEV.jsonl.gz"
|
||||
)
|
||||
if self.args.small_dev:
|
||||
return cuts_valid.subset(first=1000)
|
||||
else:
|
||||
@ -413,4 +416,4 @@ class GigaSpeechAsrDataModule:
|
||||
@lru_cache()
|
||||
def test_cuts(self) -> CutSet:
|
||||
logging.info("About to get test cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
|
||||
return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
|
||||
|
@ -96,14 +96,14 @@ def get_parser():
|
||||
|
||||
- labels_xxx.h5
|
||||
- aux_labels_xxx.h5
|
||||
- cuts_xxx.json.gz
|
||||
- librispeech_cuts_xxx.jsonl.gz
|
||||
|
||||
where xxx is the value of `--dataset`. For instance, if
|
||||
`--dataset` is `train-clean-100`, it will contain 3 files:
|
||||
|
||||
- `labels_train-clean-100.h5`
|
||||
- `aux_labels_train-clean-100.h5`
|
||||
- `cuts_train-clean-100.json.gz`
|
||||
- `librispeech_cuts_train-clean-100.jsonl.gz`
|
||||
|
||||
Note: Both labels_xxx.h5 and aux_labels_xxx.h5 contain framewise
|
||||
alignment. The difference is that labels_xxx.h5 contains repeats.
|
||||
@ -289,7 +289,9 @@ def main():
|
||||
|
||||
out_labels_ali_filename = out_dir / f"labels_{params.dataset}.h5"
|
||||
out_aux_labels_ali_filename = out_dir / f"aux_labels_{params.dataset}.h5"
|
||||
out_manifest_filename = out_dir / f"cuts_{params.dataset}.json.gz"
|
||||
out_manifest_filename = (
|
||||
out_dir / f"librispeech_cuts_{params.dataset}.jsonl.gz"
|
||||
)
|
||||
|
||||
for f in (
|
||||
out_labels_ali_filename,
|
||||
|
@ -17,6 +17,17 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Usage:
|
||||
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
||||
./conformer_ctc/train.py \
|
||||
--exp-dir ./conformer_ctc/exp \
|
||||
--world-size 4 \
|
||||
--full-libri 1 \
|
||||
--max-duration 200 \
|
||||
--num-epochs 20
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
@ -29,6 +40,7 @@ import torch.multiprocessing as mp
|
||||
import torch.nn as nn
|
||||
from asr_datamodule import LibriSpeechAsrDataModule
|
||||
from conformer import Conformer
|
||||
from lhotse.cut import Cut
|
||||
from lhotse.utils import fix_random_seed
|
||||
from torch import Tensor
|
||||
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||
@ -676,6 +688,20 @@ def run(rank, world_size, args):
|
||||
if params.full_libri:
|
||||
train_cuts += librispeech.train_clean_360_cuts()
|
||||
train_cuts += librispeech.train_other_500_cuts()
|
||||
|
||||
def remove_short_and_long_utt(c: Cut):
|
||||
# Keep only utterances with duration between 1 second and 20 seconds
|
||||
#
|
||||
# Caution: There is a reason to select 20.0 here. Please see
|
||||
# ../local/display_manifest_statistics.py
|
||||
#
|
||||
# You should use ../local/display_manifest_statistics.py to get
|
||||
# an utterance duration distribution for your dataset to select
|
||||
# the threshold
|
||||
return 1.0 <= c.duration <= 20.0
|
||||
|
||||
train_cuts = train_cuts.filter(remove_short_and_long_utt)
|
||||
|
||||
train_dl = librispeech.train_dataloaders(train_cuts)
|
||||
|
||||
valid_cuts = librispeech.dev_clean_cuts()
|
||||
|
@ -20,11 +20,7 @@ import logging
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from lhotse import (
|
||||
CutSet,
|
||||
KaldifeatFbank,
|
||||
KaldifeatFbankConfig,
|
||||
)
|
||||
from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
|
||||
|
||||
# Torch's multithreaded behavior needs to be disabled or
|
||||
# it wastes a lot of CPU and slow things down.
|
||||
@ -51,13 +47,16 @@ def compute_fbank_gigaspeech_dev_test():
|
||||
|
||||
logging.info(f"device: {device}")
|
||||
|
||||
prefix = "gigaspeech"
|
||||
suffix = "jsonl.gz"
|
||||
|
||||
for partition in subsets:
|
||||
cuts_path = in_out_dir / f"cuts_{partition}.jsonl.gz"
|
||||
cuts_path = in_out_dir / f"{prefix}_cuts_{partition}.{suffix}"
|
||||
if cuts_path.is_file():
|
||||
logging.info(f"{cuts_path} exists - skipping")
|
||||
continue
|
||||
|
||||
raw_cuts_path = in_out_dir / f"cuts_{partition}_raw.jsonl.gz"
|
||||
raw_cuts_path = in_out_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
|
||||
|
||||
logging.info(f"Loading {raw_cuts_path}")
|
||||
cut_set = CutSet.from_file(raw_cuts_path)
|
||||
@ -66,7 +65,7 @@ def compute_fbank_gigaspeech_dev_test():
|
||||
|
||||
cut_set = cut_set.compute_and_store_features_batch(
|
||||
extractor=extractor,
|
||||
storage_path=f"{in_out_dir}/feats_{partition}",
|
||||
storage_path=f"{in_out_dir}/{prefix}_feats_{partition}",
|
||||
num_workers=num_workers,
|
||||
batch_duration=batch_duration,
|
||||
)
|
||||
|
@ -77,7 +77,7 @@ def get_parser():
|
||||
|
||||
def compute_fbank_gigaspeech_splits(args):
|
||||
num_splits = args.num_splits
|
||||
output_dir = f"data/fbank/XL_split_{num_splits}"
|
||||
output_dir = f"data/fbank/gigaspeech_XL_split_{num_splits}"
|
||||
output_dir = Path(output_dir)
|
||||
assert output_dir.exists(), f"{output_dir} does not exist!"
|
||||
|
||||
@ -96,17 +96,19 @@ def compute_fbank_gigaspeech_splits(args):
|
||||
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
|
||||
logging.info(f"device: {device}")
|
||||
|
||||
prefix = "gigaspeech"
|
||||
|
||||
num_digits = 8 # num_digits is fixed by lhotse split-lazy
|
||||
for i in range(start, stop):
|
||||
idx = f"{i + 1}".zfill(num_digits)
|
||||
logging.info(f"Processing {idx}/{num_splits}")
|
||||
|
||||
cuts_path = output_dir / f"cuts_XL.{idx}.jsonl.gz"
|
||||
cuts_path = output_dir / f"{prefix}_cuts_XL.{idx}.jsonl.gz"
|
||||
if cuts_path.is_file():
|
||||
logging.info(f"{cuts_path} exists - skipping")
|
||||
continue
|
||||
|
||||
raw_cuts_path = output_dir / f"cuts_XL_raw.{idx}.jsonl.gz"
|
||||
raw_cuts_path = output_dir / f"{prefix}_cuts_XL_raw.{idx}.jsonl.gz"
|
||||
if not raw_cuts_path.is_file():
|
||||
logging.info(f"{raw_cuts_path} does not exist - skipping it")
|
||||
continue
|
||||
@ -115,13 +117,13 @@ def compute_fbank_gigaspeech_splits(args):
|
||||
cut_set = CutSet.from_file(raw_cuts_path)
|
||||
|
||||
logging.info("Computing features")
|
||||
if (output_dir / f"feats_XL_{idx}.lca").exists():
|
||||
logging.info(f"Removing {output_dir}/feats_XL_{idx}.lca")
|
||||
os.remove(output_dir / f"feats_XL_{idx}.lca")
|
||||
if (output_dir / f"{prefix}_feats_XL_{idx}.lca").exists():
|
||||
logging.info(f"Removing {output_dir}/{prefix}_feats_XL_{idx}.lca")
|
||||
os.remove(output_dir / f"{prefix}_feats_XL_{idx}.lca")
|
||||
|
||||
cut_set = cut_set.compute_and_store_features_batch(
|
||||
extractor=extractor,
|
||||
storage_path=f"{output_dir}/feats_XL_{idx}",
|
||||
storage_path=f"{output_dir}/{prefix}_feats_XL_{idx}",
|
||||
num_workers=args.num_workers,
|
||||
batch_duration=args.batch_duration,
|
||||
)
|
||||
|
@ -28,7 +28,7 @@ import os
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
from icefall.utils import get_executor
|
||||
@ -56,8 +56,13 @@ def compute_fbank_librispeech():
|
||||
"train-clean-360",
|
||||
"train-other-500",
|
||||
)
|
||||
prefix = "librispeech"
|
||||
suffix = "jsonl.gz"
|
||||
manifests = read_manifests_if_cached(
|
||||
prefix="librispeech", dataset_parts=dataset_parts, output_dir=src_dir
|
||||
dataset_parts=dataset_parts,
|
||||
output_dir=src_dir,
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
)
|
||||
assert manifests is not None
|
||||
|
||||
@ -65,7 +70,8 @@ def compute_fbank_librispeech():
|
||||
|
||||
with get_executor() as ex: # Initialize the executor only once.
|
||||
for partition, m in manifests.items():
|
||||
if (output_dir / f"cuts_{partition}.json.gz").is_file():
|
||||
cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
|
||||
if (output_dir / cuts_filename).is_file():
|
||||
logging.info(f"{partition} already exists - skipping.")
|
||||
continue
|
||||
logging.info(f"Processing {partition}")
|
||||
@ -81,13 +87,13 @@ def compute_fbank_librispeech():
|
||||
)
|
||||
cut_set = cut_set.compute_and_store_features(
|
||||
extractor=extractor,
|
||||
storage_path=f"{output_dir}/feats_{partition}",
|
||||
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
|
||||
# when an executor is specified, make more partitions
|
||||
num_jobs=num_jobs if ex is None else 80,
|
||||
executor=ex,
|
||||
storage_type=ChunkedLilcomHdf5Writer,
|
||||
storage_type=LilcomChunkyWriter,
|
||||
)
|
||||
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
|
||||
cut_set.to_file(output_dir / cuts_filename)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -28,7 +28,7 @@ import os
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from lhotse import ChunkedLilcomHdf5Writer, CutSet, Fbank, FbankConfig, combine
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter, combine
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
from icefall.utils import get_executor
|
||||
@ -52,12 +52,22 @@ def compute_fbank_musan():
|
||||
"speech",
|
||||
"noise",
|
||||
)
|
||||
prefix = "musan"
|
||||
suffix = "jsonl.gz"
|
||||
manifests = read_manifests_if_cached(
|
||||
prefix="musan", dataset_parts=dataset_parts, output_dir=src_dir
|
||||
dataset_parts=dataset_parts,
|
||||
output_dir=src_dir,
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
)
|
||||
assert manifests is not None
|
||||
|
||||
musan_cuts_path = output_dir / "cuts_musan.json.gz"
|
||||
assert len(manifests) == len(dataset_parts), (
|
||||
len(manifests),
|
||||
len(dataset_parts),
|
||||
)
|
||||
|
||||
musan_cuts_path = output_dir / "musan_cuts.jsonl.gz"
|
||||
|
||||
if musan_cuts_path.is_file():
|
||||
logging.info(f"{musan_cuts_path} already exists - skipping")
|
||||
@ -79,13 +89,13 @@ def compute_fbank_musan():
|
||||
.filter(lambda c: c.duration > 5)
|
||||
.compute_and_store_features(
|
||||
extractor=extractor,
|
||||
storage_path=f"{output_dir}/feats_musan",
|
||||
storage_path=f"{output_dir}/musan_feats",
|
||||
num_jobs=num_jobs if ex is None else 80,
|
||||
executor=ex,
|
||||
storage_type=ChunkedLilcomHdf5Writer,
|
||||
storage_type=LilcomChunkyWriter,
|
||||
)
|
||||
)
|
||||
musan_cuts.to_json(musan_cuts_path)
|
||||
musan_cuts.to_file(musan_cuts_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -25,19 +25,19 @@ for usage.
|
||||
"""
|
||||
|
||||
|
||||
from lhotse import load_manifest
|
||||
from lhotse import load_manifest_lazy
|
||||
|
||||
|
||||
def main():
|
||||
path = "./data/fbank/cuts_train-clean-100.json.gz"
|
||||
path = "./data/fbank/cuts_train-clean-360.json.gz"
|
||||
path = "./data/fbank/cuts_train-other-500.json.gz"
|
||||
path = "./data/fbank/cuts_dev-clean.json.gz"
|
||||
path = "./data/fbank/cuts_dev-other.json.gz"
|
||||
path = "./data/fbank/cuts_test-clean.json.gz"
|
||||
path = "./data/fbank/cuts_test-other.json.gz"
|
||||
# path = "./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz"
|
||||
# path = "./data/fbank/librispeech_cuts_train-clean-360.jsonl.gz"
|
||||
# path = "./data/fbank/librispeech_cuts_train-other-500.jsonl.gz"
|
||||
# path = "./data/fbank/librispeech_cuts_dev-clean.jsonl.gz"
|
||||
# path = "./data/fbank/librispeech_cuts_dev-other.jsonl.gz"
|
||||
# path = "./data/fbank/librispeech_cuts_test-clean.jsonl.gz"
|
||||
path = "./data/fbank/librispeech_cuts_test-other.jsonl.gz"
|
||||
|
||||
cuts = load_manifest(path)
|
||||
cuts = load_manifest_lazy(path)
|
||||
cuts.describe()
|
||||
|
||||
|
||||
|
@ -58,17 +58,19 @@ def preprocess_giga_speech():
|
||||
)
|
||||
|
||||
logging.info("Loading manifest (may take 4 minutes)")
|
||||
prefix = "gigaspeech"
|
||||
suffix = "jsonl.gz"
|
||||
manifests = read_manifests_if_cached(
|
||||
dataset_parts=dataset_parts,
|
||||
output_dir=src_dir,
|
||||
prefix="gigaspeech",
|
||||
suffix="jsonl.gz",
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
)
|
||||
assert manifests is not None
|
||||
|
||||
for partition, m in manifests.items():
|
||||
logging.info(f"Processing {partition}")
|
||||
raw_cuts_path = output_dir / f"cuts_{partition}_raw.jsonl.gz"
|
||||
raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.{suffix}"
|
||||
if raw_cuts_path.is_file():
|
||||
logging.info(f"{partition} already exists - skipping")
|
||||
continue
|
||||
|
@ -25,7 +25,7 @@ We will add more checks later if needed.
|
||||
Usage example:
|
||||
|
||||
python3 ./local/validate_manifest.py \
|
||||
./data/fbank/cuts_train-clean-100.json.gz
|
||||
./data/fbank/librispeech_cuts_train-clean-100.jsonl.gz
|
||||
|
||||
"""
|
||||
|
||||
@ -33,7 +33,7 @@ import argparse
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import load_manifest, CutSet
|
||||
from lhotse import CutSet, load_manifest_lazy
|
||||
from lhotse.cut import Cut
|
||||
|
||||
|
||||
@ -76,7 +76,7 @@ def main():
|
||||
logging.info(f"Validating {manifest}")
|
||||
|
||||
assert manifest.is_file(), f"{manifest} does not exist"
|
||||
cut_set = load_manifest(manifest)
|
||||
cut_set = load_manifest_lazy(manifest)
|
||||
assert isinstance(cut_set, CutSet)
|
||||
|
||||
for c in cut_set:
|
||||
|
@ -40,9 +40,9 @@ dl_dir=$PWD/download
|
||||
# It will generate data/lang_bpe_xxx,
|
||||
# data/lang_bpe_yyy if the array contains xxx, yyy
|
||||
vocab_sizes=(
|
||||
5000
|
||||
2000
|
||||
1000
|
||||
# 5000
|
||||
# 2000
|
||||
# 1000
|
||||
500
|
||||
)
|
||||
|
||||
@ -132,7 +132,7 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
)
|
||||
for part in ${parts[@]}; do
|
||||
python3 ./local/validate_manifest.py \
|
||||
data/fbank/cuts_${part}.json.gz
|
||||
data/fbank/librispeech_cuts_${part}.jsonl.gz
|
||||
done
|
||||
touch data/fbank/.librispeech-validated.done
|
||||
fi
|
||||
|
@ -124,9 +124,9 @@ fi
|
||||
|
||||
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||
log "Stage 4: Split XL subset into ${num_splits} pieces"
|
||||
split_dir=data/fbank/XL_split_${num_splits}
|
||||
split_dir=data/fbank/gigaspeech_XL_split_${num_splits}
|
||||
if [ ! -f $split_dir/.split_completed ]; then
|
||||
lhotse split-lazy ./data/fbank/cuts_XL_raw.jsonl.gz $split_dir $chunk_size
|
||||
lhotse split-lazy ./data/fbank/gigaspeech_cuts_XL_raw.jsonl.gz $split_dir $chunk_size
|
||||
touch $split_dir/.split_completed
|
||||
fi
|
||||
fi
|
||||
|
@ -807,28 +807,8 @@ def run(rank, world_size, args):
|
||||
# the threshold
|
||||
return 1.0 <= c.duration <= 20.0
|
||||
|
||||
num_in_total = len(train_cuts)
|
||||
|
||||
train_cuts = train_cuts.filter(remove_short_and_long_utt)
|
||||
|
||||
try:
|
||||
num_left = len(train_cuts)
|
||||
num_removed = num_in_total - num_left
|
||||
removed_percent = num_removed / num_in_total * 100
|
||||
|
||||
logging.info(
|
||||
f"Before removing short and long utterances: {num_in_total}"
|
||||
)
|
||||
logging.info(f"After removing short and long utterances: {num_left}")
|
||||
logging.info(
|
||||
f"Removed {num_removed} utterances ({removed_percent:.5f}%)"
|
||||
)
|
||||
except TypeError as e:
|
||||
# You can ignore this error as previous versions of Lhotse work fine
|
||||
# for the above code. In recent versions of Lhotse, it uses
|
||||
# lazy filter, producing cutsets that don't have the __len__ method
|
||||
logging.info(str(e))
|
||||
|
||||
if params.start_batch > 0 and checkpoints and "sampler" in checkpoints:
|
||||
# We only load the sampler's state dict when it loads a checkpoint
|
||||
# saved in the middle of an epoch
|
||||
|
@ -22,7 +22,6 @@ from typing import Optional
|
||||
|
||||
from lhotse import CutSet, Fbank, FbankConfig
|
||||
from lhotse.dataset import (
|
||||
BucketingSampler,
|
||||
CutMix,
|
||||
DynamicBucketingSampler,
|
||||
K2SpeechRecognitionDataset,
|
||||
@ -71,8 +70,7 @@ class AsrDataModule:
|
||||
"--num-buckets",
|
||||
type=int,
|
||||
default=30,
|
||||
help="The number of buckets for the BucketingSampler "
|
||||
"and DynamicBucketingSampler."
|
||||
help="The number of buckets for the DynamicBucketingSampler. "
|
||||
"(you might want to increase it for larger datasets).",
|
||||
)
|
||||
|
||||
@ -152,7 +150,6 @@ class AsrDataModule:
|
||||
def train_dataloaders(
|
||||
self,
|
||||
cuts_train: CutSet,
|
||||
dynamic_bucketing: bool,
|
||||
on_the_fly_feats: bool,
|
||||
cuts_musan: Optional[CutSet] = None,
|
||||
) -> DataLoader:
|
||||
@ -162,9 +159,6 @@ class AsrDataModule:
|
||||
Cuts for training.
|
||||
cuts_musan:
|
||||
If not None, it is the cuts for mixing.
|
||||
dynamic_bucketing:
|
||||
True to use DynamicBucketingSampler;
|
||||
False to use BucketingSampler.
|
||||
on_the_fly_feats:
|
||||
True to use OnTheFlyFeatures;
|
||||
False to use PrecomputedFeatures.
|
||||
@ -230,7 +224,6 @@ class AsrDataModule:
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
|
||||
if dynamic_bucketing:
|
||||
logging.info("Using DynamicBucketingSampler.")
|
||||
train_sampler = DynamicBucketingSampler(
|
||||
cuts_train,
|
||||
@ -239,16 +232,6 @@ class AsrDataModule:
|
||||
num_buckets=self.args.num_buckets,
|
||||
drop_last=True,
|
||||
)
|
||||
else:
|
||||
logging.info("Using BucketingSampler.")
|
||||
train_sampler = BucketingSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
num_buckets=self.args.num_buckets,
|
||||
bucket_method="equal_duration",
|
||||
drop_last=True,
|
||||
)
|
||||
|
||||
logging.info("About to create train dataloader")
|
||||
train_dl = DataLoader(
|
||||
|
@ -22,7 +22,7 @@ import re
|
||||
from pathlib import Path
|
||||
|
||||
import lhotse
|
||||
from lhotse import CutSet, load_manifest
|
||||
from lhotse import CutSet, load_manifest_lazy
|
||||
|
||||
|
||||
class GigaSpeech:
|
||||
@ -32,13 +32,13 @@ class GigaSpeech:
|
||||
manifest_dir:
|
||||
It is expected to contain the following files::
|
||||
|
||||
- XL_split_2000/cuts_XL.*.jsonl.gz
|
||||
- cuts_L_raw.jsonl.gz
|
||||
- cuts_M_raw.jsonl.gz
|
||||
- cuts_S_raw.jsonl.gz
|
||||
- cuts_XS_raw.jsonl.gz
|
||||
- cuts_DEV_raw.jsonl.gz
|
||||
- cuts_TEST_raw.jsonl.gz
|
||||
- gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz
|
||||
- gigaspeech_cuts_L_raw.jsonl.gz
|
||||
- gigaspeech_cuts_M_raw.jsonl.gz
|
||||
- gigaspeech_cuts_S_raw.jsonl.gz
|
||||
- gigaspeech_cuts_XS_raw.jsonl.gz
|
||||
- gigaspeech_cuts_DEV_raw.jsonl.gz
|
||||
- gigaspeech_cuts_TEST_raw.jsonl.gz
|
||||
"""
|
||||
self.manifest_dir = Path(manifest_dir)
|
||||
|
||||
@ -46,10 +46,12 @@ class GigaSpeech:
|
||||
logging.info("About to get train-XL cuts")
|
||||
|
||||
filenames = list(
|
||||
glob.glob(f"{self.manifest_dir}/XL_split_2000/cuts_XL.*.jsonl.gz")
|
||||
glob.glob(
|
||||
f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz" # noqa
|
||||
)
|
||||
)
|
||||
|
||||
pattern = re.compile(r"cuts_XL.([0-9]+).jsonl.gz")
|
||||
pattern = re.compile(r"gigaspeech_cuts_XL.([0-9]+).jsonl.gz")
|
||||
idx_filenames = [
|
||||
(int(pattern.search(f).group(1)), f) for f in filenames
|
||||
]
|
||||
@ -64,31 +66,31 @@ class GigaSpeech:
|
||||
)
|
||||
|
||||
def train_L_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_L_raw.jsonl.gz"
|
||||
f = self.manifest_dir / "gigaspeech_cuts_L_raw.jsonl.gz"
|
||||
logging.info(f"About to get train-L cuts from {f}")
|
||||
return CutSet.from_jsonl_lazy(f)
|
||||
|
||||
def train_M_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_M_raw.jsonl.gz"
|
||||
f = self.manifest_dir / "gigaspeech_cuts_M_raw.jsonl.gz"
|
||||
logging.info(f"About to get train-M cuts from {f}")
|
||||
return CutSet.from_jsonl_lazy(f)
|
||||
|
||||
def train_S_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_S_raw.jsonl.gz"
|
||||
f = self.manifest_dir / "gigaspeech_cuts_S_raw.jsonl.gz"
|
||||
logging.info(f"About to get train-S cuts from {f}")
|
||||
return CutSet.from_jsonl_lazy(f)
|
||||
|
||||
def train_XS_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_XS_raw.jsonl.gz"
|
||||
f = self.manifest_dir / "gigaspeech_cuts_XS_raw.jsonl.gz"
|
||||
logging.info(f"About to get train-XS cuts from {f}")
|
||||
return CutSet.from_jsonl_lazy(f)
|
||||
|
||||
def test_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_TEST.jsonl.gz"
|
||||
f = self.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz"
|
||||
logging.info(f"About to get TEST cuts from {f}")
|
||||
return load_manifest(f)
|
||||
return load_manifest_lazy(f)
|
||||
|
||||
def dev_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_DEV.jsonl.gz"
|
||||
f = self.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz"
|
||||
logging.info(f"About to get DEV cuts from {f}")
|
||||
return load_manifest(f)
|
||||
return load_manifest_lazy(f)
|
||||
|
@ -18,7 +18,7 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet, load_manifest
|
||||
from lhotse import CutSet, load_manifest_lazy
|
||||
|
||||
|
||||
class LibriSpeech:
|
||||
@ -28,47 +28,47 @@ class LibriSpeech:
|
||||
manifest_dir:
|
||||
It is expected to contain the following files::
|
||||
|
||||
- cuts_dev-clean.json.gz
|
||||
- cuts_dev-other.json.gz
|
||||
- cuts_test-clean.json.gz
|
||||
- cuts_test-other.json.gz
|
||||
- cuts_train-clean-100.json.gz
|
||||
- cuts_train-clean-360.json.gz
|
||||
- cuts_train-other-500.json.gz
|
||||
- librispeech_cuts_dev-clean.jsonl.gz
|
||||
- librispeech_cuts_dev-other.jsonl.gz
|
||||
- librispeech_cuts_test-clean.jsonl.gz
|
||||
- librispeech_cuts_test-other.jsonl.gz
|
||||
- librispeech_cuts_train-clean-100.jsonl.gz
|
||||
- librispeech_cuts_train-clean-360.jsonl.gz
|
||||
- librispeech_cuts_train-other-500.jsonl.gz
|
||||
"""
|
||||
self.manifest_dir = Path(manifest_dir)
|
||||
|
||||
def train_clean_100_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_train-clean-100.json.gz"
|
||||
f = self.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
|
||||
logging.info(f"About to get train-clean-100 cuts from {f}")
|
||||
return load_manifest(f)
|
||||
return load_manifest_lazy(f)
|
||||
|
||||
def train_clean_360_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_train-clean-360.json.gz"
|
||||
f = self.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
|
||||
logging.info(f"About to get train-clean-360 cuts from {f}")
|
||||
return load_manifest(f)
|
||||
return load_manifest_lazy(f)
|
||||
|
||||
def train_other_500_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_train-other-500.json.gz"
|
||||
f = self.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
|
||||
logging.info(f"About to get train-other-500 cuts from {f}")
|
||||
return load_manifest(f)
|
||||
return load_manifest_lazy(f)
|
||||
|
||||
def test_clean_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_test-clean.json.gz"
|
||||
f = self.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
|
||||
logging.info(f"About to get test-clean cuts from {f}")
|
||||
return load_manifest(f)
|
||||
return load_manifest_lazy(f)
|
||||
|
||||
def test_other_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_test-other.json.gz"
|
||||
f = self.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
|
||||
logging.info(f"About to get test-other cuts from {f}")
|
||||
return load_manifest(f)
|
||||
return load_manifest_lazy(f)
|
||||
|
||||
def dev_clean_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_dev-clean.json.gz"
|
||||
f = self.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
|
||||
logging.info(f"About to get dev-clean cuts from {f}")
|
||||
return load_manifest(f)
|
||||
return load_manifest_lazy(f)
|
||||
|
||||
def dev_other_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_dev-other.json.gz"
|
||||
f = self.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
|
||||
logging.info(f"About to get dev-other cuts from {f}")
|
||||
return load_manifest(f)
|
||||
return load_manifest_lazy(f)
|
||||
|
@ -66,7 +66,7 @@ from conformer import Conformer
|
||||
from decoder import Decoder
|
||||
from gigaspeech import GigaSpeech
|
||||
from joiner import Joiner
|
||||
from lhotse import CutSet, load_manifest
|
||||
from lhotse import CutSet, load_manifest_lazy
|
||||
from lhotse.cut import Cut
|
||||
from lhotse.dataset.sampling.base import CutSampler
|
||||
from lhotse.utils import fix_random_seed
|
||||
@ -968,8 +968,8 @@ def run(rank, world_size, args):
|
||||
train_giga_cuts = train_giga_cuts.repeat(times=None)
|
||||
|
||||
if args.enable_musan:
|
||||
cuts_musan = load_manifest(
|
||||
Path(args.manifest_dir) / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
else:
|
||||
cuts_musan = None
|
||||
@ -978,14 +978,12 @@ def run(rank, world_size, args):
|
||||
|
||||
train_dl = asr_datamodule.train_dataloaders(
|
||||
train_cuts,
|
||||
dynamic_bucketing=False,
|
||||
on_the_fly_feats=False,
|
||||
cuts_musan=cuts_musan,
|
||||
)
|
||||
|
||||
giga_train_dl = asr_datamodule.train_dataloaders(
|
||||
train_giga_cuts,
|
||||
dynamic_bucketing=True,
|
||||
on_the_fly_feats=False,
|
||||
cuts_musan=cuts_musan,
|
||||
)
|
||||
|
@ -24,7 +24,7 @@ from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
|
||||
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
|
||||
CutConcatenate,
|
||||
CutMix,
|
||||
@ -224,8 +224,8 @@ class LibriSpeechAsrDataModule:
|
||||
if self.args.enable_musan:
|
||||
logging.info("Enable MUSAN")
|
||||
logging.info("About to get Musan cuts")
|
||||
cuts_musan = load_manifest(
|
||||
self.args.manifest_dir / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
self.args.manifest_dir / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
transforms.append(
|
||||
CutMix(
|
||||
@ -407,40 +407,48 @@ class LibriSpeechAsrDataModule:
|
||||
@lru_cache()
|
||||
def train_clean_100_cuts(self) -> CutSet:
|
||||
logging.info("About to get train-clean-100 cuts")
|
||||
return load_manifest(
|
||||
self.args.manifest_dir / "cuts_train-clean-100.json.gz"
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "librispeech_cuts_train-clean-100.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def train_clean_360_cuts(self) -> CutSet:
|
||||
logging.info("About to get train-clean-360 cuts")
|
||||
return load_manifest(
|
||||
self.args.manifest_dir / "cuts_train-clean-360.json.gz"
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "librispeech_cuts_train-clean-360.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def train_other_500_cuts(self) -> CutSet:
|
||||
logging.info("About to get train-other-500 cuts")
|
||||
return load_manifest(
|
||||
self.args.manifest_dir / "cuts_train-other-500.json.gz"
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "librispeech_cuts_train-other-500.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def dev_clean_cuts(self) -> CutSet:
|
||||
logging.info("About to get dev-clean cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_dev-clean.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "librispeech_cuts_dev-clean.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def dev_other_cuts(self) -> CutSet:
|
||||
logging.info("About to get dev-other cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_dev-other.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "librispeech_cuts_dev-other.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def test_clean_cuts(self) -> CutSet:
|
||||
logging.info("About to get test-clean cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_test-clean.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "librispeech_cuts_test-clean.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def test_other_cuts(self) -> CutSet:
|
||||
logging.info("About to get test-other cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_test-other.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "librispeech_cuts_test-other.jsonl.gz"
|
||||
)
|
||||
|
@ -16,6 +16,15 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Usage:
|
||||
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
||||
./tdnn_lstm_ctc/train.py \
|
||||
--world-size 4 \
|
||||
--full-libri 1 \
|
||||
--max-duration 300 \
|
||||
--num-epochs 20
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
@ -29,6 +38,7 @@ import torch.multiprocessing as mp
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
from asr_datamodule import LibriSpeechAsrDataModule
|
||||
from lhotse.cut import Cut
|
||||
from lhotse.utils import fix_random_seed
|
||||
from model import TdnnLstm
|
||||
from torch import Tensor
|
||||
@ -544,10 +554,25 @@ def run(rank, world_size, args):
|
||||
if params.full_libri:
|
||||
train_cuts += librispeech.train_clean_360_cuts()
|
||||
train_cuts += librispeech.train_other_500_cuts()
|
||||
|
||||
def remove_short_and_long_utt(c: Cut):
|
||||
# Keep only utterances with duration between 1 second and 20 seconds
|
||||
#
|
||||
# Caution: There is a reason to select 20.0 here. Please see
|
||||
# ../local/display_manifest_statistics.py
|
||||
#
|
||||
# You should use ../local/display_manifest_statistics.py to get
|
||||
# an utterance duration distribution for your dataset to select
|
||||
# the threshold
|
||||
return 1.0 <= c.duration <= 20.0
|
||||
|
||||
train_cuts = train_cuts.filter(remove_short_and_long_utt)
|
||||
|
||||
train_dl = librispeech.train_dataloaders(train_cuts)
|
||||
|
||||
valid_cuts = librispeech.dev_clean_cuts()
|
||||
valid_cuts += librispeech.dev_other_cuts()
|
||||
|
||||
valid_dl = librispeech.valid_dataloaders(valid_cuts)
|
||||
|
||||
for epoch in range(params.start_epoch, params.num_epochs):
|
||||
|
@ -44,8 +44,8 @@ from pathlib import Path
|
||||
import sentencepiece as spm
|
||||
import torch
|
||||
from alignment import get_word_starting_frames
|
||||
from lhotse import CutSet, load_manifest
|
||||
from lhotse.dataset import K2SpeechRecognitionDataset, SingleCutSampler
|
||||
from lhotse import CutSet, load_manifest_lazy
|
||||
from lhotse.dataset import DynamicBucketingSampler, K2SpeechRecognitionDataset
|
||||
from lhotse.dataset.collation import collate_custom_field
|
||||
|
||||
|
||||
@ -93,14 +93,15 @@ def main():
|
||||
sp = spm.SentencePieceProcessor()
|
||||
sp.load(args.bpe_model)
|
||||
|
||||
cuts_json = args.ali_dir / f"cuts_{args.dataset}.json.gz"
|
||||
cuts_jsonl = args.ali_dir / f"librispeech_cuts_{args.dataset}.jsonl.gz"
|
||||
|
||||
logging.info(f"Loading {cuts_json}")
|
||||
cuts = load_manifest(cuts_json)
|
||||
logging.info(f"Loading {cuts_jsonl}")
|
||||
cuts = load_manifest_lazy(cuts_jsonl)
|
||||
|
||||
sampler = SingleCutSampler(
|
||||
sampler = DynamicBucketingSampler(
|
||||
cuts,
|
||||
max_duration=30,
|
||||
num_buckets=30,
|
||||
shuffle=False,
|
||||
)
|
||||
|
||||
|
@ -1,333 +0,0 @@
|
||||
# Copyright 2021 Piotr Żelasko
|
||||
# 2022 Xiaomi Corp. (authors: Fangjun Kuang
|
||||
# Mingshuang Luo)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import inspect
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig
|
||||
from lhotse.dataset import (
|
||||
BucketingSampler,
|
||||
CutMix,
|
||||
DynamicBucketingSampler,
|
||||
K2SpeechRecognitionDataset,
|
||||
SpecAugment,
|
||||
)
|
||||
from lhotse.dataset.input_strategies import (
|
||||
OnTheFlyFeatures,
|
||||
PrecomputedFeatures,
|
||||
)
|
||||
from lhotse.utils import fix_random_seed
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from icefall.utils import str2bool
|
||||
|
||||
|
||||
class _SeedWorkers:
|
||||
def __init__(self, seed: int):
|
||||
self.seed = seed
|
||||
|
||||
def __call__(self, worker_id: int):
|
||||
fix_random_seed(self.seed + worker_id)
|
||||
|
||||
|
||||
class AsrDataModule:
|
||||
def __init__(self, args: argparse.Namespace):
|
||||
self.args = args
|
||||
|
||||
@classmethod
|
||||
def add_arguments(cls, parser: argparse.ArgumentParser):
|
||||
group = parser.add_argument_group(
|
||||
title="ASR data related options",
|
||||
description="These options are used for the preparation of "
|
||||
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
|
||||
"effective batch sizes, sampling strategies, applied data "
|
||||
"augmentations, etc.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--max-duration",
|
||||
type=int,
|
||||
default=200.0,
|
||||
help="Maximum pooled recordings duration (seconds) in a "
|
||||
"single batch. You can reduce it if it causes CUDA OOM.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--bucketing-sampler",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="When enabled, the batches will come from buckets of "
|
||||
"similar duration (saves padding frames).",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--num-buckets",
|
||||
type=int,
|
||||
default=30,
|
||||
help="The number of buckets for the BucketingSampler "
|
||||
"and DynamicBucketingSampler."
|
||||
"(you might want to increase it for larger datasets).",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--shuffle",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="When enabled (=default), the examples will be "
|
||||
"shuffled for each epoch.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--return-cuts",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="When enabled, each batch will have the "
|
||||
"field: batch['supervisions']['cut'] with the cuts that "
|
||||
"were used to construct it.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--num-workers",
|
||||
type=int,
|
||||
default=2,
|
||||
help="The number of training dataloader workers that "
|
||||
"collect the batches.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--enable-spec-aug",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="When enabled, use SpecAugment for training dataset.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--spec-aug-time-warp-factor",
|
||||
type=int,
|
||||
default=80,
|
||||
help="Used only when --enable-spec-aug is True. "
|
||||
"It specifies the factor for time warping in SpecAugment. "
|
||||
"Larger values mean more warping. "
|
||||
"A value less than 1 means to disable time warp.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--enable-musan",
|
||||
type=str2bool,
|
||||
default=True,
|
||||
help="When enabled, select noise from MUSAN and mix it"
|
||||
"with training dataset. ",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--manifest-dir",
|
||||
type=Path,
|
||||
default=Path("data/fbank"),
|
||||
help="Path to directory with train/valid/test cuts.",
|
||||
)
|
||||
|
||||
group.add_argument(
|
||||
"--on-the-fly-feats",
|
||||
type=str2bool,
|
||||
default=False,
|
||||
help="When enabled, use on-the-fly cut mixing and feature "
|
||||
"extraction. Will drop existing precomputed feature manifests "
|
||||
"if available. Used only in dev/test CutSet",
|
||||
)
|
||||
|
||||
def train_dataloaders(
|
||||
self,
|
||||
cuts_train: CutSet,
|
||||
dynamic_bucketing: bool,
|
||||
on_the_fly_feats: bool,
|
||||
cuts_musan: Optional[CutSet] = None,
|
||||
) -> DataLoader:
|
||||
"""
|
||||
Args:
|
||||
cuts_train:
|
||||
Cuts for training.
|
||||
cuts_musan:
|
||||
If not None, it is the cuts for mixing.
|
||||
dynamic_bucketing:
|
||||
True to use DynamicBucketingSampler;
|
||||
False to use BucketingSampler.
|
||||
on_the_fly_feats:
|
||||
True to use OnTheFlyFeatures;
|
||||
False to use PrecomputedFeatures.
|
||||
"""
|
||||
transforms = []
|
||||
if cuts_musan is not None:
|
||||
logging.info("Enable MUSAN")
|
||||
transforms.append(
|
||||
CutMix(
|
||||
cuts=cuts_musan, prob=0.5, snr=(10, 20), preserve_id=True
|
||||
)
|
||||
)
|
||||
else:
|
||||
logging.info("Disable MUSAN")
|
||||
|
||||
input_transforms = []
|
||||
|
||||
if self.args.enable_spec_aug:
|
||||
logging.info("Enable SpecAugment")
|
||||
logging.info(
|
||||
f"Time warp factor: {self.args.spec_aug_time_warp_factor}"
|
||||
)
|
||||
# Set the value of num_frame_masks according to Lhotse's version.
|
||||
# In different Lhotse's versions, the default of num_frame_masks is
|
||||
# different.
|
||||
num_frame_masks = 10
|
||||
num_frame_masks_parameter = inspect.signature(
|
||||
SpecAugment.__init__
|
||||
).parameters["num_frame_masks"]
|
||||
if num_frame_masks_parameter.default == 1:
|
||||
num_frame_masks = 2
|
||||
logging.info(f"Num frame mask: {num_frame_masks}")
|
||||
input_transforms.append(
|
||||
SpecAugment(
|
||||
time_warp_factor=self.args.spec_aug_time_warp_factor,
|
||||
num_frame_masks=num_frame_masks,
|
||||
features_mask_size=27,
|
||||
num_feature_masks=2,
|
||||
frames_mask_size=100,
|
||||
)
|
||||
)
|
||||
else:
|
||||
logging.info("Disable SpecAugment")
|
||||
|
||||
logging.info("About to create train dataset")
|
||||
train = K2SpeechRecognitionDataset(
|
||||
cut_transforms=transforms,
|
||||
input_transforms=input_transforms,
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
|
||||
# NOTE: the PerturbSpeed transform should be added only if we
|
||||
# remove it from data prep stage.
|
||||
# Add on-the-fly speed perturbation; since originally it would
|
||||
# have increased epoch size by 3, we will apply prob 2/3 and use
|
||||
# 3x more epochs.
|
||||
# Speed perturbation probably should come first before
|
||||
# concatenation, but in principle the transforms order doesn't have
|
||||
# to be strict (e.g. could be randomized)
|
||||
# transforms = [PerturbSpeed(factors=[0.9, 1.1], p=2/3)] + transforms # noqa
|
||||
# Drop feats to be on the safe side.
|
||||
train = K2SpeechRecognitionDataset(
|
||||
cut_transforms=transforms,
|
||||
input_strategy=(
|
||||
OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
|
||||
if on_the_fly_feats
|
||||
else PrecomputedFeatures()
|
||||
),
|
||||
input_transforms=input_transforms,
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
|
||||
if dynamic_bucketing:
|
||||
logging.info("Using DynamicBucketingSampler.")
|
||||
train_sampler = DynamicBucketingSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
num_buckets=self.args.num_buckets,
|
||||
drop_last=True,
|
||||
)
|
||||
else:
|
||||
logging.info("Using BucketingSampler.")
|
||||
train_sampler = BucketingSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
num_buckets=self.args.num_buckets,
|
||||
bucket_method="equal_duration",
|
||||
drop_last=True,
|
||||
)
|
||||
|
||||
logging.info("About to create train dataloader")
|
||||
|
||||
# 'seed' is derived from the current random state, which will have
|
||||
# previously been set in the main process.
|
||||
seed = torch.randint(0, 100000, ()).item()
|
||||
worker_init_fn = _SeedWorkers(seed)
|
||||
|
||||
train_dl = DataLoader(
|
||||
train,
|
||||
sampler=train_sampler,
|
||||
batch_size=None,
|
||||
num_workers=self.args.num_workers,
|
||||
persistent_workers=False,
|
||||
worker_init_fn=worker_init_fn,
|
||||
)
|
||||
return train_dl
|
||||
|
||||
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
|
||||
transforms = []
|
||||
|
||||
logging.info("About to create dev dataset")
|
||||
if self.args.on_the_fly_feats:
|
||||
validate = K2SpeechRecognitionDataset(
|
||||
cut_transforms=transforms,
|
||||
input_strategy=OnTheFlyFeatures(
|
||||
Fbank(FbankConfig(num_mel_bins=80))
|
||||
),
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
else:
|
||||
validate = K2SpeechRecognitionDataset(
|
||||
cut_transforms=transforms,
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
valid_sampler = BucketingSampler(
|
||||
cuts_valid,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
)
|
||||
logging.info("About to create dev dataloader")
|
||||
valid_dl = DataLoader(
|
||||
validate,
|
||||
sampler=valid_sampler,
|
||||
batch_size=None,
|
||||
num_workers=2,
|
||||
persistent_workers=False,
|
||||
)
|
||||
|
||||
return valid_dl
|
||||
|
||||
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
|
||||
logging.debug("About to create test dataset")
|
||||
test = K2SpeechRecognitionDataset(
|
||||
input_strategy=OnTheFlyFeatures(Fbank(FbankConfig(num_mel_bins=80)))
|
||||
if self.args.on_the_fly_feats
|
||||
else PrecomputedFeatures(),
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
sampler = BucketingSampler(
|
||||
cuts, max_duration=self.args.max_duration, shuffle=False
|
||||
)
|
||||
logging.debug("About to create test dataloader")
|
||||
test_dl = DataLoader(
|
||||
test,
|
||||
batch_size=None,
|
||||
sampler=sampler,
|
||||
num_workers=self.args.num_workers,
|
||||
)
|
||||
return test_dl
|
@ -0,0 +1 @@
|
||||
../pruned_transducer_stateless3/asr_datamodule.py
|
@ -1,75 +0,0 @@
|
||||
# Copyright 2021 Piotr Żelasko
|
||||
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet, load_manifest
|
||||
|
||||
|
||||
class GigaSpeech:
|
||||
def __init__(self, manifest_dir: str):
|
||||
"""
|
||||
Args:
|
||||
manifest_dir:
|
||||
It is expected to contain the following files::
|
||||
|
||||
- cuts_XL_raw.jsonl.gz
|
||||
- cuts_L_raw.jsonl.gz
|
||||
- cuts_M_raw.jsonl.gz
|
||||
- cuts_S_raw.jsonl.gz
|
||||
- cuts_XS_raw.jsonl.gz
|
||||
- cuts_DEV_raw.jsonl.gz
|
||||
- cuts_TEST_raw.jsonl.gz
|
||||
"""
|
||||
self.manifest_dir = Path(manifest_dir)
|
||||
|
||||
def train_XL_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_XL_raw.jsonl.gz"
|
||||
logging.info(f"About to get train-XL cuts from {f}")
|
||||
return CutSet.from_jsonl_lazy(f)
|
||||
|
||||
def train_L_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_L_raw.jsonl.gz"
|
||||
logging.info(f"About to get train-L cuts from {f}")
|
||||
return CutSet.from_jsonl_lazy(f)
|
||||
|
||||
def train_M_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_M_raw.jsonl.gz"
|
||||
logging.info(f"About to get train-M cuts from {f}")
|
||||
return CutSet.from_jsonl_lazy(f)
|
||||
|
||||
def train_S_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_S_raw.jsonl.gz"
|
||||
logging.info(f"About to get train-S cuts from {f}")
|
||||
return CutSet.from_jsonl_lazy(f)
|
||||
|
||||
def train_XS_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_XS_raw.jsonl.gz"
|
||||
logging.info(f"About to get train-XS cuts from {f}")
|
||||
return CutSet.from_jsonl_lazy(f)
|
||||
|
||||
def test_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_TEST.jsonl.gz"
|
||||
logging.info(f"About to get TEST cuts from {f}")
|
||||
return load_manifest(f)
|
||||
|
||||
def dev_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_DEV.jsonl.gz"
|
||||
logging.info(f"About to get DEV cuts from {f}")
|
||||
return load_manifest(f)
|
@ -0,0 +1 @@
|
||||
../pruned_transducer_stateless3/gigaspeech.py
|
@ -1,74 +0,0 @@
|
||||
# Copyright 2021 Piotr Żelasko
|
||||
# 2022 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet, load_manifest
|
||||
|
||||
|
||||
class LibriSpeech:
|
||||
def __init__(self, manifest_dir: str):
|
||||
"""
|
||||
Args:
|
||||
manifest_dir:
|
||||
It is expected to contain the following files::
|
||||
|
||||
- cuts_dev-clean.json.gz
|
||||
- cuts_dev-other.json.gz
|
||||
- cuts_test-clean.json.gz
|
||||
- cuts_test-other.json.gz
|
||||
- cuts_train-clean-100.json.gz
|
||||
- cuts_train-clean-360.json.gz
|
||||
- cuts_train-other-500.json.gz
|
||||
"""
|
||||
self.manifest_dir = Path(manifest_dir)
|
||||
|
||||
def train_clean_100_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_train-clean-100.json.gz"
|
||||
logging.info(f"About to get train-clean-100 cuts from {f}")
|
||||
return load_manifest(f)
|
||||
|
||||
def train_clean_360_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_train-clean-360.json.gz"
|
||||
logging.info(f"About to get train-clean-360 cuts from {f}")
|
||||
return load_manifest(f)
|
||||
|
||||
def train_other_500_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_train-other-500.json.gz"
|
||||
logging.info(f"About to get train-other-500 cuts from {f}")
|
||||
return load_manifest(f)
|
||||
|
||||
def test_clean_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_test-clean.json.gz"
|
||||
logging.info(f"About to get test-clean cuts from {f}")
|
||||
return load_manifest(f)
|
||||
|
||||
def test_other_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_test-other.json.gz"
|
||||
logging.info(f"About to get test-other cuts from {f}")
|
||||
return load_manifest(f)
|
||||
|
||||
def dev_clean_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_dev-clean.json.gz"
|
||||
logging.info(f"About to get dev-clean cuts from {f}")
|
||||
return load_manifest(f)
|
||||
|
||||
def dev_other_cuts(self) -> CutSet:
|
||||
f = self.manifest_dir / "cuts_dev-other.json.gz"
|
||||
logging.info(f"About to get dev-other cuts from {f}")
|
||||
return load_manifest(f)
|
@ -0,0 +1 @@
|
||||
../pruned_transducer_stateless3/librispeech.py
|
@ -28,7 +28,7 @@ from pathlib import Path
|
||||
|
||||
from asr_datamodule import AsrDataModule
|
||||
from gigaspeech import GigaSpeech
|
||||
from lhotse import load_manifest
|
||||
from lhotse import load_manifest_lazy
|
||||
from librispeech import LibriSpeech
|
||||
|
||||
|
||||
@ -41,8 +41,8 @@ def test_dataset():
|
||||
print(args)
|
||||
|
||||
if args.enable_musan:
|
||||
cuts_musan = load_manifest(
|
||||
Path(args.manifest_dir) / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
else:
|
||||
cuts_musan = None
|
||||
@ -57,14 +57,12 @@ def test_dataset():
|
||||
|
||||
libri_train_dl = asr_datamodule.train_dataloaders(
|
||||
train_clean_100,
|
||||
dynamic_bucketing=False,
|
||||
on_the_fly_feats=False,
|
||||
cuts_musan=cuts_musan,
|
||||
)
|
||||
|
||||
giga_train_dl = asr_datamodule.train_dataloaders(
|
||||
train_S,
|
||||
dynamic_bucketing=True,
|
||||
on_the_fly_feats=True,
|
||||
cuts_musan=cuts_musan,
|
||||
)
|
||||
|
@ -73,7 +73,7 @@ from conformer import Conformer
|
||||
from decoder import Decoder
|
||||
from gigaspeech import GigaSpeech
|
||||
from joiner import Joiner
|
||||
from lhotse import CutSet, load_manifest
|
||||
from lhotse import CutSet, load_manifest_lazy
|
||||
from lhotse.cut import Cut
|
||||
from lhotse.utils import fix_random_seed
|
||||
from librispeech import LibriSpeech
|
||||
@ -662,19 +662,17 @@ def train_one_epoch(
|
||||
def filter_short_and_long_utterances(cuts: CutSet) -> CutSet:
|
||||
def remove_short_and_long_utt(c: Cut):
|
||||
# Keep only utterances with duration between 1 second and 20 seconds
|
||||
#
|
||||
# Caution: There is a reason to select 20.0 here. Please see
|
||||
# ../local/display_manifest_statistics.py
|
||||
#
|
||||
# You should use ../local/display_manifest_statistics.py to get
|
||||
# an utterance duration distribution for your dataset to select
|
||||
# the threshold
|
||||
return 1.0 <= c.duration <= 20.0
|
||||
|
||||
num_in_total = len(cuts)
|
||||
cuts = cuts.filter(remove_short_and_long_utt)
|
||||
|
||||
num_left = len(cuts)
|
||||
num_removed = num_in_total - num_left
|
||||
removed_percent = num_removed / num_in_total * 100
|
||||
|
||||
logging.info(f"Before removing short and long utterances: {num_in_total}")
|
||||
logging.info(f"After removing short and long utterances: {num_left}")
|
||||
logging.info(f"Removed {num_removed} utterances ({removed_percent:.5f}%)")
|
||||
|
||||
return cuts
|
||||
|
||||
|
||||
@ -767,17 +765,18 @@ def run(rank, world_size, args):
|
||||
# DEV 12 hours
|
||||
# Test 40 hours
|
||||
if params.full_libri:
|
||||
logging.info("Using the L subset of GigaSpeech (2.5k hours)")
|
||||
train_giga_cuts = gigaspeech.train_L_cuts()
|
||||
logging.info("Using the XL subset of GigaSpeech (10k hours)")
|
||||
train_giga_cuts = gigaspeech.train_XL_cuts()
|
||||
else:
|
||||
logging.info("Using the S subset of GigaSpeech (250 hours)")
|
||||
train_giga_cuts = gigaspeech.train_S_cuts()
|
||||
|
||||
train_giga_cuts = filter_short_and_long_utterances(train_giga_cuts)
|
||||
train_giga_cuts = train_giga_cuts.repeat(times=None)
|
||||
|
||||
if args.enable_musan:
|
||||
cuts_musan = load_manifest(
|
||||
Path(args.manifest_dir) / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
Path(args.manifest_dir) / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
else:
|
||||
cuts_musan = None
|
||||
@ -786,14 +785,12 @@ def run(rank, world_size, args):
|
||||
|
||||
train_dl = asr_datamodule.train_dataloaders(
|
||||
train_cuts,
|
||||
dynamic_bucketing=False,
|
||||
on_the_fly_feats=False,
|
||||
cuts_musan=cuts_musan,
|
||||
)
|
||||
|
||||
giga_train_dl = asr_datamodule.train_dataloaders(
|
||||
train_giga_cuts,
|
||||
dynamic_bucketing=True,
|
||||
on_the_fly_feats=True,
|
||||
cuts_musan=cuts_musan,
|
||||
)
|
||||
|
@ -22,7 +22,7 @@ from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
|
||||
from lhotse.dataset import (
|
||||
CutConcatenate,
|
||||
CutMix,
|
||||
@ -176,7 +176,7 @@ class SPGISpeechAsrDataModule:
|
||||
The state dict for the training sampler.
|
||||
"""
|
||||
logging.info("About to get Musan cuts")
|
||||
cuts_musan = load_manifest(
|
||||
cuts_musan = load_manifest_lazy(
|
||||
self.args.manifest_dir / "cuts_musan.jsonl.gz"
|
||||
)
|
||||
|
||||
|
@ -52,8 +52,13 @@ def compute_fbank_tedlium():
|
||||
"test",
|
||||
)
|
||||
|
||||
prefix = "tedlium"
|
||||
suffix = "jsonl.gz"
|
||||
manifests = read_manifests_if_cached(
|
||||
prefix="tedlium", dataset_parts=dataset_parts, output_dir=src_dir
|
||||
dataset_parts=dataset_parts,
|
||||
output_dir=src_dir,
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
)
|
||||
assert manifests is not None
|
||||
|
||||
@ -61,7 +66,7 @@ def compute_fbank_tedlium():
|
||||
|
||||
with get_executor() as ex: # Initialize the executor only once.
|
||||
for partition, m in manifests.items():
|
||||
if (output_dir / f"cuts_{partition}.json.gz").is_file():
|
||||
if (output_dir / f"{prefix}_cuts_{partition}.{suffix}").is_file():
|
||||
logging.info(f"{partition} already exists - skipping.")
|
||||
continue
|
||||
logging.info(f"Processing {partition}")
|
||||
@ -80,7 +85,7 @@ def compute_fbank_tedlium():
|
||||
|
||||
cut_set = cut_set.compute_and_store_features(
|
||||
extractor=extractor,
|
||||
storage_path=f"{output_dir}/feats_{partition}",
|
||||
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
|
||||
# when an executor is specified, make more partitions
|
||||
num_jobs=cur_num_jobs,
|
||||
executor=ex,
|
||||
@ -88,7 +93,7 @@ def compute_fbank_tedlium():
|
||||
)
|
||||
# Split long cuts into many short and un-overlapping cuts
|
||||
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False)
|
||||
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
|
||||
cut_set.to_file(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -27,15 +27,15 @@ for usage.
|
||||
"""
|
||||
|
||||
|
||||
from lhotse import load_manifest
|
||||
from lhotse import load_manifest_lazy
|
||||
|
||||
|
||||
def main():
|
||||
path = "./data/fbank/cuts_train.json.gz"
|
||||
path = "./data/fbank/cuts_dev.json.gz"
|
||||
path = "./data/fbank/cuts_test.json.gz"
|
||||
path = "./data/fbank/tedlium_cuts_train.jsonl.gz"
|
||||
path = "./data/fbank/tedlium_cuts_dev.jsonl.gz"
|
||||
path = "./data/fbank/tedlium_cuts_test.jsonl.gz"
|
||||
|
||||
cuts = load_manifest(path)
|
||||
cuts = load_manifest_lazy(path)
|
||||
cuts.describe()
|
||||
|
||||
|
||||
|
@ -22,11 +22,11 @@ import logging
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
|
||||
from lhotse.dataset import (
|
||||
BucketingSampler,
|
||||
CutConcatenate,
|
||||
CutMix,
|
||||
DynamicBucketingSampler,
|
||||
K2SpeechRecognitionDataset,
|
||||
PrecomputedFeatures,
|
||||
SingleCutSampler,
|
||||
@ -92,7 +92,7 @@ class TedLiumAsrDataModule:
|
||||
"--num-buckets",
|
||||
type=int,
|
||||
default=30,
|
||||
help="The number of buckets for the BucketingSampler"
|
||||
help="The number of buckets for the DynamicBucketingSampler"
|
||||
"(you might want to increase it for larger datasets).",
|
||||
)
|
||||
group.add_argument(
|
||||
@ -179,8 +179,8 @@ class TedLiumAsrDataModule:
|
||||
transforms = []
|
||||
if self.args.enable_musan:
|
||||
logging.info("Enable MUSAN")
|
||||
cuts_musan = load_manifest(
|
||||
self.args.manifest_dir / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
self.args.manifest_dir / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
transforms.append(
|
||||
CutMix(
|
||||
@ -261,13 +261,12 @@ class TedLiumAsrDataModule:
|
||||
)
|
||||
|
||||
if self.args.bucketing_sampler:
|
||||
logging.info("Using BucketingSampler.")
|
||||
train_sampler = BucketingSampler(
|
||||
logging.info("Using DynamicBucketingSampler.")
|
||||
train_sampler = DynamicBucketingSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
num_buckets=self.args.num_buckets,
|
||||
bucket_method="equal_duration",
|
||||
drop_last=True,
|
||||
)
|
||||
else:
|
||||
@ -311,7 +310,7 @@ class TedLiumAsrDataModule:
|
||||
cut_transforms=transforms,
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
valid_sampler = BucketingSampler(
|
||||
valid_sampler = DynamicBucketingSampler(
|
||||
cuts_valid,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
@ -335,8 +334,10 @@ class TedLiumAsrDataModule:
|
||||
else PrecomputedFeatures(),
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
sampler = BucketingSampler(
|
||||
cuts, max_duration=self.args.max_duration, shuffle=False
|
||||
sampler = DynamicBucketingSampler(
|
||||
cuts,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
)
|
||||
logging.debug("About to create test dataloader")
|
||||
test_dl = DataLoader(
|
||||
@ -350,14 +351,20 @@ class TedLiumAsrDataModule:
|
||||
@lru_cache()
|
||||
def train_cuts(self) -> CutSet:
|
||||
logging.info("About to get train cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_train.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "tedlium_cuts_train.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def dev_cuts(self) -> CutSet:
|
||||
logging.info("About to get dev cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_dev.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "tedlium_cuts_dev.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def test_cuts(self) -> CutSet:
|
||||
logging.info("About to get test cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_test.json.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "tedlium_cuts_test.jsonl.gz"
|
||||
)
|
||||
|
@ -29,7 +29,7 @@ import os
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
from icefall.utils import get_executor
|
||||
@ -53,8 +53,13 @@ def compute_fbank_timit():
|
||||
"DEV",
|
||||
"TEST",
|
||||
)
|
||||
prefix = "timit"
|
||||
suffix = "jsonl.gz"
|
||||
manifests = read_manifests_if_cached(
|
||||
prefix="timit", dataset_parts=dataset_parts, output_dir=src_dir
|
||||
dataset_parts=dataset_parts,
|
||||
output_dir=src_dir,
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
)
|
||||
assert manifests is not None
|
||||
|
||||
@ -62,7 +67,8 @@ def compute_fbank_timit():
|
||||
|
||||
with get_executor() as ex: # Initialize the executor only once.
|
||||
for partition, m in manifests.items():
|
||||
if (output_dir / f"cuts_{partition}.json.gz").is_file():
|
||||
cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
|
||||
if cuts_file.is_file():
|
||||
logging.info(f"{partition} already exists - skipping.")
|
||||
continue
|
||||
logging.info(f"Processing {partition}")
|
||||
@ -78,13 +84,13 @@ def compute_fbank_timit():
|
||||
)
|
||||
cut_set = cut_set.compute_and_store_features(
|
||||
extractor=extractor,
|
||||
storage_path=f"{output_dir}/feats_{partition}",
|
||||
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
|
||||
# when an executor is specified, make more partitions
|
||||
num_jobs=num_jobs if ex is None else 80,
|
||||
executor=ex,
|
||||
storage_type=LilcomHdf5Writer,
|
||||
storage_type=LilcomChunkyWriter,
|
||||
)
|
||||
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
|
||||
cut_set.to_file(cuts_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -23,11 +23,11 @@ from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
|
||||
from lhotse.dataset import (
|
||||
BucketingSampler,
|
||||
CutConcatenate,
|
||||
CutMix,
|
||||
DynamicBucketingSampler,
|
||||
K2SpeechRecognitionDataset,
|
||||
PrecomputedFeatures,
|
||||
SingleCutSampler,
|
||||
@ -92,7 +92,7 @@ class TimitAsrDataModule(DataModule):
|
||||
"--num-buckets",
|
||||
type=int,
|
||||
default=30,
|
||||
help="The number of buckets for the BucketingSampler"
|
||||
help="The number of buckets for the DynamicBucketingSampler"
|
||||
"(you might want to increase it for larger datasets).",
|
||||
)
|
||||
group.add_argument(
|
||||
@ -154,7 +154,9 @@ class TimitAsrDataModule(DataModule):
|
||||
cuts_train = self.train_cuts()
|
||||
|
||||
logging.info("About to get Musan cuts")
|
||||
cuts_musan = load_manifest(self.args.feature_dir / "cuts_musan.json.gz")
|
||||
cuts_musan = load_manifest_lazy(
|
||||
self.args.feature_dir / "cuts_musan.jsonl.gz"
|
||||
)
|
||||
|
||||
logging.info("About to create train dataset")
|
||||
transforms = [CutMix(cuts=cuts_musan, prob=0.5, snr=(10, 20))]
|
||||
@ -218,13 +220,12 @@ class TimitAsrDataModule(DataModule):
|
||||
)
|
||||
|
||||
if self.args.bucketing_sampler:
|
||||
logging.info("Using BucketingSampler.")
|
||||
train_sampler = BucketingSampler(
|
||||
logging.info("Using DynamicBucketingSampler.")
|
||||
train_sampler = DynamicBucketingSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
num_buckets=self.args.num_buckets,
|
||||
bucket_method="equal_duration",
|
||||
drop_last=True,
|
||||
)
|
||||
else:
|
||||
@ -322,20 +323,26 @@ class TimitAsrDataModule(DataModule):
|
||||
@lru_cache()
|
||||
def train_cuts(self) -> CutSet:
|
||||
logging.info("About to get train cuts")
|
||||
cuts_train = load_manifest(self.args.feature_dir / "cuts_TRAIN.json.gz")
|
||||
cuts_train = load_manifest_lazy(
|
||||
self.args.feature_dir / "timit_cuts_TRAIN.jsonl.gz"
|
||||
)
|
||||
|
||||
return cuts_train
|
||||
|
||||
@lru_cache()
|
||||
def valid_cuts(self) -> CutSet:
|
||||
logging.info("About to get dev cuts")
|
||||
cuts_valid = load_manifest(self.args.feature_dir / "cuts_DEV.json.gz")
|
||||
cuts_valid = load_manifest_lazy(
|
||||
self.args.feature_dir / "timit_cuts_DEV.jsonl.gz"
|
||||
)
|
||||
|
||||
return cuts_valid
|
||||
|
||||
@lru_cache()
|
||||
def test_cuts(self) -> CutSet:
|
||||
logging.debug("About to get test cuts")
|
||||
cuts_test = load_manifest(self.args.feature_dir / "cuts_TEST.json.gz")
|
||||
cuts_test = load_manifest_lazy(
|
||||
self.args.feature_dir / "timit_cuts_TEST.jsonl.gz"
|
||||
)
|
||||
|
||||
return cuts_test
|
||||
|
@ -26,7 +26,7 @@ for usage.
|
||||
"""
|
||||
|
||||
|
||||
from lhotse import load_manifest
|
||||
from lhotse import load_manifest_lazy
|
||||
|
||||
|
||||
def main():
|
||||
@ -40,7 +40,7 @@ def main():
|
||||
|
||||
for path in paths:
|
||||
print(f"Starting display the statistics for {path}")
|
||||
cuts = load_manifest(path)
|
||||
cuts = load_manifest_lazy(path)
|
||||
cuts.describe()
|
||||
|
||||
|
||||
|
@ -27,7 +27,7 @@ from lhotse import (
|
||||
CutSet,
|
||||
Fbank,
|
||||
FbankConfig,
|
||||
load_manifest,
|
||||
load_manifest_lazy,
|
||||
set_caching_enabled,
|
||||
)
|
||||
from lhotse.dataset import (
|
||||
@ -218,8 +218,8 @@ class WenetSpeechAsrDataModule:
|
||||
The state dict for the training sampler.
|
||||
"""
|
||||
logging.info("About to get Musan cuts")
|
||||
cuts_musan = load_manifest(
|
||||
self.args.manifest_dir / "cuts_musan.json.gz"
|
||||
cuts_musan = load_manifest_lazy(
|
||||
self.args.manifest_dir / "musan_cuts.jsonl.gz"
|
||||
)
|
||||
|
||||
transforms = []
|
||||
@ -435,16 +435,18 @@ class WenetSpeechAsrDataModule:
|
||||
@lru_cache()
|
||||
def valid_cuts(self) -> CutSet:
|
||||
logging.info("About to get dev cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
|
||||
return load_manifest_lazy(self.args.manifest_dir / "cuts_DEV.jsonl.gz")
|
||||
|
||||
@lru_cache()
|
||||
def test_net_cuts(self) -> List[CutSet]:
|
||||
logging.info("About to get TEST_NET cuts")
|
||||
return load_manifest(self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "cuts_TEST_NET.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def test_meeting_cuts(self) -> List[CutSet]:
|
||||
logging.info("About to get TEST_MEETING cuts")
|
||||
return load_manifest(
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "cuts_TEST_MEETING.jsonl.gz"
|
||||
)
|
||||
|
@ -12,7 +12,7 @@ import os
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
|
||||
from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
from icefall.utils import get_executor
|
||||
@ -37,10 +37,13 @@ def compute_fbank_yesno():
|
||||
"train",
|
||||
"test",
|
||||
)
|
||||
prefix = "yesno"
|
||||
suffix = "jsonl.gz"
|
||||
manifests = read_manifests_if_cached(
|
||||
dataset_parts=dataset_parts,
|
||||
output_dir=src_dir,
|
||||
prefix="yesno",
|
||||
prefix=prefix,
|
||||
suffix=suffix,
|
||||
)
|
||||
assert manifests is not None
|
||||
|
||||
@ -50,7 +53,8 @@ def compute_fbank_yesno():
|
||||
|
||||
with get_executor() as ex: # Initialize the executor only once.
|
||||
for partition, m in manifests.items():
|
||||
if (output_dir / f"cuts_{partition}.json.gz").is_file():
|
||||
cuts_file = output_dir / f"{prefix}_cuts_{partition}.{suffix}"
|
||||
if cuts_file.is_file():
|
||||
logging.info(f"{partition} already exists - skipping.")
|
||||
continue
|
||||
logging.info(f"Processing {partition}")
|
||||
@ -66,13 +70,13 @@ def compute_fbank_yesno():
|
||||
)
|
||||
cut_set = cut_set.compute_and_store_features(
|
||||
extractor=extractor,
|
||||
storage_path=f"{output_dir}/feats_{partition}",
|
||||
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
|
||||
# when an executor is specified, make more partitions
|
||||
num_jobs=num_jobs if ex is None else 1, # use one job
|
||||
executor=ex,
|
||||
storage_type=LilcomHdf5Writer,
|
||||
storage_type=LilcomChunkyWriter,
|
||||
)
|
||||
cut_set.to_json(output_dir / f"cuts_{partition}.json.gz")
|
||||
cut_set.to_file(cuts_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -20,18 +20,19 @@ from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
|
||||
from lhotse.dataset import (
|
||||
CutConcatenate,
|
||||
DynamicBucketingSampler,
|
||||
K2SpeechRecognitionDataset,
|
||||
PrecomputedFeatures,
|
||||
SingleCutSampler,
|
||||
)
|
||||
from lhotse.dataset.input_strategies import OnTheFlyFeatures
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from icefall.dataset.datamodule import DataModule
|
||||
from icefall.utils import str2bool
|
||||
from lhotse import CutSet, Fbank, FbankConfig, load_manifest
|
||||
from lhotse.dataset import (
|
||||
BucketingSampler,
|
||||
CutConcatenate,
|
||||
K2SpeechRecognitionDataset,
|
||||
PrecomputedFeatures,
|
||||
)
|
||||
from lhotse.dataset.input_strategies import OnTheFlyFeatures
|
||||
|
||||
|
||||
class YesNoAsrDataModule(DataModule):
|
||||
@ -84,7 +85,7 @@ class YesNoAsrDataModule(DataModule):
|
||||
"--num-buckets",
|
||||
type=int,
|
||||
default=10,
|
||||
help="The number of buckets for the BucketingSampler"
|
||||
help="The number of buckets for the DynamicBucketingSampler"
|
||||
"(you might want to increase it for larger datasets).",
|
||||
)
|
||||
group.add_argument(
|
||||
@ -186,18 +187,17 @@ class YesNoAsrDataModule(DataModule):
|
||||
)
|
||||
|
||||
if self.args.bucketing_sampler:
|
||||
logging.info("Using BucketingSampler.")
|
||||
train_sampler = BucketingSampler(
|
||||
logging.info("Using DynamicBucketingSampler.")
|
||||
train_sampler = DynamicBucketingSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
num_buckets=self.args.num_buckets,
|
||||
bucket_method="equal_duration",
|
||||
drop_last=True,
|
||||
)
|
||||
else:
|
||||
logging.info("Using SingleCutSampler.")
|
||||
train_sampler = BucketingSampler(
|
||||
train_sampler = SingleCutSampler(
|
||||
cuts_train,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=self.args.shuffle,
|
||||
@ -225,8 +225,10 @@ class YesNoAsrDataModule(DataModule):
|
||||
else PrecomputedFeatures(),
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
sampler = BucketingSampler(
|
||||
cuts_test, max_duration=self.args.max_duration, shuffle=False
|
||||
sampler = DynamicBucketingSampler(
|
||||
cuts_test,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
)
|
||||
logging.debug("About to create test dataloader")
|
||||
test_dl = DataLoader(
|
||||
@ -240,11 +242,15 @@ class YesNoAsrDataModule(DataModule):
|
||||
@lru_cache()
|
||||
def train_cuts(self) -> CutSet:
|
||||
logging.info("About to get train cuts")
|
||||
cuts_train = load_manifest(self.args.feature_dir / "cuts_train.json.gz")
|
||||
cuts_train = load_manifest_lazy(
|
||||
self.args.feature_dir / "yesno_cuts_train.jsonl.gz"
|
||||
)
|
||||
return cuts_train
|
||||
|
||||
@lru_cache()
|
||||
def test_cuts(self) -> List[CutSet]:
|
||||
logging.info("About to get test cuts")
|
||||
cuts_test = load_manifest(self.args.feature_dir / "cuts_test.json.gz")
|
||||
cuts_test = load_manifest_lazy(
|
||||
self.args.feature_dir / "yesno_cuts_test.jsonl.gz"
|
||||
)
|
||||
return cuts_test
|
||||
|
@ -131,7 +131,6 @@ def setup_logger(
|
||||
format=formatter,
|
||||
level=level,
|
||||
filemode="w",
|
||||
force=True,
|
||||
)
|
||||
if use_console:
|
||||
console = logging.StreamHandler()
|
||||
|
Loading…
x
Reference in New Issue
Block a user