update training data maximum duration

This commit is contained in:
Mingshuang Luo 2022-02-25 11:40:57 +08:00
parent e752c0440d
commit b5f408689e
3 changed files with 10 additions and 7 deletions

View File

@ -6,6 +6,8 @@ per-file-ignores =
# line too long # line too long
egs/librispeech/ASR/*/conformer.py: E501, egs/librispeech/ASR/*/conformer.py: E501,
egs/aishell/ASR/*/conformer.py: E501, egs/aishell/ASR/*/conformer.py: E501,
egs/tedlium3/ASR/*/conformer.py: E501,
egs/tedlium3/ASR/local/display_manifest_statistics.py: E501,
exclude = exclude =
.git, .git,

View File

@ -1,5 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang) # Copyright 2021 Xiaomi Corp. (authors: Fangjun Kuang,
# Mingshuang Luo)
# #
# See ../../../../LICENSE for clarification regarding multiple authors # See ../../../../LICENSE for clarification regarding multiple authors
# #
@ -31,9 +32,10 @@ import torch
from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer from lhotse import CutSet, Fbank, FbankConfig, LilcomHdf5Writer
from lhotse.recipes.utils import read_manifests_if_cached from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
# from utils import read_manifests_if_cached # from utils import read_manifests_if_cached
from icefall.utils import get_executor
# Torch's multithreaded behavior needs to be disabled or # Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down. # it wastes a lot of CPU and slows things down.
@ -71,7 +73,7 @@ def compute_fbank_tedlium():
cut_set = CutSet.from_manifests( cut_set = CutSet.from_manifests(
recordings=m["recordings"], recordings=m["recordings"],
supervisions=m["supervisions"], supervisions=m["supervisions"],
) ).trim_to_supervisions(keep_overlapping=False)
if "train" in partition: if "train" in partition:
cut_set = ( cut_set = (
cut_set cut_set

View File

@ -31,8 +31,9 @@ from lhotse import load_manifest
def describe(cuts) -> None: def describe(cuts) -> None:
""" """
Print a message describing details about the ``CutSet`` - the number of cuts and the Print a message describing details about the ``CutSet`` - the number
duration statistics, including the total duration and the percentage of speech segments. of cuts and the duration statistics, including the total duration
and the percentage of speech segments.
Example output: Example output:
Cuts count: 804789 Cuts count: 804789
@ -49,8 +50,6 @@ def describe(cuts) -> None:
99.5% 14.9 99.5% 14.9
99.9% 16.6 99.9% 16.6
max 33.3 max 33.3
In the above example, we set 15(>14.9) as the maximum duration of training samples.
""" """
durations = np.array([c.duration for c in cuts]) durations = np.array([c.duration for c in cuts])
speech_durations = np.array( speech_durations = np.array(