2022-01-21 17:47:57 +08:00

69 lines
2.6 KiB
Python

from collections import defaultdict
from typing import Dict, Iterable, Optional, Sequence, Union
from lhotse import load_manifest
from lhotse.audio import RecordingSet
from lhotse.supervision import SupervisionSet
from lhotse.utils import Pathlike
DEFAULT_DETECTED_MANIFEST_TYPES = ("recordings", "supervisions")
def read_manifests_if_cached(
dataset_parts: Optional[Sequence[str]],
output_dir: Optional[Pathlike],
prefix: str = "",
suffix: Optional[str] = "json",
types: Iterable[str] = DEFAULT_DETECTED_MANIFEST_TYPES,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
"""
Loads manifests from the disk, or a subset of them if only some exist.
The manifests are searched for using the pattern ``output_dir / f'{prefix}_{manifest}_{part}.json'``,
where `manifest` is one of ``["recordings", "supervisions"]`` and ``part`` is specified in ``dataset_parts``.
This function is intended to speedup data preparation if it has already been done before.
:param dataset_parts: Names of dataset pieces, e.g. in LibriSpeech: ``["test-clean", "dev-clean", ...]``.
:param output_dir: Where to look for the files.
:param prefix: Optional common prefix for the manifest files (underscore is automatically added).
:param suffix: Optional common suffix for the manifest files ("json" by default).
:param types: Which types of manifests are searched for (default: 'recordings' and 'supervisions').
:return: A dict with manifest (``d[dataset_part]['recording'|'manifest']``) or ``None``.
"""
print("using this file....")
if output_dir is None:
return {}
if prefix and not prefix.endswith("_"):
prefix = f"{prefix}_"
if suffix.startswith("."):
suffix = suffix[1:]
manifests = defaultdict(dict)
for part in dataset_parts:
print(part)
for manifest in types:
path = output_dir / f"{prefix}{manifest}_{part}.{suffix}"
if not path.is_file():
continue
# print(path)
manifests[part][manifest] = load_manifest(path)
return dict(manifests)
def manifests_exist(
part: str,
output_dir: Optional[Pathlike],
types: Iterable[str] = DEFAULT_DETECTED_MANIFEST_TYPES,
prefix: str = "",
suffix: str = "json",
) -> bool:
if output_dir is None:
return False
if prefix and not prefix.endswith("_"):
prefix = f"{prefix}_"
if suffix.startswith("."):
suffix = suffix[1:]
for name in types:
path = output_dir / f"{prefix}{name}_{part}.{suffix}"
if not path.is_file():
return False
return True