mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-27 02:34:21 +00:00
69 lines
2.6 KiB
Python
69 lines
2.6 KiB
Python
from collections import defaultdict
|
|
from typing import Dict, Iterable, Optional, Sequence, Union
|
|
|
|
from lhotse import load_manifest
|
|
from lhotse.audio import RecordingSet
|
|
from lhotse.supervision import SupervisionSet
|
|
from lhotse.utils import Pathlike
|
|
|
|
# Manifest kinds probed by default; used as the default value of the ``types``
# argument of the manifest lookup helpers defined in this module.
DEFAULT_DETECTED_MANIFEST_TYPES = ("recordings", "supervisions")
|
|
|
|
|
|
def read_manifests_if_cached(
    dataset_parts: Optional[Sequence[str]],
    output_dir: Optional[Pathlike],
    prefix: str = "",
    suffix: Optional[str] = "json",
    types: Iterable[str] = DEFAULT_DETECTED_MANIFEST_TYPES,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Load manifests from disk, or the subset of them that actually exists.

    The manifests are searched for using the pattern
    ``output_dir / f'{prefix}_{manifest}_{part}.{suffix}'``, where ``manifest``
    is one of ``types`` and ``part`` is one of ``dataset_parts``.
    This function is intended to speed up data preparation when it has already
    been done before.

    :param dataset_parts: Names of dataset pieces, e.g. in LibriSpeech:
        ``["test-clean", "dev-clean", ...]``. ``None`` yields an empty result.
    :param output_dir: Where to look for the files (``str`` or ``Path``).
        ``None`` yields an empty result.
    :param prefix: Optional common prefix for the manifest files (a trailing
        underscore is added automatically when missing).
    :param suffix: Optional common suffix (file extension) for the manifest
        files; a single leading dot is dropped. ``None`` falls back to
        ``"json"``.
    :param types: Which types of manifests are searched for
        (default: 'recordings' and 'supervisions').
    :return: A dict ``d[dataset_part][manifest_type]`` holding every manifest
        that was found. Parts with no manifest on disk are absent, so the dict
        is empty when nothing was found.
    """
    # Local import: keeps this function self-contained without touching the
    # module-level import block.
    from pathlib import Path

    if output_dir is None or dataset_parts is None:
        return {}
    if prefix and not prefix.endswith("_"):
        prefix = f"{prefix}_"
    if suffix is None:
        suffix = "json"
    elif suffix.startswith("."):
        suffix = suffix[1:]

    # ``Pathlike`` admits plain strings, which do not support the ``/`` operator.
    output_dir = Path(output_dir)
    # ``types`` is re-iterated for every part; materialize it so that a
    # one-shot iterator is not exhausted after the first part.
    types = tuple(types)

    manifests = defaultdict(dict)
    for part in dataset_parts:
        for manifest in types:
            path = output_dir / f"{prefix}{manifest}_{part}.{suffix}"
            if path.is_file():
                manifests[part][manifest] = load_manifest(path)
    # Return a plain dict so that looking up a missing part raises KeyError
    # instead of silently creating an empty entry.
    return dict(manifests)
|
|
|
|
|
|
def manifests_exist(
    part: str,
    output_dir: Optional[Pathlike],
    types: Iterable[str] = DEFAULT_DETECTED_MANIFEST_TYPES,
    prefix: str = "",
    suffix: str = "json",
) -> bool:
    """
    Check whether every requested manifest file of a dataset part is on disk.

    Files are looked up using the pattern
    ``output_dir / f'{prefix}_{name}_{part}.{suffix}'`` for each ``name`` in
    ``types``.

    :param part: Name of the dataset piece, e.g. ``"dev-clean"``.
    :param output_dir: Where to look for the files (``str`` or ``Path``).
        ``None`` means nothing is cached, so the result is ``False``.
    :param types: Which types of manifests must be present
        (default: 'recordings' and 'supervisions').
    :param prefix: Optional common prefix for the manifest files (a trailing
        underscore is added automatically when missing).
    :param suffix: Common suffix (file extension) for the manifest files;
        a single leading dot is dropped.
    :return: ``True`` only if a file exists for every entry of ``types``
        (vacuously ``True`` when ``types`` is empty), ``False`` otherwise.
    """
    # Local import: keeps this function self-contained without touching the
    # module-level import block.
    from pathlib import Path

    if output_dir is None:
        return False
    if prefix and not prefix.endswith("_"):
        prefix = f"{prefix}_"
    if suffix.startswith("."):
        suffix = suffix[1:]

    # ``Pathlike`` admits plain strings, which do not support the ``/`` operator.
    output_dir = Path(output_dir)
    return all(
        (output_dir / f"{prefix}{name}_{part}.{suffix}").is_file()
        for name in types
    )
|