# egs/tedlium3/ASR/local/prepare_ted_manifests.py
# Recovered from whitespace-collapsed git patch
# 169ec5c3e664563ac77e2fb760e596776df15274 ("from local", dohe0342, 2023-06-12).

import logging
import shutil
import tarfile
from pathlib import Path
from typing import Dict, Optional, Union

from lhotse import (
    Recording,
    RecordingSet,
    SupervisionSegment,
    SupervisionSet,
    validate_recordings_and_supervisions,
)
from lhotse.utils import Pathlike, safe_extract, urlretrieve_progress


def prepare_tedlium(
    tedlium_root: Pathlike, output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Prepare manifests for the TED-LIUM v3 corpus.

    The manifests are created in a dict with three splits: train, dev and test.
    Each split contains a RecordingSet and SupervisionSet in a dict under keys
    'recordings' and 'supervisions'.

    For every split, audio is read from ``<tedlium_root>/legacy/<split>/sph/*.sph``
    and transcripts from ``<tedlium_root>/legacy/<split>/stm/*.stm``. Each STM
    line is split on whitespace: the first field is used as the recording id
    (and as the speaker), the 4th and 5th fields as start/end times in seconds,
    and everything from the 7th field on as the transcript. Blank lines and
    ``;;`` comment lines are skipped. Segments whose transcript is exactly
    ``ignore_time_segment_in_scoring`` are dropped, and ``{NOISE}`` is rewritten
    to ``[NOISE]``.

    :param tedlium_root: Path to the unpacked TED-LIUM data.
    :param output_dir: Optional directory where the manifests are written as
        ``tedlium_recordings_<split>.jsonl.gz`` and
        ``tedlium_supervisions_<split>.jsonl.gz``. It is created if missing.
        When ``None``, nothing is written to disk.
    :return: A dict with standard corpus splits containing the manifests.
    :raises AssertionError: If the number of sphere files and STM files in a
        split differ.
    """
    tedlium_root = Path(tedlium_root)
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Create the target up front so a missing directory does not make the
        # run fail only after a whole split has already been processed.
        output_dir.mkdir(parents=True, exist_ok=True)

    corpus = {}
    for split in ("train", "dev", "test"):
        root = tedlium_root / "legacy" / split
        # glob() order is filesystem-dependent; sort for reproducible manifests.
        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in sorted((root / "sph").glob("*.sph"))
        )
        stms = sorted((root / "stm").glob("*.stm"))
        assert len(stms) == len(recordings), (
            f"Mismatch: found {len(recordings)} "
            f"sphere files and {len(stms)} STM files. "
            f"You might be missing some parts of TEDLIUM..."
        )

        segments = []
        for stm_path in stms:
            with stm_path.open() as f:
                # idx counts every physical line (including skipped ones), so
                # segment ids keep the same numbering as before.
                for idx, line in enumerate(f):
                    fields = line.split()
                    if not fields or fields[0].startswith(";;"):
                        # Blank line or STM comment: nothing to parse.
                        continue
                    rec_id, _, _, start, end, _, *words = fields
                    start, end = float(start), float(end)
                    text = " ".join(words).replace("{NOISE}", "[NOISE]")
                    if text == "ignore_time_segment_in_scoring":
                        continue
                    segments.append(
                        SupervisionSegment(
                            id=f"{rec_id}-{idx}",
                            recording_id=rec_id,
                            start=start,
                            # Round away float subtraction noise in the duration.
                            duration=round(end - start, ndigits=8),
                            channel=0,
                            text=text,
                            language="English",
                            speaker=rec_id,
                        )
                    )

        supervisions = SupervisionSet.from_segments(segments)
        corpus[split] = {"recordings": recordings, "supervisions": supervisions}

        validate_recordings_and_supervisions(**corpus[split])

        if output_dir is not None:
            recordings.to_file(output_dir / f"tedlium_recordings_{split}.jsonl.gz")
            supervisions.to_file(output_dir / f"tedlium_supervisions_{split}.jsonl.gz")

    return corpus