From 169ec5c3e664563ac77e2fb760e596776df15274 Mon Sep 17 00:00:00 2001
From: dohe0342 <kimdohe1070@gmail.com>
Date: Mon, 12 Jun 2023 00:09:05 +0900
Subject: [PATCH] from local

---
 .../ASR/local/.prepare_ted_manifests.py.swp   | Bin 4096 -> 0 bytes
 .../ASR/local/prepare_ted_manifests.py        |  73 ++++++++++++++++++
 2 files changed, 73 insertions(+)
 delete mode 100644 egs/tedlium3/ASR/local/.prepare_ted_manifests.py.swp
 create mode 100644 egs/tedlium3/ASR/local/prepare_ted_manifests.py
diff --git a/egs/tedlium3/ASR/local/.prepare_ted_manifests.py.swp b/egs/tedlium3/ASR/local/.prepare_ted_manifests.py.swp
deleted file mode 100644
index b751452e6955b2c00a8189a3a63b28e144e25463..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4096
zcmYc?2=nw+u+%eP00IFJ0Rf#C85mOXGg1wVO^je-xM&xky42E4UEjoHr~LfvfTDc8
zoc!d(9CY<{Fa!0=^NX^J3lfu4^)r)G(-L!X^i$J|^-EGyaxzPEjrARagY>~B=ob{F
z79<v>#sj6|a})D2(^88|iuDRA(TyJEkA}c#2oMYbUIt?$LvU7CR#H?D778V(V^qOt
f2#kinXb6mkz-S1JhQMeDjE2By2#kin@Cg9`@=`3r

diff --git a/egs/tedlium3/ASR/local/prepare_ted_manifests.py b/egs/tedlium3/ASR/local/prepare_ted_manifests.py
new file mode 100644
index 000000000..e03784d69
--- /dev/null
+++ b/egs/tedlium3/ASR/local/prepare_ted_manifests.py
@@ -0,0 +1,111 @@
+import logging
+import shutil
+import tarfile
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+from lhotse import (
+    Recording,
+    RecordingSet,
+    SupervisionSegment,
+    SupervisionSet,
+    validate_recordings_and_supervisions,
+)
+from lhotse.utils import Pathlike, safe_extract, urlretrieve_progress
+
+
+def _parse_stm_file(stm_path: Path) -> List[SupervisionSegment]:
+    """
+    Parse one STM transcript file into supervision segments.
+
+    Each kept line yields a segment whose id is ``<rec_id>-<line index>``.
+
+    :param stm_path: Path to a single ``.stm`` file.
+    :return: The segments found in the file, in file order.
+    :raises ValueError: If a non-blank, non-comment line has fewer than six
+        fields or non-numeric start/end times.
+    """
+    segments = []
+    # Decode explicitly so parsing does not depend on the locale's default.
+    with stm_path.open(encoding="utf-8") as f:
+        for idx, line in enumerate(f):
+            fields = line.split()
+            # Blank lines and ";;" comment lines carry no segment. Skipping them
+            # avoids an unpacking error, and because ``idx`` still counts every
+            # physical line the ids of the remaining segments are unaffected.
+            if not fields or fields[0].startswith(";;"):
+                continue
+            rec_id, _, _, start, end, _, *words = fields
+            start, end = float(start), float(end)
+            text = " ".join(words).replace("{NOISE}", "[NOISE]")
+            # Drop placeholder segments whose whole text is the ignore marker.
+            if text == "ignore_time_segment_in_scoring":
+                continue
+            segments.append(
+                SupervisionSegment(
+                    id=f"{rec_id}-{idx}",
+                    recording_id=rec_id,
+                    start=start,
+                    # Round to 8 decimals to suppress float noise from the subtraction.
+                    duration=round(end - start, ndigits=8),
+                    channel=0,
+                    text=text,
+                    language="English",
+                    speaker=rec_id,
+                )
+            )
+    return segments
+
+
+def prepare_tedlium(
+    tedlium_root: Pathlike, output_dir: Optional[Pathlike] = None
+) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
+    """
+    Prepare manifests for the TED-LIUM v3 corpus.
+
+    The manifests are created in a dict with three splits: train, dev and test.
+    Each split contains a RecordingSet and SupervisionSet in a dict under keys
+    'recordings' and 'supervisions'.
+
+    :param tedlium_root: Path to the unpacked TED-LIUM data. Each split is read
+        from ``<tedlium_root>/legacy/<split>/sph`` and ``.../stm``.
+    :param output_dir: Optional directory for the manifests, written as
+        ``tedlium_recordings_<split>.jsonl.gz`` and
+        ``tedlium_supervisions_<split>.jsonl.gz``. Created if missing; when
+        ``None`` nothing is written to disk.
+    :return: A dict with standard corpus splits containing the manifests.
+    :raises AssertionError: If a split has a different number of ``.sph`` and
+        ``.stm`` files.
+    """
+    tedlium_root = Path(tedlium_root)
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        # Create the target up front so the manifest writes below do not depend
+        # on the caller having created the directory.
+        output_dir.mkdir(parents=True, exist_ok=True)
+    corpus = {}
+    for split in ("train", "dev", "test"):
+        root = tedlium_root / "legacy" / split
+        # Glob order is filesystem-dependent; sort for reproducible manifests.
+        recordings = RecordingSet.from_recordings(
+            Recording.from_file(p) for p in sorted((root / "sph").glob("*.sph"))
+        )
+        stms = sorted((root / "stm").glob("*.stm"))
+        assert len(stms) == len(recordings), (
+            f"Mismatch: found {len(recordings)} "
+            f"sphere files and {len(stms)} STM files. "
+            f"You might be missing some parts of TEDLIUM..."
+        )
+        segments = []
+        for stm_path in stms:
+            segments.extend(_parse_stm_file(stm_path))
+        supervisions = SupervisionSet.from_segments(segments)
+        corpus[split] = {"recordings": recordings, "supervisions": supervisions}
+
+        validate_recordings_and_supervisions(**corpus[split])
+
+        if output_dir is not None:
+            recordings.to_file(output_dir / f"tedlium_recordings_{split}.jsonl.gz")
+            supervisions.to_file(output_dir / f"tedlium_supervisions_{split}.jsonl.gz")
+
+    return corpus