from local

2023-01-27 16:19:43 +09:00 · 2023-01-27 16:19:43 +09:00 · 9536f6e4f7
commit 9536f6e4f7
parent f87fb7e0fe
3 changed files with 393 additions and 0 deletions
--- a/egs/librispeech/ASR/local/prepare_LJSpeech.py
+++ b/egs/librispeech/ASR/local/prepare_LJSpeech.py
@ -0,0 +1,179 @@
+import logging
+import sys
+import os
+import re
+import shutil
+import tarfile
+import zipfile
+from concurrent.futures.thread import ThreadPoolExecutor
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+from tqdm.auto import tqdm
+
+from lhotse import validate_recordings_and_supervisions
+from lhotse.audio import Recording, RecordingSet
+from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
+from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
+from lhotse.utils import (
+    Pathlike,
+    is_module_available,
+    safe_extract,
+    urlretrieve_progress,
+)
+
+# LIBRISPEECH_ALIGNMENTS_URL = (
+#     "https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
+# )
+
+def prepare_LJSpeech(
+    corpus_dir: str,
+    dataset_parts: str = "auto",
+    output_dir: str = None,
+    num_jobs: int = 1,
+) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
+    """
+    Returns the manifests which consist of the Recordings and Supervisions.
+    When all the manifests are available in the ``output_dir``, it will simply read and return them.
+    :param corpus_dir: Pathlike, the path of the data dir.
+    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
+        By default we will infer which parts are available in ``corpus_dir``.
+    :param output_dir: Pathlike, the path where to write the manifests.
+    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
+    """
+
+    assert os.path.exists(corpus_dir), f"{corpus_dir} does not exist"
+
+    # wav_dir = Path(corpus_dir + "/wavs")
+    # wavs = os.listdir(wav_dir)
+
+    # text_dir = Path(corpus_dir + "/wavs")
+    # texts = os.listdir(text_dir)
+
+    # wavs_parts = (
+    #     set(wavs)
+    # )
+    # books_parts = (
+    #     set(texts)
+    # )
+
+    manifests = {}
+
+    dataset_parts = ["train", "dev", "test"]
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    import glob
+
+    futures = []
+    for part in tqdm(dataset_parts, desc="Dataset parts"):
+        logging.info(f"Processing LJSpeech subset: {part}")
+        if manifests_exist(part=part, output_dir=output_dir):
+            logging.info(f"LJSpeech subset: {part} already prepared - skipping.")
+            continue
+        recordings = []
+        supervisions = []
+        part_path = Path(os.path.join(corpus_dir, "wavs", part))
+        part_file_names = list(map(lambda x: x.strip('.wav'),os.listdir(part_path))) 
+        txt_path = os.path.join(corpus_dir, "texts")
+        futures = []
+        
+        for trans_path in tqdm(
+            glob.iglob(str(txt_path) + "/*.txt"), desc="Distributing tasks", leave=False
+        ):
+            alignments = {}
+            with open(trans_path) as f:
+                cur_file_name = trans_path.split('/')[-1].replace('.txt', '')
+                if cur_file_name not in part_file_names:
+                    continue
+                for line in f:
+                    futures.append(
+                        parse_utterance(part_path, trans_path + ' ' + line, alignments)
+                    )
+
+        for future in tqdm(futures, desc="Processing", leave=False):
+            result = future
+            if result is None:
+                continue
+            recording, segment = result
+            recordings.append(recording)
+            supervisions.append(segment)
+
+        recording_set = RecordingSet.from_recordings(recordings)
+        supervision_set = SupervisionSet.from_segments(supervisions)
+
+        validate_recordings_and_supervisions(recording_set, supervision_set)
+
+        if output_dir is not None:
+            supervision_set.to_file(
+                output_dir / f"LJSpeech_supervisions_{part}.jsonl.gz"
+            )
+            recording_set.to_file(
+                output_dir / f"LJSpeech_recordings_{part}.jsonl.gz"
+            )
+
+        manifests[part] = {
+            "recordings": recording_set,
+            "supervisions": supervision_set,
+        }
+
+    return manifests
+
+
+def parse_utterance(
+    dataset_split_path: Path,
+    line: str,
+    alignments: Dict[str, List[AlignmentItem]],
+) -> Optional[Tuple[Recording, SupervisionSegment]]:
+    recording_id, text = line.strip().split(maxsplit=1)
+    recording_id = recording_id.split('/')[-1].split('.txt')[0]
+
+    # Create the Recording first
+    audio_path = (
+        dataset_split_path / f"{recording_id}.wav"
+    )
+
+    if not os.path.exists(audio_path):
+        logging.warning(f"No such file: {audio_path}")
+        return None
+    recording = Recording.from_file(audio_path, recording_id=recording_id)
+    # Then, create the corresponding supervisions
+    segment = SupervisionSegment(
+        id=recording_id,
+        recording_id=recording_id,
+        start=0.0,
+        duration=recording.duration,
+        channel=0,
+        language="English",
+        speaker=re.sub(r"-.*", r"", recording.id),
+        text=text.strip(),
+        alignment={"word": alignments[recording_id]}
+        if recording_id in alignments
+        else None,
+    )
+    return recording, segment
+
+
+def parse_alignments(ali_path: Pathlike) -> Dict[str, List[AlignmentItem]]:
+    alignments = {}
+    for line in Path(ali_path).read_text().splitlines():
+        utt_id, words, timestamps = line.split()
+        words = words.replace('"', "").split(",")
+        timestamps = [0.0] + list(map(float, timestamps.replace('"', "").split(",")))
+        alignments[utt_id] = [
+            AlignmentItem(
+                symbol=word, start=start, duration=round(end - start, ndigits=8)
+            )
+            for word, start, end in zip(words, timestamps, timestamps[1:])
+        ]
+    return alignments
+
+def main(corpus_dir):
+    nj = 15
+    output_dir = "data/manifests"
+
+    prepare_LJSpeech(corpus_dir, "auto", output_dir, nj)
+
+corpus_dir = sys.argv[1]
+main(corpus_dir)
--- a/egs/librispeech/ASR/local/prepare_LJSpeech_pseudo.py
+++ b/egs/librispeech/ASR/local/prepare_LJSpeech_pseudo.py
@ -0,0 +1,179 @@
+import logging
+import sys
+import os
+import re
+import shutil
+import tarfile
+import zipfile
+from concurrent.futures.thread import ThreadPoolExecutor
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+from tqdm.auto import tqdm
+
+from lhotse import validate_recordings_and_supervisions
+from lhotse.audio import Recording, RecordingSet
+from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
+from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
+from lhotse.utils import (
+    Pathlike,
+    is_module_available,
+    safe_extract,
+    urlretrieve_progress,
+)
+
+# LIBRISPEECH_ALIGNMENTS_URL = (
+#     "https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
+# )
+
+def prepare_LJSpeech(
+    corpus_dir: str,
+    dataset_parts: str = "auto",
+    output_dir: str = None,
+    num_jobs: int = 1,
+) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
+    """
+    Returns the manifests which consist of the Recordings and Supervisions.
+    When all the manifests are available in the ``output_dir``, it will simply read and return them.
+    :param corpus_dir: Pathlike, the path of the data dir.
+    :param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
+        By default we will infer which parts are available in ``corpus_dir``.
+    :param output_dir: Pathlike, the path where to write the manifests.
+    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
+    """
+
+    assert os.path.exists(corpus_dir), f"{corpus_dir} does not exist"
+
+    # wav_dir = Path(corpus_dir + "/wavs")
+    # wavs = os.listdir(wav_dir)
+
+    # text_dir = Path(corpus_dir + "/wavs")
+    # texts = os.listdir(text_dir)
+
+    # wavs_parts = (
+    #     set(wavs)
+    # )
+    # books_parts = (
+    #     set(texts)
+    # )
+
+    manifests = {}
+
+    dataset_parts = ["train", "dev", "test"]
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    import glob
+
+    futures = []
+    for part in tqdm(dataset_parts, desc="Dataset parts"):
+        logging.info(f"Processing LJSpeech subset: {part}")
+        if manifests_exist(part=part, output_dir=output_dir):
+            logging.info(f"LJSpeech subset: {part} already prepared - skipping.")
+            continue
+        recordings = []
+        supervisions = []
+        part_path = Path(os.path.join(corpus_dir, "wavs", part))
+        part_file_names = list(map(lambda x: x.strip('.wav'),os.listdir(part_path))) 
+        txt_path = os.path.join(corpus_dir, "texts")
+        futures = []
+        
+        for trans_path in tqdm(
+            glob.iglob(str(txt_path) + "/*.txt"), desc="Distributing tasks", leave=False
+        ):
+            alignments = {}
+            with open(trans_path) as f:
+                cur_file_name = trans_path.split('/')[-1].replace('.txt', '')
+                if cur_file_name not in part_file_names:
+                    continue
+                for line in f:
+                    futures.append(
+                        parse_utterance(part_path, trans_path + ' ' + line, alignments)
+                    )
+
+        for future in tqdm(futures, desc="Processing", leave=False):
+            result = future
+            if result is None:
+                continue
+            recording, segment = result
+            recordings.append(recording)
+            supervisions.append(segment)
+
+        recording_set = RecordingSet.from_recordings(recordings)
+        supervision_set = SupervisionSet.from_segments(supervisions)
+
+        validate_recordings_and_supervisions(recording_set, supervision_set)
+
+        if output_dir is not None:
+            supervision_set.to_file(
+                output_dir / f"LJSpeech_pseudo_supervisions_{part}.jsonl.gz"
+            )
+            recording_set.to_file(
+                output_dir / f"LJSpeech_pseudo_recordings_{part}.jsonl.gz"
+            )
+
+        manifests[part] = {
+            "recordings": recording_set,
+            "supervisions": supervision_set,
+        }
+
+    return manifests
+
+
+def parse_utterance(
+    dataset_split_path: Path,
+    line: str,
+    alignments: Dict[str, List[AlignmentItem]],
+) -> Optional[Tuple[Recording, SupervisionSegment]]:
+    recording_id, text = line.strip().split(maxsplit=1)
+    recording_id = recording_id.split('/')[-1].split('.txt')[0]
+
+    # Create the Recording first
+    audio_path = (
+        dataset_split_path / f"{recording_id}.wav"
+    )
+
+    if not os.path.exists(audio_path):
+        logging.warning(f"No such file: {audio_path}")
+        return None
+    recording = Recording.from_file(audio_path, recording_id=recording_id)
+    # Then, create the corresponding supervisions
+    segment = SupervisionSegment(
+        id=recording_id,
+        recording_id=recording_id,
+        start=0.0,
+        duration=recording.duration,
+        channel=0,
+        language="English",
+        speaker=re.sub(r"-.*", r"", recording.id),
+        text=text.strip(),
+        alignment={"word": alignments[recording_id]}
+        if recording_id in alignments
+        else None,
+    )
+    return recording, segment
+
+
+def parse_alignments(ali_path: Pathlike) -> Dict[str, List[AlignmentItem]]:
+    alignments = {}
+    for line in Path(ali_path).read_text().splitlines():
+        utt_id, words, timestamps = line.split()
+        words = words.replace('"', "").split(",")
+        timestamps = [0.0] + list(map(float, timestamps.replace('"', "").split(",")))
+        alignments[utt_id] = [
+            AlignmentItem(
+                symbol=word, start=start, duration=round(end - start, ndigits=8)
+            )
+            for word, start, end in zip(words, timestamps, timestamps[1:])
+        ]
+    return alignments
+
+def main(corpus_dir):
+    nj = 15
+    output_dir = "data/manifests"
+
+    prepare_LJSpeech(corpus_dir, "auto", output_dir, nj)
+
+corpus_dir = sys.argv[1]
+main(corpus_dir)
--- a/egs/librispeech/ASR/local/prepare_LJSpeech_text.py
+++ b/egs/librispeech/ASR/local/prepare_LJSpeech_text.py
@ -0,0 +1,35 @@
+import os
+import sys
+import re
+
+metafile = sys.argv[1]
+outdir = "texts"
+save_dir = "/".join(metafile.split('/')[:-1])
+save_dir = os.path.join(save_dir, outdir)
+
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+
+with open(metafile, 'r') as f:
+    strings = f.readlines()
+
+for string in strings:
+
+    # Split the string into parts
+    parts = string.split("|")
+
+    # Assign the parts to variables
+    filename = parts[0]
+    text1 = parts[1]
+    try:
+        text2 = parts[2]
+    except:
+        text2 = text1
+    
+    text2 = text2.upper()
+    text2 = re.sub(r"[^A-Z ']", "", text2)
+    
+    # Create a new text file with the filename and write text2 to it
+    filename = os.path.join(save_dir, filename)
+    with open(f"{filename}.txt", "w") as file:
+        file.write(text2)