from local

dohe0342 2023-01-27 16:19:43 +09:00
parent f87fb7e0fe
commit 9536f6e4f7
3 changed files with 393 additions and 0 deletions

View File

@@ -0,0 +1,179 @@
import logging
import sys
import os
import re
import shutil
import tarfile
import zipfile
from concurrent.futures.thread import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple, Union
from tqdm.auto import tqdm
from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
from lhotse.utils import (
    Pathlike,
    is_module_available,
    safe_extract,
    urlretrieve_progress,
)
# LIBRISPEECH_ALIGNMENTS_URL = (
# "https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
# )
def prepare_LJSpeech(
    corpus_dir: str,
    dataset_parts: str = "auto",
    output_dir: Optional[str] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
"""
Returns the manifests which consist of the Recordings and Supervisions.
When all the manifests are available in the ``output_dir``, it will simply read and return them.
:param corpus_dir: Pathlike, the path of the data dir.
:param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
By default we will infer which parts are available in ``corpus_dir``.
:param output_dir: Pathlike, the path where to write the manifests.
:return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
"""
    assert os.path.exists(corpus_dir), f"{corpus_dir} does not exist"
    manifests = {}
    dataset_parts = ["train", "dev", "test"]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    import glob

    for part in tqdm(dataset_parts, desc="Dataset parts"):
        logging.info(f"Processing LJSpeech subset: {part}")
        if manifests_exist(part=part, output_dir=output_dir):
            logging.info(f"LJSpeech subset: {part} already prepared - skipping.")
            continue

        recordings = []
        supervisions = []
        part_path = Path(os.path.join(corpus_dir, "wavs", part))
        # Wav file names of this split without the ".wav" extension
        # (Path(...).stem avoids the str.strip('.wav') pitfall of stripping characters).
        part_file_names = [Path(x).stem for x in os.listdir(part_path)]
        txt_path = os.path.join(corpus_dir, "texts")

        # Utterances are parsed sequentially; the name "futures" is kept from the
        # multi-threaded recipe this script is based on.
        futures = []
        for trans_path in tqdm(
            glob.iglob(str(txt_path) + "/*.txt"), desc="Distributing tasks", leave=False
        ):
            alignments = {}  # no word alignments are supplied here
            with open(trans_path) as f:
                cur_file_name = trans_path.split("/")[-1].replace(".txt", "")
                if cur_file_name not in part_file_names:
                    continue
                for line in f:
                    # Synthesize a "<txt path> <transcript>" line so parse_utterance
                    # can recover the recording id from the path.
                    futures.append(
                        parse_utterance(part_path, trans_path + " " + line, alignments)
                    )

        for future in tqdm(futures, desc="Processing", leave=False):
            result = future
            if result is None:
                continue
            recording, segment = result
            recordings.append(recording)
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"LJSpeech_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(
                output_dir / f"LJSpeech_recordings_{part}.jsonl.gz"
            )

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
def parse_utterance(
    dataset_split_path: Path,
    line: str,
    alignments: Dict[str, List[AlignmentItem]],
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    # ``line`` is "<path/to/utt.txt> <transcript>"; the first field yields the recording id.
    recording_id, text = line.strip().split(maxsplit=1)
    recording_id = recording_id.split("/")[-1].split(".txt")[0]

    # Create the Recording first.
    audio_path = dataset_split_path / f"{recording_id}.wav"
    if not os.path.exists(audio_path):
        logging.warning(f"No such file: {audio_path}")
        return None
    recording = Recording.from_file(audio_path, recording_id=recording_id)

    # Then, create the corresponding supervision segment.
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        # Everything before the first "-" in the recording id is treated as the speaker.
        speaker=re.sub(r"-.*", r"", recording.id),
        text=text.strip(),
        alignment={"word": alignments[recording_id]}
        if recording_id in alignments
        else None,
    )
    return recording, segment
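# Note: parse_alignments below is not called in this script; the alignment line
# layout is inferred from its parsing code (quoted, comma-separated fields), e.g.:
#   <utt_id> "<word1>,<word2>,..." "<end_time1>,<end_time2>,..."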
def parse_alignments(ali_path: Pathlike) -> Dict[str, List[AlignmentItem]]:
    alignments = {}
    for line in Path(ali_path).read_text().splitlines():
        utt_id, words, timestamps = line.split()
        words = words.replace('"', "").split(",")
        timestamps = [0.0] + list(map(float, timestamps.replace('"', "").split(",")))
        alignments[utt_id] = [
            AlignmentItem(
                symbol=word, start=start, duration=round(end - start, ndigits=8)
            )
            for word, start, end in zip(words, timestamps, timestamps[1:])
        ]
    return alignments
def main(corpus_dir):
    nj = 15
    output_dir = "data/manifests"
    prepare_LJSpeech(corpus_dir, "auto", output_dir, nj)


if __name__ == "__main__":
    corpus_dir = sys.argv[1]
    main(corpus_dir)

View File

@@ -0,0 +1,179 @@
import logging
import sys
import os
import re
import shutil
import tarfile
import zipfile
from concurrent.futures.thread import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Tuple, Union
from tqdm.auto import tqdm
from lhotse import validate_recordings_and_supervisions
from lhotse.audio import Recording, RecordingSet
from lhotse.recipes.utils import manifests_exist, read_manifests_if_cached
from lhotse.supervision import AlignmentItem, SupervisionSegment, SupervisionSet
from lhotse.utils import (
    Pathlike,
    is_module_available,
    safe_extract,
    urlretrieve_progress,
)
# LIBRISPEECH_ALIGNMENTS_URL = (
# "https://drive.google.com/uc?id=1WYfgr31T-PPwMcxuAq09XZfHQO5Mw8fE"
# )
def prepare_LJSpeech(
    corpus_dir: str,
    dataset_parts: str = "auto",
    output_dir: Optional[str] = None,
    num_jobs: int = 1,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
"""
Returns the manifests which consist of the Recordings and Supervisions.
When all the manifests are available in the ``output_dir``, it will simply read and return them.
:param corpus_dir: Pathlike, the path of the data dir.
:param dataset_parts: string or sequence of strings representing dataset part names, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'.
By default we will infer which parts are available in ``corpus_dir``.
:param output_dir: Pathlike, the path where to write the manifests.
:return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
"""
    assert os.path.exists(corpus_dir), f"{corpus_dir} does not exist"
    manifests = {}
    dataset_parts = ["train", "dev", "test"]

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    import glob

    for part in tqdm(dataset_parts, desc="Dataset parts"):
        logging.info(f"Processing LJSpeech subset: {part}")
        if manifests_exist(part=part, output_dir=output_dir):
            logging.info(f"LJSpeech subset: {part} already prepared - skipping.")
            continue

        recordings = []
        supervisions = []
        part_path = Path(os.path.join(corpus_dir, "wavs", part))
        # Wav file names of this split without the ".wav" extension
        # (Path(...).stem avoids the str.strip('.wav') pitfall of stripping characters).
        part_file_names = [Path(x).stem for x in os.listdir(part_path)]
        txt_path = os.path.join(corpus_dir, "texts")

        # Utterances are parsed sequentially; the name "futures" is kept from the
        # multi-threaded recipe this script is based on.
        futures = []
        for trans_path in tqdm(
            glob.iglob(str(txt_path) + "/*.txt"), desc="Distributing tasks", leave=False
        ):
            alignments = {}  # no word alignments are supplied here
            with open(trans_path) as f:
                cur_file_name = trans_path.split("/")[-1].replace(".txt", "")
                if cur_file_name not in part_file_names:
                    continue
                for line in f:
                    # Synthesize a "<txt path> <transcript>" line so parse_utterance
                    # can recover the recording id from the path.
                    futures.append(
                        parse_utterance(part_path, trans_path + " " + line, alignments)
                    )

        for future in tqdm(futures, desc="Processing", leave=False):
            result = future
            if result is None:
                continue
            recording, segment = result
            recordings.append(recording)
            supervisions.append(segment)

        recording_set = RecordingSet.from_recordings(recordings)
        supervision_set = SupervisionSet.from_segments(supervisions)

        validate_recordings_and_supervisions(recording_set, supervision_set)

        if output_dir is not None:
            supervision_set.to_file(
                output_dir / f"LJSpeech_pseudo_supervisions_{part}.jsonl.gz"
            )
            recording_set.to_file(
                output_dir / f"LJSpeech_pseudo_recordings_{part}.jsonl.gz"
            )

        manifests[part] = {
            "recordings": recording_set,
            "supervisions": supervision_set,
        }

    return manifests
def parse_utterance(
    dataset_split_path: Path,
    line: str,
    alignments: Dict[str, List[AlignmentItem]],
) -> Optional[Tuple[Recording, SupervisionSegment]]:
    # ``line`` is "<path/to/utt.txt> <transcript>"; the first field yields the recording id.
    recording_id, text = line.strip().split(maxsplit=1)
    recording_id = recording_id.split("/")[-1].split(".txt")[0]

    # Create the Recording first.
    audio_path = dataset_split_path / f"{recording_id}.wav"
    if not os.path.exists(audio_path):
        logging.warning(f"No such file: {audio_path}")
        return None
    recording = Recording.from_file(audio_path, recording_id=recording_id)

    # Then, create the corresponding supervision segment.
    segment = SupervisionSegment(
        id=recording_id,
        recording_id=recording_id,
        start=0.0,
        duration=recording.duration,
        channel=0,
        language="English",
        # Everything before the first "-" in the recording id is treated as the speaker.
        speaker=re.sub(r"-.*", r"", recording.id),
        text=text.strip(),
        alignment={"word": alignments[recording_id]}
        if recording_id in alignments
        else None,
    )
    return recording, segment
def parse_alignments(ali_path: Pathlike) -> Dict[str, List[AlignmentItem]]:
    alignments = {}
    for line in Path(ali_path).read_text().splitlines():
        utt_id, words, timestamps = line.split()
        words = words.replace('"', "").split(",")
        timestamps = [0.0] + list(map(float, timestamps.replace('"', "").split(",")))
        alignments[utt_id] = [
            AlignmentItem(
                symbol=word, start=start, duration=round(end - start, ndigits=8)
            )
            for word, start, end in zip(words, timestamps, timestamps[1:])
        ]
    return alignments
def main(corpus_dir):
    nj = 15
    output_dir = "data/manifests"
    prepare_LJSpeech(corpus_dir, "auto", output_dir, nj)


if __name__ == "__main__":
    corpus_dir = sys.argv[1]
    main(corpus_dir)

View File

@@ -0,0 +1,35 @@
import os
import sys
import re
metafile = sys.argv[1]
outdir = "texts"
save_dir = "/".join(metafile.split('/')[:-1])
save_dir = os.path.join(save_dir, outdir)
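# The metadata file is assumed to follow the LJSpeech "metadata.csv" layout
# (an assumption based on the parsing below), one utterance per line:
#   <utt_id>|<raw transcription>|<normalized transcription>
# e.g. a (hypothetical) line "LJ001-0001|Printing, in...|Printing, in..." would
# yield texts/LJ001-0001.txt containing "PRINTING IN".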
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

with open(metafile, "r") as f:
    strings = f.readlines()

for string in strings:
    # Split the line into "|"-separated fields.
    parts = string.split("|")
    # Assign the fields to variables.
    filename = parts[0]
    text1 = parts[1]
    try:
        text2 = parts[2]
    except IndexError:
        # Some lines lack a normalized transcription; fall back to the raw one.
        text2 = text1
    # Keep only upper-case letters, spaces and apostrophes.
    text2 = text2.upper()
    text2 = re.sub(r"[^A-Z ']", "", text2)
    # Create a new text file named after the utterance id and write text2 to it.
    filename = os.path.join(save_dir, filename)
    with open(f"{filename}.txt", "w") as file:
        file.write(text2)