yfy62 2023-11-10 21:17:35 +08:00
parent 4a9ea0ffc6
commit 1c4db88747
3 changed files with 49 additions and 38 deletions

View File

@@ -0,0 +1,20 @@
+import jsonlines
+from tqdm import tqdm
+
+with open(
+    "/mnt/lustre/sjtu/home/yfy62/discrete_token_data/GigaSpeech/xl/wavlm_large_l21_kms2000/out_quantized_sp1.1"
+) as f:
+    discrete_tokens = f.read().splitlines()
+
+discrete_tokens_info = {}
+for discrete_token in discrete_tokens:
+    discrete_token = discrete_token.split(" ", 1)
+    discrete_tokens_info[discrete_token[0]] = discrete_token[1]
+
+with jsonlines.open("gigaspeech_supervisions_XL.jsonl") as reader:
+    with jsonlines.open("gigaspeech_supervisions_XL_new.jsonl", mode="w") as writer:
+        for obj in tqdm(reader):
+            obj["custom"] = {"discrete_tokens": discrete_tokens_info[obj["id"]]}
+            writer.write(obj)
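
Each line of the quantized-token file pairs an utterance id with its token string, so split(" ", 1) splits only at the first space and keeps the token string intact. A minimal sketch of that parsing step with a made-up input line (the real file's contents are an assumption here):

line = "POD0000000001_S0000001 21 21 837 1104 1104"
utt_id, tokens = line.split(" ", 1)  # split once: id vs. the full token string
assert utt_id == "POD0000000001_S0000001"
assert tokens == "21 21 837 1104 1104"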

View File

@@ -21,10 +21,11 @@ import re
 from pathlib import Path

 import jsonlines
+from tqdm import tqdm
+
 from lhotse import CutSet, SupervisionSegment
 from lhotse.recipes.utils import read_manifests_if_cached
 from lhotse.serialization import open_best
-from tqdm import tqdm

 # Similar text filtering and normalization procedure as in:
 # https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
@@ -39,32 +40,26 @@ def normalize_text(

 def has_no_oov(
-    sup: SupervisionSegment,
-    oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
+    sup: SupervisionSegment, oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
 ) -> bool:
     return oov_pattern.search(sup.text) is None

 def preprocess_gigaspeech():
-    src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    # src_dir = Path("data/manifests")
+    # output_dir = Path("data/fbank")
+    src_dir = Path(".")
+    output_dir = Path(".")
     output_dir.mkdir(exist_ok=True)

-    dataset_parts = (
-        "DEV",
-        "TEST",
-        "M",
-    )
+    dataset_parts = ("XL",)
     prefix = "gigaspeech"
     suffix = "jsonl.gz"

     logging.info("Loading manifest (may take 1 minutes)")
     manifests = read_manifests_if_cached(
-        dataset_parts=dataset_parts,
-        output_dir=src_dir,
-        prefix=prefix,
-        suffix=suffix,
+        dataset_parts=dataset_parts, output_dir=src_dir, prefix=prefix, suffix=suffix,
     )

     assert manifests is not None
@@ -76,7 +71,7 @@ def preprocess_gigaspeech():
     )

     for partition, m in manifests.items():
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
+        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl"
         if raw_cuts_path.is_file():
             logging.info(f"{partition} already exists - skipping")
             continue
@@ -93,8 +88,7 @@ def preprocess_gigaspeech():
         # Create long-recording cut manifests.
         logging.info(f"Preprocessing {partition}")
         cut_set = CutSet.from_manifests(
-            recordings=m["recordings"],
-            supervisions=m["supervisions"],
+            recordings=m["recordings"], supervisions=m["supervisions"],
         )

         logging.info("About to split cuts into smaller chunks.")
@@ -105,27 +99,6 @@ def preprocess_gigaspeech():
         logging.info(f"Saving to {raw_cuts_path}")
         cut_set.to_file(raw_cuts_path)

-    for partition in dataset_parts:
-        cuts_path = output_dir / f"{prefix}_cuts_{partition}.jsonl"
-        if cuts_path.is_file():
-            logging.info(f"{partition} already exists - skipping")
-            continue
-
-        logging.info(f"Processing {partition}")
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
-        with open_best(raw_cuts_path) as reader, jsonlines.open(
-            cuts_path, "a"
-        ) as writer:
-            for cut in reader:
-                cut = eval(cut)
-                cut["custom"] = {
-                    "discrete_tokens": cut["supervisions"][0]["custom"][
-                        "discrete_tokens"
-                    ]
-                }
-                del cut["supervisions"][0]["custom"]
-                writer.write(cut)

 def main():
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
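
The deleted loop above parsed each JSONL line with eval, which executes arbitrary Python and can also fail on JSON literals such as true or null. A hypothetical rewrite of just that parse-and-rewrite step using json.loads (file names borrowed from the scripts in this commit; this sketch is not part of the commit itself):

import json

with open("gigaspeech_cuts_XL_raw.jsonl") as reader, open(
    "gigaspeech_cuts_XL.jsonl", "w"
) as writer:
    for line in reader:
        cut = json.loads(line)  # parses JSON data only; never executes code
        cut["custom"] = {
            "discrete_tokens": cut["supervisions"][0]["custom"]["discrete_tokens"]
        }
        del cut["supervisions"][0]["custom"]
        writer.write(json.dumps(cut) + "\n")

The commit instead moves this step into the standalone script below, which reads with jsonlines and avoids eval entirely.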

View File

@@ -0,0 +1,18 @@
+import jsonlines
+from tqdm import tqdm
+
+with jsonlines.open("gigaspeech_cuts_XL_raw.jsonl") as reader:
+    with jsonlines.open("gigaspeech_cuts_XL.jsonl", mode="w") as writer:
+        for obj in tqdm(reader):
+            obj["custom"] = {
+                "discrete_tokens": obj["supervisions"][0]["custom"]["discrete_tokens"]
+            }
+            del obj["supervisions"][0]["custom"]
+
+            # Speed perturb
+            obj["duration"] /= 1.1
+            obj["supervisions"][0]["duration"] /= 1.1
+            obj["id"] += "_sp1.1"
+            obj["supervisions"][0]["id"] += "_sp1.1"
+
+            writer.write(obj)
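
Dividing the durations by 1.1 reflects playback at 1.1x speed: an 11.0 s cut becomes 11.0 / 1.1 = 10.0 s, and the "_sp1.1" id suffix follows the convention lhotse uses for speed-perturbed cuts. A small self-check of that arithmetic (the helper name is hypothetical, not part of the commit):

def apply_speed_perturb(obj: dict, factor: float = 1.1) -> dict:
    # Audio played at `factor`x speed is shorter by that same factor.
    obj["duration"] /= factor
    obj["supervisions"][0]["duration"] /= factor
    suffix = f"_sp{factor}"  # e.g. "_sp1.1", matching the script above
    obj["id"] += suffix
    obj["supervisions"][0]["id"] += suffix
    return obj

cut = {"id": "c1", "duration": 11.0, "supervisions": [{"id": "c1-s0", "duration": 11.0}]}
cut = apply_speed_perturb(cut)
assert abs(cut["duration"] - 10.0) < 1e-9  # allow for float rounding
assert cut["id"] == "c1_sp1.1"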