yfy62 2023-11-10 21:17:35 +08:00
parent 4a9ea0ffc6
commit 1c4db88747
3 changed files with 49 additions and 38 deletions

View File

@@ -0,0 +1,20 @@
import jsonlines
from tqdm import tqdm

with open(
    "/mnt/lustre/sjtu/home/yfy62/discrete_token_data/GigaSpeech/xl/wavlm_large_l21_kms2000/out_quantized_sp1.1"
) as f:
    discrete_tokens = f.read().splitlines()

discrete_tokens_info = {}
for discrete_token in discrete_tokens:
    discrete_token = discrete_token.split(" ", 1)
    discrete_tokens_info[discrete_token[0]] = discrete_token[1]

with jsonlines.open("gigaspeech_supervisions_XL.jsonl") as reader:
    with jsonlines.open("gigaspeech_supervisions_XL_new.jsonl", mode="w") as writer:
        for obj in tqdm(reader):
            obj["custom"] = {"discrete_tokens": discrete_tokens_info[obj["id"]]}
            writer.write(obj)
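A note on the input this script assumes (my reading of the split(" ", 1) call, not stated anywhere in the commit): each line of the out_quantized_sp1.1 file carries an utterance ID followed by a space-separated token sequence, and that ID must match the supervision "id" field in the jsonl manifest. A minimal sketch with made-up values:

# Hypothetical line, only to illustrate the assumed "<utt-id> <tok> <tok> ..." layout
line = "utt-0001 113 113 1789 204"
utt_id, tokens = line.split(" ", 1)
# utt_id -> "utt-0001", tokens -> "113 113 1789 204"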

View File

@@ -21,10 +21,11 @@ import re
 from pathlib import Path

 import jsonlines
-from tqdm import tqdm
 from lhotse import CutSet, SupervisionSegment
 from lhotse.recipes.utils import read_manifests_if_cached
 from lhotse.serialization import open_best
+from tqdm import tqdm

 # Similar text filtering and normalization procedure as in:
 # https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
@@ -39,32 +40,26 @@ def normalize_text(
 def has_no_oov(
-    sup: SupervisionSegment,
-    oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
+    sup: SupervisionSegment, oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
 ) -> bool:
     return oov_pattern.search(sup.text) is None


 def preprocess_gigaspeech():
-    src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    # src_dir = Path("data/manifests")
+    # output_dir = Path("data/fbank")
+    src_dir = Path(".")
+    output_dir = Path(".")
     output_dir.mkdir(exist_ok=True)

-    dataset_parts = (
-        "DEV",
-        "TEST",
-        "M",
-    )
+    dataset_parts = ("XL",)

     prefix = "gigaspeech"
     suffix = "jsonl.gz"

     logging.info("Loading manifest (may take 1 minutes)")
     manifests = read_manifests_if_cached(
-        dataset_parts=dataset_parts,
-        output_dir=src_dir,
-        prefix=prefix,
-        suffix=suffix,
+        dataset_parts=dataset_parts, output_dir=src_dir, prefix=prefix, suffix=suffix,
     )
     assert manifests is not None
@@ -76,7 +71,7 @@ def preprocess_gigaspeech():
     )

     for partition, m in manifests.items():
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
+        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl"
         if raw_cuts_path.is_file():
             logging.info(f"{partition} already exists - skipping")
             continue
@@ -93,8 +88,7 @@ def preprocess_gigaspeech():
         # Create long-recording cut manifests.
         logging.info(f"Preprocessing {partition}")
         cut_set = CutSet.from_manifests(
-            recordings=m["recordings"],
-            supervisions=m["supervisions"],
+            recordings=m["recordings"], supervisions=m["supervisions"],
         )

         logging.info("About to split cuts into smaller chunks.")
@@ -105,27 +99,6 @@ def preprocess_gigaspeech():
         logging.info(f"Saving to {raw_cuts_path}")
         cut_set.to_file(raw_cuts_path)

-    for partition in dataset_parts:
-        cuts_path = output_dir / f"{prefix}_cuts_{partition}.jsonl"
-        if cuts_path.is_file():
-            logging.info(f"{partition} already exists - skipping")
-            continue
-
-        logging.info(f"Processing {partition}")
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
-        with open_best(raw_cuts_path) as reader, jsonlines.open(
-            cuts_path, "a"
-        ) as writer:
-            for cut in reader:
-                cut = eval(cut)
-                cut["custom"] = {
-                    "discrete_tokens": cut["supervisions"][0]["custom"][
-                        "discrete_tokens"
-                    ]
-                }
-                del cut["supervisions"][0]["custom"]
-                writer.write(cut)
-

 def main():
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

View File

@@ -0,0 +1,18 @@
import jsonlines
from tqdm import tqdm

with jsonlines.open("gigaspeech_cuts_XL_raw.jsonl") as reader:
    with jsonlines.open("gigaspeech_cuts_XL.jsonl", mode="w") as writer:
        for obj in tqdm(reader):
            obj["custom"] = {
                "discrete_tokens": obj["supervisions"][0]["custom"]["discrete_tokens"]
            }
            del obj["supervisions"][0]["custom"]

            # Speed perturb
            obj["duration"] /= 1.1
            obj["supervisions"][0]["duration"] /= 1.1
            obj["id"] += "_sp1.1"
            obj["supervisions"][0]["id"] += "_sp1.1"

            writer.write(obj)
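The 1.1 factor and the _sp1.1 suffix here line up with the speed-perturbed token file (out_quantized_sp1.1) used in the first script, so the tokens presumably come from audio played back 1.1x faster; the duration bookkeeping then just divides by the same factor and tags the IDs so they stay unique. A quick sanity check of that arithmetic, with made-up numbers:

# Hypothetical values, only to illustrate the 1.1x speed-perturbation bookkeeping
orig_duration = 11.0                  # seconds at normal speed
sp_duration = orig_duration / 1.1     # ~10.0 s once the audio plays 1.1x faster
new_id = "utt-0001" + "_sp1.1"        # perturbed cuts get a distinct ID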