From 1c4db887476944bd70f17e92f21a796cb36e146d Mon Sep 17 00:00:00 2001
From: yfy62
Date: Fri, 10 Nov 2023 21:17:35 +0800
Subject: [PATCH] gigaspeech: attach discrete tokens to XL supervisions; add
 preprocess_raw_cuts.py

---
 .../attach_discrete_tokens_to_supervisions.py | 20 ++++++++
 .../local/preprocess_gigaspeech.py            | 49 +++++--------------
 .../local/preprocess_raw_cuts.py              | 18 +++++++
 3 files changed, 49 insertions(+), 38 deletions(-)
 create mode 100755 egs/gigaspeech/wavlm_large_l24_kms2000/local/attach_discrete_tokens_to_supervisions.py
 create mode 100644 egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_raw_cuts.py

diff --git a/egs/gigaspeech/wavlm_large_l24_kms2000/local/attach_discrete_tokens_to_supervisions.py b/egs/gigaspeech/wavlm_large_l24_kms2000/local/attach_discrete_tokens_to_supervisions.py
new file mode 100755
index 000000000..bd5742f00
--- /dev/null
+++ b/egs/gigaspeech/wavlm_large_l24_kms2000/local/attach_discrete_tokens_to_supervisions.py
@@ -0,0 +1,20 @@
+import jsonlines
+from tqdm import tqdm
+
+# Each line of the quantized-token file is "<utterance-id> <token string>".
+with open(
+    "/mnt/lustre/sjtu/home/yfy62/discrete_token_data/GigaSpeech/xl/wavlm_large_l21_kms2000/out_quantized_sp1.1"
+) as f:
+    discrete_tokens = f.read().splitlines()
+
+discrete_tokens_info = {}
+for discrete_token in discrete_tokens:
+    utt_id, token_str = discrete_token.split(" ", 1)
+    discrete_tokens_info[utt_id] = token_str
+
+# Attach each utterance's token string to its supervision's "custom" field.
+with jsonlines.open("gigaspeech_supervisions_XL.jsonl") as reader:
+    with jsonlines.open("gigaspeech_supervisions_XL_new.jsonl", mode="w") as writer:
+        for obj in tqdm(reader):
+            obj["custom"] = {"discrete_tokens": discrete_tokens_info[obj["id"]]}
+            writer.write(obj)
diff --git a/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_gigaspeech.py b/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_gigaspeech.py
index 0120b577d..cc169d577 100755
--- a/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_gigaspeech.py
+++ b/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_gigaspeech.py
@@ -21,10 +21,11 @@ import re
 from pathlib import Path
 
 import jsonlines
+from tqdm import tqdm
+
 from lhotse import CutSet, SupervisionSegment
 from lhotse.recipes.utils import read_manifests_if_cached
 from lhotse.serialization import open_best
-from tqdm import tqdm
 
 # Similar text filtering and normalization procedure as in:
 # https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
@@ -39,32 +40,26 @@ def normalize_text(
 
 
 def has_no_oov(
-    sup: SupervisionSegment,
-    oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
+    sup: SupervisionSegment, oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
 ) -> bool:
     return oov_pattern.search(sup.text) is None
 
 
 def preprocess_gigaspeech():
-    src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    # src_dir = Path("data/manifests")
+    # output_dir = Path("data/fbank")
+    src_dir = Path(".")
+    output_dir = Path(".")
     output_dir.mkdir(exist_ok=True)
 
-    dataset_parts = (
-        "DEV",
-        "TEST",
-        "M",
-    )
+    dataset_parts = ("XL",)
 
     prefix = "gigaspeech"
     suffix = "jsonl.gz"
 
     logging.info("Loading manifest (may take 1 minutes)")
     manifests = read_manifests_if_cached(
-        dataset_parts=dataset_parts,
-        output_dir=src_dir,
-        prefix=prefix,
-        suffix=suffix,
+        dataset_parts=dataset_parts, output_dir=src_dir, prefix=prefix, suffix=suffix,
     )
     assert manifests is not None
 
@@ -76,7 +71,7 @@
     )
 
     for partition, m in manifests.items():
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
+        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl"
         if raw_cuts_path.is_file():
logging.info(f"{partition} already exists - skipping") continue @@ -93,8 +88,7 @@ def preprocess_gigaspeech(): # Create long-recording cut manifests. logging.info(f"Preprocessing {partition}") cut_set = CutSet.from_manifests( - recordings=m["recordings"], - supervisions=m["supervisions"], + recordings=m["recordings"], supervisions=m["supervisions"], ) logging.info("About to split cuts into smaller chunks.") @@ -105,27 +99,6 @@ def preprocess_gigaspeech(): logging.info(f"Saving to {raw_cuts_path}") cut_set.to_file(raw_cuts_path) - for partition in dataset_parts: - cuts_path = output_dir / f"{prefix}_cuts_{partition}.jsonl" - if cuts_path.is_file(): - logging.info(f"{partition} already exists - skipping") - continue - - logging.info(f"Processing {partition}") - raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz" - with open_best(raw_cuts_path) as reader, jsonlines.open( - cuts_path, "a" - ) as writer: - for cut in reader: - cut = eval(cut) - cut["custom"] = { - "discrete_tokens": cut["supervisions"][0]["custom"][ - "discrete_tokens" - ] - } - del cut["supervisions"][0]["custom"] - writer.write(cut) - def main(): formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" diff --git a/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_raw_cuts.py b/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_raw_cuts.py new file mode 100644 index 000000000..0e44223d9 --- /dev/null +++ b/egs/gigaspeech/wavlm_large_l24_kms2000/local/preprocess_raw_cuts.py @@ -0,0 +1,18 @@ +import jsonlines +from tqdm import tqdm + +with jsonlines.open("gigaspeech_cuts_XL_raw.jsonl") as reader: + with jsonlines.open("gigaspeech_cuts_XL.jsonl", mode="w") as writer: + for obj in tqdm(reader): + obj["custom"] = { + "discrete_tokens": obj["supervisions"][0]["custom"]["discrete_tokens"] + } + del obj["supervisions"][0]["custom"] + + # Speed perturb + obj["duration"] /= 1.1 + obj["supervisions"][0]["duration"] /= 1.1 + obj["id"] += "_sp1.1" + obj["supervisions"][0]["id"] += "_sp1.1" + + writer.write(obj)