Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-09-04 06:34:20 +00:00)
commit 1c4db88747 ("update")
parent 4a9ea0ffc6
@@ -0,0 +1,20 @@
+import jsonlines
+from tqdm import tqdm
+
+with open(
+    "/mnt/lustre/sjtu/home/yfy62/discrete_token_data/GigaSpeech/xl/wavlm_large_l21_kms2000/out_quantized_sp1.1"
+) as f:
+    discrete_tokens = f.read().splitlines()
+
+discrete_tokens_info = {}
+for discrete_token in discrete_tokens:
+    discrete_token = discrete_token.split(" ", 1)
+    discrete_tokens_info[discrete_token[0]] = discrete_token[1]
+
+
+with jsonlines.open("gigaspeech_supervisions_XL.jsonl") as reader:
+    with jsonlines.open("gigaspeech_supervisions_XL_new.jsonl", mode="w") as writer:
+        for obj in tqdm(reader):
+            obj["custom"] = {"discrete_tokens": discrete_tokens_info[obj["id"]]}
+
+            writer.write(obj)
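The new script above assumes the quantized-token file holds one utterance per line, "<utterance-id> <space-separated tokens>", which is why split(" ", 1) yields an (id, token-string) pair that is then attached to the matching supervision by its "id" field. A tiny self-contained sketch of that mapping step (IDs and token values are made up for illustration):

# Two hypothetical lines in the "<utt-id> <token> <token> ..." format.
lines = [
    "POD0000000001_S0000001 112 112 87 2000",
    "POD0000000001_S0000002 5 5 1999 4",
]

discrete_tokens_info = {}
for line in lines:
    utt_id, tokens = line.split(" ", 1)  # split only on the first space
    discrete_tokens_info[utt_id] = tokens

assert discrete_tokens_info["POD0000000001_S0000002"] == "5 5 1999 4"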
@@ -21,10 +21,11 @@ import re
 from pathlib import Path
 
+import jsonlines
 from tqdm import tqdm
 
 from lhotse import CutSet, SupervisionSegment
 from lhotse.recipes.utils import read_manifests_if_cached
-from tqdm import tqdm
+from lhotse.serialization import open_best
 
 # Similar text filtering and normalization procedure as in:
 # https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
@@ -39,32 +40,26 @@ def normalize_text(
 
 
 def has_no_oov(
-    sup: SupervisionSegment,
-    oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
+    sup: SupervisionSegment, oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
 ) -> bool:
     return oov_pattern.search(sup.text) is None
 
 
 def preprocess_gigaspeech():
-    src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    # src_dir = Path("data/manifests")
+    # output_dir = Path("data/fbank")
+    src_dir = Path(".")
+    output_dir = Path(".")
     output_dir.mkdir(exist_ok=True)
 
-    dataset_parts = (
-        "DEV",
-        "TEST",
-        "M",
-    )
+    dataset_parts = ("XL",)
 
     prefix = "gigaspeech"
     suffix = "jsonl.gz"
 
     logging.info("Loading manifest (may take 10 minutes)")
     manifests = read_manifests_if_cached(
-        dataset_parts=dataset_parts,
-        output_dir=src_dir,
-        prefix=prefix,
-        suffix=suffix,
+        dataset_parts=dataset_parts, output_dir=src_dir, prefix=prefix, suffix=suffix,
     )
     assert manifests is not None
 
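For context, read_manifests_if_cached looks in src_dir for per-part recording and supervision manifests named from the prefix and suffix, and returns a dict keyed by part name (which the loop below iterates). A quick sketch of the filenames implied by these arguments, following the naming convention lhotse's recipes use:

prefix = "gigaspeech"
suffix = "jsonl.gz"

# For dataset_parts = ("XL",) the loader expects, per part:
#   gigaspeech_recordings_XL.jsonl.gz
#   gigaspeech_supervisions_XL.jsonl.gz
for part in ("XL",):
    for manifest in ("recordings", "supervisions"):
        print(f"{prefix}_{manifest}_{part}.{suffix}")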
@@ -76,7 +71,7 @@ def preprocess_gigaspeech():
     )
 
     for partition, m in manifests.items():
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
+        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl"
         if raw_cuts_path.is_file():
             logging.info(f"{partition} already exists - skipping")
             continue
@@ -93,8 +88,7 @@ def preprocess_gigaspeech():
         # Create long-recording cut manifests.
         logging.info(f"Preprocessing {partition}")
         cut_set = CutSet.from_manifests(
-            recordings=m["recordings"],
-            supervisions=m["supervisions"],
+            recordings=m["recordings"], supervisions=m["supervisions"],
         )
 
         logging.info("About to split cuts into smaller chunks.")
@@ -105,27 +99,6 @@ def preprocess_gigaspeech():
         logging.info(f"Saving to {raw_cuts_path}")
         cut_set.to_file(raw_cuts_path)
-
-    for partition in dataset_parts:
-        cuts_path = output_dir / f"{prefix}_cuts_{partition}.jsonl"
-        if cuts_path.is_file():
-            logging.info(f"{partition} already exists - skipping")
-            continue
-
-        logging.info(f"Processing {partition}")
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
-        with open_best(raw_cuts_path) as reader, jsonlines.open(
-            cuts_path, "a"
-        ) as writer:
-            for cut in reader:
-                cut = eval(cut)
-                cut["custom"] = {
-                    "discrete_tokens": cut["supervisions"][0]["custom"][
-                        "discrete_tokens"
-                    ]
-                }
-                del cut["supervisions"][0]["custom"]
-                writer.write(cut)
 
 
 def main():
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
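The loop deleted above parsed each raw-cut line with eval; its job (hoisting each cut's discrete tokens from the first supervision onto the cut itself) is taken over by the standalone script in the next hunk. A minimal sketch of the same step done with json.loads instead of eval, under the assumption that every manifest line is a plain JSON object (which lhotse's JSONL manifests are; file names as in the diff):

import json

import jsonlines
from lhotse.serialization import open_best

# Stream the raw cuts (open_best transparently handles a .gz extension),
# hoist the discrete tokens onto the cut, then drop the supervision-level copy.
with open_best("gigaspeech_cuts_XL_raw.jsonl.gz") as reader:
    with jsonlines.open("gigaspeech_cuts_XL.jsonl", mode="w") as writer:
        for line in reader:
            cut = json.loads(line)
            cut["custom"] = {
                "discrete_tokens": cut["supervisions"][0]["custom"]["discrete_tokens"]
            }
            del cut["supervisions"][0]["custom"]
            writer.write(cut)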
@@ -0,0 +1,18 @@
+import jsonlines
+from tqdm import tqdm
+
+with jsonlines.open("gigaspeech_cuts_XL_raw.jsonl") as reader:
+    with jsonlines.open("gigaspeech_cuts_XL.jsonl", mode="w") as writer:
+        for obj in tqdm(reader):
+            obj["custom"] = {
+                "discrete_tokens": obj["supervisions"][0]["custom"]["discrete_tokens"]
+            }
+            del obj["supervisions"][0]["custom"]
+
+            # Speed perturb
+            obj["duration"] /= 1.1
+            obj["supervisions"][0]["duration"] /= 1.1
+            obj["id"] += "_sp1.1"
+            obj["supervisions"][0]["id"] += "_sp1.1"
+
+            writer.write(obj)
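The divisions by 1.1 account for speed perturbation: audio played back at 1.1x speed lasts 1/1.1 of its original length, so both the cut and supervision durations shrink by that factor, while the "_sp1.1" suffix keeps the perturbed IDs distinct from the originals. A quick check with an illustrative duration (values are made up):

speed = 1.1
original_duration = 8.25  # seconds, hypothetical cut length

# Playing back `speed` times faster scales every timestamp by 1/speed.
perturbed_duration = original_duration / speed
assert abs(perturbed_duration - 7.5) < 1e-9

cut_id = "POD0000000001_S0000001"  # hypothetical ID
print(cut_id + "_sp1.1", perturbed_duration)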