mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-04 14:44:18 +00:00
update

commit 1c4db88747 (parent 4a9ea0ffc6)
@@ -0,0 +1,20 @@
import jsonlines
from tqdm import tqdm

with open(
    "/mnt/lustre/sjtu/home/yfy62/discrete_token_data/GigaSpeech/xl/wavlm_large_l21_kms2000/out_quantized_sp1.1"
) as f:
    discrete_tokens = f.read().splitlines()

discrete_tokens_info = {}
for discrete_token in discrete_tokens:
    discrete_token = discrete_token.split(" ", 1)
    discrete_tokens_info[discrete_token[0]] = discrete_token[1]


with jsonlines.open("gigaspeech_supervisions_XL.jsonl") as reader:
    with jsonlines.open("gigaspeech_supervisions_XL_new.jsonl", mode="w") as writer:
        for obj in tqdm(reader):
            obj["custom"] = {"discrete_tokens": discrete_tokens_info[obj["id"]]}

            writer.write(obj)
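This new script joins each supervision with its quantized tokens, keyed by utterance id; each line of the token file is "<utt-id> <tokens>", so every id in gigaspeech_supervisions_XL.jsonl must appear there or the dict lookup raises KeyError. A more defensive variant of the rewrite loop (a sketch, not part of the commit, reusing the discrete_tokens_info dict and file names built above) would skip and count unmatched ids instead:

import jsonlines
from tqdm import tqdm

with jsonlines.open("gigaspeech_supervisions_XL.jsonl") as reader:
    with jsonlines.open("gigaspeech_supervisions_XL_new.jsonl", mode="w") as writer:
        missing = 0
        for obj in tqdm(reader):
            tokens = discrete_tokens_info.get(obj["id"])  # None if no token entry
            if tokens is None:
                missing += 1
                continue
            obj["custom"] = {"discrete_tokens": tokens}
            writer.write(obj)
print(f"Skipped {missing} supervisions without discrete tokens")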
@@ -21,10 +21,11 @@ import re
 from pathlib import Path
 
 import jsonlines
+from tqdm import tqdm
+
 from lhotse import CutSet, SupervisionSegment
 from lhotse.recipes.utils import read_manifests_if_cached
 from lhotse.serialization import open_best
-from tqdm import tqdm
 
 # Similar text filtering and normalization procedure as in:
 # https://github.com/SpeechColab/GigaSpeech/blob/main/toolkits/kaldi/gigaspeech_data_prep.sh
@@ -39,32 +40,26 @@ def normalize_text(
 
 
 def has_no_oov(
-    sup: SupervisionSegment,
-    oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
+    sup: SupervisionSegment, oov_pattern=re.compile(r"<(SIL|MUSIC|NOISE|OTHER)>"),
 ) -> bool:
     return oov_pattern.search(sup.text) is None
 
 
 def preprocess_gigaspeech():
-    src_dir = Path("data/manifests")
-    output_dir = Path("data/fbank")
+    # src_dir = Path("data/manifests")
+    # output_dir = Path("data/fbank")
+    src_dir = Path(".")
+    output_dir = Path(".")
     output_dir.mkdir(exist_ok=True)
 
-    dataset_parts = (
-        "DEV",
-        "TEST",
-        "M",
-    )
+    dataset_parts = ("XL",)
 
     prefix = "gigaspeech"
     suffix = "jsonl.gz"
 
     logging.info("Loading manifest (may take 1 minutes)")
     manifests = read_manifests_if_cached(
-        dataset_parts=dataset_parts,
-        output_dir=src_dir,
-        prefix=prefix,
-        suffix=suffix,
+        dataset_parts=dataset_parts, output_dir=src_dir, prefix=prefix, suffix=suffix,
     )
     assert manifests is not None
 
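With src_dir now Path("."), read_manifests_if_cached expects the GigaSpeech manifests in the working directory under lhotse's standard names, i.e. for the XL split configured here gigaspeech_recordings_XL.jsonl.gz and gigaspeech_supervisions_XL.jsonl.gz. A minimal check of that assumption (a sketch, not part of the commit):

from pathlib import Path

for name in ("gigaspeech_recordings_XL.jsonl.gz", "gigaspeech_supervisions_XL.jsonl.gz"):
    assert Path(name).is_file(), f"missing manifest: {name}"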
@@ -76,7 +71,7 @@ def preprocess_gigaspeech():
     )
 
     for partition, m in manifests.items():
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
+        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl"
         if raw_cuts_path.is_file():
             logging.info(f"{partition} already exists - skipping")
             continue
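Dropping the .gz extension matters for the standalone script added below, since jsonlines reads plain JSON Lines text only. If the raw cuts stayed gzipped, something like the following would be needed instead (a sketch, not part of the commit):

import gzip

import jsonlines

# Wrap the gzip stream in text mode so jsonlines can parse it line by line.
with gzip.open("gigaspeech_cuts_XL_raw.jsonl.gz", "rt") as f:
    for obj in jsonlines.Reader(f):
        pass  # process each cut dict as below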
@@ -93,8 +88,7 @@ def preprocess_gigaspeech():
         # Create long-recording cut manifests.
         logging.info(f"Preprocessing {partition}")
         cut_set = CutSet.from_manifests(
-            recordings=m["recordings"],
-            supervisions=m["supervisions"],
+            recordings=m["recordings"], supervisions=m["supervisions"],
         )
 
         logging.info("About to split cuts into smaller chunks.")
@@ -105,27 +99,6 @@ def preprocess_gigaspeech():
         logging.info(f"Saving to {raw_cuts_path}")
         cut_set.to_file(raw_cuts_path)
-
-    for partition in dataset_parts:
-        cuts_path = output_dir / f"{prefix}_cuts_{partition}.jsonl"
-        if cuts_path.is_file():
-            logging.info(f"{partition} already exists - skipping")
-            continue
-
-        logging.info(f"Processing {partition}")
-        raw_cuts_path = output_dir / f"{prefix}_cuts_{partition}_raw.jsonl.gz"
-        with open_best(raw_cuts_path) as reader, jsonlines.open(
-            cuts_path, "a"
-        ) as writer:
-            for cut in reader:
-                cut = eval(cut)
-                cut["custom"] = {
-                    "discrete_tokens": cut["supervisions"][0]["custom"][
-                        "discrete_tokens"
-                    ]
-                }
-                del cut["supervisions"][0]["custom"]
-                writer.write(cut)
 
 
 def main():
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
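The deleted loop is superseded by the standalone script in the next hunk. It also retires a fragile pattern: eval(cut) executed each manifest line as Python source just to parse JSON. Were this in-function post-processing ever restored, a safer equivalent would parse instead of execute (a sketch, assuming one JSON object per line):

import json

from lhotse.serialization import open_best

with open_best("gigaspeech_cuts_XL_raw.jsonl.gz") as reader:
    for line in reader:
        cut = json.loads(line)  # parse the line as JSON rather than eval-ing it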
@@ -0,0 +1,18 @@
import jsonlines
from tqdm import tqdm

with jsonlines.open("gigaspeech_cuts_XL_raw.jsonl") as reader:
    with jsonlines.open("gigaspeech_cuts_XL.jsonl", mode="w") as writer:
        for obj in tqdm(reader):
            obj["custom"] = {
                "discrete_tokens": obj["supervisions"][0]["custom"]["discrete_tokens"]
            }
            del obj["supervisions"][0]["custom"]

            # Speed perturb
            obj["duration"] /= 1.1
            obj["supervisions"][0]["duration"] /= 1.1
            obj["id"] += "_sp1.1"
            obj["supervisions"][0]["id"] += "_sp1.1"

            writer.write(obj)
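This second new script promotes the per-supervision discrete tokens to cut level and relabels everything as speed-perturbed: audio played at 1.1x speed gets shorter, so both durations are divided by 1.1 (a 10.0 s cut becomes roughly 9.09 s) and the ids get the customary _sp1.1 suffix. A quick sanity check over the rewritten manifest (a sketch, using the output file name above):

import jsonlines

with jsonlines.open("gigaspeech_cuts_XL.jsonl") as reader:
    for obj in reader:
        assert obj["id"].endswith("_sp1.1")
        assert "discrete_tokens" in obj["custom"]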