"""Rewrite the raw ``librispeech_cuts_*_raw.jsonl`` manifests and gzip them.

For every part named in ``dataset_parts`` the script reads
``librispeech_cuts_{part}_raw.jsonl`` from the current working directory,
moves the ``kmeans`` value of each record's first supervision into the
record's own top-level ``custom`` field, and writes the result to
``librispeech_cuts_{part}.jsonl``.  Afterwards every ``*_raw.jsonl`` file in
the working directory is deleted and every remaining ``*.jsonl`` file is
replaced by a gzip-compressed ``*.jsonl.gz`` copy.
"""
import glob
import gzip
import os
import shutil

import jsonlines
from tqdm import tqdm

dataset_parts = (
    "dev-clean",
    "train-clean-100",
    "train-clean-360",
    "train-other-500",
)


def _hoist_kmeans(cut):
    """Move the first supervision's ``kmeans`` value onto the record itself.

    Args:
        cut: One decoded manifest record (a dict).  It is modified in place.

    Returns:
        The same ``cut`` object, for convenient chaining.

    Raises:
        KeyError, IndexError: If the record has no supervisions, or the first
            supervision has no ``custom["kmeans"]`` entry.  This is left to
            propagate so malformed input is not silently written out.
    """
    first_supervision = cut["supervisions"][0]
    # Any pre-existing top-level ``custom`` dict is replaced, not merged.
    cut["custom"] = {"kmeans": first_supervision["custom"]["kmeans"]}
    # The whole supervision-level ``custom`` dict is dropped, not just kmeans.
    del first_supervision["custom"]
    return cut


def _convert_part(part):
    """Convert the raw manifest of one dataset part into its final manifest."""
    raw_path = f"librispeech_cuts_{part}_raw.jsonl"
    out_path = f"librispeech_cuts_{part}.jsonl"
    with jsonlines.open(raw_path) as reader, jsonlines.open(
        out_path, mode="w"
    ) as writer:
        for cut in tqdm(reader):
            writer.write(_hoist_kmeans(cut))


def _remove_raw_manifests():
    """Delete every ``*_raw.jsonl`` file in the current working directory."""
    for raw_path in glob.glob("*_raw.jsonl"):
        os.remove(raw_path)


def _gzip_manifests():
    """Replace every ``*.jsonl`` file in the working directory by ``*.jsonl.gz``.

    An existing ``.gz`` file of the same name is overwritten.  The
    uncompressed file is removed only after its compressed copy has been
    fully written and closed.
    """
    for plain_path in sorted(glob.glob("*.jsonl")):
        with open(plain_path, "rb") as plain, gzip.open(
            plain_path + ".gz", "wb"
        ) as packed:
            shutil.copyfileobj(plain, packed)
        os.remove(plain_path)


def main():
    """Convert all dataset parts, then clean up and compress the manifests."""
    for part in dataset_parts:
        _convert_part(part)
    # Cleanup runs only after every part converted successfully; an exception
    # above leaves the raw manifests untouched.
    _remove_raw_manifests()
    _gzip_manifests()


# Invoked unconditionally so that running (or importing) this file behaves
# exactly as the original flat script did.
main()