icefall/egs/librispeech/SSL/local/process_raw_cuts.py
jaeeunbaik 915e8e399c Add CHiME-4 dataset, RIR and Self-Distillation
- Added CHiME-4 dataset integration in asr_datamodule.py
- Added Hugging Face upload script
- Added RIR augmentation
- Added Self-Distillation Training
2025-08-27 16:11:20 +09:00

24 lines
608 B
Python
Executable File

import os
import jsonlines
from tqdm import tqdm
# Names of the dataset parts whose raw manifests are rewritten below. Each name
# is substituted into "librispeech_cuts_{part}_raw.jsonl" (input) and
# "librispeech_cuts_{part}.jsonl" (output), both resolved relative to the
# current working directory.
dataset_parts = (
    "dev-clean",
    "train-clean-100",
    "train-clean-360",
    "train-other-500",
)


def _run_shell(command):
    """Run *command* through the shell and abort the script if it fails.

    ``os.system`` reports failure only through its return value, so an
    unchecked call would let the script exit successfully even though the
    clean-up or compression step never happened.

    Raises:
        SystemExit: if the command returns a non-zero status.
    """
    status = os.system(command)
    if status != 0:
        raise SystemExit(f"Command failed with status {status}: {command}")


# Rewrite every record of each raw manifest: the "kmeans" value stored under
# the first supervision's "custom" dict is moved to a top-level "custom" dict
# on the record itself.
for part in dataset_parts:
    raw_path = f"librispeech_cuts_{part}_raw.jsonl"
    out_path = f"librispeech_cuts_{part}.jsonl"
    with jsonlines.open(raw_path) as reader, \
            jsonlines.open(out_path, mode="w") as writer:
        for obj in tqdm(reader):
            first_supervision = obj["supervisions"][0]
            # Replaces any existing top-level "custom" entry with a dict that
            # holds only "kmeans".
            obj["custom"] = {"kmeans": first_supervision["custom"]["kmeans"]}
            # Drops the whole supervision-level "custom" dict, including any
            # keys other than "kmeans" it may have contained.
            del first_supervision["custom"]
            writer.write(obj)

# Remove the raw inputs first so the following gzip glob only matches the
# freshly written manifests (plus any other *.jsonl already in the directory).
_run_shell("rm *_raw.jsonl")
_run_shell("gzip *.jsonl")