icefall/egs/librispeech/SSL/local/attach_kmeans_to_supervisions.py
Yifan Yang 87843e9382
k2SSL: a Faster and Better Framework for Self-Supervised Speech Representation Learning (#1500)
* Add k2SSL

* fix flake8

* fix for black

* fix for black

* fix for black

* Update ssl_datamodule.py

* Fix bugs in HubertDataset

* update comments

* add librilight

* add checkpoint convert script

* format

---------

Co-authored-by: yifanyeung <yifanyeung@yifanyeung.local>
Co-authored-by: zzasdf <15218404468@163.com>
2024-04-04 23:29:16 +08:00

53 lines
1.4 KiB
Python

import os
import jsonlines
from tqdm import tqdm
# Lhotse supervision manifests to process, one
# ``librispeech_supervisions_<part>.jsonl`` file per entry.
dataset_parts = (
    "dev-clean",
    "train-clean-100",
    "train-clean-360",
    "train-other-500",
)
# Directory holding ``<split>.km`` files: one line of k-means labels per
# utterance, in the same order as the matching ``<split>.tsv`` file.
kmeans_dir = "/userhome/user/yangguanrou/data/k500"
# Directory holding ``<split>.tsv`` files that list the audio paths.
idx_dir = "/userhome/user/yangguanrou/data/shu"


def utterance_ids_from_tsv(tsv_lines):
    """Return the utterance id of every ``.flac`` entry in *tsv_lines*.

    The id is the basename of the first tab-separated field with ``.flac``
    removed. Lines that do not mention ``.flac`` are skipped (presumably the
    manifest's header line -- TODO confirm against the tsv producer).
    """
    return [
        line.split("\t")[0].split("/")[-1].replace(".flac", "")
        for line in tsv_lines
        if ".flac" in line
    ]


def build_idx2kmeans(idxs, kmeans, part="<unknown>"):
    """Pair utterance ids with their k-means label lines.

    Args:
        idxs: utterance ids, in file order.
        kmeans: label lines, in the same order as *idxs*.
        part: split name, used only in the error message.

    Returns:
        A dict mapping each id to its (unparsed) label line.

    Raises:
        ValueError: if the two sequences differ in length. A plain ``zip``
            would silently truncate and could leave labels attached to the
            wrong utterances.
    """
    if len(idxs) != len(kmeans):
        raise ValueError(
            f"{part}: {len(idxs)} utterance ids but {len(kmeans)} k-means "
            "lines; the .tsv and .km files are out of sync"
        )
    return dict(zip(idxs, kmeans))


def load_idx2kmeans(parts=("train", "valid")):
    """Read ``<part>.km`` / ``<part>.tsv`` for each part into one id -> labels dict.

    Parts are merged in order, so an id that appears in several parts keeps
    the labels of the last part that contains it.
    """
    idx2kmeans = {}
    for part in parts:
        with open(kmeans_dir + "/" + part + ".km", "r", encoding="utf-8") as f:
            kmeans = f.read().splitlines()
        with open(idx_dir + "/" + part + ".tsv", "r", encoding="utf-8") as f:
            idxs = utterance_ids_from_tsv(f.read().splitlines())
        idx2kmeans.update(build_idx2kmeans(idxs, kmeans, part))
    return idx2kmeans


def attach_kmeans(obj, idx2kmeans):
    """Set ``obj["custom"]`` to ``{"kmeans": <labels>}`` and return *obj*.

    Any existing ``"custom"`` entry is replaced.

    Raises:
        KeyError: if ``obj["id"]`` has no entry in *idx2kmeans*.
    """
    utt_id = obj["id"]
    try:
        labels = idx2kmeans[utt_id]
    except KeyError:
        raise KeyError(
            f"no k-means labels found for supervision id {utt_id!r}"
        ) from None
    obj["custom"] = {"kmeans": labels}
    return obj


def main():
    """Copy the manifests here, attach k-means labels, and replace them in place."""
    # Stage the gzipped manifests in the current directory and unpack them.
    # NOTE: exit statuses of these shell commands are not checked.
    os.system(
        "cp /userhome/user/yfy62/librispeech_data/data4ssl/manifests/librispeech_*_dev-clean* ."
    )
    os.system(
        "cp /userhome/user/yfy62/librispeech_data/data4ssl/manifests/librispeech_*_train* ."
    )
    os.system("chmod -R 644 *.jsonl.gz")
    os.system("gunzip *.gz")

    idx2kmeans = load_idx2kmeans()

    for part in dataset_parts:
        with jsonlines.open(f"librispeech_supervisions_{part}.jsonl") as reader:
            with jsonlines.open(
                f"librispeech_supervisions_{part}_new.jsonl", mode="w"
            ) as writer:
                for obj in tqdm(reader):
                    writer.write(attach_kmeans(obj, idx2kmeans))

    # Replace each original manifest with its freshly written "_new" version.
    os.system('for file in *_new.jsonl; do mv "$file" "${file%_new.jsonl}.jsonl"; done')


if __name__ == "__main__":
    main()