mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-09 01:52:41 +00:00
* Add k2SSL * fix flake8 * fix for black * fix for black * fix for black * Update ssl_datamodule.py * Fix bugs in HubertDataset * update comments * add librilight * add checkpoint convert script * format --------- Co-authored-by: yifanyeung <yifanyeung@yifanyeung.local> Co-authored-by: zzasdf <15218404468@163.com>
53 lines
1.4 KiB
Python
53 lines
1.4 KiB
Python
import os
|
|
|
|
import jsonlines
|
|
from tqdm import tqdm
|
|
|
|
os.system(
|
|
"cp /userhome/user/yfy62/librispeech_data/data4ssl/manifests/librispeech_*_dev-clean* ."
|
|
)
|
|
os.system(
|
|
"cp /userhome/user/yfy62/librispeech_data/data4ssl/manifests/librispeech_*_train* ."
|
|
)
|
|
os.system("chmod -R 644 *.jsonl.gz")
|
|
os.system("gunzip *.gz")
|
|
|
|
dataset_parts = (
|
|
"dev-clean",
|
|
"train-clean-100",
|
|
"train-clean-360",
|
|
"train-other-500",
|
|
)
|
|
|
|
kmeans_dir = "/userhome/user/yangguanrou/data/k500"
|
|
idx_dir = "/userhome/user/yangguanrou/data/shu"
|
|
|
|
kmeans = []
|
|
idxs = []
|
|
for part in ["train", "valid"]:
|
|
with open(kmeans_dir + "/" + part + ".km", "r") as f:
|
|
kmeans += f.read().splitlines()
|
|
|
|
with open(idx_dir + "/" + part + ".tsv", "r") as f:
|
|
lines = f.read().splitlines()
|
|
idxs += [
|
|
line.split("\t", -1)[0].split("/", -1)[-1].replace(".flac", "")
|
|
for line in lines
|
|
if ".flac" in line
|
|
]
|
|
|
|
idx2kmeans = {}
|
|
for idx, km in zip(idxs, kmeans):
|
|
idx2kmeans[idx] = km
|
|
|
|
for part in dataset_parts:
|
|
with jsonlines.open(f"librispeech_supervisions_{part}.jsonl") as reader:
|
|
with jsonlines.open(
|
|
f"librispeech_supervisions_{part}_new.jsonl", mode="w"
|
|
) as writer:
|
|
for obj in tqdm(reader):
|
|
obj["custom"] = {"kmeans": idx2kmeans[obj["id"]]}
|
|
writer.write(obj)
|
|
|
|
os.system('for file in *_new.jsonl; do mv "$file" "${file%_new.jsonl}.jsonl"; done')
|