mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
55 lines
1.8 KiB
Python
55 lines
1.8 KiB
Python
from pathlib import Path
|
|
|
|
from backend_np import WujiEEGBackend
|
|
from lhotse import CutSet, MonoCut, Recording, SupervisionSegment
|
|
from lhotse.audio.backend import set_current_audio_backend
|
|
from tqdm import tqdm
|
|
|
|
# Install the project's WujiEEGBackend as lhotse's current audio backend.
# NOTE(review): presumably this is what lets lhotse open the .npz recordings
# listed in the index below -- confirm against backend_np.
set_current_audio_backend(WujiEEGBackend())

# CSV index consumed by the script section of this file. The first line is
# skipped as a header; every other line is split into four comma-separated
# fields: npz_path, sess_id, duration, split.
SPLIT = Path("/nvme3/wyc/sleep-net-zero/index/sleep_staging/hsp_nsrr.csv")

# Directory that receives the generated "train.jsonl.gz" / "val.jsonl.gz"
# cut manifests.
DATA_DIR = Path("/home/jinzengrui/proj/biofall/egs/tokenizer/CODEC/data/from_wyc")
|
|
|
|
def _make_cut(npz_path: Path, sess_id: str, duration: float) -> "MonoCut":
    """Build the single-channel cut describing one indexed recording.

    The cut, its recording and its only supervision all share the id
    ``"<sess_id>-<file name up to the first dot>"``. The supervision spans
    the whole cut, carries empty ``text``/``language`` and uses ``sess_id``
    as the speaker.

    Args:
        npz_path: Location of the recording file listed in the index.
        sess_id: Session identifier taken from the index row.
        duration: Cut/supervision duration as given in the index row.

    Returns:
        A ``MonoCut`` starting at 0.0 on channel 0.
    """
    npz_fname = npz_path.stem.split(".")[0]
    cut_id = f"{sess_id}-{npz_fname}"
    recording = Recording.from_file(npz_path, recording_id=cut_id)
    supervision = SupervisionSegment(
        id=cut_id,
        recording_id=cut_id,
        start=0.0,
        duration=duration,
        channel=0,
        text="",
        language="",
        speaker=sess_id,
    )
    return MonoCut(
        id=cut_id,
        start=0.0,
        duration=duration,
        channel=0,
        recording=recording,
        supervisions=[supervision],
    )


def main() -> None:
    """Turn the ``SPLIT`` CSV index into train/val cut manifests.

    Reads ``SPLIT`` (header line skipped; columns: npz_path, sess_id,
    duration, split), builds one cut per row and writes
    ``DATA_DIR / "train.jsonl.gz"`` and ``DATA_DIR / "val.jsonl.gz"``.

    Raises:
        ValueError: If a row does not have exactly four fields, its duration
            is not a number, or its split is neither "train" nor "val".
    """
    # Local import keeps this block self-contained; the file's import header
    # does not provide ``csv``.
    import csv

    # newline="" is what the csv module expects so it can handle line endings
    # itself; csv.reader also copes with quoted fields that contain commas,
    # which a plain ``line.split(",")`` does not.
    with open(SPLIT, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # the first line is a header
        # Materialise the rows so tqdm can show a total; keep the physical
        # line number for error messages.
        rows = [(reader.line_num, row) for row in reader]

    # Fail on an unwritable output location before doing the per-recording
    # work rather than after it.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    cuts_by_split = {"train": [], "val": []}

    for line_num, row in tqdm(rows):
        fields = [field.strip() for field in row]
        if fields in ([], [""]):
            # Blank line (e.g. stray newline at the end of the index).
            continue
        if len(fields) != 4:
            raise ValueError(
                f"{SPLIT}:{line_num}: expected 4 fields "
                f"(npz_path, sess_id, duration, split), got {len(fields)}"
            )
        npz_path, sess_id, duration, split = fields

        # Reject a bad split label before touching the recording file.
        if split not in cuts_by_split:
            raise ValueError(f"Unknown split: {split}")

        cuts_by_split[split].append(
            _make_cut(Path(npz_path), sess_id, float(duration))
        )

    # Dict order is insertion order: train manifest first, then val.
    for split, cuts in cuts_by_split.items():
        CutSet.from_cuts(cuts=cuts).to_jsonl(DATA_DIR / f"{split}.jsonl.gz")


if __name__ == "__main__":
    main()