icefall/egs/tokenizer/CODEC/local/prepare_eeg_manifest.py
2024-10-24 22:58:52 +08:00

55 lines
1.8 KiB
Python

import csv
from pathlib import Path

from lhotse import CutSet, MonoCut, Recording, SupervisionSegment
from lhotse.audio.backend import set_current_audio_backend
from tqdm import tqdm

from backend_np import WujiEEGBackend
# Make WujiEEGBackend the current lhotse audio backend before any recording is
# opened by the main block (presumably this is what lets lhotse read the .npz
# EEG files -- confirm against backend_np).
set_current_audio_backend(WujiEEGBackend())
# Index CSV consumed by the main block: the first line is skipped as a header
# and every following row supplies npz_path, sess_id, duration, split.
SPLIT=Path("/nvme3/wyc/sleep-net-zero/index/sleep_staging/hsp_nsrr.csv")
# Directory into which the main block writes train.jsonl.gz and val.jsonl.gz.
DATA_DIR=Path("/home/jinzengrui/proj/biofall/egs/tokenizer/CODEC/data/from_wyc")
def _build_cut(npz_path: Path, sess_id: str, duration: float) -> MonoCut:
    """Build a single-channel cut spanning ``[0, duration)`` of one recording.

    The cut, its recording and its only supervision all share the id
    ``"<sess_id>-<file name up to the first dot>"``.

    Args:
        npz_path: Path of the recording file, handed to ``Recording.from_file``.
        sess_id: Session identifier; used as the id prefix and as the
            supervision's ``speaker``.
        duration: Duration assigned to both the cut and its supervision, as
            given by the index CSV.

    Returns:
        A ``MonoCut`` on channel 0 carrying one supervision with empty
        ``text`` and ``language``.
    """
    # Keep only what precedes the first dot so multi-suffix names such as
    # "night1.eeg.npz" yield "night1".
    npz_fname = npz_path.stem.split(".")[0]
    utt_id = f"{sess_id}-{npz_fname}"
    recording = Recording.from_file(npz_path, recording_id=utt_id)
    return MonoCut(
        id=utt_id,
        start=0.0,
        duration=duration,
        channel=0,
        recording=recording,
        supervisions=[
            SupervisionSegment(
                id=utt_id,
                recording_id=utt_id,
                start=0.0,
                duration=duration,
                channel=0,
                text="",
                language="",
                speaker=sess_id,
            )
        ],
    )


if __name__ == "__main__":
    # Parse with the csv module (handles quoted fields and \r\n endings) and
    # drop blank rows so a trailing empty line does not abort the run.
    with open(SPLIT, "r", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # the first row is a header
        rows = [
            (reader.line_num, row)
            for row in reader
            if any(field.strip() for field in row)
        ]

    # One bucket per known split label; each bucket becomes
    # "<label>.jsonl.gz" under DATA_DIR.
    cuts_by_split = {"train": [], "val": []}
    for line_no, row in tqdm(rows):
        if len(row) != 4:
            raise ValueError(
                f"{SPLIT}:{line_no}: expected 4 fields "
                f"(npz_path, sess_id, duration, split), got {len(row)}"
            )
        npz_path, sess_id, duration, split = (field.strip() for field in row)
        # Reject an unknown split before opening the recording, which is the
        # expensive step.
        if split not in cuts_by_split:
            raise ValueError(f"Unknown split: {split}")
        cuts_by_split[split].append(
            _build_cut(Path(npz_path), sess_id, float(duration))
        )

    # Create the output directory up front so writing does not fail on a
    # fresh checkout.
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    for split, cuts in cuts_by_split.items():
        CutSet.from_cuts(cuts=cuts).to_jsonl(DATA_DIR / f"{split}.jsonl.gz")