From 35984ca22420d43cb0582a682424c8780f439456 Mon Sep 17 00:00:00 2001 From: JinZr Date: Mon, 26 Jun 2023 17:27:42 +0800 Subject: [PATCH] removed unused scripts --- egs/swbd/ASR/local/prepare_swbd_testsets.py | 161 -------------------- 1 file changed, 161 deletions(-) delete mode 100755 egs/swbd/ASR/local/prepare_swbd_testsets.py diff --git a/egs/swbd/ASR/local/prepare_swbd_testsets.py b/egs/swbd/ASR/local/prepare_swbd_testsets.py deleted file mode 100755 index 62ad0fda8..000000000 --- a/egs/swbd/ASR/local/prepare_swbd_testsets.py +++ /dev/null @@ -1,161 +0,0 @@ -import argparse -import logging -import os -from pathlib import Path -import tarfile -from itertools import chain -from typing import Dict, Optional, Union - -from lhotse import fix_manifests, validate_recordings_and_supervisions -from lhotse.audio import Recording, RecordingSet -from lhotse.supervision import SupervisionSegment, SupervisionSet -from lhotse.utils import Pathlike, check_and_rglob, resumable_download, safe_extract - -import sentencepiece as spm -from filter_cuts import filter_cuts -from lhotse import CutSet, Fbank, FbankConfig, LilcomChunkyWriter -from lhotse.recipes.utils import read_manifests_if_cached - -from icefall.utils import get_executor, str2bool - - -def prepare_switchboard( - audio_dir: Pathlike, - transcripts_dir: Optional[Pathlike] = None, - sentiment_dir: Optional[Pathlike] = None, - output_dir: Optional[Pathlike] = None, - omit_silence: bool = True, - absolute_paths: bool = False, -) -> Dict[str, Union[RecordingSet, SupervisionSet]]: - """ - Prepare manifests for the Switchboard corpus. - We create two manifests: one with recordings, and the other one with text supervisions. - When ``sentiment_dir`` is provided, we create another supervision manifest with sentiment annotations. - :param audio_dir: Path to ``LDC97S62`` package. - :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions"). 
- If not provided, the transcripts will be downloaded. - :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations - for SWBD segments. - :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing. - :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept. - :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings. - :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``. - """ - audio_paths = check_and_rglob(audio_dir, "*.sph") - text_paths = check_and_rglob(transcripts_dir, "*trans.text") - - groups = [] - name_to_text = {p.stem.split("-")[0]: p for p in text_paths} - for ap in audio_paths: - name = ap.stem.replace("sw0", "sw") - groups.append( - { - "audio": ap, - "text-0": name_to_text[f"{name}A"], - "text-1": name_to_text[f"{name}B"], - } - ) - - recordings = RecordingSet.from_recordings( - Recording.from_file( - group["audio"], relative_path_depth=None if absolute_paths else 3 - ) - for group in groups - ) - supervisions = SupervisionSet.from_segments( - chain.from_iterable( - make_segments( - transcript_path=group[f"text-{channel}"], - recording=recording, - channel=channel, - omit_silence=omit_silence, - ) - for group, recording in zip(groups, recordings) - for channel in [0, 1] - ) - ) - - recordings, supervisions = fix_manifests(recordings, supervisions) - validate_recordings_and_supervisions(recordings, supervisions) - - if sentiment_dir is not None: - parse_and_add_sentiment_labels(sentiment_dir, supervisions) - - if output_dir is not None: - output_dir = Path(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - recordings.to_file(output_dir / "swbd_recordings_all.jsonl.gz") - supervisions.to_file(output_dir / "swbd_supervisions_all.jsonl.gz") - return {"recordings": recordings, "supervisions": supervisions} - - -def make_segments( - 
transcript_path: Path, recording: Recording, channel: int, omit_silence: bool = True -): - lines = transcript_path.read_text().splitlines() - return [ - SupervisionSegment( - id=segment_id, - recording_id=recording.id, - start=float(start), - duration=round(float(end) - float(start), ndigits=8), - channel=channel, - text=" ".join(words), - language="English", - speaker=f"{recording.id}A", - ) - for segment_id, start, end, *words in map(str.split, lines) - if words[0] != "[silence]" or not omit_silence - ] - - -def download_and_untar( - target_dir: Pathlike = ".", force_download: bool = False, url: str = SWBD_TEXT_URL -) -> Path: - target_dir = Path(target_dir) - transcript_dir = target_dir / "swb_ms98_transcriptions" - if transcript_dir.is_dir(): - return transcript_dir - target_dir.mkdir(parents=True, exist_ok=True) - tar_name = "switchboard_word_alignments.tar.gz" - tar_path = target_dir / tar_name - resumable_download(url, filename=tar_path, force_download=force_download) - with tarfile.open(tar_path) as tar: - safe_extract(tar, path=target_dir) - return transcript_dir - - -def parse_and_add_sentiment_labels( - sentiment_dir: Pathlike, supervisions: SupervisionSet -): - """Read 'LDC2020T14' sentiment annotations and add then to the supervision segments.""" - import pandas as pd - - # Sanity checks - sentiment_dir = Path(sentiment_dir) - labels = sentiment_dir / "data" / "sentiment_labels.tsv" - assert sentiment_dir.is_dir() and labels.is_file() - # Read the TSV as a dataframe - df = pd.read_csv(labels, delimiter="\t", names=["id", "start", "end", "sentiment"]) - # We are going to match the segments in LDC2020T14 with the ones we already - # parsed from ISIP transcripts. We simply look which of the existing segments - # fall into a sentiment-annotated time span. When doing it this way, we use - # 51773 out of 52293 available sentiment annotations, which should be okay. 
- for _, row in df.iterrows(): - call_id = row["id"].split("_")[0] - matches = list( - supervisions.find( - recording_id=call_id, - start_after=row["start"] - 1e-2, - end_before=row["end"] + 1e-2, - ) - ) - if not matches: - continue - labels = row["sentiment"].split("#") - # SupervisionSegments returned from .find() are references to the ones in the - # SupervisionSet, so we can just modify them. We use the "custom" field - # to add the sentiment label. Since there were multiple annotators, - # we add all available labels and leave it up to the user to disambiguate them. - for segment in matches: - segment.custom = {f"sentiment{i}": label for i, label in enumerate(labels)}