mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-10 10:32:17 +00:00
95 lines
2.7 KiB
Python
Executable File
95 lines
2.7 KiB
Python
Executable File
import argparse
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
POSITIONS = ("DA01", "DA02", "DA03", "DA04")
|
|
|
|
|
|
def get_args():
|
|
parser = argparse.ArgumentParser()
|
|
|
|
parser.add_argument(
|
|
"--corpus-dir",
|
|
type=str,
|
|
help="""Corpus directory path to compute overlap ratio. If None, we will use all""",
|
|
)
|
|
|
|
return parser.parse_args()
|
|
|
|
|
|
def compute_overlap_ratio(
|
|
corpus_dir: Optional[str] = None,
|
|
):
|
|
corpus_dir = Path(corpus_dir)
|
|
sections = os.listdir(corpus_dir / "train")
|
|
infos = []
|
|
for section in sorted(sections):
|
|
intervals = []
|
|
overlap = 0
|
|
duration = 0
|
|
for position in POSITIONS:
|
|
text_path = (
|
|
corpus_dir / "train" / section / (position + ".TextGrid")
|
|
).resolve()
|
|
if not text_path.is_file():
|
|
continue
|
|
|
|
with open(text_path) as f:
|
|
datalines = f.read().splitlines()
|
|
|
|
for dataline in datalines:
|
|
if "xmin =" in dataline:
|
|
start = float(dataline.split("=")[1].strip())
|
|
elif "xmax =" in dataline:
|
|
end = float(dataline.split("=")[1].strip())
|
|
elif "text" in dataline:
|
|
text = dataline.split('"')[1].strip()
|
|
if len(text) > 0:
|
|
intervals.append((start, end))
|
|
duration += end - start
|
|
|
|
intervals = sorted(intervals, key=lambda x: x[1])
|
|
st = -1
|
|
ed = -1
|
|
for interval in intervals:
|
|
if ed < interval[0]: # No overlap
|
|
st = interval[0]
|
|
ed = interval[1]
|
|
else:
|
|
overlap += ed - max(st, interval[0])
|
|
st = min(st, interval[0])
|
|
ed = interval[1]
|
|
|
|
infos.append((section, overlap, duration))
|
|
|
|
total_overlap = 0
|
|
total_duration = 0
|
|
for info in infos:
|
|
total_overlap += info[1]
|
|
total_duration += info[2]
|
|
logging.info(
|
|
f"section: {info[0]}\t overlap: {info[1]}\t duration: {info[2]}\t overlap ratio: {info[1] / info[2]}"
|
|
)
|
|
|
|
logging.info(
|
|
f"total duration: {total_duration}\t total overlap: {total_overlap}\t total overlap ratio: {total_overlap / total_duration}"
|
|
)
|
|
|
|
|
|
def main():
|
|
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
|
|
|
logging.basicConfig(format=formatter, level=logging.INFO, filename="overlap_info.log")
|
|
args = get_args()
|
|
logging.info(vars(args))
|
|
compute_overlap_ratio(
|
|
corpus_dir=args.corpus_dir,
|
|
)
|
|
logging.info("Done")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|