icefall/egs/icmcasr/ASR/local/compute_overlap_ratio.py
2023-10-16 19:28:21 +08:00

95 lines
2.7 KiB
Python
Executable File

import argparse
import logging
import os
from pathlib import Path
from typing import Optional
POSITIONS = ("DA01", "DA02", "DA03", "DA04")
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--corpus-dir",
type=str,
help="""Corpus directory path to compute overlap ratio. If None, we will use all""",
)
return parser.parse_args()
def compute_overlap_ratio(
corpus_dir: Optional[str] = None,
):
corpus_dir = Path(corpus_dir)
sections = os.listdir(corpus_dir / "train")
infos = []
for section in sorted(sections):
intervals = []
overlap = 0
duration = 0
for position in POSITIONS:
text_path = (
corpus_dir / "train" / section / (position + ".TextGrid")
).resolve()
if not text_path.is_file():
continue
with open(text_path) as f:
datalines = f.read().splitlines()
for dataline in datalines:
if "xmin =" in dataline:
start = float(dataline.split("=")[1].strip())
elif "xmax =" in dataline:
end = float(dataline.split("=")[1].strip())
elif "text" in dataline:
text = dataline.split('"')[1].strip()
if len(text) > 0:
intervals.append((start, end))
duration += end - start
intervals = sorted(intervals, key=lambda x: x[1])
st = -1
ed = -1
for interval in intervals:
if ed < interval[0]: # No overlap
st = interval[0]
ed = interval[1]
else:
overlap += ed - max(st, interval[0])
st = min(st, interval[0])
ed = interval[1]
infos.append((section, overlap, duration))
total_overlap = 0
total_duration = 0
for info in infos:
total_overlap += info[1]
total_duration += info[2]
logging.info(
f"section: {info[0]}\t overlap: {info[1]}\t duration: {info[2]}\t overlap ratio: {info[1] / info[2]}"
)
logging.info(
f"total duration: {total_duration}\t total overlap: {total_overlap}\t total overlap ratio: {total_overlap / total_duration}"
)
def main():
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO, filename="overlap_info.log")
args = get_args()
logging.info(vars(args))
compute_overlap_ratio(
corpus_dir=args.corpus_dir,
)
logging.info("Done")
if __name__ == "__main__":
main()