import argparse import logging from configparser import ConfigParser from pathlib import Path from typing import List from lhotse import CutSet, SupervisionSet from lhotse.recipes.csj import CSJSDBParser ARGPARSE_DESCRIPTION = """ This script adds transcript modes to an existing CutSet or SupervisionSet. """ def get_args(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description=ARGPARSE_DESCRIPTION, ) parser.add_argument( "-f", "--fbank-dir", type=Path, help="Path to directory where manifests are stored.", ) parser.add_argument( "-c", "--config", type=Path, nargs="+", help="Path to config file for transcript parsing.", ) return parser.parse_args() def get_CSJParsers(config_files: List[Path]) -> List[CSJSDBParser]: parsers = [] for config_file in config_files: config = ConfigParser() config.optionxform = str assert config.read(config_file), f"{config_file} could not be found." decisions = {} for k, v in config["DECISIONS"].items(): try: decisions[k] = int(v) except ValueError: decisions[k] = v parsers.append( (config["CONSTANTS"].get("MODE"), CSJSDBParser(decisions=decisions)) ) return parsers def main(): args = get_args() logging.basicConfig( format=("%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"), level=logging.INFO, ) parsers = get_CSJParsers(args.config) config = ConfigParser() config.optionxform = str assert config.read(args.config), args.config decisions = {} for k, v in config["DECISIONS"].items(): try: decisions[k] = int(v) except ValueError: decisions[k] = v logging.info(f"Adding {', '.join(x[0] for x in parsers)} transcript mode.") manifests = args.fbank_dir.glob("csj_cuts_*.jsonl.gz") assert manifests, f"No cuts to be found in {args.fbank_dir}" for manifest in manifests: results = [] logging.info(f"Adding transcript modes to {manifest.name} now.") cutset = CutSet.from_file(manifest) for cut in cutset: for name, parser in parsers: cut.supervisions[0].custom[name] = parser.parse( cut.supervisions[0].custom["raw"] ) cut.supervisions[0].text = "" results.append(cut) results = CutSet.from_items(results) res_file = manifest.as_posix() manifest.replace(manifest.parent / ("bak." + manifest.name)) results.to_file(res_file) if __name__ == "__main__": main()