#!/usr/bin/env python3

# Copyright 2015  Minhua Wu
# Apache 2.0

# convert acronyms in swbd transcript to fisher convention
# according to first two columns in the input acronyms mapping
"""Rewrite acronyms in a transcript file using a tab-separated mapping.

Each transcript line is ``<utterance-id> <word> <word> ...``.  Words are
replaced according to the first two columns of the mapping file, the word
part of the line is lower-cased, and the result is written to the output
file.  The utterance ID is copied through unchanged.
"""

import argparse
import re

__author__ = "Minhua Wu"

# A token consisting of exactly one upper-case ASCII letter.
_SINGLE_LETTER = re.compile(r"^[A-Z]$")


def _build_parser():
    """Return the command-line parser for this script."""
    parser = argparse.ArgumentParser(description="format acronyms to a._b._c.")
    parser.add_argument("-i", "--input", help="Input transcripts", required=True)
    parser.add_argument("-o", "--output", help="Output transcripts", required=True)
    parser.add_argument("-M", "--Map", help="Input acronyms mapping", required=True)
    return parser


def _load_acronym_map(path):
    """Read the mapping file and return ``{column 1: column 2}``.

    Columns are separated by tabs; any columns after the second are ignored.
    Lines with fewer than two columns (e.g. blank lines) are skipped.  When a
    key occurs more than once, the last occurrence wins.
    """
    mapping = {}
    with open(path, "r") as fin_map:
        for pair in fin_map:
            # Strip the line ending first: in a two-column file it would
            # otherwise stay glued to the replacement text and end up in the
            # middle of an output line.
            columns = pair.rstrip("\r\n").split("\t")
            if len(columns) < 2:
                continue
            mapping[columns[0]] = columns[1]
    return mapping


def _convert_words(words, dict_acronym, dict_acronym_noi):
    """Apply both mapping passes to *words* and return the converted list.

    *words* is modified in place by the first pass.

    First pass: a token "I" that has at least one single-upper-case-letter
    neighbour is treated as part of a spelled-out acronym, and the whole run
    of single letters around it is mapped through *dict_acronym*.  A lone "I"
    is left alone.

    Second pass: every remaining token found in *dict_acronym_noi* (the
    mapping without the "I" / "i" entries) is replaced.
    """
    total = len(words)
    for i in range(total):
        if words[i] != "I":
            continue
        # Count consecutive single-letter tokens to the left and right.
        left = 0
        while i - 1 - left >= 0 and _SINGLE_LETTER.match(words[i - 1 - left]):
            left += 1
        right = 0
        while i + 1 + right < total and _SINGLE_LETTER.match(words[i + 1 + right]):
            right += 1
        if left + right > 0:
            for j in range(i - left, i + right + 1):
                # Letters absent from the mapping are kept as they are
                # instead of aborting the whole run with a KeyError.
                words[j] = dict_acronym.get(words[j], words[j])
    return [dict_acronym_noi.get(word, word) for word in words]


def main(argv=None):
    """Convert the transcript named by ``-i`` and write it to ``-o``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.
    """
    args = _build_parser().parse_args(argv)

    dict_acronym = _load_acronym_map(args.Map)
    # Mapping of acronyms without I, i.  Built by filtering so that a mapping
    # file lacking those keys does not raise.
    dict_acronym_noi = {
        key: value for key, value in dict_acronym.items() if key not in ("I", "i")
    }

    with open(args.input, "r") as fin_trans, open(args.output, "w") as fout_trans:
        for line in fin_trans:
            items = line.split()
            if not items:
                # Blank line: there is no utterance ID to emit.
                continue
            # Keep the utterance ID out of both mapping passes so it can
            # never be rewritten or absorbed into a neighbouring acronym.
            utt_id = items[0]
            words = _convert_words(items[1:], dict_acronym, dict_acronym_noi)
            sentence = " ".join(words)
            fout_trans.write(utt_id + " " + sentence.lower() + "\n")


if __name__ == "__main__":
    main()