mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
61 lines
1.8 KiB
Python
Executable File
61 lines
1.8 KiB
Python
Executable File
#!/usr/bin/env python3

# Copyright 2015 Minhua Wu
# Apache 2.0

# convert acronyms in swbd transcript to fisher convention
# according to first two columns in the input acronyms mapping

import argparse
import re

__author__ = "Minhua Wu"

# Matches a token that is exactly one uppercase ASCII letter; compiled once
# because it is evaluated for the neighbours of every "I" in the transcripts.
_SINGLE_UPPER = re.compile(r"^[A-Z]$")


def get_parser():
    """Build the command-line parser (-i/--input, -o/--output, -M/--Map)."""
    parser = argparse.ArgumentParser(description="format acronyms to a._b._c.")
    parser.add_argument("-i", "--input", help="Input transcripts", required=True)
    parser.add_argument("-o", "--output", help="Output transcripts", required=True)
    parser.add_argument("-M", "--Map", help="Input acronyms mapping", required=True)
    return parser


def parse_acronym_map(lines):
    """Build the acronym lookup tables from tab-separated mapping lines.

    Only the first two columns of each line are used: column one is the
    token as it appears in the transcript, column two is its replacement.

    Args:
        lines: Iterable of strings, e.g. an open text file.

    Returns:
        A pair ``(dict_acronym, dict_acronym_noi)``. The second dict is a
        copy of the first without the keys ``"I"`` and ``"i"``.
    """
    dict_acronym = {}
    for pair in lines:
        # Strip the line terminator first so that a two-column line does not
        # leave a trailing newline inside the replacement value.
        items = pair.rstrip("\n").split("\t")
        if len(items) < 2:
            # Blank or malformed line: nothing to map.
            continue
        dict_acronym[items[0]] = items[1]

    # Mapping of acronyms without I, i
    dict_acronym_noi = dict(dict_acronym)
    dict_acronym_noi.pop("I", None)
    dict_acronym_noi.pop("i", None)
    return dict_acronym, dict_acronym_noi


def convert_line(line, dict_acronym, dict_acronym_noi):
    """Convert one transcript line and return it without a trailing newline.

    The line is split on whitespace. The first token is written back as-is
    in front of the lower-cased remainder of the line.

    Args:
        line: One raw transcript line.
        dict_acronym: Full mapping, used when "I" is part of a spelled-out
            acronym.
        dict_acronym_noi: Mapping without "I"/"i", used for every token.

    Returns:
        The converted line, or an empty string for a blank input line.
    """
    items = line.split()
    if not items:
        return ""
    num_items = len(items)

    # First pass mapping to map I as part of acronym: an "I" that has at
    # least one single-capital-letter neighbour is treated as a letter of a
    # spelled-out acronym, and the whole run of letters is mapped.
    for i in range(num_items):
        if items[i] != "I":
            continue

        x = 0  # number of single-letter tokens directly before position i
        while i - 1 - x >= 0 and _SINGLE_UPPER.match(items[i - 1 - x]):
            x += 1

        y = 0  # number of single-letter tokens directly after position i
        while i + 1 + y < num_items and _SINGLE_UPPER.match(items[i + 1 + y]):
            y += 1

        if x + y > 0:
            for j in range(i - x, i + y + 1):
                # Letters missing from the map are left unchanged.
                items[j] = dict_acronym.get(items[j], items[j])

    # Second pass mapping (not mapping 'i' and 'I')
    items = [dict_acronym_noi.get(token, token) for token in items]

    sentence = " ".join(items[1:])
    return items[0] + " " + sentence.lower()


def main():
    """Read the mapping and the transcripts, write the converted transcripts."""
    args = get_parser().parse_args()

    with open(args.Map, "r") as fin_map:
        dict_acronym, dict_acronym_noi = parse_acronym_map(fin_map)

    with open(args.input, "r") as fin_trans, open(args.output, "w") as fout_trans:
        for line in fin_trans:
            converted = convert_line(line, dict_acronym, dict_acronym_noi)
            fout_trans.write(converted + "\n")


if __name__ == "__main__":
    main()