icefall/egs/swbd/ASR/local/format_acronyms_dict.py
2023-10-07 11:44:18 +08:00

119 lines
4.6 KiB
Python
Executable File

#!/usr/bin/env python3
# Copyright 2015 Minhua Wu
# Apache 2.0
# convert acronyms in swbd dict to fisher convention
# IBM to i._b._m.
# BBC to b._b._c.
# BBCs to b._b._c.s
# BBC's to b._b._c.'s
import argparse
import re
__author__ = "Minhua Wu"

# A word is only considered as an acronym candidate when it is made of ASCII
# letters, optionally followed by "'s".  The pattern is kept exactly as it
# was; it is compiled once instead of being re-parsed for every entry.
_CANDIDATE_RE = re.compile(r"^[A-Za-z]+$|^[A-Za-z]+\'s$|^[A-Za-z]+s$")


def _split_entry(line):
    """Split one lexicon line into ``(word, pronunciation)``.

    The word is the first whitespace-delimited token; the pronunciation is
    the rest of the line with surrounding whitespace removed ("" when the
    line holds nothing but the word).  Returns ``None`` for a blank line.
    """
    fields = line.split(None, 1)
    if not fields:
        return None
    pronunciation = fields[1].strip() if len(fields) > 1 else ""
    return fields[0], pronunciation


def _load_letter_dict(lines):
    """Build the ``{letter: pronunciation}`` table from an iterable of lines.

    Blank lines are skipped.  When a letter appears more than once the last
    occurrence wins.
    """
    dict_letter = {}
    for line in lines:
        entry = _split_entry(line)
        if entry is not None:
            letter, pronunciation = entry
            dict_letter[letter] = pronunciation
    return dict_letter


def _spell_out(letters, dict_letter):
    """Return the pronunciation obtained by reading ``letters`` one by one.

    Letters are looked up in upper case.  Returns ``None`` when any letter is
    absent from ``dict_letter``, so the caller can treat the word as a
    non-acronym instead of aborting on a ``KeyError``.
    """
    try:
        return " ".join(dict_letter[ch.upper()] for ch in letters).strip()
    except KeyError:
        return None


def _convert_entry(lex, dict_letter):
    """Convert one lexicon line.

    Args:
        lex: the raw lexicon line, including its line terminator if any.
        dict_letter: single-letter pronunciation table, keyed by upper-case
            letter.

    Returns:
        A ``(lexicon_line, map_line)`` pair.  For an acronym, ``lexicon_line``
        is the entry rewritten as ``a._b._c.``, ``a._b._c.s`` or
        ``a._b._c.'s`` followed by the original pronunciation, and
        ``map_line`` is ``word<TAB>mapped<TAB>mapped_back``.  For anything
        else ``lexicon_line`` is ``lex`` unchanged and ``map_line`` is
        ``None``.
    """
    entry = _split_entry(lex)
    if entry is None:
        return lex, None
    word, lexicon = entry
    if not _CANDIDATE_RE.match(word):
        return lex, None

    # endswith() is False on an empty pronunciation, where indexing [-1]
    # would raise.
    sibilant_final = lexicon.endswith(("s", "z"))

    if word.endswith("'s") and sibilant_final:
        # xxx's: drop "'s" from the word and the final " s"/" z" phone.
        stem, spoken, suffix, back_suffix = word[:-2], lexicon[:-2], ".'s", "'s"
    elif word.endswith("s") and sibilant_final:
        # xxxs: drop "s" from the word and the final " s"/" z" phone.
        stem, spoken, suffix, back_suffix = word[:-1], lexicon[:-2], ".s", "'s"
    elif "'" not in word and not word.endswith("s"):
        # plain xxx
        stem, spoken, suffix, back_suffix = word, lexicon, ".", ""
    else:
        return lex, None

    # An empty stem (the word "s" on its own) cannot be an acronym.  The word
    # is an acronym only when its pronunciation is exactly its letters read
    # out one after another.
    if not stem or _spell_out(stem, dict_letter) != spoken:
        return lex, None

    lowered = [ch.lower() for ch in stem]
    acronym_mapped = "._".join(lowered) + suffix
    acronym_mapped_back = " ".join(lowered) + back_suffix
    map_line = word + "\t" + acronym_mapped + "\t" + acronym_mapped_back + "\n"
    return acronym_mapped + " " + lexicon + "\n", map_line


def main(argv=None):
    """Rewrite acronyms of a lexicon and record the mapping that was applied.

    Args:
        argv: command-line arguments without the program name; ``None`` makes
            argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="format acronyms to a._b._c.")
    parser.add_argument("-i", "--input", help="Input lexicon", required=True)
    parser.add_argument("-o", "--output", help="Output lexicon", required=True)
    parser.add_argument(
        "-L", "--Letter", help="Input single letter pronunciation", required=True
    )
    parser.add_argument("-M", "--Map", help="Output acronyms mapping", required=True)
    args = parser.parse_args(argv)

    # Initialise single letter dictionary
    with open(args.Letter, "r") as fin_letter:
        dict_letter = _load_letter_dict(fin_letter)

    # Context managers guarantee both outputs are flushed and closed, also
    # when an error interrupts the conversion.
    with open(args.input, "r") as fin_lex:
        with open(args.output, "w") as fout_lex, open(args.Map, "w") as fout_map:
            for lex in fin_lex:
                lexicon_line, map_line = _convert_entry(lex, dict_letter)
                fout_lex.write(lexicon_line)
                if map_line is not None:
                    fout_map.write(map_line)


if __name__ == "__main__":
    main()