mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
119 lines
4.6 KiB
Python
Executable File
119 lines
4.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
# Copyright 2015 Minhua Wu
|
|
# Apache 2.0
|
|
|
|
# convert acronyms in swbd dict to fisher convention
|
|
# IBM to i._b._m.
|
|
# BBC to b._b._c.
|
|
# BBCs to b._b._c.s
|
|
# BBC's to b._b._c.'s
|
|
|
|
import argparse
|
|
import re
|
|
|
|
__author__ = "Minhua Wu"
|
|
|
|
# A word can only be an acronym if it is made of ASCII letters, optionally
# followed by a possessive 's (this also covers the plural "xxxs" spelling).
_ACRONYM_CANDIDATE = re.compile(r"[A-Za-z]+(?:'s)?")


def _parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The parsed namespace with attributes ``input``, ``output``,
        ``Letter`` and ``Map``.
    """
    parser = argparse.ArgumentParser(description="format acronyms to a._b._c.")
    parser.add_argument("-i", "--input", help="Input lexicon", required=True)
    parser.add_argument("-o", "--output", help="Output lexicon", required=True)
    parser.add_argument(
        "-L", "--Letter", help="Input single letter pronunciation", required=True
    )
    parser.add_argument("-M", "--Map", help="Output acronyms mapping", required=True)
    return parser.parse_args(argv)


def _split_entry(line):
    """Split one lexicon line into ``(word, pronunciation)``.

    The word is the first whitespace-separated field and the pronunciation is
    the rest of the line with surrounding whitespace removed.  Returns
    ``(None, "")`` for a blank line, and an empty pronunciation when the line
    holds only a word.
    """
    fields = line.split(None, 1)
    if not fields:
        return None, ""
    pronunciation = fields[1].strip() if len(fields) > 1 else ""
    return fields[0], pronunciation


def _load_letter_dict(lines):
    """Build the single-letter pronunciation table from an iterable of lines.

    Each non-blank line contributes ``first field -> rest of line``; a later
    line with the same first field overrides an earlier one.  Blank lines are
    skipped.
    """
    dict_letter = {}
    for line in lines:
        letter, pronunciation = _split_entry(line)
        if letter is not None:
            dict_letter[letter] = pronunciation
    return dict_letter


def _spelled_pronunciation(letters, dict_letter):
    """Return the pronunciation of ``letters`` read out one letter at a time.

    Letters are looked up in upper case.  Returns ``None`` when any letter is
    missing from ``dict_letter`` so the caller can treat the word as a
    non-acronym instead of aborting the run.
    """
    pieces = []
    for letter in letters:
        piece = dict_letter.get(letter.upper())
        if piece is None:
            return None
        pieces.append(piece)
    return " ".join(pieces).strip()


def _map_acronym(word, lexicon, dict_letter):
    """Decide whether ``word`` is an acronym and, if so, format it.

    A word is an acronym when its pronunciation equals its letters spelled
    out one by one (for the ``xxx's`` / ``xxxs`` forms: spelled-out letters
    followed by a final ``s`` or ``z``).

    Args:
        word: Lexicon headword.
        lexicon: Pronunciation of ``word`` (already stripped).
        dict_letter: Mapping from upper-case letter to its pronunciation.

    Returns:
        ``(mapped, mapped_back)`` such as ``("b._b._c.'s", "b b c's")``, or
        ``None`` when the word is not an acronym.
    """
    if not _ACRONYM_CANDIDATE.fullmatch(word):
        return None

    # endswith() is False on an empty pronunciation, where indexing [-1]
    # would raise IndexError.
    ends_in_s_or_z = lexicon.endswith(("s", "z"))

    if word.endswith("'s") and ends_in_s_or_z:
        # xxx's: drop the possessive and the trailing " s" / " z".
        stem, expected = word[:-2], lexicon[:-2]
        suffix, back_suffix = ".'s", "'s"
    elif word.endswith("s") and ends_in_s_or_z:
        # xxxs: drop the plural s and the trailing " s" / " z".
        stem, expected = word[:-1], lexicon[:-2]
        suffix, back_suffix = ".s", "'s"
    elif "'" not in word and not word.endswith("s"):
        # Plain xxx.
        stem, expected = word, lexicon
        suffix, back_suffix = ".", ""
    else:
        return None

    # The word "s" on its own leaves nothing to spell out.
    if not stem:
        return None
    if _spelled_pronunciation(stem, dict_letter) != expected:
        return None

    letters = [letter.lower() for letter in stem]
    return "._".join(letters) + suffix, " ".join(letters) + back_suffix


def _format_lexicon(fin_lex, dict_letter, fout_lex, fout_map):
    """Copy ``fin_lex`` to ``fout_lex``, rewriting acronym headwords.

    For every acronym entry the headword is replaced by its ``a._b._c.`` form
    in ``fout_lex`` and a ``word<TAB>mapped<TAB>mapped_back`` line is written
    to ``fout_map``.  All other lines, including blank ones, are copied to
    ``fout_lex`` unchanged.
    """
    for line in fin_lex:
        word, lexicon = _split_entry(line)
        mapping = None if word is None else _map_acronym(word, lexicon, dict_letter)
        if mapping is None:
            fout_lex.write(line)
            continue
        mapped, mapped_back = mapping
        fout_map.write(word + "\t" + mapped + "\t" + mapped_back + "\n")
        fout_lex.write(mapped + " " + lexicon + "\n")


def main(argv=None):
    """Run the conversion described by the command-line arguments.

    Args:
        argv: Argument list; ``None`` means use ``sys.argv[1:]``.
    """
    args = _parse_args(argv)

    with open(args.Letter, "r") as fin_letter:
        dict_letter = _load_letter_dict(fin_letter)

    # Context managers guarantee the outputs are flushed and closed even if
    # processing fails part-way through.
    with open(args.input, "r") as fin_lex:
        with open(args.output, "w") as fout_lex, open(args.Map, "w") as fout_map:
            _format_lexicon(fin_lex, dict_letter, fout_lex, fout_map)


if __name__ == "__main__":
    main()