icefall/egs/swbd/ASR/local/format_acronyms_dict.py

#!/usr/bin/env python3

# Copyright 2015  Minhua Wu
# Apache 2.0

# convert acronyms in swbd dict to fisher convention
# IBM to i._b._m.
# BBC to b._b._c.
# BBCs to b._b._c.s
# BBC's to b._b._c.'s

import argparse
import re

__author__ = "Minhua Wu"

# Words made only of ASCII letters, optionally followed by "s" or "'s", are
# the only candidates for acronym conversion.
_CANDIDATE_RE = re.compile(r"^[A-Za-z]+$|^[A-Za-z]+\'s$|^[A-Za-z]+s$")


def split_entry(line):
    """Split one lexicon line into its word and pronunciation.

    Args:
      line: A raw line of the form "<word> <phone> <phone> ...".

    Returns:
      A ``(word, pronunciation)`` tuple where the pronunciation has its
      surrounding whitespace removed (it is "" when the line holds only a
      word), or ``None`` when the line is blank.
    """
    fields = line.split(None, 1)
    if not fields:
        return None
    pronunciation = fields[1].strip() if len(fields) > 1 else ""
    return fields[0], pronunciation


def load_letter_dict(lines):
    """Build the single-letter pronunciation table.

    Args:
      lines: An iterable of lines of the form "<letter> <phone> ...".

    Returns:
      A dict mapping the first field of every non-blank line to the rest of
      that line. When a key occurs more than once the last line wins.
    """
    dict_letter = {}
    for line in lines:
        entry = split_entry(line)
        if entry is None:
            # Blank lines carry no pronunciation; skip them instead of failing.
            continue
        letter, pronunciation = entry
        dict_letter[letter] = pronunciation
    return dict_letter


def map_acronym(word, lexicon, dict_letter):
    """Decide whether ``word`` is an acronym and build its mapped forms.

    A word counts as an acronym when its pronunciation is exactly the
    concatenation of the pronunciations of its letters, optionally followed
    by a final " s" / " z" for the "xxxs" and "xxx's" forms.

    Args:
      word: The lexicon word, e.g. "ibm", "bbcs" or "bbc's".
      lexicon: The pronunciation of ``word`` with outer whitespace stripped.
      dict_letter: Upper-case letter -> pronunciation table.

    Returns:
      A ``(mapped, mapped_back)`` tuple such as ``("b._b._c.'s", "b b c's")``
      or ``None`` when the word is not recognised as an acronym.
    """
    if not _CANDIDATE_RE.match(word):
        return None

    # endswith() is False for an empty pronunciation, where indexing would fail.
    ends_in_s_or_z = lexicon.endswith(("s", "z"))
    if word.endswith("'s") and ends_in_s_or_z:
        # xxx's: drop "'s" from the word and the trailing " s"/" z" phone.
        stem, spoken, tail, tail_back = word[:-2], lexicon[:-2], ".'s", "'s"
    elif word.endswith("s") and ends_in_s_or_z:
        # xxxs: drop "s" from the word and the trailing " s"/" z" phone.
        stem, spoken, tail, tail_back = word[:-1], lexicon[:-2], ".s", "'s"
    elif "'" not in word and not word.endswith("s"):
        # Plain xxx: the whole word must be spelled out letter by letter.
        stem, spoken, tail, tail_back = word, lexicon, ".", ""
    else:
        return None

    if not stem:
        # The bare word "s" leaves no letters to spell out.
        return None
    try:
        spelled = " ".join(dict_letter[ch.upper()] for ch in stem).strip()
    except KeyError:
        # A letter without a known pronunciation cannot be verified as part
        # of an acronym, so the word is left untouched.
        return None
    if spelled != spoken:
        return None

    letters = stem.lower()
    return "._".join(letters) + tail, " ".join(letters) + tail_back


def convert_line(line, dict_letter):
    """Convert one lexicon line.

    Args:
      line: A raw lexicon line, including its line terminator if any.
      dict_letter: Upper-case letter -> pronunciation table.

    Returns:
      A ``(lexicon_line, map_line)`` tuple. For an acronym, ``lexicon_line``
      is "<mapped> <pronunciation>\\n" and ``map_line`` is
      "<word>\\t<mapped>\\t<mapped_back>\\n". For anything else the input
      line is returned unchanged and ``map_line`` is ``None``.
    """
    entry = split_entry(line)
    if entry is not None:
        word, lexicon = entry
        mapping = map_acronym(word, lexicon, dict_letter)
        if mapping is not None:
            mapped, mapped_back = mapping
            map_line = word + "\t" + mapped + "\t" + mapped_back + "\n"
            return mapped + " " + lexicon + "\n", map_line
    return line, None


def main():
    """Parse the command line and rewrite the lexicon and the acronym map."""
    parser = argparse.ArgumentParser(description="format acronyms to a._b._c.")
    parser.add_argument("-i", "--input", help="Input lexicon", required=True)
    parser.add_argument("-o", "--output", help="Output lexicon", required=True)
    parser.add_argument(
        "-L", "--Letter", help="Input single letter pronunciation", required=True
    )
    parser.add_argument(
        "-M", "--Map", help="Output acronyms mapping", required=True
    )
    args = parser.parse_args()

    with open(args.Letter, "r") as fin_letter:
        dict_letter = load_letter_dict(fin_letter)

    # The context managers close (and flush) every file even on error.
    with open(args.input, "r") as fin_lex:
        with open(args.output, "w") as fout_lex, open(args.Map, "w") as fout_map:
            for line in fin_lex:
                lexicon_line, map_line = convert_line(line, dict_letter)
                fout_lex.write(lexicon_line)
                if map_line is not None:
                    fout_map.write(map_line)


if __name__ == "__main__":
    main()