""" from https://github.com/keithito/tacotron Cleaners are transformations that run over the input text at both training and eval time. Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" hyperparameter. Some cleaners are English-specific. You'll typically want to use: 1. "english_cleaners" for English text 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using the Unidecode library (https://pypi.python.org/pypi/Unidecode) 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update the symbols in symbols.py to match your data). """ import logging import re import phonemizer from unidecode import unidecode # To avoid excessive logging we set the log level of the phonemizer package to Critical critical_logger = logging.getLogger("phonemizer") critical_logger.setLevel(logging.CRITICAL) # Intializing the phonemizer globally significantly reduces the speed # now the phonemizer is not initialising at every call # Might be less flexible, but it is much-much faster global_phonemizer = phonemizer.backend.EspeakBackend( language="en-us", preserve_punctuation=True, with_stress=True, language_switch="remove-flags", logger=critical_logger, ) # Regular expression matching whitespace: _whitespace_re = re.compile(r"\s+") # List of (regular expression, replacement) pairs for abbreviations: _abbreviations = [ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1]) for x in [ ("mrs", "misess"), ("mr", "mister"), ("dr", "doctor"), ("st", "saint"), ("co", "company"), ("jr", "junior"), ("maj", "major"), ("gen", "general"), ("drs", "doctors"), ("rev", "reverend"), ("lt", "lieutenant"), ("hon", "honorable"), ("sgt", "sergeant"), ("capt", "captain"), ("esq", "esquire"), ("ltd", "limited"), ("col", "colonel"), ("ft", "fort"), ] ] def expand_abbreviations(text): for regex, replacement in _abbreviations: text = re.sub(regex, replacement, text) return text def lowercase(text): return text.lower() def collapse_whitespace(text): return re.sub(_whitespace_re, " ", text) def remove_parentheses(text): text = text.replace("(", "") text = text.replace(")", "") text = text.replace("[", "") text = text.replace("]", "") return text def convert_to_ascii(text): return unidecode(text) def basic_cleaners(text): """Basic pipeline that lowercases and collapses whitespace without transliteration.""" text = lowercase(text) text = collapse_whitespace(text) return text def transliteration_cleaners(text): """Pipeline for non-English text that transliterates to ASCII.""" text = convert_to_ascii(text) text = lowercase(text) text = collapse_whitespace(text) return text def english_cleaners2(text): """Pipeline for English text, including abbreviation expansion. + punctuation + stress""" text = convert_to_ascii(text) text = lowercase(text) text = expand_abbreviations(text) text = remove_parentheses(text) phonemes = global_phonemizer.phonemize([text], strip=True, njobs=1)[0] phonemes = collapse_whitespace(phonemes) return phonemes # I am removing this due to incompatibility with several version of python # However, if you want to use it, you can uncomment it # and install piper-phonemize with the following command: # pip install piper-phonemize # import piper_phonemize # def english_cleaners_piper(text): # """Pipeline for English text, including abbreviation expansion. + punctuation + stress""" # text = convert_to_ascii(text) # text = lowercase(text) # text = expand_abbreviations(text) # phonemes = "".join(piper_phonemize.phonemize_espeak(text=text, voice="en-US")[0]) # phonemes = collapse_whitespace(phonemes) # return phonemes