mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 02:06:13 +00:00
remove the text folder
This commit is contained in:
parent
ba4df19224
commit
f6328edf5b
@ -1,64 +0,0 @@
|
||||
""" from https://github.com/keithito/tacotron """
|
||||
from matcha.text import cleaners
|
||||
from matcha.text.symbols import symbols
|
||||
|
||||
# Mappings from symbol to numeric ID and vice versa:
|
||||
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
||||
_id_to_symbol = {
|
||||
i: s for i, s in enumerate(symbols)
|
||||
} # pylint: disable=unnecessary-comprehension
|
||||
|
||||
|
||||
def text_to_sequence(text, cleaner_names):
|
||||
"""Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
|
||||
Args:
|
||||
text: string to convert to a sequence
|
||||
cleaner_names: names of the cleaner functions to run the text through
|
||||
Returns:
|
||||
List of integers corresponding to the symbols in the text
|
||||
"""
|
||||
sequence = []
|
||||
|
||||
clean_text = _clean_text(text, cleaner_names)
|
||||
for symbol in clean_text:
|
||||
try:
|
||||
if symbol in "_()[]# ̃":
|
||||
continue
|
||||
symbol_id = _symbol_to_id[symbol]
|
||||
except Exception as ex:
|
||||
print(text)
|
||||
print(clean_text)
|
||||
raise RuntimeError(
|
||||
f"text: {text}, clean_text: {clean_text}, ex: {ex}, symbol: {symbol}"
|
||||
)
|
||||
sequence += [symbol_id]
|
||||
return sequence, clean_text
|
||||
|
||||
|
||||
def cleaned_text_to_sequence(cleaned_text):
|
||||
"""Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
|
||||
Args:
|
||||
text: string to convert to a sequence
|
||||
Returns:
|
||||
List of integers corresponding to the symbols in the text
|
||||
"""
|
||||
sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
|
||||
return sequence
|
||||
|
||||
|
||||
def sequence_to_text(sequence):
|
||||
"""Converts a sequence of IDs back to a string"""
|
||||
result = ""
|
||||
for symbol_id in sequence:
|
||||
s = _id_to_symbol[symbol_id]
|
||||
result += s
|
||||
return result
|
||||
|
||||
|
||||
def _clean_text(text, cleaner_names):
|
||||
for name in cleaner_names:
|
||||
cleaner = getattr(cleaners, name)
|
||||
if not cleaner:
|
||||
raise Exception("Unknown cleaner: %s" % name)
|
||||
text = cleaner(text)
|
||||
return text
|
@ -1,130 +0,0 @@
|
||||
""" from https://github.com/keithito/tacotron
|
||||
|
||||
Cleaners are transformations that run over the input text at both training and eval time.
|
||||
|
||||
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
|
||||
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
|
||||
1. "english_cleaners" for English text
|
||||
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
|
||||
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
|
||||
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
|
||||
the symbols in symbols.py to match your data).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
|
||||
import phonemizer
|
||||
from unidecode import unidecode
|
||||
|
||||
# To avoid excessive logging we set the log level of the phonemizer package to Critical
|
||||
critical_logger = logging.getLogger("phonemizer")
|
||||
critical_logger.setLevel(logging.CRITICAL)
|
||||
|
||||
# Intializing the phonemizer globally significantly reduces the speed
|
||||
# now the phonemizer is not initialising at every call
|
||||
# Might be less flexible, but it is much-much faster
|
||||
global_phonemizer = phonemizer.backend.EspeakBackend(
|
||||
language="en-us",
|
||||
preserve_punctuation=True,
|
||||
with_stress=True,
|
||||
language_switch="remove-flags",
|
||||
logger=critical_logger,
|
||||
)
|
||||
|
||||
|
||||
# Regular expression matching whitespace:
|
||||
_whitespace_re = re.compile(r"\s+")
|
||||
|
||||
# List of (regular expression, replacement) pairs for abbreviations:
|
||||
_abbreviations = [
|
||||
(re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
|
||||
for x in [
|
||||
("mrs", "misess"),
|
||||
("mr", "mister"),
|
||||
("dr", "doctor"),
|
||||
("st", "saint"),
|
||||
("co", "company"),
|
||||
("jr", "junior"),
|
||||
("maj", "major"),
|
||||
("gen", "general"),
|
||||
("drs", "doctors"),
|
||||
("rev", "reverend"),
|
||||
("lt", "lieutenant"),
|
||||
("hon", "honorable"),
|
||||
("sgt", "sergeant"),
|
||||
("capt", "captain"),
|
||||
("esq", "esquire"),
|
||||
("ltd", "limited"),
|
||||
("col", "colonel"),
|
||||
("ft", "fort"),
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
def expand_abbreviations(text):
|
||||
for regex, replacement in _abbreviations:
|
||||
text = re.sub(regex, replacement, text)
|
||||
return text
|
||||
|
||||
|
||||
def lowercase(text):
|
||||
return text.lower()
|
||||
|
||||
|
||||
def collapse_whitespace(text):
|
||||
return re.sub(_whitespace_re, " ", text)
|
||||
|
||||
|
||||
def remove_parentheses(text):
|
||||
text = text.replace("(", "")
|
||||
text = text.replace(")", "")
|
||||
text = text.replace("[", "")
|
||||
text = text.replace("]", "")
|
||||
return text
|
||||
|
||||
|
||||
def convert_to_ascii(text):
|
||||
return unidecode(text)
|
||||
|
||||
|
||||
def basic_cleaners(text):
|
||||
"""Basic pipeline that lowercases and collapses whitespace without transliteration."""
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def transliteration_cleaners(text):
|
||||
"""Pipeline for non-English text that transliterates to ASCII."""
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = collapse_whitespace(text)
|
||||
return text
|
||||
|
||||
|
||||
def english_cleaners2(text):
|
||||
"""Pipeline for English text, including abbreviation expansion. + punctuation + stress"""
|
||||
text = convert_to_ascii(text)
|
||||
text = lowercase(text)
|
||||
text = expand_abbreviations(text)
|
||||
text = remove_parentheses(text)
|
||||
phonemes = global_phonemizer.phonemize([text], strip=True, njobs=1)[0]
|
||||
phonemes = collapse_whitespace(phonemes)
|
||||
return phonemes
|
||||
|
||||
|
||||
# I am removing this due to incompatibility with several version of python
|
||||
# However, if you want to use it, you can uncomment it
|
||||
# and install piper-phonemize with the following command:
|
||||
# pip install piper-phonemize
|
||||
|
||||
# import piper_phonemize
|
||||
# def english_cleaners_piper(text):
|
||||
# """Pipeline for English text, including abbreviation expansion. + punctuation + stress"""
|
||||
# text = convert_to_ascii(text)
|
||||
# text = lowercase(text)
|
||||
# text = expand_abbreviations(text)
|
||||
# phonemes = "".join(piper_phonemize.phonemize_espeak(text=text, voice="en-US")[0])
|
||||
# phonemes = collapse_whitespace(phonemes)
|
||||
# return phonemes
|
@ -1,73 +0,0 @@
|
||||
""" from https://github.com/keithito/tacotron """
|
||||
|
||||
import re
|
||||
|
||||
import inflect
|
||||
|
||||
_inflect = inflect.engine()
|
||||
_comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
|
||||
_decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
|
||||
_pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
|
||||
_dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
|
||||
_ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
|
||||
_number_re = re.compile(r"[0-9]+")
|
||||
|
||||
|
||||
def _remove_commas(m):
|
||||
return m.group(1).replace(",", "")
|
||||
|
||||
|
||||
def _expand_decimal_point(m):
|
||||
return m.group(1).replace(".", " point ")
|
||||
|
||||
|
||||
def _expand_dollars(m):
|
||||
match = m.group(1)
|
||||
parts = match.split(".")
|
||||
if len(parts) > 2:
|
||||
return match + " dollars"
|
||||
dollars = int(parts[0]) if parts[0] else 0
|
||||
cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
|
||||
if dollars and cents:
|
||||
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
||||
cent_unit = "cent" if cents == 1 else "cents"
|
||||
return f"{dollars} {dollar_unit}, {cents} {cent_unit}"
|
||||
elif dollars:
|
||||
dollar_unit = "dollar" if dollars == 1 else "dollars"
|
||||
return f"{dollars} {dollar_unit}"
|
||||
elif cents:
|
||||
cent_unit = "cent" if cents == 1 else "cents"
|
||||
return f"{cents} {cent_unit}"
|
||||
else:
|
||||
return "zero dollars"
|
||||
|
||||
|
||||
def _expand_ordinal(m):
|
||||
return _inflect.number_to_words(m.group(0))
|
||||
|
||||
|
||||
def _expand_number(m):
|
||||
num = int(m.group(0))
|
||||
if num > 1000 and num < 3000:
|
||||
if num == 2000:
|
||||
return "two thousand"
|
||||
elif num > 2000 and num < 2010:
|
||||
return "two thousand " + _inflect.number_to_words(num % 100)
|
||||
elif num % 100 == 0:
|
||||
return _inflect.number_to_words(num // 100) + " hundred"
|
||||
else:
|
||||
return _inflect.number_to_words(
|
||||
num, andword="", zero="oh", group=2
|
||||
).replace(", ", " ")
|
||||
else:
|
||||
return _inflect.number_to_words(num, andword="")
|
||||
|
||||
|
||||
def normalize_numbers(text):
|
||||
text = re.sub(_comma_number_re, _remove_commas, text)
|
||||
text = re.sub(_pounds_re, r"\1 pounds", text)
|
||||
text = re.sub(_dollars_re, _expand_dollars, text)
|
||||
text = re.sub(_decimal_number_re, _expand_decimal_point, text)
|
||||
text = re.sub(_ordinal_re, _expand_ordinal, text)
|
||||
text = re.sub(_number_re, _expand_number, text)
|
||||
return text
|
@ -1,15 +0,0 @@
|
||||
""" from https://github.com/keithito/tacotron
|
||||
|
||||
Defines the set of symbols used in text input to the model.
|
||||
"""
|
||||
_pad = "_"
|
||||
_punctuation = ';:,.!?¡¿—…"«»“” '
|
||||
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
||||
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
|
||||
|
||||
|
||||
# Export all symbols:
|
||||
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
|
||||
|
||||
# Special symbol ids
|
||||
SPACE_ID = symbols.index(" ")
|
Loading…
x
Reference in New Issue
Block a user