icefall/egs/aishell3/TTS/local/tokenizer.py
2024-04-06 21:49:32 +08:00

138 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# This file is modified from
# https://github.com/UEhQZXI/vits_chinese/blob/master/vits_strings.py
import logging
from pathlib import Path
from typing import Dict, List

# Note pinyin_dict is from ./pinyin_dict.py
from pinyin_dict import pinyin_dict
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin, load_phrases_dict
class _MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    """pypinyin converter combining ``DefaultConverter`` with ``NeutralToneWith5Mixin``.

    No behavior is added here; the class only composes the two pypinyin
    bases so that it can be handed to ``Pinyin``.
    NOTE(review): per the mixin's name, neutral-tone syllables are presumably
    rendered with the digit 5 -- confirm against the pypinyin documentation.
    """
class Tokenizer:
    """Turn Chinese text into phoneme-level tokens and integer token ids.

    Text is converted to tone-numbered pinyin by pypinyin, every pinyin
    syllable is split into two parts via ``pinyin_dict``, and the resulting
    tokens can be mapped to ids with a token table loaded from disk.
    """

    def __init__(self, tokens: str = ""):
        """
        Args:
          tokens:
            Path to the token table file. If it is an empty string, no table
            is loaded, so ``token2id``, ``vocab_size`` and ``pad_id`` are not
            set and only ``text_to_tokens`` is usable.
        """
        self._load_pinyin_dict()
        self._pinyin_parser = Pinyin(_MyConverter())
        if tokens != "":
            self._load_tokens(tokens)

    def texts_to_token_ids(self, texts: List[str], **kwargs) -> List[List[int]]:
        """
        Args:
          texts:
            A list of sentences.
          kwargs:
            Not used. It is for compatibility with other TTS recipes in icefall.
        Returns:
          One list of token ids per input sentence.
        """
        tokens = [self.text_to_tokens(text) for text in texts]
        return self.tokens_to_token_ids(tokens)

    def tokens_to_token_ids(self, tokens: List[List[str]]) -> List[List[int]]:
        """Map token strings to ids using the loaded token table.

        Tokens that are missing from the table are skipped with a warning
        instead of raising.

        Args:
          tokens:
            A list of token lists, one per sentence.
        Returns:
          A list of id lists, in the same order as ``tokens``.
        """
        ans = []
        for token_list in tokens:
            token_ids = []
            for t in token_list:
                if t not in self.token2id:
                    logging.warning("Skip OOV %s", t)
                    continue
                token_ids.append(self.token2id[t])
            ans.append(token_ids)
        return ans

    def text_to_tokens(self, text: str) -> List[str]:
        """Convert one sentence into a list of phoneme tokens.

        The result always starts with "sil" and ends with "eos".
        A full-width comma (U+FF0C) becomes ["sp", "sil"] and an ideographic
        full stop (U+3002) becomes ["sil"].

        Args:
          text:
            The sentence to convert.
        Returns:
          The list of tokens for ``text``.
        """
        pinyins = self._pinyin_parser.pinyin(
            text,
            style=Style.TONE3,
            # Characters pypinyin cannot convert are returned one by one, so
            # punctuation arrives below as single-character entries.
            errors=lambda x: [[w] for w in x],
        )
        phonemes = ["sil"]
        phonemes.extend(self._get_phoneme4pinyin(self._expand_pauses(pinyins)))
        phonemes.append("eos")
        return phonemes

    @staticmethod
    def _expand_pauses(pinyins: List[List[str]]) -> List[str]:
        """Flatten pypinyin output and replace punctuation with pause tokens.

        Args:
          pinyins:
            Output of ``Pinyin.pinyin``: one list per character; only the
            first entry of each list is used.
        Returns:
          A flat list where U+FF0C is replaced by "sp", "sil", U+3002 by
          "sil", and every other entry is kept unchanged.
        """
        expanded = []
        for candidates in pinyins:
            p = candidates[0]
            # The punctuation marks are written as escapes so that they
            # cannot be confused with (or stripped like) look-alike ASCII
            # characters.
            if p == "\uff0c":  # full-width comma
                expanded.extend(["sp", "sil"])
            elif p == "\u3002":  # ideographic full stop
                expanded.append("sil")
            else:
                expanded.append(p)
        return expanded

    def _get_phoneme4pinyin(self, pinyins):
        """Split tone-numbered pinyin syllables into phoneme tokens.

        "sil" and "sp" are passed through. For any other entry the last
        character is taken as the tone and the rest is looked up in
        ``pinyin_dict``; entries that are not found are dropped silently.

        Args:
          pinyins:
            A list of pinyin strings and pause tokens.
        Returns:
          A list of tokens; each syllable contributes three tokens:
          the first part, the second part with the tone appended, and "#0".
        """
        result = []
        for pinyin in pinyins:
            if pinyin in ("sil", "sp"):
                result.append(pinyin)
            elif pinyin[:-1] in pinyin_dict:
                tone = pinyin[-1]
                a = pinyin[:-1]
                a1, a2 = pinyin_dict[a]
                # every word is appended with a #0
                result += [a1, a2 + tone, "#0"]
        return result

    def _load_pinyin_dict(self):
        """Register the phrases from ``pypinyin-local.dict`` with pypinyin.

        The file is read from the directory containing this source file.
        Each non-empty line holds a phrase followed by its whitespace
        separated pinyins.
        """
        this_dir = Path(__file__).parent.resolve()
        my_dict = {}
        with open(f"{this_dir}/pypinyin-local.dict", "r", encoding="utf-8") as f:
            for line in f:
                cuts = line.strip().split()
                if not cuts:
                    # Tolerate blank lines instead of failing on cuts[0].
                    continue
                hanzi = cuts[0]
                pinyin = cuts[1:]
                my_dict[hanzi] = [[p] for p in pinyin]
        load_phrases_dict(my_dict)

    def _load_tokens(self, filename):
        """Load the token table and set ``token2id``, ``vocab_size``, ``pad_id``.

        Each non-empty line is "<token> <id>". A line containing only an id
        denotes the space token.

        Args:
          filename:
            Path to the token table file.
        Raises:
          AssertionError: if a token appears more than once.
          KeyError: if the table has no "#0" token, which is used as padding.
        """
        token2id: Dict[str, int] = {}
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                info = line.rstrip().split()
                if not info:
                    # Tolerate blank lines instead of failing on info[0].
                    continue
                if len(info) == 1:
                    # case of space
                    token = " "
                    idx = int(info[0])
                else:
                    token, idx = info[0], int(info[1])
                assert token not in token2id, token
                token2id[token] = idx
        self.token2id = token2id
        self.vocab_size = len(self.token2id)
        self.pad_id = self.token2id["#0"]
def main():
    """Run the tokenizer on a sample sentence and print the tokens.

    No token table is passed to ``Tokenizer``, so only the text-to-token
    step is exercised; converting to ids would need a token table file.
    """
    tokenizer = Tokenizer()
    # The previous call targeted ``_sentence_to_ids``, which ``Tokenizer``
    # does not define.
    print(tokenizer.text_to_tokens("你好,好的。"))


if __name__ == "__main__":
    main()