# icefall/egs/aishell3/TTS/local/tokenizer.py

# This file is modified from
# https://github.com/UEhQZXI/vits_chinese/blob/master/vits_strings.py

import logging
from pathlib import Path
from typing import Dict, List

# Note pinyin_dict is from ./pinyin_dict.py
from pinyin_dict import pinyin_dict
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin, load_phrases_dict


class _MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    """pypinyin converter combining ``NeutralToneWith5Mixin`` with the default
    converter; it adds no behaviour of its own."""


class Tokenizer:
    """Convert Chinese text to phoneme tokens and, optionally, to token IDs.

    Text is converted to pinyin with pypinyin (``Style.TONE3``), and every
    syllable found in ``pinyin_dict`` is split into the two sub-units stored
    there, with the tone character attached to the second one.
    """

    def __init__(self, tokens: str = ""):
        """
        Args:
          tokens:
            Path to the token table file. If empty, no table is loaded, so
            ``token2id``, ``vocab_size`` and ``pad_id`` are not set and only
            text-to-token conversion can be used.
        """
        self._load_pinyin_dict()
        self._pinyin_parser = Pinyin(_MyConverter())

        if tokens:
            self._load_tokens(tokens)

    def texts_to_token_ids(self, texts: List[str], **kwargs) -> List[List[int]]:
        """
        Args:
          texts:
            A list of sentences.
          kwargs:
            Not used. It is for compatibility with other TTS recipes in icefall.
        Returns:
          One list of token IDs per input sentence.
        """
        tokens = [self.text_to_tokens(text) for text in texts]
        return self.tokens_to_token_ids(tokens)

    def tokens_to_token_ids(self, tokens: List[List[str]]) -> List[List[int]]:
        """Map token strings to IDs using the loaded token table.

        Tokens missing from the table are skipped with a warning, so an
        output list may be shorter than the corresponding input list.

        Args:
          tokens:
            A list of token lists, one per sentence.
        Returns:
          A list of token ID lists, one per sentence.
        """
        ans = []

        for token_list in tokens:
            token_ids = []
            for t in token_list:
                if t not in self.token2id:
                    logging.warning(f"Skip OOV {t}")
                    continue
                token_ids.append(self.token2id[t])
            ans.append(token_ids)

        return ans

    def text_to_tokens(self, text: str) -> List[str]:
        """Convert a sentence to a list of phoneme tokens.

        The result always starts with "sil" and ends with "eos".

        Args:
          text:
            The input sentence.
        Returns:
          The list of tokens for the sentence.
        """
        # Convert "，" to  ["sp", "sil"]
        # Convert "。" to  ["sil"]
        # append ["eos"] at the end of a sentence
        pinyins = self._pinyin_parser.pinyin(
            text,
            style=Style.TONE3,
            # Pass characters without a pinyin through one by one so that
            # punctuation can be recognised below.
            errors=lambda x: [[w] for w in x],
        )

        new_pinyin = []
        for p in pinyins:
            p = p[0]
            if p == "，":
                new_pinyin.extend(["sp", "sil"])
            elif p == "。":
                new_pinyin.append("sil")
            else:
                new_pinyin.append(p)

        phonemes = ["sil"]
        phonemes.extend(self._get_phoneme4pinyin(new_pinyin))
        phonemes.append("eos")
        return phonemes

    def _get_phoneme4pinyin(self, pinyins):
        """Expand pinyin syllables into phoneme tokens.

        "sil" and "sp" are kept as they are. For any other entry, the last
        character is treated as the tone and the rest is looked up in
        ``pinyin_dict``. Entries that are not found are dropped silently.

        Args:
          pinyins:
            A list of strings: pinyin syllables, "sil" or "sp".
        Returns:
          A list of phoneme tokens.
        """
        result = []
        for pinyin in pinyins:
            if pinyin in ("sil", "sp"):
                result.append(pinyin)
                continue

            stem, tone = pinyin[:-1], pinyin[-1:]
            if stem in pinyin_dict:
                a1, a2 = pinyin_dict[stem]
                # every word is appended with a #0
                result += [a1, a2 + tone, "#0"]

        return result

    def _load_pinyin_dict(self):
        """Register the phrases from ``pypinyin-local.dict`` with pypinyin.

        The file is read from the directory containing this module. Each
        non-blank line holds a phrase followed by its whitespace-separated
        pinyins.
        """
        dict_path = Path(__file__).parent.resolve() / "pypinyin-local.dict"
        my_dict = {}
        with open(dict_path, "r", encoding="utf-8") as f:
            for line in f:
                cuts = line.split()
                if not cuts:
                    # Blank line (e.g. at the end of the file): nothing to add.
                    continue
                hanzi, *pinyin = cuts
                my_dict[hanzi] = [[p] for p in pinyin]

        load_phrases_dict(my_dict)

    def _load_tokens(self, filename):
        """Load the token table and set ``token2id``, ``vocab_size``, ``pad_id``.

        Each non-blank line is ``<token> <id>``. A line containing only an ID
        defines the space token " ". Blank lines are ignored.

        Args:
          filename:
            Path to the token table file.
        Raises:
          AssertionError: If a token appears more than once.
          KeyError: If the table has no "#0" token, which is used as padding.
        """
        token2id: Dict[str, int] = {}

        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                info = line.rstrip().split()
                if not info:
                    # Blank line: skip instead of failing on info[0].
                    continue
                if len(info) == 1:
                    # case of space
                    token = " "
                    idx = int(info[0])
                else:
                    token, idx = info[0], int(info[1])

                assert token not in token2id, token

                token2id[token] = idx

        self.token2id = token2id
        self.vocab_size = len(self.token2id)
        self.pad_id = self.token2id["#0"]


def main():
    """Print the phoneme tokens of a short sample sentence.

    No token table is passed to ``Tokenizer`` here, so only the
    text-to-token conversion is exercised, not the mapping to IDs.
    """
    tokenizer = Tokenizer()
    print(tokenizer.text_to_tokens("你好，好的。"))


# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()