# Copyright 2024 Xiaomi Corp. (authors: Fangjun Kuang)

import logging
from typing import Dict, List

import tacotron_cleaner.cleaners

try:
    from piper_phonemize import phonemize_espeak
except Exception as ex:
    raise RuntimeError(
        f"{ex}\nPlease run\n"
        "pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html"
    )

from utils import intersperse


# This tokenizer supports both English and Chinese.
# We assume you have used
# ../local/convert_text_to_tokens.py
# to process your text.
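#
# For illustration only (hypothetical output of that preprocessing): a mixed
# sentence such as "今天is Monday" is expected to arrive here already
# segmented (e.g. with jieba) and converted to pinyin, roughly
# ["jin1", "tian1", "is", " ", "Monday"]; words that are not in the token
# table are phonemized with espeak inside texts_to_token_ids().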
class Tokenizer(object):
    def __init__(self, tokens: str):
        """
        Args:
          tokens: the file that maps tokens to ids
        """
        # Parse token file
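        # A sketch of the assumed file format (the ids here are made up):
        # each line is "<token> <id>", e.g.
        #   _ 0
        #   a 1
        #   b 2
        # and a line containing only an id denotes the space token " ".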
        self.token2id: Dict[str, int] = {}
        with open(tokens, "r", encoding="utf-8") as f:
            for line in f.readlines():
                info = line.rstrip().split()
                if len(info) == 1:
                    # case of space
                    token = " "
                    id = int(info[0])
                else:
                    token, id = info[0], int(info[1])

                assert token not in self.token2id, token

                self.token2id[token] = id

        # Refer to https://github.com/rhasspy/piper/blob/master/TRAINING.md
        self.pad_id = self.token2id["_"]  # padding
        self.space_id = self.token2id[" "]  # word separator (whitespace)

        self.vocab_size = len(self.token2id)

    def texts_to_token_ids(
        self,
        sentence_list: List[List[str]],
        intersperse_blank: bool = True,
        lang: str = "en-us",
    ) -> List[List[int]]:
        """
        Args:
          sentence_list:
            A list of sentences; each sentence is a list of words.
          intersperse_blank:
            Whether to intersperse blanks in the token sequence.
          lang:
            Language argument passed to phonemize_espeak().

        Returns:
          Return a list of token-id lists, indexed as [utterance][token_id].
        """
        token_ids_list = []

        for sentence in sentence_list:
            tokens_list = []
            for word in sentence:
                if word in self.token2id:
                    # The word is already a known token, e.g. a pinyin syllable.
                    tokens_list.append(word)
                    continue

                # Otherwise phonemize it with espeak; the result is a list of
                # per-word phoneme lists, so flatten it.
                tmp_tokens_list = phonemize_espeak(word, lang)
                for t in tmp_tokens_list:
                    tokens_list.extend(t)

            token_ids = []
            for t in tokens_list:
                if t not in self.token2id:
                    logging.warning(f"Skip OOV {t} {sentence}")
                    continue

                # Collapse consecutive spaces into a single space id.
                if t == " " and len(token_ids) > 0 and token_ids[-1] == self.space_id:
                    continue

                token_ids.append(self.token2id[t])

            if intersperse_blank:
                token_ids = intersperse(token_ids, self.pad_id)

            token_ids_list.append(token_ids)

        return token_ids_list


def test_tokenizer():
    import jieba
    from pypinyin import Style, lazy_pinyin

    tokenizer = Tokenizer("data/tokens.txt")
    text1 = "今天is Monday, tomorrow is 星期二"
    text2 = "你好吗? 我很好, how about you?"

    # Segment with jieba and convert Chinese words to tone3 pinyin;
    # non-Chinese words are kept as-is.
    text1 = list(jieba.cut(text1))
    text2 = list(jieba.cut(text2))
    tokens1 = lazy_pinyin(text1, style=Style.TONE3, tone_sandhi=True)
    tokens2 = lazy_pinyin(text2, style=Style.TONE3, tone_sandhi=True)
    print(tokens1)
    print(tokens2)

    ids = tokenizer.texts_to_token_ids([tokens1, tokens2])
    print(ids)


if __name__ == "__main__":
    test_tokenizer()