icefall/egs/baker_zh/TTS/matcha/generate_lexicon.py
#!/usr/bin/env python3
"""
Generate lexicon.txt: one line per entry, mapping each Chinese character
and each phrase known to pypinyin to its pronunciation in TONE3 style
(tone number appended to the syllable, e.g. "hang2").
"""

from pypinyin import Style, lazy_pinyin, load_phrases_dict, phrases_dict, pinyin_dict

# Register custom readings: 行 is a heteronym (xing2/hang2), and in these
# phrases it must be read as "hang2".
load_phrases_dict(
    {
        "行长": [["hang2"], ["zhang3"]],
        "银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
    }
)
def main():
    filename = "lexicon.txt"

    word_dict = pinyin_dict.pinyin_dict
    phrases = phrases_dict.phrases_dict

    with open(filename, "w", encoding="utf-8") as f:
        # Single characters: the keys of pinyin_dict are Unicode code points.
        for key in word_dict:
            # Keep only characters from the CJK Unified Ideographs block.
            if not (0x4E00 <= key <= 0x9FFF):
                continue

            w = chr(key)
            pinyin = lazy_pinyin(w, style=Style.TONE3, tone_sandhi=True)[0]
            f.write(f"{w} {pinyin}\n")

        # Multi-character phrases: one space-separated syllable per character.
        for key in phrases:
            pinyin = " ".join(lazy_pinyin(key, style=Style.TONE3, tone_sandhi=True))
            f.write(f"{key} {pinyin}\n")


if __name__ == "__main__":
    main()
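
A quick way to confirm that the custom phrase entries take effect is a minimal check like the following (illustrative, not part of the script). The expected output assumes pypinyin applies a loaded phrase override when the exact phrase is queried, which is what the script relies on:

from pypinyin import Style, lazy_pinyin, load_phrases_dict

# Same override as in generate_lexicon.py.
load_phrases_dict({"行长": [["hang2"], ["zhang3"]]})

# 行 is a heteronym; without the override the first syllable may come out
# as "xing2", depending on pypinyin's built-in phrase data.
print(lazy_pinyin("行长", style=Style.TONE3, tone_sandhi=True))
# Expected: ['hang2', 'zhang3']

With the entry in place, the corresponding lexicon.txt line is "行长 hang2 zhang3": the word, then its space-separated TONE3 syllables.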