mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-08 09:32:20 +00:00
43 lines
1.0 KiB
Python
Executable File
43 lines
1.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import jieba
|
|
from pypinyin import Style, lazy_pinyin, load_phrases_dict, phrases_dict, pinyin_dict
|
|
from tokenizer import Tokenizer
|
|
|
|
load_phrases_dict(
|
|
{
|
|
"行长": [["hang2"], ["zhang3"]],
|
|
"银行行长": [["yin2"], ["hang2"], ["hang2"], ["zhang3"]],
|
|
}
|
|
)
|
|
|
|
|
|
def main():
|
|
filename = "lexicon.txt"
|
|
tokens = "./data/tokens.txt"
|
|
tokenizer = Tokenizer(tokens)
|
|
|
|
word_dict = pinyin_dict.pinyin_dict
|
|
phrases = phrases_dict.phrases_dict
|
|
|
|
i = 0
|
|
with open(filename, "w", encoding="utf-8") as f:
|
|
for key in word_dict:
|
|
if not (0x4E00 <= key <= 0x9FFF):
|
|
continue
|
|
|
|
w = chr(key)
|
|
tokens = lazy_pinyin(w, style=Style.TONE3, tone_sandhi=True)[0]
|
|
|
|
f.write(f"{w} {tokens}\n")
|
|
|
|
for key in phrases:
|
|
tokens = lazy_pinyin(key, style=Style.TONE3, tone_sandhi=True)
|
|
tokens = " ".join(tokens)
|
|
|
|
f.write(f"{key} {tokens}\n")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|