mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
69 lines
1.9 KiB
Python
Executable File
69 lines
1.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
# Copyright 2021 (Author: Pingfeng Luo)
|
|
"""
|
|
Make a syllable lexicon and handle heteronyms.
|
|
"""
|
|
import argparse
|
|
from pathlib import Path
|
|
from pypinyin import pinyin, lazy_pinyin, Style
|
|
|
|
def get_args():
    """Parse the command-line options of this script.

    Returns:
      The ``argparse.Namespace`` produced by the parser; its ``lexicon``
      attribute holds the path of the input lexicon file as a string.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lexicon",
        type=str,
        # Without a lexicon there is nothing to process. Let argparse report
        # the omission with a usage message instead of returning None.
        required=True,
        help="The input lexicon file.",
    )
    return parser.parse_args()
|
|
|
|
|
|
def process_line(
    line: str
) -> None:
    """Print the syllable-level lexicon entries for one lexicon line.

    Only the first whitespace-separated field of the line (the word) is
    used; the syllables are regenerated from it with ``pypinyin`` in
    ``Style.TONE3``. When a position of the word has several candidate
    readings (heteronyms), one entry is printed for every combination of
    readings, in the order produced by ``itertools.product``.

    Args:
      line:
        A line of transcript consisting of space(s) separated word and phones
        input :
          你好 n i3 h ao3
          晴天 q ing2 t ian1

        output :
          你好 ni3 hao3
          晴天 qing2 tian1
    Returns:
      Return None. The entries are written to standard output, one per
      line, formatted as ``<word> <syllable> <syllable> ...``. A blank
      input line produces no output.
    """
    from itertools import product

    fields = line.split()
    if not fields:
        # A blank line carries no word, so there is nothing to convert.
        return
    chars = fields[0]

    # pinyin() returns a list of lists: the candidate readings for each
    # successive position of the word.
    pinyins = pinyin(chars, style=Style.TONE3, heteronym=True)

    # Positions without any reading contribute nothing; dropping them also
    # keeps product() from collapsing to an empty result.
    candidates = [readings for readings in pinyins if readings]
    if not candidates:
        return

    # The Cartesian product enumerates every pronunciation of the word,
    # including the alternative readings of the first position.
    for combination in product(*candidates):
        print("{} {}".format(chars, " ".join(combination)))
|
|
|
|
|
|
def main():
    """Convert every line of the ``--lexicon`` file to syllable entries.

    Parses the command line with ``get_args()``, then feeds the lexicon to
    ``process_line()`` one line at a time.

    Raises:
      FileNotFoundError:
        If the ``--lexicon`` path does not point to an existing file.
    """
    args = get_args()
    lexicon = Path(args.lexicon)

    # Validate with an explicit raise: an ``assert`` is stripped when Python
    # runs with -O, which would let a bad path through to open().
    if not lexicon.is_file():
        raise FileNotFoundError(
            "Lexicon file not found: {}".format(lexicon)
        )

    # The lexicon contains Chinese words, so decode it as UTF-8 instead of
    # depending on the locale's default encoding.
    with lexicon.open(encoding="utf-8") as f:
        for line in f:
            process_line(line=line)
|
|
|
|
|
|
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|