icefall/egs/wenetspeech/ASR/local/make_syllable_lexicon.py
PingFeng Luo f7b44da375 sytle
2022-01-04 19:02:22 +08:00

73 lines
2.0 KiB
Python
Executable File

#!/usr/bin/env python3
# Copyright 2022 (Author: Pingfeng Luo)
"""
Make a syllable lexicon and handle heteronyms (characters with more than one pronunciation).
"""
import argparse
import itertools
from pathlib import Path

from pypinyin import pinyin, Style
def get_args():
    """Parse the command-line options of this script.

    Returns:
      The namespace produced by argparse; its ``lexicon`` attribute holds
      the value given to ``--lexicon`` (``None`` when the option is omitted).
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--lexicon",
        type=str,
        help="The input lexicon file.",
    )
    parsed = arg_parser.parse_args()
    return parsed
def _expand_pronunciations(candidates: list) -> list:
    """Return every combination of per-character syllables, space-joined.

    Args:
      candidates:
        One list of candidate syllables per character. Characters whose
        candidate list is empty are skipped.

    Returns:
      A list of strings, one per combination, ordered so that the
      candidates of the last character vary fastest. The list is empty
      when no character has any candidate.
    """
    non_empty = [char_syllables for char_syllables in candidates if char_syllables]
    if not non_empty:
        return []
    return [" ".join(combo) for combo in itertools.product(*non_empty)]


def process_line(line: str) -> None:
    """Print the syllable-level pronunciation(s) of the word on a lexicon line.

    Only the first whitespace-separated field of ``line`` (the word) is used.
    The phone columns after it are ignored; the syllables are regenerated by
    calling ``pinyin`` with ``style=Style.TONE3`` and ``heteronym=True``.
    When a character has several candidate syllables, one output line is
    printed for every combination of candidates across the word.

    Args:
      line:
        A line of the lexicon consisting of space(s) separated word and
        phones, e.g.
            你好 n i3 h ao3
            晴天 q ing2 t ian1
        for which the expected output has the form
            你好 ni3 hao3
            晴天 qing2 tian1
        A blank line is ignored.

    Returns:
      Return None. The result is written to standard output.
    """
    fields = line.split()
    if not fields:
        # Blank line: there is no word to convert.
        return
    chars = fields[0]
    pinyins = pinyin(chars, style=Style.TONE3, heteronym=True)
    # A Cartesian product over the per-character candidates replaces the
    # former manual index bookkeeping, which lost or duplicated combinations.
    for word_syllable in _expand_pronunciations(pinyins):
        print("{} {}".format(chars.strip(), word_syllable.strip()))
def main():
    """Read the lexicon given by ``--lexicon`` and convert it line by line.

    Every line of the file is passed to ``process_line``, which prints the
    converted entries to standard output.

    Raises:
      SystemExit: if ``--lexicon`` was not given.
      FileNotFoundError: if the given path is not an existing regular file.
    """
    args = get_args()
    if not args.lexicon:
        raise SystemExit("--lexicon is required")
    lexicon_path = Path(args.lexicon)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not lexicon_path.is_file():
        raise FileNotFoundError(
            "Lexicon is not an existing file: {}".format(lexicon_path)
        )
    # The lexicon contains Chinese text; do not depend on the locale encoding.
    with lexicon_path.open(encoding="utf-8") as f:
        for line in f:
            process_line(line=line)
# Run the conversion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()