mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-27 10:44:19 +00:00
add make_syllable_lexicon and handle heteronym
This commit is contained in:
parent
8e1de6a377
commit
73271d6ba4
71
egs/wenetspeech/ASR/local/make_syllable_lexicon.py
Executable file
71
egs/wenetspeech/ASR/local/make_syllable_lexicon.py
Executable file
@ -0,0 +1,71 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# Copyright 2022 (Author: Pingfeng Luo)
|
||||||
|
"""
|
||||||
|
Make a syllable lexicon and handle heteronyms (characters with several readings).
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from pypinyin import pinyin, Style
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      The ``argparse.Namespace`` produced by ``parse_args()``; its ``lexicon``
      attribute holds the value given to ``--lexicon`` as a string.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lexicon",
        type=str,
        # Without the lexicon there is nothing to do; let argparse report the
        # missing option instead of handing ``None`` to the caller.
        required=True,
        help="The input lexicon file.",
    )
    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def process_line(line: str) -> None:
    """Print the syllable-level lexicon entries for one lexicon line.

    Only the first whitespace-separated field of ``line`` (the word) is used;
    the phones that follow it are ignored. The syllables are obtained from
    ``pinyin`` with ``Style.TONE3`` and ``heteronym=True``, and one output
    line is printed for every combination of the per-character readings.

    Args:
      line:
        A line of the lexicon consisting of space(s) separated word and
        phones.
        input :
            你好 n i3 h ao3
            晴天 q ing2 t ian1

        output :
            你好 ni3 hao3
            晴天 qing2 tian1
    Returns:
      Return None. The entries are written to stdout; a blank line produces
      no output.
    """
    fields = line.split()
    if not fields:
        # Blank line: there is no word to convert.
        return
    chars = fields[0]
    pinyins = pinyin(chars, style=Style.TONE3, heteronym=True)

    # ``pinyins`` holds one list of candidate readings per character.
    # Expand them into every combination (cartesian product), building new
    # lists each round so the lists returned by ``pinyin`` are not mutated.
    word_syllables = []
    for char_syllables in pinyins:
        if not char_syllables:
            # No reading for this character: it contributes nothing.
            continue
        if not word_syllables:
            word_syllables = [str(syllable) for syllable in char_syllables]
        else:
            word_syllables = [
                prefix + " " + str(syllable)
                for prefix in word_syllables
                for syllable in char_syllables
            ]

    for word_syllable in word_syllables:
        print("{} {}".format(chars.strip(), word_syllable.strip()))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Convert every line of the ``--lexicon`` file with ``process_line``.

    Raises:
      FileNotFoundError: If ``--lexicon`` was not given or does not name an
        existing regular file.
    """
    args = get_args()
    # Validate explicitly: an ``assert`` is removed when Python runs with -O.
    if args.lexicon is None or not Path(args.lexicon).is_file():
        raise FileNotFoundError(
            "Lexicon file not found: {}".format(args.lexicon)
        )

    # Decode as UTF-8 explicitly so reading the Chinese words does not depend
    # on the platform's locale encoding.
    with open(args.lexicon, encoding="utf-8") as f:
        for line in f:
            process_line(line=line)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
Loading…
x
Reference in New Issue
Block a user