mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-27 18:54:18 +00:00
add make_syllable_lexicon and handle heteronym
This commit is contained in:
parent
8e1de6a377
commit
73271d6ba4
71
egs/wenetspeech/ASR/local/make_syllable_lexicon.py
Executable file
71
egs/wenetspeech/ASR/local/make_syllable_lexicon.py
Executable file
@ -0,0 +1,71 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright 2022 (Author: Pingfeng Luo)
|
||||
"""
|
||||
Make a syllable lexicon and handle heteronyms (characters with several readings).
|
||||
"""
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from pypinyin import pinyin, Style
|
||||
|
||||
|
||||
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
      The parsed ``argparse.Namespace``; its ``lexicon`` attribute holds the
      path of the input lexicon file as a string.
    """
    parser = argparse.ArgumentParser()
    # Required: without a path there is nothing to convert, and argparse
    # then reports a clear usage error instead of handing back ``None``.
    parser.add_argument(
        "--lexicon",
        type=str,
        required=True,
        help="The input lexicon file.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def _expand_syllables(per_char_readings: list) -> list:
    """Combine per-character readings into whole-word syllable strings.

    Args:
      per_char_readings:
        One list of candidate syllables per position of the word, e.g.
        ``[["ni3"], ["hao3"]]``. An empty inner list contributes nothing
        and is skipped.
    Returns:
      A list with one space-joined syllable string per combination of
      readings, ordered with the last position varying fastest. The list
      is empty when no position has any reading.
    """
    combos = [[]]
    for readings in per_char_readings:
        if not readings:
            continue
        # Cartesian product: extend every prefix with every candidate.
        combos = [
            prefix + [str(reading)]
            for prefix in combos
            for reading in readings
        ]
    return [" ".join(combo) for combo in combos if combo]


def process_line(
    line: str
) -> None:
    """Print the syllable-level lexicon entries for one lexicon line.

    Only the first whitespace-separated field (the word) is used; the
    phones that follow it are ignored and the syllables are regenerated
    with ``pypinyin`` in ``Style.TONE3`` with ``heteronym=True``. One
    output line is printed for every combination of per-character
    readings. Blank lines are skipped.

    Args:
      line:
        A line of transcript consisting of space(s) separated word and phones
        input :
            你好 n i3 h ao3
            晴天 q ing2 t ian1

        output :
            你好 ni3 hao3
            晴天 qing2 tian1
    Returns:
      Return None.
    """
    fields = line.split()
    if not fields:
        # Blank or whitespace-only line: nothing to convert.
        return
    chars = fields[0]
    pinyins = pinyin(chars, style=Style.TONE3, heteronym=True)
    for word_syllable in _expand_syllables(pinyins):
        print("{} {}".format(chars, word_syllable))
|
||||
|
||||
|
||||
def main():
    """Read the lexicon given by ``--lexicon`` and convert it line by line.

    Every line of the file is passed to ``process_line``.

    Raises:
      FileNotFoundError: If the ``--lexicon`` path is not an existing file.
    """
    args = get_args()
    lexicon = Path(args.lexicon)
    # Validate explicitly: an ``assert`` would be stripped under ``python -O``.
    if not lexicon.is_file():
        raise FileNotFoundError(f"Lexicon file not found: {lexicon}")

    # The lexicon holds Chinese words, so do not depend on the locale's
    # default encoding when decoding it.
    with lexicon.open(encoding="utf-8") as f:
        for line in f:
            process_line(line=line)
|
||||
|
||||
|
||||
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
Loading…
x
Reference in New Issue
Block a user