Mirror of https://github.com/k2-fsa/icefall.git

Commit a82d826987 (parent: 8b8cf6c68e)

fix style
@@ -1,68 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2021 (Author: Pingfeng Luo)
-
-"""
-make syllables lexicon and handle heteronym
-"""
-
-import argparse
-from pathlib import Path
-
-from pypinyin import pinyin, lazy_pinyin, Style
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--lexicon", type=str, help="The input lexicon file.")
-    return parser.parse_args()
-
-
-def process_line(
-    line: str
-) -> None:
-    """
-    Args:
-      line:
-        A line of transcript consisting of space(s) separated word and phones
-        input :
-          你好 n i3 h ao3
-          晴天 q ing2 t ian1
-        output :
-          你好 ni3 hao3
-          晴天 qing2 tian1
-    Returns:
-      Return None.
-    """
-    chars = line.strip().split()[0]
-    pinyins = pinyin(chars, style=Style.TONE3, heteronym=True)
-    word_syllables = []
-    word_syllables_num = 1
-    inited = False
-    for char_syllables in pinyins:
-        new_char_syllables_num = len(char_syllables)
-        if not inited and len(char_syllables):
-            word_syllables = [char_syllables[0]]
-            inited = True
-        elif new_char_syllables_num == 1:
-            for i in range(word_syllables_num):
-                word_syllables[i] += " " + str(char_syllables)
-        elif new_char_syllables_num > 1:
-            word_syllables = word_syllables * new_char_syllables_num
-            for pre_index in range(word_syllables_num):
-                for expand_index in range(new_char_syllables_num):
-                    word_syllables[pre_index * new_char_syllables_num + expand_index] += " " + char_syllables[expand_index]
-            word_syllables_num *= new_char_syllables_num
-
-    for word_syallable in word_syllables:
-        print("{} {}".format(chars.strip(), str(word_syallable).strip()))
-
-
-def main():
-    args = get_args()
-    assert Path(args.lexicon).is_file()
-
-    with open(args.lexicon) as f:
-        for line in f:
-            process_line(line=line)
-
-
-if __name__ == "__main__":
-    main()
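For reference, the expansion logic in the deleted script amounts to a cartesian product over each character's candidate syllables. A minimal stand-alone sketch of the same idea (assuming pypinyin is installed; the words below are illustrative, not taken from the original lexicon):

#!/usr/bin/env python3
# Sketch: heteronym expansion as a cartesian product over per-character
# pinyin candidates; equivalent in spirit to process_line() above, but
# without the manual index arithmetic.
from itertools import product

from pypinyin import pinyin, Style

for word in ["你好", "重庆"]:  # illustrative; "重" is a heteronym
    # heteronym=True yields one candidate list per character, e.g.
    # [["zhong4", "chong2", ...], ["qing4"]] for "重庆".
    candidates = pinyin(word, style=Style.TONE3, heteronym=True)
    for syllables in product(*candidates):
        # One output line per distinct pronunciation, matching the
        # "word syl1 syl2 ..." lexicon format the deleted script printed.
        print(word, " ".join(syllables))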
@@ -317,7 +317,8 @@ def lexicon_to_fst(
 
 
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--lang-dir", type=str, help="The lang dir, data/lang_phone or data/lang_syllable")
+    parser.add_argument("--lang-dir", type=str,
+                        help="The lang dir, data/lang_phone")
     return parser.parse_args()
 
@@ -340,17 +340,17 @@ class AishellAsrDataModule:
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
         cuts_train = load_manifest(self.args.manifest_dir /
-                "cuts_train.json.gz")
+                                   "cuts_train.json.gz")
         return cuts_train
 
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir /
-                "cuts_dev.json.gz")
+        return load_manifest(self.args.manifest_dir /
+                             "cuts_dev.json.gz")
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir /
-                "cuts_test.json.gz")
+        return load_manifest(self.args.manifest_dir /
+                             "cuts_test.json.gz")
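As an aside on the pattern above: the @lru_cache() decorator memoizes each accessor, so a manifest is read and parsed at most once per process and later calls return the cached CutSet. A minimal stand-alone sketch of that behavior using only the standard library (the loader and file name are stand-ins, not lhotse API):

# Sketch: functools.lru_cache memoizes an expensive loader, the same
# pattern used by valid_cuts() / test_cuts() above.
from functools import lru_cache


@lru_cache()
def load_cuts(name: str) -> list:
    # Stand-in for an expensive manifest parse such as
    # load_manifest(manifest_dir / "cuts_dev.json.gz").
    print(f"parsing {name} ...")
    return [name]


load_cuts("cuts_dev.json.gz")  # parses the manifest
load_cuts("cuts_dev.json.gz")  # served from the cache; no second parse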