diff --git a/egs/aishell/ASR/local/make_syllable_lexicon.py b/egs/aishell/ASR/local/make_syllable_lexicon.py
deleted file mode 100755
index 33e1f8b2b..000000000
--- a/egs/aishell/ASR/local/make_syllable_lexicon.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2021 (Author: Pingfeng Luo)
-"""
-  make syllables lexicon and handle heteronym
-"""
-import argparse
-from pathlib import Path
-from pypinyin import pinyin, lazy_pinyin, Style
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--lexicon", type=str, help="The input lexicon file.")
-    return parser.parse_args()
-
-
-def process_line(
-    line: str
-) -> None:
-    """
-    Args:
-      line:
-        A line of transcript consisting of space(s) separated word and phones
-        input :
-          你好 n i3 h ao3
-          晴天 q ing2 t ian1
-
-        output :
-          你好 ni3 hao3
-          晴天 qing2 tian1
-    Returns:
-      Return None.
-    """
-    chars = line.strip().split()[0]
-    pinyins = pinyin(chars, style=Style.TONE3, heteronym=True)
-    word_syllables = []
-    word_syllables_num = 1
-    inited = False
-    for char_syllables in pinyins:
-        new_char_syllables_num = len(char_syllables)
-        if not inited and len(char_syllables):
-            word_syllables = [char_syllables[0]]
-            inited = True
-        elif new_char_syllables_num == 1:
-            for i in range(word_syllables_num):
-                word_syllables[i] += " " + str(char_syllables)
-        elif new_char_syllables_num > 1:
-            word_syllables = word_syllables * new_char_syllables_num
-            for pre_index in range(word_syllables_num):
-                for expand_index in range(new_char_syllables_num):
-                    word_syllables[pre_index * new_char_syllables_num + expand_index] += " " + char_syllables[expand_index]
-            word_syllables_num *= new_char_syllables_num
-
-    for word_syallable in word_syllables:
-        print("{} {}".format(chars.strip(), str(word_syallable).strip()))
-
-
-def main():
-    args = get_args()
-    assert Path(args.lexicon).is_file()
-
-    with open(args.lexicon) as f:
-        for line in f:
-            process_line(line=line)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/egs/aishell/ASR/local/prepare_lang.py b/egs/aishell/ASR/local/prepare_lang.py
index 495f62cb4..8c208e10d 100755
--- a/egs/aishell/ASR/local/prepare_lang.py
+++ b/egs/aishell/ASR/local/prepare_lang.py
@@ -317,7 +317,8 @@ def lexicon_to_fst(
 
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--lang-dir", type=str, help="The lang dir, data/lang_phone or data/lang_syllable")
+    parser.add_argument("--lang-dir", type=str,
+                        help="The lang dir, data/lang_phone")
     return parser.parse_args()
 
 
diff --git a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
index b3f88a597..3a10f39b6 100644
--- a/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
+++ b/egs/aishell/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -340,17 +340,17 @@ class AishellAsrDataModule:
     def train_cuts(self) -> CutSet:
         logging.info("About to get train cuts")
         cuts_train = load_manifest(self.args.manifest_dir /
-                                    "cuts_train.json.gz")
+                                   "cuts_train.json.gz")
         return cuts_train
 
     @lru_cache()
     def valid_cuts(self) -> CutSet:
         logging.info("About to get dev cuts")
-        return load_manifest(self.args.manifest_dir /
-                              "cuts_dev.json.gz")
+        return load_manifest(self.args.manifest_dir /
+                             "cuts_dev.json.gz")
 
     @lru_cache()
     def test_cuts(self) -> List[CutSet]:
         logging.info("About to get test cuts")
-        return load_manifest(self.args.manifest_dir /
-                              "cuts_test.json.gz")
+        return load_manifest(self.args.manifest_dir /
+                             "cuts_test.json.gz")
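Note on the deleted make_syllable_lexicon.py above: it expands every heteronym reading of a word into its own lexicon line via a hand-rolled Cartesian product (the nested pre_index/expand_index loops). The single-candidate branch also carries a latent bug: str(char_syllables) appends the list repr (e.g. "['tian1']") instead of the syllable itself. Below is a minimal sketch of the same expansion, assuming pypinyin is installed; the helper name expand_heteronyms is hypothetical and not part of the recipe.

#!/usr/bin/env python3
# Sketch only: reproduces the heteronym expansion that the deleted
# make_syllable_lexicon.py computed by hand. expand_heteronyms is a
# hypothetical name, not an icefall function.
import itertools

from pypinyin import pinyin, Style


def expand_heteronyms(word: str) -> None:
    # pinyin(..., heteronym=True) returns one candidate list per character,
    # e.g. for "中心" with Style.TONE3: [["zhong1", "zhong4"], ["xin1"]].
    candidates = pinyin(word, style=Style.TONE3, heteronym=True)
    # itertools.product enumerates every syllable combination, which is
    # what the nested pre_index/expand_index loops above did manually,
    # and it sidesteps the str(char_syllables) list-repr bug.
    for combo in itertools.product(*candidates):
        print("{} {}".format(word, " ".join(combo)))


if __name__ == "__main__":
    expand_heteronyms("中心")  # e.g. prints 中心 zhong1 xin1 and 中心 zhong4 xin1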