diff --git a/egs/ljspeech/TTS/local/prepare_token_file.py b/egs/ljspeech/TTS/local/prepare_token_file.py
index dd76c1565..5b048b600 100755
--- a/egs/ljspeech/TTS/local/prepare_token_file.py
+++ b/egs/ljspeech/TTS/local/prepare_token_file.py
@@ -43,10 +43,14 @@ def get_args():
 
 def get_token2id(filename: Path) -> Dict[str, int]:
     """Get a dict that maps token to IDs, and save it to the given filename."""
-    all_tokens = get_espeak_map()
+    all_tokens = get_espeak_map()  # token: [token_id]
+    all_tokens = {token: token_id[0] for token, token_id in all_tokens.items()}
+    # sort by token_id
+    all_tokens = sorted(all_tokens.items(), key=lambda x: x[1])
+
     with open(filename, "w", encoding="utf-8") as f:
-        for token, token_id in all_tokens.items():
-            f.write(f"{token} {token_id[0]}\n")
+        for token, token_id in all_tokens:
+            f.write(f"{token} {token_id}\n")
 
 
 if __name__ == "__main__":
diff --git a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
index 56361cf9a..08fe7430e 100755
--- a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
+++ b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
@@ -39,7 +39,7 @@ def prepare_tokens_ljspeech():
     new_cuts = []
     for cut in cut_set:
         # Each cut only contains one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
+        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
         text = cut.supervisions[0].normalized_text
         # Text normalization
         text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
diff --git a/egs/ljspeech/TTS/vits/tokenizer.py b/egs/ljspeech/TTS/vits/tokenizer.py
index e005fc184..9a5a9090e 100644
--- a/egs/ljspeech/TTS/vits/tokenizer.py
+++ b/egs/ljspeech/TTS/vits/tokenizer.py
@@ -1,4 +1,4 @@
-# Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao)
+# Copyright 2023-2024 Xiaomi Corp. (authors: Zengwei Yao)
 #
 # See ../../LICENSE for clarification regarding multiple authors
 #
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 from typing import Dict, List
 
 import tacotron_cleaner.cleaners
@@ -55,7 +56,8 @@ class Tokenizer(object):
         intersperse_blank: bool = True,
         add_sos: bool = False,
         add_eos: bool = False,
-    ):
+        lang: str = "en-us",
+    ) -> List[List[int]]:
         """
         Args:
           texts:
@@ -66,6 +68,8 @@
             Whether to add sos token at the start.
           add_eos:
             Whether to add eos token at the end.
+          lang:
+            Language argument passed to phonemize_espeak().
 
         Returns:
           Return a list of token id list [utterance][token_id]
@@ -76,14 +80,16 @@
             # Text normalization
             text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
             # Convert to phonemes
-            tokens_list = phonemize_espeak(text, "en-us")
+            tokens_list = phonemize_espeak(text, lang)
             tokens = []
             for t in tokens_list:
                 tokens.extend(t)
 
             token_ids = []
             for t in tokens:
-                assert t in self.token2id, t
+                if t not in self.token2id:
+                    logging.warning(f"Skip OOV {t}")
+                    continue
                 token_ids.append(self.token2id[t])
 
             if intersperse_blank:
@@ -103,7 +109,7 @@
         intersperse_blank: bool = True,
         add_sos: bool = False,
         add_eos: bool = False,
-    ):
+    ) -> List[List[int]]:
         """
         Args:
           tokens_list:
@@ -123,7 +129,9 @@
         for tokens in tokens_list:
             token_ids = []
             for t in tokens:
-                assert t in self.token2id, t
+                if t not in self.token2id:
+                    logging.warning(f"Skip OOV {t}")
+                    continue
                 token_ids.append(self.token2id[t])
 
             if intersperse_blank:
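
Note on the prepare_token_file.py change: get_espeak_map() maps each token to a single-element id list, and the patched get_token2id() now unwraps that list and writes the pairs sorted by token id. Below is a minimal standalone sketch of that write logic, using a toy in-memory map instead of the real get_espeak_map() from piper_phonemize; the helper name write_token2id is hypothetical and only illustrates the output format.

from pathlib import Path
from typing import Dict, List


def write_token2id(filename: Path, all_tokens: Dict[str, List[int]]) -> None:
    """Mirror of the patched get_token2id(): take the first id of each
    token, sort by id, and write one "token id" pair per line."""
    token2id = {token: ids[0] for token, ids in all_tokens.items()}
    with open(filename, "w", encoding="utf-8") as f:
        for token, token_id in sorted(token2id.items(), key=lambda x: x[1]):
            f.write(f"{token} {token_id}\n")


# Toy stand-in for get_espeak_map(); real ids come from piper_phonemize.
write_token2id(Path("tokens.txt"), {"a": [3], "_": [0], "b": [2]})
# tokens.txt now lists "_ 0", "b 2", "a 3" in ascending id order.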