From 2cf5891c154edd5493ef66a56fc170099ed0ec4b Mon Sep 17 00:00:00 2001
From: yaozengwei
Date: Tue, 20 Feb 2024 17:45:56 +0800
Subject: [PATCH] use piper_phonemize as text tokenizer in ljspeech recipe

---
 egs/ljspeech/TTS/local/prepare_token_file.py  | 65 +++++--------------
 .../TTS/local/prepare_tokens_ljspeech.py      |  9 ++-
 egs/ljspeech/TTS/prepare.sh                   | 14 ++--
 egs/ljspeech/TTS/vits/tokenizer.py            | 42 ++++++++++--
 4 files changed, 66 insertions(+), 64 deletions(-)

diff --git a/egs/ljspeech/TTS/local/prepare_token_file.py b/egs/ljspeech/TTS/local/prepare_token_file.py
index df976804a..29e4a50c9 100755
--- a/egs/ljspeech/TTS/local/prepare_token_file.py
+++ b/egs/ljspeech/TTS/local/prepare_token_file.py
@@ -17,7 +17,7 @@
 
 """
-This file reads the texts in given manifest and generates the file that maps tokens to IDs.
+This file generates the file that maps tokens to IDs.
 """
 
 import argparse
@@ -25,80 +25,47 @@ import logging
 from pathlib import Path
 from typing import Dict
 
-from lhotse import load_manifest
+from piper_phonemize import get_espeak_map
 
 
 def get_args():
     parser = argparse.ArgumentParser()
 
-    parser.add_argument(
-        "--manifest-file",
-        type=Path,
-        default=Path("data/spectrogram/ljspeech_cuts_train.jsonl.gz"),
-        help="Path to the manifest file",
-    )
-
     parser.add_argument(
         "--tokens",
         type=Path,
        default=Path("data/tokens.txt"),
-        help="Path to the tokens",
+        help="Path to the dict that maps the text tokens to IDs",
     )
 
     return parser.parse_args()
 
 
-def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
-    """Write a symbol to ID mapping to a file.
-
-    Note:
-      No need to implement `read_mapping` as it can be done
-      through :func:`k2.SymbolTable.from_file`.
-
-    Args:
-      filename:
-        Filename to save the mapping.
-      sym2id:
-        A dict mapping symbols to IDs.
-    Returns:
-      Return None.
-    """
-    with open(filename, "w", encoding="utf-8") as f:
-        for sym, i in sym2id.items():
-            f.write(f"{sym} {i}\n")
-
-
-def get_token2id(manifest_file: Path) -> Dict[str, int]:
-    """Return a dict that maps token to IDs."""
+def get_token2id(filename: Path) -> Dict[str, int]:
+    """Get a dict that maps token to IDs, and save it to the given filename."""
     extra_tokens = [
         "<blk>",  # 0 for blank
-        "<sos/eos>",  # 1 for sos and eos symbols.
-        "<unk>",  # 2 for OOV
+        "<sos>",  # 1 for sos
+        "<eos>",  # 2 for eos
+        "<unk>",  # 3 for OOV
     ]
-    all_tokens = set()
-    cut_set = load_manifest(manifest_file)
+    all_tokens = list(get_espeak_map().keys())
 
-    for cut in cut_set:
-        # Each cut only contain one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
-        for t in cut.tokens:
-            all_tokens.add(t)
+    for t in extra_tokens:
+        assert t not in all_tokens, t
 
-    all_tokens = extra_tokens + list(all_tokens)
+    all_tokens = extra_tokens + all_tokens
 
-    token2id: Dict[str, int] = {token: i for i, token in enumerate(all_tokens)}
-    return token2id
+    with open(filename, "w", encoding="utf-8") as f:
+        for i, token in enumerate(all_tokens):
+            f.write(f"{token} {i}\n")
 
 
 if __name__ == "__main__":
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
     logging.basicConfig(format=formatter, level=logging.INFO)
 
     args = get_args()
 
-    manifest_file = Path(args.manifest_file)
     out_file = Path(args.tokens)
-
-    token2id = get_token2id(manifest_file)
-    write_mapping(out_file, token2id)
+    get_token2id(out_file)
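
A note on the token file written by get_token2id() above: each line is a
"token id" pair. The espeak map used here appears to include the space
character as a token, so one line carries a bare space in its token field.
A minimal reader sketch, not part of the patch (the path and the
space-handling convention are assumptions based on the writer above):

    # Hypothetical loader for data/tokens.txt as written by get_token2id().
    token2id = {}
    with open("data/tokens.txt", encoding="utf-8") as f:
        for line in f:
            info = line.rstrip().split()
            if len(info) == 1:
                # Assumed: a lone ID means the token itself was the space
                # character (the line reads "  <id>").
                token, idx = " ", info[0]
            else:
                token, idx = info
            token2id[token] = int(idx)
    assert token2id["<blk>"] == 0  # blank must stay at ID 0
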
- "", # 2 for OOV + "", # 1 for sos + "", # 2 for eos + "", # 3 for OOV ] - all_tokens = set() - cut_set = load_manifest(manifest_file) + all_tokens = list(get_espeak_map().keys()) - for cut in cut_set: - # Each cut only contain one supervision - assert len(cut.supervisions) == 1, len(cut.supervisions) - for t in cut.tokens: - all_tokens.add(t) + for t in extra_tokens: + assert t not in all_tokens, t - all_tokens = extra_tokens + list(all_tokens) + all_tokens = extra_tokens + all_tokens - token2id: Dict[str, int] = {token: i for i, token in enumerate(all_tokens)} - return token2id + with open(filename, "w", encoding="utf-8") as f: + for i, token in enumerate(all_tokens): + f.write(f"{token} {i}\n") if __name__ == "__main__": formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - logging.basicConfig(format=formatter, level=logging.INFO) args = get_args() - manifest_file = Path(args.manifest_file) out_file = Path(args.tokens) - - token2id = get_token2id(manifest_file) - write_mapping(out_file, token2id) + get_token2id(out_file) diff --git a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py index fcd0137a0..56361cf9a 100755 --- a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py +++ b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py @@ -23,9 +23,9 @@ This file reads the texts in given manifest and save the new cuts with phoneme t import logging from pathlib import Path -import g2p_en import tacotron_cleaner.cleaners from lhotse import CutSet, load_manifest +from piper_phonemize import phonemize_espeak def prepare_tokens_ljspeech(): @@ -35,7 +35,6 @@ def prepare_tokens_ljspeech(): partition = "all" cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}") - g2p = g2p_en.G2p() new_cuts = [] for cut in cut_set: @@ -45,7 +44,11 @@ def prepare_tokens_ljspeech(): # Text normalization text = tacotron_cleaner.cleaners.custom_english_cleaners(text) # Convert to phonemes - cut.tokens = g2p(text) + tokens_list = phonemize_espeak(text, "en-us") + tokens = [] + for t in tokens_list: + tokens.extend(t) + cut.tokens = tokens new_cuts.append(cut) new_cut_set = CutSet.from_cuts(new_cuts) diff --git a/egs/ljspeech/TTS/prepare.sh b/egs/ljspeech/TTS/prepare.sh index ed0a07f5e..890bc841f 100755 --- a/egs/ljspeech/TTS/prepare.sh +++ b/egs/ljspeech/TTS/prepare.sh @@ -30,7 +30,7 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then cd vits/monotonic_align python setup.py build_ext --inplace cd ../../ - else + else log "monotonic_align lib already built" fi fi @@ -80,6 +80,10 @@ fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then log "Stage 3: Prepare phoneme tokens for LJSpeech" + # We assume you have installed piper_phonemize and espnet_tts_frontend. + # If not, please install them with: + # - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize + # - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/ if [ ! -e data/spectrogram/.ljspeech_with_token.done ]; then ./local/prepare_tokens_ljspeech.py mv data/spectrogram/ljspeech_cuts_with_tokens_all.jsonl.gz \ @@ -113,13 +117,11 @@ fi if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then log "Stage 5: Generate token file" - # We assume you have installed g2p_en and espnet_tts_frontend. + # We assume you have installed piper_phonemize and espnet_tts_frontend. 
diff --git a/egs/ljspeech/TTS/vits/tokenizer.py b/egs/ljspeech/TTS/vits/tokenizer.py
index b0afc6a04..64530fa33 100644
--- a/egs/ljspeech/TTS/vits/tokenizer.py
+++ b/egs/ljspeech/TTS/vits/tokenizer.py
@@ -16,8 +16,8 @@
 
 from typing import Dict, List
 
-import g2p_en
 import tacotron_cleaner.cleaners
+from piper_phonemize import phonemize_espeak
 
 from utils import intersperse
 
@@ -41,18 +41,28 @@ class Tokenizer(object):
                 self.token2id[token] = id
 
         self.blank_id = self.token2id["<blk>"]
+        self.sos_id = self.token2id["<sos>"]
+        self.eos_id = self.token2id["<eos>"]
         self.oov_id = self.token2id["<unk>"]
         self.vocab_size = len(self.token2id)
 
-        self.g2p = g2p_en.G2p()
-
-    def texts_to_token_ids(self, texts: List[str], intersperse_blank: bool = True):
+    def texts_to_token_ids(
+        self,
+        texts: List[str],
+        intersperse_blank: bool = True,
+        add_sos: bool = False,
+        add_eos: bool = False,
+    ):
         """
         Args:
           texts:
             A list of transcripts.
           intersperse_blank:
             Whether to intersperse blanks in the token sequence.
+          add_sos:
+            Whether to add sos token at the start.
+          add_eos:
+            Whether to add eos token at the end.
 
         Returns:
           Return a list of token id list [utterance][token_id]
@@ -63,7 +73,11 @@ class Tokenizer(object):
             # Text normalization
             text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
             # Convert to phonemes
-            tokens = self.g2p(text)
+            tokens_list = phonemize_espeak(text, "en-us")
+            tokens = []
+            for t in tokens_list:
+                tokens.extend(t)
+
             token_ids = []
             for t in tokens:
                 if t in self.token2id:
@@ -73,13 +87,21 @@ class Tokenizer(object):
 
             if intersperse_blank:
                 token_ids = intersperse(token_ids, self.blank_id)
+            if add_sos:
+                token_ids = [self.sos_id] + token_ids
+            if add_eos:
+                token_ids = token_ids + [self.eos_id]
 
             token_ids_list.append(token_ids)
 
         return token_ids_list
 
     def tokens_to_token_ids(
-        self, tokens_list: List[str], intersperse_blank: bool = True
+        self,
+        tokens_list: List[str],
+        intersperse_blank: bool = True,
+        add_sos: bool = False,
+        add_eos: bool = False,
     ):
         """
         Args:
@@ -87,6 +109,10 @@ class Tokenizer(object):
             A list of token list, each corresponding to one utterance.
           intersperse_blank:
             Whether to intersperse blanks in the token sequence.
+          add_sos:
+            Whether to add sos token at the start.
+          add_eos:
+            Whether to add eos token at the end.
 
         Returns:
           Return a list of token id list [utterance][token_id]
@@ -103,6 +129,10 @@ class Tokenizer(object):
 
             if intersperse_blank:
                 token_ids = intersperse(token_ids, self.blank_id)
+            if add_sos:
+                token_ids = [self.sos_id] + token_ids
+            if add_eos:
+                token_ids = token_ids + [self.eos_id]
 
             token_ids_list.append(token_ids)
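
Taken together, a typical call into the updated Tokenizer would look like
the sketch below. This assumes the constructor takes the path of the token
file generated in stage 5 of prepare.sh; it is a usage illustration, not
part of the patch:

    tokenizer = Tokenizer("data/tokens.txt")
    token_ids = tokenizer.texts_to_token_ids(
        ["Hello world."],
        intersperse_blank=True,
        add_sos=True,
        add_eos=True,
    )[0]
    # Blanks are interspersed first, then sos/eos are attached, so the
    # sequence starts with tokenizer.sos_id and ends with tokenizer.eos_id.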