use piper_phonemize as text tokenizer in ljspeech recipe

yaozengwei 2024-02-20 17:45:56 +08:00
parent ff6784d147
commit 2cf5891c15
4 changed files with 66 additions and 64 deletions

View File

@@ -17,7 +17,7 @@
 """
-This file reads the texts in given manifest and generates the file that maps tokens to IDs.
+This file generates the file that maps tokens to IDs.
 """
 import argparse
@@ -25,80 +25,47 @@ import logging
 from pathlib import Path
 from typing import Dict
-from lhotse import load_manifest
+from piper_phonemize import get_espeak_map
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--manifest-file",
-        type=Path,
-        default=Path("data/spectrogram/ljspeech_cuts_train.jsonl.gz"),
-        help="Path to the manifest file",
-    )
     parser.add_argument(
         "--tokens",
         type=Path,
         default=Path("data/tokens.txt"),
-        help="Path to the tokens",
+        help="Path to the dict that maps the text tokens to IDs",
     )
     return parser.parse_args()
-def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
-    """Write a symbol to ID mapping to a file.
-    Note:
-      No need to implement `read_mapping` as it can be done
-      through :func:`k2.SymbolTable.from_file`.
-    Args:
-      filename:
-        Filename to save the mapping.
-      sym2id:
-        A dict mapping symbols to IDs.
-    Returns:
-      Return None.
-    """
-    with open(filename, "w", encoding="utf-8") as f:
-        for sym, i in sym2id.items():
-            f.write(f"{sym} {i}\n")
-def get_token2id(manifest_file: Path) -> Dict[str, int]:
-    """Return a dict that maps token to IDs."""
+def get_token2id(filename: Path) -> Dict[str, int]:
+    """Get a dict that maps token to IDs, and save it to the given filename."""
     extra_tokens = [
         "<blk>",  # 0 for blank
-        "<sos/eos>",  # 1 for sos and eos symbols.
-        "<unk>",  # 2 for OOV
+        "<sos>",  # 1 for sos
+        "<eos>",  # 2 for eos
+        "<unk>",  # 3 for OOV
     ]
-    all_tokens = set()
-    cut_set = load_manifest(manifest_file)
-    for cut in cut_set:
-        # Each cut only contain one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
-        for t in cut.tokens:
-            all_tokens.add(t)
-    all_tokens = extra_tokens + list(all_tokens)
-    token2id: Dict[str, int] = {token: i for i, token in enumerate(all_tokens)}
-    return token2id
+    all_tokens = list(get_espeak_map().keys())
+    for t in extra_tokens:
+        assert t not in all_tokens, t
+    all_tokens = extra_tokens + all_tokens
+    with open(filename, "w", encoding="utf-8") as f:
+        for i, token in enumerate(all_tokens):
+            f.write(f"{token} {i}\n")
 if __name__ == "__main__":
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(format=formatter, level=logging.INFO)
     args = get_args()
-    manifest_file = Path(args.manifest_file)
     out_file = Path(args.tokens)
-    token2id = get_token2id(manifest_file)
-    write_mapping(out_file, token2id)
+    get_token2id(out_file)
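With this change the token inventory comes from espeak's phoneme set (via get_espeak_map) plus four special symbols at IDs 0 to 3, instead of being collected from the training manifest. As a minimal sketch of the file format the script now writes, data/tokens.txt can be read back like this; the helper name read_tokens is illustrative and not part of the recipe:

# Illustrative sketch only: read data/tokens.txt (one "<token> <id>" pair per
# line, IDs 0-3 reserved for <blk>, <sos>, <eos>, <unk>) back into a dict.
from typing import Dict


def read_tokens(filename: str = "data/tokens.txt") -> Dict[str, int]:
    token2id: Dict[str, int] = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            # Split from the right, so a token that is itself a space would still parse.
            token, idx = line.rstrip("\n").rsplit(" ", 1)
            token2id[token] = int(idx)
    return token2id


if __name__ == "__main__":
    token2id = read_tokens()
    assert token2id["<blk>"] == 0 and token2id["<unk>"] == 3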

View File

@@ -23,9 +23,9 @@ This file reads the texts in given manifest and save the new cuts with phoneme t
 import logging
 from pathlib import Path
-import g2p_en
 import tacotron_cleaner.cleaners
 from lhotse import CutSet, load_manifest
+from piper_phonemize import phonemize_espeak
 def prepare_tokens_ljspeech():
@@ -35,7 +35,6 @@ def prepare_tokens_ljspeech():
     partition = "all"
     cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
-    g2p = g2p_en.G2p()
     new_cuts = []
     for cut in cut_set:
@@ -45,7 +44,11 @@ def prepare_tokens_ljspeech():
         # Text normalization
         text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
         # Convert to phonemes
-        cut.tokens = g2p(text)
+        tokens_list = phonemize_espeak(text, "en-us")
+        tokens = []
+        for t in tokens_list:
+            tokens.extend(t)
+        cut.tokens = tokens
         new_cuts.append(cut)
     new_cut_set = CutSet.from_cuts(new_cuts)
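For context on the loop added above: phonemize_espeak returns a list of phoneme lists, roughly one sub-list per sentence-like chunk, so the script flattens it before storing the tokens on the cut. A quick standalone check of that behaviour (the exact symbols depend on the installed espeak-ng data):

# Standalone check of the flattening done above; the phonemes printed depend on
# the local espeak-ng data, so treat the output as an example only.
from piper_phonemize import phonemize_espeak

tokens_list = phonemize_espeak("Hello world.", "en-us")
tokens = []
for t in tokens_list:  # one sub-list per sentence-like chunk
    tokens.extend(t)
print(tokens)  # e.g. a flat list of IPA symbols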

View File

@@ -30,7 +30,7 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
     cd vits/monotonic_align
     python setup.py build_ext --inplace
     cd ../../
   else
     log "monotonic_align lib already built"
   fi
 fi
@@ -80,6 +80,10 @@ fi
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Prepare phoneme tokens for LJSpeech"
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
+  # If not, please install them with:
+  # - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
+  # - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/spectrogram/.ljspeech_with_token.done ]; then
     ./local/prepare_tokens_ljspeech.py
     mv data/spectrogram/ljspeech_cuts_with_tokens_all.jsonl.gz \
@@ -113,13 +117,11 @@ fi
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Generate token file"
-  # We assume you have installed g2p_en and espnet_tts_frontend.
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  # - g2p_en: `pip install g2p_en`, refer to https://github.com/Kyubyong/g2p
+  # - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
   # - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/tokens.txt ]; then
-    ./local/prepare_token_file.py \
-      --manifest-file data/spectrogram/ljspeech_cuts_train.jsonl.gz \
-      --tokens data/tokens.txt
+    ./local/prepare_token_file.py --tokens data/tokens.txt
   fi
 fi
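Since stages 3 and 5 import the new dependencies, they will fail with an ImportError if those are missing. An optional pre-flight check, not part of prepare.sh, that only imports the names the scripts above actually use (tacotron_cleaner ships with espnet_tts_frontend):

# Optional pre-flight check: verify the imports used by
# local/prepare_tokens_ljspeech.py and local/prepare_token_file.py are available.
try:
    from piper_phonemize import get_espeak_map, phonemize_espeak  # noqa: F401
    import tacotron_cleaner.cleaners  # shipped with espnet_tts_frontend  # noqa: F401
except ImportError as e:
    raise SystemExit(
        f"Missing dependency: {e.name}. "
        "See https://github.com/rhasspy/piper-phonemize "
        "or run `pip install espnet_tts_frontend`."
    )
print("piper_phonemize and espnet_tts_frontend are importable.")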

View File

@@ -16,8 +16,8 @@
 from typing import Dict, List
-import g2p_en
 import tacotron_cleaner.cleaners
+from piper_phonemize import phonemize_espeak
 from utils import intersperse
@@ -41,18 +41,28 @@ class Tokenizer(object):
             self.token2id[token] = id
         self.blank_id = self.token2id["<blk>"]
+        self.sos_id = self.token2id["<sos>"]
+        self.eos_id = self.token2id["<eos>"]
         self.oov_id = self.token2id["<unk>"]
         self.vocab_size = len(self.token2id)
-        self.g2p = g2p_en.G2p()
-    def texts_to_token_ids(self, texts: List[str], intersperse_blank: bool = True):
+    def texts_to_token_ids(
+        self,
+        texts: List[str],
+        intersperse_blank: bool = True,
+        add_sos: bool = False,
+        add_eos: bool = False,
+    ):
         """
         Args:
           texts:
            A list of transcripts.
          intersperse_blank:
            Whether to intersperse blanks in the token sequence.
+          add_sos:
+            Whether to add sos token at the start.
+          add_eos:
+            Whether to add eos token at the end.
         Returns:
           Return a list of token id list [utterance][token_id]
@@ -63,7 +73,11 @@ class Tokenizer(object):
             # Text normalization
             text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
             # Convert to phonemes
-            tokens = self.g2p(text)
+            tokens_list = phonemize_espeak(text, "en-us")
+            tokens = []
+            for t in tokens_list:
+                tokens.extend(t)
             token_ids = []
             for t in tokens:
                 if t in self.token2id:
@@ -73,13 +87,21 @@ class Tokenizer(object):
             if intersperse_blank:
                 token_ids = intersperse(token_ids, self.blank_id)
+            if add_sos:
+                token_ids = [self.sos_id] + token_ids
+            if add_eos:
+                token_ids = token_ids + [self.eos_id]
             token_ids_list.append(token_ids)
         return token_ids_list
     def tokens_to_token_ids(
-        self, tokens_list: List[str], intersperse_blank: bool = True
+        self,
+        tokens_list: List[str],
+        intersperse_blank: bool = True,
+        add_sos: bool = False,
+        add_eos: bool = False,
     ):
         """
         Args:
@@ -87,6 +109,10 @@
             A list of token list, each corresponding to one utterance.
           intersperse_blank:
             Whether to intersperse blanks in the token sequence.
+          add_sos:
+            Whether to add sos token at the start.
+          add_eos:
+            Whether to add eos token at the end.
         Returns:
           Return a list of token id list [utterance][token_id]
@@ -103,6 +129,10 @@
             if intersperse_blank:
                 token_ids = intersperse(token_ids, self.blank_id)
+            if add_sos:
+                token_ids = [self.sos_id] + token_ids
+            if add_eos:
+                token_ids = token_ids + [self.eos_id]
             token_ids_list.append(token_ids)
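Putting the tokenizer changes together, a hedged usage sketch follows. It assumes the Tokenizer constructor takes the path to data/tokens.txt and that the class is importable from vits/tokenizer.py; both details come from the surrounding recipe rather than from this diff.

# Hedged usage sketch of the updated Tokenizer; the constructor argument and
# module path are assumptions taken from the recipe, not from this diff.
from tokenizer import Tokenizer

tokenizer = Tokenizer("data/tokens.txt")

texts = ["The quick brown fox.", "Jumps over the lazy dog."]
token_ids = tokenizer.texts_to_token_ids(
    texts,
    intersperse_blank=True,  # put <blk> between phoneme IDs
    add_sos=True,            # prepend <sos> (new in this commit)
    add_eos=True,            # append <eos> (new in this commit)
)

# One list of integer IDs per utterance; sos/eos sit outside the interspersed
# blanks because they are added after interspersing.
assert len(token_ids) == len(texts)
assert token_ids[0][0] == tokenizer.sos_id
assert token_ids[0][-1] == tokenizer.eos_id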