mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-09-03 06:04:18 +00:00
use piper_phonemize as text tokenizer in ljspeech recipe
This commit is contained in:
parent
ff6784d147
commit
2cf5891c15
@ -17,7 +17,7 @@
|
||||
|
||||
|
||||
"""
|
||||
This file reads the texts in given manifest and generates the file that maps tokens to IDs.
|
||||
This file generates the file that maps tokens to IDs.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@ -25,80 +25,47 @@ import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
from lhotse import load_manifest
|
||||
from piper_phonemize import get_espeak_map
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
"--manifest-file",
|
||||
type=Path,
|
||||
default=Path("data/spectrogram/ljspeech_cuts_train.jsonl.gz"),
|
||||
help="Path to the manifest file",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--tokens",
|
||||
type=Path,
|
||||
default=Path("data/tokens.txt"),
|
||||
help="Path to the tokens",
|
||||
help="Path to the dict that maps the text tokens to IDs",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
|
||||
"""Write a symbol to ID mapping to a file.
|
||||
|
||||
Note:
|
||||
No need to implement `read_mapping` as it can be done
|
||||
through :func:`k2.SymbolTable.from_file`.
|
||||
|
||||
Args:
|
||||
filename:
|
||||
Filename to save the mapping.
|
||||
sym2id:
|
||||
A dict mapping symbols to IDs.
|
||||
Returns:
|
||||
Return None.
|
||||
"""
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
for sym, i in sym2id.items():
|
||||
f.write(f"{sym} {i}\n")
|
||||
|
||||
|
||||
def get_token2id(manifest_file: Path) -> Dict[str, int]:
|
||||
"""Return a dict that maps token to IDs."""
|
||||
def get_token2id(filename: Path) -> Dict[str, int]:
|
||||
"""Get a dict that maps token to IDs, and save it to the given filename."""
|
||||
extra_tokens = [
|
||||
"<blk>", # 0 for blank
|
||||
"<sos/eos>", # 1 for sos and eos symbols.
|
||||
"<unk>", # 2 for OOV
|
||||
"<sos>", # 1 for sos
|
||||
"<eos>", # 2 for eos
|
||||
"<unk>", # 3 for OOV
|
||||
]
|
||||
all_tokens = set()
|
||||
|
||||
cut_set = load_manifest(manifest_file)
|
||||
all_tokens = list(get_espeak_map().keys())
|
||||
|
||||
for cut in cut_set:
|
||||
# Each cut only contain one supervision
|
||||
assert len(cut.supervisions) == 1, len(cut.supervisions)
|
||||
for t in cut.tokens:
|
||||
all_tokens.add(t)
|
||||
for t in extra_tokens:
|
||||
assert t not in all_tokens, t
|
||||
|
||||
all_tokens = extra_tokens + list(all_tokens)
|
||||
all_tokens = extra_tokens + all_tokens
|
||||
|
||||
token2id: Dict[str, int] = {token: i for i, token in enumerate(all_tokens)}
|
||||
return token2id
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
for i, token in enumerate(all_tokens):
|
||||
f.write(f"{token} {i}\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||
|
||||
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||
|
||||
args = get_args()
|
||||
manifest_file = Path(args.manifest_file)
|
||||
out_file = Path(args.tokens)
|
||||
|
||||
token2id = get_token2id(manifest_file)
|
||||
write_mapping(out_file, token2id)
|
||||
get_token2id(out_file)
|
||||
|
@ -23,9 +23,9 @@ This file reads the texts in given manifest and save the new cuts with phoneme t
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import g2p_en
|
||||
import tacotron_cleaner.cleaners
|
||||
from lhotse import CutSet, load_manifest
|
||||
from piper_phonemize import phonemize_espeak
|
||||
|
||||
|
||||
def prepare_tokens_ljspeech():
|
||||
@ -35,7 +35,6 @@ def prepare_tokens_ljspeech():
|
||||
partition = "all"
|
||||
|
||||
cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
|
||||
g2p = g2p_en.G2p()
|
||||
|
||||
new_cuts = []
|
||||
for cut in cut_set:
|
||||
@ -45,7 +44,11 @@ def prepare_tokens_ljspeech():
|
||||
# Text normalization
|
||||
text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
|
||||
# Convert to phonemes
|
||||
cut.tokens = g2p(text)
|
||||
tokens_list = phonemize_espeak(text, "en-us")
|
||||
tokens = []
|
||||
for t in tokens_list:
|
||||
tokens.extend(t)
|
||||
cut.tokens = tokens
|
||||
new_cuts.append(cut)
|
||||
|
||||
new_cut_set = CutSet.from_cuts(new_cuts)
|
||||
|
@ -30,7 +30,7 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
|
||||
cd vits/monotonic_align
|
||||
python setup.py build_ext --inplace
|
||||
cd ../../
|
||||
else
|
||||
else
|
||||
log "monotonic_align lib already built"
|
||||
fi
|
||||
fi
|
||||
@ -80,6 +80,10 @@ fi
|
||||
|
||||
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
log "Stage 3: Prepare phoneme tokens for LJSpeech"
|
||||
# We assume you have installed piper_phonemize and espnet_tts_frontend.
|
||||
# If not, please install them with:
|
||||
# - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
|
||||
# - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
|
||||
if [ ! -e data/spectrogram/.ljspeech_with_token.done ]; then
|
||||
./local/prepare_tokens_ljspeech.py
|
||||
mv data/spectrogram/ljspeech_cuts_with_tokens_all.jsonl.gz \
|
||||
@ -113,13 +117,11 @@ fi
|
||||
|
||||
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
||||
log "Stage 5: Generate token file"
|
||||
# We assume you have installed g2p_en and espnet_tts_frontend.
|
||||
# We assume you have installed piper_phonemize and espnet_tts_frontend.
|
||||
# If not, please install them with:
|
||||
# - g2p_en: `pip install g2p_en`, refer to https://github.com/Kyubyong/g2p
|
||||
# - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
|
||||
# - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
|
||||
if [ ! -e data/tokens.txt ]; then
|
||||
./local/prepare_token_file.py \
|
||||
--manifest-file data/spectrogram/ljspeech_cuts_train.jsonl.gz \
|
||||
--tokens data/tokens.txt
|
||||
./local/prepare_token_file.py --tokens data/tokens.txt
|
||||
fi
|
||||
fi
|
||||
|
@ -16,8 +16,8 @@
|
||||
|
||||
from typing import Dict, List
|
||||
|
||||
import g2p_en
|
||||
import tacotron_cleaner.cleaners
|
||||
from piper_phonemize import phonemize_espeak
|
||||
from utils import intersperse
|
||||
|
||||
|
||||
@ -41,18 +41,28 @@ class Tokenizer(object):
|
||||
self.token2id[token] = id
|
||||
|
||||
self.blank_id = self.token2id["<blk>"]
|
||||
self.sos_id = self.token2id["<sos>"]
|
||||
self.eos_id = self.token2id["<eos>"]
|
||||
self.oov_id = self.token2id["<unk>"]
|
||||
self.vocab_size = len(self.token2id)
|
||||
|
||||
self.g2p = g2p_en.G2p()
|
||||
|
||||
def texts_to_token_ids(self, texts: List[str], intersperse_blank: bool = True):
|
||||
def texts_to_token_ids(
|
||||
self,
|
||||
texts: List[str],
|
||||
intersperse_blank: bool = True,
|
||||
add_sos: bool = False,
|
||||
add_eos: bool = False,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
texts:
|
||||
A list of transcripts.
|
||||
intersperse_blank:
|
||||
Whether to intersperse blanks in the token sequence.
|
||||
add_sos:
|
||||
Whether to add sos token at the start.
|
||||
add_eos:
|
||||
Whether to add eos token at the end.
|
||||
|
||||
Returns:
|
||||
Return a list of token id list [utterance][token_id]
|
||||
@ -63,7 +73,11 @@ class Tokenizer(object):
|
||||
# Text normalization
|
||||
text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
|
||||
# Convert to phonemes
|
||||
tokens = self.g2p(text)
|
||||
tokens_list = phonemize_espeak(text, "en-us")
|
||||
tokens = []
|
||||
for t in tokens_list:
|
||||
tokens.extend(t)
|
||||
|
||||
token_ids = []
|
||||
for t in tokens:
|
||||
if t in self.token2id:
|
||||
@ -73,13 +87,21 @@ class Tokenizer(object):
|
||||
|
||||
if intersperse_blank:
|
||||
token_ids = intersperse(token_ids, self.blank_id)
|
||||
if add_sos:
|
||||
token_ids = [self.sos_id] + token_ids
|
||||
if add_eos:
|
||||
token_ids = token_ids + [self.eos_id]
|
||||
|
||||
token_ids_list.append(token_ids)
|
||||
|
||||
return token_ids_list
|
||||
|
||||
def tokens_to_token_ids(
|
||||
self, tokens_list: List[str], intersperse_blank: bool = True
|
||||
self,
|
||||
tokens_list: List[str],
|
||||
intersperse_blank: bool = True,
|
||||
add_sos: bool = False,
|
||||
add_eos: bool = False,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
@ -87,6 +109,10 @@ class Tokenizer(object):
|
||||
A list of token list, each corresponding to one utterance.
|
||||
intersperse_blank:
|
||||
Whether to intersperse blanks in the token sequence.
|
||||
add_sos:
|
||||
Whether to add sos token at the start.
|
||||
add_eos:
|
||||
Whether to add eos token at the end.
|
||||
|
||||
Returns:
|
||||
Return a list of token id list [utterance][token_id]
|
||||
@ -103,6 +129,10 @@ class Tokenizer(object):
|
||||
|
||||
if intersperse_blank:
|
||||
token_ids = intersperse(token_ids, self.blank_id)
|
||||
if add_sos:
|
||||
token_ids = [self.sos_id] + token_ids
|
||||
if add_eos:
|
||||
token_ids = token_ids + [self.eos_id]
|
||||
|
||||
token_ids_list.append(token_ids)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user