remove extra tokens

This commit is contained in:
yaozengwei 2024-02-20 21:18:03 +08:00
parent e7749120de
commit cb04833f8e
3 changed files with 20 additions and 32 deletions

View File

@@ -43,23 +43,10 @@ def get_args():
def get_token2id(filename: Path) -> Dict[str, int]:
"""Get a dict that maps token to IDs, and save it to the given filename."""
extra_tokens = [
"<blk>", # 0 for blank
"<sos>", # 1 for sos
"<eos>", # 2 for eos
"<unk>", # 3 for OOV
]
all_tokens = list(get_espeak_map().keys())
for t in extra_tokens:
assert t not in all_tokens, t
all_tokens = extra_tokens + all_tokens
all_tokens = get_espeak_map()
with open(filename, "w", encoding="utf-8") as f:
for i, token in enumerate(all_tokens):
f.write(f"{token} {i}\n")
for token, token_id in all_tokens.items():
f.write(f"{token} {token_id[0]}\n")
if __name__ == "__main__":

View File

@@ -82,7 +82,8 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Prepare phoneme tokens for LJSpeech"
# We assume you have installed piper_phonemize and espnet_tts_frontend.
# If not, please install them with:
# - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
# - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize,
# could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
# - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
if [ ! -e data/spectrogram/.ljspeech_with_token.done ]; then
./local/prepare_tokens_ljspeech.py
@@ -119,7 +120,8 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Generate token file"
# We assume you have installed piper_phonemize and espnet_tts_frontend.
# If not, please install them with:
# - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
# - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize,
# could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
# - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
if [ ! -e data/tokens.txt ]; then
./local/prepare_token_file.py --tokens data/tokens.txt

View File

@@ -38,12 +38,15 @@ class Tokenizer(object):
id = int(info[0])
else:
token, id = info[0], int(info[1])
assert token not in self.token2id, token
self.token2id[token] = id
self.blank_id = self.token2id["<blk>"]
self.sos_id = self.token2id["<sos>"]
self.eos_id = self.token2id["<eos>"]
self.oov_id = self.token2id["<unk>"]
# Refer to https://github.com/rhasspy/piper/blob/master/TRAINING.md
self.pad_id = self.token2id["_"] # padding
self.sos_id = self.token2id["^"] # beginning of an utterance (bos)
self.eos_id = self.token2id["$"] # end of an utterance (eos)
self.space_id = self.token2id[" "] # word separator (whitespace)
self.vocab_size = len(self.token2id)
def texts_to_token_ids(
@@ -80,13 +83,11 @@ class Tokenizer(object):
token_ids = []
for t in tokens:
if t in self.token2id:
token_ids.append(self.token2id[t])
else:
token_ids.append(self.oov_id)
assert t in self.token2id, t
token_ids.append(self.token2id[t])
if intersperse_blank:
token_ids = intersperse(token_ids, self.blank_id)
token_ids = intersperse(token_ids, self.pad_id)
if add_sos:
token_ids = [self.sos_id] + token_ids
if add_eos:
@@ -122,13 +123,11 @@ class Tokenizer(object):
for tokens in tokens_list:
token_ids = []
for t in tokens:
if t in self.token2id:
token_ids.append(self.token2id[t])
else:
token_ids.append(self.oov_id)
assert t in self.token2id, t
token_ids.append(self.token2id[t])
if intersperse_blank:
token_ids = intersperse(token_ids, self.blank_id)
token_ids = intersperse(token_ids, self.pad_id)
if add_sos:
token_ids = [self.sos_id] + token_ids
if add_eos: