Commit cb04833f8e ("remove extra tokens") in https://github.com/k2-fsa/icefall.git, parent e7749120de.
@@ -43,23 +43,10 @@ def get_args():

 def get_token2id(filename: Path) -> Dict[str, int]:
     """Get a dict that maps token to IDs, and save it to the given filename."""
-    extra_tokens = [
-        "<blk>",  # 0 for blank
-        "<sos>",  # 1 for sos
-        "<eos>",  # 2 for eos
-        "<unk>",  # 3 for OOV
-    ]
-
-    all_tokens = list(get_espeak_map().keys())
-
-    for t in extra_tokens:
-        assert t not in all_tokens, t
-
-    all_tokens = extra_tokens + all_tokens
+    all_tokens = get_espeak_map()

     with open(filename, "w", encoding="utf-8") as f:
-        for i, token in enumerate(all_tokens):
-            f.write(f"{token} {i}\n")
+        for token, token_id in all_tokens.items():
+            f.write(f"{token} {token_id[0]}\n")


 if __name__ == "__main__":
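The writer above now emits one "token id" pair per line, with the id taken straight from piper_phonemize's espeak map (whose values are lists of ids, hence token_id[0]). A minimal sketch of reading such a file back, assuming only that two-column format; read_token2id is a hypothetical helper, not part of this commit:

from pathlib import Path
from typing import Dict


def read_token2id(filename: Path) -> Dict[str, int]:
    """Parse lines of the form "<token> <id>" written by get_token2id."""
    token2id: Dict[str, int] = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            # The token itself may be a single space (the word separator),
            # so split on the last space only instead of using split().
            token, idx = line.rsplit(" ", 1)
            token2id[token] = int(idx)
    return token2id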
@@ -82,7 +82,8 @@ if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Prepare phoneme tokens for LJSpeech"
   # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  # - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
+  # - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize,
+  #   could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
   # - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/spectrogram/.ljspeech_with_token.done ]; then
     ./local/prepare_tokens_ljspeech.py
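Stage 3 runs ./local/prepare_tokens_ljspeech.py, which is why both packages are required: espnet_tts_frontend normalizes the raw transcripts and piper_phonemize converts them to espeak phonemes. A rough sketch of that text-to-phoneme step; the cleaner name and call signatures are assumptions about the two libraries, not taken from this diff:

import tacotron_cleaner.cleaners  # provided by espnet_tts_frontend
from piper_phonemize import phonemize_espeak

text = "Dr. Smith lives at 123 Main St."
# Expand abbreviations, numbers, casing, etc.
text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
# Returns one list of phoneme strings per sentence.
phonemes = phonemize_espeak(text, "en-us")
print(phonemes)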
@@ -119,7 +120,8 @@ if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Generate token file"
   # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  # - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
+  # - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize,
+  #   could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
   # - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/tokens.txt ]; then
     ./local/prepare_token_file.py --tokens data/tokens.txt
@@ -38,12 +38,15 @@ class Tokenizer(object):
                     id = int(info[0])
                 else:
                     token, id = info[0], int(info[1])
+                assert token not in self.token2id, token
                 self.token2id[token] = id

-        self.blank_id = self.token2id["<blk>"]
-        self.sos_id = self.token2id["<sos>"]
-        self.eos_id = self.token2id["<eos>"]
-        self.oov_id = self.token2id["<unk>"]
+        # Refer to https://github.com/rhasspy/piper/blob/master/TRAINING.md
+        self.pad_id = self.token2id["_"]  # padding
+        self.sos_id = self.token2id["^"]  # beginning of an utterance (bos)
+        self.eos_id = self.token2id["$"]  # end of an utterance (eos)
+        self.space_id = self.token2id[" "]  # word separator (whitespace)
+
         self.vocab_size = len(self.token2id)

     def texts_to_token_ids(
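The Tokenizer no longer invents its own <blk>/<sos>/<eos>/<unk> placeholders; it reuses the symbols piper already reserves, per the TRAINING.md linked in the comment: "_" for padding, "^" and "$" for the utterance boundaries, and " " between words. A small sketch that looks those ids up in the espeak map instead of hard-coding them, since the exact numbers are an implementation detail of piper-phonemize:

from piper_phonemize import get_espeak_map

m = get_espeak_map()  # Dict[str, List[int]]
for name, sym in [("pad", "_"), ("bos", "^"), ("eos", "$"), ("space", " ")]:
    print(f"{name} {sym!r} -> {m[sym][0]}")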
@@ -80,13 +83,11 @@ class Tokenizer(object):

             token_ids = []
             for t in tokens:
-                if t in self.token2id:
-                    token_ids.append(self.token2id[t])
-                else:
-                    token_ids.append(self.oov_id)
+                assert t in self.token2id, t
+                token_ids.append(self.token2id[t])

             if intersperse_blank:
-                token_ids = intersperse(token_ids, self.blank_id)
+                token_ids = intersperse(token_ids, self.pad_id)
             if add_sos:
                 token_ids = [self.sos_id] + token_ids
             if add_eos:
@@ -122,13 +123,11 @@ class Tokenizer(object):
         for tokens in tokens_list:
             token_ids = []
             for t in tokens:
-                if t in self.token2id:
-                    token_ids.append(self.token2id[t])
-                else:
-                    token_ids.append(self.oov_id)
+                assert t in self.token2id, t
+                token_ids.append(self.token2id[t])

             if intersperse_blank:
-                token_ids = intersperse(token_ids, self.blank_id)
+                token_ids = intersperse(token_ids, self.pad_id)
             if add_sos:
                 token_ids = [self.sos_id] + token_ids
             if add_eos:
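Both conversion methods above now assert on unknown tokens instead of falling back to an OOV id (the piper map has no <unk>), and they intersperse pad_id rather than the removed blank. For reference, a sketch of intersperse with the usual VITS-style semantics these calls assume; icefall's actual helper may differ in detail:

from typing import List


def intersperse(sequence: List[int], item: int) -> List[int]:
    # Place `item` at every even index:
    # [a, b, c] -> [item, a, item, b, item, c, item]
    result = [item] * (len(sequence) * 2 + 1)
    result[1::2] = sequence
    return result


assert intersperse([5, 9], 0) == [0, 5, 0, 9, 0]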