minor updates
This commit is contained in:
parent cb04833f8e
commit 1851443801
@@ -43,10 +43,14 @@ def get_args():

 def get_token2id(filename: Path) -> Dict[str, int]:
     """Get a dict that maps token to IDs, and save it to the given filename."""
-    all_tokens = get_espeak_map()
+    all_tokens = get_espeak_map()  # token: [token_id]
+    all_tokens = {token: token_id[0] for token, token_id in all_tokens.items()}
+    # sort by token_id
+    all_tokens = sorted(all_tokens.items(), key=lambda x: x[1])

     with open(filename, "w", encoding="utf-8") as f:
-        for token, token_id in all_tokens.items():
-            f.write(f"{token} {token_id[0]}\n")
+        for token, token_id in all_tokens:
+            f.write(f"{token} {token_id}\n")


 if __name__ == "__main__":
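The rewritten get_token2id() flattens the espeak map (each token maps to a one-element id list, as the added comment notes) and sorts the entries by id before writing. A small self-contained sketch, using a toy stand-in for get_espeak_map() (the real map comes from piper_phonemize), shows what the new write loop emits:

    # toy stand-in for get_espeak_map(); real entries are phoneme -> [id]
    all_tokens = {"a": [3], "_": [0], "b": [1]}
    all_tokens = {token: token_id[0] for token, token_id in all_tokens.items()}
    # sort by token_id so the generated token file is ordered by id
    all_tokens = sorted(all_tokens.items(), key=lambda x: x[1])
    for token, token_id in all_tokens:
        print(f"{token} {token_id}")
    # prints:
    # _ 0
    # b 1
    # a 3
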
@@ -39,7 +39,7 @@ def prepare_tokens_ljspeech():
     new_cuts = []
     for cut in cut_set:
         # Each cut only contains one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
+        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
         text = cut.supervisions[0].normalized_text
         # Text normalization
         text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
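Including the cut itself in the assert message makes the failing utterance easy to locate, not just the count. A toy reproduction (the Cut class below is a hypothetical stand-in for a lhotse cut, not the real API):

    class Cut:  # hypothetical stand-in for a lhotse cut
        def __init__(self, id, supervisions):
            self.id, self.supervisions = id, supervisions
        def __repr__(self):
            return f"Cut(id={self.id!r}, #supervisions={len(self.supervisions)})"

    cut = Cut("LJ001-0001", ["sup1", "sup2"])
    try:
        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
    except AssertionError as e:
        print(e)  # (2, Cut(id='LJ001-0001', #supervisions=2)) -- the bad cut is visible
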
@@ -1,4 +1,4 @@
-# Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao)
+# Copyright 2023-2024 Xiaomi Corp. (authors: Zengwei Yao)
 #
 # See ../../LICENSE for clarification regarding multiple authors
 #
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import logging
 from typing import Dict, List

 import tacotron_cleaner.cleaners
@@ -55,7 +56,8 @@ class Tokenizer(object):
         intersperse_blank: bool = True,
         add_sos: bool = False,
         add_eos: bool = False,
-    ):
+        lang: str = "en-us",
+    ) -> List[List[int]]:
         """
         Args:
           texts:
@@ -66,6 +68,8 @@ class Tokenizer(object):
             Whether to add sos token at the start.
           add_eos:
             Whether to add eos token at the end.
+          lang:
+            Language argument passed to phonemize_espeak().

         Returns:
           Return a list of token id list [utterance][token_id]
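With the new lang argument, texts_to_token_ids() forwards the language to phonemize_espeak() instead of hard-coding "en-us". A hedged usage sketch follows; the constructor argument (a path to the generated token file) and the import path are assumptions, and only the lang forwarding is established by this diff:

    from tokenizer import Tokenizer  # module changed in this commit; import path assumed

    tokenizer = Tokenizer("data/tokens.txt")  # assumed: constructor takes the token file
    token_ids = tokenizer.texts_to_token_ids(
        ["Hello world."],
        lang="en-us",  # default; pass another espeak voice name for non-English text
    )
    # token_ids is a List[List[int]]: one id list per input utterance
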
@@ -76,14 +80,16 @@ class Tokenizer(object):
             # Text normalization
             text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
             # Convert to phonemes
-            tokens_list = phonemize_espeak(text, "en-us")
+            tokens_list = phonemize_espeak(text, lang)
             tokens = []
             for t in tokens_list:
                 tokens.extend(t)

             token_ids = []
             for t in tokens:
-                assert t in self.token2id, t
+                if t not in self.token2id:
+                    logging.warning(f"Skip OOV {t}")
+                    continue
                 token_ids.append(self.token2id[t])

             if intersperse_blank:
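The membership assert is replaced by a warn-and-skip branch, so a single out-of-vocabulary phoneme no longer aborts tokenization of the whole batch. A minimal self-contained sketch of the new lookup loop with a toy token2id map:

    import logging

    token2id = {"h": 1, "e": 2, "l": 3, "o": 4}  # toy map; the real one comes from the token file
    tokens = ["h", "e", "l", "l", "o", "@"]      # "@" is not in the map

    token_ids = []
    for t in tokens:
        if t not in token2id:
            logging.warning(f"Skip OOV {t}")
            continue
        token_ids.append(token2id[t])

    print(token_ids)  # [1, 2, 3, 3, 4] -- the OOV token is dropped after a warning
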
@@ -103,7 +109,7 @@ class Tokenizer(object):
         intersperse_blank: bool = True,
         add_sos: bool = False,
         add_eos: bool = False,
-    ):
+    ) -> List[List[int]]:
         """
         Args:
           tokens_list:
@@ -123,7 +129,9 @@ class Tokenizer(object):
         for tokens in tokens_list:
             token_ids = []
             for t in tokens:
-                assert t in self.token2id, t
+                if t not in self.token2id:
+                    logging.warning(f"Skip OOV {t}")
+                    continue
                 token_ids.append(self.token2id[t])

             if intersperse_blank: