init commit

This commit is contained in:
jinzr 2024-02-29 15:04:08 +08:00
parent 42d68f0755
commit a0dc097ad9
6 changed files with 37 additions and 22 deletions

View File

@@ -24,9 +24,9 @@ This file reads the texts in given manifest and save the new cuts with phoneme tokens
 import logging
 from pathlib import Path
-import g2p_en
 import tacotron_cleaner.cleaners
 from lhotse import CutSet, load_manifest
+from piper_phonemize import phonemize_espeak
 from tqdm.auto import tqdm
@@ -37,17 +37,20 @@ def prepare_tokens_vctk():
     partition = "all"

     cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
-    g2p = g2p_en.G2p()

     new_cuts = []
     for cut in tqdm(cut_set):
         # Each cut only contains one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
+        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
         text = cut.supervisions[0].text
         # Text normalization
         text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
         # Convert to phonemes
-        cut.tokens = g2p(text)
+        tokens_list = phonemize_espeak(text, "en-us")
+        tokens = []
+        for t in tokens_list:
+            tokens.extend(t)
+        cut.tokens = tokens
         new_cuts.append(cut)

     new_cut_set = CutSet.from_cuts(new_cuts)

View File

@@ -78,6 +78,13 @@ fi
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Prepare phoneme tokens for VCTK"
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
+  # If not, please install them with:
+  # - piper_phonemize:
+  #     refer to https://github.com/rhasspy/piper-phonemize,
+  #     could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
+  # - espnet_tts_frontend:
+  #     `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/spectrogram/.vctk_with_token.done ]; then
     ./local/prepare_tokens_vctk.py
     mv data/spectrogram/vctk_cuts_with_tokens_all.jsonl.gz \
@@ -111,14 +118,15 @@ fi
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Generate token file"
-  # We assume you have installed g2p_en and espnet_tts_frontend.
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  # - g2p_en: `pip install g2p_en`, refer to https://github.com/Kyubyong/g2p
-  # - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
+  # - piper_phonemize:
+  #     refer to https://github.com/rhasspy/piper-phonemize,
+  #     could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
+  # - espnet_tts_frontend:
+  #     `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/tokens.txt ]; then
-    ./local/prepare_token_file.py \
-      --manifest-file data/spectrogram/vctk_cuts_train.jsonl.gz \
-      --tokens data/tokens.txt
+    ./local/prepare_token_file.py --tokens data/tokens.txt
   fi
 fi

View File

@@ -231,8 +231,7 @@ def main():
     params.update(vars(args))

     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size

     with open(args.speakers) as f:

View File

@@ -135,14 +135,16 @@ def infer_dataset(
             batch_size = len(batch["tokens"])

             tokens = batch["tokens"]
-            tokens = tokenizer.tokens_to_token_ids(tokens)
+            tokens = tokenizer.tokens_to_token_ids(
+                tokens, intersperse_blank=True, add_sos=True, add_eos=True
+            )
             tokens = k2.RaggedTensor(tokens)
             row_splits = tokens.shape.row_splits(1)
             tokens_lens = row_splits[1:] - row_splits[:-1]
             tokens = tokens.to(device)
             tokens_lens = tokens_lens.to(device)
             # tensor of shape (B, T)
-            tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id)
+            tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)

             speakers = (
                 torch.Tensor([speaker_map[sid] for sid in batch["speakers"]])
                 .int()
@@ -214,8 +216,7 @@ def main():
         device = torch.device("cuda", 0)

     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size

     # we need cut ids to display recognition results.
# we need cut ids to display recognition results. # we need cut ids to display recognition results.

View File

@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 #
-# Copyright 2023 Xiaomi Corporation (Author: Zengwei Yao)
+# Copyright 2023-2024 Xiaomi Corporation (Author: Zengwei Yao,
+#                                                 Zengrui Jin,)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -122,7 +123,9 @@ def main():
     model = OnnxModel(args.model_filename)

     text = "I went there to see the land, the people and how their system works, end quote."
-    tokens = tokenizer.texts_to_token_ids([text])
+    tokens = tokenizer.texts_to_token_ids(
+        [text], intersperse_blank=True, add_sos=True, add_eos=True
+    )
     tokens = torch.tensor(tokens)  # (1, T)
     tokens_lens = torch.tensor([tokens.shape[1]], dtype=torch.int64)  # (1, T)
     speaker = torch.tensor([1], dtype=torch.int64)  # (1, )

View File

@@ -342,14 +342,16 @@ def prepare_input(
         torch.Tensor([speaker_map[sid] for sid in batch["speakers"]]).int().to(device)
     )

-    tokens = tokenizer.tokens_to_token_ids(tokens)
+    tokens = tokenizer.texts_to_token_ids(
+        tokens, intersperse_blank=True, add_sos=True, add_eos=True
+    )
     tokens = k2.RaggedTensor(tokens)
     row_splits = tokens.shape.row_splits(1)
     tokens_lens = row_splits[1:] - row_splits[:-1]
     tokens = tokens.to(device)
     tokens_lens = tokens_lens.to(device)
     # a tensor of shape (B, T)
-    tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id)
+    tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)

     return audio, audio_lens, features, features_lens, tokens, tokens_lens, speakers
@@ -812,8 +814,7 @@ def run(rank, world_size, args):
     logging.info(f"Device: {device}")

     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size

     vctk = VctkTtsDataModule(args)