Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-26 18:24:18 +00:00)

init commit

parent 42d68f0755
commit a0dc097ad9
@@ -24,9 +24,9 @@ This file reads the texts in given manifest and save the new cuts with phoneme tokens
 import logging
 
 from pathlib import Path
 
-import g2p_en
 import tacotron_cleaner.cleaners
 from lhotse import CutSet, load_manifest
+from piper_phonemize import phonemize_espeak
 from tqdm.auto import tqdm
@@ -37,17 +37,20 @@ def prepare_tokens_vctk():
     partition = "all"
 
     cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
-    g2p = g2p_en.G2p()
 
     new_cuts = []
     for cut in tqdm(cut_set):
         # Each cut only contains one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
+        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
         text = cut.supervisions[0].text
         # Text normalization
         text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
         # Convert to phonemes
-        cut.tokens = g2p(text)
+        tokens_list = phonemize_espeak(text, "en-us")
+        tokens = []
+        for t in tokens_list:
+            tokens.extend(t)
+        cut.tokens = tokens
         new_cuts.append(cut)
 
     new_cut_set = CutSet.from_cuts(new_cuts)
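As context for the new phonemization path above, here is a minimal runnable sketch, assuming piper_phonemize is installed (the sample text is made up). phonemize_espeak returns one phoneme list per sentence, which is why the hunk flattens the per-sentence lists into a single token sequence per cut.

# Minimal sketch of the phonemize_espeak call used in the hunk above;
# assumes piper_phonemize is installed. The sample text is illustrative.
from piper_phonemize import phonemize_espeak

text = "Hello world. How are you?"
tokens_list = phonemize_espeak(text, "en-us")  # one phoneme list per sentence
tokens = []
for t in tokens_list:
    tokens.extend(t)  # flatten into a single token sequence
print(tokens)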
@@ -78,6 +78,13 @@ fi
 
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Prepare phoneme tokens for VCTK"
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
+  # If not, please install them with:
+  #   - piper_phonemize:
+  #       refer to https://github.com/rhasspy/piper-phonemize,
+  #       could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
+  #   - espnet_tts_frontend:
+  #       `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/spectrogram/.vctk_with_token.done ]; then
     ./local/prepare_tokens_vctk.py
     mv data/spectrogram/vctk_cuts_with_tokens_all.jsonl.gz \
@@ -111,14 +118,15 @@ fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Generate token file"
-  # We assume you have installed g2p_en and espnet_tts_frontend.
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  #   - g2p_en: `pip install g2p_en`, refer to https://github.com/Kyubyong/g2p
-  #   - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
+  #   - piper_phonemize:
+  #       refer to https://github.com/rhasspy/piper-phonemize,
+  #       could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
+  #   - espnet_tts_frontend:
+  #       `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/tokens.txt ]; then
-    ./local/prepare_token_file.py \
-      --manifest-file data/spectrogram/vctk_cuts_train.jsonl.gz \
-      --tokens data/tokens.txt
+    ./local/prepare_token_file.py --tokens data/tokens.txt
   fi
 fi
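Before running stages 3 and 5, the two dependencies named in the comments above can be checked with a quick import test. This is a sanity check for the reader, not part of the recipe; tacotron_cleaner ships with the espnet_tts_frontend package.

# Sanity check, not part of the recipe: verify the dependencies the
# script comments ask for are importable.
import piper_phonemize
import tacotron_cleaner.cleaners  # provided by espnet_tts_frontend

print("piper_phonemize and espnet_tts_frontend are importable")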
@@ -231,8 +231,7 @@ def main():
     params.update(vars(args))
 
     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size
 
     with open(args.speakers) as f:
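The same blank_id/oov_id to pad_id change recurs in the other entry points below. As an illustration only (this is not the recipe's actual Tokenizer), the convention with a piper-style token table is that a single dedicated pad token doubles as the blank, so one pad_id replaces the old blank_id/oov_id pair.

# Illustrative sketch only; the recipe's real Tokenizer lives in the repo.
# The token table and the choice of "_" as pad/blank are assumptions here.
class TinyTokenizer:
    def __init__(self) -> None:
        self.token2id = {"_": 0, "a": 1, "b": 2}  # made-up table
        self.pad_id = self.token2id["_"]          # pad doubles as blank
        self.vocab_size = len(self.token2id)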
@@ -135,14 +135,16 @@ def infer_dataset(
         batch_size = len(batch["tokens"])
 
         tokens = batch["tokens"]
-        tokens = tokenizer.tokens_to_token_ids(tokens)
+        tokens = tokenizer.tokens_to_token_ids(
+            tokens, intersperse_blank=True, add_sos=True, add_eos=True
+        )
         tokens = k2.RaggedTensor(tokens)
         row_splits = tokens.shape.row_splits(1)
         tokens_lens = row_splits[1:] - row_splits[:-1]
         tokens = tokens.to(device)
         tokens_lens = tokens_lens.to(device)
         # tensor of shape (B, T)
-        tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id)
+        tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)
         speakers = (
             torch.Tensor([speaker_map[sid] for sid in batch["speakers"]])
             .int()
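The ragged-to-padded conversion in the hunk above can be exercised in isolation. A runnable sketch, assuming k2 is installed; the token ids are toys and the pad id of 0 is an assumption for illustration:

# Runnable sketch of the ragged-to-padded conversion used in infer_dataset.
import k2

token_ids = [[5, 2, 7], [9, 1], [4, 4, 4, 4]]  # toy ids of varying lengths
tokens = k2.RaggedTensor(token_ids)
row_splits = tokens.shape.row_splits(1)
tokens_lens = row_splits[1:] - row_splits[:-1]  # per-utterance lengths: [3, 2, 4]
tokens = tokens.pad(mode="constant", padding_value=0)  # dense (B, T) = (3, 4)
print(tokens, tokens_lens)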
@@ -214,8 +216,7 @@ def main():
     device = torch.device("cuda", 0)
 
     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size
 
     # we need cut ids to display recognition results.
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 #
-# Copyright      2023 Xiaomi Corporation (Author: Zengwei Yao)
+# Copyright 2023-2024 Xiaomi Corporation (Author: Zengwei Yao,
+#                                                 Zengrui Jin,)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@@ -122,7 +123,9 @@ def main():
     model = OnnxModel(args.model_filename)
 
     text = "I went there to see the land, the people and how their system works, end quote."
-    tokens = tokenizer.texts_to_token_ids([text])
+    tokens = tokenizer.texts_to_token_ids(
+        [text], intersperse_blank=True, add_sos=True, add_eos=True
+    )
     tokens = torch.tensor(tokens)  # (1, T)
     tokens_lens = torch.tensor([tokens.shape[1]], dtype=torch.int64)  # (1, T)
     speaker = torch.tensor([1], dtype=torch.int64)  # (1, )
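The intersperse_blank/add_sos/add_eos flags added above shape the id sequence the model sees. A sketch with a hypothetical helper (the helper name, ids, and exact placement are made up; the real logic is in the recipe's Tokenizer):

# Hypothetical helper illustrating what intersperse_blank=True,
# add_sos=True, add_eos=True might produce; not the recipe's implementation.
from typing import List

def intersperse(ids: List[int], blank_id: int = 0) -> List[int]:
    out = [blank_id] * (2 * len(ids) + 1)
    out[1::2] = ids  # a blank between (and around) every token id
    return out

ids = intersperse([11, 12, 13], blank_id=0)  # [0, 11, 0, 12, 0, 13, 0]
ids = [1] + ids + [2]                        # hypothetical sos_id=1, eos_id=2
print(ids)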
@@ -342,14 +342,16 @@ def prepare_input(
         torch.Tensor([speaker_map[sid] for sid in batch["speakers"]]).int().to(device)
     )
 
-    tokens = tokenizer.tokens_to_token_ids(tokens)
+    tokens = tokenizer.texts_to_token_ids(
+        tokens, intersperse_blank=True, add_sos=True, add_eos=True
+    )
     tokens = k2.RaggedTensor(tokens)
     row_splits = tokens.shape.row_splits(1)
     tokens_lens = row_splits[1:] - row_splits[:-1]
     tokens = tokens.to(device)
     tokens_lens = tokens_lens.to(device)
     # a tensor of shape (B, T)
-    tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id)
+    tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)
 
     return audio, audio_lens, features, features_lens, tokens, tokens_lens, speakers
@@ -812,8 +814,7 @@ def run(rank, world_size, args):
     logging.info(f"Device: {device}")
 
     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size
 
     vctk = VctkTtsDataModule(args)