mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-26 18:24:18 +00:00)

commit a0dc097ad9 (parent 42d68f0755): init commit
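In summary, this commit moves the VCTK TTS recipe from g2p_en-based grapheme-to-phoneme conversion to espeak phonemization via piper_phonemize, and updates the recipe's Tokenizer usage accordingly: pad_id replaces blank_id and oov_id, and token-id conversion gains intersperse_blank/add_sos/add_eos flags.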
@@ -24,9 +24,9 @@ This file reads the texts in given manifest and save the new cuts with phoneme t
 import logging
 from pathlib import Path
 
-import g2p_en
 import tacotron_cleaner.cleaners
 from lhotse import CutSet, load_manifest
+from piper_phonemize import phonemize_espeak
 from tqdm.auto import tqdm
 
 
@@ -37,17 +37,20 @@ def prepare_tokens_vctk():
     partition = "all"
 
     cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
-    g2p = g2p_en.G2p()
 
     new_cuts = []
     for cut in tqdm(cut_set):
         # Each cut only contains one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
+        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
         text = cut.supervisions[0].text
         # Text normalization
         text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
         # Convert to phonemes
-        cut.tokens = g2p(text)
+        tokens_list = phonemize_espeak(text, "en-us")
+        tokens = []
+        for t in tokens_list:
+            tokens.extend(t)
+        cut.tokens = tokens
         new_cuts.append(cut)
 
     new_cut_set = CutSet.from_cuts(new_cuts)
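The hunk above swaps g2p_en's G2p for espeak-based phonemization. Note that phonemize_espeak returns one list of phoneme strings per sentence, which is why the new code flattens the nested lists before assigning cut.tokens. A minimal standalone sketch of the new conversion step, assuming piper_phonemize and tacotron_cleaner are installed:

    # Sketch of the phonemization step introduced above.
    import tacotron_cleaner.cleaners
    from piper_phonemize import phonemize_espeak

    text = "Hello there! How are you?"
    # Same normalization the recipe applies before phonemization.
    text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
    # Returns a list of phoneme lists, one per sentence, e.g.
    # [['h', 'ə', 'l', 'ˈoʊ', ...], [...]]
    tokens_list = phonemize_espeak(text, "en-us")
    # Flatten into a single token sequence for the cut.
    tokens = []
    for t in tokens_list:
        tokens.extend(t)
    print(tokens)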
@@ -78,6 +78,13 @@ fi
 
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
   log "Stage 3: Prepare phoneme tokens for VCTK"
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
+  # If not, please install them with:
+  # - piper_phonemize:
+  #   refer to https://github.com/rhasspy/piper-phonemize,
+  #   could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
+  # - espnet_tts_frontend:
+  #   `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/spectrogram/.vctk_with_token.done ]; then
     ./local/prepare_tokens_vctk.py
     mv data/spectrogram/vctk_cuts_with_tokens_all.jsonl.gz \
@@ -111,14 +118,15 @@ fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
   log "Stage 5: Generate token file"
-  # We assume you have installed g2p_en and espnet_tts_frontend.
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
   # If not, please install them with:
-  # - g2p_en: `pip install g2p_en`, refer to https://github.com/Kyubyong/g2p
-  # - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
+  # - piper_phonemize:
+  #   refer to https://github.com/rhasspy/piper-phonemize,
+  #   could install the pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
+  # - espnet_tts_frontend:
+  #   `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
   if [ ! -e data/tokens.txt ]; then
-    ./local/prepare_token_file.py \
-      --manifest-file data/spectrogram/vctk_cuts_train.jsonl.gz \
-      --tokens data/tokens.txt
+    ./local/prepare_token_file.py --tokens data/tokens.txt
   fi
 fi
 
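Note that prepare_token_file.py no longer reads the training manifest: with espeak phonemes the token inventory is fixed by piper_phonemize rather than collected from the training texts, so only --tokens is needed. A hedged sketch of how such a token file could be produced from piper_phonemize's built-in phoneme-to-id map (get_espeak_map is exposed by the piper_phonemize package; the recipe's actual script may differ):

    # Sketch: write "token id" lines from piper_phonemize's espeak map.
    from piper_phonemize import get_espeak_map

    token2id = get_espeak_map()  # dict: phoneme -> [id]
    with open("data/tokens.txt", "w", encoding="utf-8") as f:
        for token, ids in sorted(token2id.items(), key=lambda kv: kv[1][0]):
            f.write(f"{token} {ids[0]}\n")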
@@ -231,8 +231,7 @@ def main():
     params.update(vars(args))
 
     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size
 
     with open(args.speakers) as f:
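This pattern recurs throughout the commit: params.blank_id is now taken from tokenizer.pad_id and params.oov_id disappears, since the espeak phoneme inventory is fixed and a single pad symbol doubles as the blank. A hypothetical minimal Tokenizer consistent with the attributes and call signatures in this diff (the recipe's real class lives elsewhere and may differ; the "_", "^", and "$" pad/sos/eos symbols are assumptions):

    # Hypothetical minimal Tokenizer matching the call sites in this diff.
    from typing import List


    class Tokenizer:
        def __init__(self, tokens_file: str):
            self.token2id = {}
            with open(tokens_file, encoding="utf-8") as f:
                for line in f:
                    token, idx = line.rstrip("\n").rsplit(" ", 1)
                    self.token2id[token] = int(idx)
            self.pad_id = self.token2id["_"]  # assumed pad/blank symbol
            self.sos_id = self.token2id["^"]  # assumed sos symbol
            self.eos_id = self.token2id["$"]  # assumed eos symbol
            self.vocab_size = len(self.token2id)

        def tokens_to_token_ids(
            self,
            tokens: List[List[str]],
            intersperse_blank: bool = True,
            add_sos: bool = False,
            add_eos: bool = False,
        ) -> List[List[int]]:
            token_ids = []
            for utt in tokens:
                # No oov_id anymore: unknown symbols are simply skipped here.
                ids = [self.token2id[t] for t in utt if t in self.token2id]
                if intersperse_blank:
                    # [a, b] -> [pad, a, pad, b, pad]
                    out = [self.pad_id] * (2 * len(ids) + 1)
                    out[1::2] = ids
                    ids = out
                if add_sos:
                    ids = [self.sos_id] + ids
                if add_eos:
                    ids = ids + [self.eos_id]
                token_ids.append(ids)
            return token_ids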
@@ -135,14 +135,16 @@ def infer_dataset(
         batch_size = len(batch["tokens"])
 
         tokens = batch["tokens"]
-        tokens = tokenizer.tokens_to_token_ids(tokens)
+        tokens = tokenizer.tokens_to_token_ids(
+            tokens, intersperse_blank=True, add_sos=True, add_eos=True
+        )
         tokens = k2.RaggedTensor(tokens)
         row_splits = tokens.shape.row_splits(1)
         tokens_lens = row_splits[1:] - row_splits[:-1]
         tokens = tokens.to(device)
         tokens_lens = tokens_lens.to(device)
         # tensor of shape (B, T)
-        tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id)
+        tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)
         speakers = (
             torch.Tensor([speaker_map[sid] for sid in batch["speakers"]])
             .int()
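The ragged-to-padded conversion above is worth unpacking: row_splits(1) gives cumulative offsets, so adjacent differences are the per-utterance lengths, and pad() produces the dense (B, T) tensor, now filled with pad_id instead of blank_id. A small self-contained sketch, assuming k2 is installed:

    # Sketch of the ragged -> padded conversion used in infer_dataset.
    import k2

    token_ids = [[5, 1, 9], [7, 2]]  # two utterances of different lengths
    tokens = k2.RaggedTensor(token_ids)

    row_splits = tokens.shape.row_splits(1)         # tensor([0, 3, 5])
    tokens_lens = row_splits[1:] - row_splits[:-1]  # tensor([3, 2])

    pad_id = 0  # placeholder for tokenizer.pad_id in this sketch
    padded = tokens.pad(mode="constant", padding_value=pad_id)
    print(padded)       # tensor([[5, 1, 9], [7, 2, 0]])
    print(tokens_lens)  # tensor([3, 2])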
@@ -214,8 +216,7 @@ def main():
     device = torch.device("cuda", 0)
 
     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size
 
     # we need cut ids to display recognition results.
@ -1,6 +1,7 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
#
|
#
|
||||||
# Copyright 2023 Xiaomi Corporation (Author: Zengwei Yao)
|
# Copyright 2023-2024 Xiaomi Corporation (Author: Zengwei Yao,
|
||||||
|
# Zengrui Jin,)
|
||||||
#
|
#
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
#
|
#
|
||||||
@@ -122,7 +123,9 @@ def main():
     model = OnnxModel(args.model_filename)
 
     text = "I went there to see the land, the people and how their system works, end quote."
-    tokens = tokenizer.texts_to_token_ids([text])
+    tokens = tokenizer.texts_to_token_ids(
+        [text], intersperse_blank=True, add_sos=True, add_eos=True
+    )
     tokens = torch.tensor(tokens)  # (1, T)
     tokens_lens = torch.tensor([tokens.shape[1]], dtype=torch.int64)  # (1, T)
     speaker = torch.tensor([1], dtype=torch.int64)  # (1, )
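The exported model then consumes exactly these inputs: tokens of shape (1, T) with blanks interspersed and sos/eos added, the corresponding length, and a speaker id. A hedged sketch of driving the exported ONNX model directly with onnxruntime; the file name and the input names "tokens", "tokens_lens", and "speaker" are assumptions for illustration and should be read off the actual model:

    # Sketch: run the exported VITS ONNX model with onnxruntime.
    import numpy as np
    import onnxruntime as ort

    sess = ort.InferenceSession("vits-vctk.onnx")  # hypothetical file name

    tokens = np.array([[0, 32, 0, 45, 0]], dtype=np.int64)     # (1, T)
    tokens_lens = np.array([tokens.shape[1]], dtype=np.int64)  # (1,)
    speaker = np.array([1], dtype=np.int64)                    # (1,)

    outputs = sess.run(
        None, {"tokens": tokens, "tokens_lens": tokens_lens, "speaker": speaker}
    )
    audio = outputs[0]  # assumed: generated waveform, shape (1, num_samples)
    print(audio.shape)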
@@ -342,14 +342,16 @@ def prepare_input(
         torch.Tensor([speaker_map[sid] for sid in batch["speakers"]]).int().to(device)
     )
 
-    tokens = tokenizer.tokens_to_token_ids(tokens)
+    tokens = tokenizer.texts_to_token_ids(
+        tokens, intersperse_blank=True, add_sos=True, add_eos=True
+    )
    tokens = k2.RaggedTensor(tokens)
     row_splits = tokens.shape.row_splits(1)
     tokens_lens = row_splits[1:] - row_splits[:-1]
     tokens = tokens.to(device)
     tokens_lens = tokens_lens.to(device)
     # a tensor of shape (B, T)
-    tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id)
+    tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)
 
     return audio, audio_lens, features, features_lens, tokens, tokens_lens, speakers
 
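The training-side prepare_input now mirrors the inference path: the same intersperse_blank/add_sos/add_eos flags and the same pad_id padding value, so the token sequences the model sees during training match what infer_dataset and the ONNX test feed it at inference time.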
@@ -812,8 +814,7 @@ def run(rank, world_size, args):
     logging.info(f"Device: {device}")
 
     tokenizer = Tokenizer(params.tokens)
-    params.blank_id = tokenizer.blank_id
-    params.oov_id = tokenizer.oov_id
+    params.blank_id = tokenizer.pad_id
     params.vocab_size = tokenizer.vocab_size
 
     vctk = VctkTtsDataModule(args)