mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-14 04:22:21 +00:00
minor updates
This commit is contained in:
parent
d56f8a7894
commit
3ac1331b27
@ -334,6 +334,7 @@ We provide a Colab notebook to test the pre-trained model: [.
|
||||
> While these recipes have the potential to advance various fields such as accessibility, language education, and AI-driven solutions, they also carry certain ethical and legal responsibilities.
|
||||
>
|
||||
> By using this framework, you agree to the following:
|
||||
> 1. Legal and Ethical Use: You shall not use this framework, or any models derived from it, for any unlawful or unethical purposes. This includes, but is not limited to: Creating voice clones without the explicit, informed consent of the individual whose voice is being cloned. Engaging in any form of identity theft, impersonation, or fraud using cloned voices. Violating any local, national, or international laws regarding privacy, intellectual property, or personal data.
|
||||
>
|
||||
> 2. Responsibility of Use: The users of this framework are solely responsible for ensuring that their use of voice cloning technologies complies with all applicable laws and ethical guidelines. We explicitly disclaim any liability for misuse of the technology.
|
||||
>
|
||||
> 3. Attribution and Use of Open-Source Components: This project is provided under the Apache 2.0 license. Users must adhere to the terms of this license and provide appropriate attribution when required.
|
||||
>
|
||||
> 4. No Warranty: This framework is provided “as-is,” without warranty of any kind, either express or implied. We do not guarantee that the use of this software will comply with legal requirements or that it will not infringe the rights of third parties.
|
||||
|
||||
|
||||
# VITS
|
||||
|
||||
This recipe provides a VITS model trained on the LibriTTS dataset.
|
||||
|
@ -39,6 +39,7 @@ def remove_punc_to_upper(text: str) -> str:
|
||||
s = " ".join("".join(s_list).split()).strip()
|
||||
return s
|
||||
|
||||
|
||||
def prepare_tokens_libritts():
|
||||
output_dir = Path("data/spectrogram")
|
||||
prefix = "libritts"
|
||||
@ -72,7 +73,7 @@ def prepare_tokens_libritts():
|
||||
tokens.extend(t)
|
||||
cut.tokens = tokens
|
||||
cut.supervisions[0].normalized_text = remove_punc_to_upper(text)
|
||||
|
||||
|
||||
new_cuts.append(cut)
|
||||
|
||||
new_cut_set = CutSet.from_cuts(new_cuts)
|
||||
|
@ -34,9 +34,11 @@ from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
import k2
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchaudio
|
||||
from lhotse.features.io import KaldiReader
|
||||
from tokenizer import Tokenizer
|
||||
from train import get_model, get_params
|
||||
from tts_datamodule import LibrittsTtsDataModule
|
||||
@ -82,7 +84,7 @@ def infer_dataset(
|
||||
params: AttributeDict,
|
||||
model: nn.Module,
|
||||
tokenizer: Tokenizer,
|
||||
speaker_map: Dict[str, int],
|
||||
speaker_map: KaldiReader,
|
||||
) -> None:
|
||||
"""Decode dataset.
|
||||
The ground-truth and generated audio pairs will be saved to `params.save_wav_dir`.
|
||||
@ -145,20 +147,21 @@ def infer_dataset(
|
||||
tokens_lens = tokens_lens.to(device)
|
||||
# tensor of shape (B, T)
|
||||
tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)
|
||||
speakers = (
|
||||
torch.Tensor([speaker_map[sid] for sid in batch["speakers"]])
|
||||
.int()
|
||||
.to(device)
|
||||
)
|
||||
|
||||
audio = batch["audio"]
|
||||
audio_lens = batch["audio_lens"].tolist()
|
||||
cut_ids = [cut.id for cut in batch["cut"]]
|
||||
sids = ["_".join(cut_id.split("_")[:2]) for cut_id in cut_ids]
|
||||
speakers = (
|
||||
torch.Tensor(np.array([speaker_map.read(sid) for sid in sids]))
|
||||
.squeeze(1)
|
||||
.to(device)
|
||||
)
|
||||
|
||||
audio_pred, _, durations = model.inference_batch(
|
||||
text=tokens,
|
||||
text_lengths=tokens_lens,
|
||||
sids=speakers,
|
||||
spembs=speakers,
|
||||
)
|
||||
audio_pred = audio_pred.detach().cpu()
|
||||
# convert to samples
|
||||
@ -222,8 +225,6 @@ def main():
|
||||
# we need cut ids to display recognition results.
|
||||
args.return_cuts = True
|
||||
libritts = LibrittsTtsDataModule(args)
|
||||
speaker_map = libritts.speakers()
|
||||
params.num_spks = len(speaker_map)
|
||||
|
||||
logging.info(f"Device: {device}")
|
||||
logging.info(params)
|
||||
@ -242,17 +243,23 @@ def main():
|
||||
logging.info(f"Number of parameters in discriminator: {num_param_d}")
|
||||
logging.info(f"Total number of parameters: {num_param_g + num_param_d}")
|
||||
|
||||
test_cuts = libritts.test_cuts()
|
||||
test_dl = libritts.test_dataloaders(test_cuts)
|
||||
test_clean_cuts = libritts.test_clean_cuts()
|
||||
test_clean_speaker_map = libritts.test_clean_xvector()
|
||||
test_clean_dl = libritts.test_dataloaders(test_clean_cuts)
|
||||
|
||||
valid_cuts = libritts.valid_cuts()
|
||||
valid_dl = libritts.valid_dataloaders(valid_cuts)
|
||||
dev_clean_cuts = libritts.dev_clean_cuts()
|
||||
dev_clean_speaker_map = libritts.dev_clean_xvector()
|
||||
dev_clean_dl = libritts.dev_dataloaders(dev_clean_cuts)
|
||||
|
||||
infer_sets = {"test": test_dl, "valid": valid_dl}
|
||||
infer_sets = {
|
||||
"test-clean": (test_clean_dl, test_clean_speaker_map),
|
||||
"dev-clean": (dev_clean_dl, dev_clean_speaker_map),
|
||||
}
|
||||
|
||||
for subset, dl in infer_sets.items():
|
||||
for subset, data in infer_sets.items():
|
||||
save_wav_dir = params.res_dir / "wav" / subset
|
||||
save_wav_dir.mkdir(parents=True, exist_ok=True)
|
||||
dl, speaker_map = data
|
||||
|
||||
logging.info(f"Processing {subset} set, saving to {save_wav_dir}")
|
||||
|
||||
|
@ -272,21 +272,21 @@ class LibrittsTtsDataModule:
|
||||
feature_input_strategy=eval(self.args.input_strategy)(),
|
||||
return_cuts=self.args.return_cuts,
|
||||
)
|
||||
valid_sampler = DynamicBucketingSampler(
|
||||
dev_sampler = DynamicBucketingSampler(
|
||||
cuts_valid,
|
||||
max_duration=self.args.max_duration,
|
||||
shuffle=False,
|
||||
)
|
||||
logging.info("About to create valid dataloader")
|
||||
valid_dl = DataLoader(
|
||||
dev_dl = DataLoader(
|
||||
validate,
|
||||
sampler=valid_sampler,
|
||||
sampler=dev_sampler,
|
||||
batch_size=None,
|
||||
num_workers=2,
|
||||
persistent_workers=False,
|
||||
)
|
||||
|
||||
return valid_dl
|
||||
return dev_dl
|
||||
|
||||
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
|
||||
logging.info("About to create test dataset")
|
||||
|
@ -622,6 +622,7 @@ class VITS(nn.Module):
|
||||
text: torch.Tensor,
|
||||
text_lengths: torch.Tensor,
|
||||
sids: Optional[torch.Tensor] = None,
|
||||
spembs: Optional[torch.Tensor] = None,
|
||||
durations: Optional[torch.Tensor] = None,
|
||||
noise_scale: float = 0.667,
|
||||
noise_scale_dur: float = 0.8,
|
||||
@ -635,6 +636,7 @@ class VITS(nn.Module):
|
||||
text (Tensor): Input text index tensor (B, T_text).
|
||||
text_lengths (Tensor): Text length tensor (B,).
|
||||
sids (Tensor): Speaker index tensor (B,).
|
||||
spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
|
||||
noise_scale (float): Noise scale value for flow.
|
||||
noise_scale_dur (float): Noise scale value for duration predictor.
|
||||
alpha (float): Alpha parameter to control the speed of generated speech.
|
||||
@ -650,6 +652,7 @@ class VITS(nn.Module):
|
||||
text=text,
|
||||
text_lengths=text_lengths,
|
||||
sids=sids,
|
||||
spembs=spembs,
|
||||
noise_scale=noise_scale,
|
||||
noise_scale_dur=noise_scale_dur,
|
||||
alpha=alpha,
|
||||
|
Loading…
x
Reference in New Issue
Block a user