minor updates

2025-08-14 04:22:21 +00:00 · 2024-10-21 18:55:18 +08:00 · 2024-10-21 18:55:18 +08:00 · 3ac1331b27
commit 3ac1331b27
parent d56f8a7894
6 changed files with 48 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -334,6 +334,7 @@ We provide a Colab notebook to test the pre-trained model: [![Open In Colab](htt

  - [LJSpeech][ljspeech]
  - [VCTK][vctk]
+  - [LibriTTS][libritts_tts]

 ### Supported Models

@ -373,6 +374,7 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad
 [commonvoice]: egs/commonvoice/ASR
 [csj]: egs/csj/ASR
 [libricss]: egs/libricss/SURT
+[libritts_asr]: egs/libritts/ASR
 [libriheavy]: egs/libriheavy/ASR
 [mgb2]: egs/mgb2/ASR
 [peoplespeech]: egs/peoples_speech/ASR
@ -382,3 +384,4 @@ Please see: [![Open In Colab](https://colab.research.google.com/assets/colab-bad

 [vctk]: egs/vctk/TTS
 [ljspeech]: egs/ljspeech/TTS
+[libritts_tts]: egs/libritts/TTS
--- a/egs/libritts/TTS/README.md
+++ b/egs/libritts/TTS/README.md
@ -10,6 +10,20 @@ The main differences from the LibriSpeech corpus are listed below:
 5. Utterances with significant background noise are excluded.
 For more information, refer to the paper "LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech", Heiga Zen, Viet Dang, Rob Clark, Yu Zhang, Ron J. Weiss, Ye Jia, Zhifeng Chen, and Yonghui Wu, arXiv, 2019. If you use the LibriTTS corpus in your work, please cite this paper where it was introduced.

+> [!CAUTION]
+> The next-gen Kaldi framework provides tools and models for generating high-quality, synthetic speech (Text-to-Speech, TTS). 
+> While these recipes have the potential to advance various fields such as accessibility, language education, and AI-driven solutions, they also carry certain ethical and legal responsibilities.
+> 
+> By using this framework, you agree to the following:
+> 1.	Legal and Ethical Use: You shall not use this framework, or any models derived from it, for any unlawful or unethical purposes. This includes, but is not limited to: creating voice clones without the explicit, informed consent of the individual whose voice is being cloned; engaging in any form of identity theft, impersonation, or fraud using cloned voices; and violating any local, national, or international laws regarding privacy, intellectual property, or personal data.
+> 
+> 2.	Responsibility of Use: The users of this framework are solely responsible for ensuring that their use of voice cloning technologies complies with all applicable laws and ethical guidelines. We explicitly disclaim any liability for misuse of the technology.
+> 
+> 3.	Attribution and Use of Open-Source Components: This project is provided under the Apache 2.0 license. Users must adhere to the terms of this license and provide appropriate attribution when required.
+> 
+> 4.	No Warranty: This framework is provided “as-is,” without warranty of any kind, either express or implied. We do not guarantee that the use of this software will comply with legal requirements or that it will not infringe the rights of third parties.
+
+
 # VITS

 This recipe provides a VITS model trained on the LibriTTS dataset.
--- a/egs/libritts/TTS/local/prepare_tokens_libritts.py
+++ b/egs/libritts/TTS/local/prepare_tokens_libritts.py
@ -39,6 +39,7 @@ def remove_punc_to_upper(text: str) -> str:
    s = " ".join("".join(s_list).split()).strip()
    return s

+
 def prepare_tokens_libritts():
    output_dir = Path("data/spectrogram")
    prefix = "libritts"
@ -72,7 +73,7 @@ def prepare_tokens_libritts():
                tokens.extend(t)
            cut.tokens = tokens
            cut.supervisions[0].normalized_text = remove_punc_to_upper(text)
-            
+
            new_cuts.append(cut)

        new_cut_set = CutSet.from_cuts(new_cuts)
--- a/egs/libritts/TTS/vits/infer.py
+++ b/egs/libritts/TTS/vits/infer.py
@ -34,9 +34,11 @@ from pathlib import Path
 from typing import Dict, List

 import k2
+import numpy as np
 import torch
 import torch.nn as nn
 import torchaudio
+from lhotse.features.io import KaldiReader
 from tokenizer import Tokenizer
 from train import get_model, get_params
 from tts_datamodule import LibrittsTtsDataModule
@ -82,7 +84,7 @@ def infer_dataset(
    params: AttributeDict,
    model: nn.Module,
    tokenizer: Tokenizer,
-    speaker_map: Dict[str, int],
+    speaker_map: KaldiReader,
 ) -> None:
    """Decode dataset.
    The ground-truth and generated audio pairs will be saved to `params.save_wav_dir`.
@ -145,20 +147,21 @@ def infer_dataset(
            tokens_lens = tokens_lens.to(device)
            # tensor of shape (B, T)
            tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)
-            speakers = (
-                torch.Tensor([speaker_map[sid] for sid in batch["speakers"]])
-                .int()
-                .to(device)
-            )

            audio = batch["audio"]
            audio_lens = batch["audio_lens"].tolist()
            cut_ids = [cut.id for cut in batch["cut"]]
+            sids = ["_".join(cut_id.split("_")[:2]) for cut_id in cut_ids]
+            speakers = (
+                torch.Tensor(np.array([speaker_map.read(sid) for sid in sids]))
+                .squeeze(1)
+                .to(device)
+            )

            audio_pred, _, durations = model.inference_batch(
                text=tokens,
                text_lengths=tokens_lens,
-                sids=speakers,
+                spembs=speakers,
            )
            audio_pred = audio_pred.detach().cpu()
            # convert to samples
@ -222,8 +225,6 @@ def main():
    # we need cut ids to display recognition results.
    args.return_cuts = True
    libritts = LibrittsTtsDataModule(args)
-    speaker_map = libritts.speakers()
-    params.num_spks = len(speaker_map)

    logging.info(f"Device: {device}")
    logging.info(params)
@ -242,17 +243,23 @@ def main():
    logging.info(f"Number of parameters in discriminator: {num_param_d}")
    logging.info(f"Total number of parameters: {num_param_g + num_param_d}")

-    test_cuts = libritts.test_cuts()
-    test_dl = libritts.test_dataloaders(test_cuts)
+    test_clean_cuts = libritts.test_clean_cuts()
+    test_clean_speaker_map = libritts.test_clean_xvector()
+    test_clean_dl = libritts.test_dataloaders(test_clean_cuts)

-    valid_cuts = libritts.valid_cuts()
-    valid_dl = libritts.valid_dataloaders(valid_cuts)
+    dev_clean_cuts = libritts.dev_clean_cuts()
+    dev_clean_speaker_map = libritts.dev_clean_xvector()
+    dev_clean_dl = libritts.dev_dataloaders(dev_clean_cuts)

-    infer_sets = {"test": test_dl, "valid": valid_dl}
+    infer_sets = {
+        "test-clean": (test_clean_dl, test_clean_speaker_map),
+        "dev-clean": (dev_clean_dl, dev_clean_speaker_map),
+    }

-    for subset, dl in infer_sets.items():
+    for subset, data in infer_sets.items():
        save_wav_dir = params.res_dir / "wav" / subset
        save_wav_dir.mkdir(parents=True, exist_ok=True)
+        dl, speaker_map = data

        logging.info(f"Processing {subset} set, saving to {save_wav_dir}")

--- a/egs/libritts/TTS/vits/tts_datamodule.py
+++ b/egs/libritts/TTS/vits/tts_datamodule.py
@ -272,21 +272,21 @@ class LibrittsTtsDataModule:
                feature_input_strategy=eval(self.args.input_strategy)(),
                return_cuts=self.args.return_cuts,
            )
-        valid_sampler = DynamicBucketingSampler(
+        dev_sampler = DynamicBucketingSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            shuffle=False,
        )
        logging.info("About to create valid dataloader")
-        valid_dl = DataLoader(
+        dev_dl = DataLoader(
            validate,
-            sampler=valid_sampler,
+            sampler=dev_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )

-        return valid_dl
+        return dev_dl

    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        logging.info("About to create test dataset")
--- a/egs/ljspeech/TTS/vits/vits.py
+++ b/egs/ljspeech/TTS/vits/vits.py
@ -622,6 +622,7 @@ class VITS(nn.Module):
        text: torch.Tensor,
        text_lengths: torch.Tensor,
        sids: Optional[torch.Tensor] = None,
+        spembs: Optional[torch.Tensor] = None,
        durations: Optional[torch.Tensor] = None,
        noise_scale: float = 0.667,
        noise_scale_dur: float = 0.8,
@ -635,6 +636,7 @@ class VITS(nn.Module):
            text (Tensor): Input text index tensor (B, T_text).
            text_lengths (Tensor): Input text index tensor (B,).
            sids (Tensor): Speaker index tensor (B,).
+            spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
            noise_scale (float): Noise scale value for flow.
            noise_scale_dur (float): Noise scale value for duration predictor.
            alpha (float): Alpha parameter to control the speed of generated speech.
@ -650,6 +652,7 @@ class VITS(nn.Module):
            text=text,
            text_lengths=text_lengths,
            sids=sids,
+            spembs=spembs,
            noise_scale=noise_scale,
            noise_scale_dur=noise_scale_dur,
            alpha=alpha,