Performed end-to-end testing of the VALL-E recipe (#1818)

* added the missing ``visualize`` function
* minor fixes
2025-08-08 17:42:21 +00:00 · 2024-12-06 16:14:51 +08:00 · 2024-12-06 16:14:51 +08:00 · 6e6b022e41
commit 6e6b022e41
parent bdd0f85704
5 changed files with 109 additions and 11 deletions
--- a/egs/wenetspeech4tts/TTS/local/compute_neural_codec_and_prepare_text_tokens.py
+++ b/egs/wenetspeech4tts/TTS/local/compute_neural_codec_and_prepare_text_tokens.py
@ -516,9 +516,19 @@ def main():
            for idx, part in enumerate(cut_sets):
                if args.audio_extractor:
                    if args.audio_extractor == "Encodec":
-                        storage_path = f"{args.output_dir}/{args.prefix}_encodec_{partition}_{idx if split > 1 else ''}"
+                        if split > 1:
                            storage_path = f"{args.output_dir}/{args.prefix}_encodec_{partition}_{idx}"
                        else:
                            storage_path = (
                                f"{args.output_dir}/{args.prefix}_encodec_{partition}"
                            )
                    else:
-                        storage_path = f"{args.output_dir}/{args.prefix}_fbank_{partition}_{idx if split > 1 else ''}"
+                        if split > 1:
                            storage_path = f"{args.output_dir}/{args.prefix}_fbank_{partition}_{idx}"
                        else:
                            storage_path = (
                                f"{args.output_dir}/{args.prefix}_fbank_{partition}"
                            )
                    if args.prefix.lower() in [
                        "ljspeech",
@ -587,9 +597,11 @@ def main():
                        ].normalized_text, "normalized_text is None"
                # Save each part with an index if split > 1
-                cuts_filename = (
+                if split > 1:
-                    f"{prefix}cuts_{partition}.{idx if split > 1 else ''}.{args.suffix}"
+                    cuts_filename = f"{prefix}cuts_{partition}.{idx}.{args.suffix}"
-                )
+                else:
                    cuts_filename = f"{prefix}cuts_{partition}.{args.suffix}"
                part.to_file(f"{args.output_dir}/{cuts_filename}")
                logging.info(f"Saved {cuts_filename}")
--- a/egs/wenetspeech4tts/TTS/valle/infer.py
+++ b/egs/wenetspeech4tts/TTS/valle/infer.py
@ -86,7 +86,7 @@ def get_args():
    parser.add_argument(
        "--checkpoint",
        type=str,
-        default="exp/vallf_nano_full/checkpoint-100000.pt",
+        default="./valle/exp/checkpoint-100000.pt",
        help="Path to the saved checkpoint.",
    )
--- a/egs/wenetspeech4tts/TTS/valle/requirements.txt
+++ b/egs/wenetspeech4tts/TTS/valle/requirements.txt
@ -0,0 +1,2 @@
 phonemizer==3.2.1
 git+https://github.com/facebookresearch/encodec.git
--- a/egs/wenetspeech4tts/TTS/valle/train.py
+++ b/egs/wenetspeech4tts/TTS/valle/train.py
@ -4,6 +4,7 @@
 #                                                       Mingshuang Luo)
 # Copyright    2023                           (authors: Feiteng Li)
 # Copyright    2024                           (authors: Yuekai Zhang)
 # Copyright    2024  Tsinghua University      (authors: Zengrui Jin,)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
@ -48,10 +49,8 @@ python3 valle/train.py --max-duration 160 --filter-min-duration 0.5 --filter-max
 import argparse
 import copy
 import logging
 import os
 import random
 import warnings
 from contextlib import nullcontext
 from pathlib import Path
 from shutil import copyfile
 from typing import Any, Dict, Optional, Tuple, Union
@ -216,7 +215,7 @@ def get_parser():
    parser.add_argument(
        "--exp-dir",
        type=str,
-        default="exp/valle_dev",
+        default="./valle/exp",
        help="""The experiment dir.
        It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
@ -686,9 +685,9 @@ def compute_validation_loss(
        output_dir = Path(f"{params.exp_dir}/eval/step-{params.batch_idx_train:06d}")
        output_dir.mkdir(parents=True, exist_ok=True)
        if isinstance(model, DDP):
-            model.module.visualize(predicts, batch, output_dir=output_dir)
+            model.module.visualize(predicts, batch, tokenizer, output_dir=output_dir)
        else:
-            model.visualize(predicts, batch, output_dir=output_dir)
+            model.visualize(predicts, batch, tokenizer, output_dir=output_dir)
    return tot_loss
--- a/egs/wenetspeech4tts/TTS/valle/valle.py
+++ b/egs/wenetspeech4tts/TTS/valle/valle.py
@ -19,8 +19,11 @@ import random
 from functools import partial
 from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
 import matplotlib.pyplot as plt
 import numpy as np
 import torch
 import torch.nn as nn
 from tokenizer import TextTokenCollater
 from torch import Tensor
 from torch.nn import Linear, Module
 from torch.nn import functional as F
@ -1658,6 +1661,88 @@ class VALLE(nn.Module):
        assert len(codes) == 8
        return torch.stack(codes, dim=-1)
    def visualize(
        self,
        predicts: Tuple[torch.Tensor],
        batch: Dict[str, Union[List, torch.Tensor]],
        tokenizer: TextTokenCollater,
        output_dir: str,
        limit: int = 4,
    ) -> None:
        """Save one diagnostic figure per utterance for a batch.

        Each figure stacks three heat maps: the encoder output, the decoder
        output and the decoder target (``batch["features"]``). A red vertical
        line marks the unpadded length of each item: the text-token length on
        the encoder panel and ``features_lens`` on the two decoder panels.

        Args:
          predicts:
            Model outputs. ``predicts[0]`` is drawn as the encoder output.
            ``predicts[1]`` is drawn as the decoder output; when it is a
            list, only its last element is used.
          batch:
            Batch dict. The keys ``"features"``, ``"features_lens"``,
            ``"tokens"``, ``"text"`` and ``"cut"`` are read; every entry of
            ``batch["cut"]`` must expose an ``id`` attribute.
          tokenizer:
            Callable applied to ``batch["tokens"]``; it must return
            ``(text_tokens, text_tokens_lens)`` with a 2-D ``text_tokens``.
          output_dir:
            Existing directory that receives ``<utt_id>.png``. It is
            interpolated into an f-string, so path-like objects work too.
          limit:
            Maximum number of utterances (from the start of the batch) to
            plot.
        """
        audio_features = batch["features"].to("cpu").detach().numpy()
        audio_features_lens = batch["features_lens"].to("cpu").detach().numpy()

        tokens = batch["tokens"]
        text_tokens, text_tokens_lens = tokenizer(tokens)
        assert text_tokens.ndim == 2

        texts = batch["text"]
        utt_ids = [cut.id for cut in batch["cut"]]

        encoder_outputs = predicts[0].to("cpu").type(torch.float32).detach().numpy()

        decoder_outputs = predicts[1]
        if isinstance(decoder_outputs, list):
            decoder_outputs = decoder_outputs[-1]

        # Pick the colour range from the tensor's ORIGINAL dtype.  The array
        # is cast to float32 just below for plotting, so testing the dtype
        # after the cast would always report float32 and the Encodec range
        # would never be selected.
        is_float_output = decoder_outputs.is_floating_point()
        decoder_outputs = decoder_outputs.to("cpu").type(torch.float32).detach().numpy()

        if is_float_output:
            vmin, vmax = -6, 0  # Fbank
        else:
            vmin, vmax = 0, 1024  # Encodec

        num_figures = 3
        colormap = plt.get_cmap("jet")

        def draw_panel(row, image, marker, xlabel, title=None, lo=None, hi=None):
            # Draw a single heat map in row `row` of the current figure.
            # lo/hi left as None let matplotlib autoscale the colour range.
            plt.subplot(num_figures, 1, row)
            if title is not None:
                plt.title(title)
            plt.imshow(
                X=np.transpose(image),
                cmap=colormap,
                aspect="auto",
                interpolation="nearest",
                vmin=lo,
                vmax=hi,
            )
            plt.gca().invert_yaxis()
            plt.axvline(x=marker - 0.4, linewidth=2, color="r")
            plt.xlabel(xlabel)
            plt.colorbar()

        for b, (utt_id, text) in enumerate(zip(utt_ids[:limit], texts[:limit])):
            _ = plt.figure(figsize=(14, 8 * num_figures))

            S = text_tokens_lens[b]
            T = audio_features_lens[b]

            # encoder
            draw_panel(1, encoder_outputs[b], S, "Encoder Output", title=f"Text: {text}")

            # decoder
            draw_panel(2, decoder_outputs[b], T, "Decoder Output", lo=vmin, hi=vmax)

            # target
            draw_panel(3, audio_features[b], T, "Decoder Target", lo=vmin, hi=vmax)

            plt.savefig(f"{output_dir}/{utt_id}.png")
            plt.close()
 # https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py
 def top_k_top_p_filtering(
		`@ -0,0 +1,2 @@`
							`phonemizer==3.2.1`
							`git+https://github.com/facebookresearch/encodec.git`