add dit

2024-12-23 17:38:12 +08:00 · 2024-12-23 17:38:12 +08:00 · 3ba6febe4f
commit 3ba6febe4f
parent ec5cc5526e
3 changed files with 418 additions and 1 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -11,7 +11,8 @@ repos:
    rev: 5.0.4
    hooks:
      - id: flake8
-        args: ["--max-line-length=88", "--extend-ignore=E203,E266,E501,F401,E402,F403,F841,W503"]
+        args: ["--max-line-length=88", "--extend-ignore=E203,E266,E501,F401,E402,F403,F841,W503, F722, F821"]
+          #exclude:

      # What are we ignoring here?
      # E203: whitespace before ':'
--- a/egs/wenetspeech4tts/TTS/f5-tts/model/dit.py
+++ b/egs/wenetspeech4tts/TTS/f5-tts/model/dit.py
@ -0,0 +1,210 @@
+"""
+ein notation:
+b - batch
+n - sequence
+nt - text sequence
+nw - raw wave length
+d - dimension
+"""
+
+from __future__ import annotations
+
+import torch
+import torch.nn.functional as F
+from model.modules import (
+    AdaLayerNormZero_Final,
+    ConvNeXtV2Block,
+    ConvPositionEmbedding,
+    DiTBlock,
+    TimestepEmbedding,
+    get_pos_embed_indices,
+    precompute_freqs_cis,
+)
+from torch import nn
+from x_transformers.x_transformers import RotaryEmbedding
+
+# Text embedding
+
+
+class TextEmbedding(nn.Module):
+    """Turn padded text token ids into a ``seq_len``-long sequence of embeddings.
+
+    Ids are shifted by one so that row 0 of the embedding table serves as the
+    filler token. With ``conv_layers > 0`` a precomputed positional table is
+    added and the result goes through a stack of ``ConvNeXtV2Block`` modules.
+
+    Args:
+        text_num_embeds: number of text tokens; the table gets one extra row
+            for the filler token.
+        text_dim: width of each text embedding.
+        conv_layers: how many ConvNeXtV2 blocks to stack (0 disables them).
+        conv_mult: each block is built as
+            ``ConvNeXtV2Block(text_dim, text_dim * conv_mult)``.
+    """
+
+    def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
+        super().__init__()
+        # Row 0 is reserved for the filler token, hence the "+ 1".
+        self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim)
+
+        if conv_layers <= 0:
+            self.extra_modeling = False
+            return
+
+        self.extra_modeling = True
+        self.precompute_max_pos = 4096  # ~44s of 24khz audio
+        pos_table = precompute_freqs_cis(text_dim, self.precompute_max_pos)
+        # Non-persistent buffer: it is kept out of the module's state_dict.
+        self.register_buffer("freqs_cis", pos_table, persistent=False)
+        conv_stack = [
+            ConvNeXtV2Block(text_dim, text_dim * conv_mult)
+            for _ in range(conv_layers)
+        ]
+        self.text_blocks = nn.Sequential(*conv_stack)
+
+    def forward(self, text: int["b nt"], seq_len, drop_text=False):  # noqa: F722
+        """Embed ``text`` and fit it to ``seq_len`` positions.
+
+        Args:
+            text: token ids, padded with -1 (see list_str_to_idx()).
+            seq_len: target length; longer text is cut, shorter text is
+                right-padded with the filler token.
+            drop_text: if True every position becomes the filler token
+                (classifier-free guidance for text).
+
+        Returns:
+            The embedded text, one ``text_dim`` vector per position.
+        """
+        # After the shift, the -1 batch padding lands on filler index 0.
+        tokens = text + 1
+        # Curtail if character tokens are more than the mel spec tokens.
+        tokens = tokens[:, :seq_len]
+        batch = tokens.shape[0]
+        text_len = tokens.shape[1]
+        tokens = F.pad(tokens, (0, seq_len - text_len), value=0)
+
+        if drop_text:  # cfg for text
+            tokens = torch.zeros_like(tokens)
+
+        embedded = self.text_embed(tokens)  # b n -> b n d
+
+        if not self.extra_modeling:
+            return embedded
+
+        # Positional term: every batch item starts at position 0.
+        batch_start = torch.zeros((batch,), dtype=torch.long)
+        pos_idx = get_pos_embed_indices(
+            batch_start, seq_len, max_pos=self.precompute_max_pos
+        )
+        embedded = embedded + self.freqs_cis[pos_idx]
+
+        # convnextv2 blocks
+        return self.text_blocks(embedded)
+
+
+# noised input audio and context mixing embedding
+
+
+class InputEmbedding(nn.Module):
+    """Fuse noised audio, conditioning audio and text embeddings into one sequence.
+
+    The three inputs are concatenated on the last axis, projected to
+    ``out_dim`` and then summed with the output of ``ConvPositionEmbedding``.
+
+    Args:
+        mel_dim: feature size of both the noised input and the conditioning audio.
+        text_dim: feature size of the text embedding.
+        out_dim: feature size produced by the projection.
+    """
+
+    def __init__(self, mel_dim, text_dim, out_dim):
+        super().__init__()
+        # Input width = noised audio + conditioning audio + text.
+        self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
+        self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)
+
+    def forward(
+        self,
+        x: float["b n d"],
+        cond: float["b n d"],
+        text_embed: float["b n d"],
+        drop_audio_cond=False,
+    ):  # noqa: F722
+        """Project the concatenated inputs and add the convolutional position term.
+
+        When ``drop_audio_cond`` is True the conditioning audio is replaced by
+        zeros (classifier-free guidance for the audio condition).
+        """
+        audio_cond = torch.zeros_like(cond) if drop_audio_cond else cond
+        fused = torch.cat((x, audio_cond, text_embed), dim=-1)
+        projected = self.proj(fused)
+        return self.conv_pos_embed(projected) + projected
+
+
+# Transformer backbone using DiT blocks
+
+
+class DiT(nn.Module):
+    """Transformer backbone built from a stack of ``DiTBlock`` layers.
+
+    The forward pass embeds the time step, embeds the text, mixes noised
+    audio / conditioning audio / text into one sequence, runs it through
+    ``depth`` blocks and projects the result back to ``mel_dim`` features.
+
+    Args:
+        dim: model width used by the blocks and the embeddings.
+        depth: number of ``DiTBlock`` layers.
+        heads, dim_head, dropout, ff_mult: forwarded to every ``DiTBlock``.
+        mel_dim: feature size of the audio input and of the output.
+        text_num_embeds: number of text tokens handed to ``TextEmbedding``.
+        text_dim: text embedding width; falls back to ``mel_dim`` when None.
+        conv_layers: forwarded to ``TextEmbedding``.
+        long_skip_connection: if True, the block-stack input is concatenated
+            with its output and projected back to ``dim`` by a bias-free
+            linear layer.
+        checkpoint_activations: if True, every block is run through
+            ``torch.utils.checkpoint.checkpoint``.
+    """
+
+    def __init__(
+        self,
+        *,
+        dim,
+        depth=8,
+        heads=8,
+        dim_head=64,
+        dropout=0.1,
+        ff_mult=4,
+        mel_dim=100,
+        text_num_embeds=256,
+        text_dim=None,
+        conv_layers=0,
+        long_skip_connection=False,
+        checkpoint_activations=False,
+    ):
+        super().__init__()
+
+        self.time_embed = TimestepEmbedding(dim)
+        if text_dim is None:
+            text_dim = mel_dim
+        self.text_embed = TextEmbedding(
+            text_num_embeds, text_dim, conv_layers=conv_layers
+        )
+        self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
+
+        self.rotary_embed = RotaryEmbedding(dim_head)
+
+        self.dim = dim
+        self.depth = depth
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                DiTBlock(
+                    dim=dim,
+                    heads=heads,
+                    dim_head=dim_head,
+                    ff_mult=ff_mult,
+                    dropout=dropout,
+                )
+                for _ in range(depth)
+            ]
+        )
+        # Optional projection that merges the block-stack input with its output.
+        self.long_skip_connection = (
+            nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None
+        )
+
+        self.norm_out = AdaLayerNormZero_Final(dim)  # final modulation
+        self.proj_out = nn.Linear(dim, mel_dim)
+
+        self.checkpoint_activations = checkpoint_activations
+
+    def ckpt_wrapper(self, module):
+        """Wrap ``module`` in a plain function taking positional inputs only."""
+        # https://github.com/chuanyangjin/fast-DiT/blob/main/models.py
+        def ckpt_forward(*inputs):
+            outputs = module(*inputs)
+            return outputs
+
+        return ckpt_forward
+
+    def forward(
+        self,
+        x: float["b n d"],  # noised input audio  # noqa: F722
+        cond: float["b n d"],  # masked cond audio  # noqa: F722
+        text: int["b nt"],  # text  # noqa: F722
+        time: float["b"] | float[""],  # time step  # noqa: F821 F722
+        drop_audio_cond,  # cfg for cond audio
+        drop_text,  # cfg for text
+        mask: bool["b n"] | None = None,  # noqa: F722
+    ):
+        """Run the backbone and return a tensor whose last dimension is ``mel_dim``.
+
+        A 0-dim ``time`` is repeated so that every batch item gets a time step.
+        """
+        batch, seq_len = x.shape[0], x.shape[1]
+        if time.ndim == 0:
+            time = time.repeat(batch)
+
+        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
+        t = self.time_embed(time)
+        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
+        x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
+
+        rope = self.rotary_embed.forward_from_seq_len(seq_len)
+
+        if self.long_skip_connection is not None:
+            residual = x
+
+        if self.checkpoint_activations:
+            # Import the submodule explicitly: a bare ``import torch`` is not
+            # guaranteed to make ``torch.utils.checkpoint`` available. The
+            # ``from`` form avoids rebinding ``torch`` as a local name.
+            from torch.utils.checkpoint import checkpoint
+
+            for block in self.transformer_blocks:
+                # checkpoint() gets positional inputs, so mask and rope are
+                # passed by position here.
+                x = checkpoint(self.ckpt_wrapper(block), x, t, mask, rope)
+        else:
+            for block in self.transformer_blocks:
+                x = block(x, t, mask=mask, rope=rope)
+
+        if self.long_skip_connection is not None:
+            x = self.long_skip_connection(torch.cat((x, residual), dim=-1))
+
+        x = self.norm_out(x, t)
+        output = self.proj_out(x)
+
+        return output
--- a/egs/wenetspeech4tts/TTS/f5-tts/model/utils.py
+++ b/egs/wenetspeech4tts/TTS/f5-tts/model/utils.py
@ -0,0 +1,206 @@
+from __future__ import annotations
+
+import os
+import random
+from collections import defaultdict
+from importlib.resources import files
+
+import jieba
+import torch
+from pypinyin import Style, lazy_pinyin
+from torch.nn.utils.rnn import pad_sequence
+
+# seed everything
+
+
+def seed_everything(seed=0):
+    """Seed the Python and PyTorch (CPU and CUDA) RNGs and pin cuDNN settings.
+
+    Also exports ``seed`` as the ``PYTHONHASHSEED`` environment variable,
+    switches cuDNN to deterministic mode and turns its benchmark mode off.
+    """
+    os.environ["PYTHONHASHSEED"] = str(seed)
+
+    seeders = (
+        random.seed,
+        torch.manual_seed,
+        torch.cuda.manual_seed,
+        torch.cuda.manual_seed_all,
+    )
+    for apply_seed in seeders:
+        apply_seed(seed)
+
+    cudnn = torch.backends.cudnn
+    cudnn.deterministic = True
+    cudnn.benchmark = False
+
+
+# helpers
+
+
+def exists(v):
+    """Return False only for ``None``; any other value counts as existing."""
+    if v is None:
+        return False
+    return True
+
+
+def default(v, d):
+    """Return ``v`` unless it is None, in which case return the fallback ``d``."""
+    if exists(v):
+        return v
+    return d
+
+
+# tensor helpers
+
+
+def lens_to_mask(
+    t: int["b"], length: int | None = None
+) -> bool["b n"]:  # noqa: F722 F821
+    """Build a boolean mask that is True at positions below each entry of ``t``.
+
+    Args:
+        t: per-item lengths.
+        length: number of mask columns; defaults to the largest value in ``t``.
+    """
+    width = length if exists(length) else t.amax()
+    positions = torch.arange(width, device=t.device)
+    # Compare every position (columns) against every length (rows).
+    return positions.unsqueeze(0) < t.unsqueeze(1)
+
+
+def mask_from_start_end_indices(
+    seq_len: int["b"], start: int["b"], end: int["b"]
+):  # noqa: F722 F821
+    """Return a boolean mask that is True on ``[start, end)`` for every batch item.
+
+    The mask has as many columns as the largest value in ``seq_len``.
+    """
+    width = seq_len.max().item()
+    positions = torch.arange(width, device=start.device).long().unsqueeze(0)
+    at_or_after_start = positions >= start.unsqueeze(1)
+    before_end = positions < end.unsqueeze(1)
+    return at_or_after_start & before_end
+
+
+def mask_from_frac_lengths(
+    seq_len: int["b"], frac_lengths: float["b"]
+):  # noqa: F722 F821
+    """Mask a randomly placed span covering ``frac_lengths`` of each sequence.
+
+    The span length is ``frac_lengths * seq_len`` truncated to an integer; its
+    start is drawn uniformly so that the span still fits inside ``seq_len``.
+    """
+    span = (frac_lengths * seq_len).long()
+    latest_start = seq_len - span
+
+    # One uniform draw per batch item decides where the span begins.
+    start = (latest_start * torch.rand_like(frac_lengths)).long().clamp(min=0)
+
+    return mask_from_start_end_indices(seq_len, start, start + span)
+
+
+def maybe_masked_mean(
+    t: float["b n d"], mask: bool["b n"] = None
+) -> float["b d"]:  # noqa: F722
+    """Average ``t`` over dimension 1, optionally counting only masked-in positions.
+
+    Args:
+        t: values to average, indexed as (batch, position, feature).
+        mask: True where a position takes part in the mean. When None, a
+            plain mean over dimension 1 is returned.
+
+    Returns:
+        One averaged feature vector per batch item. Items whose mask is all
+        False yield zeros, because the divisor is clamped to at least 1.
+    """
+    if mask is None:
+        return t.mean(dim=1)
+
+    # Zero out masked-off positions so they do not contribute to the sum.
+    t = torch.where(mask[:, :, None], t, torch.tensor(0.0, device=t.device))
+    num = t.sum(dim=1)
+    # keepdim=True makes the count (b, 1) so it divides every feature of its
+    # own batch item. A (b,) count would broadcast along the feature axis.
+    den = mask.float().sum(dim=1, keepdim=True)
+
+    return num / den.clamp(min=1.0)
+
+
+# simple utf-8 tokenizer, since paper went character based
+def list_str_to_tensor(text: list[str], padding_value=-1) -> int["b nt"]:  # noqa: F722
+    """Encode each string as its UTF-8 byte values and pad them into one batch.
+
+    Shorter rows are right-padded with ``padding_value``.
+    """
+    # ByT5 style: one token per UTF-8 byte.
+    byte_rows = [torch.tensor(list(bytes(item, "UTF-8"))) for item in text]
+    return pad_sequence(byte_rows, padding_value=padding_value, batch_first=True)
+
+
+# char tokenizer, based on custom dataset's extracted .txt file
+def list_str_to_idx(
+    text: list[str] | list[list[str]],
+    vocab_char_map: dict[str, int],  # {char: idx}
+    padding_value=-1,
+) -> int["b nt"]:  # noqa: F722
+    """Map every token of every item to its vocab index and pad into one batch.
+
+    Tokens missing from ``vocab_char_map`` become index 0; shorter rows are
+    right-padded with ``padding_value``.
+    """
+    index_rows = []
+    for item in text:  # pinyin or char style
+        indices = [vocab_char_map.get(token, 0) for token in item]
+        index_rows.append(torch.tensor(indices))
+    return pad_sequence(index_rows, padding_value=padding_value, batch_first=True)
+
+
+# Get tokenizer
+
+
+def _load_vocab_char_map(vocab_path):
+    """Read a vocab file (one token per line) into a ``{token: line_index}`` dict."""
+    vocab_char_map = {}
+    with open(vocab_path, "r", encoding="utf-8") as f:
+        for i, line in enumerate(f):
+            # Strip only the line terminator: a final line without a trailing
+            # newline must keep its last character.
+            token = line[:-1] if line.endswith("\n") else line
+            vocab_char_map[token] = i
+    return vocab_char_map
+
+
+def get_tokenizer(dataset_name, tokenizer: str = "pinyin"):
+    """
+    Return ``(vocab_char_map, vocab_size)`` for the requested tokenizer.
+
+    dataset_name - dataset whose vocab.txt is loaded for "pinyin" / "char";
+                   for "custom" it is the path to the vocab file itself
+    tokenizer   - "pinyin" do g2p for only chinese characters, need .txt vocab_file
+                - "char" for char-wise tokenizer, need .txt vocab_file
+                - "byte" for utf-8 tokenizer
+                - "custom" if you're directly passing in a path to the vocab.txt you want to use
+    vocab_size  - if use "pinyin", all available pinyin types, common alphabets (also those with accent) and symbols
+                - if use "char", derived from unfiltered character & symbol counts of custom dataset
+                - if use "byte", set to 256 (unicode byte range)
+
+    For "byte" the returned vocab_char_map is None.
+    Raises ValueError when ``tokenizer`` is not one of the four names above.
+    """
+    if tokenizer in ["pinyin", "char"]:
+        tokenizer_path = os.path.join(
+            files("f5_tts").joinpath("../../data"),
+            f"{dataset_name}_{tokenizer}/vocab.txt",
+        )
+        vocab_char_map = _load_vocab_char_map(tokenizer_path)
+        vocab_size = len(vocab_char_map)
+        assert (
+            vocab_char_map[" "] == 0
+        ), "make sure space is of idx 0 in vocab.txt, cuz 0 is used for unknown char"
+
+    elif tokenizer == "byte":
+        vocab_char_map = None
+        vocab_size = 256
+
+    elif tokenizer == "custom":
+        vocab_char_map = _load_vocab_char_map(dataset_name)
+        vocab_size = len(vocab_char_map)
+
+    else:
+        # Fail with a clear message instead of an UnboundLocalError at return.
+        raise ValueError(
+            f"unsupported tokenizer {tokenizer!r}; "
+            "expected 'pinyin', 'char', 'byte' or 'custom'"
+        )
+
+    return vocab_char_map, vocab_size
+
+
+# convert char to pinyin
+
+# Runs at import time: initialise jieba up front and announce it on stdout.
+# NOTE(review): presumably this front-loads jieba's dictionary loading so the
+# first jieba.cut() call is not slowed down - confirm against the jieba docs.
+jieba.initialize()
+print("Word segmentation module jieba initialized.\n")
+
+
+def convert_char_to_pinyin(text_list, polyphone=True):
+    """Convert each text into a token list where Chinese becomes pinyin.
+
+    Every text is segmented with ``jieba.cut``. Segments made only of 1-byte
+    characters are copied character by character; Chinese characters are
+    replaced by their ``Style.TONE3`` pinyin, each preceded by a " " token.
+
+    Args:
+        text_list: iterable of strings to convert.
+        polyphone: when True, segments whose characters are all 3 bytes long
+            in UTF-8 are handed to ``lazy_pinyin`` as a whole segment; when
+            False such segments are converted one character at a time.
+
+    Returns:
+        A list with one token list per input text.
+    """
+    final_text_list = []
+    custom_trans = str.maketrans(
+        {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'"}
+    )  # add custom trans here, to address oov
+
+    def is_chinese(c):
+        # Range check on the code point: U+3100 .. U+9FFF.
+        return "\u3100" <= c <= "\u9fff"  # common chinese characters
+
+    for text in text_list:
+        char_list = []
+        # Normalise the punctuation listed in custom_trans before segmenting.
+        text = text.translate(custom_trans)
+        for seg in jieba.cut(text):
+            # Comparing the UTF-8 byte length with the character count tells
+            # how many bytes each character of the segment takes.
+            seg_byte_len = len(bytes(seg, "UTF-8"))
+            if seg_byte_len == len(seg):  # if pure alphabets and symbols
+                # Separate a multi-character segment from the previous token
+                # unless that token is already a space, colon or quote.
+                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
+                    char_list.append(" ")
+                char_list.extend(seg)
+            elif polyphone and seg_byte_len == 3 * len(
+                seg
+            ):  # if pure east asian characters
+                # Converting the whole segment at once gives lazy_pinyin the
+                # surrounding characters as context.
+                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
+                # NOTE(review): seg_[i] assumes lazy_pinyin returns exactly one
+                # item per character of seg - confirm for segments that hold
+                # 3-byte characters which are not Chinese.
+                for i, c in enumerate(seg):
+                    if is_chinese(c):
+                        char_list.append(" ")
+                    char_list.append(seg_[i])
+            else:  # if mixed characters, alphabets and symbols
+                for c in seg:
+                    if ord(c) < 256:
+                        # Code points below 256 are kept as they are.
+                        char_list.extend(c)
+                    elif is_chinese(c):
+                        char_list.append(" ")
+                        char_list.extend(
+                            lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True)
+                        )
+                    else:
+                        # Anything else is passed through unchanged.
+                        char_list.append(c)
+        final_text_list.append(char_list)
+
+    return final_text_list
+
+
+# filter func for dirty data with many repetitions
+
+
+def repetition_found(text, length=2, tolerance=10):
+    """Tell whether any ``length``-long window of ``text`` occurs more than ``tolerance`` times.
+
+    Windows are taken at every start position, so they may overlap.
+    """
+    window_hits = defaultdict(int)
+    last_start = len(text) - length
+    for begin in range(last_start + 1):
+        window = text[begin : begin + length]
+        window_hits[window] += 1
+        # Counts only ever grow, so the first window to pass the tolerance
+        # already settles the answer.
+        if window_hits[window] > tolerance:
+            return True
+    return False