create model from parameters

2025-12-11 06:55:27 +00:00 · 2024-10-15 17:57:10 +08:00 · 2024-10-15 17:57:10 +08:00 · 6fac3a3143
commit 6fac3a3143
parent f95ac12d70
6 changed files with 117 additions and 16 deletions
--- a/egs/ljspeech/TTS/matcha/models/components/flow_matching.py
+++ b/egs/ljspeech/TTS/matcha/models/components/flow_matching.py
@ -4,9 +4,9 @@ import torch
 import torch.nn.functional as F
 from matcha.models.components.decoder import Decoder
-from matcha.utils.pylogger import get_pylogger
+#  from matcha.utils.pylogger import get_pylogger
-log = get_pylogger(__name__)
+#  log = get_pylogger(__name__)
 class BASECFM(torch.nn.Module, ABC):
--- a/egs/ljspeech/TTS/matcha/models/components/text_encoder.py
+++ b/egs/ljspeech/TTS/matcha/models/components/text_encoder.py
@ -6,10 +6,10 @@ import torch
 import torch.nn as nn
 from einops import rearrange
-import matcha.utils as utils
+#  import matcha.utils as utils
 from matcha.utils.model import sequence_mask
-log = utils.get_pylogger(__name__)
+#  log = utils.get_pylogger(__name__)
 class LayerNorm(nn.Module):
--- a/egs/ljspeech/TTS/matcha/models/matcha_tts.py
+++ b/egs/ljspeech/TTS/matcha/models/matcha_tts.py
@ -5,8 +5,8 @@ import random
 import torch
 import matcha.utils.monotonic_align as monotonic_align
-from matcha import utils
+#  from matcha import utils
-from matcha.models.baselightningmodule import BaseLightningClass
+#  from matcha.models.baselightningmodule import BaseLightningClass
 from matcha.models.components.flow_matching import CFM
 from matcha.models.components.text_encoder import TextEncoder
 from matcha.utils.model import (
@ -17,10 +17,10 @@ from matcha.utils.model import (
    sequence_mask,
 )
-log = utils.get_pylogger(__name__)
+#  log = utils.get_pylogger(__name__)
-class MatchaTTS(BaseLightningClass):  # 🍵
+class MatchaTTS(torch.nn.Module):  # 🍵
    def __init__(
        self,
        n_vocab,
@ -30,7 +30,7 @@ class MatchaTTS(BaseLightningClass):  # 🍵
        encoder,
        decoder,
        cfm,
-        data_statistics,
+        #  data_statistics,
        out_size,
        optimizer=None,
        scheduler=None,
@ -39,7 +39,7 @@ class MatchaTTS(BaseLightningClass):  # 🍵
    ):
        super().__init__()
-        self.save_hyperparameters(logger=False)
+        #  self.save_hyperparameters(logger=False)
        self.n_vocab = n_vocab
        self.n_spks = n_spks
@ -70,7 +70,7 @@ class MatchaTTS(BaseLightningClass):  # 🍵
            spk_emb_dim=spk_emb_dim,
        )
-        self.update_data_statistics(data_statistics)
+        #  self.update_data_statistics(data_statistics)
    @torch.inference_mode()
    def synthesise(self, x, x_lengths, n_timesteps, temperature=1.0, spks=None, length_scale=1.0):
--- a/egs/ljspeech/TTS/matcha/train.py
+++ b/egs/ljspeech/TTS/matcha/train.py
@ -0,0 +1,98 @@
 #!/usr/bin/env python3
 # Copyright         2023  Xiaomi Corp.        (authors: Fangjun Kuang)
 import torch
 from icefall.utils import AttributeDict
 from matcha.models.matcha_tts import MatchaTTS
 def get_model(params):
    """Build a ``MatchaTTS`` instance from the ``model`` entry of ``params``.

    Every key/value pair in ``params.model`` is forwarded to the
    ``MatchaTTS`` constructor as a keyword argument.
    """
    return MatchaTTS(**params.model)
 def main():
    """Assemble the model hyper-parameters, build the model and print its size.

    The configuration is built bottom-up from small named pieces and then
    wrapped under a single ``"model"`` key, which is what ``get_model``
    reads. The model and its total parameter count are printed to stdout.
    """
    # Values shared by more than one sub-configuration.
    n_feats = 80
    filter_channels_dp = 256
    encoder_params_p_dropout = 0.1

    encoder_params = AttributeDict(
        {
            "n_feats": n_feats,
            "n_channels": 192,
            "filter_channels": 768,
            "filter_channels_dp": filter_channels_dp,
            "n_heads": 2,
            "n_layers": 6,
            "kernel_size": 3,
            "p_dropout": encoder_params_p_dropout,
            "spk_emb_dim": 64,
            "n_spks": 1,
            "prenet": True,
        }
    )
    duration_predictor_params = AttributeDict(
        {
            "filter_channels_dp": filter_channels_dp,
            "kernel_size": 3,
            "p_dropout": encoder_params_p_dropout,
        }
    )
    encoder = AttributeDict(
        {
            "encoder_type": "RoPE Encoder",  # not used
            "encoder_params": encoder_params,
            "duration_predictor_params": duration_predictor_params,
        }
    )
    decoder = AttributeDict(
        {
            "channels": [256, 256],
            "dropout": 0.05,
            "attention_head_dim": 64,
            "n_blocks": 1,
            "num_mid_blocks": 2,
            "num_heads": 2,
            "act_fn": "snakebeta",
        }
    )
    cfm = AttributeDict(
        {
            "name": "CFM",
            "solver": "euler",
            "sigma_min": 1e-4,
        }
    )
    optimizer = AttributeDict(
        {
            "lr": 1e-4,
            "weight_decay": 0.0,
        }
    )

    # These entries are unpacked as keyword arguments by get_model().
    model_config = AttributeDict(
        {
            "n_vocab": 178,
            "n_spks": 1,  # for ljspeech.
            "spk_emb_dim": 64,
            "n_feats": n_feats,
            "out_size": None,  # or use 172
            "prior_loss": True,
            "use_precomputed_durations": False,
            "encoder": encoder,
            "decoder": decoder,
            "cfm": cfm,
            "optimizer": optimizer,
        }
    )
    params = AttributeDict({"model": model_config})

    model = get_model(params)
    print(model)

    num_param = sum(p.numel() for p in model.parameters())
    print(f"Number of parameters: {num_param}")
 # Limit PyTorch to a single intra-op thread and a single inter-op thread.
 # These calls are at module level, so they run on import as well as when
 # the file is executed as a script.
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 # Run main() only when this file is executed directly.
 if __name__ == "__main__":
    main()
--- a/egs/ljspeech/TTS/matcha/utils/__init__.py
+++ b/egs/ljspeech/TTS/matcha/utils/__init__.py
@ -1,5 +1,5 @@
-from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers
+#  from matcha.utils.instantiators import instantiate_callbacks, instantiate_loggers
-from matcha.utils.logging_utils import log_hyperparameters
+#  from matcha.utils.logging_utils import log_hyperparameters
-from matcha.utils.pylogger import get_pylogger
+#  from matcha.utils.pylogger import get_pylogger
-from matcha.utils.rich_utils import enforce_tags, print_config_tree
+#  from matcha.utils.rich_utils import enforce_tags, print_config_tree
-from matcha.utils.utils import extras, get_metric_value, task_wrapper
+#  from matcha.utils.utils import extras, get_metric_value, task_wrapper
--- a/egs/ljspeech/TTS/matcha/utils/monotonic_align/.gitignore
+++ b/egs/ljspeech/TTS/matcha/utils/monotonic_align/.gitignore
@ -0,0 +1,3 @@
 build
 core.c
 *.so