From c197be2c051cc4bf90acd008abc21c7b492ade7e Mon Sep 17 00:00:00 2001
From: pkufool
Date: Wed, 18 Jun 2025 13:50:39 +0800
Subject: [PATCH] simplify the requirements for pretrained model inference

---
 .flake8                          |  1 +
 egs/zipvoice/README.md           | 21 +++++++++++----------
 egs/zipvoice/requirements.txt    |  5 +++++
 egs/zipvoice/zipvoice/scaling.py | 28 ++++++++++++++++++++++++----
 4 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/.flake8 b/.flake8
index cf276d0ba..534c7e5f4 100644
--- a/.flake8
+++ b/.flake8
@@ -25,6 +25,7 @@ exclude =
   icefall/shared/make_kn_lm.py,
   icefall/__init__.py
   icefall/ctc/__init__.py
+  egs/zipvoice/zipvoice/scaling.py
 
 ignore =
   # E203 white space before ":"
diff --git a/egs/zipvoice/README.md b/egs/zipvoice/README.md
index 4bca60301..0c97d7ed8 100644
--- a/egs/zipvoice/README.md
+++ b/egs/zipvoice/README.md
@@ -39,15 +39,6 @@ source venv/bin/activate
 * Install the required packages:
 
 ```bash
-# Install pytorch and k2.
-# If you want to use different versions, please refer to https://k2-fsa.org/get-started/k2/ for details.
-# For users in China mainland, please refer to https://k2-fsa.org/zh-CN/get-started/k2/
-
-pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
-pip install k2==1.24.4.dev20250208+cuda12.1.torch2.5.1 -f https://k2-fsa.github.io/k2/cuda.html
-
-# Install other dependencies.
-pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
 pip install -r requirements.txt
 ```
 
@@ -97,6 +88,16 @@ The following steps show how to train a model from scratch on Emilia and LibriTT
 ### 0. Install dependencies for training
 
 ```bash
+# Install pytorch and k2.
+# If you want to use different versions, please refer to https://k2-fsa.org/get-started/k2/ for details.
+# For users in China mainland, please refer to https://k2-fsa.org/zh-CN/get-started/k2/
+
+# Note: Make sure you install versions of PyTorch and k2 that match your CUDA version.
+# For example, if you want to use PyTorch 2.5.1 with CUDA 12.1, you can install PyTorch and k2 as follows:
+
+pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
+pip install k2==1.24.4.dev20250208+cuda12.1.torch2.5.1 -f https://k2-fsa.github.io/k2/cuda.html
+
 pip install -r ../../requirements.txt
 ```
 
@@ -403,7 +404,7 @@ on three test sets, i.e., LibriSpeech-PC test-clean, Seed-TTS test-en and Seed-T
 
 ```bibtex
 @article{zhu-2025-zipvoice,
-  title={ZipVoice: Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching}, 
+  title={ZipVoice: Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching},
   author={Han Zhu and Wei Kang and Zengwei Yao and Liyong Guo and Fangjun Kuang and Zhaoqing Li and Weiji Zhuang and Long Lin and Daniel Povey}
   journal={arXiv preprint arXiv:2506.13053},
   year={2025},
diff --git a/egs/zipvoice/requirements.txt b/egs/zipvoice/requirements.txt
index d046515e8..cbbe860a5 100644
--- a/egs/zipvoice/requirements.txt
+++ b/egs/zipvoice/requirements.txt
@@ -1,3 +1,7 @@
+--find-links https://k2-fsa.github.io/icefall/piper_phonemize.html
+
+torch
+torchaudio
 huggingface_hub
 lhotse
 safetensors
@@ -9,4 +13,5 @@ inflect
 
 # Tokenization
 jieba
+piper_phonemize
 pypinyin
diff --git a/egs/zipvoice/zipvoice/scaling.py b/egs/zipvoice/zipvoice/scaling.py
index 5211e3a76..afe9ad468 100644
--- a/egs/zipvoice/zipvoice/scaling.py
+++ b/egs/zipvoice/zipvoice/scaling.py
@@ -18,9 +18,17 @@
 import logging
 import math
 import random
+import sys
 from typing import Optional, Tuple, Union
 
-import k2
+try:
+    import k2
+except Exception:
+    logging.warning(
+        "k2 is not installed correctly. Swoosh functions will fall back to "
+        "the PyTorch implementation."
+    )
+
 import torch
 import torch.nn as nn
 from torch import Tensor
@@ -1398,7 +1406,11 @@ class SwooshLFunction(torch.autograd.Function):
 class SwooshL(torch.nn.Module):
     def forward(self, x: Tensor) -> Tensor:
         """Return Swoosh-L activation."""
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if (
+            torch.jit.is_scripting()
+            or torch.jit.is_tracing()
+            or "k2" not in sys.modules
+        ):
             zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
             return logaddexp(zero, x - 4.0) - 0.08 * x - 0.035
         if not x.requires_grad:
@@ -1472,7 +1484,11 @@ class SwooshRFunction(torch.autograd.Function):
 class SwooshR(torch.nn.Module):
     def forward(self, x: Tensor) -> Tensor:
         """Return Swoosh-R activation."""
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if (
+            torch.jit.is_scripting()
+            or torch.jit.is_tracing()
+            or "k2" not in sys.modules
+        ):
             zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
             return logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687
         if not x.requires_grad:
@@ -1636,7 +1652,11 @@ class ActivationDropoutAndLinear(torch.nn.Module):
         self.dropout_shared_dim = dropout_shared_dim
 
     def forward(self, x: Tensor):
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if (
+            torch.jit.is_scripting()
+            or torch.jit.is_tracing()
+            or "k2" not in sys.modules
+        ):
             if self.activation == "SwooshL":
                 x = SwooshLForward(x)
             elif self.activation == "SwooshR":
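
For context, the `scaling.py` change above boils down to one pattern: try to import k2, and when it is unavailable (or when scripting/tracing), compute the Swoosh activations with plain PyTorch ops so that pretrained-model inference works from `requirements.txt` alone. Below is a minimal standalone sketch of that pattern; the helper name `swoosh_l_pytorch` is invented for illustration, and the constants come from the Swoosh-L fallback branch in the patch.

```python
# Minimal sketch (not part of the patch) of the optional-k2 fallback for Swoosh-L.
import logging
import sys

import torch
from torch import Tensor

try:
    import k2  # noqa: F401  # real kernels used by scaling.py when available
except Exception:
    logging.warning(
        "k2 is not installed correctly. Swoosh functions will fall back to "
        "the PyTorch implementation."
    )


def swoosh_l_pytorch(x: Tensor) -> Tensor:
    """Pure-PyTorch Swoosh-L: log(1 + exp(x - 4)) - 0.08 * x - 0.035."""
    zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
    return torch.logaddexp(zero, x - 4.0) - 0.08 * x - 0.035


class SwooshL(torch.nn.Module):
    def forward(self, x: Tensor) -> Tensor:
        if (
            torch.jit.is_scripting()
            or torch.jit.is_tracing()
            or "k2" not in sys.modules  # the import above failed, so k2 never registered
        ):
            return swoosh_l_pytorch(x)
        # In scaling.py this branch dispatches to the k2-backed SwooshLFunction;
        # the sketch simply reuses the PyTorch formula.
        return swoosh_l_pytorch(x)


if __name__ == "__main__":
    print(SwooshL()(torch.randn(2, 3)))
```

The sketch runs whether or not k2 is installed; with k2 present, the real module instead goes through the `SwooshLFunction` autograd path shown in the hunk at line 1398.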