From c197be2c051cc4bf90acd008abc21c7b492ade7e Mon Sep 17 00:00:00 2001
From: pkufool
Date: Wed, 18 Jun 2025 13:50:39 +0800
Subject: [PATCH] simplify the requirements for pretrained model inference

---
 .flake8                          |  1 +
 egs/zipvoice/README.md           | 21 +++++++++++----------
 egs/zipvoice/requirements.txt    |  5 +++++
 egs/zipvoice/zipvoice/scaling.py | 28 ++++++++++++++++++++++++----
 4 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/.flake8 b/.flake8
index cf276d0ba..534c7e5f4 100644
--- a/.flake8
+++ b/.flake8
@@ -25,6 +25,7 @@ exclude =
   icefall/shared/make_kn_lm.py,
   icefall/__init__.py
   icefall/ctc/__init__.py
+  egs/zipvoice/zipvoice/scaling.py
 
 ignore =
   # E203 white space before ":"
diff --git a/egs/zipvoice/README.md b/egs/zipvoice/README.md
index 4bca60301..0c97d7ed8 100644
--- a/egs/zipvoice/README.md
+++ b/egs/zipvoice/README.md
@@ -39,15 +39,6 @@ source venv/bin/activate
 * Install the required packages:
 
 ```bash
-# Install pytorch and k2.
-# If you want to use different versions, please refer to https://k2-fsa.org/get-started/k2/ for details.
-# For users in China mainland, please refer to https://k2-fsa.org/zh-CN/get-started/k2/
-
-pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
-pip install k2==1.24.4.dev20250208+cuda12.1.torch2.5.1 -f https://k2-fsa.github.io/k2/cuda.html
-
-# Install other dependencies.
-pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
 pip install -r requirements.txt
 ```
 
@@ -97,6 +88,16 @@ The following steps show how to train a model from scratch on Emilia and LibriTT
 ### 0. Install dependencies for training
 
 ```bash
+# Install pytorch and k2.
+# If you want to use different versions, please refer to https://k2-fsa.org/get-started/k2/ for details.
+# For users in China mainland, please refer to https://k2-fsa.org/zh-CN/get-started/k2/
+
+# Note: Make sure you install versions of PyTorch and k2 that match your CUDA version.
+# For example, if you want to use PyTorch 2.5.1 with CUDA 12.1, you can install PyTorch and k2 as follows:
+
+pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
+pip install k2==1.24.4.dev20250208+cuda12.1.torch2.5.1 -f https://k2-fsa.github.io/k2/cuda.html
+
 pip install -r ../../requirements.txt
 ```
 
@@ -403,7 +404,7 @@ on three test sets, i.e., LibriSpeech-PC test-clean, Seed-TTS test-en and Seed-T
 
 ```bibtex
 @article{zhu-2025-zipvoice,
-  title={ZipVoice: Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching}, 
+  title={ZipVoice: Fast and High-Quality Zero-Shot Text-to-Speech with Flow Matching},
   author={Han Zhu and Wei Kang and Zengwei Yao and Liyong Guo and Fangjun Kuang and Zhaoqing Li and Weiji Zhuang and Long Lin and Daniel Povey}
   journal={arXiv preprint arXiv:2506.13053},
   year={2025},
diff --git a/egs/zipvoice/requirements.txt b/egs/zipvoice/requirements.txt
index d046515e8..cbbe860a5 100644
--- a/egs/zipvoice/requirements.txt
+++ b/egs/zipvoice/requirements.txt
@@ -1,3 +1,7 @@
+--find-links https://k2-fsa.github.io/icefall/piper_phonemize.html
+
+torch
+torchaudio
 huggingface_hub
 lhotse
 safetensors
@@ -9,4 +13,5 @@ inflect
 
 # Tokenization
 jieba
+piper_phonemize
 pypinyin
diff --git a/egs/zipvoice/zipvoice/scaling.py b/egs/zipvoice/zipvoice/scaling.py
index 5211e3a76..afe9ad468 100644
--- a/egs/zipvoice/zipvoice/scaling.py
+++ b/egs/zipvoice/zipvoice/scaling.py
@@ -18,9 +18,17 @@
 import logging
 import math
 import random
+import sys
 from typing import Optional, Tuple, Union
 
-import k2
+try:
+    import k2
+except Exception:
+    logging.warning(
+        "k2 is not installed correctly. Swoosh functions will fall back to "
+        "the PyTorch implementation."
+    )
+
 import torch
 import torch.nn as nn
 from torch import Tensor
@@ -1398,7 +1406,11 @@ class SwooshLFunction(torch.autograd.Function):
 class SwooshL(torch.nn.Module):
     def forward(self, x: Tensor) -> Tensor:
         """Return Swoosh-L activation."""
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if (
+            torch.jit.is_scripting()
+            or torch.jit.is_tracing()
+            or "k2" not in sys.modules
+        ):
             zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
             return logaddexp(zero, x - 4.0) - 0.08 * x - 0.035
         if not x.requires_grad:
@@ -1472,7 +1484,11 @@ class SwooshRFunction(torch.autograd.Function):
 class SwooshR(torch.nn.Module):
     def forward(self, x: Tensor) -> Tensor:
         """Return Swoosh-R activation."""
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if (
+            torch.jit.is_scripting()
+            or torch.jit.is_tracing()
+            or "k2" not in sys.modules
+        ):
             zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
             return logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687
         if not x.requires_grad:
@@ -1636,7 +1652,11 @@ class ActivationDropoutAndLinear(torch.nn.Module):
         self.dropout_shared_dim = dropout_shared_dim
 
     def forward(self, x: Tensor):
-        if torch.jit.is_scripting() or torch.jit.is_tracing():
+        if (
+            torch.jit.is_scripting()
+            or torch.jit.is_tracing()
+            or "k2" not in sys.modules
+        ):
             if self.activation == "SwooshL":
                 x = SwooshLForward(x)
             elif self.activation == "SwooshR":
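
For context, the `scaling.py` change above boils down to one pattern: try to import k2, and when it is unavailable (or when scripting/tracing), compute the Swoosh activations with plain PyTorch ops so that pretrained-model inference works from `requirements.txt` alone. Below is a minimal standalone sketch of that pattern; the helper name `swoosh_l_pytorch` is invented for illustration, and the constants come from the Swoosh-L fallback branch in the patch.

```python
# Minimal sketch (not part of the patch) of the optional-k2 fallback for Swoosh-L.
import logging
import sys

import torch
from torch import Tensor

try:
    import k2  # noqa: F401  # real kernels used by scaling.py when available
except Exception:
    logging.warning(
        "k2 is not installed correctly. Swoosh functions will fall back to "
        "the PyTorch implementation."
    )


def swoosh_l_pytorch(x: Tensor) -> Tensor:
    """Pure-PyTorch Swoosh-L: log(1 + exp(x - 4)) - 0.08 * x - 0.035."""
    zero = torch.tensor(0.0, dtype=x.dtype, device=x.device)
    return torch.logaddexp(zero, x - 4.0) - 0.08 * x - 0.035


class SwooshL(torch.nn.Module):
    def forward(self, x: Tensor) -> Tensor:
        if (
            torch.jit.is_scripting()
            or torch.jit.is_tracing()
            or "k2" not in sys.modules  # the import above failed, so k2 never registered
        ):
            return swoosh_l_pytorch(x)
        # In scaling.py this branch dispatches to the k2-backed SwooshLFunction;
        # the sketch simply reuses the PyTorch formula.
        return swoosh_l_pytorch(x)


if __name__ == "__main__":
    print(SwooshL()(torch.randn(2, 3)))
```

The sketch runs whether or not k2 is installed; with k2 present, the real module instead goes through the `SwooshLFunction` autograd path shown in the hunk at line 1398.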