Minor fixes to pretrained model inference

pkufool 2025-06-17 16:02:20 +08:00
parent 8c529ebe90
commit 60572c2444
11 changed files with 359 additions and 13 deletions

egs/zipvoice/README.md

@@ -21,7 +21,33 @@ ZipVoice is a high-quality zero-shot TTS model with a small model size and fast

## Installation

* Clone the icefall repository and change to the zipvoice directory:

```bash
git clone https://github.com/k2-fsa/icefall.git
cd icefall/egs/zipvoice
```

* Create a Python virtual environment (optional but recommended):

```bash
python3 -m venv venv
source venv/bin/activate
```

* Install the required packages:

```bash
# Install PyTorch and k2.
# To use different versions, please refer to https://k2-fsa.org/get-started/k2/ for details.
# Users in mainland China may refer to https://k2-fsa.org/zh-CN/get-started/k2/
pip install torch==2.5.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu121
pip install k2==1.24.4.dev20250208+cuda12.1.torch2.5.1 -f https://k2-fsa.github.io/k2/cuda.html

# Install other dependencies.
pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
pip install -r requirements.txt
```
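As an optional sanity check (this one-liner is illustrative, not part of the recipe), verify that the core packages import cleanly:

```bash
# Both imports should fail loudly if the installation is broken.
python3 -c "import torch, torchaudio, k2; print('torch', torch.__version__)"
```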
@@ -31,12 +57,21 @@ To generate speech with our pre-trained ZipVoice or ZipVoice-Distill models, use

### 1. Inference of a single sentence: ###

```bash
# Chinese example. The prompt text transcribes prompt-zh.wav; the synthesized
# text means "Welcome to our speech synthesis model, hope it surprises you!"
python3 zipvoice/zipvoice_infer.py \
    --model-name "zipvoice_distill" \
    --prompt-wav assets/prompt-zh.wav \
    --prompt-text "对,这就是我,万人敬仰的太乙真人,虽然有点婴儿肥,但也掩不住我逼人的帅气。" \
    --text "欢迎使用我们的语音合成模型,希望它能给你带来惊喜!" \
    --res-wav-path result-zh.wav

# English example
python3 zipvoice/zipvoice_infer.py \
    --model-name "zipvoice_distill" \
    --prompt-wav assets/prompt-en.wav \
    --prompt-text "Some call me nature, others call me mother nature. I've been here for over four point five billion years, twenty two thousand five hundred times longer than you." \
    --text "Welcome to our TTS model, have fun!" \
    --res-wav-path result-en.wav
```
### 2. Inference of a list of sentences:

@@ -46,6 +81,7 @@ python3 zipvoice/zipvoice_infer.py \
    --test-list test.tsv \
    --res-dir results/test
```

- `--model-name` can be `zipvoice` or `zipvoice_distill`, which are the models before and after distillation, respectively.
- Each line of `test.tsv` is in the format `{wav_name}\t{prompt_transcription}\t{prompt_wav}\t{text}`; see the sketch below this list.
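For illustration, a hypothetical one-line `test.tsv` (the wav name, prompt, and text below are made up) can be created as follows; the four fields are separated by literal tabs:

```bash
# Fields: wav_name <TAB> prompt_transcription <TAB> prompt_wav <TAB> text
printf 'utt_001\tI am the transcription of the prompt wav.\tprompt.wav\tI am the text to be synthesized.\n' > test.tsv
```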

Binary file not shown.

Binary file not shown.

egs/zipvoice/local/compute_fbank_libritts.py (0 changes) Executable file → Normal file

egs/zipvoice/local/validate_manifest.py (0 changes) Executable file → Normal file

egs/zipvoice/scripts/prepare.sh (232 changes) Executable file

@@ -0,0 +1,232 @@
#!/usr/bin/env bash
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
# add icefall to PYTHONPATH
export PYTHONPATH=../../../:$PYTHONPATH
set -eou pipefail
stage=0
stop_stage=100
token_type=bpe # bpe, letter, phone
bpe_vocab_size=500
sampling_rate=24000 # LibriTTS audio is 24 kHz (needed by the spectrogram stage below)
nj=32
dl_dir=$PWD/download
. shared/parse_options.sh || exit 1
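# Example: run only the Fbank computation stage (stage 2):
#   ./scripts/prepare.sh --stage 2 --stop-stage 2
# (parse_options.sh turns --stage/--stop-stage into the variables defined above.)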
# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
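# A call like `log "Stage 0: Download data"` prints, e.g.:
#   2025-06-17 16:02:20 (prepare.sh:118:main) Stage 0: Download data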
log "dl_dir: $dl_dir"
if [ $stage -le -2 ] && [ $stop_stage -ge -2 ]; then
  if [ ! -d $dl_dir/xvector_nnet_1a_libritts_clean_460 ]; then
    log "Downloading x-vector"
    git clone https://huggingface.co/datasets/zrjin/xvector_nnet_1a_libritts_clean_460 $dl_dir/xvector_nnet_1a_libritts_clean_460
    mkdir -p exp/xvector_nnet_1a/
    cp -r $dl_dir/xvector_nnet_1a_libritts_clean_460/* exp/xvector_nnet_1a/
  fi
fi

if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "Stage -1: build monotonic_align lib"
  if [ ! -d vits/monotonic_align/build ]; then
    cd vits/monotonic_align
    python setup.py build_ext --inplace
    cd ../../
  else
    log "monotonic_align lib already built"
  fi
fi

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"
  # If you have pre-downloaded it to /path/to/LibriTTS,
  # you can create a symlink
  #
  #   ln -sfv /path/to/LibriTTS $dl_dir/LibriTTS
  #
  if [ ! -d $dl_dir/LibriTTS ]; then
    lhotse download libritts $dl_dir
  fi
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare LibriTTS manifest"
  # We assume that you have downloaded the LibriTTS corpus
  # to $dl_dir/LibriTTS
  mkdir -p data/manifests
  if [ ! -e data/manifests/.libritts.done ]; then
    lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
    touch data/manifests/.libritts.done
  fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Compute Fbank for LibriTTS"
  mkdir -p data/fbank
  for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
    python local/compute_fbank.py --dataset libritts --subset ${subset}
  done
  # Here we shuffle and combine the train-clean-100, train-clean-360 and
  # train-other-500 together to form the training set.
  if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
    cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
      <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
      <(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
      shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
  fi
  if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then
    cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
      <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \
      shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz
  fi
  if [ ! -e data/fbank/.libritts-validated.done ]; then
    log "Validating data/fbank for LibriTTS"
    ./local/validate_manifest.py \
      data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
    touch data/fbank/.libritts-validated.done
  fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Prepare tokens.txt"
  if [ $token_type == "bpe" ] || [ $token_type == "letter" ]; then
    if [ ! -e data/texts.txt ]; then
      ./local/export_normalized_texts.py --output data/texts.txt \
        --manifests data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
    fi
  fi
  if [ $token_type == "bpe" ]; then
    mkdir -p data/lang_bpe_${bpe_vocab_size}
    if [ ! -e data/lang_bpe_${bpe_vocab_size}/tokens.txt ]; then
      ./local/train_bpe_model.py --transcript data/texts.txt \
        --lang-dir data/lang_bpe_${bpe_vocab_size} \
        --vocab-size $bpe_vocab_size
    fi
  fi
  if [ $token_type == "phone" ]; then
    mkdir -p data/lang_phone
    ./local/export_tokens.py --token-type phone \
      --output data/lang_phone/tokens.txt
  fi
  if [ $token_type == "letter" ]; then
    mkdir -p data/lang_letter
    ./local/export_tokens.py --token-type letter \
      --texts data/texts.txt \
      --output data/lang_letter/tokens.txt
  fi
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Download and prepare LibriSpeech-PC test-clean for testing."
  if [ ! -e $dl_dir/test-clean.tar.gz ]; then
    # "|| true" lets the script fall through to the mirror below if this
    # download fails (set -e would otherwise abort here).
    wget https://huggingface.co/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir || true
  fi
  # Fallback mirror for users in mainland China.
  if [ ! -e $dl_dir/test-clean.tar.gz ]; then
    wget https://hf-mirror.com/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir
  fi
  if [ ! -d $dl_dir/LibriSpeech/test-clean ]; then
    tar -xvf $dl_dir/test-clean.tar.gz -C $dl_dir
  fi
  mkdir -p $dl_dir/LibriSpeech-PC
  if [ ! -e $dl_dir/LibriSpeech-PC/test-clean.json ]; then
    wget https://us.openslr.org/resources/145/manifests.tar.gz -P $dl_dir/LibriSpeech-PC
    tar -xvf $dl_dir/LibriSpeech-PC/manifests.tar.gz -C $dl_dir/LibriSpeech-PC
  fi
  python local/compute_fbank.py --dataset librispeech --subset test-clean
  python local/prepare_prompts_librispeech_test_clean.py
fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Compute Spectrogram for LibriTTS (for VITS system)"
  mkdir -p data/spectrogram
  if [ ! -e data/spectrogram/.libritts.done ]; then
    ./local/compute_spectrogram_libritts.py --sampling-rate $sampling_rate
    touch data/spectrogram/.libritts.done
  fi
  # Here we shuffle and combine the train-clean-100, train-clean-360 and
  # train-other-500 together to form the training set.
  if [ ! -f data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz ]; then
    cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
      <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) \
      <(gunzip -c data/spectrogram/libritts_cuts_train-other-500.jsonl.gz) | \
      shuf | gzip -c > data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
  fi
  # Here we shuffle and combine train-clean-100 and train-clean-360
  # together to form the train-clean-460 set.
  if [ ! -f data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz ]; then
    cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
      <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) | \
      shuf | gzip -c > data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz
  fi
  if [ ! -e data/spectrogram/.libritts-validated.done ]; then
    log "Validating data/spectrogram for LibriTTS"
    ./local/validate_manifest.py \
      data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
    touch data/spectrogram/.libritts-validated.done
  fi
fi
audio_feats_dir=data/tokenized
dataset_parts="--dataset-parts all" # debug: "-p dev-clean -p test-clean"

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Tokenize/Fbank LibriTTS for valle"
  mkdir -p ${audio_feats_dir}
  if [ ! -e ${audio_feats_dir}/.libritts.tokenize.done ]; then
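    # Note: the whole string "--dataset-parts all" is passed as the flag's value;
    # the tokenizer script is expected to strip the embedded flag text itself
    # (a valle-recipe convention), so this is not a duplicated flag.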
    python3 ./local/compute_neural_codec_and_prepare_text_tokens.py --dataset-parts "${dataset_parts}" \
      --audio-extractor "Encodec" \
      --batch-duration 400 \
      --src-dir "data/manifests" \
      --output-dir "${audio_feats_dir}"
    touch ${audio_feats_dir}/.libritts.tokenize.done
  fi
  lhotse combine \
    ${audio_feats_dir}/libritts_cuts_train-clean-100.jsonl.gz \
    ${audio_feats_dir}/libritts_cuts_train-clean-360.jsonl.gz \
    ${audio_feats_dir}/libritts_cuts_train-other-500.jsonl.gz \
    ${audio_feats_dir}/cuts_train.jsonl.gz
  lhotse copy \
    ${audio_feats_dir}/libritts_cuts_dev-clean.jsonl.gz \
    ${audio_feats_dir}/cuts_dev.jsonl.gz
  lhotse copy \
    ${audio_feats_dir}/libritts_cuts_test-clean.jsonl.gz \
    ${audio_feats_dir}/cuts_test.jsonl.gz
fi


@@ -0,0 +1,75 @@
#!/usr/bin/env bash
export PYTHONPATH=../../../:$PYTHONPATH
stage=1
stop_stage=10
generated_wav_path="flow-matching/exp/generated_wavs"
. shared/parse_options.sh || exit 1
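# Example (hypothetical invocation; the evaluate script's file name is assumed):
#   ./scripts/evaluate.sh --stage 5 --stop-stage 5 --generated-wav-path my_wavs
# parse_options.sh maps --generated-wav-path onto the generated_wav_path variable above.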
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
if [ $stage -le -2 ] && [ $stop_stage -ge -2 ]; then
  log "Stage -2: Install dependencies and download models"
  pip install -r requirements-eval.txt
  pip install git+https://github.com/sarulab-speech/UTMOSv2.git
  modelscope download --model k2-fsa/TTS_eval_models --local_dir TTS_eval_models
fi
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "Stage -1: Prepare evaluation data."
  mkdir -p data/reference/librispeech-test-clean
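  # The two pipelines below pull wav paths out of the manifest with jq; awk then
  # builds and runs a `cp` command per line. The first copies every test utterance
  # into the reference dir; the second copies each prompt wav, renaming it after
  # the target utterance's file name.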
  gunzip -c data/fbank/librispeech_cuts_with_prompts_test-clean.jsonl.gz | \
    jq -r '"\(.recording.sources[0].source)"' | \
    awk '{split($1, a, "/"); cmd="cp "$1" data/reference/librispeech-test-clean/"a[length(a)]; print cmd; system(cmd)}'
  mkdir -p data/reference/librispeech-test-clean-prompt
  gunzip -c data/fbank/librispeech_cuts_with_prompts_test-clean.jsonl.gz | \
    jq -r '"\(.custom.prompt.recording.sources[0].source) \(.recording.sources[0].source)"' | \
    awk '{split($2, a, "/"); cmd="cp "$1" data/reference/librispeech-test-clean-prompt/"a[length(a)]; print cmd; system(cmd)}'
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Evaluate the model with FSD."
  python local/evaluate_fsd.py --real-path data/reference/librispeech-test-clean \
    --eval-path $generated_wav_path
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Evaluate the model with SIM."
  python local/evaluate_sim.py --real-path data/reference/librispeech-test-clean \
    --eval-path $generated_wav_path
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Evaluate the model with UTMOS."
  python local/evaluate_utmos.py --wav-path $generated_wav_path
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Evaluate the model with UTMOSv2."
  python local/evaluate_utmosv2.py --wav-path $generated_wav_path
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Evaluate the model with WER."
  python local/evaluate_wer_hubert.py --wav-path $generated_wav_path \
    --decode-path $generated_wav_path/decode
fi

egs/zipvoice/zipvoice/generate_averaged_model.py (0 changes) Executable file → Normal file

egs/zipvoice/zipvoice/zipvoice_infer.py

@@ -23,7 +23,7 @@ This script generates speech with our pre-trained ZipVoice or

Usage:

    Note: If you have trouble connecting to HuggingFace,
    try switching the endpoint to the mirror site:

        export HF_ENDPOINT=https://hf-mirror.com
@@ -55,7 +55,6 @@ import os

import numpy as np
import safetensors.torch
import torch
import torch.nn as nn
import torchaudio
@@ -115,15 +114,20 @@ def get_parser():
        "--res-dir",
        type=str,
        default="results",
        help="""
        Directory to save the generated wavs;
        used when --test-list is given.
        """,
    )
    parser.add_argument(
        "--res-wav-path",
        type=str,
        default="result.wav",
        help="""
        Path to save the generated wav;
        used when --test-list is not given.
        """,
    )
    parser.add_argument(
@@ -456,8 +460,7 @@ def generate_sentence(
    # Adjust wav volume if necessary
    if prompt_rms < target_rms:
        wav = wav * prompt_rms / target_rms
    torchaudio.save(save_path, wav.cpu(), sample_rate=sampling_rate)
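    # Note: torchaudio.save expects a 2-D (channels, time) tensor; wav is (1, T)
    # at this point, so it is written as mono audio.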
    return metrics