icefall/egs/zipvoice/scripts/evaluate.sh
Wei Kang 06539d2b9d
Add Zipvoice (#1964)
* Add ZipVoice - a flow-matching based zero-shot TTS model.
2025-06-17 20:17:12 +08:00

102 lines
3.8 KiB
Bash

export CUDA_VISIBLE_DEVICES="0"
export PYTHONWARNINGS=ignore
export PYTHONPATH=../../:$PYTHONPATH
# Uncomment this if you have trouble connecting to HuggingFace
# export HF_ENDPOINT=https://hf-mirror.com
start_stage=1
end_stage=3
# Models used for SIM-o evaluation.
# SV model wavlm_large_finetune.pth is downloaded from https://github.com/microsoft/UniSpeech/tree/main/downstreams/speaker_verification
# SSL model wavlm_large.pt is downloaded from https://huggingface.co/s3prl/converted_ckpts/resolve/main/wavlm_large.pt
sv_model_path=model/UniSpeech/wavlm_large_finetune.pth
wavlm_model_path=model/s3prl/wavlm_large.pt
# Models used for UTMOS evaluation.
# wget https://huggingface.co/spaces/sarulab-speech/UTMOS-demo/resolve/main/epoch%3D3-step%3D7459.ckpt -P model/huggingface/utmos/utmos.pt
# wget https://huggingface.co/spaces/sarulab-speech/UTMOS-demo/resolve/main/wav2vec_small.pt -P model/huggingface/utmos/wav2vec_small.pt
utmos_model_path=model/huggingface/utmos/utmos.pt
wav2vec_model_path=model/huggingface/utmos/wav2vec_small.pt
if [ $start_stage -le 1 ] && [ $end_stage -ge 1 ]; then
echo "=====Evaluate for Seed-TTS test-en======="
test_list=testset/test_seedtts_en.tsv
wav_path=results/zipvoice_seedtts_en
echo $wav_path
echo "-----Computing SIM-o-----"
python3 local/evaluate_sim.py \
--sv-model-path ${sv_model_path} \
--ssl-model-path ${wavlm_model_path} \
--eval-path ${wav_path} \
--test-list ${test_list}
echo "-----Computing WER-----"
python3 local/evaluate_wer_seedtts.py \
--test-list ${test_list} \
--wav-path ${wav_path} \
--lang "en"
echo "-----Computing UTSMOS-----"
python3 local/evaluate_utmos.py \
--wav-path ${wav_path} \
--utmos-model-path ${utmos_model_path} \
--ssl-model-path ${wav2vec_model_path}
fi
if [ $start_stage -le 2 ] && [ $end_stage -ge 2 ]; then
echo "=====Evaluate for Seed-TTS test-zh======="
test_list=testset/test_seedtts_zh.tsv
wav_path=results/zipvoice_seedtts_zh
echo $wav_path
echo "-----Computing SIM-o-----"
python3 local/evaluate_sim.py \
--sv-model-path ${sv_model_path} \
--ssl-model-path ${wavlm_model_path} \
--eval-path ${wav_path} \
--test-list ${test_list}
echo "-----Computing WER-----"
python3 local/evaluate_wer_seedtts.py \
--test-list ${test_list} \
--wav-path ${wav_path} \
--lang "zh"
echo "-----Computing UTSMOS-----"
python3 local/evaluate_utmos.py \
--wav-path ${wav_path} \
--utmos-model-path ${utmos_model_path} \
--ssl-model-path ${wav2vec_model_path}
fi
if [ $start_stage -le 3 ] && [ $end_stage -ge 3 ]; then
echo "=====Evaluate for Librispeech test-clean======="
test_list=testset/test_librispeech_pc_test_clean.tsv
wav_path=results/zipvoice_librispeech_test_clean
echo $wav_path
echo "-----Computing SIM-o-----"
python3 local/evaluate_sim.py \
--sv-model-path ${sv_model_path} \
--ssl-model-path ${wavlm_model_path} \
--eval-path ${wav_path} \
--test-list ${test_list}
echo "-----Computing WER-----"
python3 local/evaluate_wer_hubert.py \
--test-list ${test_list} \
--wav-path ${wav_path} \
echo "-----Computing UTSMOS-----"
python3 local/evaluate_utmos.py \
--wav-path ${wav_path} \
--utmos-model-path ${utmos_model_path} \
--ssl-model-path ${wav2vec_model_path}
fi