diff --git a/egs/wenetspeech4tts/TTS/f5-tts/requirements.txt b/egs/wenetspeech4tts/TTS/f5-tts/requirements.txt new file mode 100644 index 000000000..63f1e237c --- /dev/null +++ b/egs/wenetspeech4tts/TTS/f5-tts/requirements.txt @@ -0,0 +1,36 @@ +# F5-TTS +accelerate>=0.33.0 +bitsandbytes>0.37.0 +cached_path +click +datasets +ema_pytorch>=0.5.2 +gradio>=3.45.2 +hydra-core>=1.3.0 +jieba +librosa +matplotlib +numpy<=1.26.4 +pydub +pypinyin +safetensors +soundfile +tomli +torch>=2.0.0 +torchaudio>=2.0.0 +torchdiffeq +tqdm>=4.65.0 +transformers +x_transformers>=1.31.14 + +# icefall +kaldialign +lhotse +tensorboard +bigvganinference +sentencepiece +sherpa-onnx +k2 + +# semantic experiment +s3tokenizer diff --git a/egs/wenetspeech4tts/TTS/prepare.sh b/egs/wenetspeech4tts/TTS/prepare.sh index 7b7cd24b7..f1daa0e62 100755 --- a/egs/wenetspeech4tts/TTS/prepare.sh +++ b/egs/wenetspeech4tts/TTS/prepare.sh @@ -142,7 +142,6 @@ fi if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then log "Stage 7: Extract cosyvoice2 FSQ token (used by ./f5-tts semantic token experiment)" - pip install s3tokenizer split_name=("valid" "test" "train") for split in "${split_name[@]}"; do echo "Processing $split"