From 93225563cdc02414d050abe5237c8346f6e37380 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 21 Feb 2025 02:36:41 +0000 Subject: [PATCH] update readme --- egs/wenetspeech4tts/TTS/README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/egs/wenetspeech4tts/TTS/README.md b/egs/wenetspeech4tts/TTS/README.md index cb41c67c7..8329ae948 100644 --- a/egs/wenetspeech4tts/TTS/README.md +++ b/egs/wenetspeech4tts/TTS/README.md @@ -140,9 +140,7 @@ bash local/compute_wer.sh $output_dir $manifest # F5-TTS-Semantic-Token -./f5-tts contains the code for training F5-TTS-Semantic-Token. We replaced the text tokens in F5-TTS with pretrained cosyvoice2 semantic tokens. - -We observed faster convergence and better prosody modeling results by doing this. +./f5-tts contains the code for training F5-TTS-Semantic-Token. We replaced the text tokens in F5-TTS with pretrained CosyVoice2 semantic tokens. During inference, we use the pretrained CosyVoice2 LLM to predict the semantic tokens for target audios. We observed that this approach leads to faster convergence and improved prosody modeling results. Generated samples and training logs of wenetspeech basic 7k hours data can be found [here](https://huggingface.co/yuekai/f5-tts-semantic-token-small-wenetspeech4tts-basic/tree/main).