From 14a28edab6e032e67caa1770da149aa0a72ef083 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Mon, 28 Oct 2024 22:49:14 +0800
Subject: [PATCH] Update README

---
 .github/scripts/ljspeech/TTS/run-matcha.sh |   0
 egs/ljspeech/TTS/.gitignore                |   3 +
 egs/ljspeech/TTS/README.md                 | 114 +++++++++++++++++++++
 egs/ljspeech/TTS/matcha/inference.py       |  84 +++++----------
 egs/ljspeech/TTS/matcha/onnx_pretrained.py |  68 +++++++++---
 egs/ljspeech/TTS/prepare.sh                |   6 +-
 6 files changed, 200 insertions(+), 75 deletions(-)
 create mode 100755 .github/scripts/ljspeech/TTS/run-matcha.sh

diff --git a/.github/scripts/ljspeech/TTS/run-matcha.sh b/.github/scripts/ljspeech/TTS/run-matcha.sh
new file mode 100755
index 000000000..e69de29bb
diff --git a/egs/ljspeech/TTS/.gitignore b/egs/ljspeech/TTS/.gitignore
index 1eef06a28..d5c19797a 100644
--- a/egs/ljspeech/TTS/.gitignore
+++ b/egs/ljspeech/TTS/.gitignore
@@ -2,3 +2,6 @@ build
 core.c
 *.so
 my-output*
+*.wav
+*.onnx
+generator_v*
diff --git a/egs/ljspeech/TTS/README.md b/egs/ljspeech/TTS/README.md
index 7b112c12c..fe613024a 100644
--- a/egs/ljspeech/TTS/README.md
+++ b/egs/ljspeech/TTS/README.md
@@ -101,3 +101,117 @@ export CUDA_VISIBLE_DEVICES=4,5,6,7
 # (Note it is killed after `epoch-820.pt`)
 ```
+# matcha
+
+[./matcha](./matcha) contains the code for training [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS).
+
+This recipe provides a Matcha-TTS model trained on the LJSpeech dataset.
+
+The pretrained model can be found [here](https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28).
+
+The training command is given below:
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 ./matcha/train.py \
+  --exp-dir ./matcha/exp-new-3/ \
+  --num-workers 4 \
+  --world-size 4 \
+  --num-epochs 4000 \
+  --max-duration 1000 \
+  --bucketing-sampler 1 \
+  --start-epoch 1
+```
+
+To run inference, use:
+
+```bash
+# Download the HiFi-GAN vocoder. We use v1 below; you can select from v1, v2, or v3.
+
+wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
+
+./matcha/inference.py \
+  --exp-dir ./matcha/exp-new-3 \
+  --epoch 4000 \
+  --tokens ./data/tokens.txt \
+  --vocoder ./generator_v1 \
+  --input-text "how are you doing?" \
+  --output-wav ./generated.wav
+```
+
+```bash
+soxi ./generated.wav
+```
+prints:
+```
+Input File     : './generated.wav'
+Channels       : 1
+Sample Rate    : 22050
+Precision      : 16-bit
+Duration       : 00:00:01.29 = 28416 samples ~ 96.6531 CDDA sectors
+File Size      : 56.9k
+Bit Rate       : 353k
+Sample Encoding: 16-bit Signed Integer PCM
+```
+
+To export the checkpoint to onnx, run:
+
+```bash
+# Export the acoustic model to onnx
+
+./matcha/export_onnx.py \
+  --exp-dir ./matcha/exp-new-3 \
+  --epoch 4000 \
+  --tokens ./data/tokens.txt
+```
+
+The above command generates the following files:
+
+ - model-steps-2.onnx
+ - model-steps-3.onnx
+ - model-steps-4.onnx
+ - model-steps-5.onnx
+ - model-steps-6.onnx
+
+where the 2 in `model-steps-2.onnx` means it uses 2 steps for the ODE solver.
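+
+More ODE steps generally trade extra inference time for (usually) slightly
+better mel quality, so pick the file that matches your latency budget.
+
+If you want to double-check what an exported file expects before wiring it
+into the rest of the pipeline, one option is to inspect it with
+`onnxruntime`. The snippet below is only a minimal sketch: it assumes
+`onnxruntime` is installed and that `model-steps-2.onnx` is in the current
+directory, and the metadata it prints depends on what `export_onnx.py`
+actually stores.
+
+```python
+import onnxruntime as ort
+
+# Load the exported acoustic model on CPU.
+session = ort.InferenceSession(
+    "model-steps-2.onnx", providers=["CPUExecutionProvider"]
+)
+
+# Model-level metadata written at export time (may be empty).
+print(session.get_modelmeta().custom_metadata_map)
+
+# Names, shapes, and types of the model inputs and outputs.
+for i in session.get_inputs():
+    print("input :", i.name, i.shape, i.type)
+for o in session.get_outputs():
+    print("output:", o.name, o.shape, o.type)
+```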
+ + +To export the Hifigan vocoder to onnx, please use: + +```bash +wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1 +wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2 +wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v3 + +python3 ./matcha/export_onnx_hifigan.py +``` + +The above command generates 3 files: + + - hifigan_v1.onnx + - hifigan_v2.onnx + - hifigan_v3.onnx + +To use the generated onnx files to generate speech from text, please run: + +```bash +python3 ./matcha/onnx_pretrained.py \ + --acoustic-model ./model-steps-6.onnx \ + --vocoder ./hifigan_v2.onnx \ + --tokens ./data/tokens.txt \ + --input-text "how are you doing?" \ + --output-wav ./generated-2.wav +``` + +```bash +soxi ./generated-2.wav + +Input File : './generated-2.wav' +Channels : 1 +Sample Rate : 22050 +Precision : 16-bit +Duration : 00:00:01.25 = 27648 samples ~ 94.0408 CDDA sectors +File Size : 55.3k +Bit Rate : 353k +Sample Encoding: 16-bit Signed Integer PCM +``` diff --git a/egs/ljspeech/TTS/matcha/inference.py b/egs/ljspeech/TTS/matcha/inference.py index 8fc0ec3ac..1189160f6 100755 --- a/egs/ljspeech/TTS/matcha/inference.py +++ b/egs/ljspeech/TTS/matcha/inference.py @@ -6,14 +6,12 @@ import json import logging from pathlib import Path -import numpy as np import soundfile as sf import torch from matcha.hifigan.config import v1, v2, v3 from matcha.hifigan.denoiser import Denoiser from matcha.hifigan.models import Generator as HiFiGAN from tokenizer import Tokenizer -from tqdm.auto import tqdm from train import get_model, get_params from icefall.checkpoint import load_checkpoint @@ -64,6 +62,20 @@ def get_parser(): help="""Path to vocabulary.""", ) + parser.add_argument( + "--input-text", + type=str, + required=True, + help="The text to generate speech for", + ) + + parser.add_argument( + "--output-wav", + type=str, + required=True, + help="The filename of the wave to save the generated speech", + ) + return parser @@ -93,13 +105,6 @@ def to_waveform(mel, vocoder, denoiser): return audio.cpu().squeeze() -def save_to_folder(filename: str, output: dict, folder: str): - folder = Path(folder) - folder.mkdir(exist_ok=True, parents=True) - np.save(folder / f"{filename}", output["mel"].cpu().numpy()) - sf.write(folder / f"{filename}.wav", output["waveform"], 22050, "PCM_24") - - def process_text(text: str, tokenizer): x = tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True) x = torch.tensor(x, dtype=torch.long) @@ -120,7 +125,6 @@ def synthesise( spks=spks, length_scale=length_scale, ) - print("output.shape", list(output.keys()), output["mel"].shape) # merge everything to one dict output.update({"start_t": start_t, **text_processed}) return output @@ -163,16 +167,6 @@ def main(): vocoder = load_vocoder(params.vocoder) denoiser = Denoiser(vocoder, mode="zeros") - texts = [ - "The Secret Service believed that it was very doubtful that any " - "President would ride regularly in a vehicle with a fixed top, even " - "though transparent.", - "Today as always, men fall into two groups: slaves and free men. 
" - "Whoever does not have two-thirds of his day for himself, is a slave, " - "whatever he may be: a statesman, a businessman, an official, or a " - "scholar.", - ] - # Number of ODE Solver steps n_timesteps = 2 @@ -182,47 +176,17 @@ def main(): # Sampling temperature temperature = 0.667 - rtfs = [] - rtfs_w = [] - for i, text in enumerate(tqdm(texts)): - output = synthesise( - model=model, - tokenizer=tokenizer, - n_timesteps=n_timesteps, - text=text, - length_scale=length_scale, - temperature=temperature, - ) # , torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0)) - output["waveform"] = to_waveform(output["mel"], vocoder, denoiser) - - # Compute Real Time Factor (RTF) with HiFi-GAN - t = (dt.datetime.now() - output["start_t"]).total_seconds() - rtf_w = t * 22050 / (output["waveform"].shape[-1]) - - # Pretty print - print(f"{'*' * 53}") - print(f"Input text - {i}") - print(f"{'-' * 53}") - print(output["x_orig"]) - print(f"{'*' * 53}") - print(f"Phonetised text - {i}") - print(f"{'-' * 53}") - print(output["x"]) - print(f"{'*' * 53}") - print(f"RTF:\t\t{output['rtf']:.6f}") - print(f"RTF Waveform:\t{rtf_w:.6f}") - rtfs.append(output["rtf"]) - rtfs_w.append(rtf_w) - - # Save the generated waveform - save_to_folder(i, output, folder=f"./my-output-{params.epoch}") - - print(f"Number of ODE steps: {n_timesteps}") - print(f"Mean RTF:\t\t\t\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}") - print( - "Mean RTF Waveform " - f"(incl. vocoder):\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}" + output = synthesise( + model=model, + tokenizer=tokenizer, + n_timesteps=n_timesteps, + text=params.input_text, + length_scale=length_scale, + temperature=temperature, ) + output["waveform"] = to_waveform(output["mel"], vocoder, denoiser) + + sf.write(params.output_wav, output["waveform"], 22050, "PCM_16") if __name__ == "__main__": diff --git a/egs/ljspeech/TTS/matcha/onnx_pretrained.py b/egs/ljspeech/TTS/matcha/onnx_pretrained.py index 3953d5d0a..6a37f3c17 100755 --- a/egs/ljspeech/TTS/matcha/onnx_pretrained.py +++ b/egs/ljspeech/TTS/matcha/onnx_pretrained.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import argparse import datetime as dt import logging @@ -9,6 +10,49 @@ from inference import load_vocoder from tokenizer import Tokenizer +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--acoustic-model", + type=str, + required=True, + help="Path to the acoustic model", + ) + + parser.add_argument( + "--tokens", + type=str, + required=True, + help="Path to the tokens.txt", + ) + + parser.add_argument( + "--vocoder", + type=str, + required=True, + help="Path to the vocoder", + ) + + parser.add_argument( + "--input-text", + type=str, + required=True, + help="The text to generate speech for", + ) + + parser.add_argument( + "--output-wav", + type=str, + required=True, + help="The filename of the wave to save the generated speech", + ) + + return parser + + class OnnxHifiGANModel: def __init__( self, @@ -98,10 +142,12 @@ class OnnxModel: @torch.no_grad() def main(): - model = OnnxModel("./model-steps-6.onnx") - vocoder = OnnxHifiGANModel("./hifigan_v1.onnx") - text = "Today as always, men fall into two groups: slaves and free men." - text += "hello, how are you doing?" 
+ params = get_parser().parse_args() + logging.info(vars(params)) + + model = OnnxModel(params.acoustic_model) + vocoder = OnnxHifiGANModel(params.vocoder) + text = params.input_text x = model.tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True) x = torch.tensor(x, dtype=torch.int64) @@ -109,9 +155,6 @@ def main(): mel = model(x) end_t = dt.datetime.now() - for i in range(3): - audio = vocoder(mel) - start_t2 = dt.datetime.now() audio = vocoder(mel) end_t2 = dt.datetime.now() @@ -121,13 +164,14 @@ def main(): t = (end_t - start_t).total_seconds() t2 = (end_t2 - start_t2).total_seconds() - rtf = t * 22050 / audio.shape[-1] - rtf2 = t2 * 22050 / audio.shape[-1] - print("RTF", rtf) - print("RTF", rtf2) + rtf_am = t * 22050 / audio.shape[-1] + rtf_vocoder = t2 * 22050 / audio.shape[-1] + print("RTF for acoustic model ", rtf_am) + print("RTF for vocoder", rtf_vocoder) # skip denoiser - sf.write("onnx2.wav", audio, 22050, "PCM_16") + sf.write(params.output_wav, audio, 22050, "PCM_16") + logging.info(f"Saved to {params.output_wav}") if __name__ == "__main__": diff --git a/egs/ljspeech/TTS/prepare.sh b/egs/ljspeech/TTS/prepare.sh index b140e6f01..dfc2b3540 100755 --- a/egs/ljspeech/TTS/prepare.sh +++ b/egs/ljspeech/TTS/prepare.sh @@ -34,10 +34,10 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then log "monotonic_align lib for vits already built" fi - if [ ! -f ./matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then - pushd matcha/utils/monotonic_align + if [ ! -f ./matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then + pushd matcha/monotonic_align python3 setup.py build_ext --inplace - mv -v matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./ + mv -v matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./ rm -rf matcha rm -rf build rm core.c