Update README

Fangjun Kuang 2024-10-28 22:49:14 +08:00
parent 8cb1cda040
commit 14a28edab6
6 changed files with 200 additions and 75 deletions

.github/scripts/ljspeech/TTS/run-matcha.sh vendored Executable file

@@ -2,3 +2,6 @@ build
core.c
*.so
my-output*
*.wav
*.onnx
generator_v*

@@ -101,3 +101,117 @@ export CUDA_VISIBLE_DEVICES=4,5,6,7
# (Note it is killed after `epoch-820.pt`)
```
# matcha
[./matcha](./matcha) contains the code for training [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS).
This recipe provides a Matcha-TTS model trained on the LJSpeech dataset.
A pretrained model can be found [here](https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28).
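If you prefer to fetch the pretrained model programmatically instead of cloning the Hugging Face repo, a minimal sketch using `huggingface_hub` (an extra dependency, not required by the recipe) could look like this:
```python
# Hypothetical convenience snippet: download the pretrained Matcha-TTS
# checkpoints from Hugging Face. Requires `pip install huggingface_hub`.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28"
)
print("Downloaded to", local_dir)
```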
The training command is given below:
```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3
python3 ./matcha/train.py \
--exp-dir ./matcha/exp-new-3/ \
--num-workers 4 \
--world-size 4 \
--num-epochs 4000 \
--max-duration 1000 \
--bucketing-sampler 1 \
--start-epoch 1
```
To run inference, use:
```bash
# Download the HiFi-GAN vocoder. We use HiFi-GAN v1 below; you can select from v1, v2, or v3.
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
./matcha/inference.py \
--exp-dir ./matcha/exp-new-3 \
--epoch 4000 \
--tokens ./data/tokens.txt \
--vocoder ./generator_v1 \
--input-text "how are you doing?"
--output-wav ./generated.wav
```
```bash
soxi ./generated.wav
```
prints:
```
Input File : './generated.wav'
Channels : 1
Sample Rate : 22050
Precision : 16-bit
Duration : 00:00:01.29 = 28416 samples ~ 96.6531 CDDA sectors
File Size : 56.9k
Bit Rate : 353k
Sample Encoding: 16-bit Signed Integer PCM
```
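If you prefer checking the generated file from Python, a small sketch using `soundfile` (already used by this recipe) reports the same information as `soxi`:
```python
# Minimal sanity check of the generated wave, analogous to the soxi output above.
import soundfile as sf

audio, sample_rate = sf.read("./generated.wav")
print("sample rate:", sample_rate)               # expected: 22050
print("num samples:", len(audio))
print("duration (s):", len(audio) / sample_rate)
```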
To export the checkpoint to onnx:
```bash
# export the acoustic model to onnx
./matcha/export_onnx.py \
--exp-dir ./matcha/exp-new-3 \
--epoch 4000 \
--tokens ./data/tokens.txt
```
The above command generates the following files:
- model-steps-2.onnx
- model-steps-3.onnx
- model-steps-4.onnx
- model-steps-5.onnx
- model-steps-6.onnx
where the `2` in `model-steps-2.onnx` means the model uses 2 steps for the ODE solver.
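More ODE steps generally give better quality at a higher computational cost. To double-check the exported models, a minimal sketch with `onnxruntime` (assumed to be installed) simply lists each model's inputs and outputs; the exact tensor names are whatever `export_onnx.py` produced:
```python
# Inspect the exported acoustic models. The input/output names printed here
# depend on export_onnx.py; this only verifies that the files load correctly.
import onnxruntime as ort

for num_steps in (2, 3, 4, 5, 6):
    filename = f"model-steps-{num_steps}.onnx"
    sess = ort.InferenceSession(filename, providers=["CPUExecutionProvider"])
    print(filename)
    print("  inputs :", [i.name for i in sess.get_inputs()])
    print("  outputs:", [o.name for o in sess.get_outputs()])
```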
To export the HiFi-GAN vocoder to onnx, please use:
```bash
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v3
python3 ./matcha/export_onnx_hifigan.py
```
The above command generates 3 files:
- hifigan_v1.onnx
- hifigan_v2.onnx
- hifigan_v3.onnx
To use the generated onnx files to generate speech from text, please run:
```bash
python3 ./matcha/onnx_pretrained.py \
--acoustic-model ./model-steps-6.onnx \
--vocoder ./hifigan_v2.onnx \
--tokens ./data/tokens.txt \
--input-text "how are you doing?" \
--output-wav ./generated-2.wav
```
```bash
soxi ./generated-2.wav
Input File : './generated-2.wav'
Channels : 1
Sample Rate : 22050
Precision : 16-bit
Duration : 00:00:01.25 = 27648 samples ~ 94.0408 CDDA sectors
File Size : 55.3k
Bit Rate : 353k
Sample Encoding: 16-bit Signed Integer PCM
```

@@ -6,14 +6,12 @@ import json
import logging
from pathlib import Path
import numpy as np
import soundfile as sf
import torch
from matcha.hifigan.config import v1, v2, v3
from matcha.hifigan.denoiser import Denoiser
from matcha.hifigan.models import Generator as HiFiGAN
from tokenizer import Tokenizer
from tqdm.auto import tqdm
from train import get_model, get_params
from icefall.checkpoint import load_checkpoint
@@ -64,6 +62,20 @@ def get_parser():
help="""Path to vocabulary.""",
)
parser.add_argument(
"--input-text",
type=str,
required=True,
help="The text to generate speech for",
)
parser.add_argument(
"--output-wav",
type=str,
required=True,
help="The filename of the wave to save the generated speech",
)
return parser
@@ -93,13 +105,6 @@ def to_waveform(mel, vocoder, denoiser):
return audio.cpu().squeeze()
def save_to_folder(filename: str, output: dict, folder: str):
folder = Path(folder)
folder.mkdir(exist_ok=True, parents=True)
np.save(folder / f"{filename}", output["mel"].cpu().numpy())
sf.write(folder / f"{filename}.wav", output["waveform"], 22050, "PCM_24")
def process_text(text: str, tokenizer):
x = tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
x = torch.tensor(x, dtype=torch.long)
@@ -120,7 +125,6 @@ def synthesise(
spks=spks,
length_scale=length_scale,
)
print("output.shape", list(output.keys()), output["mel"].shape)
# merge everything to one dict
output.update({"start_t": start_t, **text_processed})
return output
@@ -163,16 +167,6 @@ def main():
vocoder = load_vocoder(params.vocoder)
denoiser = Denoiser(vocoder, mode="zeros")
texts = [
"The Secret Service believed that it was very doubtful that any "
"President would ride regularly in a vehicle with a fixed top, even "
"though transparent.",
"Today as always, men fall into two groups: slaves and free men. "
"Whoever does not have two-thirds of his day for himself, is a slave, "
"whatever he may be: a statesman, a businessman, an official, or a "
"scholar.",
]
# Number of ODE Solver steps
n_timesteps = 2
@@ -182,47 +176,17 @@ def main():
# Sampling temperature
temperature = 0.667
rtfs = []
rtfs_w = []
for i, text in enumerate(tqdm(texts)):
output = synthesise(
model=model,
tokenizer=tokenizer,
n_timesteps=n_timesteps,
text=text,
length_scale=length_scale,
temperature=temperature,
) # , torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0))
output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
# Compute Real Time Factor (RTF) with HiFi-GAN
t = (dt.datetime.now() - output["start_t"]).total_seconds()
rtf_w = t * 22050 / (output["waveform"].shape[-1])
# Pretty print
print(f"{'*' * 53}")
print(f"Input text - {i}")
print(f"{'-' * 53}")
print(output["x_orig"])
print(f"{'*' * 53}")
print(f"Phonetised text - {i}")
print(f"{'-' * 53}")
print(output["x"])
print(f"{'*' * 53}")
print(f"RTF:\t\t{output['rtf']:.6f}")
print(f"RTF Waveform:\t{rtf_w:.6f}")
rtfs.append(output["rtf"])
rtfs_w.append(rtf_w)
# Save the generated waveform
save_to_folder(i, output, folder=f"./my-output-{params.epoch}")
print(f"Number of ODE steps: {n_timesteps}")
print(f"Mean RTF:\t\t\t\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}")
print(
"Mean RTF Waveform "
f"(incl. vocoder):\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}"
output = synthesise(
model=model,
tokenizer=tokenizer,
n_timesteps=n_timesteps,
text=params.input_text,
length_scale=length_scale,
temperature=temperature,
)
output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
sf.write(params.output_wav, output["waveform"], 22050, "PCM_16")
if __name__ == "__main__":

@@ -1,4 +1,5 @@
#!/usr/bin/env python3
import argparse
import datetime as dt
import logging
@@ -9,6 +10,49 @@ from inference import load_vocoder
from tokenizer import Tokenizer
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--acoustic-model",
type=str,
required=True,
help="Path to the acoustic model",
)
parser.add_argument(
"--tokens",
type=str,
required=True,
help="Path to the tokens.txt",
)
parser.add_argument(
"--vocoder",
type=str,
required=True,
help="Path to the vocoder",
)
parser.add_argument(
"--input-text",
type=str,
required=True,
help="The text to generate speech for",
)
parser.add_argument(
"--output-wav",
type=str,
required=True,
help="The filename of the wave to save the generated speech",
)
return parser
class OnnxHifiGANModel:
def __init__(
self,
@@ -98,10 +142,12 @@ class OnnxModel:
@torch.no_grad()
def main():
model = OnnxModel("./model-steps-6.onnx")
vocoder = OnnxHifiGANModel("./hifigan_v1.onnx")
text = "Today as always, men fall into two groups: slaves and free men."
text += "hello, how are you doing?"
params = get_parser().parse_args()
logging.info(vars(params))
model = OnnxModel(params.acoustic_model)
vocoder = OnnxHifiGANModel(params.vocoder)
text = params.input_text
x = model.tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
x = torch.tensor(x, dtype=torch.int64)
@@ -109,9 +155,6 @@ def main():
mel = model(x)
end_t = dt.datetime.now()
for i in range(3):
audio = vocoder(mel)
start_t2 = dt.datetime.now()
audio = vocoder(mel)
end_t2 = dt.datetime.now()
@@ -121,13 +164,14 @@ def main():
t = (end_t - start_t).total_seconds()
t2 = (end_t2 - start_t2).total_seconds()
rtf = t * 22050 / audio.shape[-1]
rtf2 = t2 * 22050 / audio.shape[-1]
print("RTF", rtf)
print("RTF", rtf2)
rtf_am = t * 22050 / audio.shape[-1]
rtf_vocoder = t2 * 22050 / audio.shape[-1]
print("RTF for acoustic model ", rtf_am)
print("RTF for vocoder", rtf_vocoder)
# skip denoiser
sf.write("onnx2.wav", audio, 22050, "PCM_16")
sf.write(params.output_wav, audio, 22050, "PCM_16")
logging.info(f"Saved to {params.output_wav}")
if __name__ == "__main__":

@@ -34,10 +34,10 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
log "monotonic_align lib for vits already built"
fi
if [ ! -f ./matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then
pushd matcha/utils/monotonic_align
if [ ! -f ./matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then
pushd matcha/monotonic_align
python3 setup.py build_ext --inplace
mv -v matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./
mv -v matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./
rm -rf matcha
rm -rf build
rm core.c