mirror of https://github.com/k2-fsa/icefall.git
synced 2025-08-27 02:34:21 +00:00

Update README

This commit is contained in:
parent 8cb1cda040
commit 14a28edab6
0  .github/scripts/ljspeech/TTS/run-matcha.sh  (vendored, executable file)
3  egs/ljspeech/TTS/.gitignore  (vendored)
@@ -2,3 +2,6 @@ build
 core.c
 *.so
 my-output*
+*.wav
+*.onnx
+generator_v*
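
These new ignore patterns match the artifacts produced by the workflow documented below: generated `*.wav` files, exported `*.onnx` models, and the downloaded `generator_v*` vocoder checkpoints.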
@@ -101,3 +101,117 @@ export CUDA_VISIBLE_DEVICES=4,5,6,7
 # (Note it is killed after `epoch-820.pt`)
 ```
+
+# matcha
+
+[./matcha](./matcha) contains the code for training [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS).
+
+This recipe provides a Matcha-TTS model trained on the LJSpeech dataset.
+
+A pretrained model can be found [here](https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28).
+
+The training command is given below:
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 ./matcha/train.py \
+  --exp-dir ./matcha/exp-new-3/ \
+  --num-workers 4 \
+  --world-size 4 \
+  --num-epochs 4000 \
+  --max-duration 1000 \
+  --bucketing-sampler 1 \
+  --start-epoch 1
+```
+
+To run inference, use:
+
+```bash
+# Download the HiFiGAN vocoder. We use HiFiGAN v1 below. You can select from v1, v2, or v3.
+
+wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
+
+./matcha/inference.py \
+  --exp-dir ./matcha/exp-new-3 \
+  --epoch 4000 \
+  --tokens ./data/tokens.txt \
+  --vocoder ./generator_v1 \
+  --input-text "how are you doing?" \
+  --output-wav ./generated.wav
+```
+
+```bash
+soxi ./generated.wav
+```
+prints:
+```
+Input File     : './generated.wav'
+Channels       : 1
+Sample Rate    : 22050
+Precision      : 16-bit
+Duration       : 00:00:01.29 = 28416 samples ~ 96.6531 CDDA sectors
+File Size      : 56.9k
+Bit Rate       : 353k
+Sample Encoding: 16-bit Signed Integer PCM
+```
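
The same check can be done programmatically; below is a minimal sketch using the `soundfile` package (already a dependency of this recipe), assuming `./generated.wav` was produced by the command above:

```python
# A minimal sketch: verify the generated wave with soundfile instead of soxi.
import soundfile as sf

audio, sample_rate = sf.read("./generated.wav")
print("sample rate :", sample_rate)                   # expected: 22050
print("num samples :", audio.shape[0])                # e.g. 28416
print("duration (s):", audio.shape[0] / sample_rate)  # e.g. ~1.29
```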
+
+To export the checkpoint to ONNX:
+
+```bash
+# export the acoustic model to onnx
+
+./matcha/export_onnx.py \
+  --exp-dir ./matcha/exp-new-3 \
+  --epoch 4000 \
+  --tokens ./data/tokens.txt
+```
+
+The above command generates the following files:
+
+- model-steps-2.onnx
+- model-steps-3.onnx
+- model-steps-4.onnx
+- model-steps-5.onnx
+- model-steps-6.onnx
+
+where the 2 in `model-steps-2.onnx` means it uses 2 steps for the ODE solver.
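
More ODE-solver steps generally mean slower but typically higher-quality synthesis, so the exported variants trade speed for quality. As a quick sanity check, a minimal sketch like the one below (assuming `onnxruntime` is installed and it is run from the directory containing the exported files; the exact input/output names depend on what `export_onnx.py` writes) can inspect a model's signature:

```python
# A minimal sketch, assuming `onnxruntime` is installed. Prints the I/O
# signature and any custom metadata saved by the export script.
import onnxruntime as ort

session = ort.InferenceSession("./model-steps-2.onnx")
for node in session.get_inputs():
    print("input :", node.name, node.shape, node.type)
for node in session.get_outputs():
    print("output:", node.name, node.shape, node.type)
print("metadata:", session.get_modelmeta().custom_metadata_map)
```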
+
+To export the HiFiGAN vocoder to ONNX, please use:
+
+```bash
+wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
+wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
+wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v3
+
+python3 ./matcha/export_onnx_hifigan.py
+```
+
+The above command generates 3 files:
+
+- hifigan_v1.onnx
+- hifigan_v2.onnx
+- hifigan_v3.onnx
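
As with the acoustic models, the exported vocoders can be sanity-checked before use. A minimal sketch (file names taken from the list above; `onnxruntime` assumed installed):

```python
# A minimal sketch: confirm each exported vocoder loads and report its size.
import os
import onnxruntime as ort

for name in ["hifigan_v1.onnx", "hifigan_v2.onnx", "hifigan_v3.onnx"]:
    session = ort.InferenceSession(name)
    inputs = [i.name for i in session.get_inputs()]
    print(f"{name}: {os.path.getsize(name) / 1e6:.1f} MB, inputs={inputs}")
```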
+
+To use the generated ONNX files to generate speech from text, please run:
+
+```bash
+python3 ./matcha/onnx_pretrained.py \
+  --acoustic-model ./model-steps-6.onnx \
+  --vocoder ./hifigan_v2.onnx \
+  --tokens ./data/tokens.txt \
+  --input-text "how are you doing?" \
+  --output-wav ./generated-2.wav
+```
+
+```bash
+soxi ./generated-2.wav
+```
+prints:
+```
+Input File     : './generated-2.wav'
+Channels       : 1
+Sample Rate    : 22050
+Precision      : 16-bit
+Duration       : 00:00:01.25 = 27648 samples ~ 94.0408 CDDA sectors
+File Size      : 55.3k
+Bit Rate       : 353k
+Sample Encoding: 16-bit Signed Integer PCM
+```
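
The remaining hunks wire the documented commands into the code: `./matcha/inference.py` and `./matcha/onnx_pretrained.py` gain `--input-text` and `--output-wav` options (plus `--acoustic-model`, `--tokens`, and `--vocoder` for the ONNX runner), while hard-coded test sentences and debug printing are removed.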
@@ -6,14 +6,12 @@ import json
 import logging
 from pathlib import Path
 
-import numpy as np
 import soundfile as sf
 import torch
 from matcha.hifigan.config import v1, v2, v3
 from matcha.hifigan.denoiser import Denoiser
 from matcha.hifigan.models import Generator as HiFiGAN
 from tokenizer import Tokenizer
-from tqdm.auto import tqdm
 from train import get_model, get_params
 
 from icefall.checkpoint import load_checkpoint
@@ -64,6 +62,20 @@ def get_parser():
         help="""Path to vocabulary.""",
     )
 
+    parser.add_argument(
+        "--input-text",
+        type=str,
+        required=True,
+        help="The text to generate speech for",
+    )
+
+    parser.add_argument(
+        "--output-wav",
+        type=str,
+        required=True,
+        help="The filename of the wave to save the generated speech",
+    )
+
     return parser
 
 
@@ -93,13 +105,6 @@ def to_waveform(mel, vocoder, denoiser):
     return audio.cpu().squeeze()
 
 
-def save_to_folder(filename: str, output: dict, folder: str):
-    folder = Path(folder)
-    folder.mkdir(exist_ok=True, parents=True)
-    np.save(folder / f"{filename}", output["mel"].cpu().numpy())
-    sf.write(folder / f"{filename}.wav", output["waveform"], 22050, "PCM_24")
-
-
 def process_text(text: str, tokenizer):
     x = tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
     x = torch.tensor(x, dtype=torch.long)
@@ -120,7 +125,6 @@ def synthesise(
         spks=spks,
         length_scale=length_scale,
     )
-    print("output.shape", list(output.keys()), output["mel"].shape)
     # merge everything to one dict
     output.update({"start_t": start_t, **text_processed})
     return output
@@ -163,16 +167,6 @@ def main():
     vocoder = load_vocoder(params.vocoder)
     denoiser = Denoiser(vocoder, mode="zeros")
 
-    texts = [
-        "The Secret Service believed that it was very doubtful that any "
-        "President would ride regularly in a vehicle with a fixed top, even "
-        "though transparent.",
-        "Today as always, men fall into two groups: slaves and free men. "
-        "Whoever does not have two-thirds of his day for himself, is a slave, "
-        "whatever he may be: a statesman, a businessman, an official, or a "
-        "scholar.",
-    ]
-
     # Number of ODE Solver steps
     n_timesteps = 2
 
@@ -182,47 +176,17 @@ def main():
     # Sampling temperature
     temperature = 0.667
 
-    rtfs = []
-    rtfs_w = []
-    for i, text in enumerate(tqdm(texts)):
-        output = synthesise(
-            model=model,
-            tokenizer=tokenizer,
-            n_timesteps=n_timesteps,
-            text=text,
-            length_scale=length_scale,
-            temperature=temperature,
-        ) # , torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0))
-        output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
-
-        # Compute Real Time Factor (RTF) with HiFi-GAN
-        t = (dt.datetime.now() - output["start_t"]).total_seconds()
-        rtf_w = t * 22050 / (output["waveform"].shape[-1])
-
-        # Pretty print
-        print(f"{'*' * 53}")
-        print(f"Input text - {i}")
-        print(f"{'-' * 53}")
-        print(output["x_orig"])
-        print(f"{'*' * 53}")
-        print(f"Phonetised text - {i}")
-        print(f"{'-' * 53}")
-        print(output["x"])
-        print(f"{'*' * 53}")
-        print(f"RTF:\t\t{output['rtf']:.6f}")
-        print(f"RTF Waveform:\t{rtf_w:.6f}")
-        rtfs.append(output["rtf"])
-        rtfs_w.append(rtf_w)
-
-        # Save the generated waveform
-        save_to_folder(i, output, folder=f"./my-output-{params.epoch}")
-
-    print(f"Number of ODE steps: {n_timesteps}")
-    print(f"Mean RTF:\t\t\t\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}")
-    print(
-        "Mean RTF Waveform "
-        f"(incl. vocoder):\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}"
-    )
+    output = synthesise(
+        model=model,
+        tokenizer=tokenizer,
+        n_timesteps=n_timesteps,
+        text=params.input_text,
+        length_scale=length_scale,
+        temperature=temperature,
+    )
+    output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
+
+    sf.write(params.output_wav, output["waveform"], 22050, "PCM_16")
 
 
 if __name__ == "__main__":
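
With these changes, `inference.py` synthesises a single utterance taken from `--input-text` and writes it to `--output-wav`, instead of looping over hard-coded sentences and dumping per-utterance RTF statistics.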
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import argparse
 import datetime as dt
 import logging
 
@@ -9,6 +10,49 @@ from inference import load_vocoder
 from tokenizer import Tokenizer
 
 
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--acoustic-model",
+        type=str,
+        required=True,
+        help="Path to the acoustic model",
+    )
+
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        required=True,
+        help="Path to the tokens.txt",
+    )
+
+    parser.add_argument(
+        "--vocoder",
+        type=str,
+        required=True,
+        help="Path to the vocoder",
+    )
+
+    parser.add_argument(
+        "--input-text",
+        type=str,
+        required=True,
+        help="The text to generate speech for",
+    )
+
+    parser.add_argument(
+        "--output-wav",
+        type=str,
+        required=True,
+        help="The filename of the wave to save the generated speech",
+    )
+
+    return parser
+
+
 class OnnxHifiGANModel:
     def __init__(
         self,
@@ -98,10 +142,12 @@ class OnnxModel:
 
 @torch.no_grad()
 def main():
-    model = OnnxModel("./model-steps-6.onnx")
-    vocoder = OnnxHifiGANModel("./hifigan_v1.onnx")
-    text = "Today as always, men fall into two groups: slaves and free men."
-    text += "hello, how are you doing?"
+    params = get_parser().parse_args()
+    logging.info(vars(params))
+
+    model = OnnxModel(params.acoustic_model)
+    vocoder = OnnxHifiGANModel(params.vocoder)
+    text = params.input_text
     x = model.tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
     x = torch.tensor(x, dtype=torch.int64)
 
@@ -109,9 +155,6 @@ def main():
     mel = model(x)
     end_t = dt.datetime.now()
 
-    for i in range(3):
-        audio = vocoder(mel)
-
     start_t2 = dt.datetime.now()
     audio = vocoder(mel)
     end_t2 = dt.datetime.now()
@@ -121,13 +164,14 @@ def main():
 
     t = (end_t - start_t).total_seconds()
     t2 = (end_t2 - start_t2).total_seconds()
-    rtf = t * 22050 / audio.shape[-1]
-    rtf2 = t2 * 22050 / audio.shape[-1]
-    print("RTF", rtf)
-    print("RTF", rtf2)
+    rtf_am = t * 22050 / audio.shape[-1]
+    rtf_vocoder = t2 * 22050 / audio.shape[-1]
+    print("RTF for acoustic model ", rtf_am)
+    print("RTF for vocoder", rtf_vocoder)
 
     # skip denoiser
-    sf.write("onnx2.wav", audio, 22050, "PCM_16")
+    sf.write(params.output_wav, audio, 22050, "PCM_16")
+    logging.info(f"Saved to {params.output_wav}")
 
 
 if __name__ == "__main__":
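
The renamed variables make the real-time-factor (RTF) report explicit: `t * 22050 / audio.shape[-1]` is elapsed seconds divided by the duration of the generated audio (`num_samples / sample_rate`), so values below 1 mean faster-than-real-time synthesis. A hypothetical standalone helper for the same computation:

```python
# Hypothetical helper mirroring the RTF computation in the diff above:
# RTF = elapsed_seconds / (num_samples / sample_rate).
def real_time_factor(
    elapsed_seconds: float, num_samples: int, sample_rate: int = 22050
) -> float:
    return elapsed_seconds * sample_rate / num_samples

# e.g. 0.5 s of compute for 28416 samples (~1.29 s of audio) -> RTF ~ 0.39
print(real_time_factor(0.5, 28416))
```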
@@ -34,10 +34,10 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
     log "monotonic_align lib for vits already built"
   fi
 
-  if [ ! -f ./matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then
-    pushd matcha/utils/monotonic_align
+  if [ ! -f ./matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then
+    pushd matcha/monotonic_align
     python3 setup.py build_ext --inplace
-    mv -v matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./
+    mv -v matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./
     rm -rf matcha
     rm -rf build
     rm core.c