mirror of https://github.com/k2-fsa/icefall.git
synced 2025-08-26 18:24:18 +00:00
Update README
This commit is contained in:
parent 8cb1cda040
commit 14a28edab6
Changed files:
0  .github/scripts/ljspeech/TTS/run-matcha.sh (vendored, executable file)
3  egs/ljspeech/TTS/.gitignore (vendored)
@@ -2,3 +2,6 @@ build
 core.c
 *.so
 my-output*
+*.wav
+*.onnx
+generator_v*

@@ -101,3 +101,117 @@ export CUDA_VISIBLE_DEVICES=4,5,6,7

# (Note it is killed after `epoch-820.pt`)
```

# matcha

[./matcha](./matcha) contains the code for training [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS).

This recipe provides a Matcha-TTS model trained on the LJSpeech dataset.

The pretrained model can be found [here](https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28).

The training command is given below:

```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3

python3 ./matcha/train.py \
  --exp-dir ./matcha/exp-new-3/ \
  --num-workers 4 \
  --world-size 4 \
  --num-epochs 4000 \
  --max-duration 1000 \
  --bucketing-sampler 1 \
  --start-epoch 1
```

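If the run is interrupted, it can be resumed from the last saved checkpoint via `--start-epoch`. A hypothetical sketch, assuming the run previously saved `./matcha/exp-new-3/epoch-100.pt` (the epoch number is illustrative):

```bash
# Resume from the checkpoint of epoch 100 (assumed to exist in exp-dir)
export CUDA_VISIBLE_DEVICES=0,1,2,3

python3 ./matcha/train.py \
  --exp-dir ./matcha/exp-new-3/ \
  --num-workers 4 \
  --world-size 4 \
  --num-epochs 4000 \
  --max-duration 1000 \
  --bucketing-sampler 1 \
  --start-epoch 101
```
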
To run inference, use:

```bash
# Download the HiFi-GAN vocoder. We use HiFi-GAN v1 below. You can select from v1, v2, or v3.

wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1

./matcha/inference.py \
  --exp-dir ./matcha/exp-new-3 \
  --epoch 4000 \
  --tokens ./data/tokens.txt \
  --vocoder ./generator_v1 \
  --input-text "how are you doing?" \
  --output-wav ./generated.wav
```

```bash
soxi ./generated.wav
```

prints:

```
Input File     : './generated.wav'
Channels       : 1
Sample Rate    : 22050
Precision      : 16-bit
Duration       : 00:00:01.29 = 28416 samples ~ 96.6531 CDDA sectors
File Size      : 56.9k
Bit Rate       : 353k
Sample Encoding: 16-bit Signed Integer PCM
```

To export the checkpoint to onnx:

```bash
# export the acoustic model to onnx

./matcha/export_onnx.py \
  --exp-dir ./matcha/exp-new-3 \
  --epoch 4000 \
  --tokens ./data/tokens.txt
```

The above command generates the following files:

- model-steps-2.onnx
- model-steps-3.onnx
- model-steps-4.onnx
- model-steps-5.onnx
- model-steps-6.onnx

where the 2 in `model-steps-2.onnx` means it uses 2 steps for the ODE solver.

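To compare the exported models by ear, a minimal sketch that synthesizes the same sentence once per step count with `./matcha/onnx_pretrained.py` (described below; it assumes `hifigan_v1.onnx` from the next step has already been exported):

```bash
# Synthesize one wav per exported ODE step count
for steps in 2 3 4 5 6; do
  python3 ./matcha/onnx_pretrained.py \
    --acoustic-model ./model-steps-$steps.onnx \
    --vocoder ./hifigan_v1.onnx \
    --tokens ./data/tokens.txt \
    --input-text "how are you doing?" \
    --output-wav ./generated-steps-$steps.wav
done
```
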
To export the HiFi-GAN vocoder to onnx, please use:

```bash
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v3

python3 ./matcha/export_onnx_hifigan.py
```

The above command generates 3 files:

- hifigan_v1.onnx
- hifigan_v2.onnx
- hifigan_v3.onnx

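As a quick sanity check, the exported files can be opened with onnxruntime to print their input/output metadata; a minimal sketch, assuming the `onnxruntime` Python package is installed:

```bash
python3 - <<'EOF'
# Load each exported vocoder and print its input/output names and shapes
import onnxruntime as ort

for v in ("v1", "v2", "v3"):
    sess = ort.InferenceSession(f"hifigan_{v}.onnx", providers=["CPUExecutionProvider"])
    print(v, [(i.name, i.shape) for i in sess.get_inputs()])
    print(v, [(o.name, o.shape) for o in sess.get_outputs()])
EOF
```
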
To use the generated onnx files to generate speech from text, please run:

```bash
python3 ./matcha/onnx_pretrained.py \
  --acoustic-model ./model-steps-6.onnx \
  --vocoder ./hifigan_v2.onnx \
  --tokens ./data/tokens.txt \
  --input-text "how are you doing?" \
  --output-wav ./generated-2.wav
```

```bash
soxi ./generated-2.wav
```

prints:

```
Input File     : './generated-2.wav'
Channels       : 1
Sample Rate    : 22050
Precision      : 16-bit
Duration       : 00:00:01.25 = 27648 samples ~ 94.0408 CDDA sectors
File Size      : 55.3k
Bit Rate       : 353k
Sample Encoding: 16-bit Signed Integer PCM
```

@@ -6,14 +6,12 @@ import json
 import logging
 from pathlib import Path

-import numpy as np
 import soundfile as sf
 import torch
 from matcha.hifigan.config import v1, v2, v3
 from matcha.hifigan.denoiser import Denoiser
 from matcha.hifigan.models import Generator as HiFiGAN
 from tokenizer import Tokenizer
-from tqdm.auto import tqdm
 from train import get_model, get_params

 from icefall.checkpoint import load_checkpoint

@@ -64,6 +62,20 @@ def get_parser():
         help="""Path to vocabulary.""",
     )

+    parser.add_argument(
+        "--input-text",
+        type=str,
+        required=True,
+        help="The text to generate speech for",
+    )
+
+    parser.add_argument(
+        "--output-wav",
+        type=str,
+        required=True,
+        help="The filename of the wave to save the generated speech",
+    )
+
     return parser

@@ -93,13 +105,6 @@ def to_waveform(mel, vocoder, denoiser):
     return audio.cpu().squeeze()


-def save_to_folder(filename: str, output: dict, folder: str):
-    folder = Path(folder)
-    folder.mkdir(exist_ok=True, parents=True)
-    np.save(folder / f"{filename}", output["mel"].cpu().numpy())
-    sf.write(folder / f"{filename}.wav", output["waveform"], 22050, "PCM_24")
-
-
 def process_text(text: str, tokenizer):
     x = tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
     x = torch.tensor(x, dtype=torch.long)

@@ -120,7 +125,6 @@ def synthesise(
         spks=spks,
         length_scale=length_scale,
     )
-    print("output.shape", list(output.keys()), output["mel"].shape)
     # merge everything to one dict
     output.update({"start_t": start_t, **text_processed})
     return output

@@ -163,16 +167,6 @@ def main():
     vocoder = load_vocoder(params.vocoder)
     denoiser = Denoiser(vocoder, mode="zeros")

-    texts = [
-        "The Secret Service believed that it was very doubtful that any "
-        "President would ride regularly in a vehicle with a fixed top, even "
-        "though transparent.",
-        "Today as always, men fall into two groups: slaves and free men. "
-        "Whoever does not have two-thirds of his day for himself, is a slave, "
-        "whatever he may be: a statesman, a businessman, an official, or a "
-        "scholar.",
-    ]
-
     # Number of ODE Solver steps
     n_timesteps = 2

@@ -182,47 +176,17 @@ def main():
     # Sampling temperature
     temperature = 0.667

-    rtfs = []
-    rtfs_w = []
-    for i, text in enumerate(tqdm(texts)):
-        output = synthesise(
-            model=model,
-            tokenizer=tokenizer,
-            n_timesteps=n_timesteps,
-            text=text,
-            length_scale=length_scale,
-            temperature=temperature,
-        )  # , torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0))
-        output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
-
-        # Compute Real Time Factor (RTF) with HiFi-GAN
-        t = (dt.datetime.now() - output["start_t"]).total_seconds()
-        rtf_w = t * 22050 / (output["waveform"].shape[-1])
-
-        # Pretty print
-        print(f"{'*' * 53}")
-        print(f"Input text - {i}")
-        print(f"{'-' * 53}")
-        print(output["x_orig"])
-        print(f"{'*' * 53}")
-        print(f"Phonetised text - {i}")
-        print(f"{'-' * 53}")
-        print(output["x"])
-        print(f"{'*' * 53}")
-        print(f"RTF:\t\t{output['rtf']:.6f}")
-        print(f"RTF Waveform:\t{rtf_w:.6f}")
-        rtfs.append(output["rtf"])
-        rtfs_w.append(rtf_w)
-
-        # Save the generated waveform
-        save_to_folder(i, output, folder=f"./my-output-{params.epoch}")
-
-    print(f"Number of ODE steps: {n_timesteps}")
-    print(f"Mean RTF:\t\t\t\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}")
-    print(
-        "Mean RTF Waveform "
-        f"(incl. vocoder):\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}"
-    )
+    output = synthesise(
+        model=model,
+        tokenizer=tokenizer,
+        n_timesteps=n_timesteps,
+        text=params.input_text,
+        length_scale=length_scale,
+        temperature=temperature,
+    )
+    output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
+
+    sf.write(params.output_wav, output["waveform"], 22050, "PCM_16")


 if __name__ == "__main__":

@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import argparse
 import datetime as dt
 import logging

@@ -9,6 +10,49 @@ from inference import load_vocoder
 from tokenizer import Tokenizer


+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--acoustic-model",
+        type=str,
+        required=True,
+        help="Path to the acoustic model",
+    )
+
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        required=True,
+        help="Path to the tokens.txt",
+    )
+
+    parser.add_argument(
+        "--vocoder",
+        type=str,
+        required=True,
+        help="Path to the vocoder",
+    )
+
+    parser.add_argument(
+        "--input-text",
+        type=str,
+        required=True,
+        help="The text to generate speech for",
+    )
+
+    parser.add_argument(
+        "--output-wav",
+        type=str,
+        required=True,
+        help="The filename of the wave to save the generated speech",
+    )
+
+    return parser
+
+
 class OnnxHifiGANModel:
     def __init__(
         self,

@@ -98,10 +142,12 @@ class OnnxModel:

 @torch.no_grad()
 def main():
-    model = OnnxModel("./model-steps-6.onnx")
-    vocoder = OnnxHifiGANModel("./hifigan_v1.onnx")
-    text = "Today as always, men fall into two groups: slaves and free men."
-    text += "hello, how are you doing?"
+    params = get_parser().parse_args()
+    logging.info(vars(params))
+
+    model = OnnxModel(params.acoustic_model)
+    vocoder = OnnxHifiGANModel(params.vocoder)
+    text = params.input_text
     x = model.tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
     x = torch.tensor(x, dtype=torch.int64)

@@ -109,9 +155,6 @@ def main():
     mel = model(x)
     end_t = dt.datetime.now()

-    for i in range(3):
-        audio = vocoder(mel)
-
     start_t2 = dt.datetime.now()
     audio = vocoder(mel)
     end_t2 = dt.datetime.now()

@@ -121,13 +164,14 @@ def main():

     t = (end_t - start_t).total_seconds()
     t2 = (end_t2 - start_t2).total_seconds()
-    rtf = t * 22050 / audio.shape[-1]
-    rtf2 = t2 * 22050 / audio.shape[-1]
-    print("RTF", rtf)
-    print("RTF", rtf2)
+    rtf_am = t * 22050 / audio.shape[-1]
+    rtf_vocoder = t2 * 22050 / audio.shape[-1]
+    print("RTF for acoustic model ", rtf_am)
+    print("RTF for vocoder", rtf_vocoder)

     # skip denoiser
-    sf.write("onnx2.wav", audio, 22050, "PCM_16")
+    sf.write(params.output_wav, audio, 22050, "PCM_16")
+    logging.info(f"Saved to {params.output_wav}")


 if __name__ == "__main__":

@@ -34,10 +34,10 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
   log "monotonic_align lib for vits already built"
 fi

-if [ ! -f ./matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then
-  pushd matcha/utils/monotonic_align
+if [ ! -f ./matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then
+  pushd matcha/monotonic_align
   python3 setup.py build_ext --inplace
-  mv -v matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./
+  mv -v matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./
   rm -rf matcha
   rm -rf build
   rm core.c

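The `mv` in the hunk above is needed because `setup.py build_ext --inplace` appears to write the compiled extension into a nested `matcha/monotonic_align/` directory relative to the build location. A hypothetical manual equivalent, run from `egs/ljspeech/TTS` (the `cpython-38` suffix depends on the Python version):

```bash
# Build the monotonic_align extension by hand, mirroring the script above
pushd matcha/monotonic_align
python3 setup.py build_ext --inplace
mv -v matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./
rm -rf matcha build core.c
popd
```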