mirror of https://github.com/k2-fsa/icefall.git
synced 2025-08-27 02:34:21 +00:00

Update README

This commit is contained in:
parent 8cb1cda040
commit 14a28edab6
0  .github/scripts/ljspeech/TTS/run-matcha.sh  (vendored, executable file)
3  egs/ljspeech/TTS/.gitignore  (vendored)
@@ -2,3 +2,6 @@ build
 core.c
 *.so
 my-output*
+*.wav
+*.onnx
+generator_v*
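
These new ignore patterns match the artifacts produced by the workflow documented below: generated `*.wav` files, exported `*.onnx` models, and the downloaded `generator_v*` vocoder checkpoints.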
@@ -101,3 +101,117 @@ export CUDA_VISIBLE_DEVICES=4,5,6,7
 # (Note it is killed after `epoch-820.pt`)
 ```
+
+# matcha
+
+[./matcha](./matcha) contains the code for training [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS).
+
+This recipe provides a Matcha-TTS model trained on the LJSpeech dataset.
+
+A pretrained model can be found [here](https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28).
+
+The training command is given below:
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 ./matcha/train.py \
+  --exp-dir ./matcha/exp-new-3/ \
+  --num-workers 4 \
+  --world-size 4 \
+  --num-epochs 4000 \
+  --max-duration 1000 \
+  --bucketing-sampler 1 \
+  --start-epoch 1
+```
+
+To run inference, use:
+
+```bash
+# Download the HiFiGAN vocoder. We use HiFiGAN v1 below. You can select from v1, v2, or v3.
+
+wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
+
+./matcha/inference.py \
+  --exp-dir ./matcha/exp-new-3 \
+  --epoch 4000 \
+  --tokens ./data/tokens.txt \
+  --vocoder ./generator_v1 \
+  --input-text "how are you doing?" \
+  --output-wav ./generated.wav
+```
+
+```bash
+soxi ./generated.wav
+```
+prints:
+```
+Input File     : './generated.wav'
+Channels       : 1
+Sample Rate    : 22050
+Precision      : 16-bit
+Duration       : 00:00:01.29 = 28416 samples ~ 96.6531 CDDA sectors
+File Size      : 56.9k
+Bit Rate       : 353k
+Sample Encoding: 16-bit Signed Integer PCM
+```
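
The same check can be done programmatically; below is a minimal sketch using the `soundfile` package (already a dependency of this recipe), assuming `./generated.wav` was produced by the command above:

```python
# A minimal sketch: verify the generated wave with soundfile instead of soxi.
import soundfile as sf

audio, sample_rate = sf.read("./generated.wav")
print("sample rate :", sample_rate)                   # expected: 22050
print("num samples :", audio.shape[0])                # e.g. 28416
print("duration (s):", audio.shape[0] / sample_rate)  # e.g. ~1.29
```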
+
+To export the checkpoint to ONNX:
+
+```bash
+# export the acoustic model to onnx
+
+./matcha/export_onnx.py \
+  --exp-dir ./matcha/exp-new-3 \
+  --epoch 4000 \
+  --tokens ./data/tokens.txt
+```
+
+The above command generates the following files:
+
+- model-steps-2.onnx
+- model-steps-3.onnx
+- model-steps-4.onnx
+- model-steps-5.onnx
+- model-steps-6.onnx
+
+where the 2 in `model-steps-2.onnx` means it uses 2 steps for the ODE solver.
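
More ODE-solver steps generally mean slower but typically higher-quality synthesis, so the exported variants trade speed for quality. As a quick sanity check, a minimal sketch like the one below (assuming `onnxruntime` is installed and it is run from the directory containing the exported files; the exact input/output names depend on what `export_onnx.py` writes) can inspect a model's signature:

```python
# A minimal sketch, assuming `onnxruntime` is installed. Prints the I/O
# signature and any custom metadata saved by the export script.
import onnxruntime as ort

session = ort.InferenceSession("./model-steps-2.onnx")
for node in session.get_inputs():
    print("input :", node.name, node.shape, node.type)
for node in session.get_outputs():
    print("output:", node.name, node.shape, node.type)
print("metadata:", session.get_modelmeta().custom_metadata_map)
```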
+
+To export the HiFiGAN vocoder to ONNX, please use:
+
+```bash
+wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
+wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
+wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v3
+
+python3 ./matcha/export_onnx_hifigan.py
+```
+
+The above command generates 3 files:
+
+- hifigan_v1.onnx
+- hifigan_v2.onnx
+- hifigan_v3.onnx
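
As with the acoustic models, the exported vocoders can be sanity-checked before use. A minimal sketch (file names taken from the list above; `onnxruntime` assumed installed):

```python
# A minimal sketch: confirm each exported vocoder loads and report its size.
import os
import onnxruntime as ort

for name in ["hifigan_v1.onnx", "hifigan_v2.onnx", "hifigan_v3.onnx"]:
    session = ort.InferenceSession(name)
    inputs = [i.name for i in session.get_inputs()]
    print(f"{name}: {os.path.getsize(name) / 1e6:.1f} MB, inputs={inputs}")
```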
+
+To use the generated ONNX files to generate speech from text, please run:
+
+```bash
+python3 ./matcha/onnx_pretrained.py \
+  --acoustic-model ./model-steps-6.onnx \
+  --vocoder ./hifigan_v2.onnx \
+  --tokens ./data/tokens.txt \
+  --input-text "how are you doing?" \
+  --output-wav ./generated-2.wav
+```
+
+```bash
+soxi ./generated-2.wav
+```
+prints:
+```
+Input File     : './generated-2.wav'
+Channels       : 1
+Sample Rate    : 22050
+Precision      : 16-bit
+Duration       : 00:00:01.25 = 27648 samples ~ 94.0408 CDDA sectors
+File Size      : 55.3k
+Bit Rate       : 353k
+Sample Encoding: 16-bit Signed Integer PCM
+```
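
The remaining hunks wire the documented commands into the code: `./matcha/inference.py` and `./matcha/onnx_pretrained.py` gain `--input-text` and `--output-wav` options (plus `--acoustic-model`, `--tokens`, and `--vocoder` for the ONNX runner), while hard-coded test sentences and debug printing are removed.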
@@ -6,14 +6,12 @@ import json
 import logging
 from pathlib import Path
 
-import numpy as np
 import soundfile as sf
 import torch
 from matcha.hifigan.config import v1, v2, v3
 from matcha.hifigan.denoiser import Denoiser
 from matcha.hifigan.models import Generator as HiFiGAN
 from tokenizer import Tokenizer
-from tqdm.auto import tqdm
 from train import get_model, get_params
 
 from icefall.checkpoint import load_checkpoint
@@ -64,6 +62,20 @@ def get_parser():
         help="""Path to vocabulary.""",
     )
 
+    parser.add_argument(
+        "--input-text",
+        type=str,
+        required=True,
+        help="The text to generate speech for",
+    )
+
+    parser.add_argument(
+        "--output-wav",
+        type=str,
+        required=True,
+        help="The filename of the wave to save the generated speech",
+    )
+
     return parser
 
 
@@ -93,13 +105,6 @@ def to_waveform(mel, vocoder, denoiser):
     return audio.cpu().squeeze()
 
 
-def save_to_folder(filename: str, output: dict, folder: str):
-    folder = Path(folder)
-    folder.mkdir(exist_ok=True, parents=True)
-    np.save(folder / f"{filename}", output["mel"].cpu().numpy())
-    sf.write(folder / f"{filename}.wav", output["waveform"], 22050, "PCM_24")
-
-
 def process_text(text: str, tokenizer):
     x = tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
     x = torch.tensor(x, dtype=torch.long)
@@ -120,7 +125,6 @@ def synthesise(
         spks=spks,
         length_scale=length_scale,
     )
-    print("output.shape", list(output.keys()), output["mel"].shape)
     # merge everything to one dict
     output.update({"start_t": start_t, **text_processed})
     return output
@@ -163,16 +167,6 @@ def main():
     vocoder = load_vocoder(params.vocoder)
     denoiser = Denoiser(vocoder, mode="zeros")
 
-    texts = [
-        "The Secret Service believed that it was very doubtful that any "
-        "President would ride regularly in a vehicle with a fixed top, even "
-        "though transparent.",
-        "Today as always, men fall into two groups: slaves and free men. "
-        "Whoever does not have two-thirds of his day for himself, is a slave, "
-        "whatever he may be: a statesman, a businessman, an official, or a "
-        "scholar.",
-    ]
-
     # Number of ODE Solver steps
     n_timesteps = 2
 
@@ -182,47 +176,17 @@ def main():
     # Sampling temperature
     temperature = 0.667
 
-    rtfs = []
-    rtfs_w = []
-    for i, text in enumerate(tqdm(texts)):
-        output = synthesise(
-            model=model,
-            tokenizer=tokenizer,
-            n_timesteps=n_timesteps,
-            text=text,
-            length_scale=length_scale,
-            temperature=temperature,
-        ) # , torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0))
-        output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
-
-        # Compute Real Time Factor (RTF) with HiFi-GAN
-        t = (dt.datetime.now() - output["start_t"]).total_seconds()
-        rtf_w = t * 22050 / (output["waveform"].shape[-1])
-
-        # Pretty print
-        print(f"{'*' * 53}")
-        print(f"Input text - {i}")
-        print(f"{'-' * 53}")
-        print(output["x_orig"])
-        print(f"{'*' * 53}")
-        print(f"Phonetised text - {i}")
-        print(f"{'-' * 53}")
-        print(output["x"])
-        print(f"{'*' * 53}")
-        print(f"RTF:\t\t{output['rtf']:.6f}")
-        print(f"RTF Waveform:\t{rtf_w:.6f}")
-        rtfs.append(output["rtf"])
-        rtfs_w.append(rtf_w)
-
-        # Save the generated waveform
-        save_to_folder(i, output, folder=f"./my-output-{params.epoch}")
-
-    print(f"Number of ODE steps: {n_timesteps}")
-    print(f"Mean RTF:\t\t\t\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}")
-    print(
-        "Mean RTF Waveform "
-        f"(incl. vocoder):\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}"
-    )
+    output = synthesise(
+        model=model,
+        tokenizer=tokenizer,
+        n_timesteps=n_timesteps,
+        text=params.input_text,
+        length_scale=length_scale,
+        temperature=temperature,
+    )
+    output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
+
+    sf.write(params.output_wav, output["waveform"], 22050, "PCM_16")
 
 
 if __name__ == "__main__":
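
With these changes, `inference.py` synthesises a single utterance taken from `--input-text` and writes it to `--output-wav`, instead of looping over hard-coded sentences and dumping per-utterance RTF statistics.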
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import argparse
 import datetime as dt
 import logging
 
@@ -9,6 +10,49 @@ from inference import load_vocoder
 from tokenizer import Tokenizer
 
 
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--acoustic-model",
+        type=str,
+        required=True,
+        help="Path to the acoustic model",
+    )
+
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        required=True,
+        help="Path to the tokens.txt",
+    )
+
+    parser.add_argument(
+        "--vocoder",
+        type=str,
+        required=True,
+        help="Path to the vocoder",
+    )
+
+    parser.add_argument(
+        "--input-text",
+        type=str,
+        required=True,
+        help="The text to generate speech for",
+    )
+
+    parser.add_argument(
+        "--output-wav",
+        type=str,
+        required=True,
+        help="The filename of the wave to save the generated speech",
+    )
+
+    return parser
+
+
 class OnnxHifiGANModel:
     def __init__(
         self,
@@ -98,10 +142,12 @@ class OnnxModel:
 
 @torch.no_grad()
 def main():
-    model = OnnxModel("./model-steps-6.onnx")
-    vocoder = OnnxHifiGANModel("./hifigan_v1.onnx")
-    text = "Today as always, men fall into two groups: slaves and free men."
-    text += "hello, how are you doing?"
+    params = get_parser().parse_args()
+    logging.info(vars(params))
+
+    model = OnnxModel(params.acoustic_model)
+    vocoder = OnnxHifiGANModel(params.vocoder)
+    text = params.input_text
     x = model.tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
     x = torch.tensor(x, dtype=torch.int64)
 
@@ -109,9 +155,6 @@ def main():
     mel = model(x)
     end_t = dt.datetime.now()
 
-    for i in range(3):
-        audio = vocoder(mel)
-
     start_t2 = dt.datetime.now()
     audio = vocoder(mel)
     end_t2 = dt.datetime.now()
@@ -121,13 +164,14 @@ def main():
 
     t = (end_t - start_t).total_seconds()
     t2 = (end_t2 - start_t2).total_seconds()
-    rtf = t * 22050 / audio.shape[-1]
-    rtf2 = t2 * 22050 / audio.shape[-1]
-    print("RTF", rtf)
-    print("RTF", rtf2)
+    rtf_am = t * 22050 / audio.shape[-1]
+    rtf_vocoder = t2 * 22050 / audio.shape[-1]
+    print("RTF for acoustic model ", rtf_am)
+    print("RTF for vocoder", rtf_vocoder)
 
     # skip denoiser
-    sf.write("onnx2.wav", audio, 22050, "PCM_16")
+    sf.write(params.output_wav, audio, 22050, "PCM_16")
+    logging.info(f"Saved to {params.output_wav}")
 
 
 if __name__ == "__main__":
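
The renamed variables make the real-time-factor (RTF) report explicit: `t * 22050 / audio.shape[-1]` is elapsed seconds divided by the duration of the generated audio (`num_samples / sample_rate`), so values below 1 mean faster-than-real-time synthesis. A hypothetical standalone helper for the same computation:

```python
# Hypothetical helper mirroring the RTF computation in the diff above:
# RTF = elapsed_seconds / (num_samples / sample_rate).
def real_time_factor(
    elapsed_seconds: float, num_samples: int, sample_rate: int = 22050
) -> float:
    return elapsed_seconds * sample_rate / num_samples

# e.g. 0.5 s of compute for 28416 samples (~1.29 s of audio) -> RTF ~ 0.39
print(real_time_factor(0.5, 28416))
```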
@@ -34,10 +34,10 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
     log "monotonic_align lib for vits already built"
   fi
 
-  if [ ! -f ./matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then
-    pushd matcha/utils/monotonic_align
+  if [ ! -f ./matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then
+    pushd matcha/monotonic_align
     python3 setup.py build_ext --inplace
-    mv -v matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./
+    mv -v matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./
     rm -rf matcha
     rm -rf build
     rm core.c