Update README

Fangjun Kuang 2024-10-28 22:49:14 +08:00
parent 8cb1cda040
commit 14a28edab6
6 changed files with 200 additions and 75 deletions

.github/scripts/ljspeech/TTS/run-matcha.sh vendored Executable file

@@ -2,3 +2,6 @@ build
core.c
*.so
my-output*
*.wav
*.onnx
generator_v*

@@ -101,3 +101,117 @@ export CUDA_VISIBLE_DEVICES=4,5,6,7
# (Note it is killed after `epoch-820.pt`)
```
# matcha
[./matcha](./matcha) contains the code for training [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS).
This recipe provides a Matcha-TTS model trained on the LJSpeech dataset.
A pretrained model can be found [here](https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28).
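If you prefer to fetch the pretrained model programmatically instead of cloning the Hugging Face repo, a minimal sketch using `huggingface_hub` (an extra dependency, not required by the recipe) could look like this:
```python
# Hypothetical convenience snippet: download the pretrained Matcha-TTS
# checkpoints from Hugging Face. Requires `pip install huggingface_hub`.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28"
)
print("Downloaded to", local_dir)
```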
The training command is given below:
```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3
python3 ./matcha/train.py \
--exp-dir ./matcha/exp-new-3/ \
--num-workers 4 \
--world-size 4 \
--num-epochs 4000 \
--max-duration 1000 \
--bucketing-sampler 1 \
--start-epoch 1
```
To run inference, use:
```bash
# Download the HiFi-GAN vocoder. We use HiFi-GAN v1 below; you can select from v1, v2, or v3.
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
./matcha/inference.py \
--exp-dir ./matcha/exp-new-3 \
--epoch 4000 \
--tokens ./data/tokens.txt \
--vocoder ./generator_v1 \
--input-text "how are you doing?"
--output-wav ./generated.wav
```
```bash
soxi ./generated.wav
```
prints:
```
Input File : './generated.wav'
Channels : 1
Sample Rate : 22050
Precision : 16-bit
Duration : 00:00:01.29 = 28416 samples ~ 96.6531 CDDA sectors
File Size : 56.9k
Bit Rate : 353k
Sample Encoding: 16-bit Signed Integer PCM
```
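If you prefer checking the generated file from Python, a small sketch using `soundfile` (already used by this recipe) reports the same information as `soxi`:
```python
# Minimal sanity check of the generated wave, analogous to the soxi output above.
import soundfile as sf

audio, sample_rate = sf.read("./generated.wav")
print("sample rate:", sample_rate)               # expected: 22050
print("num samples:", len(audio))
print("duration (s):", len(audio) / sample_rate)
```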
To export the checkpoint to onnx:
```bash
# export the acoustic model to onnx
./matcha/export_onnx.py \
--exp-dir ./matcha/exp-new-3 \
--epoch 4000 \
--tokens ./data/tokens.txt
```
The above command generates the following files:
- model-steps-2.onnx
- model-steps-3.onnx
- model-steps-4.onnx
- model-steps-5.onnx
- model-steps-6.onnx
where the `2` in `model-steps-2.onnx` means the model uses 2 steps for the ODE solver.
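More ODE steps generally give better quality at a higher computational cost. To double-check the exported models, a minimal sketch with `onnxruntime` (assumed to be installed) simply lists each model's inputs and outputs; the exact tensor names are whatever `export_onnx.py` produced:
```python
# Inspect the exported acoustic models. The input/output names printed here
# depend on export_onnx.py; this only verifies that the files load correctly.
import onnxruntime as ort

for num_steps in (2, 3, 4, 5, 6):
    filename = f"model-steps-{num_steps}.onnx"
    sess = ort.InferenceSession(filename, providers=["CPUExecutionProvider"])
    print(filename)
    print("  inputs :", [i.name for i in sess.get_inputs()])
    print("  outputs:", [o.name for o in sess.get_outputs()])
```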
To export the HiFi-GAN vocoder to onnx, please use:
```bash
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2
wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v3
python3 ./matcha/export_onnx_hifigan.py
```
The above command generates 3 files:
- hifigan_v1.onnx
- hifigan_v2.onnx
- hifigan_v3.onnx
To use the generated onnx files to generate speech from text, please run:
```bash
python3 ./matcha/onnx_pretrained.py \
--acoustic-model ./model-steps-6.onnx \
--vocoder ./hifigan_v2.onnx \
--tokens ./data/tokens.txt \
--input-text "how are you doing?" \
--output-wav ./generated-2.wav
```
```bash
soxi ./generated-2.wav
Input File : './generated-2.wav'
Channels : 1
Sample Rate : 22050
Precision : 16-bit
Duration : 00:00:01.25 = 27648 samples ~ 94.0408 CDDA sectors
File Size : 55.3k
Bit Rate : 353k
Sample Encoding: 16-bit Signed Integer PCM
```

@@ -6,14 +6,12 @@ import json
import logging
from pathlib import Path
import numpy as np
import soundfile as sf
import torch
from matcha.hifigan.config import v1, v2, v3
from matcha.hifigan.denoiser import Denoiser
from matcha.hifigan.models import Generator as HiFiGAN
from tokenizer import Tokenizer
from tqdm.auto import tqdm
from train import get_model, get_params
from icefall.checkpoint import load_checkpoint
@@ -64,6 +62,20 @@ def get_parser():
help="""Path to vocabulary.""",
)
parser.add_argument(
"--input-text",
type=str,
required=True,
help="The text to generate speech for",
)
parser.add_argument(
"--output-wav",
type=str,
required=True,
help="The filename of the wave to save the generated speech",
)
return parser
@@ -93,13 +105,6 @@ def to_waveform(mel, vocoder, denoiser):
return audio.cpu().squeeze()
def save_to_folder(filename: str, output: dict, folder: str):
folder = Path(folder)
folder.mkdir(exist_ok=True, parents=True)
np.save(folder / f"{filename}", output["mel"].cpu().numpy())
sf.write(folder / f"{filename}.wav", output["waveform"], 22050, "PCM_24")
def process_text(text: str, tokenizer):
x = tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
x = torch.tensor(x, dtype=torch.long)
@@ -120,7 +125,6 @@ def synthesise(
spks=spks,
length_scale=length_scale,
)
print("output.shape", list(output.keys()), output["mel"].shape)
# merge everything to one dict
output.update({"start_t": start_t, **text_processed})
return output
@@ -163,16 +167,6 @@ def main():
vocoder = load_vocoder(params.vocoder)
denoiser = Denoiser(vocoder, mode="zeros")
texts = [
"The Secret Service believed that it was very doubtful that any "
"President would ride regularly in a vehicle with a fixed top, even "
"though transparent.",
"Today as always, men fall into two groups: slaves and free men. "
"Whoever does not have two-thirds of his day for himself, is a slave, "
"whatever he may be: a statesman, a businessman, an official, or a "
"scholar.",
]
# Number of ODE Solver steps
n_timesteps = 2
@@ -182,47 +176,17 @@ def main():
# Sampling temperature
temperature = 0.667
rtfs = []
rtfs_w = []
for i, text in enumerate(tqdm(texts)):
output = synthesise(
model=model,
tokenizer=tokenizer,
n_timesteps=n_timesteps,
text=text,
length_scale=length_scale,
temperature=temperature,
) # , torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0))
output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
# Compute Real Time Factor (RTF) with HiFi-GAN
t = (dt.datetime.now() - output["start_t"]).total_seconds()
rtf_w = t * 22050 / (output["waveform"].shape[-1])
# Pretty print
print(f"{'*' * 53}")
print(f"Input text - {i}")
print(f"{'-' * 53}")
print(output["x_orig"])
print(f"{'*' * 53}")
print(f"Phonetised text - {i}")
print(f"{'-' * 53}")
print(output["x"])
print(f"{'*' * 53}")
print(f"RTF:\t\t{output['rtf']:.6f}")
print(f"RTF Waveform:\t{rtf_w:.6f}")
rtfs.append(output["rtf"])
rtfs_w.append(rtf_w)
# Save the generated waveform
save_to_folder(i, output, folder=f"./my-output-{params.epoch}")
print(f"Number of ODE steps: {n_timesteps}")
print(f"Mean RTF:\t\t\t\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}")
print(
"Mean RTF Waveform "
f"(incl. vocoder):\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}"
output = synthesise(
model=model,
tokenizer=tokenizer,
n_timesteps=n_timesteps,
text=params.input_text,
length_scale=length_scale,
temperature=temperature,
)
output["waveform"] = to_waveform(output["mel"], vocoder, denoiser)
sf.write(params.output_wav, output["waveform"], 22050, "PCM_16")
if __name__ == "__main__":

@@ -1,4 +1,5 @@
#!/usr/bin/env python3
import argparse
import datetime as dt
import logging
@@ -9,6 +10,49 @@ from inference import load_vocoder
from tokenizer import Tokenizer
def get_parser():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--acoustic-model",
type=str,
required=True,
help="Path to the acoustic model",
)
parser.add_argument(
"--tokens",
type=str,
required=True,
help="Path to the tokens.txt",
)
parser.add_argument(
"--vocoder",
type=str,
required=True,
help="Path to the vocoder",
)
parser.add_argument(
"--input-text",
type=str,
required=True,
help="The text to generate speech for",
)
parser.add_argument(
"--output-wav",
type=str,
required=True,
help="The filename of the wave to save the generated speech",
)
return parser
class OnnxHifiGANModel:
def __init__(
self,
@@ -98,10 +142,12 @@ class OnnxModel:
@torch.no_grad()
def main():
model = OnnxModel("./model-steps-6.onnx")
vocoder = OnnxHifiGANModel("./hifigan_v1.onnx")
text = "Today as always, men fall into two groups: slaves and free men."
text += "hello, how are you doing?"
params = get_parser().parse_args()
logging.info(vars(params))
model = OnnxModel(params.acoustic_model)
vocoder = OnnxHifiGANModel(params.vocoder)
text = params.input_text
x = model.tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True)
x = torch.tensor(x, dtype=torch.int64)
@@ -109,9 +155,6 @@ def main():
mel = model(x)
end_t = dt.datetime.now()
for i in range(3):
audio = vocoder(mel)
start_t2 = dt.datetime.now()
audio = vocoder(mel)
end_t2 = dt.datetime.now()
@@ -121,13 +164,14 @@ def main():
t = (end_t - start_t).total_seconds()
t2 = (end_t2 - start_t2).total_seconds()
rtf = t * 22050 / audio.shape[-1]
rtf2 = t2 * 22050 / audio.shape[-1]
print("RTF", rtf)
print("RTF", rtf2)
rtf_am = t * 22050 / audio.shape[-1]
rtf_vocoder = t2 * 22050 / audio.shape[-1]
print("RTF for acoustic model ", rtf_am)
print("RTF for vocoder", rtf_vocoder)
# skip denoiser
sf.write("onnx2.wav", audio, 22050, "PCM_16")
sf.write(params.output_wav, audio, 22050, "PCM_16")
logging.info(f"Saved to {params.output_wav}")
if __name__ == "__main__":

@@ -34,10 +34,10 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
log "monotonic_align lib for vits already built"
fi
if [ ! -f ./matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then
pushd matcha/utils/monotonic_align
if [ ! -f ./matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then
pushd matcha/monotonic_align
python3 setup.py build_ext --inplace
mv -v matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./
mv -v matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./
rm -rf matcha
rm -rf build
rm core.c