From 14a28edab6e032e67caa1770da149aa0a72ef083 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Mon, 28 Oct 2024 22:49:14 +0800
Subject: [PATCH] Update README

---
 .github/scripts/ljspeech/TTS/run-matcha.sh |   0
 egs/ljspeech/TTS/.gitignore                |   3 +
 egs/ljspeech/TTS/README.md                 | 114 +++++++++++++++++++++
 egs/ljspeech/TTS/matcha/inference.py       |  84 +++++----------
 egs/ljspeech/TTS/matcha/onnx_pretrained.py |  68 +++++++++---
 egs/ljspeech/TTS/prepare.sh                |   6 +-
 6 files changed, 200 insertions(+), 75 deletions(-)
 create mode 100755 .github/scripts/ljspeech/TTS/run-matcha.sh

diff --git a/.github/scripts/ljspeech/TTS/run-matcha.sh b/.github/scripts/ljspeech/TTS/run-matcha.sh
new file mode 100755
index 000000000..e69de29bb
diff --git a/egs/ljspeech/TTS/.gitignore b/egs/ljspeech/TTS/.gitignore
index 1eef06a28..d5c19797a 100644
--- a/egs/ljspeech/TTS/.gitignore
+++ b/egs/ljspeech/TTS/.gitignore
@@ -2,3 +2,6 @@ build
 core.c
 *.so
 my-output*
+*.wav
+*.onnx
+generator_v*
diff --git a/egs/ljspeech/TTS/README.md b/egs/ljspeech/TTS/README.md
index 7b112c12c..fe613024a 100644
--- a/egs/ljspeech/TTS/README.md
+++ b/egs/ljspeech/TTS/README.md
@@ -101,3 +101,117 @@ export CUDA_VISIBLE_DEVICES=4,5,6,7
 # (Note it is killed after `epoch-820.pt`)
 ```
+# matcha
+
+[./matcha](./matcha) contains the code for training [Matcha-TTS](https://github.com/shivammehta25/Matcha-TTS).
+
+This recipe provides a Matcha-TTS model trained on the LJSpeech dataset.
+
+The pretrained model can be found [here](https://huggingface.co/csukuangfj/icefall-tts-ljspeech-matcha-en-2024-10-28).
+
+The training command is given below:
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+
+python3 ./matcha/train.py \
+  --exp-dir ./matcha/exp-new-3/ \
+  --num-workers 4 \
+  --world-size 4 \
+  --num-epochs 4000 \
+  --max-duration 1000 \
+  --bucketing-sampler 1 \
+  --start-epoch 1
+```
+
+To run inference, use:
+
+```bash
+# Download the HiFi-GAN vocoder. We use v1 below; you can select from v1, v2, or v3.
+
+wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1
+
+./matcha/inference.py \
+  --exp-dir ./matcha/exp-new-3 \
+  --epoch 4000 \
+  --tokens ./data/tokens.txt \
+  --vocoder ./generator_v1 \
+  --input-text "how are you doing?" \
+  --output-wav ./generated.wav
+```
+
+```bash
+soxi ./generated.wav
+```
+prints:
+```
+Input File     : './generated.wav'
+Channels       : 1
+Sample Rate    : 22050
+Precision      : 16-bit
+Duration       : 00:00:01.29 = 28416 samples ~ 96.6531 CDDA sectors
+File Size      : 56.9k
+Bit Rate       : 353k
+Sample Encoding: 16-bit Signed Integer PCM
+```
+
+To export the checkpoint to onnx, run:
+
+```bash
+# Export the acoustic model to onnx
+
+./matcha/export_onnx.py \
+  --exp-dir ./matcha/exp-new-3 \
+  --epoch 4000 \
+  --tokens ./data/tokens.txt
+```
+
+The above command generates the following files:
+
+ - model-steps-2.onnx
+ - model-steps-3.onnx
+ - model-steps-4.onnx
+ - model-steps-5.onnx
+ - model-steps-6.onnx
+
+where the 2 in `model-steps-2.onnx` means it uses 2 steps for the ODE solver.
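+
+More ODE steps generally trade extra inference time for (usually) slightly
+better mel quality, so pick the file that matches your latency budget.
+
+If you want to double-check what an exported file expects before wiring it
+into the rest of the pipeline, one option is to inspect it with
+`onnxruntime`. The snippet below is only a minimal sketch: it assumes
+`onnxruntime` is installed and that `model-steps-2.onnx` is in the current
+directory, and the metadata it prints depends on what `export_onnx.py`
+actually stores.
+
+```python
+import onnxruntime as ort
+
+# Load the exported acoustic model on CPU.
+session = ort.InferenceSession(
+    "model-steps-2.onnx", providers=["CPUExecutionProvider"]
+)
+
+# Model-level metadata written at export time (may be empty).
+print(session.get_modelmeta().custom_metadata_map)
+
+# Names, shapes, and types of the model inputs and outputs.
+for i in session.get_inputs():
+    print("input :", i.name, i.shape, i.type)
+for o in session.get_outputs():
+    print("output:", o.name, o.shape, o.type)
+```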
+ + +To export the Hifigan vocoder to onnx, please use: + +```bash +wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v1 +wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v2 +wget https://github.com/csukuangfj/models/raw/refs/heads/master/hifigan/generator_v3 + +python3 ./matcha/export_onnx_hifigan.py +``` + +The above command generates 3 files: + + - hifigan_v1.onnx + - hifigan_v2.onnx + - hifigan_v3.onnx + +To use the generated onnx files to generate speech from text, please run: + +```bash +python3 ./matcha/onnx_pretrained.py \ + --acoustic-model ./model-steps-6.onnx \ + --vocoder ./hifigan_v2.onnx \ + --tokens ./data/tokens.txt \ + --input-text "how are you doing?" \ + --output-wav ./generated-2.wav +``` + +```bash +soxi ./generated-2.wav + +Input File : './generated-2.wav' +Channels : 1 +Sample Rate : 22050 +Precision : 16-bit +Duration : 00:00:01.25 = 27648 samples ~ 94.0408 CDDA sectors +File Size : 55.3k +Bit Rate : 353k +Sample Encoding: 16-bit Signed Integer PCM +``` diff --git a/egs/ljspeech/TTS/matcha/inference.py b/egs/ljspeech/TTS/matcha/inference.py index 8fc0ec3ac..1189160f6 100755 --- a/egs/ljspeech/TTS/matcha/inference.py +++ b/egs/ljspeech/TTS/matcha/inference.py @@ -6,14 +6,12 @@ import json import logging from pathlib import Path -import numpy as np import soundfile as sf import torch from matcha.hifigan.config import v1, v2, v3 from matcha.hifigan.denoiser import Denoiser from matcha.hifigan.models import Generator as HiFiGAN from tokenizer import Tokenizer -from tqdm.auto import tqdm from train import get_model, get_params from icefall.checkpoint import load_checkpoint @@ -64,6 +62,20 @@ def get_parser(): help="""Path to vocabulary.""", ) + parser.add_argument( + "--input-text", + type=str, + required=True, + help="The text to generate speech for", + ) + + parser.add_argument( + "--output-wav", + type=str, + required=True, + help="The filename of the wave to save the generated speech", + ) + return parser @@ -93,13 +105,6 @@ def to_waveform(mel, vocoder, denoiser): return audio.cpu().squeeze() -def save_to_folder(filename: str, output: dict, folder: str): - folder = Path(folder) - folder.mkdir(exist_ok=True, parents=True) - np.save(folder / f"{filename}", output["mel"].cpu().numpy()) - sf.write(folder / f"{filename}.wav", output["waveform"], 22050, "PCM_24") - - def process_text(text: str, tokenizer): x = tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True) x = torch.tensor(x, dtype=torch.long) @@ -120,7 +125,6 @@ def synthesise( spks=spks, length_scale=length_scale, ) - print("output.shape", list(output.keys()), output["mel"].shape) # merge everything to one dict output.update({"start_t": start_t, **text_processed}) return output @@ -163,16 +167,6 @@ def main(): vocoder = load_vocoder(params.vocoder) denoiser = Denoiser(vocoder, mode="zeros") - texts = [ - "The Secret Service believed that it was very doubtful that any " - "President would ride regularly in a vehicle with a fixed top, even " - "though transparent.", - "Today as always, men fall into two groups: slaves and free men. 
" - "Whoever does not have two-thirds of his day for himself, is a slave, " - "whatever he may be: a statesman, a businessman, an official, or a " - "scholar.", - ] - # Number of ODE Solver steps n_timesteps = 2 @@ -182,47 +176,17 @@ def main(): # Sampling temperature temperature = 0.667 - rtfs = [] - rtfs_w = [] - for i, text in enumerate(tqdm(texts)): - output = synthesise( - model=model, - tokenizer=tokenizer, - n_timesteps=n_timesteps, - text=text, - length_scale=length_scale, - temperature=temperature, - ) # , torch.tensor([15], device=device, dtype=torch.long).unsqueeze(0)) - output["waveform"] = to_waveform(output["mel"], vocoder, denoiser) - - # Compute Real Time Factor (RTF) with HiFi-GAN - t = (dt.datetime.now() - output["start_t"]).total_seconds() - rtf_w = t * 22050 / (output["waveform"].shape[-1]) - - # Pretty print - print(f"{'*' * 53}") - print(f"Input text - {i}") - print(f"{'-' * 53}") - print(output["x_orig"]) - print(f"{'*' * 53}") - print(f"Phonetised text - {i}") - print(f"{'-' * 53}") - print(output["x"]) - print(f"{'*' * 53}") - print(f"RTF:\t\t{output['rtf']:.6f}") - print(f"RTF Waveform:\t{rtf_w:.6f}") - rtfs.append(output["rtf"]) - rtfs_w.append(rtf_w) - - # Save the generated waveform - save_to_folder(i, output, folder=f"./my-output-{params.epoch}") - - print(f"Number of ODE steps: {n_timesteps}") - print(f"Mean RTF:\t\t\t\t{np.mean(rtfs):.6f} ± {np.std(rtfs):.6f}") - print( - "Mean RTF Waveform " - f"(incl. vocoder):\t{np.mean(rtfs_w):.6f} ± {np.std(rtfs_w):.6f}" + output = synthesise( + model=model, + tokenizer=tokenizer, + n_timesteps=n_timesteps, + text=params.input_text, + length_scale=length_scale, + temperature=temperature, ) + output["waveform"] = to_waveform(output["mel"], vocoder, denoiser) + + sf.write(params.output_wav, output["waveform"], 22050, "PCM_16") if __name__ == "__main__": diff --git a/egs/ljspeech/TTS/matcha/onnx_pretrained.py b/egs/ljspeech/TTS/matcha/onnx_pretrained.py index 3953d5d0a..6a37f3c17 100755 --- a/egs/ljspeech/TTS/matcha/onnx_pretrained.py +++ b/egs/ljspeech/TTS/matcha/onnx_pretrained.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import argparse import datetime as dt import logging @@ -9,6 +10,49 @@ from inference import load_vocoder from tokenizer import Tokenizer +def get_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument( + "--acoustic-model", + type=str, + required=True, + help="Path to the acoustic model", + ) + + parser.add_argument( + "--tokens", + type=str, + required=True, + help="Path to the tokens.txt", + ) + + parser.add_argument( + "--vocoder", + type=str, + required=True, + help="Path to the vocoder", + ) + + parser.add_argument( + "--input-text", + type=str, + required=True, + help="The text to generate speech for", + ) + + parser.add_argument( + "--output-wav", + type=str, + required=True, + help="The filename of the wave to save the generated speech", + ) + + return parser + + class OnnxHifiGANModel: def __init__( self, @@ -98,10 +142,12 @@ class OnnxModel: @torch.no_grad() def main(): - model = OnnxModel("./model-steps-6.onnx") - vocoder = OnnxHifiGANModel("./hifigan_v1.onnx") - text = "Today as always, men fall into two groups: slaves and free men." - text += "hello, how are you doing?" 
+ params = get_parser().parse_args() + logging.info(vars(params)) + + model = OnnxModel(params.acoustic_model) + vocoder = OnnxHifiGANModel(params.vocoder) + text = params.input_text x = model.tokenizer.texts_to_token_ids([text], add_sos=True, add_eos=True) x = torch.tensor(x, dtype=torch.int64) @@ -109,9 +155,6 @@ def main(): mel = model(x) end_t = dt.datetime.now() - for i in range(3): - audio = vocoder(mel) - start_t2 = dt.datetime.now() audio = vocoder(mel) end_t2 = dt.datetime.now() @@ -121,13 +164,14 @@ def main(): t = (end_t - start_t).total_seconds() t2 = (end_t2 - start_t2).total_seconds() - rtf = t * 22050 / audio.shape[-1] - rtf2 = t2 * 22050 / audio.shape[-1] - print("RTF", rtf) - print("RTF", rtf2) + rtf_am = t * 22050 / audio.shape[-1] + rtf_vocoder = t2 * 22050 / audio.shape[-1] + print("RTF for acoustic model ", rtf_am) + print("RTF for vocoder", rtf_vocoder) # skip denoiser - sf.write("onnx2.wav", audio, 22050, "PCM_16") + sf.write(params.output_wav, audio, 22050, "PCM_16") + logging.info(f"Saved to {params.output_wav}") if __name__ == "__main__": diff --git a/egs/ljspeech/TTS/prepare.sh b/egs/ljspeech/TTS/prepare.sh index b140e6f01..dfc2b3540 100755 --- a/egs/ljspeech/TTS/prepare.sh +++ b/egs/ljspeech/TTS/prepare.sh @@ -34,10 +34,10 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then log "monotonic_align lib for vits already built" fi - if [ ! -f ./matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then - pushd matcha/utils/monotonic_align + if [ ! -f ./matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ]; then + pushd matcha/monotonic_align python3 setup.py build_ext --inplace - mv -v matcha/utils/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./ + mv -v matcha/monotonic_align/core.cpython-38-x86_64-linux-gnu.so ./ rm -rf matcha rm -rf build rm core.c