Use piper_phonemize as text tokenizer in vctk TTS recipe (#1522)

* to align with PR #1524
This commit is contained in:
zr_jin 2024-03-18 17:53:52 +08:00 committed by GitHub
parent 9b0eae3b4a
commit eec12f053d
9 changed files with 56 additions and 135 deletions

View File

@@ -10,7 +10,7 @@ The above information is from the [CSTR VCTK website](https://datashare.ed.ac.uk
This recipe provides a VITS model trained on the VCTK dataset.
- Pretrained model can be found [here](https://huggingface.co/zrjin/icefall-tts-vctk-vits-2023-12-05), note that this model was pretrained on the Edinburgh DataShare VCTK dataset.
+ Pretrained model can be found [here](https://huggingface.co/zrjin/icefall-tts-vctk-vits-2024-03-18), note that this model was pretrained on the Edinburgh DataShare VCTK dataset.
For tutorial and more details, please refer to the [VITS documentation](https://k2-fsa.github.io/icefall/recipes/TTS/vctk/vits.html).
@@ -21,7 +21,6 @@ export CUDA_VISIBLE_DEVICES="0,1,2,3"
--world-size 4 \
--num-epochs 1000 \
--start-epoch 1 \
--use-fp16 1 \
--exp-dir vits/exp \
- --tokens data/tokens.txt
--max-duration 350

View File

@@ -1,104 +0,0 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file reads the texts in given manifest and generates the file that maps tokens to IDs.
"""
import argparse
import logging
from pathlib import Path
from typing import Dict
from lhotse import load_manifest
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--manifest-file",
type=Path,
default=Path("data/spectrogram/vctk_cuts_all.jsonl.gz"),
help="Path to the manifest file",
)
parser.add_argument(
"--tokens",
type=Path,
default=Path("data/tokens.txt"),
help="Path to the tokens",
)
return parser.parse_args()
def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
"""Write a symbol to ID mapping to a file.
Note:
No need to implement `read_mapping` as it can be done
through :func:`k2.SymbolTable.from_file`.
Args:
filename:
Filename to save the mapping.
sym2id:
A dict mapping symbols to IDs.
Returns:
Return None.
"""
with open(filename, "w", encoding="utf-8") as f:
for sym, i in sym2id.items():
f.write(f"{sym} {i}\n")
def get_token2id(manifest_file: Path) -> Dict[str, int]:
"""Return a dict that maps token to IDs."""
extra_tokens = [
"<blk>", # 0 for blank
"<sos/eos>", # 1 for sos and eos symbols.
"<unk>", # 2 for OOV
]
all_tokens = set()
cut_set = load_manifest(manifest_file)
for cut in cut_set:
# Each cut only contain one supervision
assert len(cut.supervisions) == 1, len(cut.supervisions)
for t in cut.tokens:
all_tokens.add(t)
all_tokens = extra_tokens + list(all_tokens)
token2id: Dict[str, int] = {token: i for i, token in enumerate(all_tokens)}
return token2id
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
args = get_args()
manifest_file = Path(args.manifest_file)
out_file = Path(args.tokens)
token2id = get_token2id(manifest_file)
write_mapping(out_file, token2id)

View File

@@ -0,0 +1 @@
+ ../../../ljspeech/TTS/local/prepare_token_file.py
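The deleted VCTK-specific script above derived the token set from the training manifest; the new symlink reuses the ljspeech script, which instead writes tokens.txt from piper_phonemize's fixed espeak phoneme table. A minimal sketch of that approach, assuming the piper_phonemize get_espeak_map() API (an illustration, not a verbatim copy of the ljspeech file):

# Sketch only: build tokens.txt from piper_phonemize's fixed phoneme-ID map,
# so the token inventory no longer depends on the manifest.
from piper_phonemize import get_espeak_map

def write_tokens(filename: str) -> None:
    # get_espeak_map() returns {token: [token_id]}, with IDs fixed by piper_phonemize
    token2id = {tok: ids[0] for tok, ids in get_espeak_map().items()}
    with open(filename, "w", encoding="utf-8") as f:
        for tok, i in sorted(token2id.items(), key=lambda kv: kv[1]):
            f.write(f"{tok} {i}\n")

write_tokens("data/tokens.txt")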

View File

@@ -24,9 +24,9 @@ This file reads the texts in given manifest and save the new cuts with phoneme t
import logging
from pathlib import Path
- import g2p_en
import tacotron_cleaner.cleaners
from lhotse import CutSet, load_manifest
+ from piper_phonemize import phonemize_espeak
from tqdm.auto import tqdm
@@ -37,17 +37,20 @@ def prepare_tokens_vctk():
partition = "all"
cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
- g2p = g2p_en.G2p()
new_cuts = []
for cut in tqdm(cut_set):
# Each cut only contains one supervision
- assert len(cut.supervisions) == 1, len(cut.supervisions)
+ assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
text = cut.supervisions[0].text
# Text normalization
text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
# Convert to phonemes
- cut.tokens = g2p(text)
+ tokens_list = phonemize_espeak(text, "en-us")
+ tokens = []
+ for t in tokens_list:
+     tokens.extend(t)
+ cut.tokens = tokens
new_cuts.append(cut)
new_cut_set = CutSet.from_cuts(new_cuts)
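A note on the flattening loop added above: phonemize_espeak returns one phoneme list per sentence, so multi-sentence texts come back nested. A quick illustration, assuming piper_phonemize is installed (the phoneme symbols shown are indicative; actual output comes from espeak-ng):

from piper_phonemize import phonemize_espeak

tokens_list = phonemize_espeak("Hi. Bye.", "en-us")
# e.g. [['h', 'ˈ', 'a', 'ɪ', '.'], ['b', 'ˈ', 'a', 'ɪ', '.']] -- one list per sentence
tokens = [t for sentence in tokens_list for t in sentence]  # flattened, as in the loop above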

View File

@@ -78,6 +78,13 @@ fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Prepare phoneme tokens for VCTK"
+ # We assume you have installed piper_phonemize and espnet_tts_frontend.
+ # If not, please install them with:
+ # - piper_phonemize:
+ #   refer to https://github.com/rhasspy/piper-phonemize;
+ #   you can install pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
+ # - espnet_tts_frontend:
+ #   `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
if [ ! -e data/spectrogram/.vctk_with_token.done ]; then
./local/prepare_tokens_vctk.py
mv data/spectrogram/vctk_cuts_with_tokens_all.jsonl.gz \
@@ -111,14 +118,15 @@ fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Generate token file"
- # We assume you have installed g2p_en and espnet_tts_frontend.
+ # We assume you have installed piper_phonemize and espnet_tts_frontend.
# If not, please install them with:
- # - g2p_en: `pip install g2p_en`, refer to https://github.com/Kyubyong/g2p
- # - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
+ # - piper_phonemize:
+ #   refer to https://github.com/rhasspy/piper-phonemize;
+ #   you can install pre-built wheels from https://github.com/csukuangfj/piper-phonemize/releases/tag/2023.12.5
+ # - espnet_tts_frontend:
+ #   `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
if [ ! -e data/tokens.txt ]; then
- ./local/prepare_token_file.py \
-     --manifest-file data/spectrogram/vctk_cuts_train.jsonl.gz \
-     --tokens data/tokens.txt
+ ./local/prepare_token_file.py --tokens data/tokens.txt
fi
fi
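For reference, installation along the lines of the comments above might look like this; the wheel file name below is a placeholder, so substitute the real one matching your Python version and platform from the linked release page:

# Illustrative only -- pick the actual wheel from the release page linked above
wget https://github.com/csukuangfj/piper-phonemize/releases/download/2023.12.5/piper_phonemize-<version>-<platform>.whl
pip install ./piper_phonemize-*.whl
pip install espnet_tts_frontend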

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
#
- # Copyright 2023 Xiaomi Corporation (Author: Zengwei Yao)
+ # Copyright 2023-2024 Xiaomi Corporation (Author: Zengwei Yao,
+ #                                                 Zengrui Jin,)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
@@ -97,7 +98,7 @@ def add_meta_data(filename: str, meta_data: Dict[str, str]):
for key, value in meta_data.items():
meta = model.metadata_props.add()
meta.key = key
- meta.value = value
+ meta.value = str(value)
onnx.save(model, filename)
@@ -160,6 +161,7 @@ def export_model_onnx(
model: nn.Module,
model_filename: str,
vocab_size: int,
+ n_speakers: int,
opset_version: int = 11,
) -> None:
"""Export the given generator model to ONNX format.
@@ -212,10 +214,15 @@
)
meta_data = {
"model_type": "VITS",
"model_type": "vits",
"version": "1",
"model_author": "k2-fsa",
"comment": "VITS generator",
"comment": "icefall", # must be icefall for models from icefall
"language": "English",
"voice": "en-us", # Choose your language appropriately
"has_espeak": 1,
"n_speakers": n_speakers,
"sample_rate": 22050, # Must match the real sample rate
}
logging.info(f"meta_data: {meta_data}")
@@ -231,8 +238,7 @@ def main():
params.update(vars(args))
tokenizer = Tokenizer(params.tokens)
- params.blank_id = tokenizer.blank_id
- params.oov_id = tokenizer.oov_id
+ params.blank_id = tokenizer.pad_id
params.vocab_size = tokenizer.vocab_size
with open(args.speakers) as f:
@@ -265,6 +271,7 @@
model,
model_filename,
params.vocab_size,
+ params.num_spks,
opset_version=opset_version,
)
logging.info(f"Exported generator to {model_filename}")

View File

@@ -135,14 +135,16 @@ def infer_dataset(
batch_size = len(batch["tokens"])
tokens = batch["tokens"]
- tokens = tokenizer.tokens_to_token_ids(tokens)
+ tokens = tokenizer.tokens_to_token_ids(
+     tokens, intersperse_blank=True, add_sos=True, add_eos=True
+ )
tokens = k2.RaggedTensor(tokens)
row_splits = tokens.shape.row_splits(1)
tokens_lens = row_splits[1:] - row_splits[:-1]
tokens = tokens.to(device)
tokens_lens = tokens_lens.to(device)
# tensor of shape (B, T)
- tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id)
+ tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)
speakers = (
torch.Tensor([speaker_map[sid] for sid in batch["speakers"]])
.int()
@@ -214,8 +216,7 @@ def main():
device = torch.device("cuda", 0)
tokenizer = Tokenizer(params.tokens)
- params.blank_id = tokenizer.blank_id
- params.oov_id = tokenizer.oov_id
+ params.blank_id = tokenizer.pad_id
params.vocab_size = tokenizer.vocab_size
# we need cut ids to display recognition results.
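The new keyword arguments make the blank interspersing and sentence-boundary handling explicit at the call site. A sketch of the intended transformation, not icefall's Tokenizer implementation (the sos/eos IDs here are made up for illustration):

def to_ids(token_ids, blank_id=0, sos_id=1, eos_id=2):
    out = [blank_id]
    for t in token_ids:
        out += [t, blank_id]  # intersperse_blank: a blank between tokens and at both ends
    return [sos_id] + out + [eos_id]  # add_sos / add_eos

print(to_ids([5, 9]))  # [1, 0, 5, 0, 9, 0, 2]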

View File

@@ -1,6 +1,7 @@
#!/usr/bin/env python3
#
- # Copyright 2023 Xiaomi Corporation (Author: Zengwei Yao)
+ # Copyright 2023-2024 Xiaomi Corporation (Author: Zengwei Yao,
+ #                                                 Zengrui Jin,)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
@@ -122,7 +123,9 @@ def main():
model = OnnxModel(args.model_filename)
text = "I went there to see the land, the people and how their system works, end quote."
- tokens = tokenizer.texts_to_token_ids([text])
+ tokens = tokenizer.texts_to_token_ids(
+     [text], intersperse_blank=True, add_sos=True, add_eos=True
+ )
tokens = torch.tensor(tokens) # (1, T)
tokens_lens = torch.tensor([tokens.shape[1]], dtype=torch.int64) # (1, T)
speaker = torch.tensor([1], dtype=torch.int64) # (1, )

View File

@@ -1,5 +1,6 @@
#!/usr/bin/env python3
- # Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao)
+ # Copyright 2023-2024 Xiaomi Corporation (Author: Zengwei Yao,
+ #                                                 Zengrui Jin,)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
@@ -342,14 +343,16 @@ def prepare_input(
torch.Tensor([speaker_map[sid] for sid in batch["speakers"]]).int().to(device)
)
- tokens = tokenizer.tokens_to_token_ids(tokens)
+ tokens = tokenizer.tokens_to_token_ids(
+     tokens, intersperse_blank=True, add_sos=True, add_eos=True
+ )
tokens = k2.RaggedTensor(tokens)
row_splits = tokens.shape.row_splits(1)
tokens_lens = row_splits[1:] - row_splits[:-1]
tokens = tokens.to(device)
tokens_lens = tokens_lens.to(device)
# a tensor of shape (B, T)
- tokens = tokens.pad(mode="constant", padding_value=tokenizer.blank_id)
+ tokens = tokens.pad(mode="constant", padding_value=tokenizer.pad_id)
return audio, audio_lens, features, features_lens, tokens, tokens_lens, speakers
@@ -812,8 +815,7 @@ def run(rank, world_size, args):
logging.info(f"Device: {device}")
tokenizer = Tokenizer(params.tokens)
- params.blank_id = tokenizer.blank_id
- params.oov_id = tokenizer.oov_id
+ params.blank_id = tokenizer.pad_id
params.vocab_size = tokenizer.vocab_size
vctk = VctkTtsDataModule(args)
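The surrounding lines turn the per-utterance token lists into a padded (B, T) batch; the same k2 pattern in isolation, assuming k2 is installed:

import k2

tokens = k2.RaggedTensor([[5, 9, 7], [3, 4]])
row_splits = tokens.shape.row_splits(1)
tokens_lens = row_splits[1:] - row_splits[:-1]  # tensor([3, 2]), one length per utterance
padded = tokens.pad(mode="constant", padding_value=0)  # pad with the tokenizer's pad id
# padded: tensor([[5, 9, 7], [3, 4, 0]])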

View File

@@ -1,6 +1,7 @@
# Copyright 2021 Piotr Żelasko
- # Copyright 2022-2023 Xiaomi Corporation (Authors: Mingshuang Luo,
- #                                                   Zengwei Yao)
+ # Copyright 2022-2024 Xiaomi Corporation (Authors: Mingshuang Luo,
+ #                                                   Zengwei Yao,
+ #                                                   Zengrui Jin,)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#