use piper_phonemize as text tokenizer in ljspeech recipe

2025-12-11 06:55:27 +00:00 · 2024-02-20 17:45:56 +08:00 · 2024-02-20 17:45:56 +08:00 · 2cf5891c15
commit 2cf5891c15
parent ff6784d147
4 changed files with 66 additions and 64 deletions
--- a/egs/ljspeech/TTS/local/prepare_token_file.py
+++ b/egs/ljspeech/TTS/local/prepare_token_file.py
@ -17,7 +17,7 @@


 """
-This file reads the texts in given manifest and generates the file that maps tokens to IDs.
+This file generates the token-to-ID mapping file from piper_phonemize's espeak map.
 """

 import argparse
@ -25,80 +25,47 @@ import logging
 from pathlib import Path
 from typing import Dict

-from lhotse import load_manifest
+from piper_phonemize import get_espeak_map


 def get_args():
    parser = argparse.ArgumentParser()

-    parser.add_argument(
-        "--manifest-file",
-        type=Path,
-        default=Path("data/spectrogram/ljspeech_cuts_train.jsonl.gz"),
-        help="Path to the manifest file",
-    )
-
    parser.add_argument(
        "--tokens",
        type=Path,
        default=Path("data/tokens.txt"),
-        help="Path to the tokens",
+        help="Path to the dict that maps the text tokens to IDs",
    )

    return parser.parse_args()


-def write_mapping(filename: str, sym2id: Dict[str, int]) -> None:
-    """Write a symbol to ID mapping to a file.
-
-    Note:
-      No need to implement `read_mapping` as it can be done
-      through :func:`k2.SymbolTable.from_file`.
-
-    Args:
-      filename:
-        Filename to save the mapping.
-      sym2id:
-        A dict mapping symbols to IDs.
-    Returns:
-      Return None.
-    """
-    with open(filename, "w", encoding="utf-8") as f:
-        for sym, i in sym2id.items():
-            f.write(f"{sym} {i}\n")
-
-
-def get_token2id(manifest_file: Path) -> Dict[str, int]:
-    """Return a dict that maps token to IDs."""
+def get_token2id(filename: Path) -> Dict[str, int]:
+    """Generate the token-to-ID mapping and write it to the given filename."""
    extra_tokens = [
        "<blk>",  # 0 for blank
-        "<sos/eos>",  # 1 for sos and eos symbols.
-        "<unk>",  # 2 for OOV
+        "<sos>",  # 1 for sos
+        "<eos>",  # 2 for eos
+        "<unk>",  # 3 for OOV
    ]
-    all_tokens = set()

-    cut_set = load_manifest(manifest_file)
+    all_tokens = list(get_espeak_map().keys())

-    for cut in cut_set:
-        # Each cut only contain one supervision
-        assert len(cut.supervisions) == 1, len(cut.supervisions)
-        for t in cut.tokens:
-            all_tokens.add(t)
+    for t in extra_tokens:
+        assert t not in all_tokens, t

-    all_tokens = extra_tokens + list(all_tokens)
+    all_tokens = extra_tokens + all_tokens

-    token2id: Dict[str, int] = {token: i for i, token in enumerate(all_tokens)}
-    return token2id
+    with open(filename, "w", encoding="utf-8") as f:
+        for i, token in enumerate(all_tokens):
+            f.write(f"{token} {i}\n")


 if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
    logging.basicConfig(format=formatter, level=logging.INFO)

    args = get_args()
-    manifest_file = Path(args.manifest_file)
    out_file = Path(args.tokens)
-
-    token2id = get_token2id(manifest_file)
-    write_mapping(out_file, token2id)
+    get_token2id(out_file)
--- a/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
+++ b/egs/ljspeech/TTS/local/prepare_tokens_ljspeech.py
@ -23,9 +23,9 @@ This file reads the texts in given manifest and save the new cuts with phoneme t
 import logging
 from pathlib import Path

-import g2p_en
 import tacotron_cleaner.cleaners
 from lhotse import CutSet, load_manifest
+from piper_phonemize import phonemize_espeak


 def prepare_tokens_ljspeech():
@ -35,7 +35,6 @@ def prepare_tokens_ljspeech():
    partition = "all"

    cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")
-    g2p = g2p_en.G2p()

    new_cuts = []
    for cut in cut_set:
@ -45,7 +44,11 @@ def prepare_tokens_ljspeech():
        # Text normalization
        text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
        # Convert to phonemes
-        cut.tokens = g2p(text)
+        tokens_list = phonemize_espeak(text, "en-us")
+        tokens = []
+        for t in tokens_list:
+            tokens.extend(t)
+        cut.tokens = tokens
        new_cuts.append(cut)

    new_cut_set = CutSet.from_cuts(new_cuts)
--- a/egs/ljspeech/TTS/prepare.sh
+++ b/egs/ljspeech/TTS/prepare.sh
@ -30,7 +30,7 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
    cd vits/monotonic_align
    python setup.py build_ext --inplace
    cd ../../
-  else 
+  else
    log "monotonic_align lib already built"
  fi
 fi
@ -80,6 +80,10 @@ fi

 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Prepare phoneme tokens for LJSpeech"
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
+  # If not, please install them with:
+  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
+  #   - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
  if [ ! -e data/spectrogram/.ljspeech_with_token.done ]; then
    ./local/prepare_tokens_ljspeech.py
    mv data/spectrogram/ljspeech_cuts_with_tokens_all.jsonl.gz \
@ -113,13 +117,11 @@ fi

 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Generate token file"
-  # We assume you have installed g2p_en and espnet_tts_frontend.
+  # We assume you have installed piper_phonemize and espnet_tts_frontend.
  # If not, please install them with:
-  #   - g2p_en: `pip install g2p_en`, refer to https://github.com/Kyubyong/g2p
+  #   - piper_phonemize: refer to https://github.com/rhasspy/piper-phonemize
  #   - espnet_tts_frontend, `pip install espnet_tts_frontend`, refer to https://github.com/espnet/espnet_tts_frontend/
  if [ ! -e data/tokens.txt ]; then
-    ./local/prepare_token_file.py \
-      --manifest-file data/spectrogram/ljspeech_cuts_train.jsonl.gz \
-      --tokens data/tokens.txt
+    ./local/prepare_token_file.py --tokens data/tokens.txt
  fi
 fi
--- a/egs/ljspeech/TTS/vits/tokenizer.py
+++ b/egs/ljspeech/TTS/vits/tokenizer.py
@ -16,8 +16,8 @@

 from typing import Dict, List

-import g2p_en
 import tacotron_cleaner.cleaners
+from piper_phonemize import phonemize_espeak
 from utils import intersperse


@ -41,18 +41,28 @@ class Tokenizer(object):
                self.token2id[token] = id

        self.blank_id = self.token2id["<blk>"]
+        self.sos_id = self.token2id["<sos>"]
+        self.eos_id = self.token2id["<eos>"]
        self.oov_id = self.token2id["<unk>"]
        self.vocab_size = len(self.token2id)

-        self.g2p = g2p_en.G2p()
-
-    def texts_to_token_ids(self, texts: List[str], intersperse_blank: bool = True):
+    def texts_to_token_ids(
+        self,
+        texts: List[str],
+        intersperse_blank: bool = True,
+        add_sos: bool = False,
+        add_eos: bool = False,
+    ):
        """
        Args:
          texts:
            A list of transcripts.
          intersperse_blank:
            Whether to intersperse blanks in the token sequence.
+          add_sos:
+            Whether to add sos token at the start.
+          add_eos:
+            Whether to add eos token at the end.

        Returns:
          Return a list of token id list [utterance][token_id]
@ -63,7 +73,11 @@ class Tokenizer(object):
            # Text normalization
            text = tacotron_cleaner.cleaners.custom_english_cleaners(text)
            # Convert to phonemes
-            tokens = self.g2p(text)
+            tokens_list = phonemize_espeak(text, "en-us")
+            tokens = []
+            for t in tokens_list:
+                tokens.extend(t)
+
            token_ids = []
            for t in tokens:
                if t in self.token2id:
@ -73,13 +87,21 @@ class Tokenizer(object):

            if intersperse_blank:
                token_ids = intersperse(token_ids, self.blank_id)
+            if add_sos:
+                token_ids = [self.sos_id] + token_ids
+            if add_eos:
+                token_ids = token_ids + [self.eos_id]

            token_ids_list.append(token_ids)

        return token_ids_list

    def tokens_to_token_ids(
-        self, tokens_list: List[str], intersperse_blank: bool = True
+        self,
+        tokens_list: List[str],
+        intersperse_blank: bool = True,
+        add_sos: bool = False,
+        add_eos: bool = False,
    ):
        """
        Args:
@ -87,6 +109,10 @@ class Tokenizer(object):
            A list of token list, each corresponding to one utterance.
          intersperse_blank:
            Whether to intersperse blanks in the token sequence.
+          add_sos:
+            Whether to add sos token at the start.
+          add_eos:
+            Whether to add eos token at the end.

        Returns:
          Return a list of token id list [utterance][token_id]
@ -103,6 +129,10 @@ class Tokenizer(object):

            if intersperse_blank:
                token_ids = intersperse(token_ids, self.blank_id)
+            if add_sos:
+                token_ids = [self.sos_id] + token_ids
+            if add_eos:
+                token_ids = token_ids + [self.eos_id]

            token_ids_list.append(token_ids)