From 35766d7db45887046b0a0f0bf5e0e02accf75135 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Wed, 18 Dec 2024 18:28:05 +0800
Subject: [PATCH] generate tokens.txt

---
 egs/baker_zh/TTS/local/generate_tokens.py | 65 +++++++++++++++++++++++
 egs/baker_zh/TTS/prepare.sh               | 18 +++++++
 2 files changed, 83 insertions(+)
 create mode 100644 egs/baker_zh/TTS/local/generate_tokens.py

diff --git a/egs/baker_zh/TTS/local/generate_tokens.py b/egs/baker_zh/TTS/local/generate_tokens.py
new file mode 100644
index 000000000..9d51cbfc7
--- /dev/null
+++ b/egs/baker_zh/TTS/local/generate_tokens.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+"""
+This file generates the file tokens.txt.
+
+Usage:
+
+python3 ./local/generate_tokens.py --tokens data/tokens.txt
+"""
+
+
+import argparse
+from typing import List
+
+import jieba
+from pypinyin import Style, lazy_pinyin, pinyin_dict
+
+
+def get_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "--tokens",
+        type=str,
+        required=True,
+        help="Path to save tokens.txt.",
+    )
+
+    return parser
+
+
+def generate_token_list() -> List[str]:
+    token_set = set()
+
+    word_dict = pinyin_dict.pinyin_dict
+    i = 0
+    for key in word_dict:
+        if not (0x4E00 <= key <= 0x9FFF):
+            continue
+
+        w = chr(key)
+        t = lazy_pinyin(w, style=Style.TONE3, tone_sandhi=True)[0]
+        token_set.add(t)
+
+    ans = list(token_set)
+    ans.sort()
+
+    # Use ID 0 for the blank token.
+    # We use the blank token for padding.
+    ans.insert(0, " ")
+
+    return ans
+
+
+def main():
+    args = get_parser().parse_args()
+    token_list = generate_token_list()
+    with open(args.tokens, "w", encoding="utf-8") as f:
+        for indx, token in enumerate(token_list):
+            f.write(f"{token} {indx}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/baker_zh/TTS/prepare.sh b/egs/baker_zh/TTS/prepare.sh
index e6840f66a..e5fcf0278 100755
--- a/egs/baker_zh/TTS/prepare.sh
+++ b/egs/baker_zh/TTS/prepare.sh
@@ -64,3 +64,21 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
     lhotse download baker-zh $dl_dir
   fi
 fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare baker-zh manifest"
+  # We assume that you have downloaded the baker corpus
+  # to $dl_dir/BZNSYP
+  mkdir -p data/manifests
+  if [ ! -e data/manifests/.baker-zh.done ]; then
+    lhotse prepare baker-zh $dl_dir/BZNSYP data/manifests
+    touch data/manifests/.baker-zh.done
+  fi
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Generate tokens.txt"
+  if [ ! -e data/tokens.txt ]; then
+    python3 ./local/generate_tokens.py --tokens data/tokens.txt
+  fi
+fi
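
Note (not part of the patch): data/tokens.txt ends up with one "<token> <id>" entry per line, where ID 0 is the blank token (a single space) used for padding and the remaining IDs are assigned to the sorted pinyin syllables. The sketch below only illustrates how a downstream script might load that file and map a sentence to token IDs with the same Style.TONE3 / tone_sandhi=True settings as ./local/generate_tokens.py; the file name, function names, and unknown-symbol handling are assumptions made for this example.

# Hypothetical consumer of data/tokens.txt (illustration only, not part of
# this patch). It maps a Chinese sentence to token IDs using the same
# pypinyin settings as ./local/generate_tokens.py.

from typing import Dict, List

from pypinyin import Style, lazy_pinyin


def read_token2id(filename: str = "data/tokens.txt") -> Dict[str, int]:
    """Each line of tokens.txt has the form "<token> <id>"."""
    token2id: Dict[str, int] = {}
    with open(filename, encoding="utf-8") as f:
        for line in f:
            # Strip only the trailing newline; the blank token is a space and
            # its line looks like "  0", so a plain split() would lose it.
            fields = line.rstrip("\n").split(" ")
            token = " ".join(fields[:-1])
            token2id[token] = int(fields[-1])
    return token2id


def text_to_ids(text: str, token2id: Dict[str, int]) -> List[int]:
    # Match the settings used when tokens.txt was generated so the produced
    # syllables agree with its entries.
    tokens = lazy_pinyin(text, style=Style.TONE3, tone_sandhi=True)
    # Symbols that are not in tokens.txt (e.g. punctuation) are skipped here.
    return [token2id[t] for t in tokens if t in token2id]


if __name__ == "__main__":
    token2id = read_token2id()
    print(text_to_ids("你好", token2id))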