mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-08-17 21:22:20 +00:00
generate tokens.txt
This commit is contained in:
parent
42fee0228b
commit
35766d7db4
65
egs/baker_zh/TTS/local/generate_tokens.py
Normal file
65
egs/baker_zh/TTS/local/generate_tokens.py
Normal file
@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
This file generates the file tokens.txt.
|
||||
|
||||
Usage:
|
||||
|
||||
python3 ./local/generate_tokens.py --tokens data/tokens.txt
|
||||
"""
|
||||
|
||||
|
||||
import argparse
|
||||
from typing import List
|
||||
|
||||
import jieba
|
||||
from pypinyin import Style, lazy_pinyin, pinyin_dict
|
||||
|
||||
|
||||
def get_parser():
    """Build the command-line argument parser for this script.

    Returns:
      An ``argparse.ArgumentParser`` that accepts a single, required
      option ``--tokens``: the path of the tokens.txt file to write.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--tokens",
        type=str,
        required=True,
        help="Path to save tokens.txt.",
    )

    return parser
|
||||
|
||||
|
||||
def generate_token_list() -> List[str]:
    """Build the sorted list of pinyin tokens written to tokens.txt.

    Every key of ``pinyin_dict.pinyin_dict`` that lies in the range
    U+4E00..U+9FFF (the CJK Unified Ideographs block) is converted to a
    character and passed to ``lazy_pinyin`` with ``Style.TONE3`` and
    ``tone_sandhi=True``. The first element of each result is collected
    into a set, so duplicates are dropped.

    Returns:
      A list whose first entry (index 0) is the blank token ``" "``,
      followed by the unique pinyin tokens in sorted order.
    """
    token_set = set()

    for key in pinyin_dict.pinyin_dict:
        # Skip every code point outside U+4E00..U+9FFF.
        if not (0x4E00 <= key <= 0x9FFF):
            continue

        w = chr(key)
        # lazy_pinyin returns a list; only its first element is kept.
        t = lazy_pinyin(w, style=Style.TONE3, tone_sandhi=True)[0]
        token_set.add(t)

    ans = sorted(token_set)

    # use ID 0 for blank
    # We use blank for padding
    ans.insert(0, " ")

    return ans
|
||||
|
||||
|
||||
def main():
    """Write the token table to the file given by ``--tokens``.

    Each output line has the form ``<token> <id>``, where the id is the
    position of the token in the list returned by generate_token_list().
    """
    args = get_parser().parse_args()
    token_list = generate_token_list()

    # Lines are produced lazily and consumed by writelines() below.
    lines = (f"{token} {indx}\n" for indx, token in enumerate(token_list))

    with open(args.tokens, "w", encoding="utf-8") as f:
        f.writelines(lines)


if __name__ == "__main__":
    main()
|
@ -64,3 +64,21 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
||||
lhotse download baker-zh $dl_dir
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare baker-zh manifest"
  # We assume that you have downloaded the baker corpus
  # to $dl_dir/BZNSYP
  mkdir -p data/manifests
  # The .done marker is created only after lhotse returns, so a re-run
  # skips the preparation once the marker file exists.
  if [ ! -e data/manifests/.baker-zh.done ]; then
    # Quote the corpus path so a $dl_dir containing whitespace is passed
    # to lhotse as a single argument.
    lhotse prepare baker-zh "$dl_dir/BZNSYP" data/manifests
    touch data/manifests/.baker-zh.done
  fi
fi
|
||||
|
||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Generate tokens.txt"
  # Make sure the output directory exists so this stage also works when
  # it is run on its own; generate_tokens.py opens the file for writing
  # and does not create parent directories.
  mkdir -p data
  # Skip generation when data/tokens.txt is already present.
  if [ ! -e data/tokens.txt ]; then
    python3 ./local/generate_tokens.py --tokens data/tokens.txt
  fi
fi
|
||||
|
Loading…
x
Reference in New Issue
Block a user