Merge c25dc02d5d192a03fc61302d05d2ee602c008b4d into 9293edc62f4a3ebf769d66cc037d4e67953440f5

This commit is contained in:
Fangjun Kuang 2025-07-08 17:44:55 +08:00 committed by GitHub
commit bf048133e1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
37 changed files with 3415 additions and 6 deletions

118
.github/scripts/aishell3/TTS/run.sh vendored Executable file
View File

@ -0,0 +1,118 @@
#!/usr/bin/env bash

# CI entry point for the aishell3 TTS (VITS) recipe: installs extra
# dependencies, then runs data download/preparation, a short training,
# ONNX export, and an inference smoke test.

set -ex

# piper_phonemize wheels are hosted on a custom index; numba, pypinyin and
# cython are needed by the recipe itself, jq by the CI tooling.
python3 -m pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
python3 -m pip install numba
python3 -m pip install pypinyin
python3 -m pip install cython

apt-get update
apt-get install -y jq

# Print a timestamped log line prefixed with the caller's file/line/function.
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

cd egs/aishell3/TTS

# Shrink size thresholds in prepare.sh (1000 -> 10) so the CI run stays small.
# NOTE(review): this global sed rewrites EVERY occurrence of "1000" in
# prepare.sh — confirm no unrelated constant is affected.
sed -i.bak s/1000/10/g ./prepare.sh
# Download the trimmed aishell3 CI dataset into ./download.
function download_data() {
  mkdir download
  pushd download
  # Small aishell3 subset hosted specifically for CI runs.
  curl -SL -O https://huggingface.co/csukuangfj/aishell3-ci-data/resolve/main/aishell3.tar.bz2
  tar xf aishell3.tar.bz2
  rm aishell3.tar.bz2
  ls -lh
  popd
}
# Run the recipe's data preparation, then dump summaries of the generated
# token table and lexicon for debugging in the CI log.
function prepare_data() {
  ./prepare.sh

  echo "----------tokens.txt----------"
  cat data/tokens.txt
  echo "------------------------------"
  wc -l data/tokens.txt
  echo "------------------------------"

  echo "----------lexicon.txt----------"
  head data/lexicon.txt
  echo "----"
  tail data/lexicon.txt
  echo "----"
  wc -l data/lexicon.txt
}
# Train a tiny VITS model for one epoch (low-quality config only in CI).
function train() {
  pushd ./vits
  # Shrink a hard-coded 200 in train.py to 50 to keep the CI run fast.
  # NOTE(review): this sed rewrites EVERY "200" in train.py — confirm only
  # the intended constant is affected.
  sed -i.bak s/200/50/g ./train.py
  git diff .
  popd

  # for t in low medium high; do
  for t in low; do
    ./vits/train.py \
      --exp-dir vits/exp-$t \
      --model-type $t \
      --num-epochs 1 \
      --save-every-n 1 \
      --num-buckets 2 \
      --tokens data/tokens.txt \
      --max-duration 20

    ls -lh vits/exp-$t
  done
}
# Export the checkpoint trained above (epoch 1) to ONNX.
function export_onnx() {
  # for t in low medium high; do
  for t in low; do
    ./vits/export-onnx.py \
      --model-type $t \
      --epoch 1 \
      --exp-dir ./vits/exp-$t \
      --tokens data/tokens.txt \
      --speakers ./data/speakers.txt

    ls -lh vits/exp-$t/
  done
}
# Download a pretrained low-quality model, export it to ONNX, and run a
# text-to-speech smoke test with sherpa-onnx.
function test_low() {
  git clone https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06
  repo=icefall-tts-aishell3-vits-low-2024-04-06

  ./vits/export-onnx.py \
    --model-type low \
    --epoch 1000 \
    --exp-dir $repo/exp \
    --tokens $repo/data/tokens.txt \
    --speakers $repo/data/speakers.txt

  ls -lh $repo/exp/vits-epoch-1000.onnx

  python3 -m pip install sherpa-onnx

  # Fix: use the model exported above (epoch 1000).  The original pointed
  # at vits-epoch-960.onnx, which this script never creates.
  sherpa-onnx-offline-tts \
    --vits-model=$repo/exp/vits-epoch-1000.onnx \
    --vits-tokens=$repo/data/tokens.txt \
    --vits-lexicon=$repo/data/lexicon.txt \
    --num-threads=1 \
    --vits-length-scale=1.0 \
    --sid=33 \
    --output-filename=/icefall/low.wav \
    --debug=1 \
    "这是一个语音合成测试"
}
# Pipeline entry: each step consumes the previous step's outputs.
download_data
prepare_data
train
export_onnx
test_low

84
.github/workflows/aishell3.yml vendored Normal file
View File

@ -0,0 +1,84 @@
name: aishell3

on:
  push:
    branches:
      - master
      - tts-aishell3
  pull_request:
    branches:
      - master
  workflow_dispatch:

concurrency:
  group: aishell3-${{ github.ref }}
  cancel-in-progress: true

jobs:
  generate_build_matrix:
    # Fix: the original compared github.event_name to 'aishell3', which is
    # never a valid event name, so manual runs were rejected; the declared
    # workflow_dispatch trigger is what was intended.
    if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event.label.name == 'ready' || github.event_name == 'push' || github.event_name == 'workflow_dispatch')
    # see https://github.com/pytorch/pytorch/pull/50633
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Generating build matrix
        id: set-matrix
        run: |
          # outputting for debugging purposes
          python ./.github/scripts/docker/generate_build_matrix.py
          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
          # Fix: the ::set-output workflow command is deprecated;
          # write to $GITHUB_OUTPUT instead.
          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"

  aishell3:
    needs: generate_build_matrix
    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Free space
        shell: bash
        run: |
          df -h
          rm -rf /opt/hostedtoolcache
          df -h
          echo "pwd: $PWD"
          echo "github.workspace ${{ github.workspace }}"

      - name: Run aishell3 tests
        uses: addnab/docker-run-action@v3
        with:
          image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
          options: |
            --volume ${{ github.workspace }}/:/icefall
          shell: bash
          run: |
            export PYTHONPATH=/icefall:$PYTHONPATH
            cd /icefall
            git config --global --add safe.directory /icefall

            .github/scripts/aishell3/TTS/run.sh

      - name: display files
        shell: bash
        run: |
          ls -lh

      - uses: actions/upload-artifact@v4
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
        with:
          name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
          path: ./*.wav

4
.gitignore vendored
View File

@ -36,3 +36,7 @@ node_modules
.DS_Store
*.fst
*.arpa
core.c
*.so
build
*.wav

View File

@ -19,7 +19,7 @@ Install extra dependencies
.. code-block:: bash
pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
pip install numba espnet_tts_frontend
pip install numba espnet_tts_frontend cython
Data preparation
----------------

View File

@ -0,0 +1,110 @@
#!/usr/bin/env python3
# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang,
# Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file computes fbank features of the aishell3 dataset.
It looks for manifests in the directory data/manifests.
The generated spectrogram features are saved in data/spectrogram.
"""
import logging
import os
from pathlib import Path
import torch
from lhotse import (
CutSet,
LilcomChunkyWriter,
Spectrogram,
SpectrogramConfig,
load_manifest,
)
from lhotse.audio import RecordingSet
from lhotse.supervision import SupervisionSet
from icefall.utils import get_executor
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
def compute_spectrogram_aishell3():
    """Compute linear spectrogram features for the aishell3 dataset.

    Reads recording/supervision manifests from ``data/manifests``,
    resamples the audio to 8 kHz, extracts spectrograms, and writes
    cut manifests with attached features to ``data/spectrogram``.
    Partitions whose cuts file already exists are skipped.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/spectrogram")
    num_jobs = min(4, os.cpu_count())

    # VITS "low" config operates on 8 kHz audio; aishell3 recordings are
    # resampled below.  TODO confirm 8000 matches the training config.
    sampling_rate = 8000
    frame_length = 1024 / sampling_rate  # (in second)
    frame_shift = 256 / sampling_rate  # (in second)
    use_fft_mag = True

    prefix = "aishell3"
    suffix = "jsonl.gz"
    partitions = ("test", "train")

    config = SpectrogramConfig(
        sampling_rate=sampling_rate,
        frame_length=frame_length,
        frame_shift=frame_shift,
        use_fft_mag=use_fft_mag,
    )
    extractor = Spectrogram(config)

    for partition in partitions:
        cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
        if (output_dir / cuts_filename).is_file():
            logging.info(f"{cuts_filename} already exists - skipping.")
            # Fix: the original used `return` here, which aborted ALL
            # remaining partitions instead of skipping only this one.
            continue

        recordings = load_manifest(
            src_dir / f"{prefix}_recordings_{partition}.{suffix}", RecordingSet
        )
        supervisions = load_manifest(
            src_dir / f"{prefix}_supervisions_{partition}.{suffix}", SupervisionSet
        )

        # resample from 44100 to 8000
        recordings = recordings.resample(sampling_rate)

        with get_executor() as ex:  # Initialize the executor only once.
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=recordings, supervisions=supervisions
            )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / cuts_filename)


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    compute_spectrogram_aishell3()

View File

@ -0,0 +1,68 @@
#!/usr/bin/env python3
"""
This file generates the file lexicon.txt that contains pronunciations of all
words and phrases
"""
from pypinyin import phrases_dict, pinyin_dict
from tokenizer import Tokenizer
import argparse
def get_parser():
    """Build the argument parser for lexicon generation."""
    ap = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    # (flag, default, help) triples; both options are plain string paths.
    options = (
        ("--tokens", "data/tokens.txt", """Path to vocabulary."""),
        ("--lexicon", "data/lexicon.txt", """Path to save the generated lexicon."""),
    )
    for flag, default, help_text in options:
        ap.add_argument(flag, type=str, default=default, help=help_text)
    return ap
def main():
    """Generate a lexicon file mapping each CJK character to its phonemes.

    Every codepoint in the CJK Unified Ideographs block that pypinyin knows
    about is written as one ``<char> <phoneme> <phoneme> ...`` line to the
    file given by --lexicon.
    """
    args = get_parser().parse_args()

    tokenizer = Tokenizer(args.tokens)

    word_dict = pinyin_dict.pinyin_dict
    # Kept for the commented-out phrase handling below (TODO).
    phrases = phrases_dict.phrases_dict

    with open(args.lexicon, "w", encoding="utf-8") as f:
        for key in word_dict:
            # Only keep codepoints in the CJK Unified Ideographs block.
            if not (0x4E00 <= key <= 0x9FFF):
                continue

            w = chr(key)

            # [1:-1] removes the initial "sil" and the final "eos".
            # (Also removed a dead `i = 0` counter and renamed the loop
            # variable so it no longer shadows args.tokens.)
            phonemes = tokenizer.text_to_tokens(w)[1:-1]
            f.write(f"{w} {' '.join(phonemes)}\n")

        # TODO(fangjun): Add phrases
        #  for key in phrases:
        #      phonemes = tokenizer.text_to_tokens(key)[1:-1]
        #      f.write(f"{key} {' '.join(phonemes)}\n")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,421 @@
# This dict is copied from
# https://github.com/UEhQZXI/vits_chinese/blob/master/vits_strings.py
pinyin_dict = {
"a": ("^", "a"),
"ai": ("^", "ai"),
"an": ("^", "an"),
"ang": ("^", "ang"),
"ao": ("^", "ao"),
"ba": ("b", "a"),
"bai": ("b", "ai"),
"ban": ("b", "an"),
"bang": ("b", "ang"),
"bao": ("b", "ao"),
"be": ("b", "e"),
"bei": ("b", "ei"),
"ben": ("b", "en"),
"beng": ("b", "eng"),
"bi": ("b", "i"),
"bian": ("b", "ian"),
"biao": ("b", "iao"),
"bie": ("b", "ie"),
"bin": ("b", "in"),
"bing": ("b", "ing"),
"bo": ("b", "o"),
"bu": ("b", "u"),
"ca": ("c", "a"),
"cai": ("c", "ai"),
"can": ("c", "an"),
"cang": ("c", "ang"),
"cao": ("c", "ao"),
"ce": ("c", "e"),
"cen": ("c", "en"),
"ceng": ("c", "eng"),
"cha": ("ch", "a"),
"chai": ("ch", "ai"),
"chan": ("ch", "an"),
"chang": ("ch", "ang"),
"chao": ("ch", "ao"),
"che": ("ch", "e"),
"chen": ("ch", "en"),
"cheng": ("ch", "eng"),
"chi": ("ch", "iii"),
"chong": ("ch", "ong"),
"chou": ("ch", "ou"),
"chu": ("ch", "u"),
"chua": ("ch", "ua"),
"chuai": ("ch", "uai"),
"chuan": ("ch", "uan"),
"chuang": ("ch", "uang"),
"chui": ("ch", "uei"),
"chun": ("ch", "uen"),
"chuo": ("ch", "uo"),
"ci": ("c", "ii"),
"cong": ("c", "ong"),
"cou": ("c", "ou"),
"cu": ("c", "u"),
"cuan": ("c", "uan"),
"cui": ("c", "uei"),
"cun": ("c", "uen"),
"cuo": ("c", "uo"),
"da": ("d", "a"),
"dai": ("d", "ai"),
"dan": ("d", "an"),
"dang": ("d", "ang"),
"dao": ("d", "ao"),
"de": ("d", "e"),
"dei": ("d", "ei"),
"den": ("d", "en"),
"deng": ("d", "eng"),
"di": ("d", "i"),
"dia": ("d", "ia"),
"dian": ("d", "ian"),
"diao": ("d", "iao"),
"die": ("d", "ie"),
"ding": ("d", "ing"),
"diu": ("d", "iou"),
"dong": ("d", "ong"),
"dou": ("d", "ou"),
"du": ("d", "u"),
"duan": ("d", "uan"),
"dui": ("d", "uei"),
"dun": ("d", "uen"),
"duo": ("d", "uo"),
"e": ("^", "e"),
"ei": ("^", "ei"),
"en": ("^", "en"),
"ng": ("^", "en"),
"eng": ("^", "eng"),
"er": ("^", "er"),
"fa": ("f", "a"),
"fan": ("f", "an"),
"fang": ("f", "ang"),
"fei": ("f", "ei"),
"fen": ("f", "en"),
"feng": ("f", "eng"),
"fo": ("f", "o"),
"fou": ("f", "ou"),
"fu": ("f", "u"),
"ga": ("g", "a"),
"gai": ("g", "ai"),
"gan": ("g", "an"),
"gang": ("g", "ang"),
"gao": ("g", "ao"),
"ge": ("g", "e"),
"gei": ("g", "ei"),
"gen": ("g", "en"),
"geng": ("g", "eng"),
"gong": ("g", "ong"),
"gou": ("g", "ou"),
"gu": ("g", "u"),
"gua": ("g", "ua"),
"guai": ("g", "uai"),
"guan": ("g", "uan"),
"guang": ("g", "uang"),
"gui": ("g", "uei"),
"gun": ("g", "uen"),
"guo": ("g", "uo"),
"ha": ("h", "a"),
"hai": ("h", "ai"),
"han": ("h", "an"),
"hang": ("h", "ang"),
"hao": ("h", "ao"),
"he": ("h", "e"),
"hei": ("h", "ei"),
"hen": ("h", "en"),
"heng": ("h", "eng"),
"hong": ("h", "ong"),
"hou": ("h", "ou"),
"hu": ("h", "u"),
"hua": ("h", "ua"),
"huai": ("h", "uai"),
"huan": ("h", "uan"),
"huang": ("h", "uang"),
"hui": ("h", "uei"),
"hun": ("h", "uen"),
"huo": ("h", "uo"),
"ji": ("j", "i"),
"jia": ("j", "ia"),
"jian": ("j", "ian"),
"jiang": ("j", "iang"),
"jiao": ("j", "iao"),
"jie": ("j", "ie"),
"jin": ("j", "in"),
"jing": ("j", "ing"),
"jiong": ("j", "iong"),
"jiu": ("j", "iou"),
"ju": ("j", "v"),
"juan": ("j", "van"),
"jue": ("j", "ve"),
"jun": ("j", "vn"),
"ka": ("k", "a"),
"kai": ("k", "ai"),
"kan": ("k", "an"),
"kang": ("k", "ang"),
"kao": ("k", "ao"),
"ke": ("k", "e"),
"kei": ("k", "ei"),
"ken": ("k", "en"),
"keng": ("k", "eng"),
"kong": ("k", "ong"),
"kou": ("k", "ou"),
"ku": ("k", "u"),
"kua": ("k", "ua"),
"kuai": ("k", "uai"),
"kuan": ("k", "uan"),
"kuang": ("k", "uang"),
"kui": ("k", "uei"),
"kun": ("k", "uen"),
"kuo": ("k", "uo"),
"la": ("l", "a"),
"lai": ("l", "ai"),
"lan": ("l", "an"),
"lang": ("l", "ang"),
"lao": ("l", "ao"),
"le": ("l", "e"),
"lei": ("l", "ei"),
"leng": ("l", "eng"),
"li": ("l", "i"),
"lia": ("l", "ia"),
"lian": ("l", "ian"),
"liang": ("l", "iang"),
"liao": ("l", "iao"),
"lie": ("l", "ie"),
"lin": ("l", "in"),
"ling": ("l", "ing"),
"liu": ("l", "iou"),
"lo": ("l", "o"),
"long": ("l", "ong"),
"lou": ("l", "ou"),
"lu": ("l", "u"),
"lv": ("l", "v"),
"luan": ("l", "uan"),
"lve": ("l", "ve"),
"lue": ("l", "ve"),
"lun": ("l", "uen"),
"luo": ("l", "uo"),
"ma": ("m", "a"),
"mai": ("m", "ai"),
"man": ("m", "an"),
"mang": ("m", "ang"),
"mao": ("m", "ao"),
"me": ("m", "e"),
"mei": ("m", "ei"),
"men": ("m", "en"),
"meng": ("m", "eng"),
"mi": ("m", "i"),
"mian": ("m", "ian"),
"miao": ("m", "iao"),
"mie": ("m", "ie"),
"min": ("m", "in"),
"ming": ("m", "ing"),
"miu": ("m", "iou"),
"mo": ("m", "o"),
"mou": ("m", "ou"),
"mu": ("m", "u"),
"na": ("n", "a"),
"nai": ("n", "ai"),
"nan": ("n", "an"),
"nang": ("n", "ang"),
"nao": ("n", "ao"),
"ne": ("n", "e"),
"nei": ("n", "ei"),
"nen": ("n", "en"),
"neng": ("n", "eng"),
"ni": ("n", "i"),
"nia": ("n", "ia"),
"nian": ("n", "ian"),
"niang": ("n", "iang"),
"niao": ("n", "iao"),
"nie": ("n", "ie"),
"nin": ("n", "in"),
"ning": ("n", "ing"),
"niu": ("n", "iou"),
"nong": ("n", "ong"),
"nou": ("n", "ou"),
"nu": ("n", "u"),
"nv": ("n", "v"),
"nuan": ("n", "uan"),
"nve": ("n", "ve"),
"nue": ("n", "ve"),
"nuo": ("n", "uo"),
"o": ("^", "o"),
"ou": ("^", "ou"),
"pa": ("p", "a"),
"pai": ("p", "ai"),
"pan": ("p", "an"),
"pang": ("p", "ang"),
"pao": ("p", "ao"),
"pe": ("p", "e"),
"pei": ("p", "ei"),
"pen": ("p", "en"),
"peng": ("p", "eng"),
"pi": ("p", "i"),
"pian": ("p", "ian"),
"piao": ("p", "iao"),
"pie": ("p", "ie"),
"pin": ("p", "in"),
"ping": ("p", "ing"),
"po": ("p", "o"),
"pou": ("p", "ou"),
"pu": ("p", "u"),
"qi": ("q", "i"),
"qia": ("q", "ia"),
"qian": ("q", "ian"),
"qiang": ("q", "iang"),
"qiao": ("q", "iao"),
"qie": ("q", "ie"),
"qin": ("q", "in"),
"qing": ("q", "ing"),
"qiong": ("q", "iong"),
"qiu": ("q", "iou"),
"qu": ("q", "v"),
"quan": ("q", "van"),
"que": ("q", "ve"),
"qun": ("q", "vn"),
"ran": ("r", "an"),
"rang": ("r", "ang"),
"rao": ("r", "ao"),
"re": ("r", "e"),
"ren": ("r", "en"),
"reng": ("r", "eng"),
"ri": ("r", "iii"),
"rong": ("r", "ong"),
"rou": ("r", "ou"),
"ru": ("r", "u"),
"rua": ("r", "ua"),
"ruan": ("r", "uan"),
"rui": ("r", "uei"),
"run": ("r", "uen"),
"ruo": ("r", "uo"),
"sa": ("s", "a"),
"sai": ("s", "ai"),
"san": ("s", "an"),
"sang": ("s", "ang"),
"sao": ("s", "ao"),
"se": ("s", "e"),
"sen": ("s", "en"),
"seng": ("s", "eng"),
"sha": ("sh", "a"),
"shai": ("sh", "ai"),
"shan": ("sh", "an"),
"shang": ("sh", "ang"),
"shao": ("sh", "ao"),
"she": ("sh", "e"),
"shei": ("sh", "ei"),
"shen": ("sh", "en"),
"sheng": ("sh", "eng"),
"shi": ("sh", "iii"),
"shou": ("sh", "ou"),
"shu": ("sh", "u"),
"shua": ("sh", "ua"),
"shuai": ("sh", "uai"),
"shuan": ("sh", "uan"),
"shuang": ("sh", "uang"),
"shui": ("sh", "uei"),
"shun": ("sh", "uen"),
"shuo": ("sh", "uo"),
"si": ("s", "ii"),
"song": ("s", "ong"),
"sou": ("s", "ou"),
"su": ("s", "u"),
"suan": ("s", "uan"),
"sui": ("s", "uei"),
"sun": ("s", "uen"),
"suo": ("s", "uo"),
"ta": ("t", "a"),
"tai": ("t", "ai"),
"tan": ("t", "an"),
"tang": ("t", "ang"),
"tao": ("t", "ao"),
"te": ("t", "e"),
"tei": ("t", "ei"),
"teng": ("t", "eng"),
"ti": ("t", "i"),
"tian": ("t", "ian"),
"tiao": ("t", "iao"),
"tie": ("t", "ie"),
"ting": ("t", "ing"),
"tong": ("t", "ong"),
"tou": ("t", "ou"),
"tu": ("t", "u"),
"tuan": ("t", "uan"),
"tui": ("t", "uei"),
"tun": ("t", "uen"),
"tuo": ("t", "uo"),
"wa": ("^", "ua"),
"wai": ("^", "uai"),
"wan": ("^", "uan"),
"wang": ("^", "uang"),
"wei": ("^", "uei"),
"wen": ("^", "uen"),
"weng": ("^", "ueng"),
"wo": ("^", "uo"),
"wu": ("^", "u"),
"xi": ("x", "i"),
"xia": ("x", "ia"),
"xian": ("x", "ian"),
"xiang": ("x", "iang"),
"xiao": ("x", "iao"),
"xie": ("x", "ie"),
"xin": ("x", "in"),
"xing": ("x", "ing"),
"xiong": ("x", "iong"),
"xiu": ("x", "iou"),
"xu": ("x", "v"),
"xuan": ("x", "van"),
"xue": ("x", "ve"),
"xun": ("x", "vn"),
"ya": ("^", "ia"),
"yan": ("^", "ian"),
"yang": ("^", "iang"),
"yao": ("^", "iao"),
"ye": ("^", "ie"),
"yi": ("^", "i"),
"yin": ("^", "in"),
"ying": ("^", "ing"),
"yo": ("^", "iou"),
"yong": ("^", "iong"),
"you": ("^", "iou"),
"yu": ("^", "v"),
"yuan": ("^", "van"),
"yue": ("^", "ve"),
"yun": ("^", "vn"),
"za": ("z", "a"),
"zai": ("z", "ai"),
"zan": ("z", "an"),
"zang": ("z", "ang"),
"zao": ("z", "ao"),
"ze": ("z", "e"),
"zei": ("z", "ei"),
"zen": ("z", "en"),
"zeng": ("z", "eng"),
"zha": ("zh", "a"),
"zhai": ("zh", "ai"),
"zhan": ("zh", "an"),
"zhang": ("zh", "ang"),
"zhao": ("zh", "ao"),
"zhe": ("zh", "e"),
"zhei": ("zh", "ei"),
"zhen": ("zh", "en"),
"zheng": ("zh", "eng"),
"zhi": ("zh", "iii"),
"zhong": ("zh", "ong"),
"zhou": ("zh", "ou"),
"zhu": ("zh", "u"),
"zhua": ("zh", "ua"),
"zhuai": ("zh", "uai"),
"zhuan": ("zh", "uan"),
"zhuang": ("zh", "uang"),
"zhui": ("zh", "uei"),
"zhun": ("zh", "uen"),
"zhuo": ("zh", "uo"),
"zi": ("z", "ii"),
"zong": ("z", "ong"),
"zou": ("z", "ou"),
"zu": ("z", "u"),
"zuan": ("z", "uan"),
"zui": ("z", "uei"),
"zun": ("z", "uen"),
"zuo": ("z", "uo"),
}

View File

@ -0,0 +1,53 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file generates the file tokens.txt that maps tokens to IDs.
"""
import argparse
import logging
from pathlib import Path
from typing import Dict
from symbols import symbols
def get_args():
    """Parse command-line arguments for token-table generation."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--tokens",
        type=Path,
        default=Path("data/tokens.txt"),
        help="Path to the dict that maps the text tokens to IDs",
    )
    return ap.parse_args()
def main():
    """Write the symbol table to --tokens, one ``<token> <id>`` per line."""
    out_file = Path(get_args().tokens)
    lines = (f"{sym} {idx}\n" for idx, sym in enumerate(symbols))
    with open(out_file, "w", encoding="utf-8") as f:
        f.writelines(lines)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,62 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file reads the texts in given manifest and save the new cuts with tokens.
"""
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
from tokenizer import Tokenizer
def prepare_tokens_aishell3():
    """Attach phoneme token sequences to every cut of the aishell3 manifests.

    For each partition, loads ``data/spectrogram/aishell3_cuts_<part>.jsonl.gz``,
    tokenizes the supervision text with :class:`Tokenizer`, stores the result
    on ``cut.tokens``, and writes the cuts back as
    ``aishell3_cuts_with_tokens_<part>.jsonl.gz``.
    """
    output_dir = Path("data/spectrogram")
    prefix = "aishell3"
    suffix = "jsonl.gz"
    partitions = ("train", "test")

    tokenizer = Tokenizer()

    for partition in partitions:
        cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")

        # (Removed a dead `i = 0` counter present in the original.)
        new_cuts = []
        for cut in cut_set:
            # Each cut only contains one supervision
            assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
            text = cut.supervisions[0].text
            cut.tokens = tokenizer.text_to_tokens(text)
            new_cuts.append(cut)

        new_cut_set = CutSet.from_cuts(new_cuts)
        new_cut_set.to_file(
            output_dir / f"{prefix}_cuts_with_tokens_{partition}.{suffix}"
        )


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    prepare_tokens_aishell3()

View File

@ -0,0 +1,328 @@
姐姐 jie3 jie
宝宝 bao3 bao
哥哥 ge1 ge
妹妹 mei4 mei
弟弟 di4 di
妈妈 ma1 ma
开心哦 kai1 xin1 o
爸爸 ba4 ba
秘密哟 mi4 mi4 yo
哦 o
一年 yi4 nian2
一夜 yi2 ye4
一切 yi2 qie4
一座 yi2 zuo4
一下 yi2 xia4
上一山 shang4 yi2 shan1
下一山 xia4 yi2 shan1
休息 xiu1 xi2
东西 dong1 xi
上一届 shang4 yi2 jie4
便宜 pian2 yi4
加长 jia1 chang2
单田芳 shan4 tian2 fang1
帧 zhen1
长时间 chang2 shi2 jian1
长时 chang2 shi2
识别 shi2 bie2
生命中 sheng1 ming4 zhong1
踏实 ta1 shi
嗯 en4
溜达 liu1 da
少儿 shao4 er2
爷爷 ye2 ye
不是 bu2 shi4
一圈 yi1 quan1
厜读一声 zui1 du2 yi4 sheng1
一种 yi4 zhong3
一簇簇 yi2 cu4 cu4
一个 yi2 ge4
一样 yi2 yang4
一跩一跩 yi4 zhuai3 yi4 zhuai3
一会儿 yi2 hui4 er
一幢 yi2 zhuang4
挨了 ai2 le
熬菜 ao1 cai4
扒鸡 pa2 ji1
背枪 bei1 qiang1
绷瓷儿 beng4 ci2 er2
绷劲儿 beng3 jin4 er
绷着脸 beng3 zhe lian3
藏医 zang4 yi1
噌吰 cheng1 hong2
差点儿 cha4 dian3 er
差失 cha1 shi1
差误 cha1 wu4
孱头 can4 tou
乘间 cheng2 jian4
锄镰棘矜 chu2 lian2 ji2 qin2
川藏 chuan1 zang4
穿著 chuan1 zhuo2
答讪 da1 shan4
答言 da1 yan2
大伯子 da4 bai3 zi
大夫 dai4 fu
弹冠 tan2 guan1
当间 dang1 jian4
当然咯 dang1 ran2 lo
点种 dian3 zhong3
垛好 duo4 hao3
发疟子 fa1 yao4 zi
饭熟了 fan4 shou2 le
附著 fu4 zhuo2
复沓 fu4 ta4
供稿 gong1 gao3
供养 gong1 yang3
骨朵 gu1 duo
骨碌 gu1 lu
果脯 guo3 fu3
哈什玛 ha4 shi2 ma3
海蜇 hai3 zhe2
呵欠 he1 qian
河水汤汤 he2 shui3 shang1 shang1
鹄立 hu2 li4
鹄望 hu2 wang4
混人 hun2 ren2
混水 hun2 shui3
鸡血 ji1 xie3
缉鞋口 qi1 xie2 kou3
亟来闻讯 qi4 lai2 wen2 xun4
计量 ji4 liang2
济水 ji3 shui3
间杂 jian4 za2
脚跐两只船 jiao3 ci3 liang3 zhi1 chuan2
脚儿 jue2 er2
口角 kou3 jiao3
勒石 le4 shi2
累进 lei3 jin4
累累如丧家之犬 lei2 lei2 ru2 sang4 jia1 zhi1 quan3
累年 lei3 nian2
脸涨通红 lian3 zhang4 tong1 hong2
踉锵 liang4 qiang1
燎眉毛 liao3 mei2 mao2
燎头发 liao3 tou2 fa4
溜达 liu1 da
溜缝儿 liu4 feng4 er
馏口饭 liu4 kou3 fan4
遛马 liu4 ma3
遛鸟 liu4 niao3
遛弯儿 liu4 wan1 er
楼枪机 lou1 qiang1 ji1
搂钱 lou1 qian2
鹿脯 lu4 fu3
露头 lou4 tou2
落魄 luo4 po4
捋胡子 lv3 hu2 zi
绿地 lv4 di4
麦垛 mai4 duo4
没劲儿 mei2 jin4 er
闷棍 men4 gun4
闷葫芦 men4 hu2 lu
闷头干 men1 tou2 gan4
蒙古 meng3 gu3
靡日不思 mi3 ri4 bu4 si1
缪姓 miao4 xing4
抹墙 mo4 qiang2
抹下脸 ma1 xia4 lian3
泥子 ni4 zi
拗不过 niu4 bu guo4
排车 pai3 che1
盘诘 pan2 jie2
膀肿 pang1 zhong3
炮干 bao1 gan1
炮格 pao2 ge2
碰钉子 peng4 ding1 zi
缥色 piao3 se4
瀑河 bao4 he2
蹊径 xi1 jing4
前后相属 qian2 hou4 xiang1 zhu3
翘尾巴 qiao4 wei3 ba
趄坡儿 qie4 po1 er
秦桧 qin2 hui4
圈马 juan1 ma3
雀盲眼 qiao3 mang2 yan3
雀子 qiao1 zi
三年五载 san1 nian2 wu3 zai3
加载 jia1 zai3
山大王 shan1 dai4 wang
苫屋草 shan4 wu1 cao3
数数 shu3 shu4
说客 shui4 ke4
思量 si1 liang2
伺侯 ci4 hou
踏实 ta1 shi
提溜 di1 liu
调拨 diao4 bo1
帖子 tie3 zi
铜钿 tong2 tian2
头昏脑涨 tou2 hun1 nao3 zhang4
褪色 tui4 se4
褪着手 tun4 zhe shou3
圩子 wei2 zi
尾巴 wei3 ba
系好船只 xi4 hao3 chuan2 zhi1
系好马匹 xi4 hao3 ma3 pi3
杏脯 xing4 fu3
姓单 xing4 shan4
姓葛 xing4 ge3
姓哈 xing4 ha3
姓解 xing4 xie4
姓秘 xing4 bi4
姓宁 xing4 ning4
旋风 xuan4 feng1
旋根车轴 xuan4 gen1 che1 zhou2
荨麻 qian2 ma2
一幢楼房 yi1 zhuang4 lou2 fang2
遗之千金 wei4 zhi1 qian1 jin1
殷殷 yin3 yin3
应招 ying4 zhao1
用称约 yong4 cheng4 yao1
约斤肉 yao1 jin1 rou4
晕机 yun4 ji1
熨贴 yu4 tie1
咋办 za3 ban4
咋呼 zha1 hu
仔兽 zi3 shou4
扎彩 za1 cai3
扎实 zha1 shi
扎腰带 za1 yao1 dai4
轧朋友 ga2 peng2 you3
爪子 zhua3 zi
折腾 zhe1 teng
着实 zhuo2 shi2
着我旧时裳 zhuo2 wo3 jiu4 shi2 chang2
枝蔓 zhi1 man4
中鹄 zhong1 hu2
中选 zhong4 xuan3
猪圈 zhu1 juan4
拽住不放 zhuai4 zhu4 bu4 fang4
转悠 zhuan4 you
庄稼熟了 zhuang1 jia shou2 le
酌量 zhuo2 liang2
罪行累累 zui4 xing2 lei3 lei3
一手 yi4 shou3
一去不复返 yi2 qu4 bu2 fu4 fan3
一颗 yi4 ke1
一件 yi2 jian4
一斤 yi4 jin1
一点 yi4 dian3
一朵 yi4 duo3
一声 yi4 sheng1
一身 yi4 shen1
不要 bu2 yao4
一人 yi4 ren2
一个 yi2 ge4
一把 yi4 ba3
一门 yi4 men2
一門 yi4 men2
一艘 yi4 sou1
一片 yi2 pian4
一篇 yi2 pian1
一份 yi2 fen4
好嗲 hao3 dia3
随地 sui2 di4
扁担长 bian3 dan4 chang3
一堆 yi4 dui1
不义 bu2 yi4
放一放 fang4 yi2 fang4
一米 yi4 mi3
一顿 yi2 dun4
一层楼 yi4 ceng2 lou2
一条 yi4 tiao2
一件 yi2 jian4
一棵 yi4 ke1
一小股 yi4 xiao3 gu3
一拐一拐 yi4 guai3 yi4 guai3
一根 yi4 gen1
沆瀣一气 hang4 xie4 yi2 qi4
一丝 yi4 si1
一毫 yi4 hao2
一樣 yi2 yang4
处处 chu4 chu4
一餐 yi4 can
永不 yong3 bu2
一看 yi2 kan4
一架 yi2 jia4
送还 song4 huan2
一见 yi2 jian4
一座 yi2 zuo4
一块 yi2 kuai4
一天 yi4 tian1
一只 yi4 zhi1
一支 yi4 zhi1
一字 yi2 zi4
一句 yi2 ju4
一张 yi4 zhang1
一條 yi4 tiao2
一场 yi4 chang3
一粒 yi2 li4
小俩口 xiao3 liang3 kou3
一首 yi4 shou3
一对 yi2 dui4
一手 yi4 shou3
又一村 you4 yi4 cun1
一概而论 yi2 gai4 er2 lun4
一峰峰 yi4 feng1 feng1
不但 bu2 dan4
一笑 yi2 xiao4
挠痒痒 nao2 yang3 yang
不对 bu2 dui4
拧开 ning3 kai1
爱不释手 ai4 bu2 shi4 shou3
一念 yi2 nian4
夺得 duo2 de2
一袭 yi4 xi2
一定 yi2 ding4
不慎 bu2 shen4
剽窃 piao2 qie4
一时 yi4 shi2
撇开 pie3 kai1
一祭 yi2 ji4
发卡 fa4 qia3
少不了 shao3 bu4 liao3
千虑一失 qian1 lv4 yi4 shi1
呛得 qiang4 de2
切菜 qie1 cai4
茄盒 qie2 he2
不去 bu2 qu4
一大圈 yi2 da4 quan1
不再 bu2 zai4
一群 yi4 qun2
不必 bu2 bi4
一些 yi4 xie1
一路 yi2 lu4
一股 yi4 gu3
一到 yi2 dao4
一拨 yi4 bo1
一排 yi4 pai2
一空 yi4 kong1
吮吸着 shun3 xi1 zhe
不适合 bu2 shi4 he2
一串串 yi2 chuan4 chuan4
一提起 yi4 ti2 qi3
一尘不染 yi4 chen2 bu4 ran3
一生 yi4 sheng1
一派 yi2 pai4
不断 bu2 duan4
一次 yi2 ci4
不进步 bu2 jin4 bu4
娃娃 wa2 wa
万户侯 wan4 hu4 hou2
一方 yi4 fang1
一番话 yi4 fan1 hua4
一遍 yi2 bian4
不计较 bu2 ji4 jiao4
诇 xiong4
一边 yi4 bian1
一束 yi2 shu4
一听到 yi4 ting1 dao4
炸鸡 zha2 ji1
乍暧还寒 zha4 ai4 huan2 han2
我说诶 wo3 shuo1 ei1
棒诶 bang4 ei1
寒碜 han2 chen4
应采儿 ying4 cai3 er2
晕车 yun1 che1
必应 bi4 ying4
应援 ying4 yuan2
应力 ying4 li4

View File

@ -0,0 +1,73 @@
# This file is copied from
# https://github.com/UEhQZXI/vits_chinese/blob/master/text/symbols.py

# Non-phone symbols: silence, end-of-sentence, short pause, and prosodic
# break markers #0-#3.
_pause = ["sil", "eos", "sp", "#0", "#1", "#2", "#3"]

# Pinyin initials; "^" denotes a zero (empty) initial.
_initials = [
    "^",
    "b",
    "c",
    "ch",
    "d",
    "f",
    "g",
    "h",
    "j",
    "k",
    "l",
    "m",
    "n",
    "p",
    "q",
    "r",
    "s",
    "sh",
    "t",
    "x",
    "z",
    "zh",
]

# Tones 1-4 plus "5" for the neutral tone.
_tones = ["1", "2", "3", "4", "5"]

# Pinyin finals ("v" stands for ü; "ii"/"iii" are the apical vowels used
# after z/c/s and zh/ch/sh/r respectively).
_finals = [
    "a",
    "ai",
    "an",
    "ang",
    "ao",
    "e",
    "ei",
    "en",
    "eng",
    "er",
    "i",
    "ia",
    "ian",
    "iang",
    "iao",
    "ie",
    "ii",
    "iii",
    "in",
    "ing",
    "iong",
    "iou",
    "o",
    "ong",
    "ou",
    "u",
    "ua",
    "uai",
    "uan",
    "uang",
    "uei",
    "uen",
    "ueng",
    "uo",
    "v",
    "van",
    "ve",
    "vn",
]

# Full symbol inventory: pauses, bare initials, and every final+tone pair.
symbols = _pause + _initials + [i + j for i in _finals for j in _tones]

View File

@ -0,0 +1,137 @@
# This file is modified from
# https://github.com/UEhQZXI/vits_chinese/blob/master/vits_strings.py
import logging
from pathlib import Path
from typing import List
# Note pinyin_dict is from ./pinyin_dict.py
from pinyin_dict import pinyin_dict
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin, load_phrases_dict
class _MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    # pypinyin converter that renders the neutral tone as tone "5"
    # (via NeutralToneWith5Mixin) on top of the default conversion.
    pass
class Tokenizer:
    """Convert Chinese text to phoneme tokens (and optionally token IDs).

    Phonemes follow the initials/finals+tone scheme of ``pinyin_dict``;
    ID conversion is only available when a tokens.txt path is given.
    """

    def __init__(self, tokens: str = ""):
        """
        Args:
          tokens:
            Path to tokens.txt mapping ``<token> <id>``.  When empty, only
            text-to-phoneme conversion is available (no ID lookup).
        """
        self._load_pinyin_dict()
        self._pinyin_parser = Pinyin(_MyConverter())

        if tokens != "":
            self._load_tokens(tokens)

    def texts_to_token_ids(self, texts: List[str], **kwargs) -> List[List[int]]:
        """
        Args:
          texts:
            A list of sentences.
          kwargs:
            Not used. It is for compatibility with other TTS recipes in icefall.
        Returns:
          One list of token IDs per input sentence.
        """
        tokens = []

        for text in texts:
            tokens.append(self.text_to_tokens(text))

        return self.tokens_to_token_ids(tokens)

    def tokens_to_token_ids(self, tokens: List[List[str]]) -> List[List[int]]:
        """Map token strings to IDs; out-of-vocabulary tokens are dropped
        with a warning.  Requires a tokens.txt to have been loaded."""
        ans = []

        for token_list in tokens:
            token_ids = []
            for t in token_list:
                if t not in self.token2id:
                    logging.warning(f"Skip OOV {t}")
                    continue
                token_ids.append(self.token2id[t])
            ans.append(token_ids)

        return ans

    def text_to_tokens(self, text: str) -> List[str]:
        """Convert a sentence to a phoneme list starting with "sil" and
        ending with "eos"."""
        # NOTE(review): the two literals compared against below render as
        # empty strings here — almost certainly Chinese punctuation (e.g.
        # "," -> ["sp", "sil"], "。" -> ["sil"]) lost in extraction; as
        # written the second branch is unreachable.  Confirm against the
        # upstream vits_chinese source.
        phonemes = ["sil"]
        # errors=...: pass non-pinyin characters through unchanged so the
        # punctuation branches below can see them.
        pinyins = self._pinyin_parser.pinyin(
            text,
            style=Style.TONE3,
            errors=lambda x: [[w] for w in x],
        )

        new_pinyin = []
        for p in pinyins:
            p = p[0]
            if p == "":
                new_pinyin.extend(["sp", "sil"])
            elif p == "":
                new_pinyin.append("sil")
            else:
                new_pinyin.append(p)

        sub_phonemes = self._get_phoneme4pinyin(new_pinyin)
        sub_phonemes.append("eos")
        phonemes.extend(sub_phonemes)
        return phonemes

    def _get_phoneme4pinyin(self, pinyins):
        # Split each "<syllable><tone>" string into (initial, final+tone)
        # using pinyin_dict; unknown syllables are silently dropped.
        result = []
        for pinyin in pinyins:
            if pinyin in ("sil", "sp"):
                result.append(pinyin)
            elif pinyin[:-1] in pinyin_dict:
                tone = pinyin[-1]
                a = pinyin[:-1]
                a1, a2 = pinyin_dict[a]
                # every word is appended with a #0
                result += [a1, a2 + tone, "#0"]
        return result

    def _load_pinyin_dict(self):
        # Register custom per-phrase pronunciations (pypinyin-local.dict,
        # located next to this file) with pypinyin.
        this_dir = Path(__file__).parent.resolve()
        my_dict = {}

        with open(f"{this_dir}/pypinyin-local.dict", "r", encoding="utf-8") as f:
            content = f.readlines()
            for line in content:
                cuts = line.strip().split()
                hanzi = cuts[0]
                pinyin = cuts[1:]
                # pypinyin expects each syllable wrapped in its own list.
                my_dict[hanzi] = [[p] for p in pinyin]

        load_phrases_dict(my_dict)

    def _load_tokens(self, filename):
        # Parse tokens.txt ("<token> <id>" per line; a line with a single
        # field is the space token) into self.token2id.
        token2id: dict = {}

        with open(filename, "r", encoding="utf-8") as f:
            for line in f.readlines():
                info = line.rstrip().split()

                if len(info) == 1:
                    # case of space
                    token = " "
                    idx = int(info[0])
                else:
                    token, idx = info[0], int(info[1])

                assert token not in token2id, token

                token2id[token] = idx

        self.token2id = token2id
        self.vocab_size = len(self.token2id)
        # "#0" doubles as the padding token.
        self.pad_id = self.token2id["#0"]
def main():
    """Smoke test: tokenize a sample sentence and print the phonemes."""
    tokenizer = Tokenizer()
    # Fix: the original called tokenizer._sentence_to_ids(), a method that
    # does not exist (AttributeError).  text_to_tokens() is the intended
    # API here; ID conversion is unavailable since no tokens.txt is loaded.
    print(tokenizer.text_to_tokens("你好,好的。"))


if __name__ == "__main__":
    main()

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/local/validate_manifest.py

141
egs/aishell3/TTS/prepare.sh Executable file
View File

@ -0,0 +1,141 @@
#!/usr/bin/env bash
# Data-preparation pipeline for the aishell3 TTS (VITS) recipe.
# Run individual stages with:  ./prepare.sh --stage N --stop-stage M

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -eou pipefail

stage=-1
stop_stage=100

# Directory the raw corpus is downloaded to.
dl_dir=$PWD/download

# Parses --stage/--stop-stage/--dl-dir style overrides.
. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: build monotonic_align lib"
  # Compiles the cython extension used by VITS's monotonic alignment search.
  if [ ! -d vits/monotonic_align/build ]; then
    cd vits/monotonic_align
    python3 setup.py build_ext --inplace
    cd ../../
  else
    log "monotonic_align lib already built"
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Download data"
  # The directory $dl_dir/aishell3 will contain the following files
  # and sub directories
  # ChangeLog  ReadMe.txt  phone_set.txt  spk-info.txt  test  train
  # If you have pre-downloaded it to /path/to/aishell3, you can create a symlink
  #
  #   ln -sfv /path/to/aishell3 $dl_dir/
  #   touch $dl_dir/aishell3/.completed
  #
  if [ ! -d $dl_dir/aishell3 ]; then
    lhotse download aishell3 $dl_dir
  fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare aishell3 manifest (may take 13 minutes)"
  # We assume that you have downloaded the aishell3 corpus
  # to $dl_dir/aishell3.
  # You can find files like spk-info.txt inside $dl_dir/aishell3
  mkdir -p data/manifests
  # The .done marker file makes this stage idempotent on reruns.
  if [ ! -e data/manifests/.aishell3.done ]; then
    lhotse prepare aishell3 $dl_dir/aishell3 data/manifests >/dev/null 2>&1
    touch data/manifests/.aishell3.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Compute spectrogram for aishell3 (may take 5 minutes)"
  mkdir -p data/spectrogram
  if [ ! -e data/spectrogram/.aishell3.done ]; then
    ./local/compute_spectrogram_aishell3.py
    touch data/spectrogram/.aishell3.done
  fi

  # Sanity-check the generated cut manifests before training uses them.
  if [ ! -e data/spectrogram/.aishell3-validated.done ]; then
    log "Validating data/spectrogram for aishell3"
    python3 ./local/validate_manifest.py \
      data/spectrogram/aishell3_cuts_train.jsonl.gz
    python3 ./local/validate_manifest.py \
      data/spectrogram/aishell3_cuts_test.jsonl.gz
    touch data/spectrogram/.aishell3-validated.done
  fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Prepare tokens for aishell3 (may take 20 seconds)"
  # Adds a "tokens" field to every cut, then replaces the original manifests.
  if [ ! -e data/spectrogram/.aishell3_with_token.done ]; then
    ./local/prepare_tokens_aishell3.py
    mv -v data/spectrogram/aishell3_cuts_with_tokens_train.jsonl.gz \
      data/spectrogram/aishell3_cuts_train.jsonl.gz

    mv -v data/spectrogram/aishell3_cuts_with_tokens_test.jsonl.gz \
      data/spectrogram/aishell3_cuts_test.jsonl.gz

    touch data/spectrogram/.aishell3_with_token.done
  fi
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Split the aishell3 cuts into train, valid and test sets (may take 25 seconds)"
  # The last 1000 cuts of the original test set become the validation set;
  # the remaining test cuts are kept as the new (smaller) test set.
  if [ ! -e data/spectrogram/.aishell3_split.done ]; then
    lhotse subset --last 1000 \
      data/spectrogram/aishell3_cuts_test.jsonl.gz \
      data/spectrogram/aishell3_cuts_valid.jsonl.gz

    n=$(( $(gunzip -c data/spectrogram/aishell3_cuts_test.jsonl.gz | wc -l) - 1000 ))

    lhotse subset --first $n  \
      data/spectrogram/aishell3_cuts_test.jsonl.gz \
      data/spectrogram/aishell3_cuts_test2.jsonl.gz

    mv data/spectrogram/aishell3_cuts_test2.jsonl.gz data/spectrogram/aishell3_cuts_test.jsonl.gz

    touch data/spectrogram/.aishell3_split.done
  fi
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Generate tokens.txt and lexicon.txt "
  if [ ! -e data/tokens.txt ]; then
    ./local/prepare_token_file.py --tokens data/tokens.txt
  fi

  if [ ! -e data/lexicon.txt ]; then
    ./local/generate_lexicon.py --tokens data/tokens.txt --lexicon data/lexicon.txt
  fi
fi

if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "Stage 7: Generate speakers file"
  # One unique speaker name per line; the line number later becomes the
  # integer speaker ID (see export-onnx.py / the data module).
  if [ ! -e data/speakers.txt ]; then
    gunzip -c data/manifests/aishell3_supervisions_train.jsonl.gz \
      | jq '.speaker' | sed 's/"//g' \
      | sort | uniq > data/speakers.txt
  fi
fi

1
egs/aishell3/TTS/shared Symbolic link
View File

@ -0,0 +1 @@
../../../icefall/shared

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/duration_predictor.py

View File

@ -0,0 +1,433 @@
#!/usr/bin/env python3
#
# Copyright 2023 Xiaomi Corporation (Author: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script exports a VITS model from PyTorch to ONNX.
Export the model to ONNX:
./vits/export-onnx.py \
--epoch 1000 \
--speakers ./data/speakers.txt \
--exp-dir vits/exp \
--tokens data/tokens.txt
It will generate one file inside vits/exp:
- vits-epoch-1000.onnx
See ./test_onnx.py for how to use the exported ONNX models.
"""
import argparse
import logging
from pathlib import Path
from typing import Dict, Tuple
import onnx
import torch
import torch.nn as nn
from tokenizer import Tokenizer
from train import get_model, get_params
from icefall.checkpoint import load_checkpoint
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the ONNX export script."""
    p = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    p.add_argument(
        "--epoch",
        type=int,
        default=1000,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        """,
    )

    p.add_argument(
        "--exp-dir",
        type=str,
        default="vits/exp",
        help="The experiment dir",
    )

    p.add_argument(
        "--tokens",
        type=str,
        default="data/tokens.txt",
        help="""Path to vocabulary.""",
    )

    p.add_argument(
        "--speakers",
        type=Path,
        default=Path("data/speakers.txt"),
        help="Path to speakers.txt file.",
    )

    p.add_argument(
        "--model-type",
        type=str,
        default="low",
        choices=["low", "medium", "high"],
        help="""If not empty, valid values are: low, medium, high.
        It controls the model size. low -> runs faster.
        """,
    )

    return p
def add_meta_data(filename: str, meta_data: Dict[str, str]):
    """Attach key/value metadata entries to an ONNX model file, in place.

    Args:
      filename:
        Path of the ONNX model to modify.
      meta_data:
        Key-value pairs; values are stringified before saving.
    """
    m = onnx.load(filename)
    for k, v in meta_data.items():
        prop = m.metadata_props.add()
        prop.key = k
        prop.value = str(v)
    onnx.save(m, filename)
class OnnxModel(nn.Module):
    """Wraps a VITS model so ONNX export sees a flat ``forward`` that
    returns only the generated waveform."""

    def __init__(self, model: nn.Module):
        """
        Args:
          model:
            A VITS model whose ``generator`` attribute performs inference.
        """
        super().__init__()
        self.model = model

    def forward(
        self,
        tokens: torch.Tensor,
        tokens_lens: torch.Tensor,
        noise_scale: float = 0.667,
        alpha: float = 1.0,
        noise_scale_dur: float = 0.8,
        speaker: int = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run generator inference; see the help of VITS.inference_batch.

        Args:
          tokens:
            Input text token indexes of shape (1, T_text).
          tokens_lens:
            Number of tokens, of shape (1,).
          noise_scale (float):
            Noise scale parameter for flow.
          alpha (float):
            Alpha parameter to control the speed of generated speech.
          noise_scale_dur (float):
            Noise scale parameter for duration predictor.
          speaker (int):
            Speaker ID.
        Returns:
          The generated waveform tensor of shape (B, T_wav); the other
          outputs of ``inference`` are discarded.
        """
        wav, _, _ = self.model.generator.inference(
            text=tokens,
            text_lengths=tokens_lens,
            noise_scale=noise_scale,
            noise_scale_dur=noise_scale_dur,
            sids=speaker,
            alpha=alpha,
        )
        return wav
def export_model_onnx(
    model: nn.Module,
    model_filename: str,
    vocab_size: int,
    opset_version: int = 11,
) -> None:
    """Export the wrapped VITS generator to ONNX and attach metadata.

    The exported graph takes tokens (1, T_text) int64, tokens_lens (1,),
    noise_scale, alpha, noise_scale_dur and speaker, and produces
    audio (1, T') float32.

    Args:
      model:
        The VITS generator (wrapped in ``OnnxModel``).
      model_filename:
        The filename to save the exported ONNX model.
      vocab_size:
        Number of tokens used in training.
      opset_version:
        The opset version to use.
    """
    # Dummy inputs, used only to trace the graph; shapes are dynamic below.
    dummy_tokens = torch.randint(low=0, high=vocab_size, size=(1, 13), dtype=torch.int64)
    dummy_tokens_lens = torch.tensor([dummy_tokens.shape[1]], dtype=torch.int64)
    dummy_noise_scale = torch.tensor([1], dtype=torch.float32)
    dummy_noise_scale_dur = torch.tensor([1], dtype=torch.float32)
    dummy_alpha = torch.tensor([1], dtype=torch.float32)
    dummy_speaker = torch.tensor([1], dtype=torch.int64)

    torch.onnx.export(
        model,
        (
            dummy_tokens,
            dummy_tokens_lens,
            dummy_noise_scale,
            dummy_alpha,
            dummy_noise_scale_dur,
            dummy_speaker,
        ),
        model_filename,
        verbose=False,
        opset_version=opset_version,
        input_names=[
            "tokens",
            "tokens_lens",
            "noise_scale",
            "alpha",
            "noise_scale_dur",
            "speaker",
        ],
        output_names=["audio"],
        dynamic_axes={
            "tokens": {0: "N", 1: "T"},
            "tokens_lens": {0: "N"},
            "audio": {0: "N", 1: "T"},
            "speaker": {0: "N"},
        },
    )

    # Single-speaker models leave ``spks`` unset.
    num_speakers = 1 if model.model.spks is None else model.model.spks

    meta_data = {
        "model_type": "vits",
        "version": "1",
        "model_author": "k2-fsa",
        "comment": "icefall",  # must be icefall for models from icefall
        "language": "Chinese",
        "n_speakers": num_speakers,
        "sample_rate": model.model.sampling_rate,  # Must match the real sample rate
    }
    logging.info(f"meta_data: {meta_data}")

    add_meta_data(filename=model_filename, meta_data=meta_data)
@torch.no_grad()
def main():
    """Load a trained VITS checkpoint and export its generator to ONNX."""
    args = get_parser().parse_args()
    args.exp_dir = Path(args.exp_dir)
    params = get_params()
    params.update(vars(args))

    tokenizer = Tokenizer(params.tokens)
    params.blank_id = tokenizer.pad_id
    params.vocab_size = tokenizer.vocab_size

    # One speaker name per line; the (0-based) line number is the speaker ID.
    with open(args.speakers) as f:
        speaker_map = {line.strip(): i for i, line in enumerate(f)}
    params.num_spks = len(speaker_map)

    logging.info(params)

    logging.info("About to create model")
    model = get_model(params)

    load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    model.to("cpu")
    model.eval()

    model = OnnxModel(model=model)

    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"generator parameters: {num_param}, or {num_param/1000/1000} M")

    suffix = f"epoch-{params.epoch}"

    opset_version = 13

    # Fix: this message previously said "Exporting encoder" (left over from
    # an ASR export script); this script exports the VITS generator.
    logging.info("Exporting generator")
    model_filename = params.exp_dir / f"vits-{suffix}.onnx"
    export_model_onnx(
        model,
        model_filename,
        params.vocab_size,
        opset_version=opset_version,
    )
    logging.info(f"Exported generator to {model_filename}")


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
"""
Supported languages.
LJSpeech is using "en-us" from the second column.
Pty Language Age/Gender VoiceName File Other Languages
5 af --/M Afrikaans gmw/af
5 am --/M Amharic sem/am
5 an --/M Aragonese roa/an
5 ar --/M Arabic sem/ar
5 as --/M Assamese inc/as
5 az --/M Azerbaijani trk/az
5 ba --/M Bashkir trk/ba
5 be --/M Belarusian zle/be
5 bg --/M Bulgarian zls/bg
5 bn --/M Bengali inc/bn
5 bpy --/M Bishnupriya_Manipuri inc/bpy
5 bs --/M Bosnian zls/bs
5 ca --/M Catalan roa/ca
5 chr-US-Qaaa-x-west --/M Cherokee_ iro/chr
5 cmn --/M Chinese_(Mandarin,_latin_as_English) sit/cmn (zh-cmn 5)(zh 5)
5 cmn-latn-pinyin --/M Chinese_(Mandarin,_latin_as_Pinyin) sit/cmn-Latn-pinyin (zh-cmn 5)(zh 5)
5 cs --/M Czech zlw/cs
5 cv --/M Chuvash trk/cv
5 cy --/M Welsh cel/cy
5 da --/M Danish gmq/da
5 de --/M German gmw/de
5 el --/M Greek grk/el
5 en-029 --/M English_(Caribbean) gmw/en-029 (en 10)
2 en-gb --/M English_(Great_Britain) gmw/en (en 2)
5 en-gb-scotland --/M English_(Scotland) gmw/en-GB-scotland (en 4)
5 en-gb-x-gbclan --/M English_(Lancaster) gmw/en-GB-x-gbclan (en-gb 3)(en 5)
5 en-gb-x-gbcwmd --/M English_(West_Midlands) gmw/en-GB-x-gbcwmd (en-gb 9)(en 9)
5 en-gb-x-rp --/M English_(Received_Pronunciation) gmw/en-GB-x-rp (en-gb 4)(en 5)
2 en-us --/M English_(America) gmw/en-US (en 3)
5 en-us-nyc --/M English_(America,_New_York_City) gmw/en-US-nyc
5 eo --/M Esperanto art/eo
5 es --/M Spanish_(Spain) roa/es
5 es-419 --/M Spanish_(Latin_America) roa/es-419 (es-mx 6)
5 et --/M Estonian urj/et
5 eu --/M Basque eu
5 fa --/M Persian ira/fa
5 fa-latn --/M Persian_(Pinglish) ira/fa-Latn
5 fi --/M Finnish urj/fi
5 fr-be --/M French_(Belgium) roa/fr-BE (fr 8)
5 fr-ch --/M French_(Switzerland) roa/fr-CH (fr 8)
5 fr-fr --/M French_(France) roa/fr (fr 5)
5 ga --/M Gaelic_(Irish) cel/ga
5 gd --/M Gaelic_(Scottish) cel/gd
5 gn --/M Guarani sai/gn
5 grc --/M Greek_(Ancient) grk/grc
5 gu --/M Gujarati inc/gu
5 hak --/M Hakka_Chinese sit/hak
5 haw --/M Hawaiian map/haw
5 he --/M Hebrew sem/he
5 hi --/M Hindi inc/hi
5 hr --/M Croatian zls/hr (hbs 5)
5 ht --/M Haitian_Creole roa/ht
5 hu --/M Hungarian urj/hu
5 hy --/M Armenian_(East_Armenia) ine/hy (hy-arevela 5)
5 hyw --/M Armenian_(West_Armenia) ine/hyw (hy-arevmda 5)(hy 8)
5 ia --/M Interlingua art/ia
5 id --/M Indonesian poz/id
5 io --/M Ido art/io
5 is --/M Icelandic gmq/is
5 it --/M Italian roa/it
5 ja --/M Japanese jpx/ja
5 jbo --/M Lojban art/jbo
5 ka --/M Georgian ccs/ka
5 kk --/M Kazakh trk/kk
5 kl --/M Greenlandic esx/kl
5 kn --/M Kannada dra/kn
5 ko --/M Korean ko
5 kok --/M Konkani inc/kok
5 ku --/M Kurdish ira/ku
5 ky --/M Kyrgyz trk/ky
5 la --/M Latin itc/la
5 lb --/M Luxembourgish gmw/lb
5 lfn --/M Lingua_Franca_Nova art/lfn
5 lt --/M Lithuanian bat/lt
5 ltg --/M Latgalian bat/ltg
5 lv --/M Latvian bat/lv
5 mi --/M Māori poz/mi
5 mk --/M Macedonian zls/mk
5 ml --/M Malayalam dra/ml
5 mr --/M Marathi inc/mr
5 ms --/M Malay poz/ms
5 mt --/M Maltese sem/mt
5 mto --/M Totontepec_Mixe miz/mto
5 my --/M Myanmar_(Burmese) sit/my
5 nb --/M Norwegian_Bokmål gmq/nb (no 5)
5 nci --/M Nahuatl_(Classical) azc/nci
5 ne --/M Nepali inc/ne
5 nl --/M Dutch gmw/nl
5 nog --/M Nogai trk/nog
5 om --/M Oromo cus/om
5 or --/M Oriya inc/or
5 pa --/M Punjabi inc/pa
5 pap --/M Papiamento roa/pap
5 piqd --/M Klingon art/piqd
5 pl --/M Polish zlw/pl
5 pt --/M Portuguese_(Portugal) roa/pt (pt-pt 5)
5 pt-br --/M Portuguese_(Brazil) roa/pt-BR (pt 6)
5 py --/M Pyash art/py
5 qdb --/M Lang_Belta art/qdb
5 qu --/M Quechua qu
5 quc --/M K'iche' myn/quc
5 qya --/M Quenya art/qya
5 ro --/M Romanian roa/ro
5 ru --/M Russian zle/ru
5 ru-cl --/M Russian_(Classic) zle/ru-cl
2 ru-lv --/M Russian_(Latvia) zle/ru-LV
5 sd --/M Sindhi inc/sd
5 shn --/M Shan_(Tai_Yai) tai/shn
5 si --/M Sinhala inc/si
5 sjn --/M Sindarin art/sjn
5 sk --/M Slovak zlw/sk
5 sl --/M Slovenian zls/sl
5 smj --/M Lule_Saami urj/smj
5 sq --/M Albanian ine/sq
5 sr --/M Serbian zls/sr
5 sv --/M Swedish gmq/sv
5 sw --/M Swahili bnt/sw
5 ta --/M Tamil dra/ta
5 te --/M Telugu dra/te
5 th --/M Thai tai/th
5 tk --/M Turkmen trk/tk
5 tn --/M Setswana bnt/tn
5 tr --/M Turkish trk/tr
5 tt --/M Tatar trk/tt
5 ug --/M Uyghur trk/ug
5 uk --/M Ukrainian zle/uk
5 ur --/M Urdu inc/ur
5 uz --/M Uzbek trk/uz
5 vi --/M Vietnamese_(Northern) aav/vi
5 vi-vn-x-central --/M Vietnamese_(Central) aav/vi-VN-x-central
5 vi-vn-x-south --/M Vietnamese_(Southern) aav/vi-VN-x-south
5 yue --/M Chinese_(Cantonese) sit/yue (zh-yue 5)(zh 8)
5 yue --/M Chinese_(Cantonese,_latin_as_Jyutping) sit/yue-Latn-jyutping (zh-yue 5)(zh 8)
"""

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/flow.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/generator.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/hifigan.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/loss.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/monotonic_align/

View File

@ -0,0 +1 @@
../local/pinyin_dict.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/posterior_encoder.py

View File

@ -0,0 +1 @@
../local/pypinyin-local.dict

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/residual_coupling.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/text_encoder.py

View File

@ -0,0 +1 @@
../local/tokenizer.py

1007
egs/aishell3/TTS/vits/train.py Executable file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/transform.py

View File

@ -0,0 +1,349 @@
# Copyright 2021 Piotr Żelasko
# Copyright 2022-2023 Xiaomi Corporation (Authors: Mingshuang Luo,
# Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Optional
import torch
from lhotse import CutSet, Spectrogram, SpectrogramConfig, load_manifest_lazy
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
CutConcatenate,
CutMix,
DynamicBucketingSampler,
PrecomputedFeatures,
SimpleCutSampler,
SpecAugment,
SpeechSynthesisDataset,
)
from lhotse.dataset.input_strategies import ( # noqa F401 For AudioSamples
AudioSamples,
OnTheFlyFeatures,
)
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader
from icefall.utils import str2bool
class _SeedWorkers:
    """Picklable callable that gives each dataloader worker its own
    deterministic seed."""

    def __init__(self, seed: int):
        self.seed = seed

    def __call__(self, worker_id: int):
        # Offset the base seed by the worker id so every worker is seeded
        # differently but reproducibly.
        fix_random_seed(worker_id + self.seed)
class Aishell3SpeechTtsDataModule:
    """
    DataModule for tts experiments.
    It assumes there is always one train and valid dataloader,
    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
    and test-other).

    It contains all the common data pipeline modules used in ASR
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - on-the-fly feature extraction

    This class should be derived for specific corpora used in TTS tasks.
    """

    def __init__(self, args: argparse.Namespace):
        self.args = args
        # Sample rate used only for on-the-fly spectrogram extraction.
        # NOTE(review): assumes the prepared aishell3 audio is 8000 Hz —
        # confirm against the feature-preparation script.
        self.sampling_rate = 8000

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Register all dataloader-related CLI options on ``parser``."""
        group = parser.add_argument_group(
            title="TTS data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )

        group.add_argument(
            "--manifest-dir",
            type=Path,
            default=Path("data/spectrogram"),
            help="Path to directory with train/valid/test cuts.",
        )

        group.add_argument(
            "--speakers",
            type=Path,
            default=Path("data/speakers.txt"),
            help="Path to speakers.txt file.",
        )

        group.add_argument(
            "--max-duration",
            type=int,
            default=200.0,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )

        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )

        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="The number of buckets for the DynamicBucketingSampler"
            "(you might want to increase it for larger datasets).",
        )

        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )

        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
        )

        group.add_argument(
            "--drop-last",
            type=str2bool,
            default=True,
            help="Whether to drop last batch. Used by sampler.",
        )

        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=False,
            help="When enabled, each batch will have the "
            "field: batch['cut'] with the cuts that "
            "were used to construct it.",
        )

        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="The number of training dataloader workers that "
            "collect the batches.",
        )

        group.add_argument(
            "--input-strategy",
            type=str,
            default="PrecomputedFeatures",
            help="AudioSamples or PrecomputedFeatures",
        )

    def train_dataloaders(
        self,
        cuts_train: CutSet,
        sampler_state_dict: Optional[Dict[str, Any]] = None,
    ) -> DataLoader:
        """Build the training DataLoader.

        Args:
          cuts_train:
            CutSet for training.
          sampler_state_dict:
            The state dict for the training sampler.
        """
        logging.info("About to create train dataset")
        # NOTE(review): `eval` on a CLI-supplied class name; only
        # "AudioSamples" or "PrecomputedFeatures" are expected here.
        train = SpeechSynthesisDataset(
            return_text=False,
            return_tokens=True,
            return_spk_ids=True,
            feature_input_strategy=eval(self.args.input_strategy)(),
            return_cuts=self.args.return_cuts,
        )

        if self.args.on_the_fly_feats:
            # Replace the dataset built above with one that extracts
            # spectrograms on the fly from the raw audio.
            sampling_rate = self.sampling_rate
            config = SpectrogramConfig(
                sampling_rate=sampling_rate,
                frame_length=1024 / sampling_rate,  # (in second),
                frame_shift=256 / sampling_rate,  # (in second)
                use_fft_mag=True,
            )
            train = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=OnTheFlyFeatures(Spectrogram(config)),
                return_cuts=self.args.return_cuts,
            )

        if self.args.bucketing_sampler:
            logging.info("Using DynamicBucketingSampler.")
            train_sampler = DynamicBucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                buffer_size=self.args.num_buckets * 2000,
                shuffle_buffer_size=self.args.num_buckets * 5000,
                drop_last=self.args.drop_last,
            )
        else:
            logging.info("Using SimpleCutSampler.")
            train_sampler = SimpleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
            )
        logging.info("About to create train dataloader")

        if sampler_state_dict is not None:
            # Resume sampling exactly where a previous run left off.
            logging.info("Loading sampler state dict")
            train_sampler.load_state_dict(sampler_state_dict)

        # 'seed' is derived from the current random state, which will have
        # previously been set in the main process.
        seed = torch.randint(0, 100000, ()).item()
        worker_init_fn = _SeedWorkers(seed)

        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
            worker_init_fn=worker_init_fn,
        )

        return train_dl

    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
        """Build the validation DataLoader (no shuffling)."""
        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            sampling_rate = self.sampling_rate
            config = SpectrogramConfig(
                sampling_rate=sampling_rate,
                frame_length=1024 / sampling_rate,  # (in second),
                frame_shift=256 / sampling_rate,  # (in second)
                use_fft_mag=True,
            )
            validate = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=OnTheFlyFeatures(Spectrogram(config)),
                return_cuts=self.args.return_cuts,
            )
        else:
            validate = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=eval(self.args.input_strategy)(),
                return_cuts=self.args.return_cuts,
            )
        valid_sampler = DynamicBucketingSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            num_buckets=self.args.num_buckets,
            shuffle=False,
        )
        logging.info("About to create valid dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )

        return valid_dl

    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        """Build a test DataLoader for the given cuts (no shuffling)."""
        logging.info("About to create test dataset")
        if self.args.on_the_fly_feats:
            sampling_rate = self.sampling_rate
            config = SpectrogramConfig(
                sampling_rate=sampling_rate,
                frame_length=1024 / sampling_rate,  # (in second),
                frame_shift=256 / sampling_rate,  # (in second)
                use_fft_mag=True,
            )
            test = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=OnTheFlyFeatures(Spectrogram(config)),
                return_cuts=self.args.return_cuts,
            )
        else:
            test = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=eval(self.args.input_strategy)(),
                return_cuts=self.args.return_cuts,
            )
        test_sampler = DynamicBucketingSampler(
            cuts,
            max_duration=self.args.max_duration,
            num_buckets=self.args.num_buckets,
            shuffle=False,
        )
        logging.info("About to create test dataloader")
        test_dl = DataLoader(
            test,
            batch_size=None,
            sampler=test_sampler,
            num_workers=self.args.num_workers,
        )
        return test_dl

    @lru_cache()
    def train_cuts(self) -> CutSet:
        """Lazily load (and cache) the training cut manifest."""
        logging.info("About to get train cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aishell3_cuts_train.jsonl.gz"
        )

    @lru_cache()
    def valid_cuts(self) -> CutSet:
        """Lazily load (and cache) the validation cut manifest."""
        logging.info("About to get validation cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aishell3_cuts_valid.jsonl.gz"
        )

    @lru_cache()
    def test_cuts(self) -> CutSet:
        """Lazily load (and cache) the test cut manifest."""
        logging.info("About to get test cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aishell3_cuts_test.jsonl.gz"
        )

    @lru_cache()
    def speakers(self) -> Dict[str, int]:
        """Map each speaker name in speakers.txt to its (0-based) line
        number, which is used as the integer speaker ID."""
        logging.info("About to get speakers")
        with open(self.args.speakers) as f:
            speakers = {line.strip(): i for i, line in enumerate(f)}
        return speakers

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/utils.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/vits.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/wavenet.py

View File

@ -1,7 +1,10 @@
# https://github.com/espnet/espnet/blob/master/espnet2/gan_tts/vits/monotonic_align/setup.py
"""Setup cython code."""
from Cython.Build import cythonize
try:
from Cython.Build import cythonize
except ModuleNotFoundError as ex:
raise RuntimeError(f'{ex}\nPlease run:\n pip install cython')
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext as _build_ext

View File

@ -44,11 +44,11 @@ class Tokenizer(object):
if len(info) == 1:
# case of space
token = " "
id = int(info[0])
idx = int(info[0])
else:
token, id = info[0], int(info[1])
token, idx = info[0], int(info[1])
assert token not in self.token2id, token
self.token2id[token] = id
self.token2id[token] = idx
# Refer to https://github.com/rhasspy/piper/blob/master/TRAINING.md
self.pad_id = self.token2id["_"] # padding

View File

@ -66,7 +66,7 @@ class LJSpeechTtsDataModule:
- cut concatenation,
- on-the-fly feature extraction
This class should be derived for specific corpora used in ASR tasks.
This class should be derived for specific corpora used in TTS tasks.
"""
def __init__(self, args: argparse.Namespace):