Mirror of https://github.com/k2-fsa/icefall.git (synced 2025-08-09 01:52:41 +00:00)

Merge c25dc02d5d192a03fc61302d05d2ee602c008b4d into 9293edc62f4a3ebf769d66cc037d4e67953440f5
This commit is contained in commit bf048133e1
.github/scripts/aishell3/TTS/run.sh  (vendored, new executable file, 118 lines)
@@ -0,0 +1,118 @@
#!/usr/bin/env bash

set -ex

python3 -m pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
python3 -m pip install numba
python3 -m pip install pypinyin
python3 -m pip install cython

apt-get update
apt-get install -y jq

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

cd egs/aishell3/TTS

sed -i.bak s/1000/10/g ./prepare.sh


function download_data() {
  mkdir download
  pushd download
  curl -SL -O https://huggingface.co/csukuangfj/aishell3-ci-data/resolve/main/aishell3.tar.bz2
  tar xf aishell3.tar.bz2
  rm aishell3.tar.bz2
  ls -lh
  popd
}

function prepare_data() {
  ./prepare.sh

  echo "----------tokens.txt----------"
  cat data/tokens.txt
  echo "------------------------------"
  wc -l data/tokens.txt
  echo "------------------------------"

  echo "----------lexicon.txt----------"
  head data/lexicon.txt
  echo "----"
  tail data/lexicon.txt
  echo "----"
  wc -l data/lexicon.txt
}

function train() {
  pushd ./vits
  sed -i.bak s/200/50/g ./train.py
  git diff .
  popd

  # for t in low medium high; do
  for t in low; do
    ./vits/train.py \
      --exp-dir vits/exp-$t \
      --model-type $t \
      --num-epochs 1 \
      --save-every-n 1 \
      --num-buckets 2 \
      --tokens data/tokens.txt \
      --max-duration 20

    ls -lh vits/exp-$t
  done
}

function export_onnx() {
  # for t in low medium high; do
  for t in low; do
    ./vits/export-onnx.py \
      --model-type $t \
      --epoch 1 \
      --exp-dir ./vits/exp-$t \
      --tokens data/tokens.txt \
      --speakers ./data/speakers.txt

    ls -lh vits/exp-$t/
  done
}

function test_low() {
  git clone https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06
  repo=icefall-tts-aishell3-vits-low-2024-04-06

  ./vits/export-onnx.py \
    --model-type low \
    --epoch 1000 \
    --exp-dir $repo/exp \
    --tokens $repo/data/tokens.txt \
    --speakers $repo/data/speakers.txt

  ls -lh $repo/exp/vits-epoch-1000.onnx

  python3 -m pip install sherpa-onnx

  sherpa-onnx-offline-tts \
    --vits-model=$repo/exp/vits-epoch-1000.onnx \
    --vits-tokens=$repo/data/tokens.txt \
    --vits-lexicon=$repo/data/lexicon.txt \
    --num-threads=1 \
    --vits-length-scale=1.0 \
    --sid=33 \
    --output-filename=/icefall/low.wav \
    --debug=1 \
    "这是一个语音合成测试"
}


download_data
prepare_data
train
export_onnx
test_low
.github/workflows/aishell3.yml  (vendored, new file, 84 lines)
@@ -0,0 +1,84 @@
name: aishell3

on:
  push:
    branches:
      - master
      - tts-aishell3

  pull_request:
    branches:
      - master

  workflow_dispatch:

concurrency:
  group: aishell3-${{ github.ref }}
  cancel-in-progress: true

jobs:
  generate_build_matrix:
    if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event.label.name == 'ready' || github.event_name == 'push' || github.event_name == 'workflow_dispatch')

    # see https://github.com/pytorch/pytorch/pull/50633
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - name: Generating build matrix
        id: set-matrix
        run: |
          # outputting for debugging purposes
          python ./.github/scripts/docker/generate_build_matrix.py
          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
          echo "::set-output name=matrix::${MATRIX}"

  aishell3:
    needs: generate_build_matrix
    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Free space
        shell: bash
        run: |
          df -h
          rm -rf /opt/hostedtoolcache
          df -h
          echo "pwd: $PWD"
          echo "github.workspace ${{ github.workspace }}"

      - name: Run aishell3 tests
        uses: addnab/docker-run-action@v3
        with:
          image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
          options: |
            --volume ${{ github.workspace }}/:/icefall
          shell: bash
          run: |
            export PYTHONPATH=/icefall:$PYTHONPATH
            cd /icefall
            git config --global --add safe.directory /icefall

            .github/scripts/aishell3/TTS/run.sh

      - name: display files
        shell: bash
        run: |
          ls -lh

      - uses: actions/upload-artifact@v4
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
        with:
          name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
          path: ./*.wav
.gitignore  (vendored, 4 lines added)
@@ -36,3 +36,7 @@ node_modules
 .DS_Store
 *.fst
 *.arpa
+core.c
+*.so
+build
+*.wav
@@ -19,7 +19,7 @@ Install extra dependencies
 .. code-block:: bash

   pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
-  pip install numba espnet_tts_frontend
+  pip install numba espnet_tts_frontend cython

 Data preparation
 ----------------
egs/aishell3/TTS/local/compute_spectrogram_aishell3.py  (new executable file, 110 lines)
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
# Copyright    2021-2023  Xiaomi Corp.  (authors: Fangjun Kuang,
#                                                 Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
This file computes spectrogram features of the aishell3 dataset.
It looks for manifests in the directory data/manifests.

The generated spectrogram features are saved in data/spectrogram.
"""

import logging
import os
from pathlib import Path

import torch
from lhotse import (
    CutSet,
    LilcomChunkyWriter,
    Spectrogram,
    SpectrogramConfig,
    load_manifest,
)
from lhotse.audio import RecordingSet
from lhotse.supervision import SupervisionSet

from icefall.utils import get_executor

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)


def compute_spectrogram_aishell3():
    src_dir = Path("data/manifests")
    output_dir = Path("data/spectrogram")
    num_jobs = min(4, os.cpu_count())

    sampling_rate = 8000
    frame_length = 1024 / sampling_rate  # (in second)
    frame_shift = 256 / sampling_rate  # (in second)
    use_fft_mag = True

    prefix = "aishell3"
    suffix = "jsonl.gz"
    partitions = ("test", "train")

    config = SpectrogramConfig(
        sampling_rate=sampling_rate,
        frame_length=frame_length,
        frame_shift=frame_shift,
        use_fft_mag=use_fft_mag,
    )
    extractor = Spectrogram(config)

    for partition in partitions:
        recordings = load_manifest(
            src_dir / f"{prefix}_recordings_{partition}.{suffix}", RecordingSet
        )
        supervisions = load_manifest(
            src_dir / f"{prefix}_supervisions_{partition}.{suffix}", SupervisionSet
        )

        # resample from 44100 to 8000
        recordings = recordings.resample(sampling_rate)

        with get_executor() as ex:  # Initialize the executor only once.
            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
            if (output_dir / cuts_filename).is_file():
                logging.info(f"{cuts_filename} already exists - skipping.")
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=recordings, supervisions=supervisions
            )

            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / cuts_filename)


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)
    compute_spectrogram_aishell3()
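As a quick sanity check on the numbers above (plain arithmetic, not part of the recipe): at the 8 kHz target rate these settings correspond to a 1024-sample analysis window and a 256-sample hop, i.e. 128 ms frames every 32 ms.

# Frame geometry implied by compute_spectrogram_aishell3() above.
sampling_rate = 8000
frame_length = 1024 / sampling_rate  # 0.128 s -> 1024 samples (128 ms window)
frame_shift = 256 / sampling_rate    # 0.032 s -> 256 samples (32 ms hop)

assert frame_length * sampling_rate == 1024
assert frame_shift * sampling_rate == 256
print(f"~{1 / frame_shift:.2f} frames per second of audio")  # ~31.25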
egs/aishell3/TTS/local/generate_lexicon.py  (new executable file, 68 lines)
@@ -0,0 +1,68 @@
#!/usr/bin/env python3

"""
This file generates the file lexicon.txt that contains pronunciations of all
words and phrases
"""

from pypinyin import phrases_dict, pinyin_dict
from tokenizer import Tokenizer

import argparse


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--tokens",
        type=str,
        default="data/tokens.txt",
        help="""Path to vocabulary.""",
    )

    parser.add_argument(
        "--lexicon",
        type=str,
        default="data/lexicon.txt",
        help="""Path to save the generated lexicon.""",
    )
    return parser


def main():
    args = get_parser().parse_args()
    filename = args.lexicon
    tokens = args.tokens
    tokenizer = Tokenizer(tokens)

    word_dict = pinyin_dict.pinyin_dict
    phrases = phrases_dict.phrases_dict

    i = 0
    with open(filename, "w", encoding="utf-8") as f:
        for key in word_dict:
            if not (0x4E00 <= key <= 0x9FFF):
                continue

            w = chr(key)

            # 1 to remove the initial sil
            # :-1 to remove the final eos
            tokens = tokenizer.text_to_tokens(w)[1:-1]

            tokens = " ".join(tokens)
            f.write(f"{w} {tokens}\n")

    # TODO(fangjun): Add phrases
    #  for key in phrases:
    #      # 1 to remove the initial sil
    #      # :-1 to remove the final eos
    #      tokens = tokenizer.text_to_tokens(key)[1:-1]
    #      tokens = " ".join(tokens)
    #      f.write(f"{key} {tokens}\n")


if __name__ == "__main__":
    main()
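For orientation, each generated lexicon line pairs one CJK character with the phoneme tokens returned by Tokenizer.text_to_tokens (initial, final plus tone digit, and the trailing "#0" boundary marker), after stripping the leading sil and trailing eos. Assuming pypinyin renders 你 as ni3, the corresponding line would look like:

你 n i3 #0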
egs/aishell3/TTS/local/pinyin_dict.py  (new file, 421 lines)
@@ -0,0 +1,421 @@
# This dict is copied from
# https://github.com/UEhQZXI/vits_chinese/blob/master/vits_strings.py
pinyin_dict = {
    "a": ("^", "a"),
    "ai": ("^", "ai"),
    "an": ("^", "an"),
    "ang": ("^", "ang"),
    "ao": ("^", "ao"),
    "ba": ("b", "a"),
    "bai": ("b", "ai"),
    "ban": ("b", "an"),
    "bang": ("b", "ang"),
    "bao": ("b", "ao"),
    "be": ("b", "e"),
    "bei": ("b", "ei"),
    "ben": ("b", "en"),
    "beng": ("b", "eng"),
    "bi": ("b", "i"),
    "bian": ("b", "ian"),
    "biao": ("b", "iao"),
    "bie": ("b", "ie"),
    "bin": ("b", "in"),
    "bing": ("b", "ing"),
    "bo": ("b", "o"),
    "bu": ("b", "u"),
    "ca": ("c", "a"),
    "cai": ("c", "ai"),
    "can": ("c", "an"),
    "cang": ("c", "ang"),
    "cao": ("c", "ao"),
    "ce": ("c", "e"),
    "cen": ("c", "en"),
    "ceng": ("c", "eng"),
    "cha": ("ch", "a"),
    "chai": ("ch", "ai"),
    "chan": ("ch", "an"),
    "chang": ("ch", "ang"),
    "chao": ("ch", "ao"),
    "che": ("ch", "e"),
    "chen": ("ch", "en"),
    "cheng": ("ch", "eng"),
    "chi": ("ch", "iii"),
    "chong": ("ch", "ong"),
    "chou": ("ch", "ou"),
    "chu": ("ch", "u"),
    "chua": ("ch", "ua"),
    "chuai": ("ch", "uai"),
    "chuan": ("ch", "uan"),
    "chuang": ("ch", "uang"),
    "chui": ("ch", "uei"),
    "chun": ("ch", "uen"),
    "chuo": ("ch", "uo"),
    "ci": ("c", "ii"),
    "cong": ("c", "ong"),
    "cou": ("c", "ou"),
    "cu": ("c", "u"),
    "cuan": ("c", "uan"),
    "cui": ("c", "uei"),
    "cun": ("c", "uen"),
    "cuo": ("c", "uo"),
    "da": ("d", "a"),
    "dai": ("d", "ai"),
    "dan": ("d", "an"),
    "dang": ("d", "ang"),
    "dao": ("d", "ao"),
    "de": ("d", "e"),
    "dei": ("d", "ei"),
    "den": ("d", "en"),
    "deng": ("d", "eng"),
    "di": ("d", "i"),
    "dia": ("d", "ia"),
    "dian": ("d", "ian"),
    "diao": ("d", "iao"),
    "die": ("d", "ie"),
    "ding": ("d", "ing"),
    "diu": ("d", "iou"),
    "dong": ("d", "ong"),
    "dou": ("d", "ou"),
    "du": ("d", "u"),
    "duan": ("d", "uan"),
    "dui": ("d", "uei"),
    "dun": ("d", "uen"),
    "duo": ("d", "uo"),
    "e": ("^", "e"),
    "ei": ("^", "ei"),
    "en": ("^", "en"),
    "ng": ("^", "en"),
    "eng": ("^", "eng"),
    "er": ("^", "er"),
    "fa": ("f", "a"),
    "fan": ("f", "an"),
    "fang": ("f", "ang"),
    "fei": ("f", "ei"),
    "fen": ("f", "en"),
    "feng": ("f", "eng"),
    "fo": ("f", "o"),
    "fou": ("f", "ou"),
    "fu": ("f", "u"),
    "ga": ("g", "a"),
    "gai": ("g", "ai"),
    "gan": ("g", "an"),
    "gang": ("g", "ang"),
    "gao": ("g", "ao"),
    "ge": ("g", "e"),
    "gei": ("g", "ei"),
    "gen": ("g", "en"),
    "geng": ("g", "eng"),
    "gong": ("g", "ong"),
    "gou": ("g", "ou"),
    "gu": ("g", "u"),
    "gua": ("g", "ua"),
    "guai": ("g", "uai"),
    "guan": ("g", "uan"),
    "guang": ("g", "uang"),
    "gui": ("g", "uei"),
    "gun": ("g", "uen"),
    "guo": ("g", "uo"),
    "ha": ("h", "a"),
    "hai": ("h", "ai"),
    "han": ("h", "an"),
    "hang": ("h", "ang"),
    "hao": ("h", "ao"),
    "he": ("h", "e"),
    "hei": ("h", "ei"),
    "hen": ("h", "en"),
    "heng": ("h", "eng"),
    "hong": ("h", "ong"),
    "hou": ("h", "ou"),
    "hu": ("h", "u"),
    "hua": ("h", "ua"),
    "huai": ("h", "uai"),
    "huan": ("h", "uan"),
    "huang": ("h", "uang"),
    "hui": ("h", "uei"),
    "hun": ("h", "uen"),
    "huo": ("h", "uo"),
    "ji": ("j", "i"),
    "jia": ("j", "ia"),
    "jian": ("j", "ian"),
    "jiang": ("j", "iang"),
    "jiao": ("j", "iao"),
    "jie": ("j", "ie"),
    "jin": ("j", "in"),
    "jing": ("j", "ing"),
    "jiong": ("j", "iong"),
    "jiu": ("j", "iou"),
    "ju": ("j", "v"),
    "juan": ("j", "van"),
    "jue": ("j", "ve"),
    "jun": ("j", "vn"),
    "ka": ("k", "a"),
    "kai": ("k", "ai"),
    "kan": ("k", "an"),
    "kang": ("k", "ang"),
    "kao": ("k", "ao"),
    "ke": ("k", "e"),
    "kei": ("k", "ei"),
    "ken": ("k", "en"),
    "keng": ("k", "eng"),
    "kong": ("k", "ong"),
    "kou": ("k", "ou"),
    "ku": ("k", "u"),
    "kua": ("k", "ua"),
    "kuai": ("k", "uai"),
    "kuan": ("k", "uan"),
    "kuang": ("k", "uang"),
    "kui": ("k", "uei"),
    "kun": ("k", "uen"),
    "kuo": ("k", "uo"),
    "la": ("l", "a"),
    "lai": ("l", "ai"),
    "lan": ("l", "an"),
    "lang": ("l", "ang"),
    "lao": ("l", "ao"),
    "le": ("l", "e"),
    "lei": ("l", "ei"),
    "leng": ("l", "eng"),
    "li": ("l", "i"),
    "lia": ("l", "ia"),
    "lian": ("l", "ian"),
    "liang": ("l", "iang"),
    "liao": ("l", "iao"),
    "lie": ("l", "ie"),
    "lin": ("l", "in"),
    "ling": ("l", "ing"),
    "liu": ("l", "iou"),
    "lo": ("l", "o"),
    "long": ("l", "ong"),
    "lou": ("l", "ou"),
    "lu": ("l", "u"),
    "lv": ("l", "v"),
    "luan": ("l", "uan"),
    "lve": ("l", "ve"),
    "lue": ("l", "ve"),
    "lun": ("l", "uen"),
    "luo": ("l", "uo"),
    "ma": ("m", "a"),
    "mai": ("m", "ai"),
    "man": ("m", "an"),
    "mang": ("m", "ang"),
    "mao": ("m", "ao"),
    "me": ("m", "e"),
    "mei": ("m", "ei"),
    "men": ("m", "en"),
    "meng": ("m", "eng"),
    "mi": ("m", "i"),
    "mian": ("m", "ian"),
    "miao": ("m", "iao"),
    "mie": ("m", "ie"),
    "min": ("m", "in"),
    "ming": ("m", "ing"),
    "miu": ("m", "iou"),
    "mo": ("m", "o"),
    "mou": ("m", "ou"),
    "mu": ("m", "u"),
    "na": ("n", "a"),
    "nai": ("n", "ai"),
    "nan": ("n", "an"),
    "nang": ("n", "ang"),
    "nao": ("n", "ao"),
    "ne": ("n", "e"),
    "nei": ("n", "ei"),
    "nen": ("n", "en"),
    "neng": ("n", "eng"),
    "ni": ("n", "i"),
    "nia": ("n", "ia"),
    "nian": ("n", "ian"),
    "niang": ("n", "iang"),
    "niao": ("n", "iao"),
    "nie": ("n", "ie"),
    "nin": ("n", "in"),
    "ning": ("n", "ing"),
    "niu": ("n", "iou"),
    "nong": ("n", "ong"),
    "nou": ("n", "ou"),
    "nu": ("n", "u"),
    "nv": ("n", "v"),
    "nuan": ("n", "uan"),
    "nve": ("n", "ve"),
    "nue": ("n", "ve"),
    "nuo": ("n", "uo"),
    "o": ("^", "o"),
    "ou": ("^", "ou"),
    "pa": ("p", "a"),
    "pai": ("p", "ai"),
    "pan": ("p", "an"),
    "pang": ("p", "ang"),
    "pao": ("p", "ao"),
    "pe": ("p", "e"),
    "pei": ("p", "ei"),
    "pen": ("p", "en"),
    "peng": ("p", "eng"),
    "pi": ("p", "i"),
    "pian": ("p", "ian"),
    "piao": ("p", "iao"),
    "pie": ("p", "ie"),
    "pin": ("p", "in"),
    "ping": ("p", "ing"),
    "po": ("p", "o"),
    "pou": ("p", "ou"),
    "pu": ("p", "u"),
    "qi": ("q", "i"),
    "qia": ("q", "ia"),
    "qian": ("q", "ian"),
    "qiang": ("q", "iang"),
    "qiao": ("q", "iao"),
    "qie": ("q", "ie"),
    "qin": ("q", "in"),
    "qing": ("q", "ing"),
    "qiong": ("q", "iong"),
    "qiu": ("q", "iou"),
    "qu": ("q", "v"),
    "quan": ("q", "van"),
    "que": ("q", "ve"),
    "qun": ("q", "vn"),
    "ran": ("r", "an"),
    "rang": ("r", "ang"),
    "rao": ("r", "ao"),
    "re": ("r", "e"),
    "ren": ("r", "en"),
    "reng": ("r", "eng"),
    "ri": ("r", "iii"),
    "rong": ("r", "ong"),
    "rou": ("r", "ou"),
    "ru": ("r", "u"),
    "rua": ("r", "ua"),
    "ruan": ("r", "uan"),
    "rui": ("r", "uei"),
    "run": ("r", "uen"),
    "ruo": ("r", "uo"),
    "sa": ("s", "a"),
    "sai": ("s", "ai"),
    "san": ("s", "an"),
    "sang": ("s", "ang"),
    "sao": ("s", "ao"),
    "se": ("s", "e"),
    "sen": ("s", "en"),
    "seng": ("s", "eng"),
    "sha": ("sh", "a"),
    "shai": ("sh", "ai"),
    "shan": ("sh", "an"),
    "shang": ("sh", "ang"),
    "shao": ("sh", "ao"),
    "she": ("sh", "e"),
    "shei": ("sh", "ei"),
    "shen": ("sh", "en"),
    "sheng": ("sh", "eng"),
    "shi": ("sh", "iii"),
    "shou": ("sh", "ou"),
    "shu": ("sh", "u"),
    "shua": ("sh", "ua"),
    "shuai": ("sh", "uai"),
    "shuan": ("sh", "uan"),
    "shuang": ("sh", "uang"),
    "shui": ("sh", "uei"),
    "shun": ("sh", "uen"),
    "shuo": ("sh", "uo"),
    "si": ("s", "ii"),
    "song": ("s", "ong"),
    "sou": ("s", "ou"),
    "su": ("s", "u"),
    "suan": ("s", "uan"),
    "sui": ("s", "uei"),
    "sun": ("s", "uen"),
    "suo": ("s", "uo"),
    "ta": ("t", "a"),
    "tai": ("t", "ai"),
    "tan": ("t", "an"),
    "tang": ("t", "ang"),
    "tao": ("t", "ao"),
    "te": ("t", "e"),
    "tei": ("t", "ei"),
    "teng": ("t", "eng"),
    "ti": ("t", "i"),
    "tian": ("t", "ian"),
    "tiao": ("t", "iao"),
    "tie": ("t", "ie"),
    "ting": ("t", "ing"),
    "tong": ("t", "ong"),
    "tou": ("t", "ou"),
    "tu": ("t", "u"),
    "tuan": ("t", "uan"),
    "tui": ("t", "uei"),
    "tun": ("t", "uen"),
    "tuo": ("t", "uo"),
    "wa": ("^", "ua"),
    "wai": ("^", "uai"),
    "wan": ("^", "uan"),
    "wang": ("^", "uang"),
    "wei": ("^", "uei"),
    "wen": ("^", "uen"),
    "weng": ("^", "ueng"),
    "wo": ("^", "uo"),
    "wu": ("^", "u"),
    "xi": ("x", "i"),
    "xia": ("x", "ia"),
    "xian": ("x", "ian"),
    "xiang": ("x", "iang"),
    "xiao": ("x", "iao"),
    "xie": ("x", "ie"),
    "xin": ("x", "in"),
    "xing": ("x", "ing"),
    "xiong": ("x", "iong"),
    "xiu": ("x", "iou"),
    "xu": ("x", "v"),
    "xuan": ("x", "van"),
    "xue": ("x", "ve"),
    "xun": ("x", "vn"),
    "ya": ("^", "ia"),
    "yan": ("^", "ian"),
    "yang": ("^", "iang"),
    "yao": ("^", "iao"),
    "ye": ("^", "ie"),
    "yi": ("^", "i"),
    "yin": ("^", "in"),
    "ying": ("^", "ing"),
    "yo": ("^", "iou"),
    "yong": ("^", "iong"),
    "you": ("^", "iou"),
    "yu": ("^", "v"),
    "yuan": ("^", "van"),
    "yue": ("^", "ve"),
    "yun": ("^", "vn"),
    "za": ("z", "a"),
    "zai": ("z", "ai"),
    "zan": ("z", "an"),
    "zang": ("z", "ang"),
    "zao": ("z", "ao"),
    "ze": ("z", "e"),
    "zei": ("z", "ei"),
    "zen": ("z", "en"),
    "zeng": ("z", "eng"),
    "zha": ("zh", "a"),
    "zhai": ("zh", "ai"),
    "zhan": ("zh", "an"),
    "zhang": ("zh", "ang"),
    "zhao": ("zh", "ao"),
    "zhe": ("zh", "e"),
    "zhei": ("zh", "ei"),
    "zhen": ("zh", "en"),
    "zheng": ("zh", "eng"),
    "zhi": ("zh", "iii"),
    "zhong": ("zh", "ong"),
    "zhou": ("zh", "ou"),
    "zhu": ("zh", "u"),
    "zhua": ("zh", "ua"),
    "zhuai": ("zh", "uai"),
    "zhuan": ("zh", "uan"),
    "zhuang": ("zh", "uang"),
    "zhui": ("zh", "uei"),
    "zhun": ("zh", "uen"),
    "zhuo": ("zh", "uo"),
    "zi": ("z", "ii"),
    "zong": ("z", "ong"),
    "zou": ("z", "ou"),
    "zu": ("z", "u"),
    "zuan": ("z", "uan"),
    "zui": ("z", "uei"),
    "zun": ("z", "uen"),
    "zuo": ("z", "uo"),
}
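A minimal sketch of how this table is consumed (mirroring _get_phoneme4pinyin in tokenizer.py, shown later in this commit): the tone digit is split off a pypinyin syllable and the toneless remainder is looked up to get an initial/final pair, with the tone re-attached to the final; zero-initial syllables map to "^".

from pinyin_dict import pinyin_dict


def split_syllable(pinyin: str):
    """E.g. "zhong1" -> ["zh", "ong1"]; "an4" -> ["^", "an4"]."""
    tone, base = pinyin[-1], pinyin[:-1]
    initial, final = pinyin_dict[base]
    return [initial, final + tone]


print(split_syllable("zhong1"))  # ['zh', 'ong1']
print(split_syllable("an4"))     # ['^', 'an4']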
egs/aishell3/TTS/local/prepare_token_file.py  (new executable file, 53 lines)
@@ -0,0 +1,53 @@
#!/usr/bin/env python3
# Copyright    2023  Xiaomi Corp.  (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
This file generates the file tokens.txt that maps tokens to IDs.
"""

import argparse
import logging
from pathlib import Path
from typing import Dict
from symbols import symbols


def get_args():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--tokens",
        type=Path,
        default=Path("data/tokens.txt"),
        help="Path to the dict that maps the text tokens to IDs",
    )

    return parser.parse_args()


def main():
    args = get_args()
    tokens = Path(args.tokens)

    with open(tokens, "w", encoding="utf-8") as f:
        for token_id, token in enumerate(symbols):
            f.write(f"{token} {token_id}\n")


if __name__ == "__main__":
    main()
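Given the symbol order defined in symbols.py (pauses first, then initials, then final-tone combinations), the generated tokens.txt is expected to begin like this:

sil 0
eos 1
sp 2
#0 3
#1 4
#2 5
#3 6
^ 7
b 8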
egs/aishell3/TTS/local/prepare_tokens_aishell3.py  (new executable file, 62 lines)
@@ -0,0 +1,62 @@
#!/usr/bin/env python3
# Copyright    2023  Xiaomi Corp.  (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
This file reads the texts in the given manifest and saves the new cuts with tokens.
"""

import logging
from pathlib import Path

from lhotse import CutSet, load_manifest

from tokenizer import Tokenizer


def prepare_tokens_aishell3():
    output_dir = Path("data/spectrogram")
    prefix = "aishell3"
    suffix = "jsonl.gz"
    partitions = ("train", "test")

    tokenizer = Tokenizer()

    for partition in partitions:
        cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")

        new_cuts = []
        i = 0
        for cut in cut_set:
            # Each cut only contains one supervision
            assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
            text = cut.supervisions[0].text
            cut.tokens = tokenizer.text_to_tokens(text)

            new_cuts.append(cut)

        new_cut_set = CutSet.from_cuts(new_cuts)
        new_cut_set.to_file(
            output_dir / f"{prefix}_cuts_with_tokens_{partition}.{suffix}"
        )


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)

    prepare_tokens_aishell3()
egs/aishell3/TTS/local/pypinyin-local.dict  (new file, 328 lines)
@@ -0,0 +1,328 @@
姐姐 jie3 jie
宝宝 bao3 bao
哥哥 ge1 ge
妹妹 mei4 mei
弟弟 di4 di
妈妈 ma1 ma
开心哦 kai1 xin1 o
爸爸 ba4 ba
秘密哟 mi4 mi4 yo
哦 o
一年 yi4 nian2
一夜 yi2 ye4
一切 yi2 qie4
一座 yi2 zuo4
一下 yi2 xia4
上一山 shang4 yi2 shan1
下一山 xia4 yi2 shan1
休息 xiu1 xi2
东西 dong1 xi
上一届 shang4 yi2 jie4
便宜 pian2 yi4
加长 jia1 chang2
单田芳 shan4 tian2 fang1
帧 zhen1
长时间 chang2 shi2 jian1
长时 chang2 shi2
识别 shi2 bie2
生命中 sheng1 ming4 zhong1
踏实 ta1 shi
嗯 en4
溜达 liu1 da
少儿 shao4 er2
爷爷 ye2 ye
不是 bu2 shi4
一圈 yi1 quan1
厜读一声 zui1 du2 yi4 sheng1
一种 yi4 zhong3
一簇簇 yi2 cu4 cu4
一个 yi2 ge4
一样 yi2 yang4
一跩一跩 yi4 zhuai3 yi4 zhuai3
一会儿 yi2 hui4 er
一幢 yi2 zhuang4
挨了 ai2 le
熬菜 ao1 cai4
扒鸡 pa2 ji1
背枪 bei1 qiang1
绷瓷儿 beng4 ci2 er2
绷劲儿 beng3 jin4 er
绷着脸 beng3 zhe lian3
藏医 zang4 yi1
噌吰 cheng1 hong2
差点儿 cha4 dian3 er
差失 cha1 shi1
差误 cha1 wu4
孱头 can4 tou
乘间 cheng2 jian4
锄镰棘矜 chu2 lian2 ji2 qin2
川藏 chuan1 zang4
穿著 chuan1 zhuo2
答讪 da1 shan4
答言 da1 yan2
大伯子 da4 bai3 zi
大夫 dai4 fu
弹冠 tan2 guan1
当间 dang1 jian4
当然咯 dang1 ran2 lo
点种 dian3 zhong3
垛好 duo4 hao3
发疟子 fa1 yao4 zi
饭熟了 fan4 shou2 le
附著 fu4 zhuo2
复沓 fu4 ta4
供稿 gong1 gao3
供养 gong1 yang3
骨朵 gu1 duo
骨碌 gu1 lu
果脯 guo3 fu3
哈什玛 ha4 shi2 ma3
海蜇 hai3 zhe2
呵欠 he1 qian
河水汤汤 he2 shui3 shang1 shang1
鹄立 hu2 li4
鹄望 hu2 wang4
混人 hun2 ren2
混水 hun2 shui3
鸡血 ji1 xie3
缉鞋口 qi1 xie2 kou3
亟来闻讯 qi4 lai2 wen2 xun4
计量 ji4 liang2
济水 ji3 shui3
间杂 jian4 za2
脚跐两只船 jiao3 ci3 liang3 zhi1 chuan2
脚儿 jue2 er2
口角 kou3 jiao3
勒石 le4 shi2
累进 lei3 jin4
累累如丧家之犬 lei2 lei2 ru2 sang4 jia1 zhi1 quan3
累年 lei3 nian2
脸涨通红 lian3 zhang4 tong1 hong2
踉锵 liang4 qiang1
燎眉毛 liao3 mei2 mao2
燎头发 liao3 tou2 fa4
溜达 liu1 da
溜缝儿 liu4 feng4 er
馏口饭 liu4 kou3 fan4
遛马 liu4 ma3
遛鸟 liu4 niao3
遛弯儿 liu4 wan1 er
楼枪机 lou1 qiang1 ji1
搂钱 lou1 qian2
鹿脯 lu4 fu3
露头 lou4 tou2
落魄 luo4 po4
捋胡子 lv3 hu2 zi
绿地 lv4 di4
麦垛 mai4 duo4
没劲儿 mei2 jin4 er
闷棍 men4 gun4
闷葫芦 men4 hu2 lu
闷头干 men1 tou2 gan4
蒙古 meng3 gu3
靡日不思 mi3 ri4 bu4 si1
缪姓 miao4 xing4
抹墙 mo4 qiang2
抹下脸 ma1 xia4 lian3
泥子 ni4 zi
拗不过 niu4 bu guo4
排车 pai3 che1
盘诘 pan2 jie2
膀肿 pang1 zhong3
炮干 bao1 gan1
炮格 pao2 ge2
碰钉子 peng4 ding1 zi
缥色 piao3 se4
瀑河 bao4 he2
蹊径 xi1 jing4
前后相属 qian2 hou4 xiang1 zhu3
翘尾巴 qiao4 wei3 ba
趄坡儿 qie4 po1 er
秦桧 qin2 hui4
圈马 juan1 ma3
雀盲眼 qiao3 mang2 yan3
雀子 qiao1 zi
三年五载 san1 nian2 wu3 zai3
加载 jia1 zai3
山大王 shan1 dai4 wang
苫屋草 shan4 wu1 cao3
数数 shu3 shu4
说客 shui4 ke4
思量 si1 liang2
伺侯 ci4 hou
踏实 ta1 shi
提溜 di1 liu
调拨 diao4 bo1
帖子 tie3 zi
铜钿 tong2 tian2
头昏脑涨 tou2 hun1 nao3 zhang4
褪色 tui4 se4
褪着手 tun4 zhe shou3
圩子 wei2 zi
尾巴 wei3 ba
系好船只 xi4 hao3 chuan2 zhi1
系好马匹 xi4 hao3 ma3 pi3
杏脯 xing4 fu3
姓单 xing4 shan4
姓葛 xing4 ge3
姓哈 xing4 ha3
姓解 xing4 xie4
姓秘 xing4 bi4
姓宁 xing4 ning4
旋风 xuan4 feng1
旋根车轴 xuan4 gen1 che1 zhou2
荨麻 qian2 ma2
一幢楼房 yi1 zhuang4 lou2 fang2
遗之千金 wei4 zhi1 qian1 jin1
殷殷 yin3 yin3
应招 ying4 zhao1
用称约 yong4 cheng4 yao1
约斤肉 yao1 jin1 rou4
晕机 yun4 ji1
熨贴 yu4 tie1
咋办 za3 ban4
咋呼 zha1 hu
仔兽 zi3 shou4
扎彩 za1 cai3
扎实 zha1 shi
扎腰带 za1 yao1 dai4
轧朋友 ga2 peng2 you3
爪子 zhua3 zi
折腾 zhe1 teng
着实 zhuo2 shi2
着我旧时裳 zhuo2 wo3 jiu4 shi2 chang2
枝蔓 zhi1 man4
中鹄 zhong1 hu2
中选 zhong4 xuan3
猪圈 zhu1 juan4
拽住不放 zhuai4 zhu4 bu4 fang4
转悠 zhuan4 you
庄稼熟了 zhuang1 jia shou2 le
酌量 zhuo2 liang2
罪行累累 zui4 xing2 lei3 lei3
一手 yi4 shou3
一去不复返 yi2 qu4 bu2 fu4 fan3
一颗 yi4 ke1
一件 yi2 jian4
一斤 yi4 jin1
一点 yi4 dian3
一朵 yi4 duo3
一声 yi4 sheng1
一身 yi4 shen1
不要 bu2 yao4
一人 yi4 ren2
一个 yi2 ge4
一把 yi4 ba3
一门 yi4 men2
一門 yi4 men2
一艘 yi4 sou1
一片 yi2 pian4
一篇 yi2 pian1
一份 yi2 fen4
好嗲 hao3 dia3
随地 sui2 di4
扁担长 bian3 dan4 chang3
一堆 yi4 dui1
不义 bu2 yi4
放一放 fang4 yi2 fang4
一米 yi4 mi3
一顿 yi2 dun4
一层楼 yi4 ceng2 lou2
一条 yi4 tiao2
一件 yi2 jian4
一棵 yi4 ke1
一小股 yi4 xiao3 gu3
一拐一拐 yi4 guai3 yi4 guai3
一根 yi4 gen1
沆瀣一气 hang4 xie4 yi2 qi4
一丝 yi4 si1
一毫 yi4 hao2
一樣 yi2 yang4
处处 chu4 chu4
一餐 yi4 can
永不 yong3 bu2
一看 yi2 kan4
一架 yi2 jia4
送还 song4 huan2
一见 yi2 jian4
一座 yi2 zuo4
一块 yi2 kuai4
一天 yi4 tian1
一只 yi4 zhi1
一支 yi4 zhi1
一字 yi2 zi4
一句 yi2 ju4
一张 yi4 zhang1
一條 yi4 tiao2
一场 yi4 chang3
一粒 yi2 li4
小俩口 xiao3 liang3 kou3
一首 yi4 shou3
一对 yi2 dui4
一手 yi4 shou3
又一村 you4 yi4 cun1
一概而论 yi2 gai4 er2 lun4
一峰峰 yi4 feng1 feng1
不但 bu2 dan4
一笑 yi2 xiao4
挠痒痒 nao2 yang3 yang
不对 bu2 dui4
拧开 ning3 kai1
爱不释手 ai4 bu2 shi4 shou3
一念 yi2 nian4
夺得 duo2 de2
一袭 yi4 xi2
一定 yi2 ding4
不慎 bu2 shen4
剽窃 piao2 qie4
一时 yi4 shi2
撇开 pie3 kai1
一祭 yi2 ji4
发卡 fa4 qia3
少不了 shao3 bu4 liao3
千虑一失 qian1 lv4 yi4 shi1
呛得 qiang4 de2
切菜 qie1 cai4
茄盒 qie2 he2
不去 bu2 qu4
一大圈 yi2 da4 quan1
不再 bu2 zai4
一群 yi4 qun2
不必 bu2 bi4
一些 yi4 xie1
一路 yi2 lu4
一股 yi4 gu3
一到 yi2 dao4
一拨 yi4 bo1
一排 yi4 pai2
一空 yi4 kong1
吮吸着 shun3 xi1 zhe
不适合 bu2 shi4 he2
一串串 yi2 chuan4 chuan4
一提起 yi4 ti2 qi3
一尘不染 yi4 chen2 bu4 ran3
一生 yi4 sheng1
一派 yi2 pai4
不断 bu2 duan4
一次 yi2 ci4
不进步 bu2 jin4 bu4
娃娃 wa2 wa
万户侯 wan4 hu4 hou2
一方 yi4 fang1
一番话 yi4 fan1 hua4
一遍 yi2 bian4
不计较 bu2 ji4 jiao4
诇 xiong4
一边 yi4 bian1
一束 yi2 shu4
一听到 yi4 ting1 dao4
炸鸡 zha2 ji1
乍暧还寒 zha4 ai4 huan2 han2
我说诶 wo3 shuo1 ei1
棒诶 bang4 ei1
寒碜 han2 chen4
应采儿 ying4 cai3 er2
晕车 yun1 che1
必应 bi4 ying4
应援 ying4 yuan2
应力 ying4 li4
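Each line of this dictionary holds a word or phrase followed by one pinyin token per character; neutral-tone syllables carry no tone digit (e.g. the second jie in 姐姐). tokenizer.py later loads the file into the nested-list format that pypinyin's load_phrases_dict expects; a minimal sketch of that per-line conversion:

# Mirrors Tokenizer._load_pinyin_dict (defined later in this commit).
line = "姐姐 jie3 jie"
hanzi, *pinyin = line.strip().split()
entry = {hanzi: [[p] for p in pinyin]}
print(entry)  # {'姐姐': [['jie3'], ['jie']]}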
egs/aishell3/TTS/local/symbols.py  (new file, 73 lines)
@@ -0,0 +1,73 @@
# This file is copied from
# https://github.com/UEhQZXI/vits_chinese/blob/master/text/symbols.py
_pause = ["sil", "eos", "sp", "#0", "#1", "#2", "#3"]

_initials = [
    "^",
    "b",
    "c",
    "ch",
    "d",
    "f",
    "g",
    "h",
    "j",
    "k",
    "l",
    "m",
    "n",
    "p",
    "q",
    "r",
    "s",
    "sh",
    "t",
    "x",
    "z",
    "zh",
]

_tones = ["1", "2", "3", "4", "5"]

_finals = [
    "a",
    "ai",
    "an",
    "ang",
    "ao",
    "e",
    "ei",
    "en",
    "eng",
    "er",
    "i",
    "ia",
    "ian",
    "iang",
    "iao",
    "ie",
    "ii",
    "iii",
    "in",
    "ing",
    "iong",
    "iou",
    "o",
    "ong",
    "ou",
    "u",
    "ua",
    "uai",
    "uan",
    "uang",
    "uei",
    "uen",
    "ueng",
    "uo",
    "v",
    "van",
    "ve",
    "vn",
]

symbols = _pause + _initials + [i + j for i in _finals for j in _tones]
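A quick sanity check on the resulting vocabulary size: 7 pause symbols + 22 initials + 38 finals x 5 tones = 219 tokens, which is also the number of lines prepare_token_file.py writes to tokens.txt.

from symbols import _finals, _initials, _pause, _tones, symbols

assert len(symbols) == len(_pause) + len(_initials) + len(_finals) * len(_tones)
print(len(symbols))  # 219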
egs/aishell3/TTS/local/tokenizer.py  (new file, 137 lines)
@@ -0,0 +1,137 @@
# This file is modified from
# https://github.com/UEhQZXI/vits_chinese/blob/master/vits_strings.py

import logging
from pathlib import Path
from typing import Dict, List

# Note pinyin_dict is from ./pinyin_dict.py
from pinyin_dict import pinyin_dict
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin, load_phrases_dict


class _MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    pass


class Tokenizer:
    def __init__(self, tokens: str = ""):
        self._load_pinyin_dict()
        self._pinyin_parser = Pinyin(_MyConverter())

        if tokens != "":
            self._load_tokens(tokens)

    def texts_to_token_ids(self, texts: List[str], **kwargs) -> List[List[int]]:
        """
        Args:
          texts:
            A list of sentences.
          kwargs:
            Not used. It is for compatibility with other TTS recipes in icefall.
        """
        tokens = []

        for text in texts:
            tokens.append(self.text_to_tokens(text))

        return self.tokens_to_token_ids(tokens)

    def tokens_to_token_ids(self, tokens: List[List[str]]) -> List[List[int]]:
        ans = []

        for token_list in tokens:
            token_ids = []
            for t in token_list:
                if t not in self.token2id:
                    logging.warning(f"Skip OOV {t}")
                    continue
                token_ids.append(self.token2id[t])
            ans.append(token_ids)

        return ans

    def text_to_tokens(self, text: str) -> List[str]:
        # Convert "," to ["sp", "sil"]
        # Convert "。" to ["sil"]
        # append ["eos"] at the end of a sentence
        phonemes = ["sil"]
        pinyins = self._pinyin_parser.pinyin(
            text,
            style=Style.TONE3,
            errors=lambda x: [[w] for w in x],
        )

        new_pinyin = []
        for p in pinyins:
            p = p[0]
            if p == ",":
                new_pinyin.extend(["sp", "sil"])
            elif p == "。":
                new_pinyin.append("sil")
            else:
                new_pinyin.append(p)
        sub_phonemes = self._get_phoneme4pinyin(new_pinyin)
        sub_phonemes.append("eos")
        phonemes.extend(sub_phonemes)
        return phonemes

    def _get_phoneme4pinyin(self, pinyins):
        result = []
        for pinyin in pinyins:
            if pinyin in ("sil", "sp"):
                result.append(pinyin)
            elif pinyin[:-1] in pinyin_dict:
                tone = pinyin[-1]
                a = pinyin[:-1]
                a1, a2 = pinyin_dict[a]
                # every word is appended with a #0
                result += [a1, a2 + tone, "#0"]

        return result

    def _load_pinyin_dict(self):
        this_dir = Path(__file__).parent.resolve()
        my_dict = {}
        with open(f"{this_dir}/pypinyin-local.dict", "r", encoding="utf-8") as f:
            content = f.readlines()
            for line in content:
                cuts = line.strip().split()
                hanzi = cuts[0]
                pinyin = cuts[1:]
                my_dict[hanzi] = [[p] for p in pinyin]

        load_phrases_dict(my_dict)

    def _load_tokens(self, filename):
        token2id: Dict[str, int] = {}

        with open(filename, "r", encoding="utf-8") as f:
            for line in f.readlines():
                info = line.rstrip().split()
                if len(info) == 1:
                    # case of space
                    token = " "
                    idx = int(info[0])
                else:
                    token, idx = info[0], int(info[1])

                assert token not in token2id, token

                token2id[token] = idx

        self.token2id = token2id
        self.vocab_size = len(self.token2id)
        self.pad_id = self.token2id["#0"]


def main():
    tokenizer = Tokenizer()
    tokenizer.text_to_tokens("你好,好的。")


if __name__ == "__main__":
    main()
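A short usage sketch of the tokenizer above; the expected output follows directly from text_to_tokens (a leading "sil", per-syllable initial / final+tone / "#0" triples, and a trailing "eos"). data/tokens.txt is the vocabulary generated in stage 6 of prepare.sh:

from tokenizer import Tokenizer

tokenizer = Tokenizer("data/tokens.txt")
tokens = tokenizer.text_to_tokens("你好。")
# Expected: ['sil', 'n', 'i3', '#0', 'h', 'ao3', '#0', 'sil', 'eos']
ids = tokenizer.tokens_to_token_ids([tokens])
print(tokens, ids)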
egs/aishell3/TTS/local/validate_manifest.py  (symbolic link, 1 line)
@@ -0,0 +1 @@
../../../ljspeech/TTS/local/validate_manifest.py
egs/aishell3/TTS/prepare.sh  (new executable file, 141 lines)
@@ -0,0 +1,141 @@
#!/usr/bin/env bash

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -eou pipefail

stage=-1
stop_stage=100

dl_dir=$PWD/download

. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: build monotonic_align lib"
  if [ ! -d vits/monotonic_align/build ]; then
    cd vits/monotonic_align
    python3 setup.py build_ext --inplace
    cd ../../
  else
    log "monotonic_align lib already built"
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Download data"

  # The directory $dl_dir/aishell3 will contain the following files
  # and sub directories
  #   ChangeLog  ReadMe.txt  phone_set.txt  spk-info.txt  test  train
  # If you have pre-downloaded it to /path/to/aishell3, you can create a symlink
  #
  #   ln -sfv /path/to/aishell3 $dl_dir/
  #   touch $dl_dir/aishell3/.completed
  #
  if [ ! -d $dl_dir/aishell3 ]; then
    lhotse download aishell3 $dl_dir
  fi
fi


if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare aishell3 manifest (may take 13 minutes)"
  # We assume that you have downloaded the aishell3 corpus
  # to $dl_dir/aishell3.
  # You can find files like spk-info.txt inside $dl_dir/aishell3
  mkdir -p data/manifests
  if [ ! -e data/manifests/.aishell3.done ]; then
    lhotse prepare aishell3 $dl_dir/aishell3 data/manifests >/dev/null 2>&1
    touch data/manifests/.aishell3.done
  fi
fi


if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Compute spectrogram for aishell3 (may take 5 minutes)"
  mkdir -p data/spectrogram
  if [ ! -e data/spectrogram/.aishell3.done ]; then
    ./local/compute_spectrogram_aishell3.py
    touch data/spectrogram/.aishell3.done
  fi

  if [ ! -e data/spectrogram/.aishell3-validated.done ]; then
    log "Validating data/spectrogram for aishell3"
    python3 ./local/validate_manifest.py \
      data/spectrogram/aishell3_cuts_train.jsonl.gz

    python3 ./local/validate_manifest.py \
      data/spectrogram/aishell3_cuts_test.jsonl.gz

    touch data/spectrogram/.aishell3-validated.done
  fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Prepare tokens for aishell3 (may take 20 seconds)"
  if [ ! -e data/spectrogram/.aishell3_with_token.done ]; then

    ./local/prepare_tokens_aishell3.py

    mv -v data/spectrogram/aishell3_cuts_with_tokens_train.jsonl.gz \
      data/spectrogram/aishell3_cuts_train.jsonl.gz

    mv -v data/spectrogram/aishell3_cuts_with_tokens_test.jsonl.gz \
      data/spectrogram/aishell3_cuts_test.jsonl.gz

    touch data/spectrogram/.aishell3_with_token.done
  fi
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Split the aishell3 cuts into train, valid and test sets (may take 25 seconds)"
  if [ ! -e data/spectrogram/.aishell3_split.done ]; then
    lhotse subset --last 1000 \
      data/spectrogram/aishell3_cuts_test.jsonl.gz \
      data/spectrogram/aishell3_cuts_valid.jsonl.gz

    n=$(( $(gunzip -c data/spectrogram/aishell3_cuts_test.jsonl.gz | wc -l) - 1000 ))

    lhotse subset --first $n \
      data/spectrogram/aishell3_cuts_test.jsonl.gz \
      data/spectrogram/aishell3_cuts_test2.jsonl.gz

    mv data/spectrogram/aishell3_cuts_test2.jsonl.gz data/spectrogram/aishell3_cuts_test.jsonl.gz

    touch data/spectrogram/.aishell3_split.done
  fi
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Generate tokens.txt and lexicon.txt"
  if [ ! -e data/tokens.txt ]; then
    ./local/prepare_token_file.py --tokens data/tokens.txt
  fi

  if [ ! -e data/lexicon.txt ]; then
    ./local/generate_lexicon.py --tokens data/tokens.txt --lexicon data/lexicon.txt
  fi
fi

if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "Stage 7: Generate speakers file"
  if [ ! -e data/speakers.txt ]; then
    gunzip -c data/manifests/aishell3_supervisions_train.jsonl.gz \
      | jq '.speaker' | sed 's/"//g' \
      | sort | uniq > data/speakers.txt
  fi
fi
egs/aishell3/TTS/shared  (symbolic link, 1 line)
@@ -0,0 +1 @@
../../../icefall/shared
egs/aishell3/TTS/vits/duration_predictor.py  (symbolic link, 1 line)
@@ -0,0 +1 @@
../../../ljspeech/TTS/vits/duration_predictor.py
egs/aishell3/TTS/vits/export-onnx.py  (new executable file, 433 lines)
@@ -0,0 +1,433 @@
#!/usr/bin/env python3
#
# Copyright      2023  Xiaomi Corporation     (Author: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script exports a VITS model from PyTorch to ONNX.

Export the model to ONNX:
./vits/export-onnx.py \
  --epoch 1000 \
  --speakers ./data/speakers.txt \
  --exp-dir vits/exp \
  --tokens data/tokens.txt

It will generate one file inside vits/exp:
  - vits-epoch-1000.onnx

See ./test_onnx.py for how to use the exported ONNX models.
"""
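For context, a minimal sketch of consuming the exported model with onnxruntime. The input/output names here are an assumption (the torch.onnx.export call below is truncated in this view before its input_names argument), so the sketch first prints the real names reported by the session:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("vits/exp/vits-epoch-1000.onnx")
for node in sess.get_inputs():
    print(node.name, node.shape)  # discover the actual input names/shapes

token_ids = np.array([[0, 7, 8, 9]], dtype=np.int64)  # hypothetical token IDs
inputs = {
    "tokens": token_ids,  # assumed input name
    "tokens_lens": np.array([token_ids.shape[1]], dtype=np.int64),
    "noise_scale": np.array([0.667], dtype=np.float32),
    "noise_scale_dur": np.array([0.8], dtype=np.float32),
    "alpha": np.array([1.0], dtype=np.float32),
    "sid": np.array([0], dtype=np.int64),
}
audio = sess.run(None, inputs)[0]  # (1, T_wav) float32 waveform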

import argparse
import logging
from pathlib import Path
from typing import Dict, Tuple

import onnx
import torch
import torch.nn as nn
from tokenizer import Tokenizer
from train import get_model, get_params

from icefall.checkpoint import load_checkpoint


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--epoch",
        type=int,
        default=1000,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        """,
    )

    parser.add_argument(
        "--exp-dir",
        type=str,
        default="vits/exp",
        help="The experiment dir",
    )

    parser.add_argument(
        "--tokens",
        type=str,
        default="data/tokens.txt",
        help="""Path to vocabulary.""",
    )

    parser.add_argument(
        "--speakers",
        type=Path,
        default=Path("data/speakers.txt"),
        help="Path to speakers.txt file.",
    )

    parser.add_argument(
        "--model-type",
        type=str,
        default="low",
        choices=["low", "medium", "high"],
        help="""If not empty, valid values are: low, medium, high.
        It controls the model size. low -> runs faster.
        """,
    )

    return parser


def add_meta_data(filename: str, meta_data: Dict[str, str]):
    """Add meta data to an ONNX model. It is changed in-place.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs.
    """
    model = onnx.load(filename)
    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        meta.value = str(value)

    onnx.save(model, filename)


class OnnxModel(nn.Module):
    """A wrapper for VITS generator."""

    def __init__(self, model: nn.Module):
        """
        Args:
          model:
            A VITS generator.
        """
        super().__init__()
        self.model = model

    def forward(
        self,
        tokens: torch.Tensor,
        tokens_lens: torch.Tensor,
        noise_scale: float = 0.667,
        alpha: float = 1.0,
        noise_scale_dur: float = 0.8,
        speaker: int = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Please see the help information of VITS.inference_batch

        Args:
          tokens:
            Input text token indexes (1, T_text)
          tokens_lens:
            Number of tokens of shape (1,)
          noise_scale (float):
            Noise scale parameter for flow.
          noise_scale_dur (float):
            Noise scale parameter for duration predictor.
          speaker (int):
            Speaker ID.
          alpha (float):
            Alpha parameter to control the speed of generated speech.

        Returns:
          Return a tuple containing:
            - audio, generated waveform tensor, (B, T_wav)
        """
        audio, _, _ = self.model.generator.inference(
            text=tokens,
            text_lengths=tokens_lens,
            noise_scale=noise_scale,
            noise_scale_dur=noise_scale_dur,
            sids=speaker,
            alpha=alpha,
        )
        return audio


def export_model_onnx(
    model: nn.Module,
    model_filename: str,
    vocab_size: int,
    opset_version: int = 11,
) -> None:
    """Export the given generator model to ONNX format.
    The exported model has one input:

      - tokens, a tensor of shape (1, T_text); dtype is torch.int64

    and it has one output:

      - audio, a tensor of shape (1, T'); dtype is torch.float32

    Args:
      model:
        The VITS generator.
      model_filename:
        The filename to save the exported ONNX model.
      vocab_size:
        Number of tokens used in training.
      opset_version:
        The opset version to use.
    """
    tokens = torch.randint(low=0, high=vocab_size, size=(1, 13), dtype=torch.int64)
    tokens_lens = torch.tensor([tokens.shape[1]], dtype=torch.int64)
    noise_scale = torch.tensor([1], dtype=torch.float32)
    noise_scale_dur = torch.tensor([1], dtype=torch.float32)
    alpha = torch.tensor([1], dtype=torch.float32)
    speaker = torch.tensor([1], dtype=torch.int64)

    torch.onnx.export(
        model,
        (tokens, tokens_lens, noise_scale, alpha, noise_scale_dur, speaker),
        model_filename,
|
||||||
|
verbose=False,
|
||||||
|
opset_version=opset_version,
|
||||||
|
input_names=[
|
||||||
|
"tokens",
|
||||||
|
"tokens_lens",
|
||||||
|
"noise_scale",
|
||||||
|
"alpha",
|
||||||
|
"noise_scale_dur",
|
||||||
|
"speaker",
|
||||||
|
],
|
||||||
|
output_names=["audio"],
|
||||||
|
dynamic_axes={
|
||||||
|
"tokens": {0: "N", 1: "T"},
|
||||||
|
"tokens_lens": {0: "N"},
|
||||||
|
"audio": {0: "N", 1: "T"},
|
||||||
|
"speaker": {0: "N"},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
if model.model.spks is None:
|
||||||
|
num_speakers = 1
|
||||||
|
else:
|
||||||
|
num_speakers = model.model.spks
|
||||||
|
|
||||||
|
meta_data = {
|
||||||
|
"model_type": "vits",
|
||||||
|
"version": "1",
|
||||||
|
"model_author": "k2-fsa",
|
||||||
|
"comment": "icefall", # must be icefall for models from icefall
|
||||||
|
"language": "Chinese",
|
||||||
|
"n_speakers": num_speakers,
|
||||||
|
"sample_rate": model.model.sampling_rate, # Must match the real sample rate
|
||||||
|
}
|
||||||
|
logging.info(f"meta_data: {meta_data}")
|
||||||
|
|
||||||
|
add_meta_data(filename=model_filename, meta_data=meta_data)
|
||||||
|
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def main():
|
||||||
|
args = get_parser().parse_args()
|
||||||
|
args.exp_dir = Path(args.exp_dir)
|
||||||
|
|
||||||
|
params = get_params()
|
||||||
|
params.update(vars(args))
|
||||||
|
|
||||||
|
tokenizer = Tokenizer(params.tokens)
|
||||||
|
params.blank_id = tokenizer.pad_id
|
||||||
|
params.vocab_size = tokenizer.vocab_size
|
||||||
|
|
||||||
|
with open(args.speakers) as f:
|
||||||
|
speaker_map = {line.strip(): i for i, line in enumerate(f)}
|
||||||
|
params.num_spks = len(speaker_map)
|
||||||
|
|
||||||
|
logging.info(params)
|
||||||
|
|
||||||
|
logging.info("About to create model")
|
||||||
|
model = get_model(params)
|
||||||
|
|
||||||
|
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
|
||||||
|
|
||||||
|
model.to("cpu")
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
model = OnnxModel(model=model)
|
||||||
|
|
||||||
|
num_param = sum([p.numel() for p in model.parameters()])
|
||||||
|
logging.info(f"generator parameters: {num_param}, or {num_param/1000/1000} M")
|
||||||
|
|
||||||
|
suffix = f"epoch-{params.epoch}"
|
||||||
|
|
||||||
|
opset_version = 13
|
||||||
|
|
||||||
|
logging.info("Exporting encoder")
|
||||||
|
model_filename = params.exp_dir / f"vits-{suffix}.onnx"
|
||||||
|
export_model_onnx(
|
||||||
|
model,
|
||||||
|
model_filename,
|
||||||
|
params.vocab_size,
|
||||||
|
opset_version=opset_version,
|
||||||
|
)
|
||||||
|
logging.info(f"Exported generator to {model_filename}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||||
|
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||||
|
main()
|
||||||
|
|
||||||
|
"""
|
||||||
|
Supported languages.
|
||||||
|
|
||||||
|
LJSpeech is using "en-us" from the second column.
|
||||||
|
|
||||||
|
Pty Language Age/Gender VoiceName File Other Languages
|
||||||
|
5 af --/M Afrikaans gmw/af
|
||||||
|
5 am --/M Amharic sem/am
|
||||||
|
5 an --/M Aragonese roa/an
|
||||||
|
5 ar --/M Arabic sem/ar
|
||||||
|
5 as --/M Assamese inc/as
|
||||||
|
5 az --/M Azerbaijani trk/az
|
||||||
|
5 ba --/M Bashkir trk/ba
|
||||||
|
5 be --/M Belarusian zle/be
|
||||||
|
5 bg --/M Bulgarian zls/bg
|
||||||
|
5 bn --/M Bengali inc/bn
|
||||||
|
5 bpy --/M Bishnupriya_Manipuri inc/bpy
|
||||||
|
5 bs --/M Bosnian zls/bs
|
||||||
|
5 ca --/M Catalan roa/ca
|
||||||
|
5 chr-US-Qaaa-x-west --/M Cherokee_ iro/chr
|
||||||
|
5 cmn --/M Chinese_(Mandarin,_latin_as_English) sit/cmn (zh-cmn 5)(zh 5)
|
||||||
|
5 cmn-latn-pinyin --/M Chinese_(Mandarin,_latin_as_Pinyin) sit/cmn-Latn-pinyin (zh-cmn 5)(zh 5)
|
||||||
|
5 cs --/M Czech zlw/cs
|
||||||
|
5 cv --/M Chuvash trk/cv
|
||||||
|
5 cy --/M Welsh cel/cy
|
||||||
|
5 da --/M Danish gmq/da
|
||||||
|
5 de --/M German gmw/de
|
||||||
|
5 el --/M Greek grk/el
|
||||||
|
5 en-029 --/M English_(Caribbean) gmw/en-029 (en 10)
|
||||||
|
2 en-gb --/M English_(Great_Britain) gmw/en (en 2)
|
||||||
|
5 en-gb-scotland --/M English_(Scotland) gmw/en-GB-scotland (en 4)
|
||||||
|
5 en-gb-x-gbclan --/M English_(Lancaster) gmw/en-GB-x-gbclan (en-gb 3)(en 5)
|
||||||
|
5 en-gb-x-gbcwmd --/M English_(West_Midlands) gmw/en-GB-x-gbcwmd (en-gb 9)(en 9)
|
||||||
|
5 en-gb-x-rp --/M English_(Received_Pronunciation) gmw/en-GB-x-rp (en-gb 4)(en 5)
|
||||||
|
2 en-us --/M English_(America) gmw/en-US (en 3)
|
||||||
|
5 en-us-nyc --/M English_(America,_New_York_City) gmw/en-US-nyc
|
||||||
|
5 eo --/M Esperanto art/eo
|
||||||
|
5 es --/M Spanish_(Spain) roa/es
|
||||||
|
5 es-419 --/M Spanish_(Latin_America) roa/es-419 (es-mx 6)
|
||||||
|
5 et --/M Estonian urj/et
|
||||||
|
5 eu --/M Basque eu
|
||||||
|
5 fa --/M Persian ira/fa
|
||||||
|
5 fa-latn --/M Persian_(Pinglish) ira/fa-Latn
|
||||||
|
5 fi --/M Finnish urj/fi
|
||||||
|
5 fr-be --/M French_(Belgium) roa/fr-BE (fr 8)
|
||||||
|
5 fr-ch --/M French_(Switzerland) roa/fr-CH (fr 8)
|
||||||
|
5 fr-fr --/M French_(France) roa/fr (fr 5)
|
||||||
|
5 ga --/M Gaelic_(Irish) cel/ga
|
||||||
|
5 gd --/M Gaelic_(Scottish) cel/gd
|
||||||
|
5 gn --/M Guarani sai/gn
|
||||||
|
5 grc --/M Greek_(Ancient) grk/grc
|
||||||
|
5 gu --/M Gujarati inc/gu
|
||||||
|
5 hak --/M Hakka_Chinese sit/hak
|
||||||
|
5 haw --/M Hawaiian map/haw
|
||||||
|
5 he --/M Hebrew sem/he
|
||||||
|
5 hi --/M Hindi inc/hi
|
||||||
|
5 hr --/M Croatian zls/hr (hbs 5)
|
||||||
|
5 ht --/M Haitian_Creole roa/ht
|
||||||
|
5 hu --/M Hungarian urj/hu
|
||||||
|
5 hy --/M Armenian_(East_Armenia) ine/hy (hy-arevela 5)
|
||||||
|
5 hyw --/M Armenian_(West_Armenia) ine/hyw (hy-arevmda 5)(hy 8)
|
||||||
|
5 ia --/M Interlingua art/ia
|
||||||
|
5 id --/M Indonesian poz/id
|
||||||
|
5 io --/M Ido art/io
|
||||||
|
5 is --/M Icelandic gmq/is
|
||||||
|
5 it --/M Italian roa/it
|
||||||
|
5 ja --/M Japanese jpx/ja
|
||||||
|
5 jbo --/M Lojban art/jbo
|
||||||
|
5 ka --/M Georgian ccs/ka
|
||||||
|
5 kk --/M Kazakh trk/kk
|
||||||
|
5 kl --/M Greenlandic esx/kl
|
||||||
|
5 kn --/M Kannada dra/kn
|
||||||
|
5 ko --/M Korean ko
|
||||||
|
5 kok --/M Konkani inc/kok
|
||||||
|
5 ku --/M Kurdish ira/ku
|
||||||
|
5 ky --/M Kyrgyz trk/ky
|
||||||
|
5 la --/M Latin itc/la
|
||||||
|
5 lb --/M Luxembourgish gmw/lb
|
||||||
|
5 lfn --/M Lingua_Franca_Nova art/lfn
|
||||||
|
5 lt --/M Lithuanian bat/lt
|
||||||
|
5 ltg --/M Latgalian bat/ltg
|
||||||
|
5 lv --/M Latvian bat/lv
|
||||||
|
5 mi --/M Māori poz/mi
|
||||||
|
5 mk --/M Macedonian zls/mk
|
||||||
|
5 ml --/M Malayalam dra/ml
|
||||||
|
5 mr --/M Marathi inc/mr
|
||||||
|
5 ms --/M Malay poz/ms
|
||||||
|
5 mt --/M Maltese sem/mt
|
||||||
|
5 mto --/M Totontepec_Mixe miz/mto
|
||||||
|
5 my --/M Myanmar_(Burmese) sit/my
|
||||||
|
5 nb --/M Norwegian_Bokmål gmq/nb (no 5)
|
||||||
|
5 nci --/M Nahuatl_(Classical) azc/nci
|
||||||
|
5 ne --/M Nepali inc/ne
|
||||||
|
5 nl --/M Dutch gmw/nl
|
||||||
|
5 nog --/M Nogai trk/nog
|
||||||
|
5 om --/M Oromo cus/om
|
||||||
|
5 or --/M Oriya inc/or
|
||||||
|
5 pa --/M Punjabi inc/pa
|
||||||
|
5 pap --/M Papiamento roa/pap
|
||||||
|
5 piqd --/M Klingon art/piqd
|
||||||
|
5 pl --/M Polish zlw/pl
|
||||||
|
5 pt --/M Portuguese_(Portugal) roa/pt (pt-pt 5)
|
||||||
|
5 pt-br --/M Portuguese_(Brazil) roa/pt-BR (pt 6)
|
||||||
|
5 py --/M Pyash art/py
|
||||||
|
5 qdb --/M Lang_Belta art/qdb
|
||||||
|
5 qu --/M Quechua qu
|
||||||
|
5 quc --/M K'iche' myn/quc
|
||||||
|
5 qya --/M Quenya art/qya
|
||||||
|
5 ro --/M Romanian roa/ro
|
||||||
|
5 ru --/M Russian zle/ru
|
||||||
|
5 ru-cl --/M Russian_(Classic) zle/ru-cl
|
||||||
|
2 ru-lv --/M Russian_(Latvia) zle/ru-LV
|
||||||
|
5 sd --/M Sindhi inc/sd
|
||||||
|
5 shn --/M Shan_(Tai_Yai) tai/shn
|
||||||
|
5 si --/M Sinhala inc/si
|
||||||
|
5 sjn --/M Sindarin art/sjn
|
||||||
|
5 sk --/M Slovak zlw/sk
|
||||||
|
5 sl --/M Slovenian zls/sl
|
||||||
|
5 smj --/M Lule_Saami urj/smj
|
||||||
|
5 sq --/M Albanian ine/sq
|
||||||
|
5 sr --/M Serbian zls/sr
|
||||||
|
5 sv --/M Swedish gmq/sv
|
||||||
|
5 sw --/M Swahili bnt/sw
|
||||||
|
5 ta --/M Tamil dra/ta
|
||||||
|
5 te --/M Telugu dra/te
|
||||||
|
5 th --/M Thai tai/th
|
||||||
|
5 tk --/M Turkmen trk/tk
|
||||||
|
5 tn --/M Setswana bnt/tn
|
||||||
|
5 tr --/M Turkish trk/tr
|
||||||
|
5 tt --/M Tatar trk/tt
|
||||||
|
5 ug --/M Uyghur trk/ug
|
||||||
|
5 uk --/M Ukrainian zle/uk
|
||||||
|
5 ur --/M Urdu inc/ur
|
||||||
|
5 uz --/M Uzbek trk/uz
|
||||||
|
5 vi --/M Vietnamese_(Northern) aav/vi
|
||||||
|
5 vi-vn-x-central --/M Vietnamese_(Central) aav/vi-VN-x-central
|
||||||
|
5 vi-vn-x-south --/M Vietnamese_(Southern) aav/vi-VN-x-south
|
||||||
|
5 yue --/M Chinese_(Cantonese) sit/yue (zh-yue 5)(zh 8)
|
||||||
|
5 yue --/M Chinese_(Cantonese,_latin_as_Jyutping) sit/yue-Latn-jyutping (zh-yue 5)(zh 8)
|
||||||
|
"""
|
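
For readers who want to try the exported model without the recipe's ./test_onnx.py, a minimal sketch using onnxruntime might look like the following. The model path and token ids below are placeholders (real ids must come from tokenizer.py and data/tokens.txt); the input and output names match the torch.onnx.export call above.

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("vits/exp/vits-epoch-1000.onnx")

tokens = np.array([[5, 17, 23, 9, 41]], dtype=np.int64)  # placeholder token ids
tokens_lens = np.array([tokens.shape[1]], dtype=np.int64)

(audio,) = session.run(
    ["audio"],
    {
        "tokens": tokens,
        "tokens_lens": tokens_lens,
        "noise_scale": np.array([0.667], dtype=np.float32),
        "noise_scale_dur": np.array([0.8], dtype=np.float32),
        "alpha": np.array([1.0], dtype=np.float32),
        "speaker": np.array([0], dtype=np.int64),
    },
)
print(audio.shape)  # (1, T_wav); the sample rate is stored in the ONNX metadata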

1
egs/aishell3/TTS/vits/flow.py
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/flow.py

1
egs/aishell3/TTS/vits/generator.py
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/generator.py

1
egs/aishell3/TTS/vits/hifigan.py
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/hifigan.py

1
egs/aishell3/TTS/vits/loss.py
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/loss.py

1
egs/aishell3/TTS/vits/monotonic_align
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/monotonic_align/

1
egs/aishell3/TTS/vits/pinyin_dict.py
Symbolic link
@ -0,0 +1 @@
../local/pinyin_dict.py

1
egs/aishell3/TTS/vits/posterior_encoder.py
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/posterior_encoder.py

1
egs/aishell3/TTS/vits/pypinyin-local.dict
Symbolic link
@ -0,0 +1 @@
../local/pypinyin-local.dict

1
egs/aishell3/TTS/vits/residual_coupling.py
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/residual_coupling.py

1
egs/aishell3/TTS/vits/text_encoder.py
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/text_encoder.py

1
egs/aishell3/TTS/vits/tokenizer.py
Symbolic link
@ -0,0 +1 @@
../local/tokenizer.py

1007
egs/aishell3/TTS/vits/train.py
Executable file
File diff suppressed because it is too large.

1
egs/aishell3/TTS/vits/transform.py
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/transform.py

349
egs/aishell3/TTS/vits/tts_datamodule.py
Normal file
@ -0,0 +1,349 @@
# Copyright      2021 Piotr Żelasko
# Copyright 2022-2023 Xiaomi Corporation (Authors: Mingshuang Luo,
#                                                   Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse
import logging
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Optional

import torch
from lhotse import CutSet, Spectrogram, SpectrogramConfig, load_manifest_lazy
from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
    CutConcatenate,
    CutMix,
    DynamicBucketingSampler,
    PrecomputedFeatures,
    SimpleCutSampler,
    SpecAugment,
    SpeechSynthesisDataset,
)
from lhotse.dataset.input_strategies import (  # noqa F401 For AudioSamples
    AudioSamples,
    OnTheFlyFeatures,
)
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader

from icefall.utils import str2bool


class _SeedWorkers:
    def __init__(self, seed: int):
        self.seed = seed

    def __call__(self, worker_id: int):
        fix_random_seed(self.seed + worker_id)


class Aishell3SpeechTtsDataModule:
    """
    DataModule for TTS experiments.
    It assumes there is always one train and valid dataloader,
    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
    and test-other).

    It contains all the common data pipeline modules used in TTS
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - on-the-fly feature extraction

    This class should be derived for specific corpora used in TTS tasks.
    """

    def __init__(self, args: argparse.Namespace):
        self.args = args
        self.sampling_rate = 8000

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        group = parser.add_argument_group(
            title="TTS data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )

        group.add_argument(
            "--manifest-dir",
            type=Path,
            default=Path("data/spectrogram"),
            help="Path to directory with train/valid/test cuts.",
        )
        group.add_argument(
            "--speakers",
            type=Path,
            default=Path("data/speakers.txt"),
            help="Path to speakers.txt file.",
        )
        group.add_argument(
            "--max-duration",
            type=int,
            default=200.0,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )
        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )
        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="The number of buckets for the DynamicBucketingSampler "
            "(you might want to increase it for larger datasets).",
        )

        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )
        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
        )
        group.add_argument(
            "--drop-last",
            type=str2bool,
            default=True,
            help="Whether to drop last batch. Used by sampler.",
        )
        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=False,
            help="When enabled, each batch will have the "
            "field: batch['cut'] with the cuts that "
            "were used to construct it.",
        )
        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="The number of training dataloader workers that "
            "collect the batches.",
        )

        group.add_argument(
            "--input-strategy",
            type=str,
            default="PrecomputedFeatures",
            help="AudioSamples or PrecomputedFeatures",
        )

    def train_dataloaders(
        self,
        cuts_train: CutSet,
        sampler_state_dict: Optional[Dict[str, Any]] = None,
    ) -> DataLoader:
        """
        Args:
          cuts_train:
            CutSet for training.
          sampler_state_dict:
            The state dict for the training sampler.
        """
        logging.info("About to create train dataset")
        train = SpeechSynthesisDataset(
            return_text=False,
            return_tokens=True,
            return_spk_ids=True,
            feature_input_strategy=eval(self.args.input_strategy)(),
            return_cuts=self.args.return_cuts,
        )

        if self.args.on_the_fly_feats:
            sampling_rate = self.sampling_rate
            config = SpectrogramConfig(
                sampling_rate=sampling_rate,
                frame_length=1024 / sampling_rate,  # (in seconds)
                frame_shift=256 / sampling_rate,  # (in seconds)
                use_fft_mag=True,
            )
            train = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=OnTheFlyFeatures(Spectrogram(config)),
                return_cuts=self.args.return_cuts,
            )

        if self.args.bucketing_sampler:
            logging.info("Using DynamicBucketingSampler.")
            train_sampler = DynamicBucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                buffer_size=self.args.num_buckets * 2000,
                shuffle_buffer_size=self.args.num_buckets * 5000,
                drop_last=self.args.drop_last,
            )
        else:
            logging.info("Using SimpleCutSampler.")
            train_sampler = SimpleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
            )
        logging.info("About to create train dataloader")

        if sampler_state_dict is not None:
            logging.info("Loading sampler state dict")
            train_sampler.load_state_dict(sampler_state_dict)

        # 'seed' is derived from the current random state, which will have
        # previously been set in the main process.
        seed = torch.randint(0, 100000, ()).item()
        worker_init_fn = _SeedWorkers(seed)

        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
            worker_init_fn=worker_init_fn,
        )

        return train_dl

    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            sampling_rate = self.sampling_rate
            config = SpectrogramConfig(
                sampling_rate=sampling_rate,
                frame_length=1024 / sampling_rate,  # (in seconds)
                frame_shift=256 / sampling_rate,  # (in seconds)
                use_fft_mag=True,
            )
            validate = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=OnTheFlyFeatures(Spectrogram(config)),
                return_cuts=self.args.return_cuts,
            )
        else:
            validate = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=eval(self.args.input_strategy)(),
                return_cuts=self.args.return_cuts,
            )
        valid_sampler = DynamicBucketingSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            num_buckets=self.args.num_buckets,
            shuffle=False,
        )
        logging.info("About to create valid dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )

        return valid_dl

    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        logging.info("About to create test dataset")
        if self.args.on_the_fly_feats:
            sampling_rate = self.sampling_rate
            config = SpectrogramConfig(
                sampling_rate=sampling_rate,
                frame_length=1024 / sampling_rate,  # (in seconds)
                frame_shift=256 / sampling_rate,  # (in seconds)
                use_fft_mag=True,
            )
            test = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=OnTheFlyFeatures(Spectrogram(config)),
                return_cuts=self.args.return_cuts,
            )
        else:
            test = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=eval(self.args.input_strategy)(),
                return_cuts=self.args.return_cuts,
            )
        test_sampler = DynamicBucketingSampler(
            cuts,
            max_duration=self.args.max_duration,
            num_buckets=self.args.num_buckets,
            shuffle=False,
        )
        logging.info("About to create test dataloader")
        test_dl = DataLoader(
            test,
            batch_size=None,
            sampler=test_sampler,
            num_workers=self.args.num_workers,
        )
        return test_dl

    @lru_cache()
    def train_cuts(self) -> CutSet:
        logging.info("About to get train cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aishell3_cuts_train.jsonl.gz"
        )

    @lru_cache()
    def valid_cuts(self) -> CutSet:
        logging.info("About to get validation cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aishell3_cuts_valid.jsonl.gz"
        )

    @lru_cache()
    def test_cuts(self) -> CutSet:
        logging.info("About to get test cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aishell3_cuts_test.jsonl.gz"
        )

    @lru_cache()
    def speakers(self) -> Dict[str, int]:
        logging.info("About to get speakers")
        with open(self.args.speakers) as f:
            speakers = {line.strip(): i for i, line in enumerate(f)}
        return speakers
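
As a quick orientation, here is a sketch of how this datamodule is typically wired up from a script; the argument values are assumptions for illustration, and in the actual recipe vits/train.py performs these steps:

import argparse

parser = argparse.ArgumentParser()
Aishell3SpeechTtsDataModule.add_arguments(parser)
args = parser.parse_args(["--max-duration", "20", "--num-buckets", "2"])

datamodule = Aishell3SpeechTtsDataModule(args)
train_dl = datamodule.train_dataloaders(datamodule.train_cuts())

for batch in train_dl:
    # Each batch carries padded features, token ids, and speaker ids,
    # as configured by SpeechSynthesisDataset above.
    break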

1
egs/aishell3/TTS/vits/utils.py
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/utils.py

1
egs/aishell3/TTS/vits/vits.py
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/vits.py

1
egs/aishell3/TTS/vits/wavenet.py
Symbolic link
@ -0,0 +1 @@
../../../ljspeech/TTS/vits/wavenet.py

@ -1,7 +1,10 @@
 # https://github.com/espnet/espnet/blob/master/espnet2/gan_tts/vits/monotonic_align/setup.py
 """Setup cython code."""

-from Cython.Build import cythonize
+try:
+    from Cython.Build import cythonize
+except ModuleNotFoundError as ex:
+    raise RuntimeError(f'{ex}\nPlease run:\n  pip install cython')
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext as _build_ext

@ -44,11 +44,11 @@ class Tokenizer(object):
             if len(info) == 1:
                 # case of space
                 token = " "
-                id = int(info[0])
+                idx = int(info[0])
             else:
-                token, id = info[0], int(info[1])
+                token, idx = info[0], int(info[1])
             assert token not in self.token2id, token
-            self.token2id[token] = id
+            self.token2id[token] = idx

         # Refer to https://github.com/rhasspy/piper/blob/master/TRAINING.md
         self.pad_id = self.token2id["_"]  # padding
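
The id -> idx rename avoids shadowing Python's builtin id(); once shadowed, any later call to id() in the same scope fails. A hypothetical illustration (not recipe code):

def parse(line: str) -> None:
    id = int(line)  # shadows the builtin id() for the rest of this scope
    # id(line) here would raise TypeError: 'int' object is not callable

def parse_fixed(line: str) -> None:
    idx = int(line)  # distinct name; the builtin id() stays usable
    print(idx, id(line))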

@ -66,7 +66,7 @@ class LJSpeechTtsDataModule:
     - cut concatenation,
     - on-the-fly feature extraction

-    This class should be derived for specific corpora used in ASR tasks.
+    This class should be derived for specific corpora used in TTS tasks.
     """

     def __init__(self, args: argparse.Namespace):
|
Loading…
x
Reference in New Issue
Block a user