add emilia data preparation pipeline

This commit is contained in:
pkufool 2025-06-17 19:38:46 +08:00
parent 60572c2444
commit 2376ed2117
14 changed files with 501 additions and 603 deletions

View File

@ -57,21 +57,20 @@ To generate speech with our pre-trained ZipVoice or ZipVoice-Distill models, use
### 1. Inference of a single sentence:
```bash
# Chinese example
python3 zipvoice/zipvoice_infer.py \
--model-name "zipvoice_distill" \
--prompt-wav assets/prompt-zh.wav \
--prompt-text "对,这就是我,万人敬仰的太乙真人,虽然有点婴儿肥,但也掩不住我逼人的帅气。" \
--text "欢迎使用我们的语音合成模型,希望它能给你带来惊喜!" \
--res-wav-path result-zh.wav
--prompt-wav prompt.wav \
--prompt-text "I am the transcription of the prompt wav." \
--text "I am the text to be synthesized." \
--res-wav-path result.wav
# English example
# Example with a pre-defined prompt wav and text
python3 zipvoice/zipvoice_infer.py \
--model-name "zipvoice_distill" \
--prompt-wav assets/prompt-en.wav \
--prompt-text "Some call me nature, others call me mother nature. I've been here for over four point five billion years, twenty two thousand five hundred times longer than you." \
--text "Welcome to use our tts model, have fun!" \
--res-wav-path result-en.wav
--res-wav-path result.wav
```
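To run the same example with the non-distilled model, change `--model-name`; the sketch below assumes `zipvoice` is the registered name of the base model:
```bash
python3 zipvoice/zipvoice_infer.py \
    --model-name "zipvoice" \
    --prompt-wav prompt.wav \
    --prompt-text "I am the transcription of the prompt wav." \
    --text "I am the text to be synthesized." \
    --res-wav-path result.wav
```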
### 2. Inference of a list of sentences:
@ -95,13 +94,29 @@ export HF_ENDPOINT=https://hf-mirror.com
The following steps show how to train a model from scratch on the Emilia and LibriTTS datasets, respectively.
### 0. Install dependencies for training
```bash
pip install -r ../../requirements.txt
```
### 1. Data Preparation
#### 1.1. Prepare the Emilia dataset
```bash
bash scripts/prepare_emilia.sh --stage 0 --stop-stage 4
```
See [scripts/prepare_emilia.sh](scripts/prepare_emilia.sh) for step-by-step instructions.
#### 1.2. Prepare the LibriTTS dataset
See [local/prepare_libritts.sh](local/prepare_libritts.sh)
```bash
bash scripts/prepare_libritts.sh --stage 0 --stop-stage 3
```
See [scripts/prepare_libritts.sh](scripts/prepare_libritts.sh) for step-by-step instructions.
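Both scripts read `--stage` and `--stop-stage` via `shared/parse_options.sh`, so a single stage can be re-run in isolation, e.g. to redo only the Emilia preprocessing stage:
```bash
bash scripts/prepare_emilia.sh --stage 2 --stop-stage 2
```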
### 2. Training

Binary file not shown.

View File

@ -0,0 +1,288 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Wei Kang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
from pathlib import Path
from typing import Optional
from concurrent.futures import ProcessPoolExecutor as Pool
import torch
from lhotse import (
CutSet,
LilcomChunkyWriter,
load_manifest_lazy,
set_audio_duration_mismatch_tolerance,
)
from feature import TorchAudioFbank, TorchAudioFbankConfig
import lhotse
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
def str2bool(v):
"""Used in argparse.ArgumentParser.add_argument to indicate
that a type is a bool type and the user can enter
- yes, true, t, y, 1, to represent True
- no, false, f, n, 0, to represent False
See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse # noqa
"""
if isinstance(v, bool):
return v
if v.lower() in ("yes", "true", "t", "y", "1"):
return True
elif v.lower() in ("no", "false", "f", "n", "0"):
return False
else:
raise argparse.ArgumentTypeError("Boolean value expected.")
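# Illustrative use: with
#   parser.add_argument("--split-cuts", type=str2bool, default=False)
# the option accepts e.g. "--split-cuts true", "--split-cuts 1" or "--split-cuts no".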
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--sampling-rate",
type=int,
default=24000,
help="The target sampling rate, the audio will be resampled to this sampling_rate.",
)
parser.add_argument(
"--frame-shift",
type=int,
default=256,
help="Frame shift in samples",
)
parser.add_argument(
"--frame-length",
type=int,
default=1024,
help="Frame length in samples",
)
parser.add_argument(
"--num-mel-bins",
type=int,
default=100,
help="The num of mel filters.",
)
parser.add_argument(
"--dataset",
type=str,
help="Dataset name.",
)
parser.add_argument(
"--subset",
type=str,
help="The subset of the dataset.",
)
parser.add_argument(
"--source-dir",
type=str,
default="data/manifests",
help="The source directory of manifest files.",
)
parser.add_argument(
"--dest-dir",
type=str,
default="data/fbank",
help="The destination directory of manifest files.",
)
parser.add_argument(
"--split-cuts",
type=str2bool,
default=False,
help="Whether to use splited cuts.",
)
parser.add_argument(
"--split-begin",
type=int,
help="Start idx of splited cuts.",
)
parser.add_argument(
"--split-end",
type=int,
help="End idx of splited cuts.",
)
parser.add_argument(
"--batch-duration",
type=int,
default=1000,
help="The batch duration when computing the features.",
)
parser.add_argument(
"--num-jobs", type=int, default=20, help="The number of extractor workers."
)
return parser.parse_args()
def compute_fbank_split_single(params, idx):
lhotse.set_audio_duration_mismatch_tolerance(0.1)  # needed for Emilia
src_dir = Path(params.source_dir)
output_dir = Path(params.dest_dir)
num_mel_bins = params.num_mel_bins
if not src_dir.exists():
logging.error(f"{src_dir} not exists")
return
if not output_dir.exists():
output_dir.mkdir(parents=True, exist_ok=True)
num_digits = 8
config = TorchAudioFbankConfig(
sampling_rate=params.sampling_rate,
n_mels=params.num_mel_bins,
n_fft=params.frame_length,
hop_length=params.frame_shift,
)
extractor = TorchAudioFbank(config)
prefix = params.dataset
subset = params.subset
suffix = "jsonl.gz"
idx = f"{idx}".zfill(num_digits)
cuts_filename = f"{prefix}_cuts_{subset}.{idx}.{suffix}"
if (src_dir / cuts_filename).is_file():
logging.info(f"Loading manifests {src_dir / cuts_filename}")
cut_set = load_manifest_lazy(src_dir / cuts_filename)
else:
logging.warning(f"Raw {cuts_filename} not exists, skipping")
return
cut_set = cut_set.resample(params.sampling_rate)
if (output_dir / cuts_filename).is_file():
logging.info(f"{cuts_filename} already exists - skipping.")
return
logging.info(f"Processing {subset}.{idx} of {prefix}")
cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor,
storage_path=f"{output_dir}/{prefix}_feats_{subset}_{idx}",
num_workers=4,
batch_duration=params.batch_duration,
storage_type=LilcomChunkyWriter,
overwrite=True,
)
cut_set.to_file(output_dir / cuts_filename)
def compute_fbank_split(params):
if params.split_end < params.split_begin:
logging.warning(
f"Split begin should be smaller than split end, given "
f"{params.split_begin} -> {params.split_end}."
)
with Pool(max_workers=params.num_jobs) as pool:
futures = [
pool.submit(compute_fbank_split_single, params, i)
for i in range(params.split_begin, params.split_end)
]
# Wait for the workers and surface any exceptions they raised.
for f in futures:
f.result()
def compute_fbank(params):
src_dir = Path(params.source_dir)
output_dir = Path(params.dest_dir)
num_jobs = params.num_jobs
num_mel_bins = params.num_mel_bins
prefix = params.dataset
subset = params.subset
suffix = "jsonl.gz"
cut_set_name = f"{prefix}_cuts_{subset}.{suffix}"
if (src_dir / cut_set_name).is_file():
logging.info(f"Loading manifests {src_dir / cut_set_name}")
cut_set = load_manifest_lazy(src_dir / cut_set_name)
else:
recordings = load_manifest_lazy(
src_dir / f"{prefix}_recordings_{subset}.{suffix}"
)
supervisions = load_manifest_lazy(
src_dir / f"{prefix}_supervisions_{subset}.{suffix}"
)
cut_set = CutSet.from_manifests(
recordings=recordings,
supervisions=supervisions,
)
cut_set = cut_set.resample(params.sampling_rate)
config = TorchAudioFbankConfig(
sampling_rate=params.sampling_rate,
n_mels=params.num_mel_bins,
n_fft=params.frame_length,
hop_length=params.frame_shift,
)
extractor = TorchAudioFbank(config)
cuts_filename = f"{prefix}_cuts_{subset}.{suffix}"
if (output_dir / cuts_filename).is_file():
logging.info(f"{prefix} {subset} already exists - skipping.")
return
logging.info(f"Processing {subset} of {prefix}")
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/{prefix}_feats_{subset}",
num_jobs=num_jobs,
storage_type=LilcomChunkyWriter,
)
cut_set.to_file(output_dir / cuts_filename)
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
args = get_args()
logging.info(vars(args))
if args.split_cuts:
compute_fbank_split(params=args)
else:
compute_fbank(params=args)
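A usage sketch for this script, mirroring how the preparation scripts below invoke it; the subset and directory values are the defaults assumed elsewhere in this commit:
```bash
# Whole-subset extraction (LibriTTS style).
python3 local/compute_fbank.py \
    --dataset libritts \
    --subset dev-clean \
    --source-dir data/manifests \
    --dest-dir data/fbank \
    --num-jobs 20
```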

View File

@ -1,140 +0,0 @@
#!/usr/bin/env python3
# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang,
# Zengwei Yao,)
# 2024 The Chinese Univ. of HK (authors: Zengrui Jin)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file computes fbank features of the LibriTTS dataset.
It looks for manifests in the directory data/manifests.
The generated fbank features are saved in data/fbank.
"""
import argparse
import logging
import os
from pathlib import Path
from typing import Optional
import torch
from feature import TorchAudioFbank, TorchAudioFbankConfig
from lhotse import CutSet, LilcomChunkyWriter
from lhotse.recipes.utils import read_manifests_if_cached
from icefall.utils import get_executor
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slows things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset",
type=str,
help="""Dataset parts to compute fbank. If None, we will use all""",
)
parser.add_argument(
"--sampling-rate",
type=int,
default=24000,
help="""Sampling rate of the waveform for computing fbank,
the default value for LibriTTS is 24000, waveform files will be
resampled if a different sample rate is provided""",
)
return parser.parse_args()
def compute_fbank_libritts(dataset: Optional[str] = None, sampling_rate: int = 24000):
src_dir = Path("data/manifests_libritts")
output_dir = Path("data/fbank_libritts")
num_jobs = min(32, os.cpu_count())
prefix = "libritts"
suffix = "jsonl.gz"
if dataset is None:
dataset_parts = (
"dev-clean",
"test-clean",
"train-clean-100",
"train-clean-360",
"train-other-500",
)
else:
dataset_parts = dataset.split(" ", -1)
manifests = read_manifests_if_cached(
dataset_parts=dataset_parts,
output_dir=src_dir,
prefix=prefix,
suffix=suffix,
)
assert manifests is not None
assert len(manifests) == len(dataset_parts), (
len(manifests),
len(dataset_parts),
list(manifests.keys()),
dataset_parts,
)
config = TorchAudioFbankConfig(
sampling_rate=sampling_rate,
n_mels=100,
n_fft=1024,
hop_length=256,
)
extractor = TorchAudioFbank(config)
with get_executor() as ex: # Initialize the executor only once.
for partition, m in manifests.items():
cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
if (output_dir / cuts_filename).is_file():
logging.info(f"{partition} already exists - skipping.")
continue
logging.info(f"Processing {partition}")
cut_set = CutSet.from_manifests(
recordings=m["recordings"],
supervisions=m["supervisions"],
)
if sampling_rate != 24000:
logging.info(f"Resampling waveforms to {sampling_rate}")
cut_set = cut_set.resample(sampling_rate)
cut_set = cut_set.compute_and_store_features(
extractor=extractor,
storage_path=f"{output_dir}/{prefix}_feats_{partition}",
# when an executor is specified, make more partitions
num_jobs=num_jobs if ex is None else 80,
executor=ex,
storage_type=LilcomChunkyWriter,
)
cut_set.to_file(output_dir / cuts_filename)
if __name__ == "__main__":
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
args = get_args()
logging.info(vars(args))
compute_fbank_libritts(dataset=args.dataset, sampling_rate=args.sampling_rate)

View File

@ -1,135 +0,0 @@
#!/usr/bin/env python3
# Copyright 2024 Xiaomi Corp. (authors: Han Zhu)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Union
import numpy as np
import torch
import torch.nn as nn
import torchaudio
from lhotse.features.base import FeatureExtractor, register_extractor
from lhotse.utils import Seconds, compute_num_frames
class MelSpectrogramFeatures(nn.Module):
def __init__(
self,
sampling_rate=24000,
n_mels=100,
n_fft=1024,
hop_length=256,
):
super().__init__()
self.mel_spec = torchaudio.transforms.MelSpectrogram(
sample_rate=sampling_rate,
n_fft=n_fft,
hop_length=hop_length,
n_mels=n_mels,
center=True,
power=1,
)
def forward(self, inp):
assert len(inp.shape) == 2
mel = self.mel_spec(inp)
logmel = mel.clamp(min=1e-7).log()
return logmel
@dataclass
class TorchAudioFbankConfig:
sampling_rate: int
n_mels: int
n_fft: int
hop_length: int
# Used by the `device` property below.
device: Union[str, torch.device] = "cpu"
@register_extractor
class TorchAudioFbank(FeatureExtractor):
name = "TorchAudioFbank"
config_type = TorchAudioFbankConfig
def __init__(self, config):
super().__init__(config=config)
def _feature_fn(self, sample):
fbank = MelSpectrogramFeatures(
sampling_rate=self.config.sampling_rate,
n_mels=self.config.n_mels,
n_fft=self.config.n_fft,
hop_length=self.config.hop_length,
)
return fbank(sample)
@property
def device(self) -> Union[str, torch.device]:
return self.config.device
def feature_dim(self, sampling_rate: int) -> int:
return self.config.n_mels
def extract(
self,
samples: Union[np.ndarray, torch.Tensor],
sampling_rate: int,
) -> Union[np.ndarray, torch.Tensor]:
# Check for sampling rate compatibility.
expected_sr = self.config.sampling_rate
assert sampling_rate == expected_sr, (
f"Mismatched sampling rate: extractor expects {expected_sr}, "
f"got {sampling_rate}"
)
is_numpy = False
if not isinstance(samples, torch.Tensor):
samples = torch.from_numpy(samples)
is_numpy = True
if len(samples.shape) == 1:
samples = samples.unsqueeze(0)
assert samples.ndim == 2, samples.shape
assert samples.shape[0] == 1, samples.shape
mel = self._feature_fn(samples).squeeze().t()
assert mel.ndim == 2, mel.shape
assert mel.shape[1] == self.config.n_mels, mel.shape
num_frames = compute_num_frames(
samples.shape[1] / sampling_rate, self.frame_shift, sampling_rate
)
if mel.shape[0] > num_frames:
mel = mel[:num_frames]
elif mel.shape[0] < num_frames:
mel = mel.unsqueeze(0)
mel = torch.nn.functional.pad(
mel, (0, 0, 0, num_frames - mel.shape[1]), mode="replicate"
).squeeze(0)
if is_numpy:
return mel.cpu().numpy()
else:
return mel
@property
def frame_shift(self) -> Seconds:
return self.config.hop_length / self.config.sampling_rate
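A minimal standalone sketch of the extractor above; the one-second random waveform stands in for real audio:
```python
import torch

from feature import TorchAudioFbank, TorchAudioFbankConfig

# 100-bin mel fbank at 24 kHz, matching the defaults used in this recipe.
config = TorchAudioFbankConfig(
    sampling_rate=24000,
    n_mels=100,
    n_fft=1024,
    hop_length=256,
)
extractor = TorchAudioFbank(config)

samples = torch.randn(1, 24000)  # (1, num_samples): one second of dummy audio
feats = extractor.extract(samples, sampling_rate=24000)
print(feats.shape)  # (num_frames, 100); frame_shift = 256 / 24000 s, about 10.7 ms
```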

View File

@ -0,0 +1 @@
../zipvoice/feature.py

View File

@ -20,20 +20,26 @@
"""
This file reads the texts in given manifest and save the new cuts with phoneme tokens.
This file reads the texts in the given manifests and saves the cleaned cuts.
"""
import argparse
import glob
import logging
import re
from concurrent.futures import ProcessPoolExecutor as Pool
import glob
import os
from pathlib import Path
from typing import List
import jieba
from lhotse import load_manifest_lazy
from tokenizer import Tokenizer, is_alphabet, is_chinese, is_hangul, is_japanese
from lhotse import CutSet, load_manifest_lazy
from concurrent.futures import ProcessPoolExecutor as Pool
from tokenizer import (
is_alphabet,
is_chinese,
is_hangul,
is_japanese,
tokenize_by_CJK_char,
)
def get_args():
@ -48,71 +54,32 @@ def get_args():
parser.add_argument(
"--jobs",
type=int,
default=50,
default=20,
help="Number of jobs to processing.",
)
parser.add_argument(
"--source-dir",
type=str,
default="data/manifests_emilia/splits",
default="data/manifests/splits_raw",
help="The source directory of manifest files.",
)
parser.add_argument(
"--dest-dir",
type=str,
default="data/manifests/splits",
help="The destination directory of manifest files.",
)
return parser.parse_args()
def tokenize_by_CJK_char(line: str) -> List[str]:
"""
Tokenize a line of text with CJK char.
Note: All returned characters will be upper case.
Example:
input = "你好世界是 hello world 的中文"
output = ["你", "好", "世", "界", "是", "HELLO", "WORLD", "的", "中", "文"]
Args:
line:
The input text.
Return:
A list of tokens split by CJK characters.
"""
# The CJK ranges are from https://github.com/alvations/nltk/blob/79eed6ddea0d0a2c212c1060b477fc268fec4d4b/nltk/tokenize/util.py
pattern = re.compile(
r"([\u1100-\u11ff\u2e80-\ua4cf\ua840-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F\uFF65-\uFFDC\U00020000-\U0002FFFF])"
)
chars = pattern.split(line.strip().upper())
char_list = []
for w in chars:
if w.strip():
char_list += w.strip().split()
return char_list
def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
def preprocess_emilia(file_name: str, input_dir: Path, output_dir: Path):
logging.info(f"Processing {file_name}")
if (output_dir / file_name).is_file():
logging.info(f"{file_name} exists, skipping.")
return
jieba.setLogLevel(logging.INFO)
tokenizer = Tokenizer()
def _prepare_cut(cut):
# Each cut only contains one supervision
assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
text = cut.supervisions[0].text
cut.supervisions[0].normalized_text = text
tokens = tokenizer.texts_to_tokens([text])[0]
cut.tokens = tokens
return cut
def _filter_cut(cut):
text = cut.supervisions[0].text
@ -124,10 +91,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
clean_chars = []
for x in text:
if is_hangul(x):
logging.info(f"Delete cut with text containing Korean : {text}")
logging.warning(f"Delete cut with text containing Korean : {text}")
return False
if is_japanese(x):
logging.info(f"Delete cut with text containing Japanese : {text}")
logging.warning(f"Delete cut with text containing Japanese : {text}")
return False
if is_chinese(x):
chinese.append(x)
@ -138,18 +105,19 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
if x == " ":
clean_chars.append(x)
if len(english) + len(chinese) == 0:
logging.info(f"Delete cut with text has no valid chars : {text}")
logging.warning(f"Delete cut with text has no valid chars : {text}")
return False
words = tokenize_by_CJK_char("".join(clean_chars))
for i in range(len(words) - 10):
if words[i : i + 10].count(words[i]) == 10:
logging.info(f"Delete cut with text with too much repeats : {text}")
logging.warning(f"Delete cut with text with too much repeats : {text}")
return False
# Keep cuts whose speaking rate is between 20 and 600 words per minute
# (e.g., a 60-second cut must contain between 20 and 600 words).
if duration < len(words) / 600 * 60 or duration > len(words) / 20 * 60:
logging.info(
f"Delete cut with audio text mismatch, duration : {duration}s, words : {len(words)}, text : {text}"
logging.warning(
f"Delete cut with audio text mismatch, duration : {duration}s, "
f"words : {len(words)}, text : {text}"
)
return False
return True
@ -157,11 +125,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
try:
cut_set = load_manifest_lazy(input_dir / file_name)
cut_set = cut_set.filter(_filter_cut)
cut_set = cut_set.map(_prepare_cut)
cut_set.to_file(output_dir / file_name)
except Exception as e:
logging.error(f"Manifest {file_name} failed with error: {e}")
# Remove the possibly partial output before re-raising.
if (output_dir / file_name).is_file():
os.remove(str(output_dir / file_name))
raise
if __name__ == "__main__":
@ -179,14 +146,11 @@ if __name__ == "__main__":
with Pool(max_workers=args.jobs) as pool:
futures = [
pool.submit(
prepare_tokens_emilia, filename.split("/")[-1], input_dir, output_dir
preprocess_emilia, filename.split("/")[-1], input_dir, output_dir
)
for filename in cut_files
]
for f in futures:
try:
f.result()
except Exception as e:
logging.error(f"Future failed with error: {e}")
logging.info("Processing done.")

View File

@ -0,0 +1 @@
../zipvoice/tokenizer.py

View File

@ -1,232 +0,0 @@
#!/usr/bin/env bash
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
# add icefall to PYTHONPATH
export PYTHONPATH=../../../:$PYTHONPATH
set -eou pipefail
stage=0
stop_stage=100
token_type=bpe # bpe, letter, phone
bpe_vocab_size=500
nj=32
dl_dir=$PWD/download
. shared/parse_options.sh || exit 1
# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "dl_dir: $dl_dir"
if [ $stage -le -2 ] && [ $stop_stage -ge -2 ]; then
if [ ! -d $dl_dir/xvector_nnet_1a_libritts_clean_460 ]; then
log "Downloading x-vector"
git clone https://huggingface.co/datasets/zrjin/xvector_nnet_1a_libritts_clean_460 $dl_dir/xvector_nnet_1a_libritts_clean_460
mkdir -p exp/xvector_nnet_1a/
cp -r $dl_dir/xvector_nnet_1a_libritts_clean_460/* exp/xvector_nnet_1a/
fi
fi
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
log "Stage -1: build monotonic_align lib"
if [ ! -d vits/monotonic_align/build ]; then
cd vits/monotonic_align
python setup.py build_ext --inplace
cd ../../
else
log "monotonic_align lib already built"
fi
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"
# If you have pre-downloaded it to /path/to/LibriTTS,
# you can create a symlink
#
# ln -sfv /path/to/LibriTTS $dl_dir/LibriTTS
#
if [ ! -d $dl_dir/LibriTTS ]; then
lhotse download libritts $dl_dir
fi
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare LibriTTS manifest"
# We assume that you have downloaded the LibriTTS corpus
# to $dl_dir/LibriTTS
mkdir -p data/manifests
if [ ! -e data/manifests/.libritts.done ]; then
lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
touch data/manifests/.libritts.done
fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Compute Fbank for LibriTTS"
mkdir -p data/fbank
for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
python local/compute_fbank.py --dataset libritts --subset ${subset}
done
# Here we shuffle and combine the train-clean-100, train-clean-360 and
# train-other-500 together to form the training set.
if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
<(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
fi
if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \
shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz
fi
if [ ! -e data/fbank/.libritts-validated.done ]; then
log "Validating data/fbank for LibriTTS"
./local/validate_manifest.py \
data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
touch data/fbank/.libritts-validated.done
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Prepare tokens.txt"
if [ $token_type == "bpe" ] || [ $token_type == "letter" ]; then
if [ ! -e data/texts.txt ]; then
./local/export_normalized_texts.py --output data/texts.txt \
--manifests data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
fi
fi
if [ $token_type == "bpe" ]; then
mkdir -p data/lang_bpe_${bpe_vocab_size}
if [ ! -e data/lang_bpe_${bpe_vocab_size}/tokens.txt ]; then
./local/train_bpe_model.py --transcript data/texts.txt \
--lang-dir data/lang_bpe_${bpe_vocab_size} \
--vocab-size $bpe_vocab_size
fi
fi
if [ $token_type == "phone" ]; then
mkdir -p data/lang_phone
./local/export_tokens.py --token-type phone \
--output data/lang_phone/tokens.txt
fi
if [ $token_type == "letter" ]; then
mkdir -p data/lang_letter
./local/export_tokens.py --token-type letter \
--texts data/texts.txt \
--output data/lang_letter/tokens.txt
fi
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Download and prepare librispeech-pc test clean for testing."
if [ ! -e $dl_dir/test-clean.tar.gz ]; then
wget https://huggingface.co/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir
fi
# For China users.
if [ ! -e $dl_dir/test-clean.tar.gz ]; then
wget https://hf-mirror.com/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir
fi
if [ ! -d $dl_dir/LibriSpeech/test-clean ]; then
tar -xvf $dl_dir/test-clean.tar.gz -C $dl_dir
fi
mkdir -p $dl_dir/LibriSpeech-PC
if [ ! -e $dl_dir/LibriSpeech-PC/test-clean.json ]; then
wget https://us.openslr.org/resources/145/manifests.tar.gz -P $dl_dir/LibriSpeech-PC
tar -xvf $dl_dir/LibriSpeech-PC/manifests.tar.gz -C $dl_dir/LibriSpeech-PC
fi
python local/compute_fbank.py --dataset librispeech --subset test-clean
python local/prepare_prompts_librispeech_test_clean.py
fi
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
log "Stage 5: Compute Spectrogram for LibriTTS (for VITS system)"
mkdir -p data/spectrogram
if [ ! -e data/spectrogram/.libritts.done ]; then
./local/compute_spectrogram_libritts.py --sampling-rate $sampling_rate
touch data/spectrogram/.libritts.done
fi
# Here we shuffle and combine the train-clean-100, train-clean-360 and
# train-other-500 together to form the training set.
if [ ! -f data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz ]; then
cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) \
<(gunzip -c data/spectrogram/libritts_cuts_train-other-500.jsonl.gz) | \
shuf | gzip -c > data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
fi
# Here we shuffle and combine the train-clean-100, train-clean-360
# together to form the training set.
if [ ! -f data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz ]; then
cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) | \
shuf | gzip -c > data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz
fi
if [ ! -e data/spectrogram/.libritts-validated.done ]; then
log "Validating data/spectrogram for LibriTTS"
./local/validate_manifest.py \
data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
touch data/spectrogram/.libritts-validated.done
fi
fi
audio_feats_dir=data/tokenized
dataset_parts="--dataset-parts all" # debug "-p dev-clean -p test-clean"
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
log "Stage 6: Tokenize/Fbank LibriTTS for valle"
mkdir -p ${audio_feats_dir}
if [ ! -e ${audio_feats_dir}/.libritts.tokenize.done ]; then
python3 ./local/compute_neural_codec_and_prepare_text_tokens.py --dataset-parts "${dataset_parts}" \
--audio-extractor "Encodec" \
--batch-duration 400 \
--src-dir "data/manifests" \
--output-dir "${audio_feats_dir}"
fi
touch ${audio_feats_dir}/.libritts.tokenize.done
lhotse combine \
${audio_feats_dir}/libritts_cuts_train-clean-100.jsonl.gz \
${audio_feats_dir}/libritts_cuts_train-clean-360.jsonl.gz \
${audio_feats_dir}/libritts_cuts_train-other-500.jsonl.gz \
${audio_feats_dir}/cuts_train.jsonl.gz
lhotse copy \
${audio_feats_dir}/libritts_cuts_dev-clean.jsonl.gz \
${audio_feats_dir}/cuts_dev.jsonl.gz
lhotse copy \
${audio_feats_dir}/libritts_cuts_test-clean.jsonl.gz \
${audio_feats_dir}/cuts_test.jsonl.gz
fi

View File

@ -0,0 +1,126 @@
#!/usr/bin/env bash
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
set -eou pipefail
stage=0
stop_stage=5
sampling_rate=24000
nj=32
dl_dir=$PWD/download
. shared/parse_options.sh || exit 1
# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data
log() {
# This function is from espnet
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
log "dl_dir: $dl_dir"
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
log "Stage 0: Download data"
# Your download directory should look like this:
#
# download/Amphion___Emilia
# ├── metafile.yaml
# ├── raw
# │ ├── DE
# │ ├── EN
# │ ├── FR
# │ ├── JA
# │ ├── KO
# │ ├── openemilia_45batches.tar.gz
# │ ├── openemilia_all.tar.gz
# │ └── ZH
# └── README.md
if [ ! -d $dl_dir/Amphion___Emilia/raw ]; then
log "Please refer https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset."
exit(-1)
fi
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
log "Stage 1: Prepare emilia manifests (EN and ZH only)"
# We assume that you have downloaded the Emilia corpus
# to $dl_dir/Amphion___Emilia
# see stage 0 for the directory structure
mkdir -p data/manifests
if [ ! -e data/manifests/.emilia.done ]; then
lhotse prepare emilia --lang en --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
lhotse prepare emilia --lang zh --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
touch data/manifests/.emilia.done
fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Preprocess Emilia dataset, mainly for cleaning"
mkdir -p data/manifests/splits_raw
if [ ! -e data/manifests/splits_raw/.emilia.split.done ]; then
lhotse split-lazy data/manifests/emilia_cuts_EN.jsonl.gz data/manifests/splits_raw 10000
lhotse split-lazy data/manifests/emilia_cuts_ZH.jsonl.gz data/manifests/splits_raw 10000
touch data/manifests/splits_raw/.emilia.split.done
fi
mkdir -p data/manifests/splits
if [ ! -e data/manifests/splits/.emilia.preprocess.done ]; then
python local/preprocess_emilia.py --subset EN
python local/preprocess_emilia.py --subset ZH
touch data/manifests/splits/.emilia.preprocess.done
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 3: Extract Fbank for Emilia"
mkdir -p data/fbank/emilia_splits
if [ ! -e data/fbank/emilia_splits/.emilia.fbank.done ]; then
# You can speed up the extraction by distributing splits to multiple machines.
for subset in EN ZH; do
python local/compute_fbank.py \
--source-dir data/manifests/splits \
--dest-dir data/fbank/emilia_splits \
--dataset emilia \
--subset ${subset} \
--split-cuts 1 \
--split-begin 0 \
--split-end 2000 \
--num-jobs ${nj}
done
touch data/fbank/emilia_splits/.emilia.fbank.done
fi
if [ ! -e data/fbank/emilia_cuts_EN.jsonl.gz ]; then
log "Combining EN fbank cuts and spliting EN dev set"
gunzip -c data/fbank/emilia_splits/emilia_cuts_EN.*.jsonl.gz > data/fbank/emilia_cuts_EN.jsonl
head -n 1500 data/fbank/emilia_cuts_EN.jsonl | gzip -c > data/fbank/emilia_cuts_EN_dev.jsonl.gz
sed -i '1,1500d' data/fbank/emilia_cuts_EN.jsonl
gzip data/fbank/emilia_cuts_EN.jsonl
fi
if [ ! -e data/fbank/emilia_cuts_ZH.jsonl.gz ]; then
log "Combining ZH fbank cuts and spliting ZH dev set"
gunzip -c data/fbank/emilia_splits/emilia_cuts_ZH.*.jsonl.gz > data/fbank/emilia_cuts_ZH.jsonl
head -n 1500 data/fbank/emilia_cuts_ZH.jsonl | gzip -c > data/fbank/emilia_cuts_ZH_dev.jsonl.gz
sed -i '1,1500d' data/fbank/emilia_cuts_ZH.jsonl
gzip data/fbank/emilia_cuts_ZH.jsonl
fi
fi
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Generate token file"
if [ ! -e data/tokens_emilia.txt ]; then
./local/prepare_token_file_emilia.py --tokens data/tokens_emilia.txt
fi
fi
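Since each split manifest is processed independently in stage 3, the extraction can be distributed by giving each machine a disjoint `--split-begin`/`--split-end` range (the boundaries below are illustrative; indices with no matching split file are skipped with a warning):
```bash
# Machine 1
python3 local/compute_fbank.py \
    --dataset emilia --subset EN \
    --source-dir data/manifests/splits \
    --dest-dir data/fbank/emilia_splits \
    --split-cuts 1 --split-begin 0 --split-end 1000
# Machine 2
python3 local/compute_fbank.py \
    --dataset emilia --subset EN \
    --source-dir data/manifests/splits \
    --dest-dir data/fbank/emilia_splits \
    --split-cuts 1 --split-begin 1000 --split-end 2000
```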

View File

@ -8,7 +8,7 @@ set -eou pipefail
stage=0
stop_stage=5
sampling_rate=24000
nj=32
nj=20
dl_dir=$PWD/download
@ -44,44 +44,53 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
# to $dl_dir/LibriTTS
mkdir -p data/manifests_libritts
if [ ! -e data/manifests_libritts/.libritts.done ]; then
lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests_libritts
touch data/manifests_libritts/.libritts.done
lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
touch data/manifests/.libritts.done
fi
fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Compute Fbank for LibriTTS"
mkdir -p data/fbank
if [ ! -e data/fbank_libritts/.libritts.done ]; then
./local/compute_fbank_libritts.py --sampling-rate $sampling_rate
touch data/fbank_libritts/.libritts.done
if [ ! -e data/fbank/.libritts.done ]; then
for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
python local/compute_fbank.py \
--source-dir data/manifests \
--dest-dir data/fbank \
--dataset libritts \
--subset ${subset} \
--sampling-rate $sampling_rate \
--num-jobs ${nj}
done
touch data/fbank/.libritts.done
fi
# Here we shuffle and combine the train-clean-100, train-clean-360 and
# train-other-500 together to form the training set.
if [ ! -f data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz ]; then
cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) \
<(gunzip -c data/fbank_libritts/libritts_cuts_train-other-500.jsonl.gz) | \
shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz
if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
<(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
fi
if [ ! -f data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz ]; then
cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) | \
shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz
if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \
shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz
fi
if [ ! -e data/fbank_libritts/.libritts-validated.done ]; then
if [ ! -e data/fbank/.libritts-validated.done ]; then
log "Validating data/fbank for LibriTTS"
./local/validate_manifest.py \
data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz
touch data/fbank_libritts/.libritts-validated.done
data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
touch data/fbank/.libritts-validated.done
fi
fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "Stage 4: Generate token file"
log "Stage 3: Generate token file"
if [ ! -e data/tokens_libritts.txt ]; then
./local/prepare_token_file_libritts.py --tokens data/tokens_libritts.txt
fi

egs/zipvoice/shared Symbolic link
View File

@ -0,0 +1 @@
../../icefall/shared

View File

@ -31,7 +31,7 @@ python3 zipvoice/train_distill.py \
--base-lr 0.0005 \
--max-duration 500 \
--token-file "data/tokens_emilia.txt" \
--manifest-dir "data/fbank_emilia" \
--manifest-dir "data/fbank" \
--teacher-model zipvoice/exp_zipvoice/epoch-11-avg-4.pt \
--num-updates 60000 \
--distill-stage "first" \
@ -46,7 +46,7 @@ python3 zipvoice/train_distill.py \
--base-lr 0.0001 \
--max-duration 500 \
--token-file "data/tokens_emilia.txt" \
--manifest-dir "data/fbank_emilia" \
--manifest-dir "data/fbank" \
--teacher-model zipvoice/exp_zipvoice_distill_1stage/iter-60000-avg-7.pt \
--num-updates 2000 \
--distill-stage "second" \

View File

@ -29,7 +29,7 @@ python3 zipvoice/train_flow.py \
--lr-hours 30000 \
--lr-batches 7500 \
--token-file "data/tokens_emilia.txt" \
--manifest-dir "data/fbank_emilia" \
--manifest-dir "data/fbank" \
--num-epochs 11 \
--exp-dir zipvoice/exp_zipvoice
"""

View File

@ -347,14 +347,14 @@ class TtsDataModule:
train-clean-360 and train-other-500 cuts"
)
return load_manifest_lazy(
self.args.manifest_dir / "libritts_cuts_with_tokens_train-all-shuf.jsonl.gz"
self.args.manifest_dir / "libritts_cuts_train-all-shuf.jsonl.gz"
)
@lru_cache()
def dev_libritts_cuts(self) -> CutSet:
logging.info("About to get dev-clean cuts")
return load_manifest_lazy(
self.args.manifest_dir / "libritts_cuts_with_tokens_dev-clean.jsonl.gz"
self.args.manifest_dir / "libritts_cuts_dev-clean.jsonl.gz"
)