mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-12-09 14:05:33 +00:00
add emilia data preparation pipeline
This commit is contained in:
parent
60572c2444
commit
2376ed2117
@ -57,21 +57,20 @@ To generate speech with our pre-trained ZipVoice or ZipVoice-Distill models, use
|
||||
|
||||
### 1. Inference of a single sentence:
|
||||
```bash
|
||||
# Chinese example
|
||||
python3 zipvoice/zipvoice_infer.py \
|
||||
--model-name "zipvoice_distill" \
|
||||
--prompt-wav assets/prompt-zh.wav \
|
||||
--prompt-text "对,这就是我,万人敬仰的太乙真人,虽然有点婴儿肥,但也掩不住我逼人的帅气。" \
|
||||
--text "欢迎使用我们的语音合成模型,希望它能给你带来惊喜!" \
|
||||
--res-wav-path result-zh.wav
|
||||
--prompt-wav prompt.wav \
|
||||
--prompt-text "I am the transcription of the prompt wav." \
|
||||
--text "I am the text to be synthesized." \
|
||||
--res-wav-path result.wav
|
||||
|
||||
# English example
|
||||
# Example with a pre-defined prompt wav and text
|
||||
python3 zipvoice/zipvoice_infer.py \
|
||||
--model-name "zipvoice_distill" \
|
||||
--prompt-wav assets/prompt-en.wav \
|
||||
--prompt-text "Some call me nature, others call me mother nature. I've been here for over four point five billion years, twenty two thousand five hundred times longer than you." \
|
||||
--text "Welcome to use our tts model, have fun!" \
|
||||
--res-wav-path result-en.wav
|
||||
--res-wav-path result.wav
|
||||
```
|
||||
|
||||
### 2. Inference of a list of sentences:
|
||||
@ -95,13 +94,29 @@ export HF_ENDPOINT=https://hf-mirror.com
|
||||
|
||||
The following steps show how to train a model from scratch on Emilia and LibriTTS datasets, respectively.
|
||||
|
||||
### 0. Install dependencies for training
|
||||
|
||||
```bash
|
||||
pip install -r ../../requirements.txt
|
||||
```
|
||||
|
||||
### 1. Data Preparation
|
||||
|
||||
#### 1.1. Prepare the Emilia dataset
|
||||
|
||||
```bash
|
||||
bash scripts/prepare_emilia.sh --stage 0 --stop-stage 4
|
||||
```
|
||||
|
||||
See [scripts/prepare_emilia.sh](scripts/prepare_emilia.sh) for step-by-step instructions.
|
||||
|
||||
#### 1.2 Prepare the LibriTTS dataset
|
||||
|
||||
See [local/prepare_libritts.sh](local/prepare_libritts.sh)
|
||||
```bash
|
||||
bash scripts/prepare_libritts.sh --stage 0 --stop-stage 3
|
||||
```
|
||||
|
||||
See [scripts/prepare_libritts.sh](scripts/prepare_libritts.sh) for step-by-step instructions.
|
||||
|
||||
### 2. Training
|
||||
|
||||
|
||||
Binary file not shown.
288
egs/zipvoice/local/compute_fbank.py
Normal file
288
egs/zipvoice/local/compute_fbank.py
Normal file
@ -0,0 +1,288 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2025 Xiaomi Corp. (authors: Wei Kang)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from concurrent.futures import ProcessPoolExecutor as Pool
|
||||
|
||||
import torch
|
||||
from lhotse import (
|
||||
CutSet,
|
||||
LilcomChunkyWriter,
|
||||
load_manifest_lazy,
|
||||
set_audio_duration_mismatch_tolerance,
|
||||
)
|
||||
|
||||
from feature import TorchAudioFbank, TorchAudioFbankConfig
|
||||
import lhotse
|
||||
|
||||
# Torch's multithreaded behavior needs to be disabled or
|
||||
# it wastes a lot of CPU and slows things down.
|
||||
# Do this outside of main() in case it needs to take effect
|
||||
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||
torch.set_num_threads(1)
|
||||
torch.set_num_interop_threads(1)
|
||||
|
||||
|
||||
def str2bool(v):
    """Convert a command-line value into a bool.

    Intended to be passed as ``type=`` to
    ``argparse.ArgumentParser.add_argument`` so that the user can enter

      - yes, true, t, y, 1, to represent True
      - no, false, f, n, 0, to represent False

    Matching is case-insensitive and ignores surrounding whitespace.

    See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse  # noqa

    Args:
      v:
        A bool (returned unchanged) or a value whose string form is one of
        the spellings listed above.
    Returns:
      The parsed boolean.
    Raises:
      argparse.ArgumentTypeError: If ``v`` is not a recognised spelling.
    """
    if isinstance(v, bool):
        return v

    # Normalise once; str() keeps non-string inputs (e.g. the int 1) from
    # failing with AttributeError on ``.lower()``.
    token = str(v).strip().lower()
    if token in ("yes", "true", "t", "y", "1"):
        return True
    if token in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
|
||||
|
||||
|
||||
def get_args():
    """Parse the command-line options of the fbank computation script.

    ``--dataset`` and ``--subset`` are mandatory because both are interpolated
    into every manifest / feature file name; leaving them out would otherwise
    produce names such as ``None_cuts_None.jsonl.gz``.

    Returns:
      The ``argparse.Namespace`` produced by ``parser.parse_args()``.
    """
    parser = argparse.ArgumentParser()

    # Feature extraction settings.
    parser.add_argument(
        "--sampling-rate",
        type=int,
        default=24000,
        help="The target sampling rate, the audio will be resampled to this sampling_rate.",
    )

    parser.add_argument(
        "--frame-shift",
        type=int,
        default=256,
        help="Frame shift in samples",
    )

    parser.add_argument(
        "--frame-length",
        type=int,
        default=1024,
        help="Frame length in samples",
    )

    parser.add_argument(
        "--num-mel-bins",
        type=int,
        default=100,
        help="The num of mel filters.",
    )

    # Which data to process and where to read / write it.
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help="Dataset name.",
    )

    parser.add_argument(
        "--subset",
        type=str,
        required=True,
        help="The subset of the dataset.",
    )

    parser.add_argument(
        "--source-dir",
        type=str,
        default="data/manifests",
        help="The source directory of manifest files.",
    )

    parser.add_argument(
        "--dest-dir",
        type=str,
        default="data/fbank",
        help="The destination directory of manifest files.",
    )

    # Options for processing a manifest that was pre-split into numbered parts.
    parser.add_argument(
        "--split-cuts",
        type=str2bool,
        default=False,
        help="Whether to use splited cuts.",
    )

    parser.add_argument(
        "--split-begin",
        type=int,
        help="Start idx of splited cuts.",
    )

    parser.add_argument(
        "--split-end",
        type=int,
        help="End idx of splited cuts.",
    )

    parser.add_argument(
        "--batch-duration",
        type=int,
        default=1000,
        help="The batch duration when computing the features.",
    )

    parser.add_argument(
        "--num-jobs", type=int, default=20, help="The number of extractor workers."
    )

    return parser.parse_args()
|
||||
|
||||
|
||||
def compute_fbank_split_single(params, idx):
    """Compute and store fbank features for one numbered split of a subset.

    The split manifest is expected at
    ``<source_dir>/<dataset>_cuts_<subset>.<idx zero-padded to 8 digits>.jsonl.gz``.
    The resulting cut manifest is written under the same file name in
    ``dest_dir``, next to the feature storage
    ``<dataset>_feats_<subset>_<idx>``.

    Nothing is done (apart from logging) when the source directory or the
    split manifest is missing, or when the output manifest already exists.

    Args:
      params:
        Parsed command-line options; the attributes read here are
        ``source_dir``, ``dest_dir``, ``dataset``, ``subset``,
        ``sampling_rate``, ``num_mel_bins``, ``frame_length``,
        ``frame_shift`` and ``batch_duration``.
      idx:
        Index of the split to process.
    """
    lhotse.set_audio_duration_mismatch_tolerance(0.1)  # for emilia
    src_dir = Path(params.source_dir)
    output_dir = Path(params.dest_dir)

    if not src_dir.exists():
        logging.error(f"{src_dir} not exists")
        return

    # exist_ok=True already covers the "directory is there" case, including
    # when another process creates it at the same moment.
    output_dir.mkdir(parents=True, exist_ok=True)

    num_digits = 8

    prefix = params.dataset
    subset = params.subset
    suffix = "jsonl.gz"

    idx = f"{idx}".zfill(num_digits)
    cuts_filename = f"{prefix}_cuts_{subset}.{idx}.{suffix}"

    if not (src_dir / cuts_filename).is_file():
        logging.warning(f"Raw {cuts_filename} not exists, skipping")
        return

    if (output_dir / cuts_filename).is_file():
        logging.info(f"{cuts_filename} already exists - skipping.")
        return

    # Only build the manifest and the extractor once we know there is work.
    logging.info(f"Loading manifests {src_dir / cuts_filename}")
    cut_set = load_manifest_lazy(src_dir / cuts_filename)
    cut_set = cut_set.resample(params.sampling_rate)

    config = TorchAudioFbankConfig(
        sampling_rate=params.sampling_rate,
        n_mels=params.num_mel_bins,
        n_fft=params.frame_length,
        hop_length=params.frame_shift,
    )
    extractor = TorchAudioFbank(config)

    logging.info(f"Processing {subset}.{idx} of {prefix}")

    cut_set = cut_set.compute_and_store_features_batch(
        extractor=extractor,
        storage_path=f"{output_dir}/{prefix}_feats_{subset}_{idx}",
        num_workers=4,
        batch_duration=params.batch_duration,
        storage_type=LilcomChunkyWriter,
        overwrite=True,
    )
    cut_set.to_file(output_dir / cuts_filename)
|
||||
|
||||
|
||||
def compute_fbank_split(params):
    """Compute fbank features for the splits ``[split_begin, split_end)``.

    Each split index is handed to ``compute_fbank_split_single`` in a process
    pool of ``params.num_jobs`` workers; an exception raised in a worker is
    re-raised here.

    Args:
      params:
        Parsed command-line options. ``split_begin``, ``split_end`` and
        ``num_jobs`` are read here; the rest is forwarded to the workers.
    Raises:
      ValueError: If ``split_begin`` or ``split_end`` was not provided.
    """
    begin, end = params.split_begin, params.split_end

    # Neither option has a default, so comparing them blindly would fail
    # with an opaque TypeError when one was omitted.
    if begin is None or end is None:
        raise ValueError(
            "Both --split-begin and --split-end are required when "
            f"--split-cuts is enabled, given {begin} -> {end}."
        )

    if end < begin:
        logging.warning(
            f"Split begin should be smaller than split end, given "
            f"{params.split_begin} -> {params.split_end}."
        )
        # The range is empty; do not start a worker pool for nothing.
        return

    with Pool(max_workers=params.num_jobs) as pool:
        futures = [
            pool.submit(compute_fbank_split_single, params, i)
            for i in range(begin, end)
        ]
        for f in futures:
            # result() blocks until the job finishes and re-raises its error.
            f.result()
|
||||
|
||||
|
||||
def compute_fbank(params):
    """Compute and store fbank features for a whole (unsplit) subset.

    The cuts are loaded from ``<source_dir>/<dataset>_cuts_<subset>.jsonl.gz``
    when that file exists; otherwise they are built from the matching
    ``_recordings_`` and ``_supervisions_`` manifests. The resulting cut
    manifest is written to ``dest_dir`` under the same cuts file name, next
    to the feature storage ``<dataset>_feats_<subset>``.

    Nothing is done when the output manifest already exists.

    Args:
      params:
        Parsed command-line options; the attributes read here are
        ``source_dir``, ``dest_dir``, ``num_jobs``, ``dataset``, ``subset``,
        ``sampling_rate``, ``num_mel_bins``, ``frame_length`` and
        ``frame_shift``.
    """
    src_dir = Path(params.source_dir)
    output_dir = Path(params.dest_dir)
    num_jobs = params.num_jobs

    prefix = params.dataset
    subset = params.subset
    suffix = "jsonl.gz"

    # Input and output manifests share the same file name.
    cuts_filename = f"{prefix}_cuts_{subset}.{suffix}"

    # Check for finished work first so a re-run does not touch the sources.
    if (output_dir / cuts_filename).is_file():
        logging.info(f"{prefix} {subset} already exists - skipping.")
        return

    if (src_dir / cuts_filename).is_file():
        logging.info(f"Loading manifests {src_dir / cuts_filename}")
        cut_set = load_manifest_lazy(src_dir / cuts_filename)
    else:
        recordings = load_manifest_lazy(
            src_dir / f"{prefix}_recordings_{subset}.{suffix}"
        )
        supervisions = load_manifest_lazy(
            src_dir / f"{prefix}_supervisions_{subset}.{suffix}"
        )
        cut_set = CutSet.from_manifests(
            recordings=recordings,
            supervisions=supervisions,
        )

    cut_set = cut_set.resample(params.sampling_rate)

    config = TorchAudioFbankConfig(
        sampling_rate=params.sampling_rate,
        n_mels=params.num_mel_bins,
        n_fft=params.frame_length,
        hop_length=params.frame_shift,
    )
    extractor = TorchAudioFbank(config)

    # Same as compute_fbank_split_single: make sure the destination exists.
    output_dir.mkdir(parents=True, exist_ok=True)

    logging.info(f"Processing {subset} of {prefix}")

    cut_set = cut_set.compute_and_store_features(
        extractor=extractor,
        storage_path=f"{output_dir}/{prefix}_feats_{subset}",
        num_jobs=num_jobs,
        storage_type=LilcomChunkyWriter,
    )
    cut_set.to_file(output_dir / cuts_filename)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Timestamped log lines that also name the emitting file and line.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )

    args = get_args()
    logging.info(vars(args))

    # --split-cuts selects the per-split pipeline; otherwise the whole
    # subset is processed in one go.
    entry_point = compute_fbank_split if args.split_cuts else compute_fbank
    entry_point(params=args)
|
||||
@ -1,140 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang,
|
||||
# Zengwei Yao,)
|
||||
# 2024 The Chinese Univ. of HK (authors: Zengrui Jin)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
"""
|
||||
This file computes fbank features of the LibriTTS dataset.
|
||||
It looks for manifests in the directory data/manifests.
|
||||
|
||||
The generated fbank features are saved in data/fbank.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from feature import TorchAudioFbank, TorchAudioFbankConfig
|
||||
from lhotse import CutSet, LilcomChunkyWriter
|
||||
from lhotse.recipes.utils import read_manifests_if_cached
|
||||
|
||||
from icefall.utils import get_executor
|
||||
|
||||
# Torch's multithreaded behavior needs to be disabled or
|
||||
# it wastes a lot of CPU and slows things down.
|
||||
# Do this outside of main() in case it needs to take effect
|
||||
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||
torch.set_num_threads(1)
|
||||
torch.set_num_interop_threads(1)
|
||||
|
||||
|
||||
def get_args():
    """Build the argument parser of this script and parse ``sys.argv``.

    Returns:
      An ``argparse.Namespace`` with the attributes ``dataset`` (str or None)
      and ``sampling_rate`` (int, 24000 by default).
    """
    arg_parser = argparse.ArgumentParser()

    arg_parser.add_argument(
        "--dataset",
        type=str,
        help="""Dataset parts to compute fbank. If None, we will use all""",
    )
    arg_parser.add_argument(
        "--sampling-rate",
        type=int,
        default=24000,
        help="""Sampling rate of the waveform for computing fbank,
        the default value for LibriTTS is 24000, waveform files will be
        resampled if a different sample rate is provided""",
    )

    parsed = arg_parser.parse_args()
    return parsed
|
||||
|
||||
|
||||
def compute_fbank_libritts(dataset: Optional[str] = None, sampling_rate: int = 24000):
    """Compute fbank features for the requested LibriTTS partitions.

    Manifests are read from ``data/manifests_libritts``; the features and the
    resulting cut manifests are written to ``data/fbank_libritts``. A
    partition whose cut manifest already exists in the output directory is
    skipped, and processing carries on with the next partition.

    Args:
      dataset:
        Space-separated partition names. If None, dev-clean, test-clean,
        train-clean-100, train-clean-360 and train-other-500 are processed.
      sampling_rate:
        Sampling rate given to the extractor; waveforms are resampled when
        it differs from 24000.
    """
    src_dir = Path("data/manifests_libritts")
    output_dir = Path("data/fbank_libritts")
    # os.cpu_count() may return None when the count cannot be determined.
    num_jobs = min(32, os.cpu_count() or 1)

    prefix = "libritts"
    suffix = "jsonl.gz"
    if dataset is None:
        dataset_parts = (
            "dev-clean",
            "test-clean",
            "train-clean-100",
            "train-clean-360",
            "train-other-500",
        )
    else:
        dataset_parts = dataset.split(" ", -1)

    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None

    # Every requested partition must have been found among the manifests.
    assert len(manifests) == len(dataset_parts), (
        len(manifests),
        len(dataset_parts),
        list(manifests.keys()),
        dataset_parts,
    )

    config = TorchAudioFbankConfig(
        sampling_rate=sampling_rate,
        n_mels=100,
        n_fft=1024,
        hop_length=256,
    )
    extractor = TorchAudioFbank(config)

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
            if (output_dir / cuts_filename).is_file():
                logging.info(f"{partition} already exists - skipping.")
                # Skip just this partition. Returning here would silently
                # drop every partition that comes after it.
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if sampling_rate != 24000:
                logging.info(f"Resampling waveforms to {sampling_rate}")
                cut_set = cut_set.resample(sampling_rate)

            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / cuts_filename)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)

    # Actually honour --dataset / --sampling-rate: without this call the
    # options defined in get_args() were never parsed and the defaults of
    # compute_fbank_libritts() were always used.
    args = get_args()
    logging.info(vars(args))
    compute_fbank_libritts(dataset=args.dataset, sampling_rate=args.sampling_rate)
|
||||
@ -1,135 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright 2024 Xiaomi Corp. (authors: Han Zhu)
|
||||
#
|
||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torchaudio
|
||||
from lhotse.features.base import FeatureExtractor, register_extractor
|
||||
from lhotse.utils import Seconds, compute_num_frames
|
||||
|
||||
|
||||
class MelSpectrogramFeatures(nn.Module):
    """Log-mel spectrogram module built on ``torchaudio.transforms.MelSpectrogram``.

    The wrapped transform is configured with ``center=True`` and ``power=1``;
    its output is clamped from below at 1e-7 before the natural logarithm is
    taken.
    """

    def __init__(
        self,
        sampling_rate=24000,
        n_mels=100,
        n_fft=1024,
        hop_length=256,
    ):
        super().__init__()

        mel_kwargs = dict(
            sample_rate=sampling_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            center=True,
            power=1,
        )
        self.mel_spec = torchaudio.transforms.MelSpectrogram(**mel_kwargs)

    def forward(self, inp):
        """Return the log-mel features of ``inp``, which must be 2-dimensional."""
        assert len(inp.shape) == 2

        # The clamp keeps log() away from zero.
        return self.mel_spec(inp).clamp(min=1e-7).log()
|
||||
|
||||
|
||||
@dataclass
class TorchAudioFbankConfig:
    """Settings of the ``TorchAudioFbank`` feature extractor."""

    # Sampling rate the extractor expects its input audio to have.
    sampling_rate: int
    # Number of mel filters, i.e. the feature dimension.
    n_mels: int
    # FFT size, in samples.
    n_fft: int
    # Hop between successive frames, in samples.
    hop_length: int
|
||||
|
||||
|
||||
@register_extractor
class TorchAudioFbank(FeatureExtractor):
    """Lhotse feature extractor that yields log-mel features.

    The features are computed by ``MelSpectrogramFeatures`` using the values
    of a ``TorchAudioFbankConfig``.
    """

    name = "TorchAudioFbank"
    config_type = TorchAudioFbankConfig

    def __init__(self, config):
        super().__init__(config=config)

    def _feature_fn(self, sample):
        """Run ``MelSpectrogramFeatures`` (built from the config) on ``sample``."""
        # NOTE: a fresh module is constructed on every call.
        fbank = MelSpectrogramFeatures(
            sampling_rate=self.config.sampling_rate,
            n_mels=self.config.n_mels,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
        )

        return fbank(sample)

    @property
    def device(self) -> Union[str, torch.device]:
        # The config may not carry a ``device`` attribute; fall back to the
        # CPU instead of raising AttributeError.
        return getattr(self.config, "device", "cpu")

    def feature_dim(self, sampling_rate: int) -> int:
        """Return the feature dimension, i.e. the number of mel bins."""
        return self.config.n_mels

    def extract(
        self,
        samples: Union[np.ndarray, torch.Tensor],
        sampling_rate: int,
    ) -> Union[np.ndarray, torch.Tensor]:
        """Extract log-mel features from a single-channel waveform.

        Args:
          samples:
            Waveform of shape (num_samples,) or (1, num_samples), either a
            numpy array or a torch tensor.
          sampling_rate:
            Sampling rate of ``samples``; must equal the configured one.
        Returns:
          Features of shape (num_frames, n_mels), of the same kind (numpy
          array or torch tensor) as the input. ``num_frames`` is the value
          returned by lhotse's ``compute_num_frames``; the features are
          truncated or padded (by replicating the last frame) to that length.
        """
        # Check for sampling rate compatibility.
        expected_sr = self.config.sampling_rate
        assert sampling_rate == expected_sr, (
            f"Mismatched sampling rate: extractor expects {expected_sr}, "
            f"got {sampling_rate}"
        )
        is_numpy = False
        if not isinstance(samples, torch.Tensor):
            samples = torch.from_numpy(samples)
            is_numpy = True

        if len(samples.shape) == 1:
            samples = samples.unsqueeze(0)
        assert samples.ndim == 2, samples.shape
        assert samples.shape[0] == 1, samples.shape

        # Drop only the leading channel axis. A bare squeeze() would also
        # remove the time axis when just one frame is produced, leaving a
        # 1-D tensor that fails the check below.
        mel = self._feature_fn(samples).squeeze(0).t()

        assert mel.ndim == 2, mel.shape
        assert mel.shape[1] == self.config.n_mels, mel.shape

        num_frames = compute_num_frames(
            samples.shape[1] / sampling_rate, self.frame_shift, sampling_rate
        )

        if mel.shape[0] > num_frames:
            mel = mel[:num_frames]
        elif mel.shape[0] < num_frames:
            missing = num_frames - mel.shape[0]
            # Add a leading axis for the replicate padding, extend the frame
            # axis at its end, then remove the extra axis again.
            mel = torch.nn.functional.pad(
                mel.unsqueeze(0), (0, 0, 0, missing), mode="replicate"
            ).squeeze(0)

        if is_numpy:
            return mel.cpu().numpy()
        else:
            return mel

    @property
    def frame_shift(self) -> Seconds:
        # The hop is configured in samples; the frame shift is in seconds.
        return self.config.hop_length / self.config.sampling_rate
|
||||
1
egs/zipvoice/local/feature.py
Symbolic link
1
egs/zipvoice/local/feature.py
Symbolic link
@ -0,0 +1 @@
|
||||
../zipvoice/feature.py
|
||||
@ -20,20 +20,26 @@
|
||||
|
||||
|
||||
"""
|
||||
This file reads the texts in given manifest and save the new cuts with phoneme tokens.
|
||||
This file reads the texts in the given manifest and saves the cleaned new cuts.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import re
|
||||
from concurrent.futures import ProcessPoolExecutor as Pool
|
||||
import glob
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import jieba
|
||||
from lhotse import load_manifest_lazy
|
||||
from tokenizer import Tokenizer, is_alphabet, is_chinese, is_hangul, is_japanese
|
||||
from lhotse import CutSet, load_manifest_lazy
|
||||
from concurrent.futures import ProcessPoolExecutor as Pool
|
||||
|
||||
from tokenizer import (
|
||||
is_alphabet,
|
||||
is_chinese,
|
||||
is_hangul,
|
||||
is_japanese,
|
||||
tokenize_by_CJK_char,
|
||||
)
|
||||
|
||||
|
||||
def get_args():
|
||||
@ -48,71 +54,32 @@ def get_args():
|
||||
parser.add_argument(
|
||||
"--jobs",
|
||||
type=int,
|
||||
default=50,
|
||||
default=20,
|
||||
help="Number of jobs to processing.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--source-dir",
|
||||
type=str,
|
||||
default="data/manifests_emilia/splits",
|
||||
default="data/manifests/splits_raw",
|
||||
help="The source directory of manifest files.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--dest-dir",
|
||||
type=str,
|
||||
default="data/manifests/splits",
|
||||
help="The destination directory of manifest files.",
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def tokenize_by_CJK_char(line: str) -> List[str]:
    """
    Split a line of text so that every CJK character is its own token.

    Text between CJK characters is split on whitespace.

    Note: All returned tokens are upper case.

    Example:
      input = "你好世界是 hello world 的中文"
      output = [你, 好, 世, 界, 是, HELLO, WORLD, 的, 中, 文]

    Args:
      line:
        The input text.

    Return:
      The list of tokens.
    """
    # The CJK ranges is from https://github.com/alvations/nltk/blob/79eed6ddea0d0a2c212c1060b477fc268fec4d4b/nltk/tokenize/util.py
    cjk_char = re.compile(
        r"([\u1100-\u11ff\u2e80-\ua4cf\ua840-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F\uFF65-\uFFDC\U00020000-\U0002FFFF])"
    )
    # The capturing group keeps each CJK character as its own piece.
    pieces = cjk_char.split(line.strip().upper())
    # str.split() yields nothing for empty or blank pieces, so those vanish.
    return [token for piece in pieces for token in piece.split()]
|
||||
|
||||
|
||||
def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
|
||||
def preprocess_emilia(file_name: str, input_dir: Path, output_dir: Path):
|
||||
logging.info(f"Processing {file_name}")
|
||||
if (output_dir / file_name).is_file():
|
||||
logging.info(f"{file_name} exists, skipping.")
|
||||
return
|
||||
jieba.setLogLevel(logging.INFO)
|
||||
tokenizer = Tokenizer()
|
||||
|
||||
def _prepare_cut(cut):
|
||||
# Each cut only contains one supervision
|
||||
assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
|
||||
text = cut.supervisions[0].text
|
||||
cut.supervisions[0].normalized_text = text
|
||||
tokens = tokenizer.texts_to_tokens([text])[0]
|
||||
cut.tokens = tokens
|
||||
return cut
|
||||
|
||||
def _filter_cut(cut):
|
||||
text = cut.supervisions[0].text
|
||||
@ -124,10 +91,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
|
||||
clean_chars = []
|
||||
for x in text:
|
||||
if is_hangul(x):
|
||||
logging.info(f"Delete cut with text containing Korean : {text}")
|
||||
logging.warning(f"Delete cut with text containing Korean : {text}")
|
||||
return False
|
||||
if is_japanese(x):
|
||||
logging.info(f"Delete cut with text containing Japanese : {text}")
|
||||
logging.warning(f"Delete cut with text containing Japanese : {text}")
|
||||
return False
|
||||
if is_chinese(x):
|
||||
chinese.append(x)
|
||||
@ -138,18 +105,19 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
|
||||
if x == " ":
|
||||
clean_chars.append(x)
|
||||
if len(english) + len(chinese) == 0:
|
||||
logging.info(f"Delete cut with text has no valid chars : {text}")
|
||||
logging.warning(f"Delete cut with text has no valid chars : {text}")
|
||||
return False
|
||||
|
||||
words = tokenize_by_CJK_char("".join(clean_chars))
|
||||
for i in range(len(words) - 10):
|
||||
if words[i : i + 10].count(words[i]) == 10:
|
||||
logging.info(f"Delete cut with text with too much repeats : {text}")
|
||||
logging.warning(f"Delete cut with text with too much repeats : {text}")
|
||||
return False
|
||||
# word speed, 20 - 600 / minute
|
||||
if duration < len(words) / 600 * 60 or duration > len(words) / 20 * 60:
|
||||
logging.info(
|
||||
f"Delete cut with audio text mismatch, duration : {duration}s, words : {len(words)}, text : {text}"
|
||||
logging.warning(
|
||||
f"Delete cut with audio text mismatch, duration : {duration}s, "
|
||||
f"words : {len(words)}, text : {text}"
|
||||
)
|
||||
return False
|
||||
return True
|
||||
@ -157,11 +125,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
|
||||
try:
|
||||
cut_set = load_manifest_lazy(input_dir / file_name)
|
||||
cut_set = cut_set.filter(_filter_cut)
|
||||
cut_set = cut_set.map(_prepare_cut)
|
||||
cut_set.to_file(output_dir / file_name)
|
||||
except Exception as e:
|
||||
logging.error(f"Manifest {file_name} failed with error: {e}")
|
||||
raise
|
||||
os.remove(str(output_dir / file_name))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -179,14 +146,11 @@ if __name__ == "__main__":
|
||||
with Pool(max_workers=args.jobs) as pool:
|
||||
futures = [
|
||||
pool.submit(
|
||||
prepare_tokens_emilia, filename.split("/")[-1], input_dir, output_dir
|
||||
preprocess_emilia, filename.split("/")[-1], input_dir, output_dir
|
||||
)
|
||||
for filename in cut_files
|
||||
]
|
||||
for f in futures:
|
||||
try:
|
||||
f.result()
|
||||
f.done()
|
||||
except Exception as e:
|
||||
logging.error(f"Future failed with error: {e}")
|
||||
f.result()
|
||||
f.done()
|
||||
logging.info("Processing done.")
|
||||
1
egs/zipvoice/local/tokenizer.py
Symbolic link
1
egs/zipvoice/local/tokenizer.py
Symbolic link
@ -0,0 +1 @@
|
||||
../zipvoice/tokenizer.py
|
||||
@ -1,232 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
|
||||
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
|
||||
|
||||
# add icefall to PYTHONPATH
|
||||
export PYTHONPATH=../../../:$PYTHONPATH
|
||||
|
||||
set -eou pipefail
|
||||
|
||||
stage=0
stop_stage=100

token_type=bpe # bpe, letter, phone
bpe_vocab_size=500

nj=32

# Sampling rate handed to local/compute_spectrogram_libritts.py in Stage 5.
# Without a default here, Stage 5 expands an unset variable, which aborts the
# script because of the `u` (nounset) flag in the `set` line above.
# Presumably overridable on the command line like the other options
# (via shared/parse_options.sh) - TODO confirm.
sampling_rate=24000

dl_dir=$PWD/download
|
||||
|
||||
. shared/parse_options.sh || exit 1
|
||||
|
||||
# All files generated by this script are saved in "data".
|
||||
# You can safely remove "data" and rerun this script to regenerate it.
|
||||
mkdir -p data
|
||||
|
||||
log() {
  # Print the arguments prefixed with a timestamp and the caller's
  # file name, line number and function name.
  # This function is from espnet
  local caller_file=${BASH_SOURCE[1]##*/}
  local caller_line=${BASH_LINENO[0]}
  local caller_func=${FUNCNAME[1]}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${caller_file}:${caller_line}:${caller_func}) $*"
}
|
||||
|
||||
log "dl_dir: $dl_dir"
|
||||
|
||||
|
||||
if [ $stage -le -2 ] && [ $stop_stage -ge -2 ]; then
|
||||
|
||||
if [ ! -d $dl_dir/xvector_nnet_1a_libritts_clean_460 ]; then
|
||||
log "Downloading x-vector"
|
||||
|
||||
git clone https://huggingface.co/datasets/zrjin/xvector_nnet_1a_libritts_clean_460 $dl_dir/xvector_nnet_1a_libritts_clean_460
|
||||
|
||||
mkdir -p exp/xvector_nnet_1a/
|
||||
cp -r $dl_dir/xvector_nnet_1a_libritts_clean_460/* exp/xvector_nnet_1a/
|
||||
fi
|
||||
|
||||
fi
|
||||
|
||||
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
|
||||
log "Stage -1: build monotonic_align lib"
|
||||
if [ ! -d vits/monotonic_align/build ]; then
|
||||
cd vits/monotonic_align
|
||||
python setup.py build_ext --inplace
|
||||
cd ../../
|
||||
else
|
||||
log "monotonic_align lib already built"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
||||
log "Stage 0: Download data"
|
||||
|
||||
# If you have pre-downloaded it to /path/to/LibriTTS,
|
||||
# you can create a symlink
|
||||
#
|
||||
# ln -sfv /path/to/LibriTTS $dl_dir/LibriTTS
|
||||
#
|
||||
if [ ! -d $dl_dir/LibriTTS ]; then
|
||||
lhotse download libritts $dl_dir
|
||||
fi
|
||||
|
||||
fi
|
||||
|
||||
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
||||
log "Stage 1: Prepare LibriTTS manifest"
|
||||
# We assume that you have downloaded the LibriTTS corpus
|
||||
# to $dl_dir/LibriTTS
|
||||
mkdir -p data/manifests
|
||||
if [ ! -e data/manifests/.libritts.done ]; then
|
||||
lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
|
||||
touch data/manifests/.libritts.done
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||
log "Stage 2: Compute Fbank for LibriTTS"
|
||||
mkdir -p data/fbank
|
||||
|
||||
for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
|
||||
python local/compute_fbank.py --dataset libritts --subset ${subset}
|
||||
done
|
||||
|
||||
# Here we shuffle and combine the train-clean-100, train-clean-360 and
|
||||
# train-other-500 together to form the training set.
|
||||
if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
|
||||
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
|
||||
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
|
||||
<(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
|
||||
shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
|
||||
fi
|
||||
|
||||
if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then
|
||||
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
|
||||
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \
|
||||
shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz
|
||||
fi
|
||||
|
||||
if [ ! -e data/fbank/.libritts-validated.done ]; then
|
||||
log "Validating data/fbank for LibriTTS"
|
||||
./local/validate_manifest.py \
|
||||
data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
|
||||
touch data/fbank/.libritts-validated.done
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
log "Stage 3: Prepare tokens.txt"
|
||||
|
||||
if [ $token_type == "bpe" ] || [ $token_type == "letter" ]; then
|
||||
if [ ! -e data/texts.txt ]; then
|
||||
./local/export_normalized_texts.py --output data/texts.txt \
|
||||
--manifests data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $token_type == "bpe" ]; then
|
||||
mkdir -p data/lang_bpe_${bpe_vocab_size}
|
||||
if [ ! -e data/lang_bpe_${bpe_vocab_size}/tokens.txt ]; then
|
||||
./local/train_bpe_model.py --transcript data/texts.txt \
|
||||
--lang-dir data/lang_bpe_${bpe_vocab_size} \
|
||||
--vocab-size $bpe_vocab_size
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $token_type == "phone" ]; then
|
||||
mkdir -p data/lang_phone
|
||||
./local/export_tokens.py --token-type phone \
|
||||
--output data/lang_phone/tokens.txt
|
||||
fi
|
||||
|
||||
if [ $token_type == "letter" ]; then
|
||||
mkdir -p data/lang_letter
|
||||
./local/export_tokens.py --token-type letter \
|
||||
--texts data/texts.txt \
|
||||
--output data/lang_letter/tokens.txt
|
||||
fi
|
||||
fi
|
||||
|
||||
# Stage 4: fetch LibriSpeech test-clean and the LibriSpeech-PC manifests,
# then compute features and prepare the prompts used for testing.
if [[ $stage -le 4 && $stop_stage -ge 4 ]]; then
  log "Stage 4: Download and prepare librispeech-pc test clean for testing."

  # Try the main host first; the second URL is the mirror for China users.
  # Each attempt only runs while the archive is still missing.
  for url in \
    https://huggingface.co/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz \
    https://hf-mirror.com/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz; do
    if [[ ! -e $dl_dir/test-clean.tar.gz ]]; then
      wget $url -P $dl_dir
    fi
  done

  # Unpack only when the extracted directory is not present.
  if [[ ! -d $dl_dir/LibriSpeech/test-clean ]]; then
    tar -xvf $dl_dir/test-clean.tar.gz -C $dl_dir
  fi

  mkdir -p $dl_dir/LibriSpeech-PC
  if [[ ! -e $dl_dir/LibriSpeech-PC/test-clean.json ]]; then
    wget https://us.openslr.org/resources/145/manifests.tar.gz -P $dl_dir/LibriSpeech-PC
    tar -xvf $dl_dir/LibriSpeech-PC/manifests.tar.gz -C $dl_dir/LibriSpeech-PC
  fi

  # These two steps have no "done" marker and run on every invocation.
  python local/compute_fbank.py --dataset librispeech --subset test-clean
  python local/prepare_prompts_librispeech_test_clean.py
fi
|
||||
|
||||
# Stage 5: compute spectrogram features for LibriTTS and build the combined,
# shuffled training manifests under data/spectrogram.
if [[ $stage -le 5 && $stop_stage -ge 5 ]]; then
  log "Stage 5: Compute Spectrogram for LibriTTS (for VITS system)"

  mkdir -p data/spectrogram
  if [[ ! -e data/spectrogram/.libritts.done ]]; then
    ./local/compute_spectrogram_libritts.py --sampling-rate $sampling_rate
    touch data/spectrogram/.libritts.done
  fi

  # train-all-shuf = train-clean-100 + train-clean-360 + train-other-500,
  # concatenated and shuffled.
  if [[ ! -f data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz ]]; then
    cat \
      <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
      <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) \
      <(gunzip -c data/spectrogram/libritts_cuts_train-other-500.jsonl.gz) \
      | shuf \
      | gzip -c > data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
  fi

  # train-clean-460 = train-clean-100 + train-clean-360, concatenated and
  # shuffled.
  if [[ ! -f data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz ]]; then
    cat \
      <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
      <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) \
      | shuf \
      | gzip -c > data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz
  fi

  # Only the full combined manifest is validated; the marker file prevents
  # re-validation on later runs.
  if [[ ! -e data/spectrogram/.libritts-validated.done ]]; then
    log "Validating data/spectrogram for LibriTTS"
    ./local/validate_manifest.py \
      data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
    touch data/spectrogram/.libritts-validated.done
  fi
fi
|
||||
|
||||
# Output directory for the codec tokens, and the dataset selection handed to
# the tokenizer script.
audio_feats_dir=data/tokenized
dataset_parts="--dataset-parts all" # debug "-p dev-clean -p test-clean"

# Stage 6: extract Encodec tokens for LibriTTS and assemble the
# train/dev/test cut manifests.
if [[ $stage -le 6 && $stop_stage -ge 6 ]]; then
  log "Stage 6: Tokenize/Fbank LibriTTS for valle"

  mkdir -p ${audio_feats_dir}
  if [[ ! -e ${audio_feats_dir}/.libritts.tokenize.done ]]; then
    # NOTE(review): $dataset_parts already starts with "--dataset-parts";
    # it is passed through unchanged as the option's value, exactly as
    # before. Confirm the Python script expects that form.
    python3 ./local/compute_neural_codec_and_prepare_text_tokens.py \
      --dataset-parts "${dataset_parts}" \
      --audio-extractor "Encodec" \
      --batch-duration 400 \
      --src-dir "data/manifests" \
      --output-dir "${audio_feats_dir}"
  fi
  # The marker is written outside the guard, i.e. on every run of the stage.
  touch ${audio_feats_dir}/.libritts.tokenize.done

  # Merge the three training subsets into a single manifest.
  lhotse combine \
    ${audio_feats_dir}/libritts_cuts_train-clean-100.jsonl.gz \
    ${audio_feats_dir}/libritts_cuts_train-clean-360.jsonl.gz \
    ${audio_feats_dir}/libritts_cuts_train-other-500.jsonl.gz \
    ${audio_feats_dir}/cuts_train.jsonl.gz

  # Copy dev and test manifests to the generic names.
  lhotse copy \
    ${audio_feats_dir}/libritts_cuts_dev-clean.jsonl.gz \
    ${audio_feats_dir}/cuts_dev.jsonl.gz
  lhotse copy \
    ${audio_feats_dir}/libritts_cuts_test-clean.jsonl.gz \
    ${audio_feats_dir}/cuts_test.jsonl.gz
fi
|
||||
126
egs/zipvoice/scripts/prepare_emilia.sh
Executable file
126
egs/zipvoice/scripts/prepare_emilia.sh
Executable file
@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env bash
#
# Prepare the Emilia dataset (EN and ZH subsets) for training.
#
# Stages:
#   0  check that the corpus has been downloaded
#   1  build lhotse manifests
#   2  split and preprocess (clean) the manifests
#   3  extract fbank features and carve out dev sets
#   4  generate the token file
#
# Run from egs/zipvoice, e.g.:
#   bash scripts/prepare_emilia.sh --stage 0 --stop-stage 4

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -euo pipefail

# Defaults; each can be overridden from the command line
# (--stage, --stop-stage, --sampling-rate, --nj, --dl-dir).
stage=0
stop_stage=5
sampling_rate=24000
nj=32

dl_dir=$PWD/download

# Without this the documented "--stage N --stop-stage M" flags are silently
# ignored and every stage runs. "shared" is the symlink to icefall/shared;
# parse_options.sh is assumed to live there as in the other icefall recipes.
. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

# Print a timestamped message tagged with the caller's file name, line number
# and function name.
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"
|
||||
|
||||
# Stage 0: verify that the Emilia corpus is present. Nothing is downloaded
# here; the script stops with a non-zero status if the data is missing.
if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  log "Stage 0: Download data"

  # Your download directory should look like this:
  #
  # download/Amphion___Emilia
  # ├── metafile.yaml
  # ├── raw
  # │   ├── DE
  # │   ├── EN
  # │   ├── FR
  # │   ├── JA
  # │   ├── KO
  # │   ├── openemilia_45batches.tar.gz
  # │   ├── openemilia_all.tar.gz
  # │   └── ZH
  # └── README.md

  if [ ! -d "$dl_dir/Amphion___Emilia/raw" ]; then
    log "Please refer https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset."
    # The shell builtin takes a plain numeric argument; the previous
    # function-call form `exit(-1)` is a syntax error in bash.
    exit 1
  fi
fi
|
||||
|
||||
# Stage 1: create lhotse manifests for the English and Chinese subsets.
if [[ $stage -le 1 && $stop_stage -ge 1 ]]; then
  log "Stage 1: Prepare emilia manifests (EN and ZH only)"

  # The Emilia corpus is expected under $dl_dir/Amphion___Emilia
  # (stage 0 shows the directory layout).
  mkdir -p data/manifests
  if [[ ! -e data/manifests/.emilia.done ]]; then
    for lang in en zh; do
      lhotse prepare emilia --lang ${lang} --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
    done
    # The marker is written only after both languages succeeded.
    touch data/manifests/.emilia.done
  fi
fi
|
||||
|
||||
# Stage 2: split the raw manifests into chunks, then run the cleaning
# script on each subset.
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Preprocess Emilia dataset, mainly for cleaning"

  mkdir -p data/manifests/splits_raw
  # The guard must test the same path that `touch` writes below. It used to
  # look in "split_raw" (missing "s"), so the split was redone on every run.
  if [ ! -e data/manifests/splits_raw/.emilia.split.done ]; then
    # 10000 is the chunk size passed to `lhotse split-lazy`.
    lhotse split-lazy data/manifests/emilia_cuts_EN.jsonl.gz data/manifests/splits_raw 10000
    lhotse split-lazy data/manifests/emilia_cuts_ZH.jsonl.gz data/manifests/splits_raw 10000
    touch data/manifests/splits_raw/.emilia.split.done
  fi

  mkdir -p data/manifests/splits

  if [ ! -e data/manifests/splits/.emilia.preprocess.done ]; then
    python local/preprocess_emilia.py --subset EN
    python local/preprocess_emilia.py --subset ZH
    touch data/manifests/splits/.emilia.preprocess.done
  fi

fi
|
||||
|
||||
# Stage 3: extract fbank features per split, then merge the splits of each
# subset and hold out the first 1500 cuts as a dev set.
if [[ $stage -le 3 && $stop_stage -ge 3 ]]; then
  log "Stage 3: Extract Fbank for Emilia"

  mkdir -p data/fbank/emilia_splits
  if [[ ! -e data/fbank/emilia_splits/.emilia.fbank.done ]]; then
    # You can speed up the extraction by distributing splits to multiple machines.
    for subset in EN ZH; do
      python local/compute_fbank.py \
        --source-dir data/manifests/splits \
        --dest-dir data/fbank/emilia_splits \
        --dataset emilia \
        --subset ${subset} \
        --splits-cuts 1 \
        --split-begin 0 \
        --split-end 2000 \
        --num-jobs ${nj}
    done
    touch data/fbank/emilia_splits/.emilia.fbank.done
  fi

  for subset in EN ZH; do
    if [[ ! -e data/fbank/emilia_cuts_${subset}.jsonl.gz ]]; then
      log "Combining ${subset} fbank cuts and spliting ${subset} dev set"
      # Concatenate all per-split manifests into one uncompressed file.
      gunzip -c data/fbank/emilia_splits/emilia_cuts_${subset}.*.jsonl.gz \
        > data/fbank/emilia_cuts_${subset}.jsonl
      # The first 1500 lines become the dev manifest ...
      head -n 1500 data/fbank/emilia_cuts_${subset}.jsonl \
        | gzip -c > data/fbank/emilia_cuts_${subset}_dev.jsonl.gz
      # ... and are removed from the training manifest before compressing it.
      sed -i '1,1500d' data/fbank/emilia_cuts_${subset}.jsonl
      gzip data/fbank/emilia_cuts_${subset}.jsonl
    fi
  done
fi
|
||||
|
||||
# Stage 4: write the token file unless it already exists.
if [[ $stage -le 4 && $stop_stage -ge 4 ]]; then
  log "Stage 4: Generate token file"
  if [[ ! -e data/tokens_emilia.txt ]]; then
    ./local/prepare_token_file_emilia.py --tokens data/tokens_emilia.txt
  fi
fi
|
||||
@ -8,7 +8,7 @@ set -eou pipefail
|
||||
stage=0
|
||||
stop_stage=5
|
||||
sampling_rate=24000
|
||||
nj=32
|
||||
nj=20
|
||||
|
||||
dl_dir=$PWD/download
|
||||
|
||||
@ -44,44 +44,53 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
||||
# to $dl_dir/LibriTTS
|
||||
mkdir -p data/manifests_libritts
|
||||
if [ ! -e data/manifests_libritts/.libritts.done ]; then
|
||||
lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests_libritts
|
||||
touch data/manifests_libritts/.libritts.done
|
||||
lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
|
||||
touch data/manifests/.libritts.done
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||
log "Stage 2: Compute Fbank for LibriTTS"
|
||||
mkdir -p data/fbank
|
||||
if [ ! -e data/fbank_libritts/.libritts.done ]; then
|
||||
./local/compute_fbank_libritts.py --sampling-rate $sampling_rate
|
||||
touch data/fbank_libritts/.libritts.done
|
||||
|
||||
if [ ! -e data/fbank/.libritts.done ]; then
|
||||
for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
|
||||
python local/compute_fbank.py \
|
||||
--source-dir data/manifests \
|
||||
--dest-dir data/fbank \
|
||||
--dataset libritts \
|
||||
--subset ${subset} \
|
||||
--sampling-rate $sampling_rate \
|
||||
--num-jobs ${nj}
|
||||
done
|
||||
touch data/fbank/.libritts.done
|
||||
fi
|
||||
|
||||
# Here we shuffle and combine the train-clean-100, train-clean-360 and
|
||||
# train-other-500 together to form the training set.
|
||||
if [ ! -f data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz ]; then
|
||||
cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \
|
||||
<(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) \
|
||||
<(gunzip -c data/fbank_libritts/libritts_cuts_train-other-500.jsonl.gz) | \
|
||||
shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz
|
||||
if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
|
||||
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
|
||||
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
|
||||
<(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
|
||||
shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
|
||||
fi
|
||||
|
||||
if [ ! -f data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz ]; then
|
||||
cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \
|
||||
<(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) | \
|
||||
shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz
|
||||
if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then
|
||||
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
|
||||
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \
|
||||
shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz
|
||||
fi
|
||||
|
||||
if [ ! -e data/fbank_libritts/.libritts-validated.done ]; then
|
||||
if [ ! -e data/fbank/.libritts-validated.done ]; then
|
||||
log "Validating data/fbank for LibriTTS"
|
||||
./local/validate_manifest.py \
|
||||
data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz
|
||||
touch data/fbank_libritts/.libritts-validated.done
|
||||
data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
|
||||
touch data/fbank/.libritts-validated.done
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||
log "Stage 4: Generate token file"
|
||||
log "Stage 3: Generate token file"
|
||||
if [ ! -e data/tokens_libritts.txt ]; then
|
||||
./local/prepare_token_file_libritts.py --tokens data/tokens_libritts.txt
|
||||
fi
|
||||
|
||||
1
egs/zipvoice/shared
Symbolic link
1
egs/zipvoice/shared
Symbolic link
@ -0,0 +1 @@
|
||||
../../icefall/shared
|
||||
@ -31,7 +31,7 @@ python3 zipvoice/train_distill.py \
|
||||
--base-lr 0.0005 \
|
||||
--max-duration 500 \
|
||||
--token-file "data/tokens_emilia.txt" \
|
||||
--manifest-dir "data/fbank_emilia" \
|
||||
--manifest-dir "data/fbank" \
|
||||
--teacher-model zipvoice/exp_zipvoice/epoch-11-avg-4.pt \
|
||||
--num-updates 60000 \
|
||||
--distill-stage "first" \
|
||||
@ -46,7 +46,7 @@ python3 zipvoice/train_distill.py \
|
||||
--base-lr 0.0001 \
|
||||
--max-duration 500 \
|
||||
--token-file "data/tokens_emilia.txt" \
|
||||
--manifest-dir "data/fbank_emilia" \
|
||||
--manifest-dir "data/fbank" \
|
||||
--teacher-model zipvoice/exp_zipvoice_distill_1stage/iter-60000-avg-7.pt \
|
||||
--num-updates 2000 \
|
||||
--distill-stage "second" \
|
||||
|
||||
@ -29,7 +29,7 @@ python3 zipvoice/train_flow.py \
|
||||
--lr-hours 30000 \
|
||||
--lr-batches 7500 \
|
||||
--token-file "data/tokens_emilia.txt" \
|
||||
--manifest-dir "data/fbank_emilia" \
|
||||
--manifest-dir "data/fbank" \
|
||||
--num-epochs 11 \
|
||||
--exp-dir zipvoice/exp_zipvoice
|
||||
"""
|
||||
|
||||
@ -347,14 +347,14 @@ class TtsDataModule:
|
||||
train-clean-360 and train-other-500 cuts"
|
||||
)
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "libritts_cuts_with_tokens_train-all-shuf.jsonl.gz"
|
||||
self.args.manifest_dir / "libritts_cuts_train-all-shuf.jsonl.gz"
|
||||
)
|
||||
|
||||
@lru_cache()
|
||||
def dev_libritts_cuts(self) -> CutSet:
|
||||
logging.info("About to get dev-clean cuts")
|
||||
return load_manifest_lazy(
|
||||
self.args.manifest_dir / "libritts_cuts_with_tokens_dev-clean.jsonl.gz"
|
||||
self.args.manifest_dir / "libritts_cuts_dev-clean.jsonl.gz"
|
||||
)
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user