mirror of
https://github.com/k2-fsa/icefall.git
synced 2025-12-09 14:05:33 +00:00
add emilia data preparation pipeline
This commit is contained in:
parent
60572c2444
commit
2376ed2117
@ -57,21 +57,20 @@ To generate speech with our pre-trained ZipVoice or ZipVoice-Distill models, use
|
|||||||
|
|
||||||
### 1. Inference of a single sentence:
|
### 1. Inference of a single sentence:
|
||||||
```bash
|
```bash
|
||||||
# Chinese example
|
|
||||||
python3 zipvoice/zipvoice_infer.py \
|
python3 zipvoice/zipvoice_infer.py \
|
||||||
--model-name "zipvoice_distill" \
|
--model-name "zipvoice_distill" \
|
||||||
--prompt-wav assets/prompt-zh.wav \
|
--prompt-wav prompt.wav \
|
||||||
--prompt-text "对,这就是我,万人敬仰的太乙真人,虽然有点婴儿肥,但也掩不住我逼人的帅气。" \
|
--prompt-text "I am the transcription of the prompt wav." \
|
||||||
--text "欢迎使用我们的语音合成模型,希望它能给你带来惊喜!" \
|
--text "I am the text to be synthesized." \
|
||||||
--res-wav-path result-zh.wav
|
--res-wav-path result.wav
|
||||||
|
|
||||||
# English example
|
# Example with a pre-defined prompt wav and text
|
||||||
python3 zipvoice/zipvoice_infer.py \
|
python3 zipvoice/zipvoice_infer.py \
|
||||||
--model-name "zipvoice_distill" \
|
--model-name "zipvoice_distill" \
|
||||||
--prompt-wav assets/prompt-en.wav \
|
--prompt-wav assets/prompt-en.wav \
|
||||||
--prompt-text "Some call me nature, others call me mother nature. I've been here for over four point five billion years, twenty two thousand five hundred times longer than you." \
|
--prompt-text "Some call me nature, others call me mother nature. I've been here for over four point five billion years, twenty two thousand five hundred times longer than you." \
|
||||||
--text "Welcome to use our tts model, have fun!" \
|
--text "Welcome to use our tts model, have fun!" \
|
||||||
--res-wav-path result-en.wav
|
--res-wav-path result.wav
|
||||||
```
|
```
|
||||||
|
|
||||||
### 2. Inference of a list of sentences:
|
### 2. Inference of a list of sentences:
|
||||||
@ -95,13 +94,29 @@ export HF_ENDPOINT=https://hf-mirror.com
|
|||||||
|
|
||||||
The following steps show how to train a model from scratch on Emilia and LibriTTS datasets, respectively.
|
The following steps show how to train a model from scratch on Emilia and LibriTTS datasets, respectively.
|
||||||
|
|
||||||
|
### 0. Install dependencies for training
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r ../../requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
### 1. Data Preparation
|
### 1. Data Preparation
|
||||||
|
|
||||||
#### 1.1. Prepare the Emilia dataset
|
#### 1.1. Prepare the Emilia dataset
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash scripts/prepare_emilia.sh --stage 0 --stop-stage 4
|
||||||
|
```
|
||||||
|
|
||||||
|
See [scripts/prepare_emilia.sh](scripts/prepare_emilia.sh) for step-by-step instructions.
|
||||||
|
|
||||||
#### 1.2 Prepare the LibriTTS dataset
|
#### 1.2 Prepare the LibriTTS dataset
|
||||||
|
|
||||||
See [local/prepare_libritts.sh](local/prepare_libritts.sh)
|
```bash
|
||||||
|
bash scripts/prepare_libritts.sh --stage 0 --stop-stage 3
|
||||||
|
```
|
||||||
|
|
||||||
|
See [scripts/prepare_libritts.sh](scripts/prepare_libritts.sh) for step-by-step instructions.
|
||||||
|
|
||||||
### 2. Training
|
### 2. Training
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
288
egs/zipvoice/local/compute_fbank.py
Normal file
288
egs/zipvoice/local/compute_fbank.py
Normal file
@ -0,0 +1,288 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2025 Xiaomi Corp. (authors: Wei Kang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
from concurrent.futures import ProcessPoolExecutor as Pool
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from lhotse import (
|
||||||
|
CutSet,
|
||||||
|
LilcomChunkyWriter,
|
||||||
|
load_manifest_lazy,
|
||||||
|
set_audio_duration_mismatch_tolerance,
|
||||||
|
)
|
||||||
|
|
||||||
|
from feature import TorchAudioFbank, TorchAudioFbankConfig
|
||||||
|
import lhotse
|
||||||
|
|
||||||
|
# Torch's multithreaded behavior needs to be disabled or
|
||||||
|
# it wastes a lot of CPU and slows things down.
|
||||||
|
# Do this outside of main() in case it needs to take effect
|
||||||
|
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
||||||
|
torch.set_num_threads(1)
|
||||||
|
torch.set_num_interop_threads(1)
|
||||||
|
|
||||||
|
|
||||||
|
def str2bool(v):
    """Convert a command-line value to a bool, for use as an argparse ``type``.

    Accepted spellings (case-insensitive):

      - yes, true, t, y, 1, to represent True
      - no, false, f, n, 0, to represent False

    A value that is already a ``bool`` is returned unchanged.

    See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse  # noqa

    Raises:
      argparse.ArgumentTypeError: if the value is none of the spellings above.
    """
    if isinstance(v, bool):
        return v

    lowered = v.lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")
|
||||||
|
|
||||||
|
|
||||||
|
def get_args():
    """Parse the command-line options of the fbank computation script.

    Returns:
      The ``argparse.Namespace`` obtained by parsing the process arguments.
    """
    parser = argparse.ArgumentParser()

    # (flag, add_argument keyword arguments), registered in the order listed.
    option_specs = [
        (
            "--sampling-rate",
            dict(
                type=int,
                default=24000,
                help="The target sampling rate, the audio will be resampled to this sampling_rate.",
            ),
        ),
        (
            "--frame-shift",
            dict(type=int, default=256, help="Frame shift in samples"),
        ),
        (
            "--frame-length",
            dict(type=int, default=1024, help="Frame length in samples"),
        ),
        (
            "--num-mel-bins",
            dict(type=int, default=100, help="The num of mel filters."),
        ),
        ("--dataset", dict(type=str, help="Dataset name.")),
        ("--subset", dict(type=str, help="The subset of the dataset.")),
        (
            "--source-dir",
            dict(
                type=str,
                default="data/manifests",
                help="The source directory of manifest files.",
            ),
        ),
        (
            "--dest-dir",
            dict(
                type=str,
                default="data/fbank",
                help="The destination directory of manifest files.",
            ),
        ),
        (
            "--split-cuts",
            dict(
                type=str2bool,
                default=False,
                help="Whether to use splited cuts.",
            ),
        ),
        ("--split-begin", dict(type=int, help="Start idx of splited cuts.")),
        ("--split-end", dict(type=int, help="End idx of splited cuts.")),
        (
            "--batch-duration",
            dict(
                type=int,
                default=1000,
                help="The batch duration when computing the features.",
            ),
        ),
        (
            "--num-jobs",
            dict(type=int, default=20, help="The number of extractor workers."),
        ),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def compute_fbank_split_single(params, idx):
    """Compute and store fbank features for one split of a cut manifest.

    The split manifest ``{dataset}_cuts_{subset}.{idx}.jsonl.gz`` (``idx``
    zero-padded to 8 digits) is read from ``params.source_dir``; the features
    and the resulting cut manifest (same file name) are written under
    ``params.dest_dir``.

    The function returns without doing any work when the source directory or
    the split manifest is missing, or when the output manifest already exists.
    These cheap checks run before the manifest is opened and before the
    feature extractor is built, so re-running over finished splits is fast.

    Args:
      params:
        Parsed arguments; reads source_dir, dest_dir, dataset, subset,
        sampling_rate, num_mel_bins, frame_length, frame_shift and
        batch_duration.
      idx:
        Index of the split to process.
    """
    src_dir = Path(params.source_dir)
    output_dir = Path(params.dest_dir)

    if not src_dir.exists():
        logging.error(f"{src_dir} not exists")
        return

    output_dir.mkdir(parents=True, exist_ok=True)

    num_digits = 8
    prefix = params.dataset
    subset = params.subset
    suffix = "jsonl.gz"

    idx = f"{idx}".zfill(num_digits)
    cuts_filename = f"{prefix}_cuts_{subset}.{idx}.{suffix}"

    # An existing output manifest marks the split as finished.
    if (output_dir / cuts_filename).is_file():
        logging.info(f"{cuts_filename} already exists - skipping.")
        return

    if not (src_dir / cuts_filename).is_file():
        logging.warning(f"Raw {cuts_filename} not exists, skipping")
        return

    # Set inside the worker function so that it also takes effect in
    # subprocesses spawned by the process pool.
    lhotse.set_audio_duration_mismatch_tolerance(0.1)  # for emilia

    logging.info(f"Loading manifests {src_dir / cuts_filename}")
    cut_set = load_manifest_lazy(src_dir / cuts_filename)
    cut_set = cut_set.resample(params.sampling_rate)

    config = TorchAudioFbankConfig(
        sampling_rate=params.sampling_rate,
        n_mels=params.num_mel_bins,
        n_fft=params.frame_length,
        hop_length=params.frame_shift,
    )
    extractor = TorchAudioFbank(config)

    logging.info(f"Processing {subset}.{idx} of {prefix}")

    cut_set = cut_set.compute_and_store_features_batch(
        extractor=extractor,
        storage_path=f"{output_dir}/{prefix}_feats_{subset}_{idx}",
        num_workers=4,
        batch_duration=params.batch_duration,
        storage_type=LilcomChunkyWriter,
        overwrite=True,
    )
    cut_set.to_file(output_dir / cuts_filename)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_fbank_split(params):
    """Compute fbank features for a range of manifest splits in parallel.

    One ``compute_fbank_split_single`` task is submitted per split index in
    ``range(params.split_begin, params.split_end)`` to a process pool with
    ``params.num_jobs`` workers. Exceptions raised by a worker are re-raised
    here when its result is collected.

    Args:
      params:
        Parsed arguments; reads split_begin, split_end and num_jobs, and is
        forwarded unchanged to every worker.

    Raises:
      ValueError: if split_begin or split_end is None (neither option has a
        default, so this happens when they are omitted on the command line).
    """
    begin, end = params.split_begin, params.split_end
    if begin is None or end is None:
        raise ValueError(
            "--split-begin and --split-end must both be given when "
            f"--split-cuts is enabled, given {begin} -> {end}."
        )

    if end <= begin:
        logging.warning(
            f"Split begin should be smaller than split end, given "
            f"{params.split_begin} -> {params.split_end}."
        )
        # The range is empty; do not start a worker pool for nothing.
        return

    with Pool(max_workers=params.num_jobs) as pool:
        futures = [
            pool.submit(compute_fbank_split_single, params, i)
            for i in range(begin, end)
        ]
        for f in futures:
            # result() blocks until the task finishes and re-raises its error.
            f.result()
|
||||||
|
|
||||||
|
|
||||||
|
def compute_fbank(params):
    """Compute and store fbank features for a whole (unsplit) subset.

    The cut manifest ``{dataset}_cuts_{subset}.jsonl.gz`` is read from
    ``params.source_dir`` when present; otherwise cuts are built from the
    ``{dataset}_recordings_{subset}`` and ``{dataset}_supervisions_{subset}``
    manifests in the same directory. Features and the resulting cut manifest
    are written under ``params.dest_dir``.

    If the output cut manifest already exists the function returns
    immediately, before any manifest is opened or the extractor is built.

    Args:
      params:
        Parsed arguments; reads source_dir, dest_dir, num_jobs, dataset,
        subset, sampling_rate, num_mel_bins, frame_length and frame_shift.
    """
    src_dir = Path(params.source_dir)
    output_dir = Path(params.dest_dir)
    num_jobs = params.num_jobs

    prefix = params.dataset
    subset = params.subset
    suffix = "jsonl.gz"

    # Input and output cut manifests share the same file name.
    cuts_filename = f"{prefix}_cuts_{subset}.{suffix}"

    if (output_dir / cuts_filename).is_file():
        logging.info(f"{prefix} {subset} already exists - skipping.")
        return

    # Create the destination up front, as compute_fbank_split_single does.
    output_dir.mkdir(parents=True, exist_ok=True)

    if (src_dir / cuts_filename).is_file():
        logging.info(f"Loading manifests {src_dir / cuts_filename}")
        cut_set = load_manifest_lazy(src_dir / cuts_filename)
    else:
        recordings = load_manifest_lazy(
            src_dir / f"{prefix}_recordings_{subset}.{suffix}"
        )
        supervisions = load_manifest_lazy(
            src_dir / f"{prefix}_supervisions_{subset}.{suffix}"
        )
        cut_set = CutSet.from_manifests(
            recordings=recordings,
            supervisions=supervisions,
        )

    cut_set = cut_set.resample(params.sampling_rate)

    config = TorchAudioFbankConfig(
        sampling_rate=params.sampling_rate,
        n_mels=params.num_mel_bins,
        n_fft=params.frame_length,
        hop_length=params.frame_shift,
    )
    extractor = TorchAudioFbank(config)

    logging.info(f"Processing {subset} of {prefix}")

    cut_set = cut_set.compute_and_store_features(
        extractor=extractor,
        storage_path=f"{output_dir}/{prefix}_feats_{subset}",
        num_jobs=num_jobs,
        storage_type=LilcomChunkyWriter,
    )
    cut_set.to_file(output_dir / cuts_filename)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    log_format = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)

    args = get_args()
    logging.info(vars(args))

    # --split-cuts selects the per-split parallel path; otherwise the whole
    # subset manifest is processed in one go.
    run = compute_fbank_split if args.split_cuts else compute_fbank
    run(params=args)
|
||||||
@ -1,140 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang,
|
|
||||||
# Zengwei Yao,)
|
|
||||||
# 2024 The Chinese Univ. of HK (authors: Zengrui Jin)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
This file computes fbank features of the LibriTTS dataset.
|
|
||||||
It looks for manifests in the directory data/manifests.
|
|
||||||
|
|
||||||
The generated fbank features are saved in data/fbank.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from feature import TorchAudioFbank, TorchAudioFbankConfig
|
|
||||||
from lhotse import CutSet, LilcomChunkyWriter
|
|
||||||
from lhotse.recipes.utils import read_manifests_if_cached
|
|
||||||
|
|
||||||
from icefall.utils import get_executor
|
|
||||||
|
|
||||||
# Torch's multithreaded behavior needs to be disabled or
|
|
||||||
# it wastes a lot of CPU and slows things down.
|
|
||||||
# Do this outside of main() in case it needs to take effect
|
|
||||||
# even when we are not invoking the main (e.g. when spawning subprocesses).
|
|
||||||
torch.set_num_threads(1)
|
|
||||||
torch.set_num_interop_threads(1)
|
|
||||||
|
|
||||||
|
|
||||||
def get_args():
    """Parse the command-line options of the LibriTTS fbank script.

    Returns:
      The ``argparse.Namespace`` with ``dataset`` (``None`` when omitted) and
      ``sampling_rate`` (24000 when omitted).
    """
    arg_parser = argparse.ArgumentParser()

    dataset_options = dict(
        type=str,
        help="""Dataset parts to compute fbank. If None, we will use all""",
    )
    arg_parser.add_argument("--dataset", **dataset_options)

    sampling_rate_options = dict(
        type=int,
        default=24000,
        help="""Sampling rate of the waveform for computing fbank,
        the default value for LibriTTS is 24000, waveform files will be
        resampled if a different sample rate is provided""",
    )
    arg_parser.add_argument("--sampling-rate", **sampling_rate_options)

    return arg_parser.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def compute_fbank_libritts(dataset: Optional[str] = None, sampling_rate: int = 24000):
    """Compute fbank features for LibriTTS partitions.

    Manifests are read from ``data/manifests_libritts`` and the features and
    cut manifests are written to ``data/fbank_libritts``. A partition whose
    output cut manifest already exists is skipped; the remaining partitions
    are still processed.

    Args:
      dataset:
        Space-separated partition names. If None, the five partitions
        dev-clean, test-clean, train-clean-100, train-clean-360 and
        train-other-500 are used.
      sampling_rate:
        Sampling rate passed to the feature extractor. Cuts are resampled
        when it differs from 24000.
    """
    src_dir = Path("data/manifests_libritts")
    output_dir = Path("data/fbank_libritts")
    # os.cpu_count() may return None when the count cannot be determined.
    num_jobs = min(32, os.cpu_count() or 1)

    prefix = "libritts"
    suffix = "jsonl.gz"
    if dataset is None:
        dataset_parts = (
            "dev-clean",
            "test-clean",
            "train-clean-100",
            "train-clean-360",
            "train-other-500",
        )
    else:
        dataset_parts = dataset.split(" ", -1)

    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None

    assert len(manifests) == len(dataset_parts), (
        len(manifests),
        len(dataset_parts),
        list(manifests.keys()),
        dataset_parts,
    )

    config = TorchAudioFbankConfig(
        sampling_rate=sampling_rate,
        n_mels=100,
        n_fft=1024,
        hop_length=256,
    )
    extractor = TorchAudioFbank(config)

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
            if (output_dir / cuts_filename).is_file():
                logging.info(f"{partition} already exists - skipping.")
                # Move on to the next partition; returning here would drop
                # every partition that follows a finished one.
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if sampling_rate != 24000:
                logging.info(f"Resampling waveforms to {sampling_rate}")
                cut_set = cut_set.resample(sampling_rate)

            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / cuts_filename)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)

    # Forward the parsed options; without this --dataset and --sampling-rate
    # were accepted by get_args() but never reached the computation.
    args = get_args()
    logging.info(vars(args))
    compute_fbank_libritts(dataset=args.dataset, sampling_rate=args.sampling_rate)
|
|
||||||
@ -1,135 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# Copyright 2024 Xiaomi Corp. (authors: Han Zhu)
|
|
||||||
#
|
|
||||||
# See ../../../../LICENSE for clarification regarding multiple authors
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Union
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
import torch.nn as nn
|
|
||||||
import torchaudio
|
|
||||||
from lhotse.features.base import FeatureExtractor, register_extractor
|
|
||||||
from lhotse.utils import Seconds, compute_num_frames
|
|
||||||
|
|
||||||
|
|
||||||
class MelSpectrogramFeatures(nn.Module):
    """Log mel spectrogram computed with ``torchaudio.transforms.MelSpectrogram``.

    The transform is created with ``center=True`` and ``power=1``; its output
    is clamped from below at 1e-7 before the natural logarithm is taken.
    """

    def __init__(
        self,
        sampling_rate=24000,
        n_mels=100,
        n_fft=1024,
        hop_length=256,
    ):
        super().__init__()

        mel_options = dict(
            sample_rate=sampling_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            center=True,
            power=1,
        )
        self.mel_spec = torchaudio.transforms.MelSpectrogram(**mel_options)

    def forward(self, inp):
        """Return the log mel spectrogram of ``inp``, which must be 2-D."""
        # presumably (batch, num_samples) - only the rank is checked here
        assert len(inp.shape) == 2

        return self.mel_spec(inp).clamp(min=1e-7).log()
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class TorchAudioFbankConfig:
    """Configuration of the ``TorchAudioFbank`` feature extractor."""

    # Sampling rate the extractor expects its input audio to have.
    sampling_rate: int
    # Number of mel filters.
    n_mels: int
    # FFT size passed to the mel spectrogram transform.
    n_fft: int
    # Hop between consecutive frames, in samples.
    hop_length: int
    # Read by ``TorchAudioFbank.device``; defaulted so that existing
    # four-argument constructions keep working.
    device: str = "cpu"
|
|
||||||
|
|
||||||
|
|
||||||
@register_extractor
class TorchAudioFbank(FeatureExtractor):
    """Feature extractor that produces log mel features with ``MelSpectrogramFeatures``."""

    name = "TorchAudioFbank"
    config_type = TorchAudioFbankConfig

    def __init__(self, config):
        super().__init__(config=config)

    def _feature_fn(self, sample):
        """Build the mel front end from the config and apply it to ``sample``."""
        fbank = MelSpectrogramFeatures(
            sampling_rate=self.config.sampling_rate,
            n_mels=self.config.n_mels,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
        )

        return fbank(sample)

    @property
    def device(self) -> Union[str, torch.device]:
        # The config may not declare a ``device`` field; fall back to "cpu"
        # instead of raising AttributeError.
        return getattr(self.config, "device", "cpu")

    def feature_dim(self, sampling_rate: int) -> int:
        """Return the number of mel bins."""
        return self.config.n_mels

    def extract(
        self,
        samples: Union[np.ndarray, torch.Tensor],
        sampling_rate: int,
    ) -> Union[np.ndarray, torch.Tensor]:
        """Extract log mel features from a single-channel waveform.

        Args:
          samples:
            1-D samples, or a 2-D array/tensor whose first dimension is 1.
          sampling_rate:
            Must equal ``config.sampling_rate``.

        Returns:
          A 2-D ``(num_frames, n_mels)`` array, of the same kind (numpy or
          torch) as the input.
        """
        # Check for sampling rate compatibility.
        expected_sr = self.config.sampling_rate
        assert sampling_rate == expected_sr, (
            f"Mismatched sampling rate: extractor expects {expected_sr}, "
            f"got {sampling_rate}"
        )
        is_numpy = False
        if not isinstance(samples, torch.Tensor):
            samples = torch.from_numpy(samples)
            is_numpy = True

        if len(samples.shape) == 1:
            samples = samples.unsqueeze(0)
        assert samples.ndim == 2, samples.shape
        assert samples.shape[0] == 1, samples.shape

        # Drop only the leading channel dimension. A bare squeeze() would
        # also drop the time axis when the input yields a single frame.
        mel = self._feature_fn(samples).squeeze(0).t()

        assert mel.ndim == 2, mel.shape
        assert mel.shape[1] == self.config.n_mels, mel.shape

        num_frames = compute_num_frames(
            samples.shape[1] / sampling_rate, self.frame_shift, sampling_rate
        )

        # Trim or pad along the frame axis to the expected frame count.
        if mel.shape[0] > num_frames:
            mel = mel[:num_frames]
        elif mel.shape[0] < num_frames:
            mel = mel.unsqueeze(0)
            mel = torch.nn.functional.pad(
                mel, (0, 0, 0, num_frames - mel.shape[1]), mode="replicate"
            ).squeeze(0)

        if is_numpy:
            return mel.cpu().numpy()
        else:
            return mel

    @property
    def frame_shift(self) -> Seconds:
        """Frame shift in seconds (hop length divided by sampling rate)."""
        return self.config.hop_length / self.config.sampling_rate
|
|
||||||
1
egs/zipvoice/local/feature.py
Symbolic link
1
egs/zipvoice/local/feature.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipvoice/feature.py
|
||||||
@ -20,20 +20,26 @@
|
|||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
This file reads the texts in given manifest and save the new cuts with phoneme tokens.
|
This file reads the texts in given manifest and save the cleaned new cuts.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import glob
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import glob
|
||||||
from concurrent.futures import ProcessPoolExecutor as Pool
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import jieba
|
from lhotse import CutSet, load_manifest_lazy
|
||||||
from lhotse import load_manifest_lazy
|
from concurrent.futures import ProcessPoolExecutor as Pool
|
||||||
from tokenizer import Tokenizer, is_alphabet, is_chinese, is_hangul, is_japanese
|
|
||||||
|
from tokenizer import (
|
||||||
|
is_alphabet,
|
||||||
|
is_chinese,
|
||||||
|
is_hangul,
|
||||||
|
is_japanese,
|
||||||
|
tokenize_by_CJK_char,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_args():
|
def get_args():
|
||||||
@ -48,71 +54,32 @@ def get_args():
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--jobs",
|
"--jobs",
|
||||||
type=int,
|
type=int,
|
||||||
default=50,
|
default=20,
|
||||||
help="Number of jobs to processing.",
|
help="Number of jobs to processing.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--source-dir",
|
"--source-dir",
|
||||||
type=str,
|
type=str,
|
||||||
default="data/manifests_emilia/splits",
|
default="data/manifests/splits_raw",
|
||||||
help="The source directory of manifest files.",
|
help="The source directory of manifest files.",
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--dest-dir",
|
"--dest-dir",
|
||||||
type=str,
|
type=str,
|
||||||
|
default="data/manifests/splits",
|
||||||
help="The destination directory of manifest files.",
|
help="The destination directory of manifest files.",
|
||||||
)
|
)
|
||||||
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def tokenize_by_CJK_char(line: str) -> List[str]:
|
def preprocess_emilia(file_name: str, input_dir: Path, output_dir: Path):
|
||||||
"""
|
|
||||||
Tokenize a line of text with CJK char.
|
|
||||||
|
|
||||||
Note: All return characters will be upper case.
|
|
||||||
|
|
||||||
Example:
|
|
||||||
input = "你好世界是 hello world 的中文"
|
|
||||||
output = [你, 好, 世, 界, 是, HELLO, WORLD, 的, 中, 文]
|
|
||||||
|
|
||||||
Args:
|
|
||||||
line:
|
|
||||||
The input text.
|
|
||||||
|
|
||||||
Return:
|
|
||||||
A new string tokenize by CJK char.
|
|
||||||
"""
|
|
||||||
# The CJK ranges is from https://github.com/alvations/nltk/blob/79eed6ddea0d0a2c212c1060b477fc268fec4d4b/nltk/tokenize/util.py
|
|
||||||
pattern = re.compile(
|
|
||||||
r"([\u1100-\u11ff\u2e80-\ua4cf\ua840-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F\uFF65-\uFFDC\U00020000-\U0002FFFF])"
|
|
||||||
)
|
|
||||||
chars = pattern.split(line.strip().upper())
|
|
||||||
char_list = []
|
|
||||||
for w in chars:
|
|
||||||
if w.strip():
|
|
||||||
char_list += w.strip().split()
|
|
||||||
return char_list
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
|
|
||||||
logging.info(f"Processing {file_name}")
|
logging.info(f"Processing {file_name}")
|
||||||
if (output_dir / file_name).is_file():
|
if (output_dir / file_name).is_file():
|
||||||
logging.info(f"{file_name} exists, skipping.")
|
logging.info(f"{file_name} exists, skipping.")
|
||||||
return
|
return
|
||||||
jieba.setLogLevel(logging.INFO)
|
|
||||||
tokenizer = Tokenizer()
|
|
||||||
|
|
||||||
def _prepare_cut(cut):
|
|
||||||
# Each cut only contains one supervision
|
|
||||||
assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
|
|
||||||
text = cut.supervisions[0].text
|
|
||||||
cut.supervisions[0].normalized_text = text
|
|
||||||
tokens = tokenizer.texts_to_tokens([text])[0]
|
|
||||||
cut.tokens = tokens
|
|
||||||
return cut
|
|
||||||
|
|
||||||
def _filter_cut(cut):
|
def _filter_cut(cut):
|
||||||
text = cut.supervisions[0].text
|
text = cut.supervisions[0].text
|
||||||
@ -124,10 +91,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
|
|||||||
clean_chars = []
|
clean_chars = []
|
||||||
for x in text:
|
for x in text:
|
||||||
if is_hangul(x):
|
if is_hangul(x):
|
||||||
logging.info(f"Delete cut with text containing Korean : {text}")
|
logging.warning(f"Delete cut with text containing Korean : {text}")
|
||||||
return False
|
return False
|
||||||
if is_japanese(x):
|
if is_japanese(x):
|
||||||
logging.info(f"Delete cut with text containing Japanese : {text}")
|
logging.warning(f"Delete cut with text containing Japanese : {text}")
|
||||||
return False
|
return False
|
||||||
if is_chinese(x):
|
if is_chinese(x):
|
||||||
chinese.append(x)
|
chinese.append(x)
|
||||||
@ -138,18 +105,19 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
|
|||||||
if x == " ":
|
if x == " ":
|
||||||
clean_chars.append(x)
|
clean_chars.append(x)
|
||||||
if len(english) + len(chinese) == 0:
|
if len(english) + len(chinese) == 0:
|
||||||
logging.info(f"Delete cut with text has no valid chars : {text}")
|
logging.warning(f"Delete cut with text has no valid chars : {text}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
words = tokenize_by_CJK_char("".join(clean_chars))
|
words = tokenize_by_CJK_char("".join(clean_chars))
|
||||||
for i in range(len(words) - 10):
|
for i in range(len(words) - 10):
|
||||||
if words[i : i + 10].count(words[i]) == 10:
|
if words[i : i + 10].count(words[i]) == 10:
|
||||||
logging.info(f"Delete cut with text with too much repeats : {text}")
|
logging.warning(f"Delete cut with text with too much repeats : {text}")
|
||||||
return False
|
return False
|
||||||
# word speed, 20 - 600 / minute
|
# word speed, 20 - 600 / minute
|
||||||
if duration < len(words) / 600 * 60 or duration > len(words) / 20 * 60:
|
if duration < len(words) / 600 * 60 or duration > len(words) / 20 * 60:
|
||||||
logging.info(
|
logging.warning(
|
||||||
f"Delete cut with audio text mismatch, duration : {duration}s, words : {len(words)}, text : {text}"
|
f"Delete cut with audio text mismatch, duration : {duration}s, "
|
||||||
|
f"words : {len(words)}, text : {text}"
|
||||||
)
|
)
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
@ -157,11 +125,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
|
|||||||
try:
|
try:
|
||||||
cut_set = load_manifest_lazy(input_dir / file_name)
|
cut_set = load_manifest_lazy(input_dir / file_name)
|
||||||
cut_set = cut_set.filter(_filter_cut)
|
cut_set = cut_set.filter(_filter_cut)
|
||||||
cut_set = cut_set.map(_prepare_cut)
|
|
||||||
cut_set.to_file(output_dir / file_name)
|
cut_set.to_file(output_dir / file_name)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"Manifest {file_name} failed with error: {e}")
|
logging.error(f"Manifest {file_name} failed with error: {e}")
|
||||||
raise
|
os.remove(str(output_dir / file_name))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
@ -179,14 +146,11 @@ if __name__ == "__main__":
|
|||||||
with Pool(max_workers=args.jobs) as pool:
|
with Pool(max_workers=args.jobs) as pool:
|
||||||
futures = [
|
futures = [
|
||||||
pool.submit(
|
pool.submit(
|
||||||
prepare_tokens_emilia, filename.split("/")[-1], input_dir, output_dir
|
preprocess_emilia, filename.split("/")[-1], input_dir, output_dir
|
||||||
)
|
)
|
||||||
for filename in cut_files
|
for filename in cut_files
|
||||||
]
|
]
|
||||||
for f in futures:
|
for f in futures:
|
||||||
try:
|
f.result()
|
||||||
f.result()
|
f.done()
|
||||||
f.done()
|
|
||||||
except Exception as e:
|
|
||||||
logging.error(f"Future failed with error: {e}")
|
|
||||||
logging.info("Processing done.")
|
logging.info("Processing done.")
|
||||||
1
egs/zipvoice/local/tokenizer.py
Symbolic link
1
egs/zipvoice/local/tokenizer.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../zipvoice/tokenizer.py
|
||||||
@ -1,232 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
|
|
||||||
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
|
|
||||||
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
|
|
||||||
|
|
||||||
# add icefall to PYTHONPATH
|
|
||||||
export PYTHONPATH=../../../:$PYTHONPATH
|
|
||||||
|
|
||||||
set -eou pipefail
|
|
||||||
|
|
||||||
stage=0
|
|
||||||
stop_stage=100
|
|
||||||
|
|
||||||
token_type=bpe # bpe, letter, phone
|
|
||||||
bpe_vocab_size=500
|
|
||||||
|
|
||||||
nj=32
|
|
||||||
|
|
||||||
dl_dir=$PWD/download
|
|
||||||
|
|
||||||
. shared/parse_options.sh || exit 1
|
|
||||||
|
|
||||||
# All files generated by this script are saved in "data".
|
|
||||||
# You can safely remove "data" and rerun this script to regenerate it.
|
|
||||||
mkdir -p data
|
|
||||||
|
|
||||||
log() {
|
|
||||||
# This function is from espnet
|
|
||||||
local fname=${BASH_SOURCE[1]##*/}
|
|
||||||
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
|
||||||
}
|
|
||||||
|
|
||||||
log "dl_dir: $dl_dir"
|
|
||||||
|
|
||||||
|
|
||||||
if [ $stage -le -2 ] && [ $stop_stage -ge -2 ]; then
|
|
||||||
|
|
||||||
if [ ! -d $dl_dir/xvector_nnet_1a_libritts_clean_460 ]; then
|
|
||||||
log "Downloading x-vector"
|
|
||||||
|
|
||||||
git clone https://huggingface.co/datasets/zrjin/xvector_nnet_1a_libritts_clean_460 $dl_dir/xvector_nnet_1a_libritts_clean_460
|
|
||||||
|
|
||||||
mkdir -p exp/xvector_nnet_1a/
|
|
||||||
cp -r $dl_dir/xvector_nnet_1a_libritts_clean_460/* exp/xvector_nnet_1a/
|
|
||||||
fi
|
|
||||||
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
|
|
||||||
log "Stage -1: build monotonic_align lib"
|
|
||||||
if [ ! -d vits/monotonic_align/build ]; then
|
|
||||||
cd vits/monotonic_align
|
|
||||||
python setup.py build_ext --inplace
|
|
||||||
cd ../../
|
|
||||||
else
|
|
||||||
log "monotonic_align lib already built"
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
|
||||||
log "Stage 0: Download data"
|
|
||||||
|
|
||||||
# If you have pre-downloaded it to /path/to/LibriTTS,
|
|
||||||
# you can create a symlink
|
|
||||||
#
|
|
||||||
# ln -sfv /path/to/LibriTTS $dl_dir/LibriTTS
|
|
||||||
#
|
|
||||||
if [ ! -d $dl_dir/LibriTTS ]; then
|
|
||||||
lhotse download libritts $dl_dir
|
|
||||||
fi
|
|
||||||
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
|
||||||
log "Stage 1: Prepare LibriTTS manifest"
|
|
||||||
# We assume that you have downloaded the LibriTTS corpus
|
|
||||||
# to $dl_dir/LibriTTS
|
|
||||||
mkdir -p data/manifests
|
|
||||||
if [ ! -e data/manifests/.libritts.done ]; then
|
|
||||||
lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
|
|
||||||
touch data/manifests/.libritts.done
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
|
||||||
log "Stage 2: Compute Fbank for LibriTTS"
|
|
||||||
mkdir -p data/fbank
|
|
||||||
|
|
||||||
for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
|
|
||||||
python local/compute_fbank.py --dataset libritts --subset ${subset}
|
|
||||||
done
|
|
||||||
|
|
||||||
# Here we shuffle and combine the train-clean-100, train-clean-360 and
|
|
||||||
# train-other-500 together to form the training set.
|
|
||||||
if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
|
|
||||||
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
|
|
||||||
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
|
|
||||||
<(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
|
|
||||||
shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then
|
|
||||||
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
|
|
||||||
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \
|
|
||||||
shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! -e data/fbank/.libritts-validated.done ]; then
|
|
||||||
log "Validating data/fbank for LibriTTS"
|
|
||||||
./local/validate_manifest.py \
|
|
||||||
data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
|
|
||||||
touch data/fbank/.libritts-validated.done
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
|
||||||
log "Stage 3: Prepare tokens.txt"
|
|
||||||
|
|
||||||
if [ $token_type == "bpe" ] || [ $token_type == "letter" ]; then
|
|
||||||
if [ ! -e data/texts.txt ]; then
|
|
||||||
./local/export_normalized_texts.py --output data/texts.txt \
|
|
||||||
--manifests data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $token_type == "bpe" ]; then
|
|
||||||
mkdir -p data/lang_bpe_${bpe_vocab_size}
|
|
||||||
if [ ! -e data/lang_bpe_${bpe_vocab_size}/tokens.txt ]; then
|
|
||||||
./local/train_bpe_model.py --transcript data/texts.txt \
|
|
||||||
--lang-dir data/lang_bpe_${bpe_vocab_size} \
|
|
||||||
--vocab-size $bpe_vocab_size
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $token_type == "phone" ]; then
|
|
||||||
mkdir -p data/lang_phone
|
|
||||||
./local/export_tokens.py --token-type phone \
|
|
||||||
--output data/lang_phone/tokens.txt
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $token_type == "letter" ]; then
|
|
||||||
mkdir -p data/lang_letter
|
|
||||||
./local/export_tokens.py --token-type letter \
|
|
||||||
--texts data/texts.txt \
|
|
||||||
--output data/lang_letter/tokens.txt
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
|
||||||
log "Stage 4: Download and prepare librispeech-pc test clean for testing."
|
|
||||||
|
|
||||||
if [ ! -e $dl_dir/test-clean.tar.gz ]; then
|
|
||||||
wget https://huggingface.co/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir
|
|
||||||
fi
|
|
||||||
# For China users.
|
|
||||||
if [ ! -e $dl_dir/test-clean.tar.gz ]; then
|
|
||||||
wget https://hf-mirror.com/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! -d $dl_dir/LibriSpeech/test-clean ]; then
|
|
||||||
tar -xvf $dl_dir/test-clean.tar.gz -C $dl_dir
|
|
||||||
fi
|
|
||||||
|
|
||||||
mkdir -p $dl_dir/LibriSpeech-PC
|
|
||||||
if [ ! -e $dl_dir/LibriSpeech-PC/test-clean.json ]; then
|
|
||||||
wget https://us.openslr.org/resources/145/manifests.tar.gz -P $dl_dir/LibriSpeech-PC
|
|
||||||
tar -xvf $dl_dir/LibriSpeech-PC/manifests.tar.gz -C $dl_dir/LibriSpeech-PC
|
|
||||||
fi
|
|
||||||
|
|
||||||
python local/compute_fbank.py --dataset librispeech --subset test-clean
|
|
||||||
python local/prepare_prompts_librispeech_test_clean.py
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
|
|
||||||
log "Stage 5: Compute Spectrogram for LibriTTS (for VITS system)"
|
|
||||||
mkdir -p data/spectrogram
|
|
||||||
if [ ! -e data/spectrogram/.libritts.done ]; then
|
|
||||||
./local/compute_spectrogram_libritts.py --sampling-rate $sampling_rate
|
|
||||||
touch data/spectrogram/.libritts.done
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Here we shuffle and combine the train-clean-100, train-clean-360 and
|
|
||||||
# train-other-500 together to form the training set.
|
|
||||||
if [ ! -f data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz ]; then
|
|
||||||
cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
|
|
||||||
<(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) \
|
|
||||||
<(gunzip -c data/spectrogram/libritts_cuts_train-other-500.jsonl.gz) | \
|
|
||||||
shuf | gzip -c > data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Here we shuffle and combine the train-clean-100, train-clean-360
|
|
||||||
# together to form the training set.
|
|
||||||
if [ ! -f data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz ]; then
|
|
||||||
cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
|
|
||||||
<(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) | \
|
|
||||||
shuf | gzip -c > data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! -e data/spectrogram/.libritts-validated.done ]; then
|
|
||||||
log "Validating data/spectrogram for LibriTTS"
|
|
||||||
./local/validate_manifest.py \
|
|
||||||
data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
|
|
||||||
touch data/spectrogram/.libritts-validated.done
|
|
||||||
fi
|
|
||||||
fi
|
|
||||||
|
|
||||||
audio_feats_dir=data/tokenized
|
|
||||||
dataset_parts="--dataset-parts all" # debug "-p dev-clean -p test-clean"
|
|
||||||
if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
|
|
||||||
log "Stage 6: Tokenize/Fbank LibriTTS for valle"
|
|
||||||
mkdir -p ${audio_feats_dir}
|
|
||||||
if [ ! -e ${audio_feats_dir}/.libritts.tokenize.done ]; then
|
|
||||||
python3 ./local/compute_neural_codec_and_prepare_text_tokens.py --dataset-parts "${dataset_parts}" \
|
|
||||||
--audio-extractor "Encodec" \
|
|
||||||
--batch-duration 400 \
|
|
||||||
--src-dir "data/manifests" \
|
|
||||||
--output-dir "${audio_feats_dir}"
|
|
||||||
fi
|
|
||||||
touch ${audio_feats_dir}/.libritts.tokenize.done
|
|
||||||
|
|
||||||
lhotse combine \
|
|
||||||
${audio_feats_dir}/libritts_cuts_train-clean-100.jsonl.gz \
|
|
||||||
${audio_feats_dir}/libritts_cuts_train-clean-360.jsonl.gz \
|
|
||||||
${audio_feats_dir}/libritts_cuts_train-other-500.jsonl.gz \
|
|
||||||
${audio_feats_dir}/cuts_train.jsonl.gz
|
|
||||||
lhotse copy \
|
|
||||||
${audio_feats_dir}/libritts_cuts_dev-clean.jsonl.gz \
|
|
||||||
${audio_feats_dir}/cuts_dev.jsonl.gz
|
|
||||||
lhotse copy \
|
|
||||||
${audio_feats_dir}/libritts_cuts_test-clean.jsonl.gz \
|
|
||||||
${audio_feats_dir}/cuts_test.jsonl.gz
|
|
||||||
fi
|
|
||||||
126
egs/zipvoice/scripts/prepare_emilia.sh
Executable file
126
egs/zipvoice/scripts/prepare_emilia.sh
Executable file
@ -0,0 +1,126 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
|
||||||
|
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
|
||||||
|
|
||||||
|
set -eou pipefail
|
||||||
|
|
||||||
|
stage=0
|
||||||
|
stop_stage=5
|
||||||
|
sampling_rate=24000
|
||||||
|
nj=32
|
||||||
|
|
||||||
|
dl_dir=$PWD/download
|
||||||
|
|
||||||
|
# All files generated by this script are saved in "data".
|
||||||
|
# You can safely remove "data" and rerun this script to regenerate it.
|
||||||
|
mkdir -p data
|
||||||
|
|
||||||
|
log() {
|
||||||
|
# This function is from espnet
|
||||||
|
local fname=${BASH_SOURCE[1]##*/}
|
||||||
|
echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||||
|
}
|
||||||
|
|
||||||
|
log "dl_dir: $dl_dir"
|
||||||
|
|
||||||
|
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
|
||||||
|
log "Stage 0: Download data"
|
||||||
|
|
||||||
|
# Your download directory should look like this:
|
||||||
|
#
|
||||||
|
# download/Amphion___Emilia
|
||||||
|
# ├── metafile.yaml
|
||||||
|
# ├── raw
|
||||||
|
# │ ├── DE
|
||||||
|
# │ ├── EN
|
||||||
|
# │ ├── FR
|
||||||
|
# │ ├── JA
|
||||||
|
# │ ├── KO
|
||||||
|
# │ ├── openemilia_45batches.tar.gz
|
||||||
|
# │ ├── openemilia_all.tar.gz
|
||||||
|
# │ └── ZH
|
||||||
|
# └── README.md
|
||||||
|
|
||||||
|
if [ ! -d $dl_dir/Amphion___Emilia/raw ]; then
|
||||||
|
log "Please refer to https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
||||||
|
log "Stage 1: Prepare emilia manifests (EN and ZH only)"
|
||||||
|
# We assume that you have downloaded the Emilia corpus
|
||||||
|
# to $dl_dir/Amphion___Emilia
|
||||||
|
# see stage 0 for the directory structure
|
||||||
|
mkdir -p data/manifests
|
||||||
|
if [ ! -e data/manifests/.emilia.done ]; then
|
||||||
|
lhotse prepare emilia --lang en --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
|
||||||
|
lhotse prepare emilia --lang zh --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
|
||||||
|
touch data/manifests/.emilia.done
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||||
|
log "Stage 2: Preprocess Emilia dataset, mainly for cleaning"
|
||||||
|
mkdir -p data/manifests/splits_raw
|
||||||
|
if [ ! -e data/manifests/splits_raw/.emilia.split.done ]; then
|
||||||
|
lhotse split-lazy data/manifests/emilia_cuts_EN.jsonl.gz data/manifests/splits_raw 10000
|
||||||
|
lhotse split-lazy data/manifests/emilia_cuts_ZH.jsonl.gz data/manifests/splits_raw 10000
|
||||||
|
touch data/manifests/splits_raw/.emilia.split.done
|
||||||
|
fi
|
||||||
|
|
||||||
|
mkdir -p data/manifests/splits
|
||||||
|
|
||||||
|
if [ ! -e data/manifests/splits/.emilia.preprocess.done ]; then
|
||||||
|
python local/preprocess_emilia.py --subset EN
|
||||||
|
python local/preprocess_emilia.py --subset ZH
|
||||||
|
touch data/manifests/splits/.emilia.preprocess.done
|
||||||
|
fi
|
||||||
|
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||||
|
log "Stage 3: Extract Fbank for Emilia"
|
||||||
|
mkdir -p data/fbank/emilia_splits
|
||||||
|
if [ ! -e data/fbank/emilia_splits/.emilia.fbank.done ]; then
|
||||||
|
# You can speed up the extraction by distributing splits to multiple machines.
|
||||||
|
for subset in EN ZH; do
|
||||||
|
python local/compute_fbank.py \
|
||||||
|
--source-dir data/manifests/splits \
|
||||||
|
--dest-dir data/fbank/emilia_splits \
|
||||||
|
--dataset emilia \
|
||||||
|
--subset ${subset} \
|
||||||
|
--splits-cuts 1 \
|
||||||
|
--split-begin 0 \
|
||||||
|
--split-end 2000 \
|
||||||
|
--num-jobs ${nj}
|
||||||
|
done
|
||||||
|
touch data/fbank/emilia_splits/.emilia.fbank.done
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -e data/fbank/emilia_cuts_EN.jsonl.gz ]; then
|
||||||
|
log "Combining EN fbank cuts and splitting EN dev set"
|
||||||
|
gunzip -c data/fbank/emilia_splits/emilia_cuts_EN.*.jsonl.gz > data/fbank/emilia_cuts_EN.jsonl
|
||||||
|
head -n 1500 data/fbank/emilia_cuts_EN.jsonl | gzip -c > data/fbank/emilia_cuts_EN_dev.jsonl.gz
|
||||||
|
sed -i '1,1500d' data/fbank/emilia_cuts_EN.jsonl
|
||||||
|
gzip data/fbank/emilia_cuts_EN.jsonl
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -e data/fbank/emilia_cuts_ZH.jsonl.gz ]; then
|
||||||
|
log "Combining ZH fbank cuts and splitting ZH dev set"
|
||||||
|
gunzip -c data/fbank/emilia_splits/emilia_cuts_ZH.*.jsonl.gz > data/fbank/emilia_cuts_ZH.jsonl
|
||||||
|
head -n 1500 data/fbank/emilia_cuts_ZH.jsonl | gzip -c > data/fbank/emilia_cuts_ZH_dev.jsonl.gz
|
||||||
|
sed -i '1,1500d' data/fbank/emilia_cuts_ZH.jsonl
|
||||||
|
gzip data/fbank/emilia_cuts_ZH.jsonl
|
||||||
|
fi
|
||||||
|
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
|
||||||
|
log "Stage 4: Generate token file"
|
||||||
|
if [ ! -e data/tokens_emilia.txt ]; then
|
||||||
|
./local/prepare_token_file_emilia.py --tokens data/tokens_emilia.txt
|
||||||
|
fi
|
||||||
|
fi
|
||||||
@ -8,7 +8,7 @@ set -eou pipefail
|
|||||||
stage=0
|
stage=0
|
||||||
stop_stage=5
|
stop_stage=5
|
||||||
sampling_rate=24000
|
sampling_rate=24000
|
||||||
nj=32
|
nj=20
|
||||||
|
|
||||||
dl_dir=$PWD/download
|
dl_dir=$PWD/download
|
||||||
|
|
||||||
@ -44,44 +44,53 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
|
|||||||
# to $dl_dir/LibriTTS
|
# to $dl_dir/LibriTTS
|
||||||
mkdir -p data/manifests_libritts
|
mkdir -p data/manifests_libritts
|
||||||
if [ ! -e data/manifests_libritts/.libritts.done ]; then
|
if [ ! -e data/manifests_libritts/.libritts.done ]; then
|
||||||
lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests_libritts
|
lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
|
||||||
touch data/manifests_libritts/.libritts.done
|
touch data/manifests/.libritts.done
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
|
||||||
log "Stage 2: Compute Fbank for LibriTTS"
|
log "Stage 2: Compute Fbank for LibriTTS"
|
||||||
mkdir -p data/fbank
|
mkdir -p data/fbank
|
||||||
if [ ! -e data/fbank_libritts/.libritts.done ]; then
|
|
||||||
./local/compute_fbank_libritts.py --sampling-rate $sampling_rate
|
if [ ! -e data/fbank/.libritts.done ]; then
|
||||||
touch data/fbank_libritts/.libritts.done
|
for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
|
||||||
|
python local/compute_fbank.py \
|
||||||
|
--source-dir data/manifests \
|
||||||
|
--dest-dir data/fbank \
|
||||||
|
--dataset libritts \
|
||||||
|
--subset ${subset} \
|
||||||
|
--sampling-rate $sampling_rate \
|
||||||
|
--num-jobs ${nj}
|
||||||
|
done
|
||||||
|
touch data/fbank/.libritts.done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Here we shuffle and combine the train-clean-100, train-clean-360 and
|
# Here we shuffle and combine the train-clean-100, train-clean-360 and
|
||||||
# train-other-500 together to form the training set.
|
# train-other-500 together to form the training set.
|
||||||
if [ ! -f data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz ]; then
|
if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
|
||||||
cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \
|
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
|
||||||
<(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) \
|
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
|
||||||
<(gunzip -c data/fbank_libritts/libritts_cuts_train-other-500.jsonl.gz) | \
|
<(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
|
||||||
shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz
|
shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -f data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz ]; then
|
if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then
|
||||||
cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \
|
cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
|
||||||
<(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) | \
|
<(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \
|
||||||
shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz
|
shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -e data/fbank_libritts/.libritts-validated.done ]; then
|
if [ ! -e data/fbank/.libritts-validated.done ]; then
|
||||||
log "Validating data/fbank for LibriTTS"
|
log "Validating data/fbank for LibriTTS"
|
||||||
./local/validate_manifest.py \
|
./local/validate_manifest.py \
|
||||||
data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz
|
data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
|
||||||
touch data/fbank_libritts/.libritts-validated.done
|
touch data/fbank/.libritts-validated.done
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
|
||||||
log "Stage 4: Generate token file"
|
log "Stage 3: Generate token file"
|
||||||
if [ ! -e data/tokens_libritts.txt ]; then
|
if [ ! -e data/tokens_libritts.txt ]; then
|
||||||
./local/prepare_token_file_libritts.py --tokens data/tokens_libritts.txt
|
./local/prepare_token_file_libritts.py --tokens data/tokens_libritts.txt
|
||||||
fi
|
fi
|
||||||
|
|||||||
1
egs/zipvoice/shared
Symbolic link
1
egs/zipvoice/shared
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../icefall/shared
|
||||||
@ -31,7 +31,7 @@ python3 zipvoice/train_distill.py \
|
|||||||
--base-lr 0.0005 \
|
--base-lr 0.0005 \
|
||||||
--max-duration 500 \
|
--max-duration 500 \
|
||||||
--token-file "data/tokens_emilia.txt" \
|
--token-file "data/tokens_emilia.txt" \
|
||||||
--manifest-dir "data/fbank_emilia" \
|
--manifest-dir "data/fbank" \
|
||||||
--teacher-model zipvoice/exp_zipvoice/epoch-11-avg-4.pt \
|
--teacher-model zipvoice/exp_zipvoice/epoch-11-avg-4.pt \
|
||||||
--num-updates 60000 \
|
--num-updates 60000 \
|
||||||
--distill-stage "first" \
|
--distill-stage "first" \
|
||||||
@ -46,7 +46,7 @@ python3 zipvoice/train_distill.py \
|
|||||||
--base-lr 0.0001 \
|
--base-lr 0.0001 \
|
||||||
--max-duration 500 \
|
--max-duration 500 \
|
||||||
--token-file "data/tokens_emilia.txt" \
|
--token-file "data/tokens_emilia.txt" \
|
||||||
--manifest-dir "data/fbank_emilia" \
|
--manifest-dir "data/fbank" \
|
||||||
--teacher-model zipvoice/exp_zipvoice_distill_1stage/iter-60000-avg-7.pt \
|
--teacher-model zipvoice/exp_zipvoice_distill_1stage/iter-60000-avg-7.pt \
|
||||||
--num-updates 2000 \
|
--num-updates 2000 \
|
||||||
--distill-stage "second" \
|
--distill-stage "second" \
|
||||||
|
|||||||
@ -29,7 +29,7 @@ python3 zipvoice/train_flow.py \
|
|||||||
--lr-hours 30000 \
|
--lr-hours 30000 \
|
||||||
--lr-batches 7500 \
|
--lr-batches 7500 \
|
||||||
--token-file "data/tokens_emilia.txt" \
|
--token-file "data/tokens_emilia.txt" \
|
||||||
--manifest-dir "data/fbank_emilia" \
|
--manifest-dir "data/fbank" \
|
||||||
--num-epochs 11 \
|
--num-epochs 11 \
|
||||||
--exp-dir zipvoice/exp_zipvoice
|
--exp-dir zipvoice/exp_zipvoice
|
||||||
"""
|
"""
|
||||||
|
|||||||
@ -347,14 +347,14 @@ class TtsDataModule:
|
|||||||
train-clean-360 and train-other-500 cuts"
|
train-clean-360 and train-other-500 cuts"
|
||||||
)
|
)
|
||||||
return load_manifest_lazy(
|
return load_manifest_lazy(
|
||||||
self.args.manifest_dir / "libritts_cuts_with_tokens_train-all-shuf.jsonl.gz"
|
self.args.manifest_dir / "libritts_cuts_train-all-shuf.jsonl.gz"
|
||||||
)
|
)
|
||||||
|
|
||||||
@lru_cache()
|
@lru_cache()
|
||||||
def dev_libritts_cuts(self) -> CutSet:
|
def dev_libritts_cuts(self) -> CutSet:
|
||||||
logging.info("About to get dev-clean cuts")
|
logging.info("About to get dev-clean cuts")
|
||||||
return load_manifest_lazy(
|
return load_manifest_lazy(
|
||||||
self.args.manifest_dir / "libritts_cuts_with_tokens_dev-clean.jsonl.gz"
|
self.args.manifest_dir / "libritts_cuts_dev-clean.jsonl.gz"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user