diff --git a/egs/zipvoice/README.md b/egs/zipvoice/README.md index a80025564..27b30d611 100644 --- a/egs/zipvoice/README.md +++ b/egs/zipvoice/README.md @@ -57,21 +57,20 @@ To generate speech with our pre-trained ZipVoice or ZipVoice-Distill models, use ### 1. Inference of a single sentence: ```bash -# Chinese example python3 zipvoice/zipvoice_infer.py \ --model-name "zipvoice_distill" \ - --prompt-wav assets/prompt-zh.wav \ - --prompt-text "对,这就是我,万人敬仰的太乙真人,虽然有点婴儿肥,但也掩不住我逼人的帅气。" \ - --text "欢迎使用我们的语音合成模型,希望它能给你带来惊喜!" \ - --res-wav-path result-zh.wav + --prompt-wav prompt.wav \ + --prompt-text "I am the transcription of the prompt wav." \ + --text "I am the text to be synthesized." \ + --res-wav-path result.wav -# English example +# Example with a pre-defined prompt wav and text python3 zipvoice/zipvoice_infer.py \ --model-name "zipvoice_distill" \ --prompt-wav assets/prompt-en.wav \ --prompt-text "Some call me nature, others call me mother nature. I've been here for over four point five billion years, twenty two thousand five hundred times longer than you." \ --text "Welcome to use our tts model, have fun!" \ - --res-wav-path result-en.wav + --res-wav-path result.wav ``` ### 2. Inference of a list of sentences: @@ -95,13 +94,29 @@ export HF_ENDPOINT=https://hf-mirror.com The following steps show how to train a model from scratch on Emilia and LibriTTS datasets, respectively. +### 0. Install dependencies for training + +```bash +pip install -r ../../requirements.txt +``` + ### 1. Data Preparation #### 1.1. Prepare the Emilia dataset +```bash +bash scripts/prepare_emilia.sh --stage 0 --stop-stage 4 +``` + +See [scripts/prepare_emilia.sh](scripts/prepare_emilia.sh) for step by step instructions. + #### 1.2 Prepare the LibriTTS dataset -See [local/prepare_libritts.sh](local/prepare_libritts.sh) +```bash +bash scripts/prepare_libritts.sh --stage 0 --stop-stage 3 +``` + +See [scripts/prepare_libritts.sh](scripts/prepare_libritts.sh) for step by step instructions. ### 2. 
Training diff --git a/egs/zipvoice/assets/prompt-zh.wav b/egs/zipvoice/assets/prompt-zh.wav deleted file mode 100644 index af1366329..000000000 Binary files a/egs/zipvoice/assets/prompt-zh.wav and /dev/null differ diff --git a/egs/zipvoice/local/compute_fbank.py b/egs/zipvoice/local/compute_fbank.py new file mode 100644 index 000000000..09a154229 --- /dev/null +++ b/egs/zipvoice/local/compute_fbank.py @@ -0,0 +1,288 @@ +#!/usr/bin/env python3 +# Copyright 2025 Xiaomi Corp. (authors: Wei Kang) +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import logging +import os +from pathlib import Path +from typing import Optional +from concurrent.futures import ProcessPoolExecutor as Pool + +import torch +from lhotse import ( + CutSet, + LilcomChunkyWriter, + load_manifest_lazy, + set_audio_duration_mismatch_tolerance, +) + +from feature import TorchAudioFbank, TorchAudioFbankConfig +import lhotse + +# Torch's multithreaded behavior needs to be disabled or +# it wastes a lot of CPU and slow things down. +# Do this outside of main() in case it needs to take effect +# even when we are not invoking the main (e.g. when spawning subprocesses). 
+torch.set_num_threads(1) +torch.set_num_interop_threads(1) + + +def str2bool(v): + """Used in argparse.ArgumentParser.add_argument to indicate + that a type is a bool type and user can enter + + - yes, true, t, y, 1, to represent True + - no, false, f, n, 0, to represent False + + See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse # noqa + """ + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("Boolean value expected.") + + +def get_args(): + parser = argparse.ArgumentParser() + + parser.add_argument( + "--sampling-rate", + type=int, + default=24000, + help="The target sampling rate, the audio will be resampled to this sampling_rate.", + ) + + parser.add_argument( + "--frame-shift", + type=int, + default=256, + help="Frame shift in samples", + ) + + parser.add_argument( + "--frame-length", + type=int, + default=1024, + help="Frame length in samples", + ) + + parser.add_argument( + "--num-mel-bins", + type=int, + default=100, + help="The num of mel filters.", + ) + + parser.add_argument( + "--dataset", + type=str, + help="Dataset name.", + ) + + parser.add_argument( + "--subset", + type=str, + help="The subset of the dataset.", + ) + + parser.add_argument( + "--source-dir", + type=str, + default="data/manifests", + help="The source directory of manifest files.", + ) + + parser.add_argument( + "--dest-dir", + type=str, + default="data/fbank", + help="The destination directory of manifest files.", + ) + + parser.add_argument( + "--split-cuts", + type=str2bool, + default=False, + help="Whether to use splited cuts.", + ) + + parser.add_argument( + "--split-begin", + type=int, + help="Start idx of splited cuts.", + ) + + parser.add_argument( + "--split-end", + type=int, + help="End idx of splited cuts.", + ) + + parser.add_argument( + "--batch-duration", + type=int, + 
default=1000, + help="The batch duration when computing the features.", + ) + + parser.add_argument( + "--num-jobs", type=int, default=20, help="The number of extractor workers." + ) + + return parser.parse_args() + + +def compute_fbank_split_single(params, idx): + lhotse.set_audio_duration_mismatch_tolerance(0.1) # for emilia + src_dir = Path(params.source_dir) + output_dir = Path(params.dest_dir) + num_mel_bins = params.num_mel_bins + + if not src_dir.exists(): + logging.error(f"{src_dir} not exists") + return + + if not output_dir.exists(): + output_dir.mkdir(parents=True, exist_ok=True) + + num_digits = 8 + + config = TorchAudioFbankConfig( + sampling_rate=params.sampling_rate, + n_mels=params.num_mel_bins, + n_fft=params.frame_length, + hop_length=params.frame_shift, + ) + extractor = TorchAudioFbank(config) + + prefix = params.dataset + subset = params.subset + suffix = "jsonl.gz" + + idx = f"{idx}".zfill(num_digits) + cuts_filename = f"{prefix}_cuts_{subset}.{idx}.{suffix}" + + if (src_dir / cuts_filename).is_file(): + logging.info(f"Loading manifests {src_dir / cuts_filename}") + cut_set = load_manifest_lazy(src_dir / cuts_filename) + else: + logging.warning(f"Raw {cuts_filename} not exists, skipping") + return + + cut_set = cut_set.resample(params.sampling_rate) + + if (output_dir / cuts_filename).is_file(): + logging.info(f"{cuts_filename} already exists - skipping.") + return + + logging.info(f"Processing {subset}.{idx} of {prefix}") + + cut_set = cut_set.compute_and_store_features_batch( + extractor=extractor, + storage_path=f"{output_dir}/{prefix}_feats_{subset}_{idx}", + num_workers=4, + batch_duration=params.batch_duration, + storage_type=LilcomChunkyWriter, + overwrite=True, + ) + cut_set.to_file(output_dir / cuts_filename) + + +def compute_fbank_split(params): + if params.split_end < params.split_begin: + logging.warning( + f"Split begin should be smaller than split end, given " + f"{params.split_begin} -> {params.split_end}." 
+ ) + + with Pool(max_workers=params.num_jobs) as pool: + futures = [ + pool.submit(compute_fbank_split_single, params, i) + for i in range(params.split_begin, params.split_end) + ] + for f in futures: + f.result() + f.done() + + +def compute_fbank(params): + src_dir = Path(params.source_dir) + output_dir = Path(params.dest_dir) + num_jobs = params.num_jobs + num_mel_bins = params.num_mel_bins + + prefix = params.dataset + subset = params.subset + suffix = "jsonl.gz" + + cut_set_name = f"{prefix}_cuts_{subset}.{suffix}" + + if (src_dir / cut_set_name).is_file(): + logging.info(f"Loading manifests {src_dir / cut_set_name}") + cut_set = load_manifest_lazy(src_dir / cut_set_name) + else: + recordings = load_manifest_lazy( + src_dir / f"{prefix}_recordings_{subset}.{suffix}" + ) + supervisions = load_manifest_lazy( + src_dir / f"{prefix}_supervisions_{subset}.{suffix}" + ) + cut_set = CutSet.from_manifests( + recordings=recordings, + supervisions=supervisions, + ) + + cut_set = cut_set.resample(params.sampling_rate) + + config = TorchAudioFbankConfig( + sampling_rate=params.sampling_rate, + n_mels=params.num_mel_bins, + n_fft=params.frame_length, + hop_length=params.frame_shift, + ) + extractor = TorchAudioFbank(config) + + cuts_filename = f"{prefix}_cuts_{subset}.{suffix}" + if (output_dir / cuts_filename).is_file(): + logging.info(f"{prefix} {subset} already exists - skipping.") + return + logging.info(f"Processing {subset} of {prefix}") + + cut_set = cut_set.compute_and_store_features( + extractor=extractor, + storage_path=f"{output_dir}/{prefix}_feats_{subset}", + num_jobs=num_jobs, + storage_type=LilcomChunkyWriter, + ) + cut_set.to_file(output_dir / cuts_filename) + + +if __name__ == "__main__": + formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" + + logging.basicConfig(format=formatter, level=logging.INFO) + args = get_args() + logging.info(vars(args)) + if args.split_cuts: + compute_fbank_split(params=args) + else: + 
compute_fbank(params=args) diff --git a/egs/zipvoice/local/compute_fbank_libritts.py b/egs/zipvoice/local/compute_fbank_libritts.py deleted file mode 100644 index 0c9f464ea..000000000 --- a/egs/zipvoice/local/compute_fbank_libritts.py +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang, -# Zengwei Yao,) -# 2024 The Chinese Univ. of HK (authors: Zengrui Jin) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -""" -This file computes fbank features of the LibriTTS dataset. -It looks for manifests in the directory data/manifests. - -The generated fbank features are saved in data/fbank. -""" - -import argparse -import logging -import os -from pathlib import Path -from typing import Optional - -import torch -from feature import TorchAudioFbank, TorchAudioFbankConfig -from lhotse import CutSet, LilcomChunkyWriter -from lhotse.recipes.utils import read_manifests_if_cached - -from icefall.utils import get_executor - -# Torch's multithreaded behavior needs to be disabled or -# it wastes a lot of CPU and slow things down. -# Do this outside of main() in case it needs to take effect -# even when we are not invoking the main (e.g. when spawning subprocesses). 
-torch.set_num_threads(1) -torch.set_num_interop_threads(1) - - -def get_args(): - parser = argparse.ArgumentParser() - - parser.add_argument( - "--dataset", - type=str, - help="""Dataset parts to compute fbank. If None, we will use all""", - ) - parser.add_argument( - "--sampling-rate", - type=int, - default=24000, - help="""Sampling rate of the waveform for computing fbank, - the default value for LibriTTS is 24000, waveform files will be - resampled if a different sample rate is provided""", - ) - - return parser.parse_args() - - -def compute_fbank_libritts(dataset: Optional[str] = None, sampling_rate: int = 24000): - src_dir = Path("data/manifests_libritts") - output_dir = Path("data/fbank_libritts") - num_jobs = min(32, os.cpu_count()) - - prefix = "libritts" - suffix = "jsonl.gz" - if dataset is None: - dataset_parts = ( - "dev-clean", - "test-clean", - "train-clean-100", - "train-clean-360", - "train-other-500", - ) - else: - dataset_parts = dataset.split(" ", -1) - - manifests = read_manifests_if_cached( - dataset_parts=dataset_parts, - output_dir=src_dir, - prefix=prefix, - suffix=suffix, - ) - assert manifests is not None - - assert len(manifests) == len(dataset_parts), ( - len(manifests), - len(dataset_parts), - list(manifests.keys()), - dataset_parts, - ) - - config = TorchAudioFbankConfig( - sampling_rate=sampling_rate, - n_mels=100, - n_fft=1024, - hop_length=256, - ) - extractor = TorchAudioFbank(config) - - with get_executor() as ex: # Initialize the executor only once. 
- for partition, m in manifests.items(): - cuts_filename = f"{prefix}_cuts_{partition}.{suffix}" - if (output_dir / cuts_filename).is_file(): - logging.info(f"{partition} already exists - skipping.") - return - logging.info(f"Processing {partition}") - cut_set = CutSet.from_manifests( - recordings=m["recordings"], - supervisions=m["supervisions"], - ) - if sampling_rate != 24000: - logging.info(f"Resampling waveforms to {sampling_rate}") - cut_set = cut_set.resample(sampling_rate) - - cut_set = cut_set.compute_and_store_features( - extractor=extractor, - storage_path=f"{output_dir}/{prefix}_feats_{partition}", - # when an executor is specified, make more partitions - num_jobs=num_jobs if ex is None else 80, - executor=ex, - storage_type=LilcomChunkyWriter, - ) - cut_set.to_file(output_dir / cuts_filename) - - -if __name__ == "__main__": - formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" - - logging.basicConfig(format=formatter, level=logging.INFO) - compute_fbank_libritts() diff --git a/egs/zipvoice/local/feature.py b/egs/zipvoice/local/feature.py deleted file mode 100644 index e7d484d10..000000000 --- a/egs/zipvoice/local/feature.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2024 Xiaomi Corp. (authors: Han Zhu) -# -# See ../../../../LICENSE for clarification regarding multiple authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dataclasses import dataclass -from typing import Union - -import numpy as np -import torch -import torch.nn as nn -import torchaudio -from lhotse.features.base import FeatureExtractor, register_extractor -from lhotse.utils import Seconds, compute_num_frames - - -class MelSpectrogramFeatures(nn.Module): - def __init__( - self, - sampling_rate=24000, - n_mels=100, - n_fft=1024, - hop_length=256, - ): - super().__init__() - - self.mel_spec = torchaudio.transforms.MelSpectrogram( - sample_rate=sampling_rate, - n_fft=n_fft, - hop_length=hop_length, - n_mels=n_mels, - center=True, - power=1, - ) - - def forward(self, inp): - assert len(inp.shape) == 2 - - mel = self.mel_spec(inp) - logmel = mel.clamp(min=1e-7).log() - return logmel - - -@dataclass -class TorchAudioFbankConfig: - sampling_rate: int - n_mels: int - n_fft: int - hop_length: int - - -@register_extractor -class TorchAudioFbank(FeatureExtractor): - - name = "TorchAudioFbank" - config_type = TorchAudioFbankConfig - - def __init__(self, config): - super().__init__(config=config) - - def _feature_fn(self, sample): - fbank = MelSpectrogramFeatures( - sampling_rate=self.config.sampling_rate, - n_mels=self.config.n_mels, - n_fft=self.config.n_fft, - hop_length=self.config.hop_length, - ) - - return fbank(sample) - - @property - def device(self) -> Union[str, torch.device]: - return self.config.device - - def feature_dim(self, sampling_rate: int) -> int: - return self.config.n_mels - - def extract( - self, - samples: Union[np.ndarray, torch.Tensor], - sampling_rate: int, - ) -> Union[np.ndarray, torch.Tensor]: - # Check for sampling rate compatibility. 
- expected_sr = self.config.sampling_rate - assert sampling_rate == expected_sr, ( - f"Mismatched sampling rate: extractor expects {expected_sr}, " - f"got {sampling_rate}" - ) - is_numpy = False - if not isinstance(samples, torch.Tensor): - samples = torch.from_numpy(samples) - is_numpy = True - - if len(samples.shape) == 1: - samples = samples.unsqueeze(0) - assert samples.ndim == 2, samples.shape - assert samples.shape[0] == 1, samples.shape - - mel = self._feature_fn(samples).squeeze().t() - - assert mel.ndim == 2, mel.shape - assert mel.shape[1] == self.config.n_mels, mel.shape - - num_frames = compute_num_frames( - samples.shape[1] / sampling_rate, self.frame_shift, sampling_rate - ) - - if mel.shape[0] > num_frames: - mel = mel[:num_frames] - elif mel.shape[0] < num_frames: - mel = mel.unsqueeze(0) - mel = torch.nn.functional.pad( - mel, (0, 0, 0, num_frames - mel.shape[1]), mode="replicate" - ).squeeze(0) - - if is_numpy: - return mel.cpu().numpy() - else: - return mel - - @property - def frame_shift(self) -> Seconds: - return self.config.hop_length / self.config.sampling_rate diff --git a/egs/zipvoice/local/feature.py b/egs/zipvoice/local/feature.py new file mode 120000 index 000000000..08ef7d228 --- /dev/null +++ b/egs/zipvoice/local/feature.py @@ -0,0 +1 @@ +../zipvoice/feature.py \ No newline at end of file diff --git a/egs/zipvoice/local/prepare_tokens_emilia.py b/egs/zipvoice/local/preprocess_emilia.py similarity index 62% rename from egs/zipvoice/local/prepare_tokens_emilia.py rename to egs/zipvoice/local/preprocess_emilia.py index 023d57524..bd76446aa 100644 --- a/egs/zipvoice/local/prepare_tokens_emilia.py +++ b/egs/zipvoice/local/preprocess_emilia.py @@ -20,20 +20,26 @@ """ -This file reads the texts in given manifest and save the new cuts with phoneme tokens. +This file reads the texts in given manifest and save the cleaned new cuts. 
""" import argparse -import glob import logging -import re -from concurrent.futures import ProcessPoolExecutor as Pool +import glob +import os from pathlib import Path from typing import List -import jieba -from lhotse import load_manifest_lazy -from tokenizer import Tokenizer, is_alphabet, is_chinese, is_hangul, is_japanese +from lhotse import CutSet, load_manifest_lazy +from concurrent.futures import ProcessPoolExecutor as Pool + +from tokenizer import ( + is_alphabet, + is_chinese, + is_hangul, + is_japanese, + tokenize_by_CJK_char, +) def get_args(): @@ -48,71 +54,32 @@ def get_args(): parser.add_argument( "--jobs", type=int, - default=50, + default=20, help="Number of jobs to processing.", ) parser.add_argument( "--source-dir", type=str, - default="data/manifests_emilia/splits", + default="data/manifests/splits_raw", help="The source directory of manifest files.", ) parser.add_argument( "--dest-dir", type=str, + default="data/manifests/splits", help="The destination directory of manifest files.", ) return parser.parse_args() -def tokenize_by_CJK_char(line: str) -> List[str]: - """ - Tokenize a line of text with CJK char. - - Note: All return characters will be upper case. - - Example: - input = "你好世界是 hello world 的中文" - output = [你, 好, 世, 界, 是, HELLO, WORLD, 的, 中, 文] - - Args: - line: - The input text. - - Return: - A new string tokenize by CJK char. 
- """ - # The CJK ranges is from https://github.com/alvations/nltk/blob/79eed6ddea0d0a2c212c1060b477fc268fec4d4b/nltk/tokenize/util.py - pattern = re.compile( - r"([\u1100-\u11ff\u2e80-\ua4cf\ua840-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F\uFF65-\uFFDC\U00020000-\U0002FFFF])" - ) - chars = pattern.split(line.strip().upper()) - char_list = [] - for w in chars: - if w.strip(): - char_list += w.strip().split() - return char_list - - -def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path): +def preprocess_emilia(file_name: str, input_dir: Path, output_dir: Path): logging.info(f"Processing {file_name}") if (output_dir / file_name).is_file(): logging.info(f"{file_name} exists, skipping.") return - jieba.setLogLevel(logging.INFO) - tokenizer = Tokenizer() - - def _prepare_cut(cut): - # Each cut only contains one supervision - assert len(cut.supervisions) == 1, (len(cut.supervisions), cut) - text = cut.supervisions[0].text - cut.supervisions[0].normalized_text = text - tokens = tokenizer.texts_to_tokens([text])[0] - cut.tokens = tokens - return cut def _filter_cut(cut): text = cut.supervisions[0].text @@ -124,10 +91,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path): clean_chars = [] for x in text: if is_hangul(x): - logging.info(f"Delete cut with text containing Korean : {text}") + logging.warning(f"Delete cut with text containing Korean : {text}") return False if is_japanese(x): - logging.info(f"Delete cut with text containing Japanese : {text}") + logging.warning(f"Delete cut with text containing Japanese : {text}") return False if is_chinese(x): chinese.append(x) @@ -138,18 +105,19 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path): if x == " ": clean_chars.append(x) if len(english) + len(chinese) == 0: - logging.info(f"Delete cut with text has no valid chars : {text}") + logging.warning(f"Delete cut with text has no valid chars : {text}") return False words = 
tokenize_by_CJK_char("".join(clean_chars)) for i in range(len(words) - 10): if words[i : i + 10].count(words[i]) == 10: - logging.info(f"Delete cut with text with too much repeats : {text}") + logging.warning(f"Delete cut with text with too much repeats : {text}") return False # word speed, 20 - 600 / minute if duration < len(words) / 600 * 60 or duration > len(words) / 20 * 60: - logging.info( - f"Delete cut with audio text mismatch, duration : {duration}s, words : {len(words)}, text : {text}" + logging.warning( + f"Delete cut with audio text mismatch, duration : {duration}s, " + f"words : {len(words)}, text : {text}" ) return False return True @@ -157,11 +125,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path): try: cut_set = load_manifest_lazy(input_dir / file_name) cut_set = cut_set.filter(_filter_cut) - cut_set = cut_set.map(_prepare_cut) cut_set.to_file(output_dir / file_name) except Exception as e: logging.error(f"Manifest {file_name} failed with error: {e}") - raise + os.remove(str(output_dir / file_name)) if __name__ == "__main__": @@ -179,14 +146,11 @@ if __name__ == "__main__": with Pool(max_workers=args.jobs) as pool: futures = [ pool.submit( - prepare_tokens_emilia, filename.split("/")[-1], input_dir, output_dir + preprocess_emilia, filename.split("/")[-1], input_dir, output_dir ) for filename in cut_files ] for f in futures: - try: - f.result() - f.done() - except Exception as e: - logging.error(f"Future failed with error: {e}") + f.result() + f.done() logging.info("Processing done.") diff --git a/egs/zipvoice/local/tokenizer.py b/egs/zipvoice/local/tokenizer.py new file mode 120000 index 000000000..024e340cc --- /dev/null +++ b/egs/zipvoice/local/tokenizer.py @@ -0,0 +1 @@ +../zipvoice/tokenizer.py \ No newline at end of file diff --git a/egs/zipvoice/scripts/prepare.sh b/egs/zipvoice/scripts/prepare.sh deleted file mode 100755 index 011301f2d..000000000 --- a/egs/zipvoice/scripts/prepare.sh +++ /dev/null @@ -1,232 +0,0 
@@ -#!/usr/bin/env bash - -# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674 -export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - -# add icefall to PYTHONPATH -export PYTHONPATH=../../../:$PYTHONPATH - -set -eou pipefail - -stage=0 -stop_stage=100 - -token_type=bpe # bpe, letter, phone -bpe_vocab_size=500 - -nj=32 - -dl_dir=$PWD/download - -. shared/parse_options.sh || exit 1 - -# All files generated by this script are saved in "data". -# You can safely remove "data" and rerun this script to regenerate it. -mkdir -p data - -log() { - # This function is from espnet - local fname=${BASH_SOURCE[1]##*/} - echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" -} - -log "dl_dir: $dl_dir" - - -if [ $stage -le -2 ] && [ $stop_stage -ge -2 ]; then - - if [ ! -d $dl_dir/xvector_nnet_1a_libritts_clean_460 ]; then - log "Downloading x-vector" - - git clone https://huggingface.co/datasets/zrjin/xvector_nnet_1a_libritts_clean_460 $dl_dir/xvector_nnet_1a_libritts_clean_460 - - mkdir -p exp/xvector_nnet_1a/ - cp -r $dl_dir/xvector_nnet_1a_libritts_clean_460/* exp/xvector_nnet_1a/ - fi - -fi - -if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then - log "Stage -1: build monotonic_align lib" - if [ ! -d vits/monotonic_align/build ]; then - cd vits/monotonic_align - python setup.py build_ext --inplace - cd ../../ - else - log "monotonic_align lib already built" - fi -fi - -if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then - log "Stage 0: Download data" - - # If you have pre-downloaded it to /path/to/LibriTTS, - # you can create a symlink - # - # ln -sfv /path/to/LibriTTS $dl_dir/LibriTTS - # - if [ ! -d $dl_dir/LibriTTS ]; then - lhotse download libritts $dl_dir - fi - -fi - -if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then - log "Stage 1: Prepare LibriTTS manifest" - # We assume that you have downloaded the LibriTTS corpus - # to $dl_dir/LibriTTS - mkdir -p data/manifests - if [ ! 
-e data/manifests/.libritts.done ]; then - lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests - touch data/manifests/.libritts.done - fi -fi - -if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then - log "Stage 2: Compute Fbank for LibriTTS" - mkdir -p data/fbank - - for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do - python local/compute_fbank.py --dataset libritts --subset ${subset} - done - - # Here we shuffle and combine the train-clean-100, train-clean-360 and - # train-other-500 together to form the training set. - if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then - cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \ - <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \ - <(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \ - shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz - fi - - if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then - cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \ - <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \ - shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz - fi - - if [ ! -e data/fbank/.libritts-validated.done ]; then - log "Validating data/fbank for LibriTTS" - ./local/validate_manifest.py \ - data/fbank/libritts_cuts_train-all-shuf.jsonl.gz - touch data/fbank/.libritts-validated.done - fi -fi - -if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 3: Prepare tokens.txt" - - if [ $token_type == "bpe" ] || [ $token_type == "letter" ]; then - if [ ! -e data/texts.txt ]; then - ./local/export_normalized_texts.py --output data/texts.txt \ - --manifests data/fbank/libritts_cuts_train-all-shuf.jsonl.gz - fi - fi - - if [ $token_type == "bpe" ]; then - mkdir -p data/lang_bpe_${bpe_vocab_size} - if [ ! 
-e data/lang_bpe_${bpe_vocab_size}/tokens.txt ]; then - ./local/train_bpe_model.py --transcript data/texts.txt \ - --lang-dir data/lang_bpe_${bpe_vocab_size} \ - --vocab-size $bpe_vocab_size - fi - fi - - if [ $token_type == "phone" ]; then - mkdir -p data/lang_phone - ./local/export_tokens.py --token-type phone \ - --output data/lang_phone/tokens.txt - fi - - if [ $token_type == "letter" ]; then - mkdir -p data/lang_letter - ./local/export_tokens.py --token-type letter \ - --texts data/texts.txt \ - --output data/lang_letter/tokens.txt - fi -fi - -if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then - log "Stage 4: Download and prepare librispeech-pc test clean for testing." - - if [ ! -e $dl_dir/test-clean.tar.gz ]; then - wget https://huggingface.co/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir - fi - # For China users. - if [ ! -e $dl_dir/test-clean.tar.gz ]; then - wget https://hf-mirror.com/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir - fi - - if [ ! -d $dl_dir/LibriSpeech/test-clean ]; then - tar -xvf $dl_dir/test-clean.tar.gz -C $dl_dir - fi - - mkdir -p $dl_dir/LibriSpeech-PC - if [ ! -e $dl_dir/LibriSpeech-PC/test-clean.json ]; then - wget https://us.openslr.org/resources/145/manifests.tar.gz -P $dl_dir/LibriSpeech-PC - tar -xvf $dl_dir/LibriSpeech-PC/manifests.tar.gz -C $dl_dir/LibriSpeech-PC - fi - - python local/compute_fbank.py --dataset librispeech --subset test-clean - python local/prepare_prompts_librispeech_test_clean.py -fi - -if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then - log "Stage 5: Compute Spectrogram for LibriTTS (for VITS system)" - mkdir -p data/spectrogram - if [ ! -e data/spectrogram/.libritts.done ]; then - ./local/compute_spectrogram_libritts.py --sampling-rate $sampling_rate - touch data/spectrogram/.libritts.done - fi - - # Here we shuffle and combine the train-clean-100, train-clean-360 and - # train-other-500 together to form the training set. - if [ ! 
-f data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz ]; then - cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \ - <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) \ - <(gunzip -c data/spectrogram/libritts_cuts_train-other-500.jsonl.gz) | \ - shuf | gzip -c > data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz - fi - - # Here we shuffle and combine the train-clean-100, train-clean-360 - # together to form the training set. - if [ ! -f data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz ]; then - cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \ - <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) | \ - shuf | gzip -c > data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz - fi - - if [ ! -e data/spectrogram/.libritts-validated.done ]; then - log "Validating data/spectrogram for LibriTTS" - ./local/validate_manifest.py \ - data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz - touch data/spectrogram/.libritts-validated.done - fi -fi - -audio_feats_dir=data/tokenized -dataset_parts="--dataset-parts all" # debug "-p dev-clean -p test-clean" -if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then - log "Stage 6: Tokenize/Fbank LibriTTS for valle" - mkdir -p ${audio_feats_dir} - if [ ! 
-e ${audio_feats_dir}/.libritts.tokenize.done ]; then - python3 ./local/compute_neural_codec_and_prepare_text_tokens.py --dataset-parts "${dataset_parts}" \ - --audio-extractor "Encodec" \ - --batch-duration 400 \ - --src-dir "data/manifests" \ - --output-dir "${audio_feats_dir}" - fi - touch ${audio_feats_dir}/.libritts.tokenize.done - - lhotse combine \ - ${audio_feats_dir}/libritts_cuts_train-clean-100.jsonl.gz \ - ${audio_feats_dir}/libritts_cuts_train-clean-360.jsonl.gz \ - ${audio_feats_dir}/libritts_cuts_train-other-500.jsonl.gz \ - ${audio_feats_dir}/cuts_train.jsonl.gz - lhotse copy \ - ${audio_feats_dir}/libritts_cuts_dev-clean.jsonl.gz \ - ${audio_feats_dir}/cuts_dev.jsonl.gz - lhotse copy \ - ${audio_feats_dir}/libritts_cuts_test-clean.jsonl.gz \ - ${audio_feats_dir}/cuts_test.jsonl.gz -fi diff --git a/egs/zipvoice/scripts/prepare_emilia.sh b/egs/zipvoice/scripts/prepare_emilia.sh new file mode 100755 index 000000000..bf19ed1a5 --- /dev/null +++ b/egs/zipvoice/scripts/prepare_emilia.sh @@ -0,0 +1,126 @@ +#!/usr/bin/env bash + +# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674 +export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python + +set -eou pipefail + +stage=0 +stop_stage=5 +sampling_rate=24000 +nj=32 + +dl_dir=$PWD/download + +# All files generated by this script are saved in "data". +# You can safely remove "data" and rerun this script to regenerate it. 
+mkdir -p data + +log() { + # This function is from espnet + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} + +log "dl_dir: $dl_dir" + +if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then + log "Stage 0: Download data" + + # Your download directory should look like this: + # + # download/Amphion___Emilia + # ├── metafile.yaml + # ├── raw + # │ ├── DE + # │ ├── EN + # │ ├── FR + # │ ├── JA + # │ ├── KO + # │ ├── openemilia_45batches.tar.gz + # │ ├── openemilia_all.tar.gz + # │ └── ZH + # └── README.md + + if [ ! -d $dl_dir/Amphion___Emilia/raw ]; then + log "Please refer to https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset." + exit 1 + fi + +fi + +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then + log "Stage 1: Prepare emilia manifests (EN and ZH only)" + # We assume that you have downloaded the Emilia corpus + # to $dl_dir/Amphion___Emilia + # see stage 0 for the directory structure + mkdir -p data/manifests + if [ ! -e data/manifests/.emilia.done ]; then + lhotse prepare emilia --lang en --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests + lhotse prepare emilia --lang zh --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests + touch data/manifests/.emilia.done + fi +fi + +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then + log "Stage 2: Preprocess Emilia dataset, mainly for cleaning" + mkdir -p data/manifests/splits_raw + if [ ! -e data/manifests/splits_raw/.emilia.split.done ]; then + lhotse split-lazy data/manifests/emilia_cuts_EN.jsonl.gz data/manifests/splits_raw 10000 + lhotse split-lazy data/manifests/emilia_cuts_ZH.jsonl.gz data/manifests/splits_raw 10000 + touch data/manifests/splits_raw/.emilia.split.done + fi + + mkdir -p data/manifests/splits + + if [ !
-e data/manifests/splits/.emilia.preprocess.done ]; then + python local/preprocess_emilia.py --subset EN + python local/preprocess_emilia.py --subset ZH + touch data/manifests/splits/.emilia.preprocess.done + fi + +fi + +if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then + log "Stage 3: Extract Fbank for Emilia" + mkdir -p data/fbank/emilia_splits + if [ ! -e data/fbank/emilia_splits/.emilia.fbank.done ]; then + # You can speed up the extraction by distributing splits to multiple machines. + for subset in EN ZH; do + python local/compute_fbank.py \ + --source-dir data/manifests/splits \ + --dest-dir data/fbank/emilia_splits \ + --dataset emilia \ + --subset ${subset} \ + --splits-cuts 1 \ + --split-begin 0 \ + --split-end 2000 \ + --num-jobs ${nj} + done + touch data/fbank/emilia_splits/.emilia.fbank.done + fi + + if [ ! -e data/fbank/emilia_cuts_EN.jsonl.gz ]; then + log "Combining EN fbank cuts and splitting EN dev set" + gunzip -c data/fbank/emilia_splits/emilia_cuts_EN.*.jsonl.gz > data/fbank/emilia_cuts_EN.jsonl + head -n 1500 data/fbank/emilia_cuts_EN.jsonl | gzip -c > data/fbank/emilia_cuts_EN_dev.jsonl.gz + sed -i '1,1500d' data/fbank/emilia_cuts_EN.jsonl + gzip data/fbank/emilia_cuts_EN.jsonl + fi + + if [ ! -e data/fbank/emilia_cuts_ZH.jsonl.gz ]; then + log "Combining ZH fbank cuts and splitting ZH dev set" + gunzip -c data/fbank/emilia_splits/emilia_cuts_ZH.*.jsonl.gz > data/fbank/emilia_cuts_ZH.jsonl + head -n 1500 data/fbank/emilia_cuts_ZH.jsonl | gzip -c > data/fbank/emilia_cuts_ZH_dev.jsonl.gz + sed -i '1,1500d' data/fbank/emilia_cuts_ZH.jsonl + gzip data/fbank/emilia_cuts_ZH.jsonl + fi + +fi + +if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then + log "Stage 4: Generate token file" + if [ !
-e data/tokens_emilia.txt ]; then + ./local/prepare_token_file_emilia.py --tokens data/tokens_emilia.txt + fi +fi diff --git a/egs/zipvoice/scripts/prepare_libritts.sh b/egs/zipvoice/scripts/prepare_libritts.sh index da35eec7b..6d643145e 100755 --- a/egs/zipvoice/scripts/prepare_libritts.sh +++ b/egs/zipvoice/scripts/prepare_libritts.sh @@ -8,7 +8,7 @@ set -eou pipefail stage=0 stop_stage=5 sampling_rate=24000 -nj=32 +nj=20 dl_dir=$PWD/download @@ -44,44 +44,53 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then # to $dl_dir/LibriTTS - mkdir -p data/manifests_libritts - if [ ! -e data/manifests_libritts/.libritts.done ]; then - lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests_libritts - touch data/manifests_libritts/.libritts.done + mkdir -p data/manifests + if [ ! -e data/manifests/.libritts.done ]; then + lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests + touch data/manifests/.libritts.done fi fi if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then log "Stage 2: Compute Fbank for LibriTTS" mkdir -p data/fbank - if [ ! -e data/fbank_libritts/.libritts.done ]; then - ./local/compute_fbank_libritts.py --sampling-rate $sampling_rate - touch data/fbank_libritts/.libritts.done + + if [ ! -e data/fbank/.libritts.done ]; then + for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do + python local/compute_fbank.py \ + --source-dir data/manifests \ + --dest-dir data/fbank \ + --dataset libritts \ + --subset ${subset} \ + --sampling-rate $sampling_rate \ + --num-jobs ${nj} + done + touch data/fbank/.libritts.done fi # Here we shuffle and combine the train-clean-100, train-clean-360 and # train-other-500 together to form the training set. - if [ !
-f data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz ]; then - cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \ - <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) \ - <(gunzip -c data/fbank_libritts/libritts_cuts_train-other-500.jsonl.gz) | \ - shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz + if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then + cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \ + <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \ + <(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \ + shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz fi - if [ ! -f data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz ]; then - cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \ - <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) | \ - shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz + if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then + cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \ + <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \ + shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz fi - if [ ! -e data/fbank_libritts/.libritts-validated.done ]; then + if [ ! -e data/fbank/.libritts-validated.done ]; then log "Validating data/fbank for LibriTTS" ./local/validate_manifest.py \ - data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz - touch data/fbank_libritts/.libritts-validated.done + data/fbank/libritts_cuts_train-all-shuf.jsonl.gz + touch data/fbank/.libritts-validated.done fi fi if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then - log "Stage 4: Generate token file" + log "Stage 3: Generate token file" if [ ! 
-e data/tokens_libritts.txt ]; then ./local/prepare_token_file_libritts.py --tokens data/tokens_libritts.txt fi diff --git a/egs/zipvoice/shared b/egs/zipvoice/shared new file mode 120000 index 000000000..ee1982eba --- /dev/null +++ b/egs/zipvoice/shared @@ -0,0 +1 @@ +../../icefall/shared \ No newline at end of file diff --git a/egs/zipvoice/zipvoice/train_distill.py b/egs/zipvoice/zipvoice/train_distill.py index ae784050b..9e52a3790 100644 --- a/egs/zipvoice/zipvoice/train_distill.py +++ b/egs/zipvoice/zipvoice/train_distill.py @@ -31,7 +31,7 @@ python3 zipvoice/train_distill.py \ --base-lr 0.0005 \ --max-duration 500 \ --token-file "data/tokens_emilia.txt" \ - --manifest-dir "data/fbank_emilia" \ + --manifest-dir "data/fbank" \ --teacher-model zipvoice/exp_zipvoice/epoch-11-avg-4.pt \ --num-updates 60000 \ --distill-stage "first" \ @@ -46,7 +46,7 @@ python3 zipvoice/train_distill.py \ --base-lr 0.0001 \ --max-duration 500 \ --token-file "data/tokens_emilia.txt" \ - --manifest-dir "data/fbank_emilia" \ + --manifest-dir "data/fbank" \ --teacher-model zipvoice/exp_zipvoice_distill_1stage/iter-60000-avg-7.pt \ --num-updates 2000 \ --distill-stage "second" \ diff --git a/egs/zipvoice/zipvoice/train_flow.py b/egs/zipvoice/zipvoice/train_flow.py index 74d81b726..0bf023273 100644 --- a/egs/zipvoice/zipvoice/train_flow.py +++ b/egs/zipvoice/zipvoice/train_flow.py @@ -29,7 +29,7 @@ python3 zipvoice/train_flow.py \ --lr-hours 30000 \ --lr-batches 7500 \ --token-file "data/tokens_emilia.txt" \ - --manifest-dir "data/fbank_emilia" \ + --manifest-dir "data/fbank" \ --num-epochs 11 \ --exp-dir zipvoice/exp_zipvoice """ diff --git a/egs/zipvoice/zipvoice/tts_datamodule.py b/egs/zipvoice/zipvoice/tts_datamodule.py index e8ea7a4eb..972c700f7 100644 --- a/egs/zipvoice/zipvoice/tts_datamodule.py +++ b/egs/zipvoice/zipvoice/tts_datamodule.py @@ -347,14 +347,14 @@ class TtsDataModule: train-clean-360 and train-other-500 cuts" ) return load_manifest_lazy( - self.args.manifest_dir / 
"libritts_cuts_with_tokens_train-all-shuf.jsonl.gz" + self.args.manifest_dir / "libritts_cuts_train-all-shuf.jsonl.gz" ) @lru_cache() def dev_libritts_cuts(self) -> CutSet: logging.info("About to get dev-clean cuts") return load_manifest_lazy( - self.args.manifest_dir / "libritts_cuts_with_tokens_dev-clean.jsonl.gz" + self.args.manifest_dir / "libritts_cuts_dev-clean.jsonl.gz" )