add emilia data preparation pipeline

2025-12-09 14:05:33 +00:00 · 2025-06-17 19:38:46 +08:00 · 2025-06-17 19:38:46 +08:00 · 2376ed2117
commit 2376ed2117
parent 60572c2444
14 changed files with 501 additions and 603 deletions
--- a/egs/zipvoice/README.md
+++ b/egs/zipvoice/README.md
@ -57,21 +57,20 @@ To generate speech with our pre-trained ZipVoice or ZipVoice-Distill models, use
 ### 1. Inference of a single sentence:
 ```bash
 # Chinese example
 python3 zipvoice/zipvoice_infer.py \
    --model-name "zipvoice_distill" \
-    --prompt-wav assets/prompt-zh.wav \
+    --prompt-wav prompt.wav \
-    --prompt-text "对，这就是我，万人敬仰的太乙真人，虽然有点婴儿肥，但也掩不住我逼人的帅气。" \
+    --prompt-text "I am the transcription of the prompt wav." \
-    --text "欢迎使用我们的语音合成模型，希望它能给你带来惊喜！" \
+    --text "I am the text to be synthesized." \
-    --res-wav-path result-zh.wav
+    --res-wav-path result.wav
-# English example
+# Example with a pre-defined prompt wav and text
 python3 zipvoice/zipvoice_infer.py \
    --model-name "zipvoice_distill" \
    --prompt-wav assets/prompt-en.wav \
    --prompt-text "Some call me nature, others call me mother nature. I've been here for over four point five billion years, twenty two thousand five hundred times longer than you." \
    --text "Welcome to use our tts model, have fun!" \
-    --res-wav-path result-en.wav
+    --res-wav-path result.wav
 ```
 ### 2. Inference of a list of sentences:
@ -95,13 +94,29 @@ export HF_ENDPOINT=https://hf-mirror.com
 The following steps show how to train a model from scratch on Emilia and LibriTTS datasets, respectively.
 ### 0. Install dependencies for training
 ```bash
 pip install -r ../../requirements.txt
 ```
 ### 1. Data Preparation
 #### 1.1. Prepare the Emilia dataset
 ```bash
 bash scripts/prepare_emilia.sh --stage 0 --stop-stage 4
 ```
 See [scripts/prepare_emilia.sh](scripts/prepare_emilia.sh) for step-by-step instructions.
 #### 1.2 Prepare the LibriTTS dataset
-See [local/prepare_libritts.sh](local/prepare_libritts.sh)
+```bash
 bash scripts/prepare_libritts.sh --stage 0 --stop-stage 3
 ```
 See [scripts/prepare_libritts.sh](scripts/prepare_libritts.sh) for step-by-step instructions.
 ### 2. Training
--- a/egs/zipvoice/assets/prompt-zh.wav
+++ b/egs/zipvoice/assets/prompt-zh.wav
--- a/egs/zipvoice/local/compute_fbank.py
+++ b/egs/zipvoice/local/compute_fbank.py
@ -0,0 +1,288 @@
 #!/usr/bin/env python3
 # Copyright    2025  Xiaomi Corp.        (authors: Wei Kang)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import logging
 import os
 from pathlib import Path
 from typing import Optional
 from concurrent.futures import ProcessPoolExecutor as Pool
 import torch
 from lhotse import (
    CutSet,
    LilcomChunkyWriter,
    load_manifest_lazy,
    set_audio_duration_mismatch_tolerance,
 )
 from feature import TorchAudioFbank, TorchAudioFbankConfig
 import lhotse
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def str2bool(v):
    """Turn a command-line token into a ``bool`` (for use as an argparse ``type``).

    Values that are already ``bool`` are returned untouched. Strings are
    compared case-insensitively:
        - yes, true, t, y, 1 map to True
        - no, false, f, n, 0 map to False

    Raises:
      argparse.ArgumentTypeError: if the string is none of the tokens above.
    """
    if isinstance(v, bool):
        return v
    token = v.lower()
    for accepted, outcome in (
        (("yes", "true", "t", "y", "1"), True),
        (("no", "false", "f", "n", "0"), False),
    ):
        if token in accepted:
            return outcome
    raise argparse.ArgumentTypeError("Boolean value expected.")
 def get_args():
    """Register the options of the fbank script and parse the command line.

    Returns:
      The ``argparse.Namespace`` produced by ``ArgumentParser.parse_args()``.
    """
    # One (flag, keyword-arguments) pair per option, listed in registration
    # order. Options without a ``default`` entry default to None.
    option_table = (
        (
            "--sampling-rate",
            dict(
                type=int,
                default=24000,
                help="The target sampling rate, the audio will be resampled to this sampling_rate.",
            ),
        ),
        ("--frame-shift", dict(type=int, default=256, help="Frame shift in samples")),
        ("--frame-length", dict(type=int, default=1024, help="Frame length in samples")),
        ("--num-mel-bins", dict(type=int, default=100, help="The num of mel filters.")),
        ("--dataset", dict(type=str, help="Dataset name.")),
        ("--subset", dict(type=str, help="The subset of the dataset.")),
        (
            "--source-dir",
            dict(
                type=str,
                default="data/manifests",
                help="The source directory of manifest files.",
            ),
        ),
        (
            "--dest-dir",
            dict(
                type=str,
                default="data/fbank",
                help="The destination directory of manifest files.",
            ),
        ),
        (
            "--split-cuts",
            dict(type=str2bool, default=False, help="Whether to use splited cuts."),
        ),
        ("--split-begin", dict(type=int, help="Start idx of splited cuts.")),
        ("--split-end", dict(type=int, help="End idx of splited cuts.")),
        (
            "--batch-duration",
            dict(
                type=int,
                default=1000,
                help="The batch duration when computing the features.",
            ),
        ),
        (
            "--num-jobs",
            dict(type=int, default=20, help="The number of extractor workers."),
        ),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
 def compute_fbank_split_single(params, idx):
    """Compute and store fbank features for a single split of a cut manifest.

    The split manifest is expected at
    ``{source_dir}/{dataset}_cuts_{subset}.{idx}.jsonl.gz`` where ``idx`` is
    zero-padded to 8 digits. Features are written under ``dest_dir`` and the
    manifest referencing them is saved there under the same file name.

    The function returns without doing anything (after logging) when the
    source directory or the split manifest is missing, or when the output
    manifest already exists.

    Args:
      params:
        Parsed command-line options; the fields read here are source_dir,
        dest_dir, dataset, subset, sampling_rate, num_mel_bins, frame_length,
        frame_shift and batch_duration.
      idx:
        Index of the split to process.
    """
    lhotse.set_audio_duration_mismatch_tolerance(0.1)  # for emilia
    src_dir = Path(params.source_dir)
    output_dir = Path(params.dest_dir)

    if not src_dir.exists():
        logging.error(f"{src_dir} not exists")
        return

    # exist_ok=True already tolerates an existing directory, so no separate
    # existence check is needed before the call.
    output_dir.mkdir(parents=True, exist_ok=True)

    prefix = params.dataset
    subset = params.subset
    suffix = "jsonl.gz"
    num_digits = 8
    idx = f"{idx}".zfill(num_digits)
    cuts_filename = f"{prefix}_cuts_{subset}.{idx}.{suffix}"

    # Settle both skip conditions before loading anything or building the
    # extractor, so that a skipped split costs next to nothing.
    if not (src_dir / cuts_filename).is_file():
        logging.warning(f"Raw {cuts_filename} not exists, skipping")
        return
    if (output_dir / cuts_filename).is_file():
        logging.info(f"{cuts_filename} already exists - skipping.")
        return

    logging.info(f"Loading manifests {src_dir / cuts_filename}")
    cut_set = load_manifest_lazy(src_dir / cuts_filename)
    cut_set = cut_set.resample(params.sampling_rate)

    config = TorchAudioFbankConfig(
        sampling_rate=params.sampling_rate,
        n_mels=params.num_mel_bins,
        n_fft=params.frame_length,
        hop_length=params.frame_shift,
    )
    extractor = TorchAudioFbank(config)

    logging.info(f"Processing {subset}.{idx} of {prefix}")
    cut_set = cut_set.compute_and_store_features_batch(
        extractor=extractor,
        storage_path=f"{output_dir}/{prefix}_feats_{subset}_{idx}",
        num_workers=4,
        batch_duration=params.batch_duration,
        storage_type=LilcomChunkyWriter,
        overwrite=True,
    )
    # The manifest is written last: its presence is what marks the split as
    # done for the skip check above.
    cut_set.to_file(output_dir / cuts_filename)
 def compute_fbank_split(params):
    """Compute fbank features for a range of manifest splits in parallel.

    Every index in ``range(params.split_begin, params.split_end)`` is handed
    to ``compute_fbank_split_single`` in a pool of ``params.num_jobs``
    processes.

    Args:
      params:
        Parsed command-line options; split_begin, split_end and num_jobs are
        read here, the rest is forwarded to the per-split worker.

    Raises:
      ValueError: if split_begin or split_end was not provided.
    """
    if params.split_begin is None or params.split_end is None:
        # Both options default to None; comparing them would otherwise fail
        # with an opaque TypeError.
        raise ValueError(
            "--split-begin and --split-end are required when --split-cuts is true."
        )
    if params.split_end < params.split_begin:
        logging.warning(
            f"Split begin should be smaller than split end, given "
            f"{params.split_begin} -> {params.split_end}."
        )
        # The range would be empty, so there is nothing to schedule.
        return
    with Pool(max_workers=params.num_jobs) as pool:
        futures = [
            pool.submit(compute_fbank_split_single, params, i)
            for i in range(params.split_begin, params.split_end)
        ]
        for f in futures:
            # result() blocks until the worker is finished and re-raises any
            # exception it hit, so failures are not silently lost.
            f.result()
 def compute_fbank(params):
    """Compute and store fbank features for one whole (unsplit) subset.

    The cuts are read from ``{source_dir}/{dataset}_cuts_{subset}.jsonl.gz``
    when that file exists; otherwise they are assembled from the matching
    ``_recordings_`` and ``_supervisions_`` manifests. Features and the
    resulting cut manifest are written to ``dest_dir``. If the output manifest
    already exists the function logs that and returns.

    Args:
      params:
        Parsed command-line options; the fields read here are source_dir,
        dest_dir, dataset, subset, sampling_rate, num_mel_bins, frame_length,
        frame_shift and num_jobs.
    """
    src_dir = Path(params.source_dir)
    output_dir = Path(params.dest_dir)
    prefix = params.dataset
    subset = params.subset
    suffix = "jsonl.gz"
    # Input and output cut manifests share this name; only the directory
    # differs.
    cuts_filename = f"{prefix}_cuts_{subset}.{suffix}"

    # Check for finished work first, so a re-run does not need the source
    # manifests to still be around.
    if (output_dir / cuts_filename).is_file():
        logging.info(f"{prefix} {subset} already exists - skipping.")
        return

    # Same directory handling as the per-split code path.
    output_dir.mkdir(parents=True, exist_ok=True)

    if (src_dir / cuts_filename).is_file():
        logging.info(f"Loading manifests {src_dir / cuts_filename}")
        cut_set = load_manifest_lazy(src_dir / cuts_filename)
    else:
        recordings = load_manifest_lazy(
            src_dir / f"{prefix}_recordings_{subset}.{suffix}"
        )
        supervisions = load_manifest_lazy(
            src_dir / f"{prefix}_supervisions_{subset}.{suffix}"
        )
        cut_set = CutSet.from_manifests(
            recordings=recordings,
            supervisions=supervisions,
        )

    cut_set = cut_set.resample(params.sampling_rate)

    config = TorchAudioFbankConfig(
        sampling_rate=params.sampling_rate,
        n_mels=params.num_mel_bins,
        n_fft=params.frame_length,
        hop_length=params.frame_shift,
    )
    extractor = TorchAudioFbank(config)

    logging.info(f"Processing {subset} of {prefix}")
    cut_set = cut_set.compute_and_store_features(
        extractor=extractor,
        storage_path=f"{output_dir}/{prefix}_feats_{subset}",
        num_jobs=params.num_jobs,
        storage_type=LilcomChunkyWriter,
    )
    cut_set.to_file(output_dir / cuts_filename)
 if __name__ == "__main__":
    # Script entry point: set up logging, parse the options, then run either
    # the per-split or the whole-subset computation.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s",
        level=logging.INFO,
    )
    args = get_args()
    logging.info(vars(args))
    run = compute_fbank_split if args.split_cuts else compute_fbank
    run(params=args)
--- a/egs/zipvoice/local/compute_fbank_libritts.py
+++ b/egs/zipvoice/local/compute_fbank_libritts.py
@ -1,140 +0,0 @@
 #!/usr/bin/env python3
 # Copyright    2021-2023  Xiaomi Corp.        (authors: Fangjun Kuang,
 #                                                       Zengwei Yao,)
 #              2024       The Chinese Univ. of HK  (authors: Zengrui Jin)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 This file computes fbank features of the LibriTTS dataset.
 It looks for manifests in the directory data/manifests.
 The generated fbank features are saved in data/fbank.
 """
 import argparse
 import logging
 import os
 from pathlib import Path
 from typing import Optional
 import torch
 from feature import TorchAudioFbank, TorchAudioFbankConfig
 from lhotse import CutSet, LilcomChunkyWriter
 from lhotse.recipes.utils import read_manifests_if_cached
 from icefall.utils import get_executor
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
 # Do this outside of main() in case it needs to take effect
 # even when we are not invoking the main (e.g. when spawning subprocesses).
 torch.set_num_threads(1)
 torch.set_num_interop_threads(1)
 def get_args():
    """Parse the command-line options of the LibriTTS fbank script.

    Returns:
      An ``argparse.Namespace`` with ``dataset`` (str or None) and
      ``sampling_rate`` (int, 24000 unless overridden).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--dataset",
        help="""Dataset parts to compute fbank. If None, we will use all""",
        type=str,
    )
    cli.add_argument(
        "--sampling-rate",
        help="""Sampling rate of the waveform for computing fbank, 
        the default value for LibriTTS is 24000, waveform files will be 
        resampled if a different sample rate is provided""",
        default=24000,
        type=int,
    )
    parsed = cli.parse_args()
    return parsed
 def compute_fbank_libritts(dataset: Optional[str] = None, sampling_rate: int = 24000):
    """Compute fbank features for LibriTTS partitions.

    Manifests are read from ``data/manifests_libritts``; features and cut
    manifests are written to ``data/fbank_libritts``. A partition whose output
    cut manifest already exists is skipped.

    Args:
      dataset:
        Space-separated partition names. When None, the five partitions
        listed below are processed.
      sampling_rate:
        Sampling rate used for feature extraction. Cuts are resampled only
        when this differs from 24000.
    """
    src_dir = Path("data/manifests_libritts")
    output_dir = Path("data/fbank_libritts")
    # os.cpu_count() may return None when the count cannot be determined.
    num_jobs = min(32, os.cpu_count() or 1)

    prefix = "libritts"
    suffix = "jsonl.gz"
    if dataset is None:
        dataset_parts = (
            "dev-clean",
            "test-clean",
            "train-clean-100",
            "train-clean-360",
            "train-other-500",
        )
    else:
        dataset_parts = dataset.split(" ", -1)

    manifests = read_manifests_if_cached(
        dataset_parts=dataset_parts,
        output_dir=src_dir,
        prefix=prefix,
        suffix=suffix,
    )
    assert manifests is not None
    # Every requested partition must have been found in src_dir.
    assert len(manifests) == len(dataset_parts), (
        len(manifests),
        len(dataset_parts),
        list(manifests.keys()),
        dataset_parts,
    )

    config = TorchAudioFbankConfig(
        sampling_rate=sampling_rate,
        n_mels=100,
        n_fft=1024,
        hop_length=256,
    )
    extractor = TorchAudioFbank(config)

    with get_executor() as ex:  # Initialize the executor only once.
        for partition, m in manifests.items():
            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
            if (output_dir / cuts_filename).is_file():
                logging.info(f"{partition} already exists - skipping.")
                # Move on to the next partition; returning here would drop
                # every partition that comes after a finished one.
                continue
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=m["recordings"],
                supervisions=m["supervisions"],
            )
            if sampling_rate != 24000:
                logging.info(f"Resampling waveforms to {sampling_rate}")
                cut_set = cut_set.resample(sampling_rate)

            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / cuts_filename)
 if __name__ == "__main__":
    # Script entry point: configure logging, then run with the options given
    # on the command line.
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    # Forward the parsed options; without this --dataset and --sampling-rate
    # are accepted by get_args() but have no effect.
    args = get_args()
    logging.info(vars(args))
    compute_fbank_libritts(dataset=args.dataset, sampling_rate=args.sampling_rate)
--- a/egs/zipvoice/local/feature.py
+++ b/egs/zipvoice/local/feature.py
@ -1,135 +0,0 @@
 #!/usr/bin/env python3
 # Copyright         2024  Xiaomi Corp.        (authors: Han Zhu)
 #
 # See ../../../../LICENSE for clarification regarding multiple authors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
 from typing import Union
 import numpy as np
 import torch
 import torch.nn as nn
 import torchaudio
 from lhotse.features.base import FeatureExtractor, register_extractor
 from lhotse.utils import Seconds, compute_num_frames
 class MelSpectrogramFeatures(nn.Module):
    """Log-mel front end wrapping ``torchaudio.transforms.MelSpectrogram``.

    The transform is configured with ``center=True`` and ``power=1``; its
    output is clamped from below at 1e-7 and passed through a natural log.
    """

    def __init__(
        self,
        sampling_rate=24000,
        n_mels=100,
        n_fft=1024,
        hop_length=256,
    ):
        super().__init__()
        mel_options = dict(
            sample_rate=sampling_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            center=True,
            power=1,
        )
        self.mel_spec = torchaudio.transforms.MelSpectrogram(**mel_options)

    def forward(self, inp):
        """Return the clamped log-mel features of a 2-D input tensor."""
        assert len(inp.shape) == 2

        # Clamp before the log so that zero-valued bins do not become -inf.
        return self.mel_spec(inp).clamp(min=1e-7).log()
@dataclass
class TorchAudioFbankConfig:
    """Settings consumed by ``TorchAudioFbank`` (all fields are required)."""

    # Sampling rate, in Hz, that the extractor expects its input audio to have.
    sampling_rate: int
    # Number of mel bins, i.e. the feature dimension.
    n_mels: int
    # FFT size, in samples.
    n_fft: int
    # Hop between successive frames, in samples.
    hop_length: int
@register_extractor
class TorchAudioFbank(FeatureExtractor):
    """lhotse feature extractor that yields log-mel features via torchaudio.

    Registered under the name "TorchAudioFbank" and configured by a
    ``TorchAudioFbankConfig``.
    """

    name = "TorchAudioFbank"
    config_type = TorchAudioFbankConfig

    def __init__(self, config):
        super().__init__(config=config)

    def _feature_fn(self, sample):
        """Run a ``MelSpectrogramFeatures`` built from the config on ``sample``."""
        fbank = MelSpectrogramFeatures(
            sampling_rate=self.config.sampling_rate,
            n_mels=self.config.n_mels,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
        )

        return fbank(sample)

    @property
    def device(self) -> Union[str, torch.device]:
        # TorchAudioFbankConfig declares no ``device`` field, so reading the
        # attribute directly raises AttributeError. Fall back to the CPU
        # unless a config that carries one is supplied.
        return getattr(self.config, "device", "cpu")

    def feature_dim(self, sampling_rate: int) -> int:
        """Return the feature dimension, i.e. the configured number of mel bins."""
        return self.config.n_mels

    def extract(
        self,
        samples: Union[np.ndarray, torch.Tensor],
        sampling_rate: int,
    ) -> Union[np.ndarray, torch.Tensor]:
        """Extract a (num_frames, n_mels) feature matrix from one recording.

        Args:
          samples:
            Audio as a 1-D array/tensor or a 2-D one whose first dimension
            is 1.
          sampling_rate:
            Sampling rate of ``samples``; must equal the configured rate.

        Returns:
          The features, as a numpy array when ``samples`` was not a tensor and
          as a tensor otherwise.
        """
        # Check for sampling rate compatibility.
        expected_sr = self.config.sampling_rate
        assert sampling_rate == expected_sr, (
            f"Mismatched sampling rate: extractor expects {expected_sr}, "
            f"got {sampling_rate}"
        )

        is_numpy = False
        if not isinstance(samples, torch.Tensor):
            samples = torch.from_numpy(samples)
            is_numpy = True

        if len(samples.shape) == 1:
            samples = samples.unsqueeze(0)
        assert samples.ndim == 2, samples.shape
        assert samples.shape[0] == 1, samples.shape

        # Squeeze the leading size-1 axis only: an unqualified squeeze() would
        # also remove a length-1 time axis and break the 2-D layout below.
        mel = self._feature_fn(samples).squeeze(0).t()

        assert mel.ndim == 2, mel.shape
        assert mel.shape[1] == self.config.n_mels, mel.shape

        num_frames = compute_num_frames(
            samples.shape[1] / sampling_rate, self.frame_shift, sampling_rate
        )

        # Trim or pad along the frame axis so that the frame count matches
        # what compute_num_frames returned.
        if mel.shape[0] > num_frames:
            mel = mel[:num_frames]
        elif mel.shape[0] < num_frames:
            missing = num_frames - mel.shape[0]
            mel = mel.unsqueeze(0)
            mel = torch.nn.functional.pad(
                mel, (0, 0, 0, missing), mode="replicate"
            ).squeeze(0)

        if is_numpy:
            return mel.cpu().numpy()
        else:
            return mel

    @property
    def frame_shift(self) -> Seconds:
        # hop_length is in samples; dividing by the rate gives seconds.
        return self.config.hop_length / self.config.sampling_rate
--- a/egs/zipvoice/local/feature.py
+++ b/egs/zipvoice/local/feature.py
@ -0,0 +1 @@
 ../zipvoice/feature.py
--- a/egs/zipvoice/local/prepare_tokens_emilia.py
+++ b/egs/zipvoice/local/prepare_tokens_emilia.py
@ -20,20 +20,26 @@
 """
-This file reads the texts in given manifest and save the new cuts with phoneme tokens.
+This file reads the texts in given manifest and save the cleaned new cuts.
 """
 import argparse
 import glob
 import logging
-import re
+import glob
-from concurrent.futures import ProcessPoolExecutor as Pool
+import os
 from pathlib import Path
 from typing import List
-import jieba
+from lhotse import CutSet, load_manifest_lazy
-from lhotse import load_manifest_lazy
+from concurrent.futures import ProcessPoolExecutor as Pool
-from tokenizer import Tokenizer, is_alphabet, is_chinese, is_hangul, is_japanese
+
 from tokenizer import (
    is_alphabet,
    is_chinese,
    is_hangul,
    is_japanese,
    tokenize_by_CJK_char,
 )
 def get_args():
@ -48,71 +54,32 @@ def get_args():
    parser.add_argument(
        "--jobs",
        type=int,
-        default=50,
+        default=20,
        help="Number of jobs to processing.",
    )
    parser.add_argument(
        "--source-dir",
        type=str,
-        default="data/manifests_emilia/splits",
+        default="data/manifests/splits_raw",
        help="The source directory of manifest files.",
    )
    parser.add_argument(
        "--dest-dir",
        type=str,
        default="data/manifests/splits",
        help="The destination directory of manifest files.",
    )
    return parser.parse_args()
-def tokenize_by_CJK_char(line: str) -> List[str]:
+def preprocess_emilia(file_name: str, input_dir: Path, output_dir: Path):
    """
    Tokenize a line of text with CJK char.
    Note: All return characters will be upper case.
    Example:
      input = "你好世界是 hello world 的中文"
      output = [你, 好, 世, 界, 是, HELLO, WORLD, 的, 中, 文]
    Args:
      line:
        The input text.
    Return:
      A new string tokenize by CJK char.
    """
    # The CJK ranges is from https://github.com/alvations/nltk/blob/79eed6ddea0d0a2c212c1060b477fc268fec4d4b/nltk/tokenize/util.py
    pattern = re.compile(
        r"([\u1100-\u11ff\u2e80-\ua4cf\ua840-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F\uFF65-\uFFDC\U00020000-\U0002FFFF])"
    )
    chars = pattern.split(line.strip().upper())
    char_list = []
    for w in chars:
        if w.strip():
            char_list += w.strip().split()
    return char_list
 def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
    logging.info(f"Processing {file_name}")
    if (output_dir / file_name).is_file():
        logging.info(f"{file_name} exists, skipping.")
        return
    jieba.setLogLevel(logging.INFO)
    tokenizer = Tokenizer()
    def _prepare_cut(cut):
        # Each cut only contains one supervision
        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
        text = cut.supervisions[0].text
        cut.supervisions[0].normalized_text = text
        tokens = tokenizer.texts_to_tokens([text])[0]
        cut.tokens = tokens
        return cut
    def _filter_cut(cut):
        text = cut.supervisions[0].text
@ -124,10 +91,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
        clean_chars = []
        for x in text:
            if is_hangul(x):
-                logging.info(f"Delete cut with text containing Korean : {text}")
+                logging.warning(f"Delete cut with text containing Korean : {text}")
                return False
            if is_japanese(x):
-                logging.info(f"Delete cut with text containing Japanese : {text}")
+                logging.warning(f"Delete cut with text containing Japanese : {text}")
                return False
            if is_chinese(x):
                chinese.append(x)
@ -138,18 +105,19 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
            if x == " ":
                clean_chars.append(x)
        if len(english) + len(chinese) == 0:
-            logging.info(f"Delete cut with text has no valid chars : {text}")
+            logging.warning(f"Delete cut with text has no valid chars : {text}")
            return False
        words = tokenize_by_CJK_char("".join(clean_chars))
        for i in range(len(words) - 10):
            if words[i : i + 10].count(words[i]) == 10:
-                logging.info(f"Delete cut with text with too much repeats : {text}")
+                logging.warning(f"Delete cut with text with too much repeats : {text}")
                return False
        # word speed, 20 - 600 / minute
        if duration < len(words) / 600 * 60 or duration > len(words) / 20 * 60:
-            logging.info(
+            logging.warning(
-                f"Delete cut with audio text mismatch, duration : {duration}s, words : {len(words)}, text : {text}"
+                f"Delete cut with audio text mismatch, duration : {duration}s, "
                f"words : {len(words)}, text : {text}"
            )
            return False
        return True
@ -157,11 +125,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
    try:
        cut_set = load_manifest_lazy(input_dir / file_name)
        cut_set = cut_set.filter(_filter_cut)
        cut_set = cut_set.map(_prepare_cut)
        cut_set.to_file(output_dir / file_name)
    except Exception as e:
        logging.error(f"Manifest {file_name} failed with error: {e}")
-        raise
+        os.remove(str(output_dir / file_name))
 if __name__ == "__main__":
@ -179,14 +146,11 @@ if __name__ == "__main__":
    with Pool(max_workers=args.jobs) as pool:
        futures = [
            pool.submit(
-                prepare_tokens_emilia, filename.split("/")[-1], input_dir, output_dir
+                preprocess_emilia, filename.split("/")[-1], input_dir, output_dir
            )
            for filename in cut_files
        ]
        for f in futures:
-            try:
+            f.result()
-                f.result()
+            f.done()
                f.done()
            except Exception as e:
                logging.error(f"Future failed with error: {e}")
    logging.info("Processing done.")
--- a/egs/zipvoice/local/tokenizer.py
+++ b/egs/zipvoice/local/tokenizer.py
@ -0,0 +1 @@
 ../zipvoice/tokenizer.py
--- a/egs/zipvoice/scripts/prepare.sh
+++ b/egs/zipvoice/scripts/prepare.sh
@ -1,232 +0,0 @@
 #!/usr/bin/env bash
 # Data preparation script: shared setup (environment, defaults, logging helper).
 # The individual stages follow below and are gated on $stage / $stop_stage.
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 # add icefall to PYTHONPATH
 export PYTHONPATH=../../../:$PYTHONPATH
 # Abort on errors, on unset variables and on failures inside pipelines.
 set -eou pipefail
 # Defaults; a stage runs when stage <= N <= stop_stage.
 stage=0
 stop_stage=100
 token_type=bpe  # bpe, letter, phone
 bpe_vocab_size=500
 nj=32
 dl_dir=$PWD/download
 # NOTE(review): presumably lets command-line options override the defaults
 # above -- confirm against shared/parse_options.sh.
 . shared/parse_options.sh || exit 1
 # All files generated by this script are saved in "data".
 # You can safely remove "data" and rerun this script to regenerate it.
 mkdir -p data
 # Print a message prefixed with a timestamp and the caller's file name,
 # line number and function name.
 log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
 }
 log "dl_dir: $dl_dir"
 # Stage -2: fetch the x-vector files and copy them into exp/xvector_nnet_1a/
 # (skipped when the clone directory already exists).
 if [ $stage -le -2 ] && [ $stop_stage -ge -2 ]; then
  if [ ! -d $dl_dir/xvector_nnet_1a_libritts_clean_460 ]; then
    log "Downloading x-vector"
    git clone https://huggingface.co/datasets/zrjin/xvector_nnet_1a_libritts_clean_460 $dl_dir/xvector_nnet_1a_libritts_clean_460
    mkdir -p exp/xvector_nnet_1a/
    cp -r $dl_dir/xvector_nnet_1a_libritts_clean_460/* exp/xvector_nnet_1a/
  fi
 fi
 # Stage -1: build the monotonic_align extension in place unless its build
 # directory is already there.
 if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
  log "Stage -1: build monotonic_align lib"
  if [ ! -d vits/monotonic_align/build ]; then
    cd vits/monotonic_align
    python setup.py build_ext --inplace
    cd ../../
  else
    log "monotonic_align lib already built"
  fi
 fi
 # Stage 0: download LibriTTS into $dl_dir unless it is already present.
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"
  # If you have pre-downloaded it to /path/to/LibriTTS,
  # you can create a symlink
  #
  #   ln -sfv /path/to/LibriTTS $dl_dir/LibriTTS
  #
  if [ ! -d $dl_dir/LibriTTS ]; then
    lhotse download libritts $dl_dir
  fi
 fi
 # Stage 1: create the lhotse manifests; the .libritts.done marker file keeps
 # the step from being repeated on later runs.
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare LibriTTS manifest"
  # We assume that you have downloaded the LibriTTS corpus
  # to $dl_dir/LibriTTS
  mkdir -p data/manifests
  if [ ! -e data/manifests/.libritts.done ]; then
    lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
    touch data/manifests/.libritts.done
  fi
 fi
 # Stage 2: compute fbank features per subset, then build the combined and
 # shuffled training manifests and validate the full one.
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Compute Fbank for LibriTTS"
  mkdir -p data/fbank
  for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
    python local/compute_fbank.py --dataset libritts --subset ${subset}
  done
  # Here we shuffle and combine the train-clean-100, train-clean-360 and
  # train-other-500 together to form the training set.
  if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
    cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
      <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
      <(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
      shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
  fi
  # Same as above, restricted to the two "clean" training subsets.
  if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then
    cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
      <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \
      shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz
  fi
  # Validation is guarded by its own marker file so it runs only once.
  if [ ! -e data/fbank/.libritts-validated.done ]; then
    log "Validating data/fbank for LibriTTS"
    ./local/validate_manifest.py \
      data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
    touch data/fbank/.libritts-validated.done
  fi
 fi
 # Stage 3: produce tokens.txt for the token type selected by $token_type.
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Prepare tokens.txt"
  # Both the bpe and the letter variants start from the exported texts.
  if [ $token_type == "bpe" ] || [ $token_type == "letter" ]; then
    if [ ! -e data/texts.txt ]; then
      ./local/export_normalized_texts.py --output data/texts.txt \
        --manifests data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
    fi
  fi
  if [ $token_type == "bpe" ]; then
    mkdir -p data/lang_bpe_${bpe_vocab_size}
    if [ ! -e data/lang_bpe_${bpe_vocab_size}/tokens.txt ]; then
      ./local/train_bpe_model.py --transcript data/texts.txt \
        --lang-dir data/lang_bpe_${bpe_vocab_size} \
        --vocab-size $bpe_vocab_size
    fi
  fi
  if [ $token_type == "phone" ]; then
    mkdir -p data/lang_phone
    ./local/export_tokens.py --token-type phone \
      --output data/lang_phone/tokens.txt
  fi
  if [ $token_type == "letter" ]; then
    mkdir -p data/lang_letter
    ./local/export_tokens.py --token-type letter  \
      --texts data/texts.txt \
      --output data/lang_letter/tokens.txt
  fi
 fi
 # Stage 4: download LibriSpeech test-clean and the LibriSpeech-PC manifests,
 # then compute features and prepare the test prompts.
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Download and prepare librispeech-pc test clean for testing."
  if [ ! -e $dl_dir/test-clean.tar.gz ]; then
    # The script runs under `set -e`: a bare failing wget would abort it before
    # the mirror is ever tried, so the fallback is chained with `||` instead.
    wget https://huggingface.co/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir || {
      # For China users.
      log "Download from huggingface.co failed, retrying via hf-mirror.com"
      # Remove a partial download so that the retry writes to the expected name.
      rm -f $dl_dir/test-clean.tar.gz
      wget https://hf-mirror.com/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir
    }
  fi
  if [ ! -d $dl_dir/LibriSpeech/test-clean ]; then
    tar -xvf $dl_dir/test-clean.tar.gz -C $dl_dir
  fi
  mkdir -p $dl_dir/LibriSpeech-PC
  if [ ! -e $dl_dir/LibriSpeech-PC/test-clean.json ]; then
    wget https://us.openslr.org/resources/145/manifests.tar.gz -P $dl_dir/LibriSpeech-PC
    tar -xvf $dl_dir/LibriSpeech-PC/manifests.tar.gz -C $dl_dir/LibriSpeech-PC
  fi
  python local/compute_fbank.py --dataset librispeech --subset test-clean
  python local/prepare_prompts_librispeech_test_clean.py
 fi
 # Stage 5: compute spectrograms, build the combined training manifests and
 # validate the full one.
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Compute Spectrogram for LibriTTS (for VITS system)"
  mkdir -p data/spectrogram
  if [ ! -e data/spectrogram/.libritts.done ]; then
    # sampling_rate is never assigned in this script, and `set -u` turns the
    # expansion of an unset variable into a fatal error; default to 24000.
    ./local/compute_spectrogram_libritts.py --sampling-rate ${sampling_rate:-24000}
    touch data/spectrogram/.libritts.done
  fi
  # Here we shuffle and combine the train-clean-100, train-clean-360 and
  # train-other-500 together to form the training set.
  if [ ! -f data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz ]; then
    cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
      <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) \
      <(gunzip -c data/spectrogram/libritts_cuts_train-other-500.jsonl.gz) | \
      shuf | gzip -c > data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
  fi
  # Here we shuffle and combine the train-clean-100, train-clean-360
  # together to form the training set.
  if [ ! -f data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz ]; then
    cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
      <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) | \
      shuf | gzip -c > data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz
  fi
  if [ ! -e data/spectrogram/.libritts-validated.done ]; then
    log "Validating data/spectrogram for LibriTTS"
    ./local/validate_manifest.py \
      data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
    touch data/spectrogram/.libritts-validated.done
  fi
 fi
 # Output directory and partition selection used by stage 6.
 audio_feats_dir=data/tokenized
 # NOTE(review): this value already contains the "--dataset-parts" flag and is
 # passed below as the quoted value of another --dataset-parts -- confirm that
 # the called script expects this form.
 dataset_parts="--dataset-parts all"  # debug "-p dev-clean -p test-clean"
 # Stage 6: tokenize the audio with Encodec, then assemble the train/dev/test
 # cut manifests.
 if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Tokenize/Fbank LibriTTS for valle"
  mkdir -p ${audio_feats_dir}
  if [ ! -e ${audio_feats_dir}/.libritts.tokenize.done ]; then
    python3 ./local/compute_neural_codec_and_prepare_text_tokens.py --dataset-parts "${dataset_parts}" \
        --audio-extractor "Encodec" \
        --batch-duration 400 \
        --src-dir "data/manifests" \
        --output-dir "${audio_feats_dir}"
  fi
  # The marker is touched and the manifests below are rebuilt on every run of
  # this stage, whether or not the tokenization above was skipped.
  touch ${audio_feats_dir}/.libritts.tokenize.done
  lhotse combine \
    ${audio_feats_dir}/libritts_cuts_train-clean-100.jsonl.gz \
    ${audio_feats_dir}/libritts_cuts_train-clean-360.jsonl.gz \
    ${audio_feats_dir}/libritts_cuts_train-other-500.jsonl.gz \
    ${audio_feats_dir}/cuts_train.jsonl.gz
  lhotse copy \
    ${audio_feats_dir}/libritts_cuts_dev-clean.jsonl.gz \
    ${audio_feats_dir}/cuts_dev.jsonl.gz
  lhotse copy \
    ${audio_feats_dir}/libritts_cuts_test-clean.jsonl.gz \
    ${audio_feats_dir}/cuts_test.jsonl.gz
 fi
--- a/egs/zipvoice/scripts/prepare_emilia.sh
+++ b/egs/zipvoice/scripts/prepare_emilia.sh
@ -0,0 +1,126 @@
 #!/usr/bin/env bash
 # Prepare the Emilia corpus (EN and ZH subsets): manifests, cleaned splits,
 # fbank features and the token file, one numbered stage each.
 # The relative paths used below (local/, shared/, data/) are resolved against
 # the current working directory.
 # fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
 export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
 set -eou pipefail
 # Defaults; stages whose number lies in [stage, stop_stage] are executed.
 stage=0
 stop_stage=5
 sampling_rate=24000
 nj=32
 dl_dir=$PWD/download
 # Map command-line flags such as --stage / --stop-stage onto the variables
 # defined above. Without this the flags are silently ignored and every
 # stage runs.
 . shared/parse_options.sh || exit 1
 # All files generated by this script are saved in "data".
 # You can safely remove "data" and rerun this script to regenerate it.
 mkdir -p data
 log() {
  # Timestamped logger taken from espnet: prefixes the message with the
  # caller's file name, line number and function name.
  local src=${BASH_SOURCE[1]##*/}
  local where="${src}:${BASH_LINENO[0]}:${FUNCNAME[1]}"
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${where}) $*"
 }
 log "dl_dir: $dl_dir"
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: Download data"
  # Nothing is downloaded here; this stage only verifies that the corpus has
  # been fetched manually.
  # Your download directory should look like this:
  #
  #    download/Amphion___Emilia
  #    ├── metafile.yaml
  #    ├── raw
  #    │   ├── DE
  #    │   ├── EN
  #    │   ├── FR
  #    │   ├── JA
  #    │   ├── KO
  #    │   ├── openemilia_45batches.tar.gz
  #    │   ├── openemilia_all.tar.gz
  #    │   └── ZH
  #    └── README.md
  if [ ! -d "$dl_dir/Amphion___Emilia/raw" ]; then
    log "Please refer https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset."
    # `exit(-1)` is not valid shell syntax and made the whole block fail to
    # parse; abort with an ordinary non-zero status instead.
    exit 1
  fi
 fi
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Prepare emilia manifests (EN and ZH only)"
  # Expects the Emilia corpus under $dl_dir/Amphion___Emilia, laid out as
  # described in stage 0.
  manifest_marker=data/manifests/.emilia.done
  mkdir -p data/manifests
  if [ ! -e ${manifest_marker} ]; then
    # One lhotse invocation per language, English first.
    for lang in en zh; do
      lhotse prepare emilia --lang ${lang} --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
    done
    touch ${manifest_marker}
  fi
 fi
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Preprocess Emilia dataset, mainly for cleaning"
  # Step 1: cut each language's manifest into chunks of 10000 cuts.
  mkdir -p data/manifests/splits_raw
  # The marker has to be tested in the directory it is written to
  # (splits_raw, not split_raw); otherwise the split is redone on every run.
  if [ ! -e data/manifests/splits_raw/.emilia.split.done ]; then
    lhotse split-lazy data/manifests/emilia_cuts_EN.jsonl.gz data/manifests/splits_raw 10000
    lhotse split-lazy data/manifests/emilia_cuts_ZH.jsonl.gz data/manifests/splits_raw 10000
    touch data/manifests/splits_raw/.emilia.split.done
  fi
  # Step 2: run the cleaning script per language.
  # NOTE(review): presumably it reads splits_raw and writes into
  # data/manifests/splits -- confirm in local/preprocess_emilia.py.
  mkdir -p data/manifests/splits
  if [ ! -e data/manifests/splits/.emilia.preprocess.done ]; then
    python local/preprocess_emilia.py --subset EN
    python local/preprocess_emilia.py --subset ZH
    touch data/manifests/splits/.emilia.preprocess.done
  fi
 fi
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Extract Fbank for Emilia"
  mkdir -p data/fbank/emilia_splits
  if [ ! -e data/fbank/emilia_splits/.emilia.fbank.done ]; then
    # You can speed up the extraction by distributing splits to multiple machines.
    for subset in EN ZH; do
      python local/compute_fbank.py \
        --source-dir data/manifests/splits \
        --dest-dir data/fbank/emilia_splits \
        --dataset emilia \
        --subset ${subset} \
        --splits-cuts 1 \
        --split-begin 0 \
        --split-end 2000 \
        --num-jobs ${nj}
    done
    touch data/fbank/emilia_splits/.emilia.fbank.done
  fi
  # For each language: concatenate the per-split cut manifests, carve the
  # first 1500 cuts off as the dev set, and gzip the remainder as the
  # training manifest. A language is skipped once its .jsonl.gz exists.
  for lang in EN ZH; do
    merged=data/fbank/emilia_cuts_${lang}.jsonl
    if [ ! -e ${merged}.gz ]; then
      log "Combining ${lang} fbank cuts and spliting ${lang} dev set"
      gunzip -c data/fbank/emilia_splits/emilia_cuts_${lang}.*.jsonl.gz > ${merged}
      head -n 1500 ${merged} | gzip -c > data/fbank/emilia_cuts_${lang}_dev.jsonl.gz
      sed -i '1,1500d' ${merged}
      gzip ${merged}
    fi
  done
 fi
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Generate token file"
  token_file=data/tokens_emilia.txt
  # Generate the token file only when it does not exist yet.
  [ -e ${token_file} ] || ./local/prepare_token_file_emilia.py --tokens ${token_file}
 fi
--- a/egs/zipvoice/scripts/prepare_libritts.sh
+++ b/egs/zipvoice/scripts/prepare_libritts.sh
@ -8,7 +8,7 @@ set -eou pipefail
 stage=0
 stop_stage=5
 sampling_rate=24000
-nj=32
+nj=20
 dl_dir=$PWD/download
@ -44,44 +44,53 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  # to $dl_dir/LibriTTS
  mkdir -p data/manifests_libritts
  if [ ! -e data/manifests_libritts/.libritts.done ]; then
-    lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests_libritts
+    lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
-    touch data/manifests_libritts/.libritts.done
+    touch data/manifests/.libritts.done
  fi
 fi
 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Compute Fbank for LibriTTS"
  mkdir -p data/fbank
-  if [ ! -e data/fbank_libritts/.libritts.done ]; then
+
-    ./local/compute_fbank_libritts.py --sampling-rate $sampling_rate
+  if [ ! -e data/fbank/.libritts.done ]; then
-    touch data/fbank_libritts/.libritts.done
+    for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
      python local/compute_fbank.py \
        --source-dir data/manifests \
        --dest-dir data/fbank \
        --dataset libritts \
        --subset ${subset} \
        --sampling-rate $sampling_rate \
        --num-jobs ${nj}
    done
    touch data/fbank/.libritts.done
  fi
  # Here we shuffle and combine the train-clean-100, train-clean-360 and
  # train-other-500 together to form the training set.
-  if [ ! -f data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz ]; then
+  if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
-    cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \
+    cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
-      <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) \
+      <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
-      <(gunzip -c data/fbank_libritts/libritts_cuts_train-other-500.jsonl.gz) | \
+      <(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
-      shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz
+      shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
  fi
-  if [ ! -f data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz ]; then
+  if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then
-    cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \
+    cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
-      <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) | \
+      <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \
-      shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz
+      shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz
  fi
-  if [ ! -e data/fbank_libritts/.libritts-validated.done ]; then
+  if [ ! -e data/fbank/.libritts-validated.done ]; then
    log "Validating data/fbank for LibriTTS"
    ./local/validate_manifest.py \
-      data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz
+      data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
-    touch data/fbank_libritts/.libritts-validated.done
+    touch data/fbank/.libritts-validated.done
  fi
 fi
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 4: Generate token file"
+  log "Stage 3: Generate token file"
  if [ ! -e data/tokens_libritts.txt ]; then
    ./local/prepare_token_file_libritts.py --tokens data/tokens_libritts.txt
  fi
--- a/egs/zipvoice/shared
+++ b/egs/zipvoice/shared
@ -0,0 +1 @@
 ../../icefall/shared
--- a/egs/zipvoice/zipvoice/train_distill.py
+++ b/egs/zipvoice/zipvoice/train_distill.py
@ -31,7 +31,7 @@ python3 zipvoice/train_distill.py \
        --base-lr 0.0005 \
        --max-duration 500 \
        --token-file "data/tokens_emilia.txt" \
-        --manifest-dir "data/fbank_emilia" \
+        --manifest-dir "data/fbank" \
        --teacher-model zipvoice/exp_zipvoice/epoch-11-avg-4.pt \
        --num-updates 60000 \
        --distill-stage "first" \
@ -46,7 +46,7 @@ python3 zipvoice/train_distill.py \
        --base-lr 0.0001 \
        --max-duration 500 \
        --token-file "data/tokens_emilia.txt" \
-        --manifest-dir "data/fbank_emilia" \
+        --manifest-dir "data/fbank" \
        --teacher-model zipvoice/exp_zipvoice_distill_1stage/iter-60000-avg-7.pt \
        --num-updates 2000 \
        --distill-stage "second" \
--- a/egs/zipvoice/zipvoice/train_flow.py
+++ b/egs/zipvoice/zipvoice/train_flow.py
@ -29,7 +29,7 @@ python3 zipvoice/train_flow.py \
        --lr-hours 30000 \
        --lr-batches 7500 \
        --token-file "data/tokens_emilia.txt" \
-        --manifest-dir "data/fbank_emilia" \
+        --manifest-dir "data/fbank" \
        --num-epochs 11 \
        --exp-dir zipvoice/exp_zipvoice
 """
--- a/egs/zipvoice/zipvoice/tts_datamodule.py
+++ b/egs/zipvoice/zipvoice/tts_datamodule.py
@ -347,14 +347,14 @@ class TtsDataModule:
            train-clean-360 and train-other-500 cuts"
        )
        return load_manifest_lazy(
-            self.args.manifest_dir / "libritts_cuts_with_tokens_train-all-shuf.jsonl.gz"
+            self.args.manifest_dir / "libritts_cuts_train-all-shuf.jsonl.gz"
        )
    @lru_cache()
    def dev_libritts_cuts(self) -> CutSet:
        logging.info("About to get dev-clean cuts")
        return load_manifest_lazy(
-            self.args.manifest_dir / "libritts_cuts_with_tokens_dev-clean.jsonl.gz"
+            self.args.manifest_dir / "libritts_cuts_dev-clean.jsonl.gz"
        )