add emilia data preparation pipeline

2025-12-09 14:05:33 +00:00 · 2025-06-17 19:38:46 +08:00 · 2025-06-17 19:38:46 +08:00 · 2376ed2117
commit 2376ed2117
parent 60572c2444
14 changed files with 501 additions and 603 deletions
--- a/egs/zipvoice/README.md
+++ b/egs/zipvoice/README.md
@ -57,21 +57,20 @@ To generate speech with our pre-trained ZipVoice or ZipVoice-Distill models, use

 ### 1. Inference of a single sentence:
 ```bash
-# Chinese example
 python3 zipvoice/zipvoice_infer.py \
    --model-name "zipvoice_distill" \
-    --prompt-wav assets/prompt-zh.wav \
-    --prompt-text "对，这就是我，万人敬仰的太乙真人，虽然有点婴儿肥，但也掩不住我逼人的帅气。" \
-    --text "欢迎使用我们的语音合成模型，希望它能给你带来惊喜！" \
-    --res-wav-path result-zh.wav
+    --prompt-wav prompt.wav \
+    --prompt-text "I am the transcription of the prompt wav." \
+    --text "I am the text to be synthesized." \
+    --res-wav-path result.wav

-# English example
+# Example with a pre-defined prompt wav and text
 python3 zipvoice/zipvoice_infer.py \
    --model-name "zipvoice_distill" \
    --prompt-wav assets/prompt-en.wav \
    --prompt-text "Some call me nature, others call me mother nature. I've been here for over four point five billion years, twenty two thousand five hundred times longer than you." \
    --text "Welcome to use our tts model, have fun!" \
-    --res-wav-path result-en.wav
+    --res-wav-path result.wav
 ```

 ### 2. Inference of a list of sentences:
@ -95,13 +94,29 @@ export HF_ENDPOINT=https://hf-mirror.com

 The following steps show how to train a model from scratch on Emilia and LibriTTS datasets, respectively.

+### 0. Install dependencies for training
+
+```bash
+pip install -r ../../requirements.txt
+```
+
 ### 1. Data Preparation

 #### 1.1. Prepare the Emilia dataset

+```bash
+bash scripts/prepare_emilia.sh --stage 0 --stop-stage 4
+```
+
+See [scripts/prepare_emilia.sh](scripts/prepare_emilia.sh) for step by step instructions.
+
 #### 1.2 Prepare the LibriTTS dataset

-See [local/prepare_libritts.sh](local/prepare_libritts.sh)
+```bash
+bash scripts/prepare_libritts.sh --stage 0 --stop-stage 3
+```
+
+See [scripts/prepare_libritts.sh](scripts/prepare_libritts.sh) for step by step instructions.

 ### 2. Training

--- a/egs/zipvoice/assets/prompt-zh.wav
+++ b/egs/zipvoice/assets/prompt-zh.wav
--- a/egs/zipvoice/local/compute_fbank.py
+++ b/egs/zipvoice/local/compute_fbank.py
@ -0,0 +1,288 @@
+#!/usr/bin/env python3
+# Copyright    2025  Xiaomi Corp.        (authors: Wei Kang)
+#
+# See ../../../../LICENSE for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import logging
+import os
+from pathlib import Path
+from typing import Optional
+from concurrent.futures import ProcessPoolExecutor as Pool
+
+import torch
+from lhotse import (
+    CutSet,
+    LilcomChunkyWriter,
+    load_manifest_lazy,
+    set_audio_duration_mismatch_tolerance,
+)
+
+from feature import TorchAudioFbank, TorchAudioFbankConfig
+import lhotse
+
+# Torch's multithreaded behavior needs to be disabled or
+# it wastes a lot of CPU and slow things down.
+# Do this outside of main() in case it needs to take effect
+# even when we are not invoking the main (e.g. when spawning subprocesses).
+torch.set_num_threads(1)
+torch.set_num_interop_threads(1)
+
+
+def str2bool(v):
+    """Used in argparse.ArgumentParser.add_argument to indicate
+    that a type is a bool type and user can enter
+
+        - yes, true, t, y, 1, to represent True
+        - no, false, f, n, 0, to represent False
+
+    See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse  # noqa
+    """
+    if isinstance(v, bool):
+        return v
+    if v.lower() in ("yes", "true", "t", "y", "1"):
+        return True
+    elif v.lower() in ("no", "false", "f", "n", "0"):
+        return False
+    else:
+        raise argparse.ArgumentTypeError("Boolean value expected.")
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "--sampling-rate",
+        type=int,
+        default=24000,
+        help="The target sampling rate, the audio will be resampled to this sampling_rate.",
+    )
+
+    parser.add_argument(
+        "--frame-shift",
+        type=int,
+        default=256,
+        help="Frame shift in samples",
+    )
+
+    parser.add_argument(
+        "--frame-length",
+        type=int,
+        default=1024,
+        help="Frame length in samples",
+    )
+
+    parser.add_argument(
+        "--num-mel-bins",
+        type=int,
+        default=100,
+        help="The num of mel filters.",
+    )
+
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        help="Dataset name.",
+    )
+
+    parser.add_argument(
+        "--subset",
+        type=str,
+        help="The subset of the dataset.",
+    )
+
+    parser.add_argument(
+        "--source-dir",
+        type=str,
+        default="data/manifests",
+        help="The source directory of manifest files.",
+    )
+
+    parser.add_argument(
+        "--dest-dir",
+        type=str,
+        default="data/fbank",
+        help="The destination directory of manifest files.",
+    )
+
+    parser.add_argument(
+        "--split-cuts",
+        type=str2bool,
+        default=False,
+        help="Whether to use splited cuts.",
+    )
+
+    parser.add_argument(
+        "--split-begin",
+        type=int,
+        help="Start idx of splited cuts.",
+    )
+
+    parser.add_argument(
+        "--split-end",
+        type=int,
+        help="End idx of splited cuts.",
+    )
+
+    parser.add_argument(
+        "--batch-duration",
+        type=int,
+        default=1000,
+        help="The batch duration when computing the features.",
+    )
+
+    parser.add_argument(
+        "--num-jobs", type=int, default=20, help="The number of extractor workers."
+    )
+
+    return parser.parse_args()
+
+
+def compute_fbank_split_single(params, idx):
+    lhotse.set_audio_duration_mismatch_tolerance(0.1)  # for emilia
+    src_dir = Path(params.source_dir)
+    output_dir = Path(params.dest_dir)
+    num_mel_bins = params.num_mel_bins
+
+    if not src_dir.exists():
+        logging.error(f"{src_dir} not exists")
+        return
+
+    if not output_dir.exists():
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    num_digits = 8
+
+    config = TorchAudioFbankConfig(
+        sampling_rate=params.sampling_rate,
+        n_mels=params.num_mel_bins,
+        n_fft=params.frame_length,
+        hop_length=params.frame_shift,
+    )
+    extractor = TorchAudioFbank(config)
+
+    prefix = params.dataset
+    subset = params.subset
+    suffix = "jsonl.gz"
+
+    idx = f"{idx}".zfill(num_digits)
+    cuts_filename = f"{prefix}_cuts_{subset}.{idx}.{suffix}"
+
+    if (src_dir / cuts_filename).is_file():
+        logging.info(f"Loading manifests {src_dir / cuts_filename}")
+        cut_set = load_manifest_lazy(src_dir / cuts_filename)
+    else:
+        logging.warning(f"Raw {cuts_filename} not exists, skipping")
+        return
+
+    cut_set = cut_set.resample(params.sampling_rate)
+
+    if (output_dir / cuts_filename).is_file():
+        logging.info(f"{cuts_filename} already exists - skipping.")
+        return
+
+    logging.info(f"Processing {subset}.{idx} of {prefix}")
+
+    cut_set = cut_set.compute_and_store_features_batch(
+        extractor=extractor,
+        storage_path=f"{output_dir}/{prefix}_feats_{subset}_{idx}",
+        num_workers=4,
+        batch_duration=params.batch_duration,
+        storage_type=LilcomChunkyWriter,
+        overwrite=True,
+    )
+    cut_set.to_file(output_dir / cuts_filename)
+
+
+def compute_fbank_split(params):
+    if params.split_end < params.split_begin:
+        logging.warning(
+            f"Split begin should be smaller than split end, given "
+            f"{params.split_begin} -> {params.split_end}."
+        )
+
+    with Pool(max_workers=params.num_jobs) as pool:
+        futures = [
+            pool.submit(compute_fbank_split_single, params, i)
+            for i in range(params.split_begin, params.split_end)
+        ]
+        for f in futures:
+            f.result()
+            f.done()
+
+
+def compute_fbank(params):
+    src_dir = Path(params.source_dir)
+    output_dir = Path(params.dest_dir)
+    num_jobs = params.num_jobs
+    num_mel_bins = params.num_mel_bins
+
+    prefix = params.dataset
+    subset = params.subset
+    suffix = "jsonl.gz"
+
+    cut_set_name = f"{prefix}_cuts_{subset}.{suffix}"
+
+    if (src_dir / cut_set_name).is_file():
+        logging.info(f"Loading manifests {src_dir / cut_set_name}")
+        cut_set = load_manifest_lazy(src_dir / cut_set_name)
+    else:
+        recordings = load_manifest_lazy(
+            src_dir / f"{prefix}_recordings_{subset}.{suffix}"
+        )
+        supervisions = load_manifest_lazy(
+            src_dir / f"{prefix}_supervisions_{subset}.{suffix}"
+        )
+        cut_set = CutSet.from_manifests(
+            recordings=recordings,
+            supervisions=supervisions,
+        )
+
+    cut_set = cut_set.resample(params.sampling_rate)
+
+    config = TorchAudioFbankConfig(
+        sampling_rate=params.sampling_rate,
+        n_mels=params.num_mel_bins,
+        n_fft=params.frame_length,
+        hop_length=params.frame_shift,
+    )
+    extractor = TorchAudioFbank(config)
+
+    cuts_filename = f"{prefix}_cuts_{subset}.{suffix}"
+    if (output_dir / cuts_filename).is_file():
+        logging.info(f"{prefix} {subset} already exists - skipping.")
+        return
+    logging.info(f"Processing {subset} of {prefix}")
+
+    cut_set = cut_set.compute_and_store_features(
+        extractor=extractor,
+        storage_path=f"{output_dir}/{prefix}_feats_{subset}",
+        num_jobs=num_jobs,
+        storage_type=LilcomChunkyWriter,
+    )
+    cut_set.to_file(output_dir / cuts_filename)
+
+
+if __name__ == "__main__":
+    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
+
+    logging.basicConfig(format=formatter, level=logging.INFO)
+    args = get_args()
+    logging.info(vars(args))
+    if args.split_cuts:
+        compute_fbank_split(params=args)
+    else:
+        compute_fbank(params=args)
--- a/egs/zipvoice/local/compute_fbank_libritts.py
+++ b/egs/zipvoice/local/compute_fbank_libritts.py
@ -1,140 +0,0 @@
-#!/usr/bin/env python3
-# Copyright    2021-2023  Xiaomi Corp.        (authors: Fangjun Kuang,
-#                                                       Zengwei Yao,)
-#              2024       The Chinese Univ. of HK  (authors: Zengrui Jin)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-"""
-This file computes fbank features of the LibriTTS dataset.
-It looks for manifests in the directory data/manifests.
-
-The generated fbank features are saved in data/fbank.
-"""
-
-import argparse
-import logging
-import os
-from pathlib import Path
-from typing import Optional
-
-import torch
-from feature import TorchAudioFbank, TorchAudioFbankConfig
-from lhotse import CutSet, LilcomChunkyWriter
-from lhotse.recipes.utils import read_manifests_if_cached
-
-from icefall.utils import get_executor
-
-# Torch's multithreaded behavior needs to be disabled or
-# it wastes a lot of CPU and slow things down.
-# Do this outside of main() in case it needs to take effect
-# even when we are not invoking the main (e.g. when spawning subprocesses).
-torch.set_num_threads(1)
-torch.set_num_interop_threads(1)
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        help="""Dataset parts to compute fbank. If None, we will use all""",
-    )
-    parser.add_argument(
-        "--sampling-rate",
-        type=int,
-        default=24000,
-        help="""Sampling rate of the waveform for computing fbank, 
-        the default value for LibriTTS is 24000, waveform files will be 
-        resampled if a different sample rate is provided""",
-    )
-
-    return parser.parse_args()
-
-
-def compute_fbank_libritts(dataset: Optional[str] = None, sampling_rate: int = 24000):
-    src_dir = Path("data/manifests_libritts")
-    output_dir = Path("data/fbank_libritts")
-    num_jobs = min(32, os.cpu_count())
-
-    prefix = "libritts"
-    suffix = "jsonl.gz"
-    if dataset is None:
-        dataset_parts = (
-            "dev-clean",
-            "test-clean",
-            "train-clean-100",
-            "train-clean-360",
-            "train-other-500",
-        )
-    else:
-        dataset_parts = dataset.split(" ", -1)
-
-    manifests = read_manifests_if_cached(
-        dataset_parts=dataset_parts,
-        output_dir=src_dir,
-        prefix=prefix,
-        suffix=suffix,
-    )
-    assert manifests is not None
-
-    assert len(manifests) == len(dataset_parts), (
-        len(manifests),
-        len(dataset_parts),
-        list(manifests.keys()),
-        dataset_parts,
-    )
-
-    config = TorchAudioFbankConfig(
-        sampling_rate=sampling_rate,
-        n_mels=100,
-        n_fft=1024,
-        hop_length=256,
-    )
-    extractor = TorchAudioFbank(config)
-
-    with get_executor() as ex:  # Initialize the executor only once.
-        for partition, m in manifests.items():
-            cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
-            if (output_dir / cuts_filename).is_file():
-                logging.info(f"{partition} already exists - skipping.")
-                return
-            logging.info(f"Processing {partition}")
-            cut_set = CutSet.from_manifests(
-                recordings=m["recordings"],
-                supervisions=m["supervisions"],
-            )
-            if sampling_rate != 24000:
-                logging.info(f"Resampling waveforms to {sampling_rate}")
-                cut_set = cut_set.resample(sampling_rate)
-
-            cut_set = cut_set.compute_and_store_features(
-                extractor=extractor,
-                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
-                # when an executor is specified, make more partitions
-                num_jobs=num_jobs if ex is None else 80,
-                executor=ex,
-                storage_type=LilcomChunkyWriter,
-            )
-            cut_set.to_file(output_dir / cuts_filename)
-
-
-if __name__ == "__main__":
-    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
-
-    logging.basicConfig(format=formatter, level=logging.INFO)
-    compute_fbank_libritts()
--- a/egs/zipvoice/local/feature.py
+++ b/egs/zipvoice/local/feature.py
@ -1,135 +0,0 @@
-#!/usr/bin/env python3
-# Copyright         2024  Xiaomi Corp.        (authors: Han Zhu)
-#
-# See ../../../../LICENSE for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass
-from typing import Union
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torchaudio
-from lhotse.features.base import FeatureExtractor, register_extractor
-from lhotse.utils import Seconds, compute_num_frames
-
-
-class MelSpectrogramFeatures(nn.Module):
-    def __init__(
-        self,
-        sampling_rate=24000,
-        n_mels=100,
-        n_fft=1024,
-        hop_length=256,
-    ):
-        super().__init__()
-
-        self.mel_spec = torchaudio.transforms.MelSpectrogram(
-            sample_rate=sampling_rate,
-            n_fft=n_fft,
-            hop_length=hop_length,
-            n_mels=n_mels,
-            center=True,
-            power=1,
-        )
-
-    def forward(self, inp):
-        assert len(inp.shape) == 2
-
-        mel = self.mel_spec(inp)
-        logmel = mel.clamp(min=1e-7).log()
-        return logmel
-
-
-@dataclass
-class TorchAudioFbankConfig:
-    sampling_rate: int
-    n_mels: int
-    n_fft: int
-    hop_length: int
-
-
-@register_extractor
-class TorchAudioFbank(FeatureExtractor):
-
-    name = "TorchAudioFbank"
-    config_type = TorchAudioFbankConfig
-
-    def __init__(self, config):
-        super().__init__(config=config)
-
-    def _feature_fn(self, sample):
-        fbank = MelSpectrogramFeatures(
-            sampling_rate=self.config.sampling_rate,
-            n_mels=self.config.n_mels,
-            n_fft=self.config.n_fft,
-            hop_length=self.config.hop_length,
-        )
-
-        return fbank(sample)
-
-    @property
-    def device(self) -> Union[str, torch.device]:
-        return self.config.device
-
-    def feature_dim(self, sampling_rate: int) -> int:
-        return self.config.n_mels
-
-    def extract(
-        self,
-        samples: Union[np.ndarray, torch.Tensor],
-        sampling_rate: int,
-    ) -> Union[np.ndarray, torch.Tensor]:
-        # Check for sampling rate compatibility.
-        expected_sr = self.config.sampling_rate
-        assert sampling_rate == expected_sr, (
-            f"Mismatched sampling rate: extractor expects {expected_sr}, "
-            f"got {sampling_rate}"
-        )
-        is_numpy = False
-        if not isinstance(samples, torch.Tensor):
-            samples = torch.from_numpy(samples)
-            is_numpy = True
-
-        if len(samples.shape) == 1:
-            samples = samples.unsqueeze(0)
-        assert samples.ndim == 2, samples.shape
-        assert samples.shape[0] == 1, samples.shape
-
-        mel = self._feature_fn(samples).squeeze().t()
-
-        assert mel.ndim == 2, mel.shape
-        assert mel.shape[1] == self.config.n_mels, mel.shape
-
-        num_frames = compute_num_frames(
-            samples.shape[1] / sampling_rate, self.frame_shift, sampling_rate
-        )
-
-        if mel.shape[0] > num_frames:
-            mel = mel[:num_frames]
-        elif mel.shape[0] < num_frames:
-            mel = mel.unsqueeze(0)
-            mel = torch.nn.functional.pad(
-                mel, (0, 0, 0, num_frames - mel.shape[1]), mode="replicate"
-            ).squeeze(0)
-
-        if is_numpy:
-            return mel.cpu().numpy()
-        else:
-            return mel
-
-    @property
-    def frame_shift(self) -> Seconds:
-        return self.config.hop_length / self.config.sampling_rate
--- a/egs/zipvoice/local/feature.py
+++ b/egs/zipvoice/local/feature.py
@ -0,0 +1 @@
+../zipvoice/feature.py
--- a/egs/zipvoice/local/prepare_tokens_emilia.py
+++ b/egs/zipvoice/local/prepare_tokens_emilia.py
@ -20,20 +20,26 @@


 """
-This file reads the texts in given manifest and save the new cuts with phoneme tokens.
+This file reads the texts in given manifest and save the cleaned new cuts.
 """

 import argparse
-import glob
 import logging
-import re
-from concurrent.futures import ProcessPoolExecutor as Pool
+import glob
+import os
 from pathlib import Path
 from typing import List

-import jieba
-from lhotse import load_manifest_lazy
-from tokenizer import Tokenizer, is_alphabet, is_chinese, is_hangul, is_japanese
+from lhotse import CutSet, load_manifest_lazy
+from concurrent.futures import ProcessPoolExecutor as Pool
+
+from tokenizer import (
+    is_alphabet,
+    is_chinese,
+    is_hangul,
+    is_japanese,
+    tokenize_by_CJK_char,
+)


 def get_args():
@ -48,71 +54,32 @@ def get_args():
    parser.add_argument(
        "--jobs",
        type=int,
-        default=50,
+        default=20,
        help="Number of jobs to processing.",
    )

    parser.add_argument(
        "--source-dir",
        type=str,
-        default="data/manifests_emilia/splits",
+        default="data/manifests/splits_raw",
        help="The source directory of manifest files.",
    )

    parser.add_argument(
        "--dest-dir",
        type=str,
+        default="data/manifests/splits",
        help="The destination directory of manifest files.",
    )

    return parser.parse_args()


-def tokenize_by_CJK_char(line: str) -> List[str]:
-    """
-    Tokenize a line of text with CJK char.
-
-    Note: All return characters will be upper case.
-
-    Example:
-      input = "你好世界是 hello world 的中文"
-      output = [你, 好, 世, 界, 是, HELLO, WORLD, 的, 中, 文]
-
-    Args:
-      line:
-        The input text.
-
-    Return:
-      A new string tokenize by CJK char.
-    """
-    # The CJK ranges is from https://github.com/alvations/nltk/blob/79eed6ddea0d0a2c212c1060b477fc268fec4d4b/nltk/tokenize/util.py
-    pattern = re.compile(
-        r"([\u1100-\u11ff\u2e80-\ua4cf\ua840-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F\uFF65-\uFFDC\U00020000-\U0002FFFF])"
-    )
-    chars = pattern.split(line.strip().upper())
-    char_list = []
-    for w in chars:
-        if w.strip():
-            char_list += w.strip().split()
-    return char_list
-
-
-def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
+def preprocess_emilia(file_name: str, input_dir: Path, output_dir: Path):
    logging.info(f"Processing {file_name}")
    if (output_dir / file_name).is_file():
        logging.info(f"{file_name} exists, skipping.")
        return
-    jieba.setLogLevel(logging.INFO)
-    tokenizer = Tokenizer()
-
-    def _prepare_cut(cut):
-        # Each cut only contains one supervision
-        assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
-        text = cut.supervisions[0].text
-        cut.supervisions[0].normalized_text = text
-        tokens = tokenizer.texts_to_tokens([text])[0]
-        cut.tokens = tokens
-        return cut

    def _filter_cut(cut):
        text = cut.supervisions[0].text
@ -124,10 +91,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
        clean_chars = []
        for x in text:
            if is_hangul(x):
-                logging.info(f"Delete cut with text containing Korean : {text}")
+                logging.warning(f"Delete cut with text containing Korean : {text}")
                return False
            if is_japanese(x):
-                logging.info(f"Delete cut with text containing Japanese : {text}")
+                logging.warning(f"Delete cut with text containing Japanese : {text}")
                return False
            if is_chinese(x):
                chinese.append(x)
@ -138,18 +105,19 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
            if x == " ":
                clean_chars.append(x)
        if len(english) + len(chinese) == 0:
-            logging.info(f"Delete cut with text has no valid chars : {text}")
+            logging.warning(f"Delete cut with text has no valid chars : {text}")
            return False

        words = tokenize_by_CJK_char("".join(clean_chars))
        for i in range(len(words) - 10):
            if words[i : i + 10].count(words[i]) == 10:
-                logging.info(f"Delete cut with text with too much repeats : {text}")
+                logging.warning(f"Delete cut with text with too much repeats : {text}")
                return False
        # word speed, 20 - 600 / minute
        if duration < len(words) / 600 * 60 or duration > len(words) / 20 * 60:
-            logging.info(
-                f"Delete cut with audio text mismatch, duration : {duration}s, words : {len(words)}, text : {text}"
+            logging.warning(
+                f"Delete cut with audio text mismatch, duration : {duration}s, "
+                f"words : {len(words)}, text : {text}"
            )
            return False
        return True
@ -157,11 +125,10 @@ def prepare_tokens_emilia(file_name: str, input_dir: Path, output_dir: Path):
    try:
        cut_set = load_manifest_lazy(input_dir / file_name)
        cut_set = cut_set.filter(_filter_cut)
-        cut_set = cut_set.map(_prepare_cut)
        cut_set.to_file(output_dir / file_name)
    except Exception as e:
        logging.error(f"Manifest {file_name} failed with error: {e}")
-        raise
+        os.remove(str(output_dir / file_name))


 if __name__ == "__main__":
@ -179,14 +146,11 @@ if __name__ == "__main__":
    with Pool(max_workers=args.jobs) as pool:
        futures = [
            pool.submit(
-                prepare_tokens_emilia, filename.split("/")[-1], input_dir, output_dir
+                preprocess_emilia, filename.split("/")[-1], input_dir, output_dir
            )
            for filename in cut_files
        ]
        for f in futures:
-            try:
-                f.result()
-                f.done()
-            except Exception as e:
-                logging.error(f"Future failed with error: {e}")
+            f.result()
+            f.done()
    logging.info("Processing done.")
--- a/egs/zipvoice/local/tokenizer.py
+++ b/egs/zipvoice/local/tokenizer.py
@ -0,0 +1 @@
+../zipvoice/tokenizer.py
--- a/egs/zipvoice/scripts/prepare.sh
+++ b/egs/zipvoice/scripts/prepare.sh
@ -1,232 +0,0 @@
-#!/usr/bin/env bash
-
-# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
-export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
-
-# add icefall to PYTHONPATH
-export PYTHONPATH=../../../:$PYTHONPATH
-
-set -eou pipefail
-
-stage=0
-stop_stage=100
-
-token_type=bpe  # bpe, letter, phone
-bpe_vocab_size=500
-
-nj=32
-
-dl_dir=$PWD/download
-
-. shared/parse_options.sh || exit 1
-
-# All files generated by this script are saved in "data".
-# You can safely remove "data" and rerun this script to regenerate it.
-mkdir -p data
-
-log() {
-  # This function is from espnet
-  local fname=${BASH_SOURCE[1]##*/}
-  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
-}
-
-log "dl_dir: $dl_dir"
-
-
-if [ $stage -le -2 ] && [ $stop_stage -ge -2 ]; then
-
-  if [ ! -d $dl_dir/xvector_nnet_1a_libritts_clean_460 ]; then
-    log "Downloading x-vector"
-
-    git clone https://huggingface.co/datasets/zrjin/xvector_nnet_1a_libritts_clean_460 $dl_dir/xvector_nnet_1a_libritts_clean_460
-
-    mkdir -p exp/xvector_nnet_1a/
-    cp -r $dl_dir/xvector_nnet_1a_libritts_clean_460/* exp/xvector_nnet_1a/
-  fi
-
-fi
-
-if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
-  log "Stage -1: build monotonic_align lib"
-  if [ ! -d vits/monotonic_align/build ]; then
-    cd vits/monotonic_align
-    python setup.py build_ext --inplace
-    cd ../../
-  else
-    log "monotonic_align lib already built"
-  fi
-fi
-
-if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
-  log "Stage 0: Download data"
-
-  # If you have pre-downloaded it to /path/to/LibriTTS,
-  # you can create a symlink
-  #
-  #   ln -sfv /path/to/LibriTTS $dl_dir/LibriTTS
-  #
-  if [ ! -d $dl_dir/LibriTTS ]; then
-    lhotse download libritts $dl_dir
-  fi
-
-fi
-
-if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
-  log "Stage 1: Prepare LibriTTS manifest"
-  # We assume that you have downloaded the LibriTTS corpus
-  # to $dl_dir/LibriTTS
-  mkdir -p data/manifests
-  if [ ! -e data/manifests/.libritts.done ]; then
-    lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
-    touch data/manifests/.libritts.done
-  fi
-fi
-
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
-  log "Stage 2: Compute Fbank for LibriTTS"
-  mkdir -p data/fbank
-
-  for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
-    python local/compute_fbank.py --dataset libritts --subset ${subset}
-  done
-
-  # Here we shuffle and combine the train-clean-100, train-clean-360 and
-  # train-other-500 together to form the training set.
-  if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
-    cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
-      <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
-      <(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
-      shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
-  fi
-
-  if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then
-    cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
-      <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \
-      shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz
-  fi
-
-  if [ ! -e data/fbank/.libritts-validated.done ]; then
-    log "Validating data/fbank for LibriTTS"
-    ./local/validate_manifest.py \
-      data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
-    touch data/fbank/.libritts-validated.done
-  fi
-fi
-
-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 3: Prepare tokens.txt"
-
-  if [ $token_type == "bpe" ] || [ $token_type == "letter" ]; then
-    if [ ! -e data/texts.txt ]; then
-      ./local/export_normalized_texts.py --output data/texts.txt \
-        --manifests data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
-    fi
-  fi
-
-  if [ $token_type == "bpe" ]; then
-    mkdir -p data/lang_bpe_${bpe_vocab_size}
-    if [ ! -e data/lang_bpe_${bpe_vocab_size}/tokens.txt ]; then
-      ./local/train_bpe_model.py --transcript data/texts.txt \
-        --lang-dir data/lang_bpe_${bpe_vocab_size} \
-        --vocab-size $bpe_vocab_size
-    fi
-  fi
-
-  if [ $token_type == "phone" ]; then
-    mkdir -p data/lang_phone
-    ./local/export_tokens.py --token-type phone \
-      --output data/lang_phone/tokens.txt
-  fi
-
-  if [ $token_type == "letter" ]; then
-    mkdir -p data/lang_letter
-    ./local/export_tokens.py --token-type letter  \
-      --texts data/texts.txt \
-      --output data/lang_letter/tokens.txt
-  fi
-fi
-
-if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Download and prepare librispeech-pc test clean for testing."
-
-  if [ ! -e $dl_dir/test-clean.tar.gz ]; then
-    wget https://huggingface.co/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir
-  fi
-  # For China users.
-  if [ ! -e $dl_dir/test-clean.tar.gz ]; then
-    wget https://hf-mirror.com/datasets/k2-fsa/LibriSpeech/resolve/main/test-clean.tar.gz -P $dl_dir
-  fi
-
-  if [ ! -d $dl_dir/LibriSpeech/test-clean ]; then
-    tar -xvf $dl_dir/test-clean.tar.gz -C $dl_dir
-  fi
-
-  mkdir -p $dl_dir/LibriSpeech-PC
-  if [ ! -e $dl_dir/LibriSpeech-PC/test-clean.json ]; then
-    wget https://us.openslr.org/resources/145/manifests.tar.gz -P $dl_dir/LibriSpeech-PC
-    tar -xvf $dl_dir/LibriSpeech-PC/manifests.tar.gz -C $dl_dir/LibriSpeech-PC
-  fi
-
-  python local/compute_fbank.py --dataset librispeech --subset test-clean
-  python local/prepare_prompts_librispeech_test_clean.py
-fi
-
-if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
-  log "Stage 5: Compute Spectrogram for LibriTTS (for VITS system)"
-  mkdir -p data/spectrogram
-  if [ ! -e data/spectrogram/.libritts.done ]; then
-    ./local/compute_spectrogram_libritts.py --sampling-rate $sampling_rate
-    touch data/spectrogram/.libritts.done
-  fi
-
-  # Here we shuffle and combine the train-clean-100, train-clean-360 and
-  # train-other-500 together to form the training set.
-  if [ ! -f data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz ]; then
-    cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
-      <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) \
-      <(gunzip -c data/spectrogram/libritts_cuts_train-other-500.jsonl.gz) | \
-      shuf | gzip -c > data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
-  fi
-
-  # Here we shuffle and combine the train-clean-100, train-clean-360
-  # together to form the training set.
-  if [ ! -f data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz ]; then
-    cat <(gunzip -c data/spectrogram/libritts_cuts_train-clean-100.jsonl.gz) \
-      <(gunzip -c data/spectrogram/libritts_cuts_train-clean-360.jsonl.gz) | \
-      shuf | gzip -c > data/spectrogram/libritts_cuts_train-clean-460.jsonl.gz
-  fi
-
-  if [ ! -e data/spectrogram/.libritts-validated.done ]; then
-    log "Validating data/spectrogram for LibriTTS"
-    ./local/validate_manifest.py \
-      data/spectrogram/libritts_cuts_train-all-shuf.jsonl.gz
-    touch data/spectrogram/.libritts-validated.done
-  fi
-fi
-
-audio_feats_dir=data/tokenized
-dataset_parts="--dataset-parts all"  # debug "-p dev-clean -p test-clean"
-if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
-  log "Stage 6: Tokenize/Fbank LibriTTS for valle"
-  mkdir -p ${audio_feats_dir}
-  if [ ! -e ${audio_feats_dir}/.libritts.tokenize.done ]; then
-    python3 ./local/compute_neural_codec_and_prepare_text_tokens.py --dataset-parts "${dataset_parts}" \
-        --audio-extractor "Encodec" \
-        --batch-duration 400 \
-        --src-dir "data/manifests" \
-        --output-dir "${audio_feats_dir}"
-  fi
-  touch ${audio_feats_dir}/.libritts.tokenize.done
-
-  lhotse combine \
-    ${audio_feats_dir}/libritts_cuts_train-clean-100.jsonl.gz \
-    ${audio_feats_dir}/libritts_cuts_train-clean-360.jsonl.gz \
-    ${audio_feats_dir}/libritts_cuts_train-other-500.jsonl.gz \
-    ${audio_feats_dir}/cuts_train.jsonl.gz
-  lhotse copy \
-    ${audio_feats_dir}/libritts_cuts_dev-clean.jsonl.gz \
-    ${audio_feats_dir}/cuts_dev.jsonl.gz
-  lhotse copy \
-    ${audio_feats_dir}/libritts_cuts_test-clean.jsonl.gz \
-    ${audio_feats_dir}/cuts_test.jsonl.gz
-fi
--- a/egs/zipvoice/scripts/prepare_emilia.sh
+++ b/egs/zipvoice/scripts/prepare_emilia.sh
@ -0,0 +1,126 @@
+#!/usr/bin/env bash
+
+# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+
+set -eou pipefail
+
+stage=0
+stop_stage=5
+sampling_rate=24000
+nj=32
+
+dl_dir=$PWD/download
+
+# All files generated by this script are saved in "data".
+# You can safely remove "data" and rerun this script to regenerate it.
+mkdir -p data
+
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+log "dl_dir: $dl_dir"
+
+if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
+  log "Stage 0: Download data"
+
+  # Your download directory should look like this:
+  #
+  #    download/Amphion___Emilia
+  #    ├── metafile.yaml
+  #    ├── raw
+  #    │   ├── DE
+  #    │   ├── EN
+  #    │   ├── FR
+  #    │   ├── JA
+  #    │   ├── KO
+  #    │   ├── openemilia_45batches.tar.gz
+  #    │   ├── openemilia_all.tar.gz
+  #    │   └── ZH
+  #    └── README.md
+
+  if [ ! -d $dl_dir/Amphion___Emilia/raw ]; then
+    log "Please refer https://openxlab.org.cn/datasets/Amphion/Emilia to download the dataset."
+    exit(-1)
+  fi
+
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+  log "Stage 1: Prepare emilia manifests (EN and ZH only)"
+  # We assume that you have downloaded the Emilia corpus
+  # to $dl_dir/Amphion___Emilia
+  # see stage 0 for the directory structure
+  mkdir -p data/manifests
+  if [ ! -e data/manifests/.emilia.done ]; then
+    lhotse prepare emilia --lang en --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
+    lhotse prepare emilia --lang zh --num-jobs ${nj} $dl_dir/Amphion___Emilia data/manifests
+    touch data/manifests/.emilia.done
+  fi
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+  log "Stage 2: Preprocess Emilia dataset, mainly for cleaning"
+  mkdir -p data/manifests/splits_raw
+  if [ ! -e data/manifests/split_raw/.emilia.split.done ]; then
+    lhotse split-lazy data/manifests/emilia_cuts_EN.jsonl.gz data/manifests/splits_raw 10000
+    lhotse split-lazy data/manifests/emilia_cuts_ZH.jsonl.gz data/manifests/splits_raw 10000
+    touch data/manifests/splits_raw/.emilia.split.done
+  fi
+
+  mkdir -p data/manifests/splits
+
+  if [ ! -e data/manifests/splits/.emilia.preprocess.done ]; then
+    python local/preprocess_emilia.py --subset EN
+    python local/preprocess_emilia.py --subset ZH
+    touch data/manifests/splits/.emilia.preprocess.done
+  fi
+
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
+  log "Stage 3: Extract Fbank for Emilia"
+  mkdir -p data/fbank/emilia_splits
+  if [ ! -e data/fbank/emilia_splits/.emilia.fbank.done ]; then
+    # You can speed up the extraction by distributing splits to multiple machines.
+    for subset in EN ZH; do
+      python local/compute_fbank.py \
+        --source-dir data/manifests/splits \
+        --dest-dir data/fbank/emilia_splits \
+        --dataset emilia \
+        --subset ${subset} \
+        --splits-cuts 1 \
+        --split-begin 0 \
+        --split-end 2000 \
+        --num-jobs ${nj}
+    done
+    touch data/fbank/emilia_splits/.emilia.fbank.done
+  fi
+
+  if [ ! -e data/fbank/emilia_cuts_EN.jsonl.gz ]; then
+    log "Combining EN fbank cuts and spliting EN dev set"
+    gunzip -c data/fbank/emilia_splits/emilia_cuts_EN.*.jsonl.gz > data/fbank/emilia_cuts_EN.jsonl
+    head -n 1500 data/fbank/emilia_cuts_EN.jsonl | gzip -c > data/fbank/emilia_cuts_EN_dev.jsonl.gz
+    sed -i '1,1500d' data/fbank/emilia_cuts_EN.jsonl
+    gzip data/fbank/emilia_cuts_EN.jsonl
+  fi
+
+  if [ ! -e data/fbank/emilia_cuts_ZH.jsonl.gz ]; then
+    log "Combining ZH fbank cuts and spliting ZH dev set"
+    gunzip -c data/fbank/emilia_splits/emilia_cuts_ZH.*.jsonl.gz > data/fbank/emilia_cuts_ZH.jsonl
+    head -n 1500 data/fbank/emilia_cuts_ZH.jsonl | gzip -c > data/fbank/emilia_cuts_ZH_dev.jsonl.gz
+    sed -i '1,1500d' data/fbank/emilia_cuts_ZH.jsonl
+    gzip data/fbank/emilia_cuts_ZH.jsonl
+  fi
+
+fi
+
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+  log "Stage 4: Generate token file"
+  if [ ! -e data/tokens_emilia.txt ]; then
+    ./local/prepare_token_file_emilia.py --tokens data/tokens_emilia.txt
+  fi
+fi
--- a/egs/zipvoice/scripts/prepare_libritts.sh
+++ b/egs/zipvoice/scripts/prepare_libritts.sh
@ -8,7 +8,7 @@ set -eou pipefail
 stage=0
 stop_stage=5
 sampling_rate=24000
-nj=32
+nj=20

 dl_dir=$PWD/download

@ -44,44 +44,53 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  # to $dl_dir/LibriTTS
  mkdir -p data/manifests_libritts
  if [ ! -e data/manifests_libritts/.libritts.done ]; then
-    lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests_libritts
-    touch data/manifests_libritts/.libritts.done
+    lhotse prepare libritts --num-jobs ${nj} $dl_dir/LibriTTS data/manifests
+    touch data/manifests/.libritts.done
  fi
 fi

 if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Compute Fbank for LibriTTS"
  mkdir -p data/fbank
-  if [ ! -e data/fbank_libritts/.libritts.done ]; then
-    ./local/compute_fbank_libritts.py --sampling-rate $sampling_rate
-    touch data/fbank_libritts/.libritts.done
+
+  if [ ! -e data/fbank/.libritts.done ]; then
+    for subset in train-clean-100 train-clean-360 train-other-500 dev-clean test-clean; do
+      python local/compute_fbank.py \
+        --source-dir data/manifests \
+        --dest-dir data/fbank \
+        --dataset libritts \
+        --subset ${subset} \
+        --sampling-rate $sampling_rate \
+        --num-jobs ${nj}
+    done
+    touch data/fbank/.libritts.done
  fi

  # Here we shuffle and combine the train-clean-100, train-clean-360 and
  # train-other-500 together to form the training set.
-  if [ ! -f data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz ]; then
-    cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \
-      <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) \
-      <(gunzip -c data/fbank_libritts/libritts_cuts_train-other-500.jsonl.gz) | \
-      shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz
+  if [ ! -f data/fbank/libritts_cuts_train-all-shuf.jsonl.gz ]; then
+    cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
+      <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) \
+      <(gunzip -c data/fbank/libritts_cuts_train-other-500.jsonl.gz) | \
+      shuf | gzip -c > data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
  fi

-  if [ ! -f data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz ]; then
-    cat <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-100.jsonl.gz) \
-      <(gunzip -c data/fbank_libritts/libritts_cuts_train-clean-360.jsonl.gz) | \
-      shuf | gzip -c > data/fbank_libritts/libritts_cuts_train-clean-460.jsonl.gz
+  if [ ! -f data/fbank/libritts_cuts_train-clean-460.jsonl.gz ]; then
+    cat <(gunzip -c data/fbank/libritts_cuts_train-clean-100.jsonl.gz) \
+      <(gunzip -c data/fbank/libritts_cuts_train-clean-360.jsonl.gz) | \
+      shuf | gzip -c > data/fbank/libritts_cuts_train-clean-460.jsonl.gz
  fi

-  if [ ! -e data/fbank_libritts/.libritts-validated.done ]; then
+  if [ ! -e data/fbank/.libritts-validated.done ]; then
    log "Validating data/fbank for LibriTTS"
    ./local/validate_manifest.py \
-      data/fbank_libritts/libritts_cuts_train-all-shuf.jsonl.gz
-    touch data/fbank_libritts/.libritts-validated.done
+      data/fbank/libritts_cuts_train-all-shuf.jsonl.gz
+    touch data/fbank/.libritts-validated.done
  fi
 fi

 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "Stage 4: Generate token file"
+  log "Stage 3: Generate token file"
  if [ ! -e data/tokens_libritts.txt ]; then
    ./local/prepare_token_file_libritts.py --tokens data/tokens_libritts.txt
  fi
--- a/egs/zipvoice/shared
+++ b/egs/zipvoice/shared
@ -0,0 +1 @@
+../../icefall/shared
--- a/egs/zipvoice/zipvoice/train_distill.py
+++ b/egs/zipvoice/zipvoice/train_distill.py
@ -31,7 +31,7 @@ python3 zipvoice/train_distill.py \
        --base-lr 0.0005 \
        --max-duration 500 \
        --token-file "data/tokens_emilia.txt" \
-        --manifest-dir "data/fbank_emilia" \
+        --manifest-dir "data/fbank" \
        --teacher-model zipvoice/exp_zipvoice/epoch-11-avg-4.pt \
        --num-updates 60000 \
        --distill-stage "first" \
@ -46,7 +46,7 @@ python3 zipvoice/train_distill.py \
        --base-lr 0.0001 \
        --max-duration 500 \
        --token-file "data/tokens_emilia.txt" \
-        --manifest-dir "data/fbank_emilia" \
+        --manifest-dir "data/fbank" \
        --teacher-model zipvoice/exp_zipvoice_distill_1stage/iter-60000-avg-7.pt \
        --num-updates 2000 \
        --distill-stage "second" \
--- a/egs/zipvoice/zipvoice/train_flow.py
+++ b/egs/zipvoice/zipvoice/train_flow.py
@ -29,7 +29,7 @@ python3 zipvoice/train_flow.py \
        --lr-hours 30000 \
        --lr-batches 7500 \
        --token-file "data/tokens_emilia.txt" \
-        --manifest-dir "data/fbank_emilia" \
+        --manifest-dir "data/fbank" \
        --num-epochs 11 \
        --exp-dir zipvoice/exp_zipvoice
 """
--- a/egs/zipvoice/zipvoice/tts_datamodule.py
+++ b/egs/zipvoice/zipvoice/tts_datamodule.py
@ -347,14 +347,14 @@ class TtsDataModule:
            train-clean-360 and train-other-500 cuts"
        )
        return load_manifest_lazy(
-            self.args.manifest_dir / "libritts_cuts_with_tokens_train-all-shuf.jsonl.gz"
+            self.args.manifest_dir / "libritts_cuts_train-all-shuf.jsonl.gz"
        )

    @lru_cache()
    def dev_libritts_cuts(self) -> CutSet:
        logging.info("About to get dev-clean cuts")
        return load_manifest_lazy(
-            self.args.manifest_dir / "libritts_cuts_with_tokens_dev-clean.jsonl.gz"
+            self.args.manifest_dir / "libritts_cuts_dev-clean.jsonl.gz"
        )