Merge c25dc02d5d192a03fc61302d05d2ee602c008b4d into 9293edc62f4a3ebf769d66cc037d4e67953440f5

This commit is contained in:
Fangjun Kuang 2025-07-08 17:44:55 +08:00 committed by GitHub
commit bf048133e1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
37 changed files with 3415 additions and 6 deletions

118
.github/scripts/aishell3/TTS/run.sh vendored Executable file
View File

@ -0,0 +1,118 @@
#!/usr/bin/env bash

# CI entry point for the aishell3 TTS (VITS) recipe: installs extra
# dependencies, then runs data download/preparation, a short training,
# ONNX export, and an inference smoke test.

set -ex

# piper_phonemize wheels are hosted on a custom index; numba, pypinyin and
# cython are needed by the recipe itself, jq by the CI tooling.
python3 -m pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
python3 -m pip install numba
python3 -m pip install pypinyin
python3 -m pip install cython

apt-get update
apt-get install -y jq

# Print a timestamped log line prefixed with the caller's file/line/function.
log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

cd egs/aishell3/TTS

# Shrink size thresholds in prepare.sh (1000 -> 10) so the CI run stays small.
# NOTE(review): this global sed rewrites EVERY occurrence of "1000" in
# prepare.sh — confirm no unrelated constant is affected.
sed -i.bak s/1000/10/g ./prepare.sh
# Download the trimmed aishell3 CI dataset into ./download.
function download_data() {
  mkdir download
  pushd download
  # Small aishell3 subset hosted specifically for CI runs.
  curl -SL -O https://huggingface.co/csukuangfj/aishell3-ci-data/resolve/main/aishell3.tar.bz2
  tar xf aishell3.tar.bz2
  rm aishell3.tar.bz2
  ls -lh
  popd
}
# Run the recipe's data preparation, then dump summaries of the generated
# token table and lexicon for debugging in the CI log.
function prepare_data() {
  ./prepare.sh

  echo "----------tokens.txt----------"
  cat data/tokens.txt
  echo "------------------------------"
  wc -l data/tokens.txt
  echo "------------------------------"

  echo "----------lexicon.txt----------"
  head data/lexicon.txt
  echo "----"
  tail data/lexicon.txt
  echo "----"
  wc -l data/lexicon.txt
}
# Train a tiny VITS model for one epoch (low-quality config only in CI).
function train() {
  pushd ./vits
  # Shrink a hard-coded 200 in train.py to 50 to keep the CI run fast.
  # NOTE(review): this sed rewrites EVERY "200" in train.py — confirm only
  # the intended constant is affected.
  sed -i.bak s/200/50/g ./train.py
  git diff .
  popd

  # for t in low medium high; do
  for t in low; do
    ./vits/train.py \
      --exp-dir vits/exp-$t \
      --model-type $t \
      --num-epochs 1 \
      --save-every-n 1 \
      --num-buckets 2 \
      --tokens data/tokens.txt \
      --max-duration 20

    ls -lh vits/exp-$t
  done
}
# Export the checkpoint trained above (epoch 1) to ONNX.
function export_onnx() {
  # for t in low medium high; do
  for t in low; do
    ./vits/export-onnx.py \
      --model-type $t \
      --epoch 1 \
      --exp-dir ./vits/exp-$t \
      --tokens data/tokens.txt \
      --speakers ./data/speakers.txt

    ls -lh vits/exp-$t/
  done
}
# Download a pretrained low-quality model, export it to ONNX, and run a
# text-to-speech smoke test with sherpa-onnx.
function test_low() {
  git clone https://huggingface.co/csukuangfj/icefall-tts-aishell3-vits-low-2024-04-06
  repo=icefall-tts-aishell3-vits-low-2024-04-06

  ./vits/export-onnx.py \
    --model-type low \
    --epoch 1000 \
    --exp-dir $repo/exp \
    --tokens $repo/data/tokens.txt \
    --speakers $repo/data/speakers.txt

  ls -lh $repo/exp/vits-epoch-1000.onnx

  python3 -m pip install sherpa-onnx

  # Fix: use the model exported above (epoch 1000).  The original pointed
  # at vits-epoch-960.onnx, which this script never creates.
  sherpa-onnx-offline-tts \
    --vits-model=$repo/exp/vits-epoch-1000.onnx \
    --vits-tokens=$repo/data/tokens.txt \
    --vits-lexicon=$repo/data/lexicon.txt \
    --num-threads=1 \
    --vits-length-scale=1.0 \
    --sid=33 \
    --output-filename=/icefall/low.wav \
    --debug=1 \
    "这是一个语音合成测试"
}
# Pipeline entry: each step consumes the previous step's outputs.
download_data
prepare_data
train
export_onnx
test_low

84
.github/workflows/aishell3.yml vendored Normal file
View File

@ -0,0 +1,84 @@
name: aishell3

on:
  push:
    branches:
      - master
      - tts-aishell3
  pull_request:
    branches:
      - master
  workflow_dispatch:

concurrency:
  group: aishell3-${{ github.ref }}
  cancel-in-progress: true

jobs:
  generate_build_matrix:
    # Fix: the original compared github.event_name to 'aishell3', which is
    # never a valid event name, so manual runs were rejected; the declared
    # workflow_dispatch trigger is what was intended.
    if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && (github.event.label.name == 'ready' || github.event_name == 'push' || github.event_name == 'workflow_dispatch')
    # see https://github.com/pytorch/pytorch/pull/50633
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Generating build matrix
        id: set-matrix
        run: |
          # outputting for debugging purposes
          python ./.github/scripts/docker/generate_build_matrix.py
          MATRIX=$(python ./.github/scripts/docker/generate_build_matrix.py)
          # Fix: the ::set-output workflow command is deprecated;
          # write to $GITHUB_OUTPUT instead.
          echo "matrix=${MATRIX}" >> "$GITHUB_OUTPUT"

  aishell3:
    needs: generate_build_matrix
    name: py${{ matrix.python-version }} torch${{ matrix.torch-version }} v${{ matrix.version }}
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Free space
        shell: bash
        run: |
          df -h
          rm -rf /opt/hostedtoolcache
          df -h
          echo "pwd: $PWD"
          echo "github.workspace ${{ github.workspace }}"

      - name: Run aishell3 tests
        uses: addnab/docker-run-action@v3
        with:
          image: ghcr.io/${{ github.repository_owner }}/icefall:cpu-py${{ matrix.python-version }}-torch${{ matrix.torch-version }}-v${{ matrix.version }}
          options: |
            --volume ${{ github.workspace }}/:/icefall
          shell: bash
          run: |
            export PYTHONPATH=/icefall:$PYTHONPATH
            cd /icefall
            git config --global --add safe.directory /icefall

            .github/scripts/aishell3/TTS/run.sh

      - name: display files
        shell: bash
        run: |
          ls -lh

      - uses: actions/upload-artifact@v4
        if: matrix.python-version == '3.9' && matrix.torch-version == '2.2.0'
        with:
          name: generated-test-files-${{ matrix.python-version }}-${{ matrix.torch-version }}
          path: ./*.wav

4
.gitignore vendored
View File

@ -36,3 +36,7 @@ node_modules
.DS_Store
*.fst
*.arpa
core.c
*.so
build
*.wav

View File

@ -19,7 +19,7 @@ Install extra dependencies
.. code-block:: bash
pip install piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html
pip install numba espnet_tts_frontend
pip install numba espnet_tts_frontend cython
Data preparation
----------------

View File

@ -0,0 +1,110 @@
#!/usr/bin/env python3
# Copyright 2021-2023 Xiaomi Corp. (authors: Fangjun Kuang,
# Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file computes fbank features of the aishell3 dataset.
It looks for manifests in the directory data/manifests.
The generated spectrogram features are saved in data/spectrogram.
"""
import logging
import os
from pathlib import Path
import torch
from lhotse import (
CutSet,
LilcomChunkyWriter,
Spectrogram,
SpectrogramConfig,
load_manifest,
)
from lhotse.audio import RecordingSet
from lhotse.supervision import SupervisionSet
from icefall.utils import get_executor
# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
# even when we are not invoking the main (e.g. when spawning subprocesses).
torch.set_num_threads(1)
torch.set_num_interop_threads(1)
def compute_spectrogram_aishell3():
    """Compute linear spectrogram features for the aishell3 dataset.

    Reads recording/supervision manifests from ``data/manifests``,
    resamples the audio to 8 kHz, extracts spectrograms, and writes
    cut manifests with attached features to ``data/spectrogram``.
    Partitions whose cuts file already exists are skipped.
    """
    src_dir = Path("data/manifests")
    output_dir = Path("data/spectrogram")
    num_jobs = min(4, os.cpu_count())

    # VITS "low" config operates on 8 kHz audio; aishell3 recordings are
    # resampled below.  TODO confirm 8000 matches the training config.
    sampling_rate = 8000
    frame_length = 1024 / sampling_rate  # (in second)
    frame_shift = 256 / sampling_rate  # (in second)
    use_fft_mag = True

    prefix = "aishell3"
    suffix = "jsonl.gz"
    partitions = ("test", "train")

    config = SpectrogramConfig(
        sampling_rate=sampling_rate,
        frame_length=frame_length,
        frame_shift=frame_shift,
        use_fft_mag=use_fft_mag,
    )
    extractor = Spectrogram(config)

    for partition in partitions:
        cuts_filename = f"{prefix}_cuts_{partition}.{suffix}"
        if (output_dir / cuts_filename).is_file():
            logging.info(f"{cuts_filename} already exists - skipping.")
            # Fix: the original used `return` here, which aborted ALL
            # remaining partitions instead of skipping only this one.
            continue

        recordings = load_manifest(
            src_dir / f"{prefix}_recordings_{partition}.{suffix}", RecordingSet
        )
        supervisions = load_manifest(
            src_dir / f"{prefix}_supervisions_{partition}.{suffix}", SupervisionSet
        )

        # resample from 44100 to 8000
        recordings = recordings.resample(sampling_rate)

        with get_executor() as ex:  # Initialize the executor only once.
            logging.info(f"Processing {partition}")
            cut_set = CutSet.from_manifests(
                recordings=recordings, supervisions=supervisions
            )
            cut_set = cut_set.compute_and_store_features(
                extractor=extractor,
                storage_path=f"{output_dir}/{prefix}_feats_{partition}",
                # when an executor is specified, make more partitions
                num_jobs=num_jobs if ex is None else 80,
                executor=ex,
                storage_type=LilcomChunkyWriter,
            )
            cut_set.to_file(output_dir / cuts_filename)


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    compute_spectrogram_aishell3()

View File

@ -0,0 +1,68 @@
#!/usr/bin/env python3
"""
This file generates the file lexicon.txt that contains pronunciations of all
words and phrases
"""
from pypinyin import phrases_dict, pinyin_dict
from tokenizer import Tokenizer
import argparse
def get_parser():
    """Build the argument parser for lexicon generation."""
    ap = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    # (flag, default, help) triples; both options are plain string paths.
    options = (
        ("--tokens", "data/tokens.txt", """Path to vocabulary."""),
        ("--lexicon", "data/lexicon.txt", """Path to save the generated lexicon."""),
    )
    for flag, default, help_text in options:
        ap.add_argument(flag, type=str, default=default, help=help_text)
    return ap
def main():
    """Generate a lexicon file mapping each CJK character to its phonemes.

    Every codepoint in the CJK Unified Ideographs block that pypinyin knows
    about is written as one ``<char> <phoneme> <phoneme> ...`` line to the
    file given by --lexicon.
    """
    args = get_parser().parse_args()

    tokenizer = Tokenizer(args.tokens)

    word_dict = pinyin_dict.pinyin_dict
    # Kept for the commented-out phrase handling below (TODO).
    phrases = phrases_dict.phrases_dict

    with open(args.lexicon, "w", encoding="utf-8") as f:
        for key in word_dict:
            # Only keep codepoints in the CJK Unified Ideographs block.
            if not (0x4E00 <= key <= 0x9FFF):
                continue

            w = chr(key)

            # [1:-1] removes the initial "sil" and the final "eos".
            # (Also removed a dead `i = 0` counter and renamed the loop
            # variable so it no longer shadows args.tokens.)
            phonemes = tokenizer.text_to_tokens(w)[1:-1]
            f.write(f"{w} {' '.join(phonemes)}\n")

        # TODO(fangjun): Add phrases
        #  for key in phrases:
        #      phonemes = tokenizer.text_to_tokens(key)[1:-1]
        #      f.write(f"{key} {' '.join(phonemes)}\n")


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,421 @@
# This dict is copied from
# https://github.com/UEhQZXI/vits_chinese/blob/master/vits_strings.py
pinyin_dict = {
"a": ("^", "a"),
"ai": ("^", "ai"),
"an": ("^", "an"),
"ang": ("^", "ang"),
"ao": ("^", "ao"),
"ba": ("b", "a"),
"bai": ("b", "ai"),
"ban": ("b", "an"),
"bang": ("b", "ang"),
"bao": ("b", "ao"),
"be": ("b", "e"),
"bei": ("b", "ei"),
"ben": ("b", "en"),
"beng": ("b", "eng"),
"bi": ("b", "i"),
"bian": ("b", "ian"),
"biao": ("b", "iao"),
"bie": ("b", "ie"),
"bin": ("b", "in"),
"bing": ("b", "ing"),
"bo": ("b", "o"),
"bu": ("b", "u"),
"ca": ("c", "a"),
"cai": ("c", "ai"),
"can": ("c", "an"),
"cang": ("c", "ang"),
"cao": ("c", "ao"),
"ce": ("c", "e"),
"cen": ("c", "en"),
"ceng": ("c", "eng"),
"cha": ("ch", "a"),
"chai": ("ch", "ai"),
"chan": ("ch", "an"),
"chang": ("ch", "ang"),
"chao": ("ch", "ao"),
"che": ("ch", "e"),
"chen": ("ch", "en"),
"cheng": ("ch", "eng"),
"chi": ("ch", "iii"),
"chong": ("ch", "ong"),
"chou": ("ch", "ou"),
"chu": ("ch", "u"),
"chua": ("ch", "ua"),
"chuai": ("ch", "uai"),
"chuan": ("ch", "uan"),
"chuang": ("ch", "uang"),
"chui": ("ch", "uei"),
"chun": ("ch", "uen"),
"chuo": ("ch", "uo"),
"ci": ("c", "ii"),
"cong": ("c", "ong"),
"cou": ("c", "ou"),
"cu": ("c", "u"),
"cuan": ("c", "uan"),
"cui": ("c", "uei"),
"cun": ("c", "uen"),
"cuo": ("c", "uo"),
"da": ("d", "a"),
"dai": ("d", "ai"),
"dan": ("d", "an"),
"dang": ("d", "ang"),
"dao": ("d", "ao"),
"de": ("d", "e"),
"dei": ("d", "ei"),
"den": ("d", "en"),
"deng": ("d", "eng"),
"di": ("d", "i"),
"dia": ("d", "ia"),
"dian": ("d", "ian"),
"diao": ("d", "iao"),
"die": ("d", "ie"),
"ding": ("d", "ing"),
"diu": ("d", "iou"),
"dong": ("d", "ong"),
"dou": ("d", "ou"),
"du": ("d", "u"),
"duan": ("d", "uan"),
"dui": ("d", "uei"),
"dun": ("d", "uen"),
"duo": ("d", "uo"),
"e": ("^", "e"),
"ei": ("^", "ei"),
"en": ("^", "en"),
"ng": ("^", "en"),
"eng": ("^", "eng"),
"er": ("^", "er"),
"fa": ("f", "a"),
"fan": ("f", "an"),
"fang": ("f", "ang"),
"fei": ("f", "ei"),
"fen": ("f", "en"),
"feng": ("f", "eng"),
"fo": ("f", "o"),
"fou": ("f", "ou"),
"fu": ("f", "u"),
"ga": ("g", "a"),
"gai": ("g", "ai"),
"gan": ("g", "an"),
"gang": ("g", "ang"),
"gao": ("g", "ao"),
"ge": ("g", "e"),
"gei": ("g", "ei"),
"gen": ("g", "en"),
"geng": ("g", "eng"),
"gong": ("g", "ong"),
"gou": ("g", "ou"),
"gu": ("g", "u"),
"gua": ("g", "ua"),
"guai": ("g", "uai"),
"guan": ("g", "uan"),
"guang": ("g", "uang"),
"gui": ("g", "uei"),
"gun": ("g", "uen"),
"guo": ("g", "uo"),
"ha": ("h", "a"),
"hai": ("h", "ai"),
"han": ("h", "an"),
"hang": ("h", "ang"),
"hao": ("h", "ao"),
"he": ("h", "e"),
"hei": ("h", "ei"),
"hen": ("h", "en"),
"heng": ("h", "eng"),
"hong": ("h", "ong"),
"hou": ("h", "ou"),
"hu": ("h", "u"),
"hua": ("h", "ua"),
"huai": ("h", "uai"),
"huan": ("h", "uan"),
"huang": ("h", "uang"),
"hui": ("h", "uei"),
"hun": ("h", "uen"),
"huo": ("h", "uo"),
"ji": ("j", "i"),
"jia": ("j", "ia"),
"jian": ("j", "ian"),
"jiang": ("j", "iang"),
"jiao": ("j", "iao"),
"jie": ("j", "ie"),
"jin": ("j", "in"),
"jing": ("j", "ing"),
"jiong": ("j", "iong"),
"jiu": ("j", "iou"),
"ju": ("j", "v"),
"juan": ("j", "van"),
"jue": ("j", "ve"),
"jun": ("j", "vn"),
"ka": ("k", "a"),
"kai": ("k", "ai"),
"kan": ("k", "an"),
"kang": ("k", "ang"),
"kao": ("k", "ao"),
"ke": ("k", "e"),
"kei": ("k", "ei"),
"ken": ("k", "en"),
"keng": ("k", "eng"),
"kong": ("k", "ong"),
"kou": ("k", "ou"),
"ku": ("k", "u"),
"kua": ("k", "ua"),
"kuai": ("k", "uai"),
"kuan": ("k", "uan"),
"kuang": ("k", "uang"),
"kui": ("k", "uei"),
"kun": ("k", "uen"),
"kuo": ("k", "uo"),
"la": ("l", "a"),
"lai": ("l", "ai"),
"lan": ("l", "an"),
"lang": ("l", "ang"),
"lao": ("l", "ao"),
"le": ("l", "e"),
"lei": ("l", "ei"),
"leng": ("l", "eng"),
"li": ("l", "i"),
"lia": ("l", "ia"),
"lian": ("l", "ian"),
"liang": ("l", "iang"),
"liao": ("l", "iao"),
"lie": ("l", "ie"),
"lin": ("l", "in"),
"ling": ("l", "ing"),
"liu": ("l", "iou"),
"lo": ("l", "o"),
"long": ("l", "ong"),
"lou": ("l", "ou"),
"lu": ("l", "u"),
"lv": ("l", "v"),
"luan": ("l", "uan"),
"lve": ("l", "ve"),
"lue": ("l", "ve"),
"lun": ("l", "uen"),
"luo": ("l", "uo"),
"ma": ("m", "a"),
"mai": ("m", "ai"),
"man": ("m", "an"),
"mang": ("m", "ang"),
"mao": ("m", "ao"),
"me": ("m", "e"),
"mei": ("m", "ei"),
"men": ("m", "en"),
"meng": ("m", "eng"),
"mi": ("m", "i"),
"mian": ("m", "ian"),
"miao": ("m", "iao"),
"mie": ("m", "ie"),
"min": ("m", "in"),
"ming": ("m", "ing"),
"miu": ("m", "iou"),
"mo": ("m", "o"),
"mou": ("m", "ou"),
"mu": ("m", "u"),
"na": ("n", "a"),
"nai": ("n", "ai"),
"nan": ("n", "an"),
"nang": ("n", "ang"),
"nao": ("n", "ao"),
"ne": ("n", "e"),
"nei": ("n", "ei"),
"nen": ("n", "en"),
"neng": ("n", "eng"),
"ni": ("n", "i"),
"nia": ("n", "ia"),
"nian": ("n", "ian"),
"niang": ("n", "iang"),
"niao": ("n", "iao"),
"nie": ("n", "ie"),
"nin": ("n", "in"),
"ning": ("n", "ing"),
"niu": ("n", "iou"),
"nong": ("n", "ong"),
"nou": ("n", "ou"),
"nu": ("n", "u"),
"nv": ("n", "v"),
"nuan": ("n", "uan"),
"nve": ("n", "ve"),
"nue": ("n", "ve"),
"nuo": ("n", "uo"),
"o": ("^", "o"),
"ou": ("^", "ou"),
"pa": ("p", "a"),
"pai": ("p", "ai"),
"pan": ("p", "an"),
"pang": ("p", "ang"),
"pao": ("p", "ao"),
"pe": ("p", "e"),
"pei": ("p", "ei"),
"pen": ("p", "en"),
"peng": ("p", "eng"),
"pi": ("p", "i"),
"pian": ("p", "ian"),
"piao": ("p", "iao"),
"pie": ("p", "ie"),
"pin": ("p", "in"),
"ping": ("p", "ing"),
"po": ("p", "o"),
"pou": ("p", "ou"),
"pu": ("p", "u"),
"qi": ("q", "i"),
"qia": ("q", "ia"),
"qian": ("q", "ian"),
"qiang": ("q", "iang"),
"qiao": ("q", "iao"),
"qie": ("q", "ie"),
"qin": ("q", "in"),
"qing": ("q", "ing"),
"qiong": ("q", "iong"),
"qiu": ("q", "iou"),
"qu": ("q", "v"),
"quan": ("q", "van"),
"que": ("q", "ve"),
"qun": ("q", "vn"),
"ran": ("r", "an"),
"rang": ("r", "ang"),
"rao": ("r", "ao"),
"re": ("r", "e"),
"ren": ("r", "en"),
"reng": ("r", "eng"),
"ri": ("r", "iii"),
"rong": ("r", "ong"),
"rou": ("r", "ou"),
"ru": ("r", "u"),
"rua": ("r", "ua"),
"ruan": ("r", "uan"),
"rui": ("r", "uei"),
"run": ("r", "uen"),
"ruo": ("r", "uo"),
"sa": ("s", "a"),
"sai": ("s", "ai"),
"san": ("s", "an"),
"sang": ("s", "ang"),
"sao": ("s", "ao"),
"se": ("s", "e"),
"sen": ("s", "en"),
"seng": ("s", "eng"),
"sha": ("sh", "a"),
"shai": ("sh", "ai"),
"shan": ("sh", "an"),
"shang": ("sh", "ang"),
"shao": ("sh", "ao"),
"she": ("sh", "e"),
"shei": ("sh", "ei"),
"shen": ("sh", "en"),
"sheng": ("sh", "eng"),
"shi": ("sh", "iii"),
"shou": ("sh", "ou"),
"shu": ("sh", "u"),
"shua": ("sh", "ua"),
"shuai": ("sh", "uai"),
"shuan": ("sh", "uan"),
"shuang": ("sh", "uang"),
"shui": ("sh", "uei"),
"shun": ("sh", "uen"),
"shuo": ("sh", "uo"),
"si": ("s", "ii"),
"song": ("s", "ong"),
"sou": ("s", "ou"),
"su": ("s", "u"),
"suan": ("s", "uan"),
"sui": ("s", "uei"),
"sun": ("s", "uen"),
"suo": ("s", "uo"),
"ta": ("t", "a"),
"tai": ("t", "ai"),
"tan": ("t", "an"),
"tang": ("t", "ang"),
"tao": ("t", "ao"),
"te": ("t", "e"),
"tei": ("t", "ei"),
"teng": ("t", "eng"),
"ti": ("t", "i"),
"tian": ("t", "ian"),
"tiao": ("t", "iao"),
"tie": ("t", "ie"),
"ting": ("t", "ing"),
"tong": ("t", "ong"),
"tou": ("t", "ou"),
"tu": ("t", "u"),
"tuan": ("t", "uan"),
"tui": ("t", "uei"),
"tun": ("t", "uen"),
"tuo": ("t", "uo"),
"wa": ("^", "ua"),
"wai": ("^", "uai"),
"wan": ("^", "uan"),
"wang": ("^", "uang"),
"wei": ("^", "uei"),
"wen": ("^", "uen"),
"weng": ("^", "ueng"),
"wo": ("^", "uo"),
"wu": ("^", "u"),
"xi": ("x", "i"),
"xia": ("x", "ia"),
"xian": ("x", "ian"),
"xiang": ("x", "iang"),
"xiao": ("x", "iao"),
"xie": ("x", "ie"),
"xin": ("x", "in"),
"xing": ("x", "ing"),
"xiong": ("x", "iong"),
"xiu": ("x", "iou"),
"xu": ("x", "v"),
"xuan": ("x", "van"),
"xue": ("x", "ve"),
"xun": ("x", "vn"),
"ya": ("^", "ia"),
"yan": ("^", "ian"),
"yang": ("^", "iang"),
"yao": ("^", "iao"),
"ye": ("^", "ie"),
"yi": ("^", "i"),
"yin": ("^", "in"),
"ying": ("^", "ing"),
"yo": ("^", "iou"),
"yong": ("^", "iong"),
"you": ("^", "iou"),
"yu": ("^", "v"),
"yuan": ("^", "van"),
"yue": ("^", "ve"),
"yun": ("^", "vn"),
"za": ("z", "a"),
"zai": ("z", "ai"),
"zan": ("z", "an"),
"zang": ("z", "ang"),
"zao": ("z", "ao"),
"ze": ("z", "e"),
"zei": ("z", "ei"),
"zen": ("z", "en"),
"zeng": ("z", "eng"),
"zha": ("zh", "a"),
"zhai": ("zh", "ai"),
"zhan": ("zh", "an"),
"zhang": ("zh", "ang"),
"zhao": ("zh", "ao"),
"zhe": ("zh", "e"),
"zhei": ("zh", "ei"),
"zhen": ("zh", "en"),
"zheng": ("zh", "eng"),
"zhi": ("zh", "iii"),
"zhong": ("zh", "ong"),
"zhou": ("zh", "ou"),
"zhu": ("zh", "u"),
"zhua": ("zh", "ua"),
"zhuai": ("zh", "uai"),
"zhuan": ("zh", "uan"),
"zhuang": ("zh", "uang"),
"zhui": ("zh", "uei"),
"zhun": ("zh", "uen"),
"zhuo": ("zh", "uo"),
"zi": ("z", "ii"),
"zong": ("z", "ong"),
"zou": ("z", "ou"),
"zu": ("z", "u"),
"zuan": ("z", "uan"),
"zui": ("z", "uei"),
"zun": ("z", "uen"),
"zuo": ("z", "uo"),
}

View File

@ -0,0 +1,53 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file generates the file tokens.txt that maps tokens to IDs.
"""
import argparse
import logging
from pathlib import Path
from typing import Dict
from symbols import symbols
def get_args():
    """Parse command-line arguments for token-table generation."""
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--tokens",
        type=Path,
        default=Path("data/tokens.txt"),
        help="Path to the dict that maps the text tokens to IDs",
    )
    return ap.parse_args()
def main():
    """Write the symbol table to --tokens, one ``<token> <id>`` per line."""
    out_file = Path(get_args().tokens)
    lines = (f"{sym} {idx}\n" for idx, sym in enumerate(symbols))
    with open(out_file, "w", encoding="utf-8") as f:
        f.writelines(lines)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,62 @@
#!/usr/bin/env python3
# Copyright 2023 Xiaomi Corp. (authors: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file reads the texts in given manifest and save the new cuts with tokens.
"""
import logging
from pathlib import Path
from lhotse import CutSet, load_manifest
from tokenizer import Tokenizer
def prepare_tokens_aishell3():
    """Attach phoneme token sequences to every cut of the aishell3 manifests.

    For each partition, loads ``data/spectrogram/aishell3_cuts_<part>.jsonl.gz``,
    tokenizes the supervision text with :class:`Tokenizer`, stores the result
    on ``cut.tokens``, and writes the cuts back as
    ``aishell3_cuts_with_tokens_<part>.jsonl.gz``.
    """
    output_dir = Path("data/spectrogram")
    prefix = "aishell3"
    suffix = "jsonl.gz"
    partitions = ("train", "test")

    tokenizer = Tokenizer()

    for partition in partitions:
        cut_set = load_manifest(output_dir / f"{prefix}_cuts_{partition}.{suffix}")

        # (Removed a dead `i = 0` counter present in the original.)
        new_cuts = []
        for cut in cut_set:
            # Each cut only contains one supervision
            assert len(cut.supervisions) == 1, (len(cut.supervisions), cut)
            text = cut.supervisions[0].text
            cut.tokens = tokenizer.text_to_tokens(text)
            new_cuts.append(cut)

        new_cut_set = CutSet.from_cuts(new_cuts)
        new_cut_set.to_file(
            output_dir / f"{prefix}_cuts_with_tokens_{partition}.{suffix}"
        )


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    prepare_tokens_aishell3()

View File

@ -0,0 +1,328 @@
姐姐 jie3 jie
宝宝 bao3 bao
哥哥 ge1 ge
妹妹 mei4 mei
弟弟 di4 di
妈妈 ma1 ma
开心哦 kai1 xin1 o
爸爸 ba4 ba
秘密哟 mi4 mi4 yo
哦 o
一年 yi4 nian2
一夜 yi2 ye4
一切 yi2 qie4
一座 yi2 zuo4
一下 yi2 xia4
上一山 shang4 yi2 shan1
下一山 xia4 yi2 shan1
休息 xiu1 xi2
东西 dong1 xi
上一届 shang4 yi2 jie4
便宜 pian2 yi4
加长 jia1 chang2
单田芳 shan4 tian2 fang1
帧 zhen1
长时间 chang2 shi2 jian1
长时 chang2 shi2
识别 shi2 bie2
生命中 sheng1 ming4 zhong1
踏实 ta1 shi
嗯 en4
溜达 liu1 da
少儿 shao4 er2
爷爷 ye2 ye
不是 bu2 shi4
一圈 yi1 quan1
厜读一声 zui1 du2 yi4 sheng1
一种 yi4 zhong3
一簇簇 yi2 cu4 cu4
一个 yi2 ge4
一样 yi2 yang4
一跩一跩 yi4 zhuai3 yi4 zhuai3
一会儿 yi2 hui4 er
一幢 yi2 zhuang4
挨了 ai2 le
熬菜 ao1 cai4
扒鸡 pa2 ji1
背枪 bei1 qiang1
绷瓷儿 beng4 ci2 er2
绷劲儿 beng3 jin4 er
绷着脸 beng3 zhe lian3
藏医 zang4 yi1
噌吰 cheng1 hong2
差点儿 cha4 dian3 er
差失 cha1 shi1
差误 cha1 wu4
孱头 can4 tou
乘间 cheng2 jian4
锄镰棘矜 chu2 lian2 ji2 qin2
川藏 chuan1 zang4
穿著 chuan1 zhuo2
答讪 da1 shan4
答言 da1 yan2
大伯子 da4 bai3 zi
大夫 dai4 fu
弹冠 tan2 guan1
当间 dang1 jian4
当然咯 dang1 ran2 lo
点种 dian3 zhong3
垛好 duo4 hao3
发疟子 fa1 yao4 zi
饭熟了 fan4 shou2 le
附著 fu4 zhuo2
复沓 fu4 ta4
供稿 gong1 gao3
供养 gong1 yang3
骨朵 gu1 duo
骨碌 gu1 lu
果脯 guo3 fu3
哈什玛 ha4 shi2 ma3
海蜇 hai3 zhe2
呵欠 he1 qian
河水汤汤 he2 shui3 shang1 shang1
鹄立 hu2 li4
鹄望 hu2 wang4
混人 hun2 ren2
混水 hun2 shui3
鸡血 ji1 xie3
缉鞋口 qi1 xie2 kou3
亟来闻讯 qi4 lai2 wen2 xun4
计量 ji4 liang2
济水 ji3 shui3
间杂 jian4 za2
脚跐两只船 jiao3 ci3 liang3 zhi1 chuan2
脚儿 jue2 er2
口角 kou3 jiao3
勒石 le4 shi2
累进 lei3 jin4
累累如丧家之犬 lei2 lei2 ru2 sang4 jia1 zhi1 quan3
累年 lei3 nian2
脸涨通红 lian3 zhang4 tong1 hong2
踉锵 liang4 qiang1
燎眉毛 liao3 mei2 mao2
燎头发 liao3 tou2 fa4
溜达 liu1 da
溜缝儿 liu4 feng4 er
馏口饭 liu4 kou3 fan4
遛马 liu4 ma3
遛鸟 liu4 niao3
遛弯儿 liu4 wan1 er
楼枪机 lou1 qiang1 ji1
搂钱 lou1 qian2
鹿脯 lu4 fu3
露头 lou4 tou2
落魄 luo4 po4
捋胡子 lv3 hu2 zi
绿地 lv4 di4
麦垛 mai4 duo4
没劲儿 mei2 jin4 er
闷棍 men4 gun4
闷葫芦 men4 hu2 lu
闷头干 men1 tou2 gan4
蒙古 meng3 gu3
靡日不思 mi3 ri4 bu4 si1
缪姓 miao4 xing4
抹墙 mo4 qiang2
抹下脸 ma1 xia4 lian3
泥子 ni4 zi
拗不过 niu4 bu guo4
排车 pai3 che1
盘诘 pan2 jie2
膀肿 pang1 zhong3
炮干 bao1 gan1
炮格 pao2 ge2
碰钉子 peng4 ding1 zi
缥色 piao3 se4
瀑河 bao4 he2
蹊径 xi1 jing4
前后相属 qian2 hou4 xiang1 zhu3
翘尾巴 qiao4 wei3 ba
趄坡儿 qie4 po1 er
秦桧 qin2 hui4
圈马 juan1 ma3
雀盲眼 qiao3 mang2 yan3
雀子 qiao1 zi
三年五载 san1 nian2 wu3 zai3
加载 jia1 zai3
山大王 shan1 dai4 wang
苫屋草 shan4 wu1 cao3
数数 shu3 shu4
说客 shui4 ke4
思量 si1 liang2
伺侯 ci4 hou
踏实 ta1 shi
提溜 di1 liu
调拨 diao4 bo1
帖子 tie3 zi
铜钿 tong2 tian2
头昏脑涨 tou2 hun1 nao3 zhang4
褪色 tui4 se4
褪着手 tun4 zhe shou3
圩子 wei2 zi
尾巴 wei3 ba
系好船只 xi4 hao3 chuan2 zhi1
系好马匹 xi4 hao3 ma3 pi3
杏脯 xing4 fu3
姓单 xing4 shan4
姓葛 xing4 ge3
姓哈 xing4 ha3
姓解 xing4 xie4
姓秘 xing4 bi4
姓宁 xing4 ning4
旋风 xuan4 feng1
旋根车轴 xuan4 gen1 che1 zhou2
荨麻 qian2 ma2
一幢楼房 yi1 zhuang4 lou2 fang2
遗之千金 wei4 zhi1 qian1 jin1
殷殷 yin3 yin3
应招 ying4 zhao1
用称约 yong4 cheng4 yao1
约斤肉 yao1 jin1 rou4
晕机 yun4 ji1
熨贴 yu4 tie1
咋办 za3 ban4
咋呼 zha1 hu
仔兽 zi3 shou4
扎彩 za1 cai3
扎实 zha1 shi
扎腰带 za1 yao1 dai4
轧朋友 ga2 peng2 you3
爪子 zhua3 zi
折腾 zhe1 teng
着实 zhuo2 shi2
着我旧时裳 zhuo2 wo3 jiu4 shi2 chang2
枝蔓 zhi1 man4
中鹄 zhong1 hu2
中选 zhong4 xuan3
猪圈 zhu1 juan4
拽住不放 zhuai4 zhu4 bu4 fang4
转悠 zhuan4 you
庄稼熟了 zhuang1 jia shou2 le
酌量 zhuo2 liang2
罪行累累 zui4 xing2 lei3 lei3
一手 yi4 shou3
一去不复返 yi2 qu4 bu2 fu4 fan3
一颗 yi4 ke1
一件 yi2 jian4
一斤 yi4 jin1
一点 yi4 dian3
一朵 yi4 duo3
一声 yi4 sheng1
一身 yi4 shen1
不要 bu2 yao4
一人 yi4 ren2
一个 yi2 ge4
一把 yi4 ba3
一门 yi4 men2
一門 yi4 men2
一艘 yi4 sou1
一片 yi2 pian4
一篇 yi2 pian1
一份 yi2 fen4
好嗲 hao3 dia3
随地 sui2 di4
扁担长 bian3 dan4 chang3
一堆 yi4 dui1
不义 bu2 yi4
放一放 fang4 yi2 fang4
一米 yi4 mi3
一顿 yi2 dun4
一层楼 yi4 ceng2 lou2
一条 yi4 tiao2
一件 yi2 jian4
一棵 yi4 ke1
一小股 yi4 xiao3 gu3
一拐一拐 yi4 guai3 yi4 guai3
一根 yi4 gen1
沆瀣一气 hang4 xie4 yi2 qi4
一丝 yi4 si1
一毫 yi4 hao2
一樣 yi2 yang4
处处 chu4 chu4
一餐 yi4 can
永不 yong3 bu2
一看 yi2 kan4
一架 yi2 jia4
送还 song4 huan2
一见 yi2 jian4
一座 yi2 zuo4
一块 yi2 kuai4
一天 yi4 tian1
一只 yi4 zhi1
一支 yi4 zhi1
一字 yi2 zi4
一句 yi2 ju4
一张 yi4 zhang1
一條 yi4 tiao2
一场 yi4 chang3
一粒 yi2 li4
小俩口 xiao3 liang3 kou3
一首 yi4 shou3
一对 yi2 dui4
一手 yi4 shou3
又一村 you4 yi4 cun1
一概而论 yi2 gai4 er2 lun4
一峰峰 yi4 feng1 feng1
不但 bu2 dan4
一笑 yi2 xiao4
挠痒痒 nao2 yang3 yang
不对 bu2 dui4
拧开 ning3 kai1
爱不释手 ai4 bu2 shi4 shou3
一念 yi2 nian4
夺得 duo2 de2
一袭 yi4 xi2
一定 yi2 ding4
不慎 bu2 shen4
剽窃 piao2 qie4
一时 yi4 shi2
撇开 pie3 kai1
一祭 yi2 ji4
发卡 fa4 qia3
少不了 shao3 bu4 liao3
千虑一失 qian1 lv4 yi4 shi1
呛得 qiang4 de2
切菜 qie1 cai4
茄盒 qie2 he2
不去 bu2 qu4
一大圈 yi2 da4 quan1
不再 bu2 zai4
一群 yi4 qun2
不必 bu2 bi4
一些 yi4 xie1
一路 yi2 lu4
一股 yi4 gu3
一到 yi2 dao4
一拨 yi4 bo1
一排 yi4 pai2
一空 yi4 kong1
吮吸着 shun3 xi1 zhe
不适合 bu2 shi4 he2
一串串 yi2 chuan4 chuan4
一提起 yi4 ti2 qi3
一尘不染 yi4 chen2 bu4 ran3
一生 yi4 sheng1
一派 yi2 pai4
不断 bu2 duan4
一次 yi2 ci4
不进步 bu2 jin4 bu4
娃娃 wa2 wa
万户侯 wan4 hu4 hou2
一方 yi4 fang1
一番话 yi4 fan1 hua4
一遍 yi2 bian4
不计较 bu2 ji4 jiao4
诇 xiong4
一边 yi4 bian1
一束 yi2 shu4
一听到 yi4 ting1 dao4
炸鸡 zha2 ji1
乍暧还寒 zha4 ai4 huan2 han2
我说诶 wo3 shuo1 ei1
棒诶 bang4 ei1
寒碜 han2 chen4
应采儿 ying4 cai3 er2
晕车 yun1 che1
必应 bi4 ying4
应援 ying4 yuan2
应力 ying4 li4

View File

@ -0,0 +1,73 @@
# This file is copied from
# https://github.com/UEhQZXI/vits_chinese/blob/master/text/symbols.py

# Non-phone symbols: silence, end-of-sentence, short pause, and prosodic
# break markers #0-#3.
_pause = ["sil", "eos", "sp", "#0", "#1", "#2", "#3"]

# Pinyin initials; "^" denotes a zero (empty) initial.
_initials = [
    "^",
    "b",
    "c",
    "ch",
    "d",
    "f",
    "g",
    "h",
    "j",
    "k",
    "l",
    "m",
    "n",
    "p",
    "q",
    "r",
    "s",
    "sh",
    "t",
    "x",
    "z",
    "zh",
]

# Tones 1-4 plus "5" for the neutral tone.
_tones = ["1", "2", "3", "4", "5"]

# Pinyin finals ("v" stands for ü; "ii"/"iii" are the apical vowels used
# after z/c/s and zh/ch/sh/r respectively).
_finals = [
    "a",
    "ai",
    "an",
    "ang",
    "ao",
    "e",
    "ei",
    "en",
    "eng",
    "er",
    "i",
    "ia",
    "ian",
    "iang",
    "iao",
    "ie",
    "ii",
    "iii",
    "in",
    "ing",
    "iong",
    "iou",
    "o",
    "ong",
    "ou",
    "u",
    "ua",
    "uai",
    "uan",
    "uang",
    "uei",
    "uen",
    "ueng",
    "uo",
    "v",
    "van",
    "ve",
    "vn",
]

# Full symbol inventory: pauses, bare initials, and every final+tone pair.
symbols = _pause + _initials + [i + j for i in _finals for j in _tones]

View File

@ -0,0 +1,137 @@
# This file is modified from
# https://github.com/UEhQZXI/vits_chinese/blob/master/vits_strings.py
import logging
from pathlib import Path
from typing import List
# Note pinyin_dict is from ./pinyin_dict.py
from pinyin_dict import pinyin_dict
from pypinyin import Style
from pypinyin.contrib.neutral_tone import NeutralToneWith5Mixin
from pypinyin.converter import DefaultConverter
from pypinyin.core import Pinyin, load_phrases_dict
class _MyConverter(NeutralToneWith5Mixin, DefaultConverter):
    # pypinyin converter that renders the neutral tone as tone "5"
    # (via NeutralToneWith5Mixin) on top of the default conversion.
    pass
class Tokenizer:
    """Convert Chinese text to phoneme tokens (and optionally token IDs).

    Phonemes follow the initials/finals+tone scheme of ``pinyin_dict``;
    ID conversion is only available when a tokens.txt path is given.
    """

    def __init__(self, tokens: str = ""):
        """
        Args:
          tokens:
            Path to tokens.txt mapping ``<token> <id>``.  When empty, only
            text-to-phoneme conversion is available (no ID lookup).
        """
        self._load_pinyin_dict()
        self._pinyin_parser = Pinyin(_MyConverter())

        if tokens != "":
            self._load_tokens(tokens)

    def texts_to_token_ids(self, texts: List[str], **kwargs) -> List[List[int]]:
        """
        Args:
          texts:
            A list of sentences.
          kwargs:
            Not used. It is for compatibility with other TTS recipes in icefall.
        Returns:
          One list of token IDs per input sentence.
        """
        tokens = []

        for text in texts:
            tokens.append(self.text_to_tokens(text))

        return self.tokens_to_token_ids(tokens)

    def tokens_to_token_ids(self, tokens: List[List[str]]) -> List[List[int]]:
        """Map token strings to IDs; out-of-vocabulary tokens are dropped
        with a warning.  Requires a tokens.txt to have been loaded."""
        ans = []

        for token_list in tokens:
            token_ids = []
            for t in token_list:
                if t not in self.token2id:
                    logging.warning(f"Skip OOV {t}")
                    continue
                token_ids.append(self.token2id[t])
            ans.append(token_ids)

        return ans

    def text_to_tokens(self, text: str) -> List[str]:
        """Convert a sentence to a phoneme list starting with "sil" and
        ending with "eos"."""
        # NOTE(review): the two literals compared against below render as
        # empty strings here — almost certainly Chinese punctuation (e.g.
        # "," -> ["sp", "sil"], "。" -> ["sil"]) lost in extraction; as
        # written the second branch is unreachable.  Confirm against the
        # upstream vits_chinese source.
        phonemes = ["sil"]
        # errors=...: pass non-pinyin characters through unchanged so the
        # punctuation branches below can see them.
        pinyins = self._pinyin_parser.pinyin(
            text,
            style=Style.TONE3,
            errors=lambda x: [[w] for w in x],
        )

        new_pinyin = []
        for p in pinyins:
            p = p[0]
            if p == "":
                new_pinyin.extend(["sp", "sil"])
            elif p == "":
                new_pinyin.append("sil")
            else:
                new_pinyin.append(p)

        sub_phonemes = self._get_phoneme4pinyin(new_pinyin)
        sub_phonemes.append("eos")
        phonemes.extend(sub_phonemes)
        return phonemes

    def _get_phoneme4pinyin(self, pinyins):
        # Split each "<syllable><tone>" string into (initial, final+tone)
        # using pinyin_dict; unknown syllables are silently dropped.
        result = []
        for pinyin in pinyins:
            if pinyin in ("sil", "sp"):
                result.append(pinyin)
            elif pinyin[:-1] in pinyin_dict:
                tone = pinyin[-1]
                a = pinyin[:-1]
                a1, a2 = pinyin_dict[a]
                # every word is appended with a #0
                result += [a1, a2 + tone, "#0"]
        return result

    def _load_pinyin_dict(self):
        # Register custom per-phrase pronunciations (pypinyin-local.dict,
        # located next to this file) with pypinyin.
        this_dir = Path(__file__).parent.resolve()
        my_dict = {}

        with open(f"{this_dir}/pypinyin-local.dict", "r", encoding="utf-8") as f:
            content = f.readlines()
            for line in content:
                cuts = line.strip().split()
                hanzi = cuts[0]
                pinyin = cuts[1:]
                # pypinyin expects each syllable wrapped in its own list.
                my_dict[hanzi] = [[p] for p in pinyin]

        load_phrases_dict(my_dict)

    def _load_tokens(self, filename):
        # Parse tokens.txt ("<token> <id>" per line; a line with a single
        # field is the space token) into self.token2id.
        token2id: dict = {}

        with open(filename, "r", encoding="utf-8") as f:
            for line in f.readlines():
                info = line.rstrip().split()

                if len(info) == 1:
                    # case of space
                    token = " "
                    idx = int(info[0])
                else:
                    token, idx = info[0], int(info[1])

                assert token not in token2id, token

                token2id[token] = idx

        self.token2id = token2id
        self.vocab_size = len(self.token2id)
        # "#0" doubles as the padding token.
        self.pad_id = self.token2id["#0"]
def main():
    """Smoke test: tokenize a sample sentence and print the phonemes."""
    tokenizer = Tokenizer()
    # Fix: the original called tokenizer._sentence_to_ids(), a method that
    # does not exist (AttributeError).  text_to_tokens() is the intended
    # API here; ID conversion is unavailable since no tokens.txt is loaded.
    print(tokenizer.text_to_tokens("你好,好的。"))


if __name__ == "__main__":
    main()

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/local/validate_manifest.py

141
egs/aishell3/TTS/prepare.sh Executable file
View File

@ -0,0 +1,141 @@
#!/usr/bin/env bash
# Data-preparation pipeline for the aishell3 TTS (VITS) recipe.
# Run individual stages with:  ./prepare.sh --stage N --stop-stage M

# fix segmentation fault reported in https://github.com/k2-fsa/icefall/issues/674
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

set -eou pipefail

stage=-1
stop_stage=100

# Directory the raw corpus is downloaded to.
dl_dir=$PWD/download

# Parses --stage/--stop-stage/--dl-dir style overrides.
. shared/parse_options.sh || exit 1

# All files generated by this script are saved in "data".
# You can safely remove "data" and rerun this script to regenerate it.
mkdir -p data

log() {
  # This function is from espnet
  local fname=${BASH_SOURCE[1]##*/}
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}

log "dl_dir: $dl_dir"

if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
  log "Stage 0: build monotonic_align lib"
  # Compiles the cython extension used by VITS's monotonic alignment search.
  if [ ! -d vits/monotonic_align/build ]; then
    cd vits/monotonic_align
    python3 setup.py build_ext --inplace
    cd ../../
  else
    log "monotonic_align lib already built"
  fi
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
  log "Stage 1: Download data"
  # The directory $dl_dir/aishell3 will contain the following files
  # and sub directories
  # ChangeLog  ReadMe.txt  phone_set.txt  spk-info.txt  test  train
  # If you have pre-downloaded it to /path/to/aishell3, you can create a symlink
  #
  #   ln -sfv /path/to/aishell3 $dl_dir/
  #   touch $dl_dir/aishell3/.completed
  #
  if [ ! -d $dl_dir/aishell3 ]; then
    lhotse download aishell3 $dl_dir
  fi
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
  log "Stage 2: Prepare aishell3 manifest (may take 13 minutes)"
  # We assume that you have downloaded the aishell3 corpus
  # to $dl_dir/aishell3.
  # You can find files like spk-info.txt inside $dl_dir/aishell3
  mkdir -p data/manifests
  # The .done marker file makes this stage idempotent on reruns.
  if [ ! -e data/manifests/.aishell3.done ]; then
    lhotse prepare aishell3 $dl_dir/aishell3 data/manifests >/dev/null 2>&1
    touch data/manifests/.aishell3.done
  fi
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
  log "Stage 3: Compute spectrogram for aishell3 (may take 5 minutes)"
  mkdir -p data/spectrogram
  if [ ! -e data/spectrogram/.aishell3.done ]; then
    ./local/compute_spectrogram_aishell3.py
    touch data/spectrogram/.aishell3.done
  fi

  # Sanity-check the generated cut manifests before training uses them.
  if [ ! -e data/spectrogram/.aishell3-validated.done ]; then
    log "Validating data/spectrogram for aishell3"
    python3 ./local/validate_manifest.py \
      data/spectrogram/aishell3_cuts_train.jsonl.gz
    python3 ./local/validate_manifest.py \
      data/spectrogram/aishell3_cuts_test.jsonl.gz
    touch data/spectrogram/.aishell3-validated.done
  fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
  log "Stage 4: Prepare tokens for aishell3 (may take 20 seconds)"
  # Adds a "tokens" field to every cut, then replaces the original manifests.
  if [ ! -e data/spectrogram/.aishell3_with_token.done ]; then
    ./local/prepare_tokens_aishell3.py
    mv -v data/spectrogram/aishell3_cuts_with_tokens_train.jsonl.gz \
      data/spectrogram/aishell3_cuts_train.jsonl.gz

    mv -v data/spectrogram/aishell3_cuts_with_tokens_test.jsonl.gz \
      data/spectrogram/aishell3_cuts_test.jsonl.gz

    touch data/spectrogram/.aishell3_with_token.done
  fi
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
  log "Stage 5: Split the aishell3 cuts into train, valid and test sets (may take 25 seconds)"
  # The last 1000 cuts of the original test set become the validation set;
  # the remaining test cuts are kept as the new (smaller) test set.
  if [ ! -e data/spectrogram/.aishell3_split.done ]; then
    lhotse subset --last 1000 \
      data/spectrogram/aishell3_cuts_test.jsonl.gz \
      data/spectrogram/aishell3_cuts_valid.jsonl.gz

    n=$(( $(gunzip -c data/spectrogram/aishell3_cuts_test.jsonl.gz | wc -l) - 1000 ))

    lhotse subset --first $n  \
      data/spectrogram/aishell3_cuts_test.jsonl.gz \
      data/spectrogram/aishell3_cuts_test2.jsonl.gz

    mv data/spectrogram/aishell3_cuts_test2.jsonl.gz data/spectrogram/aishell3_cuts_test.jsonl.gz

    touch data/spectrogram/.aishell3_split.done
  fi
fi

if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
  log "Stage 6: Generate tokens.txt and lexicon.txt "
  if [ ! -e data/tokens.txt ]; then
    ./local/prepare_token_file.py --tokens data/tokens.txt
  fi

  if [ ! -e data/lexicon.txt ]; then
    ./local/generate_lexicon.py --tokens data/tokens.txt --lexicon data/lexicon.txt
  fi
fi

if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
  log "Stage 7: Generate speakers file"
  # One unique speaker name per line; the line number later becomes the
  # integer speaker ID (see export-onnx.py / the data module).
  if [ ! -e data/speakers.txt ]; then
    gunzip -c data/manifests/aishell3_supervisions_train.jsonl.gz \
      | jq '.speaker' | sed 's/"//g' \
      | sort | uniq > data/speakers.txt
  fi
fi

1
egs/aishell3/TTS/shared Symbolic link
View File

@ -0,0 +1 @@
../../../icefall/shared

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/duration_predictor.py

View File

@ -0,0 +1,433 @@
#!/usr/bin/env python3
#
# Copyright 2023 Xiaomi Corporation (Author: Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This script exports a VITS model from PyTorch to ONNX.
Export the model to ONNX:
./vits/export-onnx.py \
--epoch 1000 \
--speakers ./data/speakers.txt \
--exp-dir vits/exp \
--tokens data/tokens.txt
It will generate one file inside vits/exp:
- vits-epoch-1000.onnx
See ./test_onnx.py for how to use the exported ONNX models.
"""
import argparse
import logging
from pathlib import Path
from typing import Dict, Tuple
import onnx
import torch
import torch.nn as nn
from tokenizer import Tokenizer
from train import get_model, get_params
from icefall.checkpoint import load_checkpoint
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the ONNX export script."""
    p = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    p.add_argument(
        "--epoch",
        type=int,
        default=1000,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        """,
    )

    p.add_argument(
        "--exp-dir",
        type=str,
        default="vits/exp",
        help="The experiment dir",
    )

    p.add_argument(
        "--tokens",
        type=str,
        default="data/tokens.txt",
        help="""Path to vocabulary.""",
    )

    p.add_argument(
        "--speakers",
        type=Path,
        default=Path("data/speakers.txt"),
        help="Path to speakers.txt file.",
    )

    p.add_argument(
        "--model-type",
        type=str,
        default="low",
        choices=["low", "medium", "high"],
        help="""If not empty, valid values are: low, medium, high.
        It controls the model size. low -> runs faster.
        """,
    )

    return p
def add_meta_data(filename: str, meta_data: Dict[str, str]):
    """Attach key/value metadata entries to an ONNX model file, in place.

    Args:
      filename:
        Path of the ONNX model to modify.
      meta_data:
        Key-value pairs; values are stringified before saving.
    """
    m = onnx.load(filename)
    for k, v in meta_data.items():
        prop = m.metadata_props.add()
        prop.key = k
        prop.value = str(v)
    onnx.save(m, filename)
class OnnxModel(nn.Module):
    """Wraps a VITS model so ONNX export sees a flat ``forward`` that
    returns only the generated waveform."""

    def __init__(self, model: nn.Module):
        """
        Args:
          model:
            A VITS model whose ``generator`` attribute performs inference.
        """
        super().__init__()
        self.model = model

    def forward(
        self,
        tokens: torch.Tensor,
        tokens_lens: torch.Tensor,
        noise_scale: float = 0.667,
        alpha: float = 1.0,
        noise_scale_dur: float = 0.8,
        speaker: int = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run generator inference; see the help of VITS.inference_batch.

        Args:
          tokens:
            Input text token indexes of shape (1, T_text).
          tokens_lens:
            Number of tokens, of shape (1,).
          noise_scale (float):
            Noise scale parameter for flow.
          alpha (float):
            Alpha parameter to control the speed of generated speech.
          noise_scale_dur (float):
            Noise scale parameter for duration predictor.
          speaker (int):
            Speaker ID.
        Returns:
          The generated waveform tensor of shape (B, T_wav); the other
          outputs of ``inference`` are discarded.
        """
        wav, _, _ = self.model.generator.inference(
            text=tokens,
            text_lengths=tokens_lens,
            noise_scale=noise_scale,
            noise_scale_dur=noise_scale_dur,
            sids=speaker,
            alpha=alpha,
        )
        return wav
def export_model_onnx(
    model: nn.Module,
    model_filename: str,
    vocab_size: int,
    opset_version: int = 11,
) -> None:
    """Export the wrapped VITS generator to ONNX and attach metadata.

    The exported graph takes tokens (1, T_text) int64, tokens_lens (1,),
    noise_scale, alpha, noise_scale_dur and speaker, and produces
    audio (1, T') float32.

    Args:
      model:
        The VITS generator (wrapped in ``OnnxModel``).
      model_filename:
        The filename to save the exported ONNX model.
      vocab_size:
        Number of tokens used in training.
      opset_version:
        The opset version to use.
    """
    # Dummy inputs, used only to trace the graph; shapes are dynamic below.
    dummy_tokens = torch.randint(low=0, high=vocab_size, size=(1, 13), dtype=torch.int64)
    dummy_tokens_lens = torch.tensor([dummy_tokens.shape[1]], dtype=torch.int64)
    dummy_noise_scale = torch.tensor([1], dtype=torch.float32)
    dummy_noise_scale_dur = torch.tensor([1], dtype=torch.float32)
    dummy_alpha = torch.tensor([1], dtype=torch.float32)
    dummy_speaker = torch.tensor([1], dtype=torch.int64)

    torch.onnx.export(
        model,
        (
            dummy_tokens,
            dummy_tokens_lens,
            dummy_noise_scale,
            dummy_alpha,
            dummy_noise_scale_dur,
            dummy_speaker,
        ),
        model_filename,
        verbose=False,
        opset_version=opset_version,
        input_names=[
            "tokens",
            "tokens_lens",
            "noise_scale",
            "alpha",
            "noise_scale_dur",
            "speaker",
        ],
        output_names=["audio"],
        dynamic_axes={
            "tokens": {0: "N", 1: "T"},
            "tokens_lens": {0: "N"},
            "audio": {0: "N", 1: "T"},
            "speaker": {0: "N"},
        },
    )

    # Single-speaker models leave ``spks`` unset.
    num_speakers = 1 if model.model.spks is None else model.model.spks

    meta_data = {
        "model_type": "vits",
        "version": "1",
        "model_author": "k2-fsa",
        "comment": "icefall",  # must be icefall for models from icefall
        "language": "Chinese",
        "n_speakers": num_speakers,
        "sample_rate": model.model.sampling_rate,  # Must match the real sample rate
    }
    logging.info(f"meta_data: {meta_data}")

    add_meta_data(filename=model_filename, meta_data=meta_data)
@torch.no_grad()
def main():
    """Load a trained VITS checkpoint and export its generator to ONNX."""
    args = get_parser().parse_args()
    args.exp_dir = Path(args.exp_dir)
    params = get_params()
    params.update(vars(args))

    tokenizer = Tokenizer(params.tokens)
    params.blank_id = tokenizer.pad_id
    params.vocab_size = tokenizer.vocab_size

    # One speaker name per line; the (0-based) line number is the speaker ID.
    with open(args.speakers) as f:
        speaker_map = {line.strip(): i for i, line in enumerate(f)}
    params.num_spks = len(speaker_map)

    logging.info(params)

    logging.info("About to create model")
    model = get_model(params)

    load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
    model.to("cpu")
    model.eval()

    model = OnnxModel(model=model)

    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"generator parameters: {num_param}, or {num_param/1000/1000} M")

    suffix = f"epoch-{params.epoch}"

    opset_version = 13

    # Fix: this message previously said "Exporting encoder" (left over from
    # an ASR export script); this script exports the VITS generator.
    logging.info("Exporting generator")
    model_filename = params.exp_dir / f"vits-{suffix}.onnx"
    export_model_onnx(
        model,
        model_filename,
        params.vocab_size,
        opset_version=opset_version,
    )
    logging.info(f"Exported generator to {model_filename}")


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
"""
Supported languages.
LJSpeech is using "en-us" from the second column.
Pty Language Age/Gender VoiceName File Other Languages
5 af --/M Afrikaans gmw/af
5 am --/M Amharic sem/am
5 an --/M Aragonese roa/an
5 ar --/M Arabic sem/ar
5 as --/M Assamese inc/as
5 az --/M Azerbaijani trk/az
5 ba --/M Bashkir trk/ba
5 be --/M Belarusian zle/be
5 bg --/M Bulgarian zls/bg
5 bn --/M Bengali inc/bn
5 bpy --/M Bishnupriya_Manipuri inc/bpy
5 bs --/M Bosnian zls/bs
5 ca --/M Catalan roa/ca
5 chr-US-Qaaa-x-west --/M Cherokee_ iro/chr
5 cmn --/M Chinese_(Mandarin,_latin_as_English) sit/cmn (zh-cmn 5)(zh 5)
5 cmn-latn-pinyin --/M Chinese_(Mandarin,_latin_as_Pinyin) sit/cmn-Latn-pinyin (zh-cmn 5)(zh 5)
5 cs --/M Czech zlw/cs
5 cv --/M Chuvash trk/cv
5 cy --/M Welsh cel/cy
5 da --/M Danish gmq/da
5 de --/M German gmw/de
5 el --/M Greek grk/el
5 en-029 --/M English_(Caribbean) gmw/en-029 (en 10)
2 en-gb --/M English_(Great_Britain) gmw/en (en 2)
5 en-gb-scotland --/M English_(Scotland) gmw/en-GB-scotland (en 4)
5 en-gb-x-gbclan --/M English_(Lancaster) gmw/en-GB-x-gbclan (en-gb 3)(en 5)
5 en-gb-x-gbcwmd --/M English_(West_Midlands) gmw/en-GB-x-gbcwmd (en-gb 9)(en 9)
5 en-gb-x-rp --/M English_(Received_Pronunciation) gmw/en-GB-x-rp (en-gb 4)(en 5)
2 en-us --/M English_(America) gmw/en-US (en 3)
5 en-us-nyc --/M English_(America,_New_York_City) gmw/en-US-nyc
5 eo --/M Esperanto art/eo
5 es --/M Spanish_(Spain) roa/es
5 es-419 --/M Spanish_(Latin_America) roa/es-419 (es-mx 6)
5 et --/M Estonian urj/et
5 eu --/M Basque eu
5 fa --/M Persian ira/fa
5 fa-latn --/M Persian_(Pinglish) ira/fa-Latn
5 fi --/M Finnish urj/fi
5 fr-be --/M French_(Belgium) roa/fr-BE (fr 8)
5 fr-ch --/M French_(Switzerland) roa/fr-CH (fr 8)
5 fr-fr --/M French_(France) roa/fr (fr 5)
5 ga --/M Gaelic_(Irish) cel/ga
5 gd --/M Gaelic_(Scottish) cel/gd
5 gn --/M Guarani sai/gn
5 grc --/M Greek_(Ancient) grk/grc
5 gu --/M Gujarati inc/gu
5 hak --/M Hakka_Chinese sit/hak
5 haw --/M Hawaiian map/haw
5 he --/M Hebrew sem/he
5 hi --/M Hindi inc/hi
5 hr --/M Croatian zls/hr (hbs 5)
5 ht --/M Haitian_Creole roa/ht
5 hu --/M Hungarian urj/hu
5 hy --/M Armenian_(East_Armenia) ine/hy (hy-arevela 5)
5 hyw --/M Armenian_(West_Armenia) ine/hyw (hy-arevmda 5)(hy 8)
5 ia --/M Interlingua art/ia
5 id --/M Indonesian poz/id
5 io --/M Ido art/io
5 is --/M Icelandic gmq/is
5 it --/M Italian roa/it
5 ja --/M Japanese jpx/ja
5 jbo --/M Lojban art/jbo
5 ka --/M Georgian ccs/ka
5 kk --/M Kazakh trk/kk
5 kl --/M Greenlandic esx/kl
5 kn --/M Kannada dra/kn
5 ko --/M Korean ko
5 kok --/M Konkani inc/kok
5 ku --/M Kurdish ira/ku
5 ky --/M Kyrgyz trk/ky
5 la --/M Latin itc/la
5 lb --/M Luxembourgish gmw/lb
5 lfn --/M Lingua_Franca_Nova art/lfn
5 lt --/M Lithuanian bat/lt
5 ltg --/M Latgalian bat/ltg
5 lv --/M Latvian bat/lv
5 mi --/M Māori poz/mi
5 mk --/M Macedonian zls/mk
5 ml --/M Malayalam dra/ml
5 mr --/M Marathi inc/mr
5 ms --/M Malay poz/ms
5 mt --/M Maltese sem/mt
5 mto --/M Totontepec_Mixe miz/mto
5 my --/M Myanmar_(Burmese) sit/my
5 nb --/M Norwegian_Bokmål gmq/nb (no 5)
5 nci --/M Nahuatl_(Classical) azc/nci
5 ne --/M Nepali inc/ne
5 nl --/M Dutch gmw/nl
5 nog --/M Nogai trk/nog
5 om --/M Oromo cus/om
5 or --/M Oriya inc/or
5 pa --/M Punjabi inc/pa
5 pap --/M Papiamento roa/pap
5 piqd --/M Klingon art/piqd
5 pl --/M Polish zlw/pl
5 pt --/M Portuguese_(Portugal) roa/pt (pt-pt 5)
5 pt-br --/M Portuguese_(Brazil) roa/pt-BR (pt 6)
5 py --/M Pyash art/py
5 qdb --/M Lang_Belta art/qdb
5 qu --/M Quechua qu
5 quc --/M K'iche' myn/quc
5 qya --/M Quenya art/qya
5 ro --/M Romanian roa/ro
5 ru --/M Russian zle/ru
5 ru-cl --/M Russian_(Classic) zle/ru-cl
2 ru-lv --/M Russian_(Latvia) zle/ru-LV
5 sd --/M Sindhi inc/sd
5 shn --/M Shan_(Tai_Yai) tai/shn
5 si --/M Sinhala inc/si
5 sjn --/M Sindarin art/sjn
5 sk --/M Slovak zlw/sk
5 sl --/M Slovenian zls/sl
5 smj --/M Lule_Saami urj/smj
5 sq --/M Albanian ine/sq
5 sr --/M Serbian zls/sr
5 sv --/M Swedish gmq/sv
5 sw --/M Swahili bnt/sw
5 ta --/M Tamil dra/ta
5 te --/M Telugu dra/te
5 th --/M Thai tai/th
5 tk --/M Turkmen trk/tk
5 tn --/M Setswana bnt/tn
5 tr --/M Turkish trk/tr
5 tt --/M Tatar trk/tt
5 ug --/M Uyghur trk/ug
5 uk --/M Ukrainian zle/uk
5 ur --/M Urdu inc/ur
5 uz --/M Uzbek trk/uz
5 vi --/M Vietnamese_(Northern) aav/vi
5 vi-vn-x-central --/M Vietnamese_(Central) aav/vi-VN-x-central
5 vi-vn-x-south --/M Vietnamese_(Southern) aav/vi-VN-x-south
5 yue --/M Chinese_(Cantonese) sit/yue (zh-yue 5)(zh 8)
5 yue --/M Chinese_(Cantonese,_latin_as_Jyutping) sit/yue-Latn-jyutping (zh-yue 5)(zh 8)
"""

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/flow.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/generator.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/hifigan.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/loss.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/monotonic_align/

View File

@ -0,0 +1 @@
../local/pinyin_dict.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/posterior_encoder.py

View File

@ -0,0 +1 @@
../local/pypinyin-local.dict

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/residual_coupling.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/text_encoder.py

View File

@ -0,0 +1 @@
../local/tokenizer.py

1007
egs/aishell3/TTS/vits/train.py Executable file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/transform.py

View File

@ -0,0 +1,349 @@
# Copyright 2021 Piotr Żelasko
# Copyright 2022-2023 Xiaomi Corporation (Authors: Mingshuang Luo,
# Zengwei Yao)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, Optional
import torch
from lhotse import CutSet, Spectrogram, SpectrogramConfig, load_manifest_lazy
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
CutConcatenate,
CutMix,
DynamicBucketingSampler,
PrecomputedFeatures,
SimpleCutSampler,
SpecAugment,
SpeechSynthesisDataset,
)
from lhotse.dataset.input_strategies import ( # noqa F401 For AudioSamples
AudioSamples,
OnTheFlyFeatures,
)
from lhotse.utils import fix_random_seed
from torch.utils.data import DataLoader
from icefall.utils import str2bool
class _SeedWorkers:
    """Picklable callable that gives each dataloader worker its own
    deterministic seed."""

    def __init__(self, seed: int):
        self.seed = seed

    def __call__(self, worker_id: int):
        # Offset the base seed by the worker id so every worker is seeded
        # differently but reproducibly.
        fix_random_seed(worker_id + self.seed)
class Aishell3SpeechTtsDataModule:
    """
    DataModule for tts experiments.
    It assumes there is always one train and valid dataloader,
    but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
    and test-other).

    It contains all the common data pipeline modules used in ASR
    experiments, e.g.:
    - dynamic batch size,
    - bucketing samplers,
    - cut concatenation,
    - on-the-fly feature extraction

    This class should be derived for specific corpora used in TTS tasks.
    """

    def __init__(self, args: argparse.Namespace):
        self.args = args
        # Sample rate used only for on-the-fly spectrogram extraction.
        # NOTE(review): assumes the prepared aishell3 audio is 8000 Hz —
        # confirm against the feature-preparation script.
        self.sampling_rate = 8000

    @classmethod
    def add_arguments(cls, parser: argparse.ArgumentParser):
        """Register all dataloader-related CLI options on ``parser``."""
        group = parser.add_argument_group(
            title="TTS data related options",
            description="These options are used for the preparation of "
            "PyTorch DataLoaders from Lhotse CutSet's -- they control the "
            "effective batch sizes, sampling strategies, applied data "
            "augmentations, etc.",
        )

        group.add_argument(
            "--manifest-dir",
            type=Path,
            default=Path("data/spectrogram"),
            help="Path to directory with train/valid/test cuts.",
        )

        group.add_argument(
            "--speakers",
            type=Path,
            default=Path("data/speakers.txt"),
            help="Path to speakers.txt file.",
        )

        group.add_argument(
            "--max-duration",
            type=int,
            default=200.0,
            help="Maximum pooled recordings duration (seconds) in a "
            "single batch. You can reduce it if it causes CUDA OOM.",
        )

        group.add_argument(
            "--bucketing-sampler",
            type=str2bool,
            default=True,
            help="When enabled, the batches will come from buckets of "
            "similar duration (saves padding frames).",
        )

        group.add_argument(
            "--num-buckets",
            type=int,
            default=30,
            help="The number of buckets for the DynamicBucketingSampler"
            "(you might want to increase it for larger datasets).",
        )

        group.add_argument(
            "--on-the-fly-feats",
            type=str2bool,
            default=False,
            help="When enabled, use on-the-fly cut mixing and feature "
            "extraction. Will drop existing precomputed feature manifests "
            "if available.",
        )

        group.add_argument(
            "--shuffle",
            type=str2bool,
            default=True,
            help="When enabled (=default), the examples will be "
            "shuffled for each epoch.",
        )

        group.add_argument(
            "--drop-last",
            type=str2bool,
            default=True,
            help="Whether to drop last batch. Used by sampler.",
        )

        group.add_argument(
            "--return-cuts",
            type=str2bool,
            default=False,
            help="When enabled, each batch will have the "
            "field: batch['cut'] with the cuts that "
            "were used to construct it.",
        )

        group.add_argument(
            "--num-workers",
            type=int,
            default=2,
            help="The number of training dataloader workers that "
            "collect the batches.",
        )

        group.add_argument(
            "--input-strategy",
            type=str,
            default="PrecomputedFeatures",
            help="AudioSamples or PrecomputedFeatures",
        )

    def train_dataloaders(
        self,
        cuts_train: CutSet,
        sampler_state_dict: Optional[Dict[str, Any]] = None,
    ) -> DataLoader:
        """Build the training DataLoader.

        Args:
          cuts_train:
            CutSet for training.
          sampler_state_dict:
            The state dict for the training sampler.
        """
        logging.info("About to create train dataset")
        # NOTE(review): `eval` on a CLI-supplied class name; only
        # "AudioSamples" or "PrecomputedFeatures" are expected here.
        train = SpeechSynthesisDataset(
            return_text=False,
            return_tokens=True,
            return_spk_ids=True,
            feature_input_strategy=eval(self.args.input_strategy)(),
            return_cuts=self.args.return_cuts,
        )

        if self.args.on_the_fly_feats:
            # Replace the dataset built above with one that extracts
            # spectrograms on the fly from the raw audio.
            sampling_rate = self.sampling_rate
            config = SpectrogramConfig(
                sampling_rate=sampling_rate,
                frame_length=1024 / sampling_rate,  # (in second),
                frame_shift=256 / sampling_rate,  # (in second)
                use_fft_mag=True,
            )
            train = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=OnTheFlyFeatures(Spectrogram(config)),
                return_cuts=self.args.return_cuts,
            )

        if self.args.bucketing_sampler:
            logging.info("Using DynamicBucketingSampler.")
            train_sampler = DynamicBucketingSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
                num_buckets=self.args.num_buckets,
                buffer_size=self.args.num_buckets * 2000,
                shuffle_buffer_size=self.args.num_buckets * 5000,
                drop_last=self.args.drop_last,
            )
        else:
            logging.info("Using SimpleCutSampler.")
            train_sampler = SimpleCutSampler(
                cuts_train,
                max_duration=self.args.max_duration,
                shuffle=self.args.shuffle,
            )
        logging.info("About to create train dataloader")

        if sampler_state_dict is not None:
            # Resume sampling exactly where a previous run left off.
            logging.info("Loading sampler state dict")
            train_sampler.load_state_dict(sampler_state_dict)

        # 'seed' is derived from the current random state, which will have
        # previously been set in the main process.
        seed = torch.randint(0, 100000, ()).item()
        worker_init_fn = _SeedWorkers(seed)

        train_dl = DataLoader(
            train,
            sampler=train_sampler,
            batch_size=None,
            num_workers=self.args.num_workers,
            persistent_workers=False,
            worker_init_fn=worker_init_fn,
        )

        return train_dl

    def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
        """Build the validation DataLoader (no shuffling)."""
        logging.info("About to create dev dataset")
        if self.args.on_the_fly_feats:
            sampling_rate = self.sampling_rate
            config = SpectrogramConfig(
                sampling_rate=sampling_rate,
                frame_length=1024 / sampling_rate,  # (in second),
                frame_shift=256 / sampling_rate,  # (in second)
                use_fft_mag=True,
            )
            validate = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=OnTheFlyFeatures(Spectrogram(config)),
                return_cuts=self.args.return_cuts,
            )
        else:
            validate = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=eval(self.args.input_strategy)(),
                return_cuts=self.args.return_cuts,
            )
        valid_sampler = DynamicBucketingSampler(
            cuts_valid,
            max_duration=self.args.max_duration,
            num_buckets=self.args.num_buckets,
            shuffle=False,
        )
        logging.info("About to create valid dataloader")
        valid_dl = DataLoader(
            validate,
            sampler=valid_sampler,
            batch_size=None,
            num_workers=2,
            persistent_workers=False,
        )

        return valid_dl

    def test_dataloaders(self, cuts: CutSet) -> DataLoader:
        """Build a test DataLoader for the given cuts (no shuffling)."""
        logging.info("About to create test dataset")
        if self.args.on_the_fly_feats:
            sampling_rate = self.sampling_rate
            config = SpectrogramConfig(
                sampling_rate=sampling_rate,
                frame_length=1024 / sampling_rate,  # (in second),
                frame_shift=256 / sampling_rate,  # (in second)
                use_fft_mag=True,
            )
            test = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=OnTheFlyFeatures(Spectrogram(config)),
                return_cuts=self.args.return_cuts,
            )
        else:
            test = SpeechSynthesisDataset(
                return_text=False,
                return_tokens=True,
                return_spk_ids=True,
                feature_input_strategy=eval(self.args.input_strategy)(),
                return_cuts=self.args.return_cuts,
            )
        test_sampler = DynamicBucketingSampler(
            cuts,
            max_duration=self.args.max_duration,
            num_buckets=self.args.num_buckets,
            shuffle=False,
        )
        logging.info("About to create test dataloader")
        test_dl = DataLoader(
            test,
            batch_size=None,
            sampler=test_sampler,
            num_workers=self.args.num_workers,
        )
        return test_dl

    @lru_cache()
    def train_cuts(self) -> CutSet:
        """Lazily load (and cache) the training cut manifest."""
        logging.info("About to get train cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aishell3_cuts_train.jsonl.gz"
        )

    @lru_cache()
    def valid_cuts(self) -> CutSet:
        """Lazily load (and cache) the validation cut manifest."""
        logging.info("About to get validation cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aishell3_cuts_valid.jsonl.gz"
        )

    @lru_cache()
    def test_cuts(self) -> CutSet:
        """Lazily load (and cache) the test cut manifest."""
        logging.info("About to get test cuts")
        return load_manifest_lazy(
            self.args.manifest_dir / "aishell3_cuts_test.jsonl.gz"
        )

    @lru_cache()
    def speakers(self) -> Dict[str, int]:
        """Map each speaker name in speakers.txt to its (0-based) line
        number, which is used as the integer speaker ID."""
        logging.info("About to get speakers")
        with open(self.args.speakers) as f:
            speakers = {line.strip(): i for i, line in enumerate(f)}
        return speakers

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/utils.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/vits.py

View File

@ -0,0 +1 @@
../../../ljspeech/TTS/vits/wavenet.py

View File

@ -1,7 +1,10 @@
# https://github.com/espnet/espnet/blob/master/espnet2/gan_tts/vits/monotonic_align/setup.py
"""Setup cython code."""
from Cython.Build import cythonize
try:
from Cython.Build import cythonize
except ModuleNotFoundError as ex:
raise RuntimeError(f'{ex}\nPlease run:\n pip install cython')
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext as _build_ext

View File

@ -44,11 +44,11 @@ class Tokenizer(object):
if len(info) == 1:
# case of space
token = " "
id = int(info[0])
idx = int(info[0])
else:
token, id = info[0], int(info[1])
token, idx = info[0], int(info[1])
assert token not in self.token2id, token
self.token2id[token] = id
self.token2id[token] = idx
# Refer to https://github.com/rhasspy/piper/blob/master/TRAINING.md
self.pad_id = self.token2id["_"] # padding

View File

@ -66,7 +66,7 @@ class LJSpeechTtsDataModule:
- cut concatenation,
- on-the-fly feature extraction
This class should be derived for specific corpora used in ASR tasks.
This class should be derived for specific corpora used in TTS tasks.
"""
def __init__(self, args: argparse.Namespace):