Merge d8a0a4095554e58db0fc8f4e30a6a33932ab37dd into 34fc1fdf0d8ff520e2bb18267d046ca207c78ef9
commit cfbd4208cc
egs/libritts/TTS/vocos/discriminators.py (Normal file, 296 lines added)
@@ -0,0 +1,296 @@
from typing import List, Optional, Tuple

import torch
from torch import nn
from torch.nn import Conv2d
from torch.nn.utils.parametrizations import weight_norm
from torchaudio.transforms import Spectrogram


class MultiPeriodDiscriminator(nn.Module):
    """
    Multi-Period Discriminator module adapted from https://github.com/jik876/hifi-gan.
    Additionally, it allows incorporating conditional information with a learned embeddings table.

    Args:
        periods (tuple[int]): Tuple of periods for each discriminator.
        num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
            Defaults to None.
    """

    def __init__(
        self,
        periods: Tuple[int, ...] = (2, 3, 5, 7, 11),
        num_embeddings: Optional[int] = None,
    ):
        super().__init__()
        self.discriminators = nn.ModuleList(
            [DiscriminatorP(period=p, num_embeddings=num_embeddings) for p in periods]
        )

    def forward(
        self,
        y: torch.Tensor,
        y_hat: torch.Tensor,
        bandwidth_id: Optional[torch.Tensor] = None,
    ) -> Tuple[
        List[torch.Tensor],
        List[torch.Tensor],
        List[List[torch.Tensor]],
        List[List[torch.Tensor]],
    ]:
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for d in self.discriminators:
            y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
            y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorP(nn.Module):
    def __init__(
        self,
        period: int,
        in_channels: int = 1,
        kernel_size: int = 5,
        stride: int = 3,
        lrelu_slope: float = 0.1,
        num_embeddings: Optional[int] = None,
    ):
        super().__init__()
        self.period = period
        self.convs = nn.ModuleList(
            [
                weight_norm(
                    Conv2d(
                        in_channels,
                        32,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(kernel_size // 2, 0),
                    )
                ),
                weight_norm(
                    Conv2d(
                        32,
                        128,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(kernel_size // 2, 0),
                    )
                ),
                weight_norm(
                    Conv2d(
                        128,
                        512,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(kernel_size // 2, 0),
                    )
                ),
                weight_norm(
                    Conv2d(
                        512,
                        1024,
                        (kernel_size, 1),
                        (stride, 1),
                        padding=(kernel_size // 2, 0),
                    )
                ),
                weight_norm(
                    Conv2d(
                        1024,
                        1024,
                        (kernel_size, 1),
                        (1, 1),
                        padding=(kernel_size // 2, 0),
                    )
                ),
            ]
        )
        if num_embeddings is not None:
            self.emb = torch.nn.Embedding(
                num_embeddings=num_embeddings, embedding_dim=1024
            )
            torch.nn.init.zeros_(self.emb.weight)

        self.conv_post = weight_norm(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
        self.lrelu_slope = lrelu_slope

    def forward(
        self, x: torch.Tensor, cond_embedding_id: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        x = x.unsqueeze(1)
        fmap = []
        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for i, l in enumerate(self.convs):
            x = l(x)
            x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
            if i > 0:
                fmap.append(x)
        if cond_embedding_id is not None:
            emb = self.emb(cond_embedding_id)
            h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
        else:
            h = 0
        x = self.conv_post(x)
        fmap.append(x)
        x += h
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiResolutionDiscriminator(nn.Module):
    def __init__(
        self,
        fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
        num_embeddings: Optional[int] = None,
    ):
        """
        Multi-Resolution Discriminator module adapted from https://github.com/descriptinc/descript-audio-codec.
        Additionally, it allows incorporating conditional information with a learned embeddings table.

        Args:
            fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
            num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
                Defaults to None.
        """

        super().__init__()
        self.discriminators = nn.ModuleList(
            [
                DiscriminatorR(window_length=w, num_embeddings=num_embeddings)
                for w in fft_sizes
            ]
        )

    def forward(
        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
    ) -> Tuple[
        List[torch.Tensor],
        List[torch.Tensor],
        List[List[torch.Tensor]],
        List[List[torch.Tensor]],
    ]:
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []

        for d in self.discriminators:
            y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
            y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorR(nn.Module):
    def __init__(
        self,
        window_length: int,
        num_embeddings: Optional[int] = None,
        channels: int = 32,
        hop_factor: float = 0.25,
        bands: Tuple[Tuple[float, float], ...] = (
            (0.0, 0.1),
            (0.1, 0.25),
            (0.25, 0.5),
            (0.5, 0.75),
            (0.75, 1.0),
        ),
    ):
        super().__init__()
        self.window_length = window_length
        self.hop_factor = hop_factor
        self.spec_fn = Spectrogram(
            n_fft=window_length,
            hop_length=int(window_length * hop_factor),
            win_length=window_length,
            power=None,
        )
        n_fft = window_length // 2 + 1
        bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
        self.bands = bands
        convs = lambda: nn.ModuleList(
            [
                weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
                weight_norm(
                    nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
                ),
                weight_norm(
                    nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
                ),
                weight_norm(
                    nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
                ),
                weight_norm(
                    nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))
                ),
            ]
        )
        self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])

        if num_embeddings is not None:
            self.emb = torch.nn.Embedding(
                num_embeddings=num_embeddings, embedding_dim=channels
            )
            torch.nn.init.zeros_(self.emb.weight)

        self.conv_post = weight_norm(
            nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1))
        )

    def spectrogram(self, x):
        # Remove DC offset
        x = x - x.mean(dim=-1, keepdims=True)
        # Peak normalize the volume of input audio
        x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
        x = self.spec_fn(x)
        x = torch.view_as_real(x)
        # x = rearrange(x, "b f t c -> b c t f")
        x = x.permute(0, 3, 2, 1)
        # Split into bands
        x_bands = [x[..., b[0] : b[1]] for b in self.bands]
        return x_bands

    def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
        x_bands = self.spectrogram(x)
        fmap = []
        x = []
        for band, stack in zip(x_bands, self.band_convs):
            for i, layer in enumerate(stack):
                band = layer(band)
                band = torch.nn.functional.leaky_relu(band, 0.1)
                if i > 0:
                    fmap.append(band)
            x.append(band)
        x = torch.cat(x, dim=-1)
        if cond_embedding_id is not None:
            emb = self.emb(cond_embedding_id)
            h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
        else:
            h = 0
        x = self.conv_post(x)
        fmap.append(x)
        x += h

        return x, fmap
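A minimal sketch of how these discriminators might be driven during adversarial training. The batch size, waveform length, and the absence of bandwidth conditioning below are illustrative assumptions, not part of this recipe:

import torch

from discriminators import MultiPeriodDiscriminator, MultiResolutionDiscriminator

mpd = MultiPeriodDiscriminator()
mrd = MultiResolutionDiscriminator()

# Hypothetical real and generated waveforms of shape (batch, num_samples).
y = torch.randn(2, 24000)
y_hat = torch.randn(2, 24000)

# Each call returns per-discriminator logits for the real and the generated
# inputs, plus the intermediate feature maps typically used for
# feature-matching losses.
real_logits, fake_logits, real_fmaps, fake_fmaps = mpd(y=y, y_hat=y_hat)
res_real, res_fake, res_real_fmaps, res_fake_fmaps = mrd(y=y, y_hat=y_hat)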
egs/libritts/TTS/vocos/export-onnx.py (Executable file, 371 lines added)
@@ -0,0 +1,371 @@
#!/usr/bin/env python3
#
# Copyright 2023 Xiaomi Corporation (Author: Fangjun Kuang, Wei Kang)
# Copyright 2023 Danqing Fu (danqing.fu@gmail.com)

"""
This script exports the trained Vocos vocoder (the `generator` sub-module of
the model built by ./vocos/train.py) from PyTorch to ONNX.

Example usage (all flags below are defined in this script; adjust the values
to match your own experiment):

./vocos/export-onnx.py \
  --exp-dir vocos/exp \
  --epoch 28 \
  --avg 15 \
  --fp16 True

It generates the following files inside --exp-dir:

  - vocos-epoch-28-avg-15.onnx
  - vocos-epoch-28-avg-15.fp16.onnx (only when --fp16 is True)
  - vocos-epoch-28-avg-15.int8.onnx

The exported model takes a tensor of mel features named `features` with shape
(N, feature_dim, T) and produces a tensor of waveforms named `audio` with
shape (N, T').
"""

import argparse
import logging
from pathlib import Path
from typing import Dict, Tuple

import onnx
import torch
import torch.nn as nn
from onnxconverter_common import float16
from onnxruntime.quantization import QuantType, quantize_dynamic
from train import add_model_arguments, get_model, get_params

from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
)
from icefall.utils import make_pad_mask, num_tokens, str2bool


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--sampling-rate",
        type=int,
        default=24000,
        help="The sampling rate of the libritts dataset",
    )

    parser.add_argument(
        "--frame-shift",
        type=int,
        default=256,
        help="Frame shift.",
    )

    parser.add_argument(
        "--frame-length",
        type=int,
        default=1024,
        help="Frame length.",
    )

    parser.add_argument(
        "--epoch",
        type=int,
        default=28,
        help="""It specifies the checkpoint to use for averaging.
        Note: Epoch counts from 0.
        You can specify --avg to use more checkpoints for model averaging.""",
    )

    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )

    parser.add_argument(
        "--avg",
        type=int,
        default=15,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch' and '--iter'",
    )

    parser.add_argument(
        "--use-averaged-model",
        type=str2bool,
        default=True,
        help="Whether to load averaged model. Currently it only supports "
        "using --epoch. If True, it would decode with the averaged model "
        "over the epoch range from `epoch-avg` (excluded) to `epoch`. "
        "Actually only the models with epoch number of `epoch-avg` and "
        "`epoch` are loaded for averaging. ",
    )

    parser.add_argument(
        "--exp-dir",
        type=str,
        default="zipformer/exp",
        help="""It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )

    parser.add_argument(
        "--fp16",
        type=str2bool,
        default=False,
        help="Whether to export models in fp16",
    )

    add_model_arguments(parser)

    return parser


def add_meta_data(filename: str, meta_data: Dict[str, str]):
    """Add meta data to an ONNX model. It is changed in-place.

    Args:
      filename:
        Filename of the ONNX model to be changed.
      meta_data:
        Key-value pairs.
    """
    model = onnx.load(filename)
    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        meta.value = value

    onnx.save(model, filename)


def export_model_onnx(
    model: nn.Module,
    model_filename: str,
    opset_version: int = 13,
) -> None:
    """Export the vocos generator to ONNX format.
    The exported model has one input:

      - features: a float32 tensor of shape (N, feature_dim, T)

    and produces one output:

      - audio: a float32 tensor of shape (N, T')
    """
    input_tensor = torch.rand((2, 80, 100), dtype=torch.float32)

    torch.onnx.export(
        model,
        (input_tensor,),
        model_filename,
        verbose=False,
        opset_version=opset_version,
        input_names=[
            "features",
        ],
        output_names=["audio"],
        dynamic_axes={
            "features": {0: "N", 2: "F"},
            "audio": {0: "N", 1: "T"},
        },
    )

    meta_data = {
        "model_type": "Vocos",
        "version": "1",
        "model_author": "k2-fsa",
        "comment": "ConvNext Vocos",
    }
    logging.info(f"meta_data: {meta_data}")

    add_meta_data(filename=model_filename, meta_data=meta_data)


@torch.no_grad()
def main():
    args = get_parser().parse_args()
    args.exp_dir = Path(args.exp_dir)

    params = get_params()
    params.update(vars(args))

    device = torch.device("cpu")
    params.device = device
    logging.info(params)

    logging.info("About to create model")
    model = get_model(params)
    model.to(device)

    if not params.use_averaged_model:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                if i >= 1:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg + 1
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )

    model.eval()
    vocos = model.generator

    if params.iter > 0:
        suffix = f"iter-{params.iter}"
    else:
        suffix = f"epoch-{params.epoch}"

    suffix += f"-avg-{params.avg}"

    opset_version = 13

    logging.info("Exporting model")
    model_filename = params.exp_dir / f"vocos-{suffix}.onnx"
    export_model_onnx(
        vocos,
        model_filename,
        opset_version=opset_version,
    )
    logging.info(f"Exported vocos generator to {model_filename}")

    if params.fp16:
        logging.info("Generate fp16 models")

        model = onnx.load(model_filename)
        model_fp16 = float16.convert_float_to_float16(model, keep_io_types=True)
        model_filename_fp16 = params.exp_dir / f"vocos-{suffix}.fp16.onnx"
        onnx.save(model_fp16, model_filename_fp16)

    # Generate int8 quantization models
    # See https://onnxruntime.ai/docs/performance/model-optimizations/quantization.html#data-type-selection

    logging.info("Generate int8 quantization models")

    model_filename_int8 = params.exp_dir / f"vocos-{suffix}.int8.onnx"
    quantize_dynamic(
        model_input=model_filename,
        model_output=model_filename_int8,
        op_types_to_quantize=["MatMul"],
        weight_type=QuantType.QInt8,
    )


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
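Once the ONNX file has been produced, something like the following sketch can run it with onnxruntime; the file path and feature values are placeholders, while the `features`/`audio` names and shapes come from `export_model_onnx` above:

import numpy as np
import onnxruntime as ort

# Placeholder path; the actual name depends on --epoch/--iter and --avg.
session = ort.InferenceSession("vocos/exp/vocos-epoch-28-avg-15.onnx")

# A dummy batch of mel features with shape (N, feature_dim, T).
features = np.random.randn(1, 80, 100).astype(np.float32)

(audio,) = session.run(["audio"], {"features": features})
print(audio.shape)  # (1, num_samples)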
egs/libritts/TTS/vocos/export.py (Executable file, 407 lines added)
@@ -0,0 +1,407 @@
#!/usr/bin/env python3
#
# Copyright 2024 Xiaomi Corporation (Author: Wei Kang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script converts several saved checkpoints
# to a single one using model averaging.
"""
Usage:

Note: This is an example for the libritts dataset. If you are using a
different dataset, you should change the argument values according to
your dataset.

(1) Export to torchscript model using torch.jit.script()

./vocos/export.py \
  --exp-dir ./vocos/exp \
  --epoch 30 \
  --avg 9 \
  --jit 1

It will generate a file `jit_script.pt` in the given `exp_dir`. You can later
load it by `torch.jit.load("jit_script.pt")`.

Check ./jit_pretrained.py for its usage.

Check https://github.com/k2-fsa/sherpa
for how to use the exported models outside of icefall.

(2) Export `model.state_dict()`

./vocos/export.py \
  --exp-dir ./vocos/exp \
  --epoch 30 \
  --avg 9

It will generate a file `generator.pt` in the given `exp_dir`, which contains
the state dict of the vocos generator. You can later load it with
`torch.load()` or `icefall.checkpoint.load_checkpoint()`.
"""

import argparse
import logging
from pathlib import Path
from typing import List, Tuple

import torch
from torch import Tensor, nn
from train import add_model_arguments, get_model, get_params

from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
)

from icefall.utils import make_pad_mask, str2bool
from utils import load_checkpoint


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--sampling-rate",
        type=int,
        default=24000,
        help="The sampling rate of the libritts dataset",
    )

    parser.add_argument(
        "--frame-shift",
        type=int,
        default=256,
        help="Frame shift.",
    )

    parser.add_argument(
        "--frame-length",
        type=int,
        default=1024,
        help="Frame length.",
    )

    parser.add_argument(
        "--epoch",
        type=int,
        default=30,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )

    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )

    parser.add_argument(
        "--avg",
        type=int,
        default=9,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch' and '--iter'",
    )

    parser.add_argument(
        "--use-averaged-model",
        type=str2bool,
        default=True,
        help="Whether to load averaged model. Currently it only supports "
        "using --epoch. If True, it would decode with the averaged model "
        "over the epoch range from `epoch-avg` (excluded) to `epoch`. "
        "Actually only the models with epoch number of `epoch-avg` and "
        "`epoch` are loaded for averaging. ",
    )

    parser.add_argument(
        "--exp-dir",
        type=str,
        default="vocos/exp",
        help="""It specifies the directory where all training related
        files, e.g., checkpoints, log, etc, are saved
        """,
    )

    parser.add_argument(
        "--jit",
        type=str2bool,
        default=False,
        help="""True to save a model after applying torch.jit.script.
        It will generate a file named jit_script.pt.
        Check ./jit_pretrained.py for how to use it.
        """,
    )

    add_model_arguments(parser)

    return parser


class EncoderModel(nn.Module):
    """A wrapper for encoder and encoder_embed"""

    def __init__(self, encoder: nn.Module, encoder_embed: nn.Module) -> None:
        super().__init__()
        self.encoder = encoder
        self.encoder_embed = encoder_embed

    def forward(
        self, features: Tensor, feature_lengths: Tensor
    ) -> Tuple[Tensor, Tensor]:
        """
        Args:
          features: (N, T, C)
          feature_lengths: (N,)
        """
        x, x_lens = self.encoder_embed(features, feature_lengths)

        src_key_padding_mask = make_pad_mask(x_lens)
        x = x.permute(1, 0, 2)  # (N, T, C) -> (T, N, C)

        encoder_out, encoder_out_lens = self.encoder(x, x_lens, src_key_padding_mask)
        encoder_out = encoder_out.permute(1, 0, 2)  # (T, N, C) -> (N, T, C)

        return encoder_out, encoder_out_lens


@torch.no_grad()
def main():
    args = get_parser().parse_args()
    args.exp_dir = Path(args.exp_dir)

    params = get_params()
    params.update(vars(args))

    device = torch.device("cpu")
    params.device = device
    logging.info(f"device: {device}")

    logging.info(params)

    logging.info("About to create model")
    model = get_model(params)

    if not params.use_averaged_model:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                if i >= 1:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg + 1
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )

    model.eval()

    model = model.generator

    if params.jit is True:
        model.encoder = EncoderModel(model.encoder, model.encoder_embed)
        filename = "jit_script.pt"

        logging.info("Using torch.jit.script")
        model = torch.jit.script(model)
        model.save(str(params.exp_dir / filename))
        logging.info(f"Saved to {filename}")
    else:
        logging.info("Not using torchscript. Export model.state_dict()")
        # Save it using a format so that it can be loaded
        # by :func:`load_checkpoint`
        filename = params.exp_dir / "generator.pt"
        torch.save({"model": model.state_dict()}, str(filename))
        logging.info(f"Saved to {filename}")


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"

    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
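The non-torchscript branch above saves the generator's state dict under the key "model". A hedged sketch of loading it back for standalone inference; the constructor arguments are the defaults from generator.py and are assumptions here, since they must match the training configuration:

import torch

from generator import Generator

# Assumed hyper-parameters; they must match what was used during training.
generator = Generator(feature_dim=80, dim=512, n_fft=1024, hop_length=256)

state = torch.load("vocos/exp/generator.pt", map_location="cpu")
generator.load_state_dict(state["model"])
generator.eval()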
egs/libritts/TTS/vocos/generator.py (Normal file, 264 lines added)
@@ -0,0 +1,264 @@
import logging
from typing import Optional

import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F


def window_sumsquare(
    window: torch.Tensor,
    n_samples: int,
    hop_length: int = 256,
    win_length: int = 1024,
):
    """
    Compute the sum-square envelope of a window function at a given hop length.
    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : torch.Tensor
        The window function.
    n_samples : int > 0
        The number of expected samples.
    hop_length : int > 0
        The number of samples to advance between frames
    win_length :
        The length of the window function.

    Returns
    -------
    wss : torch.Tensor, The sum-squared envelope of the window function.
    """

    n_frames = (n_samples - win_length) // hop_length + 1
    output_size = (n_frames - 1) * hop_length + win_length
    device = window.device

    # Window envelope
    window_sq = window.square().expand(1, n_frames, -1).transpose(1, 2)
    window_envelope = torch.nn.functional.fold(
        window_sq,
        output_size=(1, output_size),
        kernel_size=(1, win_length),
        stride=(1, hop_length),
    ).squeeze()
    window_envelope = torch.nn.functional.pad(
        window_envelope, (0, n_samples - output_size)
    )
    return window_envelope


class ISTFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""

    def __init__(
        self,
        filter_length: int = 1024,
        hop_length: int = 256,
        win_length: int = 1024,
        padding: str = "none",
        window_type: str = "povey",
        max_samples: int = 1440000,  # 1440000 / 24000 = 60s
    ):
        super(ISTFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.padding = padding
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack(
            [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
        )
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :]
        )

        assert filter_length >= win_length
        # Consistency with lhotse, search "create_frame_window" in https://github.com/lhotse-speech/lhotse
        assert window_type in [
            "hanning",
            "povey",
        ], f"Only 'hanning' and 'povey' windows are supported, given {window_type}."
        fft_window = torch.hann_window(win_length, periodic=False)
        if window_type == "povey":
            fft_window = fft_window.pow(0.85)

        if filter_length > win_length:
            pad_size = (filter_length - win_length) // 2
            fft_window = torch.nn.functional.pad(fft_window, (pad_size, pad_size))

        window_sum = window_sumsquare(
            window=fft_window,
            n_samples=max_samples,
            hop_length=hop_length,
            win_length=filter_length,
        )

        inverse_basis *= fft_window

        self.register_buffer("inverse_basis", inverse_basis.float())
        self.register_buffer("fft_window", fft_window)
        self.register_buffer("window_sum", window_sum)
        self.tiny = torch.finfo(torch.float16).tiny

    def forward(self, magnitude, phase):
        magnitude_phase = torch.cat(
            [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
        )
        inverse_transform = F.conv_transpose1d(
            magnitude_phase,
            Variable(self.inverse_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0,
        )
        inverse_transform = inverse_transform.squeeze(1)

        window_sum = self.window_sum
        if not torch.jit.is_scripting() and not torch.jit.is_tracing():
            if self.window_sum.size(-1) < inverse_transform.size(-1):
                logging.warning(
                    f"The precomputed `window_sumsquare` is too small, recomputing, "
                    f"from {self.window_sum.size(-1)} to {inverse_transform.size(-1)}"
                )
                window_sum = window_sumsquare(
                    window=self.fft_window,
                    n_samples=inverse_transform.size(-1),
                    win_length=self.filter_length,
                    hop_length=self.hop_length,
                )
        window_sum = window_sum[: inverse_transform.size(-1)]
        approx_nonzero_indices = (window_sum > self.tiny).nonzero().squeeze()

        inverse_transform[:, approx_nonzero_indices] /= window_sum[
            approx_nonzero_indices
        ]

        # scale by hop ratio
        inverse_transform *= float(self.filter_length) / self.hop_length
        assert self.padding in ["none", "same", "center"]
        if self.padding == "center":
            pad_len = self.filter_length // 2
        elif self.padding == "same":
            pad_len = (self.filter_length - self.hop_length) // 2
        else:
            return inverse_transform
        return inverse_transform[:, pad_len:-pad_len]


class ConvNeXtBlock(nn.Module):
    """ConvNeXt Block adapted from https://github.com/facebookresearch/ConvNeXt to 1D audio signal.

    Args:
        dim (int): Number of input channels.
        intermediate_dim (int): Dimensionality of the intermediate layer.
        layer_scale_init_value (float, optional): Initial value for the layer scale. None means no scaling.
            Defaults to None.
    """

    def __init__(
        self,
        dim: int,
        intermediate_dim: int,
        layer_scale_init_value: Optional[float] = None,
    ):
        super().__init__()
        self.dwconv = nn.Conv1d(
            dim, dim, kernel_size=7, padding=3, groups=dim
        )  # depthwise conv
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        # pointwise/1x1 convs, implemented with linear layers
        self.pwconv1 = nn.Linear(dim, intermediate_dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(intermediate_dim, dim)
        self.gamma = (
            nn.Parameter(layer_scale_init_value * torch.ones(dim), requires_grad=True)
            if layer_scale_init_value > 0
            else None
        )

    def forward(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        residual = x
        x = self.dwconv(x)
        x = x.transpose(1, 2)  # (B, C, T) -> (B, T, C)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.transpose(1, 2)  # (B, T, C) -> (B, C, T)

        x = residual + x
        return x


class Generator(torch.nn.Module):
    def __init__(
        self,
        feature_dim: int = 80,
        dim: int = 512,
        n_fft: int = 1024,
        hop_length: int = 256,
        intermediate_dim: int = 1536,
        num_layers: int = 8,
        padding: str = "none",
        max_samples: int = 1440000,  # 1440000 / 24000 = 60s
    ):
        super(Generator, self).__init__()
        self.feature_dim = feature_dim
        self.embed = nn.Conv1d(feature_dim, dim, kernel_size=7, padding=3)

        self.norm = nn.LayerNorm(dim, eps=1e-6)

        layer_scale_init_value = 1 / num_layers
        self.convnext = nn.ModuleList(
            [
                ConvNeXtBlock(
                    dim=dim,
                    intermediate_dim=intermediate_dim,
                    layer_scale_init_value=layer_scale_init_value,
                )
                for _ in range(num_layers)
            ]
        )

        self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
        self.apply(self._init_weights)

        self.out_proj = torch.nn.Linear(dim, n_fft + 2)
        self.istft = ISTFT(
            filter_length=n_fft,
            hop_length=hop_length,
            win_length=n_fft,
            padding=padding,
            max_samples=max_samples,
        )

    def _init_weights(self, m):
        if isinstance(m, (nn.Conv1d, nn.Linear)):
            nn.init.trunc_normal_(m.weight, std=0.02)
            nn.init.constant_(m.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embed(x)
        x = self.norm(x.transpose(1, 2))
        x = x.transpose(1, 2)
        for conv_block in self.convnext:
            x = conv_block(x)
        x = self.final_layer_norm(x.transpose(1, 2))
        x = self.out_proj(x).transpose(1, 2)
        mag, phase = x.chunk(2, dim=1)
        mag = torch.exp(mag)
        # safeguard to prevent excessively large magnitudes
        mag = torch.clip(mag, max=1e2)
        audio = self.istft(mag, phase)
        return audio
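A small sketch of the feature-to-waveform path implemented by Generator.forward; the feature tensor below is random, and its shape (batch, feature_dim, frames) is the convention assumed by self.embed:

import torch

from generator import Generator

gen = Generator()  # defaults: feature_dim=80, n_fft=1024, hop_length=256

# 100 frames of 80-dim mel features for a single utterance.
mel = torch.randn(1, 80, 100)

with torch.no_grad():
    audio = gen(mel)

# With padding="none" the output length is roughly frames * hop_length
# samples plus the analysis window tail, here on the order of 100 * 256.
print(audio.shape)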
egs/libritts/TTS/vocos/infer.py (Executable file, 352 lines added)
@@ -0,0 +1,352 @@
#!/usr/bin/env python3
# Copyright    2024    Xiaomi Corp.        (authors: Wei Kang
#                                                    Han Zhu)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import logging
import math
import time
import os
from functools import partial
from pathlib import Path

import torch
import torch.nn as nn
from lhotse.utils import fix_random_seed
from scipy.io.wavfile import write
from train import add_model_arguments, get_model, get_params
from tts_datamodule import LibriTTSDataModule

from icefall.checkpoint import (
    average_checkpoints,
    average_checkpoints_with_averaged_model,
    find_checkpoints,
    load_checkpoint,
)
from icefall.utils import AttributeDict, setup_logger, str2bool

LOG_EPS = math.log(1e-10)


def get_parser():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--epoch",
        type=int,
        default=100,
        help="""It specifies the checkpoint to use for decoding.
        Note: Epoch counts from 1.
        You can specify --avg to use more checkpoints for model averaging.""",
    )

    parser.add_argument(
        "--iter",
        type=int,
        default=0,
        help="""If positive, --epoch is ignored and it
        will use the checkpoint exp_dir/checkpoint-iter.pt.
        You can specify --avg to use more checkpoints for model averaging.
        """,
    )

    parser.add_argument(
        "--avg",
        type=int,
        default=10,
        help="Number of checkpoints to average. Automatically select "
        "consecutive checkpoints before the checkpoint specified by "
        "'--epoch' and '--iter'",
    )

    parser.add_argument(
        "--use-averaged-model",
        type=str2bool,
        default=False,
        help="Whether to load averaged model. Currently it only supports "
        "using --epoch. If True, it would decode with the averaged model "
        "over the epoch range from `epoch-avg` (excluded) to `epoch`. "
        "Actually only the models with epoch number of `epoch-avg` and "
        "`epoch` are loaded for averaging. ",
    )

    parser.add_argument(
        "--exp-dir",
        type=str,
        default="vocos/exp",
        help="The experiment dir",
    )

    parser.add_argument(
        "--generate-dir",
        type=str,
        default="generated_wavs",
        help="Path name of the generated wavs",
    )

    add_model_arguments(parser)

    return parser


def decode_one_batch(
    params: AttributeDict,
    model: nn.Module,
    batch: dict,
):
    """
    Args:
      params:
        It's the return value of :func:`get_params`.
      model:
        The vocoder model that converts mel features into waveforms.
      batch:
        It is the return value from iterating
        `lhotse.dataset.K2SpeechRecognitionDataset`. See its documentation
        for the format of the `batch`.
    Returns:
      Nothing. The generated waveforms are written to
      `{params.res_dir}/{params.suffix}`.
    """
    device = next(model.parameters()).device

    cut_ids = [cut.id for cut in batch["cut"]]

    infer_time = 0
    audio_time = 0

    features = batch["features"]  # (B, T, F)
    utt_durations = batch["features_lens"]

    x = features.permute(0, 2, 1)  # (B, F, T)

    audio_time += torch.sum(utt_durations)

    start = time.time()

    audios = model(x.to(device))  # (B, T)

    infer_time += time.time() - start

    wav_dir = f"{params.res_dir}/{params.suffix}"
    os.makedirs(wav_dir, exist_ok=True)

    for i in range(audios.shape[0]):
        audio = audios[i][: int(utt_durations[i] * 256)]
        audio = audio.cpu().squeeze().numpy()
        write(f"{wav_dir}/{cut_ids[i]}.wav", 24000, audio)

    print(f"RTF : {infer_time / (audio_time * (256/24000))}")


def decode_dataset(
    dl: torch.utils.data.DataLoader,
    params: AttributeDict,
    model: nn.Module,
    test_set: str,
):
    """Decode dataset.

    Args:
      dl:
        PyTorch's dataloader containing the dataset to decode.
      params:
        It is returned by :func:`get_params`.
      model:
        The vocoder model that converts mel features into waveforms.
      test_set:
        The name of the test_set
    """
    num_cuts = 0

    try:
        num_batches = len(dl)
    except TypeError:
        num_batches = "?"

    with open(f"{params.res_dir}/{test_set}.scp", "w", encoding="utf8") as f:
        for batch_idx, batch in enumerate(dl):
            # texts = batch["text"]
            cut_ids = [cut.id for cut in batch["cut"]]

            decode_one_batch(
                params=params,
                model=model,
                batch=batch,
            )

            # assert len(texts) == len(cut_ids), (len(texts), len(cut_ids))

            # for i in range(len(texts)):
            #     f.write(f"{cut_ids[i]}\t{texts[i]}\n")

            # num_cuts += len(texts)

            if batch_idx % 50 == 0:
                batch_str = f"{batch_idx}/{num_batches}"

                logging.info(
                    f"batch {batch_str}, cuts processed until now is {num_cuts}"
                )


@torch.no_grad()
def main():
    parser = get_parser()
    LibriTTSDataModule.add_arguments(parser)
    args = parser.parse_args()
    args.exp_dir = Path(args.exp_dir)

    params = get_params()
    params.update(vars(args))

    params.res_dir = params.exp_dir / params.generate_dir

    if params.iter > 0:
        params.suffix = f"iter-{params.iter}-avg-{params.avg}"
    else:
        params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"

    if params.use_averaged_model:
        params.suffix += "-use-averaged-model"

    setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
    logging.info("Decoding started")

    device = torch.device("cpu")
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    params.device = device

    logging.info(f"Device: {device}")

    logging.info(params)
    fix_random_seed(666)

    logging.info("About to create model")
    model = get_model(params)

    if not params.use_averaged_model:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
        elif params.avg == 1:
            load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
        else:
            start = params.epoch - params.avg + 1
            filenames = []
            for i in range(start, params.epoch + 1):
                if i >= 1:
                    filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
            logging.info(f"averaging {filenames}")
            model.to(device)
            model.load_state_dict(average_checkpoints(filenames, device=device))
    else:
        if params.iter > 0:
            filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
                : params.avg + 1
            ]
            if len(filenames) == 0:
                raise ValueError(
                    f"No checkpoints found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            elif len(filenames) < params.avg + 1:
                raise ValueError(
                    f"Not enough checkpoints ({len(filenames)}) found for"
                    f" --iter {params.iter}, --avg {params.avg}"
                )
            filename_start = filenames[-1]
            filename_end = filenames[0]
            logging.info(
                "Calculating the averaged model over iteration checkpoints"
                f" from {filename_start} (excluded) to {filename_end}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )
        else:
            assert params.avg > 0, params.avg
            start = params.epoch - params.avg
            assert start >= 1, start
            filename_start = f"{params.exp_dir}/epoch-{start}.pt"
            filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
            logging.info(
                f"Calculating the averaged model over epoch range from "
                f"{start} (excluded) to {params.epoch}"
            )
            model.to(device)
            model.load_state_dict(
                average_checkpoints_with_averaged_model(
                    filename_start=filename_start,
                    filename_end=filename_end,
                    device=device,
                )
            )

    model = model.to(device)
    model.eval()

    num_param = sum([p.numel() for p in model.parameters()])
    logging.info(f"Number of model parameters: {num_param}")

    # we need cut ids to name the generated wavs.
    args.return_cuts = True
    libritts = LibriTTSDataModule(args)

    test_cuts = libritts.test_clean_cuts()

    test_dl = libritts.test_dataloaders(test_cuts)

    test_sets = ["test"]
    test_dls = [test_dl]

    for test_set, test_dl in zip(test_sets, test_dls):
        decode_dataset(
            dl=test_dl,
            params=params,
            model=model,
            test_set=test_set,
        )

    logging.info("Done!")


if __name__ == "__main__":
    main()
133
egs/libritts/TTS/vocos/loss.py
Normal file
@ -0,0 +1,133 @@
from typing import List, Tuple

import torch
import torchaudio
from torch import nn

from utils import safe_log


class MelSpecReconstructionLoss(nn.Module):
    """
    L1 distance between the mel-scaled magnitude spectrograms of the ground truth sample and the generated sample
    """

    def __init__(
        self,
        sample_rate: int = 24000,
        n_fft: int = 1024,
        hop_length: int = 256,
        n_mels: int = 100,
    ):
        super().__init__()
        self.mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels,
            center=True,
            power=1,
        )

    def forward(self, y_hat, y) -> torch.Tensor:
        """
        Args:
            y_hat (Tensor): Predicted audio waveform.
            y (Tensor): Ground truth audio waveform.

        Returns:
            Tensor: L1 loss between the mel-scaled magnitude spectrograms.
        """
        mel_hat = safe_log(self.mel_spec(y_hat))
        mel = safe_log(self.mel_spec(y))

        loss = torch.nn.functional.l1_loss(mel, mel_hat)

        return loss


class GeneratorLoss(nn.Module):
    """
    Generator Loss module. Calculates the loss for the generator based on discriminator outputs.
    """

    def forward(
        self, disc_outputs: List[torch.Tensor]
    ) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """
        Args:
            disc_outputs (List[Tensor]): List of discriminator outputs.

        Returns:
            Tuple[Tensor, List[Tensor]]: Tuple containing the total loss and a list of loss values from
                the sub-discriminators
        """
        loss = torch.zeros(
            1, device=disc_outputs[0].device, dtype=disc_outputs[0].dtype
        )
        gen_losses = []
        for dg in disc_outputs:
            l = torch.mean(torch.clamp(1 - dg, min=0))
            gen_losses.append(l)
            loss += l

        return loss, gen_losses


class DiscriminatorLoss(nn.Module):
    """
    Discriminator Loss module. Calculates the loss for the discriminator based on real and generated outputs.
    """

    def forward(
        self,
        disc_real_outputs: List[torch.Tensor],
        disc_generated_outputs: List[torch.Tensor],
    ) -> Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]]:
        """
        Args:
            disc_real_outputs (List[Tensor]): List of discriminator outputs for real samples.
            disc_generated_outputs (List[Tensor]): List of discriminator outputs for generated samples.

        Returns:
            Tuple[Tensor, List[Tensor], List[Tensor]]: A tuple containing the total loss, a list of loss values from
                the sub-discriminators for real outputs, and a list of
                loss values for generated outputs.
        """
        loss = torch.zeros(
            1, device=disc_real_outputs[0].device, dtype=disc_real_outputs[0].dtype
        )
        r_losses = []
        g_losses = []
        for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
            r_loss = torch.mean(torch.clamp(1 - dr, min=0))
            g_loss = torch.mean(torch.clamp(1 + dg, min=0))
            loss += r_loss + g_loss
            r_losses.append(r_loss.item())
            g_losses.append(g_loss.item())

        return loss, r_losses, g_losses


class FeatureMatchingLoss(nn.Module):
    """
    Feature Matching Loss module. Calculates the feature matching loss between feature maps of the sub-discriminators.
    """

    def forward(
        self, fmap_r: List[List[torch.Tensor]], fmap_g: List[List[torch.Tensor]]
    ) -> torch.Tensor:
        """
        Args:
            fmap_r (List[List[Tensor]]): List of feature maps from real samples.
            fmap_g (List[List[Tensor]]): List of feature maps from generated samples.

        Returns:
            Tensor: The calculated feature matching loss.
        """
        loss = torch.zeros(1, device=fmap_r[0][0].device, dtype=fmap_r[0][0].dtype)
        for dr, dg in zip(fmap_r, fmap_g):
            for rl, gl in zip(dr, dg):
                loss += torch.mean(torch.abs(rl - gl))

        return loss
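A minimal sketch of how these modules are typically combined in GAN vocoder training, assuming loss.py and utils.py above are importable; the dummy tensors and the loss weights are illustrative assumptions, not values taken from this recipe:

import torch

# Dummy discriminator outputs / feature maps standing in for MPD/MRD results.
real_scores = [torch.randn(2, 10) for _ in range(3)]
fake_scores = [torch.randn(2, 10) for _ in range(3)]
fmap_real = [[torch.randn(2, 8, 10)] for _ in range(3)]
fmap_fake = [[torch.randn(2, 8, 10)] for _ in range(3)]
real_audio = torch.randn(2, 24000)  # dummy ground-truth waveforms
fake_audio = torch.randn(2, 24000)  # dummy generated waveforms

# Discriminator objective: hinge loss on real vs. generated scores.
d_loss, _, _ = DiscriminatorLoss()(real_scores, fake_scores)

# Generator objective: adversarial + feature matching + mel reconstruction.
g_adv, _ = GeneratorLoss()(fake_scores)
g_fm = FeatureMatchingLoss()(fmap_real, fmap_fake)
g_mel = MelSpecReconstructionLoss(sample_rate=24000)(fake_audio, real_audio)
g_loss = g_adv + 2.0 * g_fm + 45.0 * g_mel  # the weights here are illustrative only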
48
egs/libritts/TTS/vocos/model.py
Normal file
@ -0,0 +1,48 @@
import logging
import torch
from discriminators import MultiPeriodDiscriminator, MultiResolutionDiscriminator
from generator import Generator
from loss import (
    DiscriminatorLoss,
    GeneratorLoss,
    FeatureMatchingLoss,
    MelSpecReconstructionLoss,
)


class Vocos(torch.nn.Module):
    def __init__(
        self,
        feature_dim: int = 80,
        dim: int = 512,
        n_fft: int = 1024,
        hop_length: int = 256,
        intermediate_dim: int = 1536,
        num_layers: int = 8,
        padding: str = "none",
        sample_rate: int = 24000,
        max_seconds: int = 60,
    ):
        super(Vocos, self).__init__()
        self.generator = Generator(
            feature_dim=feature_dim,
            dim=dim,
            n_fft=n_fft,
            hop_length=hop_length,
            num_layers=num_layers,
            intermediate_dim=intermediate_dim,
            padding=padding,
            max_samples=int(sample_rate * max_seconds),
        )

        self.mpd = MultiPeriodDiscriminator()
        self.mrd = MultiResolutionDiscriminator()

        self.disc_loss = DiscriminatorLoss()
        self.gen_loss = GeneratorLoss()
        self.feat_matching_loss = FeatureMatchingLoss()
        self.melspec_loss = MelSpecReconstructionLoss(sample_rate=sample_rate)

    def forward(self, features: torch.Tensor):
        audio = self.generator(features)
        return audio
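A minimal copy-synthesis sketch, assuming the Vocos class above and its dependencies (generator.py, discriminators.py, loss.py) are importable; the feature layout (batch, feature_dim, num_frames) follows how pretrained.py feeds features to the generator:

import torch

model = Vocos(feature_dim=80, n_fft=1024, hop_length=256, sample_rate=24000)
model.eval()

features = torch.randn(1, 80, 200)  # (batch, feature_dim, num_frames), dummy fbank
with torch.no_grad():
    audio = model(features)  # generated waveform, roughly num_frames * hop_length samples
print(audio.shape)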
268
egs/libritts/TTS/vocos/onnx_pretrained.py
Executable file
@ -0,0 +1,268 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2022 Xiaomi Corp. (authors: Fangjun Kuang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
This script loads ONNX models and uses them to decode waves.
|
||||||
|
You can use the following command to get the exported models:
|
||||||
|
|
||||||
|
We use the pre-trained model from
|
||||||
|
https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
|
||||||
|
as an example to show how to use this file.
|
||||||
|
|
||||||
|
1. Download the pre-trained model
|
||||||
|
|
||||||
|
cd egs/librispeech/ASR
|
||||||
|
|
||||||
|
repo_url=https://huggingface.co/Zengwei/icefall-asr-librispeech-zipformer-2023-05-15
|
||||||
|
GIT_LFS_SKIP_SMUDGE=1 git clone $repo_url
|
||||||
|
repo=$(basename $repo_url)
|
||||||
|
|
||||||
|
pushd $repo
|
||||||
|
git lfs pull --include "exp/pretrained.pt"
|
||||||
|
|
||||||
|
cd exp
|
||||||
|
ln -s pretrained.pt epoch-99.pt
|
||||||
|
popd
|
||||||
|
|
||||||
|
2. Export the model to ONNX
|
||||||
|
|
||||||
|
./zipformer/export-onnx.py \
|
||||||
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
|
--use-averaged-model 0 \
|
||||||
|
--epoch 99 \
|
||||||
|
--avg 1 \
|
||||||
|
--exp-dir $repo/exp \
|
||||||
|
--causal False
|
||||||
|
|
||||||
|
It will generate the following 3 files inside $repo/exp:
|
||||||
|
|
||||||
|
- encoder-epoch-99-avg-1.onnx
|
||||||
|
- decoder-epoch-99-avg-1.onnx
|
||||||
|
- joiner-epoch-99-avg-1.onnx
|
||||||
|
|
||||||
|
3. Run this file
|
||||||
|
|
||||||
|
./zipformer/onnx_pretrained.py \
|
||||||
|
--encoder-model-filename $repo/exp/encoder-epoch-99-avg-1.onnx \
|
||||||
|
--decoder-model-filename $repo/exp/decoder-epoch-99-avg-1.onnx \
|
||||||
|
--joiner-model-filename $repo/exp/joiner-epoch-99-avg-1.onnx \
|
||||||
|
--tokens $repo/data/lang_bpe_500/tokens.txt \
|
||||||
|
$repo/test_wavs/1089-134686-0001.wav \
|
||||||
|
$repo/test_wavs/1221-135766-0001.wav \
|
||||||
|
$repo/test_wavs/1221-135766-0002.wav
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
import onnxruntime as ort
|
||||||
|
import torch
|
||||||
|
import torchaudio
|
||||||
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
|
from lhotse import Fbank, FbankConfig
|
||||||
|
from icefall.utils import str2bool
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--model-filename",
|
||||||
|
type=str,
|
||||||
|
required=True,
|
||||||
|
help="Path to the encoder onnx model. ",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--sampling-rate",
|
||||||
|
type=int,
|
||||||
|
default=24000,
|
||||||
|
help="The sampleing rate of libritts dataset",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--frame-shift",
|
||||||
|
type=int,
|
||||||
|
default=256,
|
||||||
|
help="Frame shift.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--frame-length",
|
||||||
|
type=int,
|
||||||
|
default=1024,
|
||||||
|
help="Frame shift.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--use-fft-mag",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="Whether to use magnitude of fbank, false to use power energy.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-dir",
|
||||||
|
type=str,
|
||||||
|
default="generated_audios",
|
||||||
|
help="The generated will be written to.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"sound_files",
|
||||||
|
type=str,
|
||||||
|
nargs="+",
|
||||||
|
help="The input sound file(s) to transcribe. "
|
||||||
|
"Supported formats are those supported by torchaudio.load(). "
|
||||||
|
"For example, wav and flac are supported. "
|
||||||
|
"The sample rate has to be 16kHz.",
|
||||||
|
)
|
||||||
|
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
class OnnxModel:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model_filename: str,
|
||||||
|
):
|
||||||
|
session_opts = ort.SessionOptions()
|
||||||
|
session_opts.inter_op_num_threads = 1
|
||||||
|
session_opts.intra_op_num_threads = 4
|
||||||
|
|
||||||
|
self.session_opts = session_opts
|
||||||
|
|
||||||
|
self.init_model(model_filename)
|
||||||
|
|
||||||
|
def init_model(self, model_filename: str):
|
||||||
|
self.model = ort.InferenceSession(
|
||||||
|
model_filename,
|
||||||
|
sess_options=self.session_opts,
|
||||||
|
providers=["CPUExecutionProvider"],
|
||||||
|
)
|
||||||
|
|
||||||
|
def run_model(
|
||||||
|
self,
|
||||||
|
x: torch.Tensor,
|
||||||
|
) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
x:
|
||||||
|
A 3-D tensor of shape (N, T, C)
|
||||||
|
x_lens:
|
||||||
|
A 2-D tensor of shape (N,). Its dtype is torch.int64
|
||||||
|
Returns:
|
||||||
|
Return a tuple containing:
|
||||||
|
- encoder_out, its shape is (N, T', joiner_dim)
|
||||||
|
- encoder_out_lens, its shape is (N,)
|
||||||
|
"""
|
||||||
|
out = self.model.run(
|
||||||
|
[
|
||||||
|
self.model.get_outputs()[0].name,
|
||||||
|
],
|
||||||
|
{
|
||||||
|
self.model.get_inputs()[0].name: x.numpy(),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return torch.from_numpy(out[0])
|
||||||
|
|
||||||
|
|
||||||
|
def read_sound_files(
|
||||||
|
filenames: List[str], expected_sample_rate: float
|
||||||
|
) -> List[torch.Tensor]:
|
||||||
|
"""Read a list of sound files into a list 1-D float32 torch tensors.
|
||||||
|
Args:
|
||||||
|
filenames:
|
||||||
|
A list of sound filenames.
|
||||||
|
expected_sample_rate:
|
||||||
|
The expected sample rate of the sound files.
|
||||||
|
Returns:
|
||||||
|
Return a list of 1-D float32 torch tensors.
|
||||||
|
"""
|
||||||
|
ans = []
|
||||||
|
for f in filenames:
|
||||||
|
wave, sample_rate = torchaudio.load(f)
|
||||||
|
assert (
|
||||||
|
sample_rate == expected_sample_rate
|
||||||
|
), f"expected sample rate: {expected_sample_rate}. Given: {sample_rate}"
|
||||||
|
# We use only the first channel
|
||||||
|
ans.append(wave[0])
|
||||||
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def main():
|
||||||
|
parser = get_parser()
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
output_dir = Path(args.model_filename).parent / args.output_dir
|
||||||
|
output_dir.mkdir(exist_ok=True)
|
||||||
|
args.output_dir = output_dir
|
||||||
|
logging.info(vars(args))
|
||||||
|
|
||||||
|
model = OnnxModel(model_filename=args.model_filename)
|
||||||
|
|
||||||
|
config = FbankConfig(
|
||||||
|
sampling_rate=args.sampling_rate,
|
||||||
|
frame_length=args.frame_length / args.sampling_rate, # (in second),
|
||||||
|
frame_shift=args.frame_shift / args.sampling_rate, # (in second)
|
||||||
|
use_fft_mag=args.use_fft_mag,
|
||||||
|
)
|
||||||
|
fbank = Fbank(config)
|
||||||
|
|
||||||
|
logging.info(f"Reading sound files: {args.sound_files}")
|
||||||
|
|
||||||
|
waves = read_sound_files(
|
||||||
|
filenames=args.sound_files, expected_sample_rate=args.sampling_rate
|
||||||
|
)
|
||||||
|
wave_lengths = [w.size(0) for w in waves]
|
||||||
|
waves = pad_sequence(waves, batch_first=True, padding_value=0)
|
||||||
|
|
||||||
|
logging.info(f"waves : {waves.shape}")
|
||||||
|
|
||||||
|
features = fbank.extract_batch(waves, sampling_rate=args.sampling_rate)
|
||||||
|
|
||||||
|
if features.dim() == 2:
|
||||||
|
features = features.unsqueeze(0)
|
||||||
|
|
||||||
|
features = features.permute(0, 2, 1)
|
||||||
|
|
||||||
|
logging.info(f"features : {features.shape}")
|
||||||
|
|
||||||
|
logging.info("Generating started")
|
||||||
|
|
||||||
|
# model forward
|
||||||
|
audios = model.run_model(features)
|
||||||
|
|
||||||
|
for i, filename in enumerate(args.sound_files):
|
||||||
|
audio = audios[i : i + 1, 0 : wave_lengths[i]]
|
||||||
|
ofilename = args.output_dir / filename.split("/")[-1]
|
||||||
|
logging.info(f"Writting audio : {ofilename}")
|
||||||
|
torchaudio.save(str(ofilename), audio.cpu(), args.sampling_rate)
|
||||||
|
|
||||||
|
logging.info("Generating Done")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||||
|
|
||||||
|
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||||
|
main()
|
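A minimal sketch of driving the OnnxModel wrapper above outside of main(); the model path is a placeholder and the feature shape mirrors what main() passes in:

import torch

vocoder = OnnxModel(model_filename="exp/vocos.onnx")  # placeholder path
features = torch.randn(1, 80, 200)  # (batch, num_mel_bins, num_frames)
audio = vocoder.run_model(features)  # one generated waveform per row
print(audio.shape)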
196
egs/libritts/TTS/vocos/pretrained.py
Executable file
@ -0,0 +1,196 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2024 Xiaomi Corp. (authors: Wei Kang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
This script loads a checkpoint and uses it to decode waves.
|
||||||
|
You can generate the checkpoint with the following command:
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torchaudio
|
||||||
|
from torch.nn.utils.rnn import pad_sequence
|
||||||
|
from train import add_model_arguments, get_model, get_params
|
||||||
|
from lhotse import Fbank, FbankConfig
|
||||||
|
|
||||||
|
from icefall.utils import str2bool
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--checkpoint",
|
||||||
|
type=str,
|
||||||
|
required=True,
|
||||||
|
help="Path to the checkpoint. "
|
||||||
|
"The checkpoint is assumed to be saved by "
|
||||||
|
"icefall.checkpoint.save_checkpoint().",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--sampling-rate",
|
||||||
|
type=int,
|
||||||
|
default=24000,
|
||||||
|
help="The sampleing rate of libritts dataset",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--frame-shift",
|
||||||
|
type=int,
|
||||||
|
default=256,
|
||||||
|
help="Frame shift.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--frame-length",
|
||||||
|
type=int,
|
||||||
|
default=1024,
|
||||||
|
help="Frame shift.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--use-fft-mag",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="Whether to use magnitude of fbank, false to use power energy.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-dir",
|
||||||
|
type=str,
|
||||||
|
default="generated_audios",
|
||||||
|
help="The generated will be written to.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"sound_files",
|
||||||
|
type=str,
|
||||||
|
nargs="+",
|
||||||
|
help="The input sound file(s) to transcribe. "
|
||||||
|
"Supported formats are those supported by torchaudio.load(). "
|
||||||
|
"For example, wav and flac are supported. "
|
||||||
|
"The sample rate has to be 16kHz.",
|
||||||
|
)
|
||||||
|
|
||||||
|
add_model_arguments(parser)
|
||||||
|
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def read_sound_files(
|
||||||
|
filenames: List[str], expected_sample_rate: float
|
||||||
|
) -> List[torch.Tensor]:
|
||||||
|
"""Read a list of sound files into a list 1-D float32 torch tensors.
|
||||||
|
Args:
|
||||||
|
filenames:
|
||||||
|
A list of sound filenames.
|
||||||
|
expected_sample_rate:
|
||||||
|
The expected sample rate of the sound files.
|
||||||
|
Returns:
|
||||||
|
Return a list of 1-D float32 torch tensors.
|
||||||
|
"""
|
||||||
|
ans = []
|
||||||
|
for f in filenames:
|
||||||
|
wave, sample_rate = torchaudio.load(f)
|
||||||
|
assert (
|
||||||
|
sample_rate == expected_sample_rate
|
||||||
|
), f"expected sample rate: {expected_sample_rate}. Given: {sample_rate}"
|
||||||
|
# We use only the first channel
|
||||||
|
ans.append(wave[0].contiguous())
|
||||||
|
return ans
|
||||||
|
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def main():
|
||||||
|
parser = get_parser()
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
params = get_params()
|
||||||
|
|
||||||
|
params.update(vars(args))
|
||||||
|
|
||||||
|
device = torch.device("cpu")
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
device = torch.device("cuda", 0)
|
||||||
|
params.device = device
|
||||||
|
|
||||||
|
output_dir = Path(params.checkpoint).parent / params.output_dir
|
||||||
|
output_dir.mkdir(exist_ok=True)
|
||||||
|
params.output_dir = output_dir
|
||||||
|
|
||||||
|
logging.info(f"{params}")
|
||||||
|
|
||||||
|
logging.info("Creating model")
|
||||||
|
model = get_model(params)
|
||||||
|
|
||||||
|
model = model.generator
|
||||||
|
|
||||||
|
checkpoint = torch.load(params.checkpoint, map_location="cpu")
|
||||||
|
model.load_state_dict(checkpoint["model"], strict=False)
|
||||||
|
model.to(device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
logging.info("Constructing Fbank computer")
|
||||||
|
|
||||||
|
config = FbankConfig(
|
||||||
|
sampling_rate=params.sampling_rate,
|
||||||
|
frame_length=params.frame_length / params.sampling_rate, # (in second),
|
||||||
|
frame_shift=params.frame_shift / params.sampling_rate, # (in second)
|
||||||
|
use_fft_mag=params.use_fft_mag,
|
||||||
|
)
|
||||||
|
fbank = Fbank(config)
|
||||||
|
|
||||||
|
logging.info(f"Reading sound files: {params.sound_files}")
|
||||||
|
|
||||||
|
waves = read_sound_files(
|
||||||
|
filenames=params.sound_files, expected_sample_rate=params.sampling_rate
|
||||||
|
)
|
||||||
|
wave_lengths = [w.size(0) for w in waves]
|
||||||
|
waves = pad_sequence(waves, batch_first=True, padding_value=0)
|
||||||
|
|
||||||
|
features = (
|
||||||
|
fbank.extract_batch(waves, sampling_rate=params.sampling_rate)
|
||||||
|
.permute(0, 2, 1)
|
||||||
|
.to(device)
|
||||||
|
)
|
||||||
|
|
||||||
|
logging.info("Generating started")
|
||||||
|
|
||||||
|
# model forward
|
||||||
|
audios = model(features)
|
||||||
|
|
||||||
|
for i, filename in enumerate(params.sound_files):
|
||||||
|
audio = audios[i : i + 1, 0 : wave_lengths[i]]
|
||||||
|
ofilename = params.output_dir / filename.split("/")[-1]
|
||||||
|
logging.info(f"Writting audio : {ofilename}")
|
||||||
|
torchaudio.save(str(ofilename), audio.cpu(), params.sampling_rate)
|
||||||
|
|
||||||
|
logging.info("Generating Done")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
||||||
|
logging.basicConfig(format=formatter, level=logging.INFO)
|
||||||
|
main()
|
1054
egs/libritts/TTS/vocos/train.py
Executable file
File diff suppressed because it is too large
419
egs/libritts/TTS/vocos/tts_datamodule.py
Normal file
@ -0,0 +1,419 @@
|
|||||||
|
# Copyright 2021 Piotr Żelasko
|
||||||
|
# Copyright 2022-2024 Xiaomi Corporation (Authors: Mingshuang Luo,
|
||||||
|
# Zengwei Yao,
|
||||||
|
# Wei Kang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from functools import lru_cache
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
|
||||||
|
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
|
||||||
|
CutConcatenate,
|
||||||
|
CutMix,
|
||||||
|
DynamicBucketingSampler,
|
||||||
|
PrecomputedFeatures,
|
||||||
|
SimpleCutSampler,
|
||||||
|
SpecAugment,
|
||||||
|
SpeechSynthesisDataset,
|
||||||
|
)
|
||||||
|
from lhotse.dataset.input_strategies import ( # noqa F401 For AudioSamples
|
||||||
|
AudioSamples,
|
||||||
|
OnTheFlyFeatures,
|
||||||
|
)
|
||||||
|
from lhotse.utils import fix_random_seed
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
|
from icefall.utils import str2bool
|
||||||
|
|
||||||
|
|
||||||
|
class _SeedWorkers:
|
||||||
|
def __init__(self, seed: int):
|
||||||
|
self.seed = seed
|
||||||
|
|
||||||
|
def __call__(self, worker_id: int):
|
||||||
|
fix_random_seed(self.seed + worker_id)
|
||||||
|
|
||||||
|
|
||||||
|
class LibriTTSDataModule:
|
||||||
|
"""
|
||||||
|
DataModule for tts experiments.
|
||||||
|
It assumes there is always one train and valid dataloader,
|
||||||
|
but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
|
||||||
|
and test-other).
|
||||||
|
|
||||||
|
It contains all the common data pipeline modules used in TTS
experiments, e.g.:
|
||||||
|
- dynamic batch size,
|
||||||
|
- bucketing samplers,
|
||||||
|
- cut concatenation,
|
||||||
|
- on-the-fly feature extraction
|
||||||
|
|
||||||
|
This class should be derived for specific corpora used in TTS tasks.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, args: argparse.Namespace):
|
||||||
|
self.args = args
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def add_arguments(cls, parser: argparse.ArgumentParser):
|
||||||
|
group = parser.add_argument_group(
|
||||||
|
title="TTS data related options",
|
||||||
|
description="These options are used for the preparation of "
|
||||||
|
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
|
||||||
|
"effective batch sizes, sampling strategies, applied data "
|
||||||
|
"augmentations, etc.",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--manifest-dir",
|
||||||
|
type=Path,
|
||||||
|
default=Path("data/fbank"),
|
||||||
|
help="Path to directory with train/valid/test cuts.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--max-duration",
|
||||||
|
type=int,
|
||||||
|
default=200.0,
|
||||||
|
help="Maximum pooled recordings duration (seconds) in a "
|
||||||
|
"single batch. You can reduce it if it causes CUDA OOM.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--bucketing-sampler",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="When enabled, the batches will come from buckets of "
|
||||||
|
"similar duration (saves padding frames).",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--num-buckets",
|
||||||
|
type=int,
|
||||||
|
default=30,
|
||||||
|
help="The number of buckets for the DynamicBucketingSampler"
|
||||||
|
"(you might want to increase it for larger datasets).",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--on-the-fly-feats",
|
||||||
|
type=str2bool,
|
||||||
|
default=False,
|
||||||
|
help="When enabled, use on-the-fly cut mixing and feature "
|
||||||
|
"extraction. Will drop existing precomputed feature manifests "
|
||||||
|
"if available.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--shuffle",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="When enabled (=default), the examples will be "
|
||||||
|
"shuffled for each epoch.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--drop-last",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="Whether to drop last batch. Used by sampler.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--return-cuts",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="When enabled, each batch will have the "
|
||||||
|
"field: batch['cut'] with the cuts that "
|
||||||
|
"were used to construct it.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--return-text",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="Whether to return the text of the audio.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--return-tokens",
|
||||||
|
type=str2bool,
|
||||||
|
default=False,
|
||||||
|
help="Whether the return the tokens of the text of the audio.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--num-workers",
|
||||||
|
type=int,
|
||||||
|
default=4,
|
||||||
|
help="The number of training dataloader workers that "
|
||||||
|
"collect the batches.",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--sampling-rate",
|
||||||
|
type=int,
|
||||||
|
default=24000,
|
||||||
|
help="The sampleing rate of libritts dataset",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--frame-shift",
|
||||||
|
type=int,
|
||||||
|
default=256,
|
||||||
|
help="Frame shift.",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--frame-length",
|
||||||
|
type=int,
|
||||||
|
default=1024,
|
||||||
|
help="Frame shift.",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--input-strategy",
|
||||||
|
type=str,
|
||||||
|
default="PrecomputedFeatures",
|
||||||
|
help="AudioSamples or PrecomputedFeatures",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--use-fft-mag",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="Whether to use magnitude of fbank, false to use power energy.",
|
||||||
|
)
|
||||||
|
|
||||||
|
def train_dataloaders(
|
||||||
|
self,
|
||||||
|
cuts_train: CutSet,
|
||||||
|
sampler_state_dict: Optional[Dict[str, Any]] = None,
|
||||||
|
) -> DataLoader:
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
cuts_train:
|
||||||
|
CutSet for training.
|
||||||
|
sampler_state_dict:
|
||||||
|
The state dict for the training sampler.
|
||||||
|
"""
|
||||||
|
logging.info("About to create train dataset")
|
||||||
|
train = SpeechSynthesisDataset(
|
||||||
|
return_text=self.args.return_text,
|
||||||
|
return_tokens=self.args.return_tokens,
|
||||||
|
feature_input_strategy=eval(self.args.input_strategy)(),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.args.on_the_fly_feats:
|
||||||
|
sampling_rate = self.args.sampling_rate
|
||||||
|
config = FbankConfig(
|
||||||
|
sampling_rate=sampling_rate,
|
||||||
|
frame_length=self.args.frame_length / sampling_rate, # (in second),
|
||||||
|
frame_shift=self.args.frame_shift / sampling_rate, # (in second)
|
||||||
|
use_fft_mag=self.args.use_fft_mag,
|
||||||
|
)
|
||||||
|
train = SpeechSynthesisDataset(
|
||||||
|
return_text=self.args.return_text,
|
||||||
|
return_tokens=self.args.return_tokens,
|
||||||
|
feature_input_strategy=OnTheFlyFeatures(Fbank(config)),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.args.bucketing_sampler:
|
||||||
|
logging.info("Using DynamicBucketingSampler.")
|
||||||
|
train_sampler = DynamicBucketingSampler(
|
||||||
|
cuts_train,
|
||||||
|
max_duration=self.args.max_duration,
|
||||||
|
shuffle=self.args.shuffle,
|
||||||
|
num_buckets=self.args.num_buckets,
|
||||||
|
buffer_size=self.args.num_buckets * 2000,
|
||||||
|
shuffle_buffer_size=self.args.num_buckets * 5000,
|
||||||
|
drop_last=self.args.drop_last,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logging.info("Using SimpleCutSampler.")
|
||||||
|
train_sampler = SimpleCutSampler(
|
||||||
|
cuts_train,
|
||||||
|
max_duration=self.args.max_duration,
|
||||||
|
shuffle=self.args.shuffle,
|
||||||
|
)
|
||||||
|
logging.info("About to create train dataloader")
|
||||||
|
|
||||||
|
if sampler_state_dict is not None:
|
||||||
|
logging.info("Loading sampler state dict")
|
||||||
|
train_sampler.load_state_dict(sampler_state_dict)
|
||||||
|
|
||||||
|
# 'seed' is derived from the current random state, which will have
|
||||||
|
# previously been set in the main process.
|
||||||
|
seed = torch.randint(0, 100000, ()).item()
|
||||||
|
worker_init_fn = _SeedWorkers(seed)
|
||||||
|
|
||||||
|
train_dl = DataLoader(
|
||||||
|
train,
|
||||||
|
sampler=train_sampler,
|
||||||
|
batch_size=None,
|
||||||
|
num_workers=self.args.num_workers,
|
||||||
|
persistent_workers=False,
|
||||||
|
worker_init_fn=worker_init_fn,
|
||||||
|
)
|
||||||
|
|
||||||
|
return train_dl
|
||||||
|
|
||||||
|
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
|
||||||
|
logging.info("About to create dev dataset")
|
||||||
|
if self.args.on_the_fly_feats:
|
||||||
|
sampling_rate = self.args.sampling_rate
|
||||||
|
config = FbankConfig(
|
||||||
|
sampling_rate=sampling_rate,
|
||||||
|
frame_length=self.args.frame_length / sampling_rate, # (in second),
|
||||||
|
frame_shift=self.args.frame_shift / sampling_rate, # (in second)
|
||||||
|
use_fft_mag=self.args.use_fft_mag,
|
||||||
|
)
|
||||||
|
validate = SpeechSynthesisDataset(
|
||||||
|
return_text=self.args.return_text,
|
||||||
|
return_tokens=self.args.return_tokens,
|
||||||
|
feature_input_strategy=OnTheFlyFeatures(Fbank(config)),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
validate = SpeechSynthesisDataset(
|
||||||
|
return_text=self.args.return_text,
|
||||||
|
return_tokens=self.args.return_tokens,
|
||||||
|
feature_input_strategy=eval(self.args.input_strategy)(),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
valid_sampler = DynamicBucketingSampler(
|
||||||
|
cuts_valid,
|
||||||
|
max_duration=self.args.max_duration,
|
||||||
|
num_buckets=self.args.num_buckets,
|
||||||
|
shuffle=False,
|
||||||
|
)
|
||||||
|
logging.info("About to create valid dataloader")
|
||||||
|
valid_dl = DataLoader(
|
||||||
|
validate,
|
||||||
|
sampler=valid_sampler,
|
||||||
|
batch_size=None,
|
||||||
|
num_workers=2,
|
||||||
|
persistent_workers=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
return valid_dl
|
||||||
|
|
||||||
|
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
|
||||||
|
logging.info("About to create test dataset")
|
||||||
|
if self.args.on_the_fly_feats:
|
||||||
|
sampling_rate = self.args.sampling_rate
|
||||||
|
config = FbankConfig(
|
||||||
|
sampling_rate=sampling_rate,
|
||||||
|
frame_length=self.args.frame_length / sampling_rate, # (in second),
|
||||||
|
frame_shift=self.args.frame_shift / sampling_rate, # (in second)
|
||||||
|
use_fft_mag=self.args.use_fft_mag,
|
||||||
|
)
|
||||||
|
test = SpeechSynthesisDataset(
|
||||||
|
return_text=self.args.return_text,
|
||||||
|
return_tokens=self.args.return_tokens,
|
||||||
|
feature_input_strategy=OnTheFlyFeatures(Fbank(config)),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
test = SpeechSynthesisDataset(
|
||||||
|
return_text=self.args.return_text,
|
||||||
|
return_tokens=self.args.return_tokens,
|
||||||
|
feature_input_strategy=eval(self.args.input_strategy)(),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
test_sampler = DynamicBucketingSampler(
|
||||||
|
cuts,
|
||||||
|
max_duration=self.args.max_duration,
|
||||||
|
num_buckets=self.args.num_buckets,
|
||||||
|
shuffle=False,
|
||||||
|
)
|
||||||
|
logging.info("About to create test dataloader")
|
||||||
|
test_dl = DataLoader(
|
||||||
|
test,
|
||||||
|
batch_size=None,
|
||||||
|
sampler=test_sampler,
|
||||||
|
num_workers=self.args.num_workers,
|
||||||
|
)
|
||||||
|
return test_dl
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def train_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get train cuts")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "libritts_cuts_train-all-shuf.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def train_clean_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get train clean cuts")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "libritts_cuts_train-clean-460.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def train_clean_100_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get train clean 100 cuts")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "libritts_cuts_train-clean-100.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def train_clean_360_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get train clean 360 cuts")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "libritts_cuts_train-clean-360.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def dev_clean_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get dev clean cuts")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "libritts_cuts_dev-clean.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def dev_other_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get dev other cuts")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "libritts_cuts_dev-other.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def test_clean_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get test clean cuts")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "libritts_cuts_test-clean.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def test_other_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get test other cuts")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "libritts_cuts_test-other.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def train_cuts_finetune(self) -> CutSet:
|
||||||
|
logging.info("About to get train cuts finetune")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "libritts_cuts_train_finetune.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def valid_cuts_finetune(self) -> CutSet:
|
||||||
|
logging.info("About to get validation cuts finetune")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "libritts_cuts_valid_finetune.jsonl.gz"
|
||||||
|
)
|
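A minimal usage sketch for LibriTTSDataModule, mirroring how the decoding script uses it; it assumes the LibriTTS manifests have already been prepared under data/fbank, and the "features" batch key is an assumption about lhotse's SpeechSynthesisDataset:

import argparse

parser = argparse.ArgumentParser()
LibriTTSDataModule.add_arguments(parser)
args = parser.parse_args([])  # use the defaults, e.g. manifests under data/fbank

libritts = LibriTTSDataModule(args)
test_cuts = libritts.test_clean_cuts()
test_dl = libritts.test_dataloaders(test_cuts)

for batch in test_dl:
    print(batch["features"].shape)  # assumed key; cuts are in batch["cut"] when --return-cuts=True
    break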
282
egs/libritts/TTS/vocos/utils.py
Normal file
@ -0,0 +1,282 @@
|
|||||||
|
import glob
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import matplotlib
|
||||||
|
import math
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from functools import partial
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||||
|
from torch.nn.utils import weight_norm
|
||||||
|
from torch.optim.lr_scheduler import LRScheduler
|
||||||
|
from torch.optim import Optimizer
|
||||||
|
from torch.cuda.amp import GradScaler
|
||||||
|
from lhotse.dataset.sampling.base import CutSampler
|
||||||
|
from torch import Tensor
|
||||||
|
from torch.nn.parallel import DistributedDataParallel as DDP
|
||||||
|
from torch.optim import Optimizer
|
||||||
|
from torch.optim.lr_scheduler import LambdaLR
|
||||||
|
|
||||||
|
|
||||||
|
matplotlib.use("Agg")
|
||||||
|
import matplotlib.pylab as plt
|
||||||
|
|
||||||
|
|
||||||
|
def plot_spectrogram(spectrogram):
|
||||||
|
fig, ax = plt.subplots(figsize=(10, 2))
|
||||||
|
im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
|
||||||
|
plt.colorbar(im, ax=ax)
|
||||||
|
|
||||||
|
fig.canvas.draw()
|
||||||
|
plt.close()
|
||||||
|
|
||||||
|
return fig
|
||||||
|
|
||||||
|
|
||||||
|
def save_checkpoint_with_global_batch_idx(
|
||||||
|
out_dir: Path,
|
||||||
|
global_batch_idx: int,
|
||||||
|
model: Union[nn.Module, DDP],
|
||||||
|
model_avg: Optional[nn.Module] = None,
|
||||||
|
params: Optional[Dict[str, Any]] = None,
|
||||||
|
optimizer_g: Optional[Optimizer] = None,
|
||||||
|
optimizer_d: Optional[Optimizer] = None,
|
||||||
|
scheduler_g: Optional[LRScheduler] = None,
|
||||||
|
scheduler_d: Optional[LRScheduler] = None,
|
||||||
|
scaler: Optional[GradScaler] = None,
|
||||||
|
sampler: Optional[CutSampler] = None,
|
||||||
|
rank: int = 0,
|
||||||
|
):
|
||||||
|
"""Save training info after processing given number of batches.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
out_dir:
|
||||||
|
The directory to save the checkpoint.
|
||||||
|
global_batch_idx:
|
||||||
|
The number of batches processed so far from the very start of the
|
||||||
|
training. The saved checkpoint will have the following filename:
|
||||||
|
|
||||||
|
f'out_dir / checkpoint-{global_batch_idx}.pt'
|
||||||
|
model:
|
||||||
|
The neural network model whose `state_dict` will be saved in the
|
||||||
|
checkpoint.
|
||||||
|
model_avg:
|
||||||
|
The stored model averaged from the start of training.
|
||||||
|
params:
|
||||||
|
A dict of training configurations to be saved.
|
||||||
|
optimizer:
|
||||||
|
The optimizer used in the training. Its `state_dict` will be saved.
|
||||||
|
scheduler:
|
||||||
|
The learning rate scheduler used in the training. Its `state_dict` will
|
||||||
|
be saved.
|
||||||
|
scaler:
|
||||||
|
The scaler used for mix precision training. Its `state_dict` will
|
||||||
|
be saved.
|
||||||
|
sampler:
|
||||||
|
The sampler used in the training dataset.
|
||||||
|
rank:
|
||||||
|
The rank ID used in DDP training of the current node. Set it to 0
|
||||||
|
if DDP is not used.
|
||||||
|
"""
|
||||||
|
out_dir = Path(out_dir)
|
||||||
|
out_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
filename = out_dir / f"checkpoint-{global_batch_idx}.pt"
|
||||||
|
save_checkpoint(
|
||||||
|
filename=filename,
|
||||||
|
model=model,
|
||||||
|
model_avg=model_avg,
|
||||||
|
params=params,
|
||||||
|
optimizer_g=optimizer_g,
|
||||||
|
scheduler_g=scheduler_g,
|
||||||
|
optimizer_d=optimizer_d,
|
||||||
|
scheduler_d=scheduler_d,
|
||||||
|
scaler=scaler,
|
||||||
|
sampler=sampler,
|
||||||
|
rank=rank,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_checkpoint(
|
||||||
|
filename: Path,
|
||||||
|
model: nn.Module,
|
||||||
|
model_avg: Optional[nn.Module] = None,
|
||||||
|
optimizer_g: Optional[Optimizer] = None,
|
||||||
|
optimizer_d: Optional[Optimizer] = None,
|
||||||
|
scheduler_g: Optional[LRScheduler] = None,
|
||||||
|
scheduler_d: Optional[LRScheduler] = None,
|
||||||
|
scaler: Optional[GradScaler] = None,
|
||||||
|
sampler: Optional[CutSampler] = None,
|
||||||
|
strict: bool = False,
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
logging.info(f"Loading checkpoint from {filename}")
|
||||||
|
checkpoint = torch.load(filename, map_location="cpu")
|
||||||
|
|
||||||
|
if next(iter(checkpoint["model"])).startswith("module."):
|
||||||
|
logging.info("Loading checkpoint saved by DDP")
|
||||||
|
|
||||||
|
dst_state_dict = model.state_dict()
|
||||||
|
src_state_dict = checkpoint["model"]
|
||||||
|
for key in dst_state_dict.keys():
|
||||||
|
src_key = "{}.{}".format("module", key)
|
||||||
|
dst_state_dict[key] = src_state_dict.pop(src_key)
|
||||||
|
assert len(src_state_dict) == 0
|
||||||
|
model.load_state_dict(dst_state_dict, strict=strict)
|
||||||
|
else:
|
||||||
|
model.load_state_dict(checkpoint["model"], strict=strict)
|
||||||
|
|
||||||
|
checkpoint.pop("model")
|
||||||
|
|
||||||
|
if model_avg is not None and "model_avg" in checkpoint:
|
||||||
|
logging.info("Loading averaged model")
|
||||||
|
model_avg.load_state_dict(checkpoint["model_avg"], strict=strict)
|
||||||
|
checkpoint.pop("model_avg")
|
||||||
|
|
||||||
|
def load(name, obj):
|
||||||
|
s = checkpoint.get(name, None)
|
||||||
|
if obj and s:
|
||||||
|
obj.load_state_dict(s)
|
||||||
|
checkpoint.pop(name)
|
||||||
|
|
||||||
|
load("optimizer_g", optimizer_g)
|
||||||
|
load("optimizer_d", optimizer_d)
|
||||||
|
load("scheduler_g", scheduler_g)
|
||||||
|
load("scheduler_d", scheduler_d)
|
||||||
|
load("grad_scaler", scaler)
|
||||||
|
load("sampler", sampler)
|
||||||
|
|
||||||
|
return checkpoint
|
||||||
|
|
||||||
|
|
||||||
|
def save_checkpoint(
|
||||||
|
filename: Path,
|
||||||
|
model: Union[nn.Module, DDP],
|
||||||
|
model_avg: Optional[nn.Module] = None,
|
||||||
|
params: Optional[Dict[str, Any]] = None,
|
||||||
|
optimizer_g: Optional[Optimizer] = None,
|
||||||
|
optimizer_d: Optional[Optimizer] = None,
|
||||||
|
scheduler_g: Optional[LRScheduler] = None,
|
||||||
|
scheduler_d: Optional[LRScheduler] = None,
|
||||||
|
scaler: Optional[GradScaler] = None,
|
||||||
|
sampler: Optional[CutSampler] = None,
|
||||||
|
rank: int = 0,
|
||||||
|
) -> None:
|
||||||
|
"""Save training information to a file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filename:
|
||||||
|
The checkpoint filename.
|
||||||
|
model:
|
||||||
|
The model to be saved. We only save its `state_dict()`.
|
||||||
|
model_avg:
|
||||||
|
The stored model averaged from the start of training.
|
||||||
|
params:
|
||||||
|
User defined parameters, e.g., epoch, loss.
|
||||||
|
optimizer:
|
||||||
|
The optimizer to be saved. We only save its `state_dict()`.
|
||||||
|
scheduler:
|
||||||
|
The scheduler to be saved. We only save its `state_dict()`.
|
||||||
|
scaler:
|
||||||
|
The GradScaler to be saved. We only save its `state_dict()`.
|
||||||
|
rank:
|
||||||
|
Used in DDP. We save checkpoint only for the node whose rank is 0.
|
||||||
|
Returns:
|
||||||
|
Return None.
|
||||||
|
"""
|
||||||
|
if rank != 0:
|
||||||
|
return
|
||||||
|
|
||||||
|
logging.info(f"Saving checkpoint to {filename}")
|
||||||
|
|
||||||
|
if isinstance(model, DDP):
|
||||||
|
model = model.module
|
||||||
|
|
||||||
|
checkpoint = {
|
||||||
|
"model": model.state_dict(),
|
||||||
|
"optimizer_g": optimizer_g.state_dict() if optimizer_g is not None else None,
|
||||||
|
"optimizer_d": optimizer_d.state_dict() if optimizer_d is not None else None,
|
||||||
|
"scheduler_g": scheduler_g.state_dict() if scheduler_g is not None else None,
|
||||||
|
"scheduler_d": scheduler_d.state_dict() if scheduler_d is not None else None,
|
||||||
|
"grad_scaler": scaler.state_dict() if scaler is not None else None,
|
||||||
|
"sampler": sampler.state_dict() if sampler is not None else None,
|
||||||
|
}
|
||||||
|
|
||||||
|
if model_avg is not None:
|
||||||
|
checkpoint["model_avg"] = model_avg.to(torch.float32).state_dict()
|
||||||
|
|
||||||
|
if params:
|
||||||
|
for k, v in params.items():
|
||||||
|
assert k not in checkpoint
|
||||||
|
checkpoint[k] = v
|
||||||
|
|
||||||
|
torch.save(checkpoint, filename)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_cosine_schedule_with_warmup_lr_lambda(
|
||||||
|
current_step: int,
|
||||||
|
*,
|
||||||
|
num_warmup_steps: int,
|
||||||
|
num_training_steps: int,
|
||||||
|
num_cycles: float,
|
||||||
|
min_lr_rate: float = 0.0,
|
||||||
|
):
|
||||||
|
if current_step < num_warmup_steps:
|
||||||
|
return float(current_step) / float(max(1, num_warmup_steps))
|
||||||
|
progress = float(current_step - num_warmup_steps) / float(
|
||||||
|
max(1, num_training_steps - num_warmup_steps)
|
||||||
|
)
|
||||||
|
factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
|
||||||
|
factor = factor * (1 - min_lr_rate) + min_lr_rate
|
||||||
|
return max(0, factor)
|
||||||
|
|
||||||
|
|
||||||
|
def get_cosine_schedule_with_warmup(
|
||||||
|
optimizer: Optimizer,
|
||||||
|
num_warmup_steps: int,
|
||||||
|
num_training_steps: int,
|
||||||
|
num_cycles: float = 0.5,
|
||||||
|
last_epoch: int = -1,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Create a schedule with a learning rate that decreases following the values of the cosine function between the
|
||||||
|
initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
|
||||||
|
initial lr set in the optimizer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
optimizer ([`~torch.optim.Optimizer`]):
|
||||||
|
The optimizer for which to schedule the learning rate.
|
||||||
|
num_warmup_steps (`int`):
|
||||||
|
The number of steps for the warmup phase.
|
||||||
|
num_training_steps (`int`):
|
||||||
|
The total number of training steps.
|
||||||
|
num_cycles (`float`, *optional*, defaults to 0.5):
|
||||||
|
The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
|
||||||
|
following a half-cosine).
|
||||||
|
last_epoch (`int`, *optional*, defaults to -1):
|
||||||
|
The index of the last epoch when resuming training.
|
||||||
|
|
||||||
|
Return:
|
||||||
|
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
|
||||||
|
"""
|
||||||
|
|
||||||
|
lr_lambda = partial(
|
||||||
|
_get_cosine_schedule_with_warmup_lr_lambda,
|
||||||
|
num_warmup_steps=num_warmup_steps,
|
||||||
|
num_training_steps=num_training_steps,
|
||||||
|
num_cycles=num_cycles,
|
||||||
|
)
|
||||||
|
return LambdaLR(optimizer, lr_lambda, last_epoch)
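# A minimal usage sketch (illustrative only; the optimizer, warmup length and
# total number of steps below are placeholders):
#
#     params = [torch.nn.Parameter(torch.zeros(1))]
#     optimizer = torch.optim.AdamW(params, lr=2e-4)
#     scheduler = get_cosine_schedule_with_warmup(
#         optimizer, num_warmup_steps=1000, num_training_steps=100000
#     )
#     for step in range(100000):
#         optimizer.step()
#         scheduler.step()  # linear warmup for 1000 steps, then cosine decay towards 0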
|
||||||
|
|
||||||
|
|
||||||
|
def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
|
||||||
|
"""
|
||||||
|
Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
x (Tensor): Input tensor.
|
||||||
|
clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tensor: Element-wise logarithm of the input tensor with clipping applied.
|
||||||
|
"""
|
||||||
|
return torch.log(torch.clip(x, min=clip_val))
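# Example of the clipping behaviour (illustrative only):
#     safe_log(torch.tensor([0.0, 1.0]))
#     # -> tensor([-16.1181,  0.0000]); the zero is clipped to 1e-7 instead of giving -inf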
|
287
egs/ljspeech/TTS/local/evaluate_fsd.py
Normal file
@ -0,0 +1,287 @@
|
|||||||
|
"""
|
||||||
|
Calculate Frechet Speech Distance between two speech directories.
|
||||||
|
Adapted from: https://github.com/gudgud96/frechet-audio-distance/blob/main/frechet_audio_distance/fad.py
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from multiprocessing.dummy import Pool as ThreadPool
|
||||||
|
|
||||||
|
import librosa
|
||||||
|
import numpy as np
|
||||||
|
import soundfile as sf
|
||||||
|
import torch
|
||||||
|
from scipy import linalg
|
||||||
|
from tqdm import tqdm
|
||||||
|
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--real-path", type=str, help="path of the real speech directory"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--eval-path", type=str, help="path of the evaluated speech directory"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--model-path",
|
||||||
|
type=str,
|
||||||
|
default="model/huggingface/wav2vec2_base",
|
||||||
|
help="path of the wav2vec 2.0 model directory",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--real-embds-path",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="path of the real embedding directory",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--eval-embds-path",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="path of the evaluated embedding directory",
|
||||||
|
)
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
class FrechetSpeechDistance:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model_path="resources/wav2vec2_base",
|
||||||
|
pca_dim=128,
|
||||||
|
speech_load_worker=8,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize FSD
|
||||||
|
"""
|
||||||
|
self.sample_rate = 16000
|
||||||
|
self.channels = 1
|
||||||
|
self.device = (
|
||||||
|
torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
||||||
|
)
|
||||||
|
logging.info("[Frechet Speech Distance] Using device: {}".format(self.device))
|
||||||
|
self.speech_load_worker = speech_load_worker
|
||||||
|
self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)
|
||||||
|
self.model = Wav2Vec2Model.from_pretrained(model_path)
|
||||||
|
self.model.to(self.device)
|
||||||
|
self.model.eval()
|
||||||
|
self.pca_dim = pca_dim
|
||||||
|
|
||||||
|
def load_speech_files(self, dir, dtype="float32"):
|
||||||
|
def _load_speech_task(fname, sample_rate, channels, dtype="float32"):
|
||||||
|
if dtype not in ["float64", "float32", "int32", "int16"]:
|
||||||
|
raise ValueError(f"dtype not supported: {dtype}")
|
||||||
|
|
||||||
|
wav_data, sr = sf.read(fname, dtype=dtype)
|
||||||
|
# For integer type PCM input, convert to [-1.0, +1.0]
|
||||||
|
if dtype == "int16":
|
||||||
|
wav_data = wav_data / 32768.0
|
||||||
|
elif dtype == "int32":
|
||||||
|
wav_data = wav_data / float(2**31)
|
||||||
|
|
||||||
|
# Convert to mono
|
||||||
|
assert channels in [1, 2], "channels must be 1 or 2"
|
||||||
|
if len(wav_data.shape) > channels:
|
||||||
|
wav_data = np.mean(wav_data, axis=1)
|
||||||
|
|
||||||
|
if sr != sample_rate:
|
||||||
|
wav_data = (
|
||||||
|
librosa.resample(wav_data, orig_sr=sr, target_sr=sample_rate),
|
||||||
|
)
|
||||||
|
|
||||||
|
return wav_data
|
||||||
|
|
||||||
|
task_results = []
|
||||||
|
|
||||||
|
pool = ThreadPool(self.speech_load_worker)
|
||||||
|
|
||||||
|
logging.info("[Frechet Speech Distance] Loading speech from {}...".format(dir))
|
||||||
|
for fname in os.listdir(dir):
|
||||||
|
res = pool.apply_async(
|
||||||
|
_load_speech_task,
|
||||||
|
args=(os.path.join(dir, fname), self.sample_rate, self.channels, dtype),
|
||||||
|
)
|
||||||
|
task_results.append(res)
|
||||||
|
pool.close()
|
||||||
|
pool.join()
|
||||||
|
|
||||||
|
return [k.get() for k in task_results]
|
||||||
|
|
||||||
|
def get_embeddings(self, x):
|
||||||
|
"""
|
||||||
|
Get embeddings
|
||||||
|
Params:
|
||||||
|
-- x : a list of np.ndarray speech samples
|
||||||
|
Note: samples are assumed to be at self.sample_rate (16 kHz).
|
||||||
|
"""
|
||||||
|
embd_lst = []
|
||||||
|
try:
|
||||||
|
for speech in tqdm(x):
|
||||||
|
input_features = self.feature_extractor(
|
||||||
|
speech, sampling_rate=self.sample_rate, return_tensors="pt"
|
||||||
|
).input_values.to(self.device)
|
||||||
|
with torch.no_grad():
|
||||||
|
embd = self.model(input_features).last_hidden_state.mean(1)
|
||||||
|
|
||||||
|
if embd.device != torch.device("cpu"):
|
||||||
|
embd = embd.cpu()
|
||||||
|
|
||||||
|
if torch.is_tensor(embd):
|
||||||
|
embd = embd.detach().numpy()
|
||||||
|
|
||||||
|
embd_lst.append(embd)
|
||||||
|
except Exception as e:
|
||||||
|
print(
|
||||||
|
"[Frechet Speech Distance] get_embeddings throw an exception: {}".format(
|
||||||
|
str(e)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
return np.concatenate(embd_lst, axis=0)
|
||||||
|
|
||||||
|
def calculate_embd_statistics(self, embd_lst):
|
||||||
|
if isinstance(embd_lst, list):
|
||||||
|
embd_lst = np.array(embd_lst)
|
||||||
|
mu = np.mean(embd_lst, axis=0)
|
||||||
|
sigma = np.cov(embd_lst, rowvar=False)
|
||||||
|
return mu, sigma
|
||||||
|
|
||||||
|
def calculate_frechet_distance(self, mu1, sigma1, mu2, sigma2, eps=1e-6):
|
||||||
|
"""
|
||||||
|
Adapted from: https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/fid_score.py
|
||||||
|
|
||||||
|
Numpy implementation of the Frechet Distance.
|
||||||
|
The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
|
||||||
|
and X_2 ~ N(mu_2, C_2) is
|
||||||
|
d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
|
||||||
|
Stable version by Dougal J. Sutherland.
|
||||||
|
Params:
|
||||||
|
-- mu1 : Numpy array containing the activations of a layer of the
|
||||||
|
inception net (like returned by the function 'get_predictions')
|
||||||
|
for generated samples.
|
||||||
|
-- mu2 : The sample mean over activations, precalculated on an
|
||||||
|
representative data set.
|
||||||
|
-- sigma1: The covariance matrix over activations for generated samples.
|
||||||
|
-- sigma2: The covariance matrix over activations, precalculated on an
|
||||||
|
representative data set.
|
||||||
|
Returns:
|
||||||
|
-- : The Frechet Distance.
|
||||||
|
"""
|
||||||
|
|
||||||
|
mu1 = np.atleast_1d(mu1)
|
||||||
|
mu2 = np.atleast_1d(mu2)
|
||||||
|
|
||||||
|
sigma1 = np.atleast_2d(sigma1)
|
||||||
|
sigma2 = np.atleast_2d(sigma2)
|
||||||
|
|
||||||
|
assert (
|
||||||
|
mu1.shape == mu2.shape
|
||||||
|
), "Training and test mean vectors have different lengths"
|
||||||
|
assert (
|
||||||
|
sigma1.shape == sigma2.shape
|
||||||
|
), "Training and test covariances have different dimensions"
|
||||||
|
|
||||||
|
diff = mu1 - mu2
|
||||||
|
|
||||||
|
# Product might be almost singular
|
||||||
|
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2).astype(complex), disp=False)
|
||||||
|
if not np.isfinite(covmean).all():
|
||||||
|
msg = (
|
||||||
|
"fid calculation produces singular product; "
|
||||||
|
"adding %s to diagonal of cov estimates"
|
||||||
|
) % eps
|
||||||
|
logging.info(msg)
|
||||||
|
offset = np.eye(sigma1.shape[0]) * eps
|
||||||
|
covmean = linalg.sqrtm(
|
||||||
|
(sigma1 + offset).dot(sigma2 + offset).astype(complex)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Numerical error might give slight imaginary component
|
||||||
|
if np.iscomplexobj(covmean):
|
||||||
|
if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
|
||||||
|
m = np.max(np.abs(covmean.imag))
|
||||||
|
raise ValueError("Imaginary component {}".format(m))
|
||||||
|
covmean = covmean.real
|
||||||
|
|
||||||
|
tr_covmean = np.trace(covmean)
|
||||||
|
|
||||||
|
return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean
|
||||||
|
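As a quick sanity check of the formula above, the distance between two identical Gaussians is zero. A minimal standalone sketch (not part of the recipe; dimensions are arbitrary):

import numpy as np
from scipy import linalg

mu = np.zeros(4)
sigma = np.eye(4)

# d^2 = ||mu1 - mu2||^2 + Tr(C1 + C2 - 2*sqrt(C1*C2))
covmean, _ = linalg.sqrtm(sigma.dot(sigma).astype(complex), disp=False)
d2 = np.sum((mu - mu) ** 2) + np.trace(sigma) + np.trace(sigma) - 2 * np.trace(covmean.real)
print(d2)  # ~0.0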
|
||||||
|
def score(
|
||||||
|
self,
|
||||||
|
real_path,
|
||||||
|
eval_path,
|
||||||
|
real_embds_path=None,
|
||||||
|
eval_embds_path=None,
|
||||||
|
dtype="float32",
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Computes the Frechet Speech Distance (FSD) between two directories of speech files.
|
||||||
|
|
||||||
|
Parameters:
|
||||||
|
- real_path (str): Path to the directory containing real speech files.
|
||||||
|
- eval_path (str): Path to the directory containing evaluation speech files.
|
||||||
|
- real_embds_path (str, optional): Path to save/load real speech embeddings (e.g., /folder/bkg_embs.npy). If None, embeddings won't be saved.
|
||||||
|
- eval_embds_path (str, optional): Path to save/load evaluation speech embeddings (e.g., /folder/test_embs.npy). If None, embeddings won't be saved.
|
||||||
|
- dtype (str, optional): Data type for loading speech. Default is "float32".
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
- float: The Frechet Speech Distance (FSD) score between the two directories of speech files.
|
||||||
|
"""
|
||||||
|
# Load or compute real embeddings
|
||||||
|
if real_embds_path is not None and os.path.exists(real_embds_path):
|
||||||
|
logging.info(
|
||||||
|
f"[Frechet Speech Distance] Loading embeddings from {real_embds_path}..."
|
||||||
|
)
|
||||||
|
embds_real = np.load(real_embds_path)
|
||||||
|
else:
|
||||||
|
speech_real = self.load_speech_files(real_path, dtype=dtype)
|
||||||
|
embds_real = self.get_embeddings(speech_real)
|
||||||
|
if real_embds_path:
|
||||||
|
os.makedirs(os.path.dirname(real_embds_path), exist_ok=True)
|
||||||
|
np.save(real_embds_path, embds_real)
|
||||||
|
|
||||||
|
# Load or compute eval embeddings
|
||||||
|
if eval_embds_path is not None and os.path.exists(eval_embds_path):
|
||||||
|
logging.info(
|
||||||
|
f"[Frechet Speech Distance] Loading embeddings from {eval_embds_path}..."
|
||||||
|
)
|
||||||
|
embds_eval = np.load(eval_embds_path)
|
||||||
|
else:
|
||||||
|
speech_eval = self.load_speech_files(eval_path, dtype=dtype)
|
||||||
|
embds_eval = self.get_embeddings(speech_eval)
|
||||||
|
if eval_embds_path:
|
||||||
|
os.makedirs(os.path.dirname(eval_embds_path), exist_ok=True)
|
||||||
|
np.save(eval_embds_path, embds_eval)
|
||||||
|
|
||||||
|
# Check if embeddings are empty
|
||||||
|
if len(embds_real) == 0:
|
||||||
|
logging.info("[Frechet Speech Distance] real set dir is empty, exiting...")
|
||||||
|
return -1
|
||||||
|
if len(embds_eval) == 0:
|
||||||
|
logging.info("[Frechet Speech Distance] eval set dir is empty, exiting...")
|
||||||
|
return -1
|
||||||
|
|
||||||
|
# Compute statistics and FSD score
|
||||||
|
mu_real, sigma_real = self.calculate_embd_statistics(embds_real)
|
||||||
|
mu_eval, sigma_eval = self.calculate_embd_statistics(embds_eval)
|
||||||
|
|
||||||
|
fsd_score = self.calculate_frechet_distance(
|
||||||
|
mu_real, sigma_real, mu_eval, sigma_eval
|
||||||
|
)
|
||||||
|
|
||||||
|
return fsd_score
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = get_parser()
|
||||||
|
args = parser.parse_args()
|
||||||
|
FSD = FrechetSpeechDistance(model_path=args.model_path)
|
||||||
|
score = FSD.score(
|
||||||
|
args.real_path, args.eval_path, args.real_embds_path, args.eval_embds_path
|
||||||
|
)
|
||||||
|
logging.info(f"FSD score: {score:.2f}")
|
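A minimal usage sketch of the FrechetSpeechDistance class defined above; the model path and wav directories are illustrative placeholders:

from evaluate_fsd import FrechetSpeechDistance

fsd = FrechetSpeechDistance(model_path="model/huggingface/wav2vec2_base")
score = fsd.score(
    real_path="data/wavs/real",            # directory of ground-truth wavs
    eval_path="exp/generated_wavs",        # directory of synthesized wavs
    real_embds_path="exp/embds/real.npy",  # optional embedding caches
    eval_embds_path="exp/embds/eval.npy",
)
print(f"FSD: {score:.2f}")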
139
egs/ljspeech/TTS/local/evaluate_wer_whisper.py
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
"""
|
||||||
|
Calculate WER with Whisper model
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
|
import librosa
|
||||||
|
import soundfile as sf
|
||||||
|
import torch
|
||||||
|
from num2words import num2words
|
||||||
|
from tqdm import tqdm
|
||||||
|
from transformers import pipeline
|
||||||
|
|
||||||
|
from icefall.utils import store_transcripts, write_error_stats
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser():
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
|
parser.add_argument("--wav-path", type=str, help="path of the speech directory")
|
||||||
|
parser.add_argument("--decode-path", type=str, help="path of the speech directory")
|
||||||
|
parser.add_argument(
|
||||||
|
"--model-path",
|
||||||
|
type=str,
|
||||||
|
default="model/huggingface/whisper_medium",
|
||||||
|
help="path of the huggingface whisper model",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--transcript-path",
|
||||||
|
type=str,
|
||||||
|
default="data/transcript/test.tsv",
|
||||||
|
help="path of the transcript tsv file",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--batch-size", type=int, default=64, help="decoding batch size"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--device", type=str, default="cuda:0", help="decoding device, cuda:0 or cpu"
|
||||||
|
)
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def post_process(text: str):
|
||||||
|
def convert_numbers(match):
|
||||||
|
return num2words(match.group())
|
||||||
|
|
||||||
|
text = re.sub(r"\b\d{1,2}\b", convert_numbers, text)
|
||||||
|
text = re.sub(r"[^a-zA-Z0-9']", " ", text.lower())
|
||||||
|
text = re.sub(r"\s+", " ", text)
|
||||||
|
return text
|
||||||
|
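For illustration, post_process lower-cases the text, spells out one- and two-digit numbers and strips punctuation; a hypothetical input/output pair:

# post_process("I have 2 cats, OK?")  ->  roughly "i have two cats ok"
print(post_process("I have 2 cats, OK?"))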
|
||||||
|
|
||||||
|
def save_results(
|
||||||
|
res_dir: str,
|
||||||
|
results: List[Tuple[str, List[str], List[str]]],
|
||||||
|
):
|
||||||
|
if not os.path.exists(res_dir):
|
||||||
|
os.makedirs(res_dir)
|
||||||
|
recog_path = os.path.join(res_dir, "recogs.txt")
|
||||||
|
results = sorted(results)
|
||||||
|
store_transcripts(filename=recog_path, texts=results)
|
||||||
|
logging.info(f"The transcripts are stored in {recog_path}")
|
||||||
|
|
||||||
|
errs_filename = os.path.join(res_dir, "errs.txt")
|
||||||
|
with open(errs_filename, "w") as f:
|
||||||
|
_ = write_error_stats(f, "test", results, enable_log=True)
|
||||||
|
logging.info("Wrote detailed error stats to {}".format(errs_filename))
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechEvalDataset(torch.utils.data.Dataset):
|
||||||
|
def __init__(self, wav_path: str, transcript_path: str):
|
||||||
|
super().__init__()
|
||||||
|
self.audio_name = []
|
||||||
|
self.audio_paths = []
|
||||||
|
self.transcripts = []
|
||||||
|
with Path(transcript_path).open("r", encoding="utf8") as f:
|
||||||
|
meta = [item.split("\t") for item in f.read().rstrip().split("\n")]
|
||||||
|
for item in meta:
|
||||||
|
self.audio_name.append(item[0])
|
||||||
|
self.audio_paths.append(Path(wav_path, item[0] + ".wav"))
|
||||||
|
self.transcripts.append(item[1])
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self.audio_paths)
|
||||||
|
|
||||||
|
def __getitem__(self, index: int):
|
||||||
|
audio, sampling_rate = sf.read(self.audio_paths[index])
|
||||||
|
item = {
|
||||||
|
"array": librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000),
|
||||||
|
"sampling_rate": 16000,
|
||||||
|
"reference": self.transcripts[index],
|
||||||
|
"audio_name": self.audio_name[index],
|
||||||
|
}
|
||||||
|
return item
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
|
||||||
|
|
||||||
|
batch_size = args.batch_size
|
||||||
|
|
||||||
|
pipe = pipeline(
|
||||||
|
"automatic-speech-recognition",
|
||||||
|
model=args.model_path,
|
||||||
|
device=args.device,
|
||||||
|
tokenizer=args.model_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
dataset = SpeechEvalDataset(args.wav_path, args.transcript_path)
|
||||||
|
|
||||||
|
results = []
|
||||||
|
bar = tqdm(
|
||||||
|
pipe(
|
||||||
|
dataset,
|
||||||
|
generate_kwargs={"language": "english", "task": "transcribe"},
|
||||||
|
batch_size=batch_size,
|
||||||
|
),
|
||||||
|
total=len(dataset),
|
||||||
|
)
|
||||||
|
for out in bar:
|
||||||
|
results.append(
|
||||||
|
(
|
||||||
|
out["audio_name"][0],
|
||||||
|
post_process(out["reference"][0].strip()).split(),
|
||||||
|
post_process(out["text"].strip()).split(),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
save_results(args.decode_path, results)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = get_parser()
|
||||||
|
args = parser.parse_args()
|
||||||
|
main(args)
|
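A sketch of invoking the evaluation above programmatically instead of from the shell; the model and data paths are illustrative placeholders:

parser = get_parser()
args = parser.parse_args(
    [
        "--wav-path", "exp/generated_wavs",
        "--decode-path", "exp/decode_results",
        "--model-path", "model/huggingface/whisper_medium",
        "--transcript-path", "data/transcript/test.tsv",
        "--batch-size", "16",
        "--device", "cuda:0",
    ]
)
main(args)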
1
egs/ljspeech/TTS/vocos/discriminators.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../libritts/TTS/vocos/discriminators.py
|
1
egs/ljspeech/TTS/vocos/export-onnx.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../libritts/TTS/vocos/export-onnx.py
|
1
egs/ljspeech/TTS/vocos/export.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../libritts/TTS/vocos/export.py
|
1
egs/ljspeech/TTS/vocos/generator.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../libritts/TTS/vocos/generator.py
|
340
egs/ljspeech/TTS/vocos/infer.py
Executable file
@ -0,0 +1,340 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# Copyright 2024 Xiaomi Corp. (authors: Wei Kang
|
||||||
|
# Han Zhu)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
from functools import partial
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from lhotse.utils import fix_random_seed
|
||||||
|
from scipy.io.wavfile import write
|
||||||
|
from train import add_model_arguments, get_model, get_params
|
||||||
|
from tts_datamodule import LJSpeechTtsDataModule
|
||||||
|
|
||||||
|
from icefall.checkpoint import (
|
||||||
|
average_checkpoints,
|
||||||
|
average_checkpoints_with_averaged_model,
|
||||||
|
find_checkpoints,
|
||||||
|
load_checkpoint,
|
||||||
|
)
|
||||||
|
from icefall.utils import AttributeDict, setup_logger, str2bool
|
||||||
|
|
||||||
|
LOG_EPS = math.log(1e-10)
|
||||||
|
|
||||||
|
|
||||||
|
def get_parser():
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--epoch",
|
||||||
|
type=int,
|
||||||
|
default=100,
|
||||||
|
help="""It specifies the checkpoint to use for decoding.
|
||||||
|
Note: Epoch counts from 1.
|
||||||
|
You can specify --avg to use more checkpoints for model averaging.""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--iter",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="""If positive, --epoch is ignored and it
|
||||||
|
will use the checkpoint exp_dir/checkpoint-iter.pt.
|
||||||
|
You can specify --avg to use more checkpoints for model averaging.
|
||||||
|
""",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--avg",
|
||||||
|
type=int,
|
||||||
|
default=10,
|
||||||
|
help="Number of checkpoints to average. Automatically select "
|
||||||
|
"consecutive checkpoints before the checkpoint specified by "
|
||||||
|
"'--epoch' and '--iter'",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--use-averaged-model",
|
||||||
|
type=str2bool,
|
||||||
|
default=False,
|
||||||
|
help="Whether to load averaged model. Currently it only supports "
|
||||||
|
"using --epoch. If True, it would decode with the averaged model "
|
||||||
|
"over the epoch range from `epoch-avg` (excluded) to `epoch`."
|
||||||
|
"Actually only the models with epoch number of `epoch-avg` and "
|
||||||
|
"`epoch` are loaded for averaging. ",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--exp-dir",
|
||||||
|
type=str,
|
||||||
|
default="flow_match/exp",
|
||||||
|
help="The experiment dir",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--generate-dir",
|
||||||
|
type=str,
|
||||||
|
default="generated_wavs",
|
||||||
|
help="Path name of the generated wavs",
|
||||||
|
)
|
||||||
|
|
||||||
|
add_model_arguments(parser)
|
||||||
|
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
def decode_one_batch(
|
||||||
|
params: AttributeDict,
|
||||||
|
model: nn.Module,
|
||||||
|
batch: dict,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
params:
|
||||||
|
It's the return value of :func:`get_params`.
|
||||||
|
model:
|
||||||
|
The vocoder model that converts acoustic features to waveforms.
|
||||||
|
batch:
|
||||||
|
It is the return value from iterating
|
||||||
|
`lhotse.dataset.SpeechSynthesisDataset`. See its documentation
|
||||||
|
for the format of the `batch`.
|
||||||
|
Returns:
|
||||||
|
Nothing is returned; the generated waveforms are written as wav files to
|
||||||
|
`{params.res_dir}/{params.suffix}`.
|
||||||
|
"""
|
||||||
|
device = next(model.parameters()).device
|
||||||
|
|
||||||
|
cut_ids = [cut.id for cut in batch["cut"]]
|
||||||
|
|
||||||
|
features = batch["features"] # (B, T, F)
|
||||||
|
utt_durations = batch["features_lens"]
|
||||||
|
|
||||||
|
x = features.permute(0, 2, 1) # (B, F, T)
|
||||||
|
|
||||||
|
audios = model(x.to(device)) # (B, T)
|
||||||
|
|
||||||
|
wav_dir = f"{params.res_dir}/{params.suffix}"
|
||||||
|
os.makedirs(wav_dir, exist_ok=True)
|
||||||
|
|
||||||
|
for i in range(audios.shape[0]):
|
||||||
|
audio = audios[i][: (utt_durations[i] - 1) * 256 + 1024]
|
||||||
|
audio = audio.cpu().squeeze().numpy()
|
||||||
|
write(f"{wav_dir}/{cut_ids[i]}.wav", 22050, audio)
|
||||||
|
|
||||||
|
|
||||||
|
def decode_dataset(
|
||||||
|
dl: torch.utils.data.DataLoader,
|
||||||
|
params: AttributeDict,
|
||||||
|
model: nn.Module,
|
||||||
|
test_set: str,
|
||||||
|
):
|
||||||
|
"""Decode dataset.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
dl:
|
||||||
|
PyTorch's dataloader containing the dataset to decode.
|
||||||
|
params:
|
||||||
|
It is returned by :func:`get_params`.
|
||||||
|
model:
|
||||||
|
The vocoder model that converts acoustic features to waveforms.
|
||||||
|
test_set:
|
||||||
|
The name of the test_set
|
||||||
|
"""
|
||||||
|
num_cuts = 0
|
||||||
|
|
||||||
|
try:
|
||||||
|
num_batches = len(dl)
|
||||||
|
except TypeError:
|
||||||
|
num_batches = "?"
|
||||||
|
|
||||||
|
with open(f"{params.res_dir}/{test_set}.scp", "w", encoding="utf8") as f:
|
||||||
|
for batch_idx, batch in enumerate(dl):
|
||||||
|
texts = batch["text"]
|
||||||
|
cut_ids = [cut.id for cut in batch["cut"]]
|
||||||
|
|
||||||
|
decode_one_batch(
|
||||||
|
params=params,
|
||||||
|
model=model,
|
||||||
|
batch=batch,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(texts) == len(cut_ids), (len(texts), len(cut_ids))
|
||||||
|
|
||||||
|
for i in range(len(texts)):
|
||||||
|
f.write(f"{cut_ids[i]}\t{texts[i]}\n")
|
||||||
|
|
||||||
|
num_cuts += len(texts)
|
||||||
|
|
||||||
|
if batch_idx % 50 == 0:
|
||||||
|
batch_str = f"{batch_idx}/{num_batches}"
|
||||||
|
|
||||||
|
logging.info(
|
||||||
|
f"batch {batch_str}, cuts processed until now is {num_cuts}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@torch.no_grad()
|
||||||
|
def main():
|
||||||
|
parser = get_parser()
|
||||||
|
LJSpeechTtsDataModule.add_arguments(parser)
|
||||||
|
args = parser.parse_args()
|
||||||
|
args.exp_dir = Path(args.exp_dir)
|
||||||
|
|
||||||
|
params = get_params()
|
||||||
|
params.update(vars(args))
|
||||||
|
|
||||||
|
params.res_dir = params.exp_dir / params.generate_dir
|
||||||
|
|
||||||
|
if params.iter > 0:
|
||||||
|
params.suffix = f"iter-{params.iter}-avg-{params.avg}"
|
||||||
|
else:
|
||||||
|
params.suffix = f"epoch-{params.epoch}-avg-{params.avg}"
|
||||||
|
|
||||||
|
if params.use_averaged_model:
|
||||||
|
params.suffix += "-use-averaged-model"
|
||||||
|
|
||||||
|
setup_logger(f"{params.res_dir}/log-decode-{params.suffix}")
|
||||||
|
logging.info("Decoding started")
|
||||||
|
|
||||||
|
device = torch.device("cpu")
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
device = torch.device("cuda", 0)
|
||||||
|
params.device = device
|
||||||
|
|
||||||
|
logging.info(f"Device: {device}")
|
||||||
|
|
||||||
|
logging.info(params)
|
||||||
|
fix_random_seed(666)
|
||||||
|
|
||||||
|
logging.info("About to create model")
|
||||||
|
model = get_model(params)
|
||||||
|
|
||||||
|
if not params.use_averaged_model:
|
||||||
|
if params.iter > 0:
|
||||||
|
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
|
||||||
|
: params.avg
|
||||||
|
]
|
||||||
|
if len(filenames) == 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"No checkpoints found for"
|
||||||
|
f" --iter {params.iter}, --avg {params.avg}"
|
||||||
|
)
|
||||||
|
elif len(filenames) < params.avg:
|
||||||
|
raise ValueError(
|
||||||
|
f"Not enough checkpoints ({len(filenames)}) found for"
|
||||||
|
f" --iter {params.iter}, --avg {params.avg}"
|
||||||
|
)
|
||||||
|
logging.info(f"averaging {filenames}")
|
||||||
|
model.to(device)
|
||||||
|
model.load_state_dict(average_checkpoints(filenames, device=device))
|
||||||
|
elif params.avg == 1:
|
||||||
|
load_checkpoint(f"{params.exp_dir}/epoch-{params.epoch}.pt", model)
|
||||||
|
else:
|
||||||
|
start = params.epoch - params.avg + 1
|
||||||
|
filenames = []
|
||||||
|
for i in range(start, params.epoch + 1):
|
||||||
|
if i >= 1:
|
||||||
|
filenames.append(f"{params.exp_dir}/epoch-{i}.pt")
|
||||||
|
logging.info(f"averaging {filenames}")
|
||||||
|
model.to(device)
|
||||||
|
model.load_state_dict(average_checkpoints(filenames, device=device))
|
||||||
|
else:
|
||||||
|
if params.iter > 0:
|
||||||
|
filenames = find_checkpoints(params.exp_dir, iteration=-params.iter)[
|
||||||
|
: params.avg + 1
|
||||||
|
]
|
||||||
|
if len(filenames) == 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"No checkpoints found for"
|
||||||
|
f" --iter {params.iter}, --avg {params.avg}"
|
||||||
|
)
|
||||||
|
elif len(filenames) < params.avg + 1:
|
||||||
|
raise ValueError(
|
||||||
|
f"Not enough checkpoints ({len(filenames)}) found for"
|
||||||
|
f" --iter {params.iter}, --avg {params.avg}"
|
||||||
|
)
|
||||||
|
filename_start = filenames[-1]
|
||||||
|
filename_end = filenames[0]
|
||||||
|
logging.info(
|
||||||
|
"Calculating the averaged model over iteration checkpoints"
|
||||||
|
f" from {filename_start} (excluded) to {filename_end}"
|
||||||
|
)
|
||||||
|
model.to(device)
|
||||||
|
model.load_state_dict(
|
||||||
|
average_checkpoints_with_averaged_model(
|
||||||
|
filename_start=filename_start,
|
||||||
|
filename_end=filename_end,
|
||||||
|
device=device,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
assert params.avg > 0, params.avg
|
||||||
|
start = params.epoch - params.avg
|
||||||
|
assert start >= 1, start
|
||||||
|
filename_start = f"{params.exp_dir}/epoch-{start}.pt"
|
||||||
|
filename_end = f"{params.exp_dir}/epoch-{params.epoch}.pt"
|
||||||
|
logging.info(
|
||||||
|
f"Calculating the averaged model over epoch range from "
|
||||||
|
f"{start} (excluded) to {params.epoch}"
|
||||||
|
)
|
||||||
|
model.to(device)
|
||||||
|
model.load_state_dict(
|
||||||
|
average_checkpoints_with_averaged_model(
|
||||||
|
filename_start=filename_start,
|
||||||
|
filename_end=filename_end,
|
||||||
|
device=device,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
model = model.to(device)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
num_param = sum([p.numel() for p in model.parameters()])
|
||||||
|
logging.info(f"Number of model parameters: {num_param}")
|
||||||
|
|
||||||
|
# we need cut ids to display recognition results.
|
||||||
|
args.return_cuts = True
|
||||||
|
ljspeech = LJSpeechTtsDataModule(args)
|
||||||
|
|
||||||
|
test_cuts = ljspeech.test_cuts()
|
||||||
|
|
||||||
|
test_dl = ljspeech.test_dataloaders(test_cuts)
|
||||||
|
|
||||||
|
test_sets = ["test"]
|
||||||
|
test_dls = [test_dl]
|
||||||
|
|
||||||
|
for test_set, test_dl in zip(test_sets, test_dls):
|
||||||
|
decode_dataset(
|
||||||
|
dl=test_dl,
|
||||||
|
params=params,
|
||||||
|
model=model,
|
||||||
|
test_set=test_set,
|
||||||
|
)
|
||||||
|
|
||||||
|
logging.info("Done!")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
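To recap the shape convention used in decode_one_batch above, a minimal sketch of the vocoder call with dummy tensors (the 256/1024 values correspond to the default frame shift and frame length of this recipe):

import torch

B, T, F = 2, 100, 80                 # batch size, frames, feature bins (dummy values)
features = torch.randn(B, T, F)      # batch["features"]
x = features.permute(0, 2, 1)        # (B, F, T), as the generator expects
# audios = model(x)                  # (B, num_samples); for each utterance the output
# is trimmed to (num_frames - 1) * 256 + 1024 samples before writing the wav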
1
egs/ljspeech/TTS/vocos/loss.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../libritts/TTS/vocos/loss.py
|
1
egs/ljspeech/TTS/vocos/model.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../libritts/TTS/vocos/model.py
|
1
egs/ljspeech/TTS/vocos/onnx_pretrained.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../libritts/TTS/vocos/onnx_pretrained.py
|
1
egs/ljspeech/TTS/vocos/pretrained.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../libritts/TTS/vocos/pretrained.py
|
1054
egs/ljspeech/TTS/vocos/train.py
Executable file
File diff suppressed because it is too large
372
egs/ljspeech/TTS/vocos/tts_datamodule.py
Normal file
@ -0,0 +1,372 @@
|
|||||||
|
# Copyright 2021 Piotr Żelasko
|
||||||
|
# Copyright 2022-2024 Xiaomi Corporation (Authors: Mingshuang Luo,
|
||||||
|
# Zengwei Yao,
|
||||||
|
# Wei Kang)
|
||||||
|
#
|
||||||
|
# See ../../../../LICENSE for clarification regarding multiple authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
from functools import lru_cache
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from lhotse import CutSet, Fbank, FbankConfig, load_manifest_lazy
|
||||||
|
from lhotse.dataset import ( # noqa F401 for PrecomputedFeatures
|
||||||
|
CutConcatenate,
|
||||||
|
CutMix,
|
||||||
|
DynamicBucketingSampler,
|
||||||
|
PrecomputedFeatures,
|
||||||
|
SimpleCutSampler,
|
||||||
|
SpecAugment,
|
||||||
|
SpeechSynthesisDataset,
|
||||||
|
)
|
||||||
|
from lhotse.dataset.input_strategies import ( # noqa F401 For AudioSamples
|
||||||
|
AudioSamples,
|
||||||
|
OnTheFlyFeatures,
|
||||||
|
)
|
||||||
|
from lhotse.utils import fix_random_seed
|
||||||
|
from torch.utils.data import DataLoader
|
||||||
|
|
||||||
|
from icefall.utils import str2bool
|
||||||
|
|
||||||
|
|
||||||
|
class _SeedWorkers:
|
||||||
|
def __init__(self, seed: int):
|
||||||
|
self.seed = seed
|
||||||
|
|
||||||
|
def __call__(self, worker_id: int):
|
||||||
|
fix_random_seed(self.seed + worker_id)
|
||||||
|
|
||||||
|
|
||||||
|
class LJSpeechTtsDataModule:
|
||||||
|
"""
|
||||||
|
DataModule for tts experiments.
|
||||||
|
It assumes there is always one train and valid dataloader,
|
||||||
|
but there can be multiple test dataloaders (e.g. LibriSpeech test-clean
|
||||||
|
and test-other).
|
||||||
|
|
||||||
|
It contains all the common data pipeline modules used in TTS
|
||||||
|
experiments, e.g.:
|
||||||
|
- dynamic batch size,
|
||||||
|
- bucketing samplers,
|
||||||
|
- cut concatenation,
|
||||||
|
- on-the-fly feature extraction
|
||||||
|
|
||||||
|
This class should be derived for specific corpora used in TTS tasks.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, args: argparse.Namespace):
|
||||||
|
self.args = args
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def add_arguments(cls, parser: argparse.ArgumentParser):
|
||||||
|
group = parser.add_argument_group(
|
||||||
|
title="TTS data related options",
|
||||||
|
description="These options are used for the preparation of "
|
||||||
|
"PyTorch DataLoaders from Lhotse CutSet's -- they control the "
|
||||||
|
"effective batch sizes, sampling strategies, applied data "
|
||||||
|
"augmentations, etc.",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--manifest-dir",
|
||||||
|
type=Path,
|
||||||
|
default=Path("data/fbank"),
|
||||||
|
help="Path to directory with train/valid/test cuts.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--max-duration",
|
||||||
|
type=int,
|
||||||
|
default=200.0,
|
||||||
|
help="Maximum pooled recordings duration (seconds) in a "
|
||||||
|
"single batch. You can reduce it if it causes CUDA OOM.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--bucketing-sampler",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="When enabled, the batches will come from buckets of "
|
||||||
|
"similar duration (saves padding frames).",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--num-buckets",
|
||||||
|
type=int,
|
||||||
|
default=30,
|
||||||
|
help="The number of buckets for the DynamicBucketingSampler"
|
||||||
|
"(you might want to increase it for larger datasets).",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--on-the-fly-feats",
|
||||||
|
type=str2bool,
|
||||||
|
default=False,
|
||||||
|
help="When enabled, use on-the-fly cut mixing and feature "
|
||||||
|
"extraction. Will drop existing precomputed feature manifests "
|
||||||
|
"if available.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--shuffle",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="When enabled (=default), the examples will be "
|
||||||
|
"shuffled for each epoch.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--drop-last",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="Whether to drop last batch. Used by sampler.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--return-cuts",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="When enabled, each batch will have the "
|
||||||
|
"field: batch['cut'] with the cuts that "
|
||||||
|
"were used to construct it.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--num-workers",
|
||||||
|
type=int,
|
||||||
|
default=2,
|
||||||
|
help="The number of training dataloader workers that "
|
||||||
|
"collect the batches.",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--sampling-rate",
|
||||||
|
type=int,
|
||||||
|
default=22050,
|
||||||
|
help="The sampleing rate of ljspeech dataset",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--frame-shift",
|
||||||
|
type=int,
|
||||||
|
default=256,
|
||||||
|
help="Frame shift.",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--frame-length",
|
||||||
|
type=int,
|
||||||
|
default=1024,
|
||||||
|
help="Frame shift.",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--input-strategy",
|
||||||
|
type=str,
|
||||||
|
default="PrecomputedFeatures",
|
||||||
|
help="AudioSamples or PrecomputedFeatures",
|
||||||
|
)
|
||||||
|
|
||||||
|
group.add_argument(
|
||||||
|
"--use-fft-mag",
|
||||||
|
type=str2bool,
|
||||||
|
default=True,
|
||||||
|
help="Whether to use magnitude of fbank, false to use power energy.",
|
||||||
|
)
|
||||||
|
|
||||||
|
def train_dataloaders(
|
||||||
|
self,
|
||||||
|
cuts_train: CutSet,
|
||||||
|
sampler_state_dict: Optional[Dict[str, Any]] = None,
|
||||||
|
) -> DataLoader:
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
cuts_train:
|
||||||
|
CutSet for training.
|
||||||
|
sampler_state_dict:
|
||||||
|
The state dict for the training sampler.
|
||||||
|
"""
|
||||||
|
logging.info("About to create train dataset")
|
||||||
|
train = SpeechSynthesisDataset(
|
||||||
|
return_text=True,
|
||||||
|
return_tokens=False,
|
||||||
|
feature_input_strategy=eval(self.args.input_strategy)(),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.args.on_the_fly_feats:
|
||||||
|
sampling_rate = self.args.sampling_rate
|
||||||
|
config = FbankConfig(
|
||||||
|
sampling_rate=sampling_rate,
|
||||||
|
frame_length=self.args.frame_length / sampling_rate,  # (in seconds)
|
||||||
|
frame_shift=self.args.frame_shift / sampling_rate,  # (in seconds)
|
||||||
|
use_fft_mag=self.args.use_fft_mag,
|
||||||
|
)
|
||||||
|
train = SpeechSynthesisDataset(
|
||||||
|
return_text=True,
|
||||||
|
return_tokens=False,
|
||||||
|
feature_input_strategy=OnTheFlyFeatures(Fbank(config)),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.args.bucketing_sampler:
|
||||||
|
logging.info("Using DynamicBucketingSampler.")
|
||||||
|
train_sampler = DynamicBucketingSampler(
|
||||||
|
cuts_train,
|
||||||
|
max_duration=self.args.max_duration,
|
||||||
|
shuffle=self.args.shuffle,
|
||||||
|
num_buckets=self.args.num_buckets,
|
||||||
|
buffer_size=self.args.num_buckets * 2000,
|
||||||
|
shuffle_buffer_size=self.args.num_buckets * 5000,
|
||||||
|
drop_last=self.args.drop_last,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logging.info("Using SimpleCutSampler.")
|
||||||
|
train_sampler = SimpleCutSampler(
|
||||||
|
cuts_train,
|
||||||
|
max_duration=self.args.max_duration,
|
||||||
|
shuffle=self.args.shuffle,
|
||||||
|
)
|
||||||
|
logging.info("About to create train dataloader")
|
||||||
|
|
||||||
|
if sampler_state_dict is not None:
|
||||||
|
logging.info("Loading sampler state dict")
|
||||||
|
train_sampler.load_state_dict(sampler_state_dict)
|
||||||
|
|
||||||
|
# 'seed' is derived from the current random state, which will have
|
||||||
|
# previously been set in the main process.
|
||||||
|
seed = torch.randint(0, 100000, ()).item()
|
||||||
|
worker_init_fn = _SeedWorkers(seed)
|
||||||
|
|
||||||
|
train_dl = DataLoader(
|
||||||
|
train,
|
||||||
|
sampler=train_sampler,
|
||||||
|
batch_size=None,
|
||||||
|
num_workers=self.args.num_workers,
|
||||||
|
persistent_workers=False,
|
||||||
|
worker_init_fn=worker_init_fn,
|
||||||
|
)
|
||||||
|
|
||||||
|
return train_dl
|
||||||
|
|
||||||
|
def valid_dataloaders(self, cuts_valid: CutSet) -> DataLoader:
|
||||||
|
logging.info("About to create dev dataset")
|
||||||
|
if self.args.on_the_fly_feats:
|
||||||
|
sampling_rate = self.args.sampling_rate
|
||||||
|
config = FbankConfig(
|
||||||
|
sampling_rate=sampling_rate,
|
||||||
|
frame_length=self.args.frame_length / sampling_rate,  # (in seconds)
|
||||||
|
frame_shift=self.args.frame_shift / sampling_rate,  # (in seconds)
|
||||||
|
use_fft_mag=self.args.use_fft_mag,
|
||||||
|
)
|
||||||
|
validate = SpeechSynthesisDataset(
|
||||||
|
return_text=True,
|
||||||
|
return_tokens=False,
|
||||||
|
feature_input_strategy=OnTheFlyFeatures(Fbank(config)),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
validate = SpeechSynthesisDataset(
|
||||||
|
return_text=True,
|
||||||
|
return_tokens=False,
|
||||||
|
feature_input_strategy=eval(self.args.input_strategy)(),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
valid_sampler = DynamicBucketingSampler(
|
||||||
|
cuts_valid,
|
||||||
|
max_duration=self.args.max_duration,
|
||||||
|
num_buckets=self.args.num_buckets,
|
||||||
|
shuffle=False,
|
||||||
|
)
|
||||||
|
logging.info("About to create valid dataloader")
|
||||||
|
valid_dl = DataLoader(
|
||||||
|
validate,
|
||||||
|
sampler=valid_sampler,
|
||||||
|
batch_size=None,
|
||||||
|
num_workers=2,
|
||||||
|
persistent_workers=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
return valid_dl
|
||||||
|
|
||||||
|
def test_dataloaders(self, cuts: CutSet) -> DataLoader:
|
||||||
|
logging.info("About to create test dataset")
|
||||||
|
if self.args.on_the_fly_feats:
|
||||||
|
sampling_rate = self.args.sampling_rate
|
||||||
|
config = FbankConfig(
|
||||||
|
sampling_rate=sampling_rate,
|
||||||
|
frame_length=self.args.frame_length / sampling_rate,  # (in seconds)
|
||||||
|
frame_shift=self.args.frame_shift / sampling_rate,  # (in seconds)
|
||||||
|
use_fft_mag=self.args.use_fft_mag,
|
||||||
|
)
|
||||||
|
test = SpeechSynthesisDataset(
|
||||||
|
return_text=True,
|
||||||
|
return_tokens=False,
|
||||||
|
feature_input_strategy=OnTheFlyFeatures(Fbank(config)),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
test = SpeechSynthesisDataset(
|
||||||
|
return_text=True,
|
||||||
|
return_tokens=False,
|
||||||
|
feature_input_strategy=eval(self.args.input_strategy)(),
|
||||||
|
return_cuts=self.args.return_cuts,
|
||||||
|
)
|
||||||
|
test_sampler = DynamicBucketingSampler(
|
||||||
|
cuts,
|
||||||
|
max_duration=self.args.max_duration,
|
||||||
|
num_buckets=self.args.num_buckets,
|
||||||
|
shuffle=False,
|
||||||
|
)
|
||||||
|
logging.info("About to create test dataloader")
|
||||||
|
test_dl = DataLoader(
|
||||||
|
test,
|
||||||
|
batch_size=None,
|
||||||
|
sampler=test_sampler,
|
||||||
|
num_workers=self.args.num_workers,
|
||||||
|
)
|
||||||
|
return test_dl
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def train_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get train cuts")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "ljspeech_cuts_train.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def valid_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get validation cuts")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "ljspeech_cuts_valid.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def train_cuts_finetune(self) -> CutSet:
|
||||||
|
logging.info("About to get train cuts finetune")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "ljspeech_cuts_train_finetune.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def valid_cuts_finetune(self) -> CutSet:
|
||||||
|
logging.info("About to get validation cuts finetune")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "ljspeech_cuts_valid_finetune.jsonl.gz"
|
||||||
|
)
|
||||||
|
|
||||||
|
@lru_cache()
|
||||||
|
def test_cuts(self) -> CutSet:
|
||||||
|
logging.info("About to get test cuts")
|
||||||
|
return load_manifest_lazy(
|
||||||
|
self.args.manifest_dir / "ljspeech_cuts_test.jsonl.gz"
|
||||||
|
)
|
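A minimal sketch of wiring the datamodule above into a script (argument values are illustrative):

import argparse

parser = argparse.ArgumentParser()
LJSpeechTtsDataModule.add_arguments(parser)
args = parser.parse_args(["--manifest-dir", "data/fbank", "--max-duration", "200"])

ljspeech = LJSpeechTtsDataModule(args)
train_dl = ljspeech.train_dataloaders(ljspeech.train_cuts())
valid_dl = ljspeech.valid_dataloaders(ljspeech.valid_cuts())

for batch in train_dl:
    features = batch["features"]  # (B, T, F) fbank features
    break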
1
egs/ljspeech/TTS/vocos/utils.py
Symbolic link
@ -0,0 +1 @@
|
|||||||
|
../../../libritts/TTS/vocos/utils.py
|
@ -251,18 +251,22 @@ def save_checkpoint_with_global_batch_idx(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def find_checkpoints(out_dir: Path, iteration: int = 0) -> List[str]:
|
def find_checkpoints(
|
||||||
|
out_dir: Path,
|
||||||
|
iteration: int = 0,
|
||||||
|
prefix: str = "checkpoint",
|
||||||
|
) -> List[str]:
|
||||||
"""Find all available checkpoints in a directory.
|
"""Find all available checkpoints in a directory.
|
||||||
|
|
||||||
The checkpoint filenames have the form: `checkpoint-xxx.pt`
|
The checkpoint filenames have the form: `{prefix}-xxx.pt`
|
||||||
where xxx is a numerical value.
|
where xxx is a numerical value.
|
||||||
|
|
||||||
Assume you have the following checkpoints in the folder `foo`:
|
Assume you have the following checkpoints in the folder `foo`:
|
||||||
|
|
||||||
- checkpoint-1.pt
|
- {prefix}-1.pt
|
||||||
- checkpoint-20.pt
|
- {prefix}-20.pt
|
||||||
- checkpoint-300.pt
|
- {prefix}-300.pt
|
||||||
- checkpoint-4000.pt
|
- {prefix}-4000.pt
|
||||||
|
|
||||||
Case 1 (Return all checkpoints)::
|
Case 1 (Return all checkpoints)::
|
||||||
|
|
||||||
@ -291,8 +295,8 @@ def find_checkpoints(out_dir: Path, iteration: int = 0) -> List[str]:
|
|||||||
Return a list of checkpoint filenames, sorted in descending
|
Return a list of checkpoint filenames, sorted in descending
|
||||||
order by the numerical value in the filename.
|
order by the numerical value in the filename.
|
||||||
"""
|
"""
|
||||||
checkpoints = list(glob.glob(f"{out_dir}/checkpoint-[0-9]*.pt"))
|
checkpoints = list(glob.glob(f"{out_dir}/{prefix}-[0-9]*.pt"))
|
||||||
pattern = re.compile(r"checkpoint-([0-9]+).pt")
|
pattern = re.compile(rf"{prefix}-([0-9]+).pt")
|
||||||
iter_checkpoints = []
|
iter_checkpoints = []
|
||||||
for c in checkpoints:
|
for c in checkpoints:
|
||||||
result = pattern.search(c)
|
result = pattern.search(c)
|
||||||
@ -317,12 +321,13 @@ def find_checkpoints(out_dir: Path, iteration: int = 0) -> List[str]:
|
|||||||
def remove_checkpoints(
|
def remove_checkpoints(
|
||||||
out_dir: Path,
|
out_dir: Path,
|
||||||
topk: int,
|
topk: int,
|
||||||
|
prefix: str = "checkpoint",
|
||||||
rank: int = 0,
|
rank: int = 0,
|
||||||
):
|
):
|
||||||
"""Remove checkpoints from the given directory.
|
"""Remove checkpoints from the given directory.
|
||||||
|
|
||||||
We assume that checkpoint filename has the form `checkpoint-xxx.pt`
|
We assume that checkpoint filename has the form `{prefix}-xxx.pt`
|
||||||
where xxx is a number, representing the number of processed batches
|
where xxx is a number, representing the number of processed batches/epochs
|
||||||
when saving that checkpoint. We sort checkpoints by filename and keep
|
when saving that checkpoint. We sort checkpoints by filename and keep
|
||||||
only the `topk` checkpoints with the highest `xxx`.
|
only the `topk` checkpoints with the highest `xxx`.
|
||||||
|
|
||||||
@ -331,6 +336,8 @@ def remove_checkpoints(
|
|||||||
The directory containing checkpoints to be removed.
|
The directory containing checkpoints to be removed.
|
||||||
topk:
|
topk:
|
||||||
Number of checkpoints to keep.
|
Number of checkpoints to keep.
|
||||||
|
prefix:
|
||||||
|
The prefix of the checkpoint filename, normally `epoch`, `checkpoint`.
|
||||||
rank:
|
rank:
|
||||||
If using DDP for training, it is the rank of the current node.
|
If using DDP for training, it is the rank of the current node.
|
||||||
Use 0 if no DDP is used for training.
|
Use 0 if no DDP is used for training.
|
||||||
@ -338,7 +345,7 @@ def remove_checkpoints(
|
|||||||
assert topk >= 1, topk
|
assert topk >= 1, topk
|
||||||
if rank != 0:
|
if rank != 0:
|
||||||
return
|
return
|
||||||
checkpoints = find_checkpoints(out_dir)
|
checkpoints = find_checkpoints(out_dir, prefix=prefix)
|
||||||
|
|
||||||
if len(checkpoints) == 0:
|
if len(checkpoints) == 0:
|
||||||
logging.warn(f"No checkpoints found in {out_dir}")
|
logging.warn(f"No checkpoints found in {out_dir}")
|
||||||
|
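A small sketch of how the new prefix argument can be used by recipes that save epoch-style checkpoints (the experiment directory is a placeholder):

from icefall.checkpoint import find_checkpoints, remove_checkpoints

# List checkpoints such as exp/epoch-3.pt, sorted by epoch number in descending order.
ckpts = find_checkpoints("vocos/exp", prefix="epoch")

# Keep only the 5 most recent epoch checkpoints (rank 0 only under DDP).
remove_checkpoints("vocos/exp", topk=5, prefix="epoch", rank=0)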