From 5492a6a5e2fcfb3c7973696164d19d41c0b72364 Mon Sep 17 00:00:00 2001 From: JinZr Date: Sat, 12 Oct 2024 15:33:38 +0800 Subject: [PATCH] comments updated --- egs/libritts/CODEC/encodec/loss.py | 22 ++++++++++++---------- egs/libritts/CODEC/encodec/train.py | 9 +++------ 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/egs/libritts/CODEC/encodec/loss.py b/egs/libritts/CODEC/encodec/loss.py index 8ec80bb9c..9cf1d42d2 100644 --- a/egs/libritts/CODEC/encodec/loss.py +++ b/egs/libritts/CODEC/encodec/loss.py @@ -1,8 +1,19 @@ +# Modified from egs/ljspeech/TTS/vits/loss.py by: Zengrui JIN (Tsinghua University) +# original implementation is from https://github.com/espnet/espnet/blob/master/espnet2/gan_tts/hifigan/loss.py + +# Copyright 2021 Tomoki Hayashi +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +"""Encodec-related loss modules. + +This code is modified from https://github.com/kan-bayashi/ParallelWaveGAN. + +""" + from typing import List, Tuple, Union import torch import torch.nn.functional as F -from lhotse.features.kaldi import Wav2LogFilterBank from torchaudio.transforms import MelSpectrogram @@ -225,15 +236,6 @@ class MelSpectrogramReconstructionLoss(torch.nn.Module): self.wav_to_specs = [] for i in range(5, 12): s = 2**i - # self.wav_to_specs.append( - # Wav2LogFilterBank( - # sampling_rate=sampling_rate, - # frame_length=s, - # frame_shift=s // 4, - # use_fft_mag=use_fft_mag, - # num_filters=n_mels, - # ) - # ) self.wav_to_specs.append( MelSpectrogram( sample_rate=sampling_rate, diff --git a/egs/libritts/CODEC/encodec/train.py b/egs/libritts/CODEC/encodec/train.py index 934d480f5..49e974310 100755 --- a/egs/libritts/CODEC/encodec/train.py +++ b/egs/libritts/CODEC/encodec/train.py @@ -186,12 +186,11 @@ def get_params() -> AttributeDict: "env_info": get_env_info(), "sampling_rate": 24000, "audio_normalization": False, - "chunk_size": 1.0, # in seconds "lambda_adv": 3.0, # loss scaling coefficient for adversarial loss "lambda_wav": 0.1, # loss scaling coefficient for waveform loss "lambda_feat": 4.0, # loss scaling coefficient for feat loss "lambda_rec": 1.0, # loss scaling coefficient for reconstruction loss - "lambda_com": 1.0, # loss scaling coefficient for commitment loss + "lambda_com": 1000.0, # loss scaling coefficient for commitment loss } ) @@ -342,12 +341,10 @@ def prepare_input( if is_training: audio_dims = audio.size(-1) - start_idx = random.randint( - 0, max(0, audio_dims - params.chunk_size * params.sampling_rate) - ) + start_idx = random.randint(0, max(0, audio_dims - params.sampling_rate)) audio = audio[:, start_idx : params.sampling_rate + start_idx] else: - # NOTE: a very coarse setup + # NOTE(zengrui): a very coarse setup audio = audio[ :, params.sampling_rate : params.sampling_rate + params.sampling_rate ]