Add PESQ and STOI metrics for reconstruction performance evaluation

2025-08-26 18:24:18 +00:00 · 2024-09-08 15:37:06 +08:00 · 2024-09-08 15:37:06 +08:00 · 1e65a976d0
commit 1e65a976d0
parent c43977ea05
1 changed files with 48 additions and 4 deletions
--- a/egs/libritts/CODEC/encodec/infer.py
+++ b/egs/libritts/CODEC/encodec/infer.py
@ -30,12 +30,16 @@ import argparse
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
-from typing import Dict, List
+from statistics import mean
 from typing import List, Tuple
 import numpy as np
 import torch
 import torch.nn.functional as F
 import torchaudio
 from codec_datamodule import LibriTTSCodecDataModule
 from pesq import pesq
 from pystoi import stoi
 from scipy import signal
 from torch import nn
 from train import get_model, get_params
@ -105,12 +109,25 @@ def remove_encodec_weight_norm(model) -> None:
            remove_weight_norm(decoder._modules[key].conv.conv)
 def compute_pesq(
    ref_wav: np.ndarray, gen_wav: np.ndarray, sampling_rate: int = 24000
 ) -> float:
    """Compute the wide-band PESQ score between reference and generated audio.

    Both waveforms are resampled from ``sampling_rate`` to 16 kHz, the rate
    passed to ``pesq`` together with ``mode="wb"``.

    Args:
      ref_wav:
        The reference waveform (resampled along its first axis; presumably
        a 1-D array of samples -- confirm against the caller).
      gen_wav:
        The generated waveform, same layout as ``ref_wav``.
      sampling_rate:
        Sampling rate of the two inputs in Hz.
        NOTE(review): the default of 24000 assumes the LibriTTS native rate
        so that existing two-argument calls keep working; callers should
        pass ``params.sampling_rate`` explicitly.
    Returns:
      The score returned by ``pesq``.
    Raises:
      ValueError: If ``sampling_rate`` is not positive.
    """
    if sampling_rate <= 0:
        raise ValueError(f"sampling_rate must be positive, got {sampling_rate}")

    DEFAULT_SAMPLING_RATE = 16000

    def _to_pesq_rate(wav: np.ndarray) -> np.ndarray:
        # Nothing to do when the audio is already at the PESQ rate.
        if sampling_rate == DEFAULT_SAMPLING_RATE:
            return wav
        # `signal.resample` expects the target NUMBER OF SAMPLES, not a rate.
        # Scale the length by the rate ratio so the duration is preserved;
        # passing 16000 directly would force every clip to exactly 1 second.
        num_samples = int(round(len(wav) * DEFAULT_SAMPLING_RATE / sampling_rate))
        return signal.resample(wav, num_samples)

    ref = _to_pesq_rate(ref_wav)
    deg = _to_pesq_rate(gen_wav)
    return pesq(fs=DEFAULT_SAMPLING_RATE, ref=ref, deg=deg, mode="wb")
 def compute_stoi(
    ref_wav: np.ndarray,
    gen_wav: np.ndarray,
    sampling_rate: int,
 ) -> float:
    """Return the STOI score of generated audio against its reference.

    Args:
      ref_wav:
        The reference waveform, forwarded to ``stoi`` as ``x``.
      gen_wav:
        The generated waveform, forwarded to ``stoi`` as ``y``.
      sampling_rate:
        Sampling rate of both waveforms, forwarded as ``fs_sig``.
    Returns:
      The value returned by ``stoi`` with ``extended=False``.
    """
    stoi_kwargs = {
        "x": ref_wav,
        "y": gen_wav,
        "fs_sig": sampling_rate,
        "extended": False,
    }
    score = stoi(**stoi_kwargs)
    return score
 def infer_dataset(
    dl: torch.utils.data.DataLoader,
    subset: str,
    params: AttributeDict,
    model: nn.Module,
-) -> None:
+) -> Tuple[float, float]:
    """Decode dataset.
    The ground-truth and generated audio pairs will be saved to `params.save_wav_dir`.
@ -123,6 +140,9 @@ def infer_dataset(
        It is returned by :func:`get_params`.
      model:
        The neural model.
    Returns:
        The average PESQ and STOI scores.
    """
    # Background worker that saves audio files to disk.
@ -150,6 +170,9 @@ def infer_dataset(
    num_cuts = 0
    log_interval = 5
    pesq_wb_scores = []
    stoi_scores = []
    try:
        num_batches = len(dl)
    except TypeError:
@ -169,6 +192,25 @@ def infer_dataset(
            )
            audio_hats = audio_hats.squeeze(1).cpu()
            for cut_id, audio, audio_hat, audio_len in zip(
                cut_ids, audios, audio_hats, audio_lens
            ):
                try:
                    pesq_wb = compute_pesq(
                        ref_wav=audio[:audio_len].numpy(),
                        gen_wav=audio_hat[:audio_len].numpy(),
                    )
                    pesq_wb_scores.append(pesq_wb)
                except Exception as e:
                    logging.error(f"Error while computing PESQ for cut {cut_id}: {e}")
                stoi_score = compute_stoi(
                    ref_wav=audio[:audio_len].numpy(),
                    gen_wav=audio_hat[:audio_len].numpy(),
                    sampling_rate=params.sampling_rate,
                )
                stoi_scores.append(stoi_score)
            futures.append(
                executor.submit(
                    _save_worker,
@ -192,6 +234,7 @@ def infer_dataset(
        # return results
        for f in futures:
            f.result()
    return mean(pesq_wb_scores), mean(stoi_scores)
@torch.no_grad()
@ -285,12 +328,13 @@ def main():
        logging.info(f"Processing {subset} set, saving to {save_wav_dir}")
-        infer_dataset(
+        pesq_wb, stoi = infer_dataset(
            dl=dl,
            subset=subset,
            params=params,
            model=model,
        )
        logging.info(f"{subset}: PESQ-WB: {pesq_wb:.4f}, STOI: {stoi:.4f}")
    logging.info(f"Wav files are saved to {params.save_wav_dir}")
    logging.info("Done!")