# kaldifeat/doc/source/code/test_fbank.py

#!/usr/bin/env python3

# Copyright      2021  Xiaomi Corporation (authors: Fangjun Kuang)

import numpy as np
import soundfile as sf
import torch

import kaldifeat


def read_wave(filename) -> torch.Tensor:
    """Load a sound file and return its samples as a tensor.

    Note:
      Scaling the samples to [-32768, 32767] is not required.
      It is done here only to follow the approach in Kaldi.

    Args:
      filename:
        Filename of a sound file. Its sampling rate must be 16000.
    Returns:
      Return a tensor containing the audio samples as float32, multiplied
      by 32768. It is 1-D for the single-channel files used here
      (NOTE(review): ``always_2d=False`` presumably yields a 2-D array
      for multi-channel input -- confirm against the soundfile docs).
    """
    with sf.SoundFile(filename) as wave_file:
        # Only 16 kHz input is accepted.
        assert wave_file.samplerate == 16000
        samples = wave_file.read(dtype=np.float32, always_2d=False)

    # Scale in place; the tensor returned below shares this buffer.
    np.multiply(samples, 32768, out=samples)
    return torch.from_numpy(samples)


def test_fbank():
    """Check that batched, per-wave and per-frame fbank features agree.

    The two wave files are read from ``test_data/`` relative to the
    current working directory. CUDA device 0 is used when
    ``torch.cuda.is_available()`` is true; otherwise the CPU is used.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda", 0)
    else:
        device = torch.device("cpu")

    # Read both files first, then move the samples to the chosen device.
    loaded = [
        read_wave(path)
        for path in ("test_data/test.wav", "test_data/test2.wav")
    ]
    wave0, wave1 = (samples.to(device) for samples in loaded)

    opts = kaldifeat.FbankOptions()
    # NOTE(review): dither is presumably disabled so that repeated runs
    # give identical features and the comparisons below can hold -- confirm.
    opts.frame_opts.dither = 0
    opts.device = device

    extractor = kaldifeat.Fbank(opts)

    # A list of waves is processed as a batch and yields a list of results.
    features = extractor([wave0, wave1])
    assert isinstance(features, list), f"{type(features)}"
    assert len(features) == 2

    # A single wave can be passed as well; results must match the batch.
    per_wave = [extractor(wave) for wave in (wave0, wave1)]
    for batched, single in zip(features, per_wave):
        assert torch.allclose(batched, single)

    # Features can also be computed for individually selected frames.
    frames = extractor.convert_samples_to_frames(wave0)
    per_frame = {index: extractor.compute(frames[index]) for index in (1, 10)}
    for index, frame_feature in per_frame.items():
        assert torch.allclose(per_wave[0][index], frame_feature)


# Allow running this file directly as a script, without a test runner.
if __name__ == "__main__":
    test_fbank()