// kaldifeat/csrc/mel-computations.h // // Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang) // // This file is copied/modified from kaldi/src/feat/mel-computations.h #include #include #include "kaldifeat/csrc/feature-window.h" #ifndef KALDIFEAT_CSRC_MEL_COMPUTATIONS_H_ #define KALDIFEAT_CSRC_MEL_COMPUTATIONS_H_ namespace kaldifeat { struct MelBanksOptions { int32_t num_bins = 25; // e.g. 25; number of triangular bins float low_freq = 20; // e.g. 20; lower frequency cutoff // an upper frequency cutoff; 0 -> no cutoff, negative // ->added to the Nyquist frequency to get the cutoff. float high_freq = 0; float vtln_low = 100; // vtln lower cutoff of warping function. // vtln upper cutoff of warping function: if negative, added // to the Nyquist frequency to get the cutoff. float vtln_high = -500; bool debug_mel = false; // htk_mode is a "hidden" config, it does not show up on command line. // Enables more exact compatibility with HTK, for testing purposes. Affects // mel-energy flooring and reproduces a bug in HTK. bool htk_mode = false; std::string ToString() const { std::ostringstream os; os << "MelBanksOptions("; os << "num_bins=" << num_bins << ", "; os << "low_freq=" << low_freq << ", "; os << "high_freq=" << high_freq << ", "; os << "vtln_low=" << vtln_low << ", "; os << "vtln_high=" << vtln_high << ", "; os << "debug_mel=" << (debug_mel ? "True" : "False") << ", "; os << "htk_mode=" << (htk_mode ? "True" : "False") << ")"; return os.str(); } }; std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts); class MelBanks { public: static inline float InverseMelScale(float mel_freq) { return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f); } static inline float MelScale(float freq) { return 1127.0f * logf(1.0f + freq / 700.0f); } static float VtlnWarpFreq( float vtln_low_cutoff, float vtln_high_cutoff, // discontinuities in warp func float low_freq, float high_freq, // upper+lower frequency cutoffs in // the mel computation float vtln_warp_factor, float freq); static float VtlnWarpMelFreq(float vtln_low_cutoff, float vtln_high_cutoff, float low_freq, float high_freq, float vtln_warp_factor, float mel_freq); MelBanks(const MelBanksOptions &opts, const FrameExtractionOptions &frame_opts, float vtln_warp_factor, torch::Device device); // Initialize with a 2-d weights matrix // // Note: This constructor is for Whisper. It does not initialize // center_freqs_. // // @param weights Pointer to the start address of the matrix // @param num_rows It equals to number of mel bins // @param num_cols It equals to (number of fft bins)/2+1 MelBanks(const float *weights, int32_t num_rows, int32_t num_cols, torch::Device device); // CAUTION: we save a transposed version of bins_mat_, so return size(1) here int32_t NumBins() const { return static_cast(bins_mat_.size(1)); } // returns vector of central freq of each bin; needed by plp code. const torch::Tensor &GetCenterFreqs() const { return center_freqs_; } torch::Tensor Compute(const torch::Tensor &spectrum) const; // for debug only const torch::Tensor &GetBinsMat() const { return bins_mat_; } private: // A 2-D matrix. Its shape is NOT [num_bins, num_fft_bins] // Its shape is [num_fft_bins, num_bins] for non-whisper. // For whisper, its shape is [num_fft_bins/2+1, num_bins] torch::Tensor bins_mat_; // center frequencies of bins, numbered from 0 ... num_bins-1. // Needed by GetCenterFreqs(). torch::Tensor center_freqs_; // It's always on CPU bool debug_; bool htk_mode_; }; // Compute liftering coefficients (scaling on cepstral coeffs) // coeffs are numbered slightly differently from HTK: the zeroth // index is C0, which is not affected. // // coeffs is a 1-D float tensor void ComputeLifterCoeffs(float Q, torch::Tensor *coeffs); void GetEqualLoudnessVector(const MelBanks &mel_banks, torch::Tensor *ans); /* Compute LP coefficients from autocorrelation coefficients. * * @param [in] autocorr_in A 2-D tensor. Each row is a frame. Its number of * columns is lpc_order + 1 * @param [out] lpc_coeffs A 2-D tensor. On return, it has as many rows as the * input tensor. Its number of columns is lpc_order. * * @return Returns log energy of residual in a 1-D tensor. It has as many * elements as the number of rows in `autocorr_in`. */ torch::Tensor ComputeLpc(const torch::Tensor &autocorr_in, torch::Tensor *lpc_coeffs); /* * @param [in] lpc It is the output argument `lpc_coeffs` in ComputeLpc(). */ torch::Tensor Lpc2Cepstrum(const torch::Tensor &lpc); } // namespace kaldifeat #endif // KALDIFEAT_CSRC_MEL_COMPUTATIONS_H_