mirror of
https://github.com/csukuangfj/kaldifeat.git
synced 2025-08-10 18:42:17 +00:00
Begin to add compute-kaldi-pitch-feats
This commit is contained in:
parent
a4bb32aaea
commit
3b53580271
0
kaldifeat/csrc/pitch-functions.cc
Normal file
0
kaldifeat/csrc/pitch-functions.cc
Normal file
139
kaldifeat/csrc/pitch-functions.h
Normal file
139
kaldifeat/csrc/pitch-functions.h
Normal file
@ -0,0 +1,139 @@
|
|||||||
|
// kaldifeat/csrc/pitch-functions.h
|
||||||
|
//
|
||||||
|
// Copyright (c) 2021 Xiaomi Corporation (authors: Fangjun Kuang)
|
||||||
|
|
||||||
|
// This file is copied/modified from kaldi/src/feat/pitch-functions.h
|
||||||
|
|
||||||
|
#ifndef KALDIFEAT_CSRC_FEATURE_PITCH_FUNCTIONS_H_
|
||||||
|
#define KALDIFEAT_CSRC_FEATURE_PITCH_FUNCTIONS_H_
|
||||||
|
|
||||||
|
#include "torch/torch.h"
|
||||||
|
|
||||||
|
namespace kaldifeat {
|
||||||
|
|
||||||
|
struct PitchExtractionOptions {
|
||||||
|
// sample frequency in hertz
|
||||||
|
// must match the waveform file
|
||||||
|
float samp_freq = 16000;
|
||||||
|
float frame_shift_ms = 10.0; // in milliseconds.
|
||||||
|
float frame_length_ms = 25.0; // in milliseconds.
|
||||||
|
|
||||||
|
// Preemphasis coefficient. [use is deprecated.]
|
||||||
|
float preemph_coeff = 0.0;
|
||||||
|
|
||||||
|
float min_f0 = 50; // min f0 to search (Hz)
|
||||||
|
float max_f0 = 400; // max f0 to search (Hz)
|
||||||
|
float soft_min_f0 = 10.0; // Minimum f0, applied in soft way, must not
|
||||||
|
// exceed min-f0
|
||||||
|
float penalty_factor = 0.1; // cost factor for FO change
|
||||||
|
float lowpass_cutoff = 1000; // cutoff frequency for Low pass filter (Hz)
|
||||||
|
|
||||||
|
// Integer that determines filter width when
|
||||||
|
// upsampling NCCF
|
||||||
|
// Frequency that we down-sample the signal to. Must be
|
||||||
|
// more than twice lowpass-cutoff
|
||||||
|
float resample_freq = 4000;
|
||||||
|
|
||||||
|
float delta_pitch = 0.005; // the pitch tolerance in pruning lags
|
||||||
|
float nccf_ballast = 7000; // Increasing this factor reduces NCCF for
|
||||||
|
// quiet frames, helping ensure pitch
|
||||||
|
// continuity in unvoiced region
|
||||||
|
int32_t lowpass_filter_width = 1; // Integer that determines filter width of
|
||||||
|
// lowpass filter
|
||||||
|
int32_t upsample_filter_width = 5; // Integer that determines filter width
|
||||||
|
// when upsampling NCCF
|
||||||
|
|
||||||
|
// Below are newer config variables, not present in the original paper,
|
||||||
|
// that relate to the online pitch extraction algorithm.
|
||||||
|
|
||||||
|
// The maximum number of frames of latency that we allow the pitch-processing
|
||||||
|
// to introduce, for online operation. If you set this to a large value,
|
||||||
|
// there would be no inaccuracy from the Viterbi traceback (but it might make
|
||||||
|
// you wait to see the pitch). This is not very relevant for the online
|
||||||
|
// operation: normalization-right-context is more relevant, you
|
||||||
|
// can just leave this value at zero.
|
||||||
|
int32_t max_frames_latency = 0;
|
||||||
|
|
||||||
|
// Only relevant for the function ComputeKaldiPitch which is called by
|
||||||
|
// compute-kaldi-pitch-feats. If nonzero, we provide the input as chunks of
|
||||||
|
// this size. This affects the energy normalization which has a small effect
|
||||||
|
// on the resulting features, especially at the beginning of a file. For best
|
||||||
|
// compatibility with online operation (e.g. if you plan to train models for
|
||||||
|
// the online-deocding setup), you might want to set this to a small value,
|
||||||
|
// like one frame.
|
||||||
|
int32_t frames_per_chunk = 0;
|
||||||
|
|
||||||
|
// Only relevant for the function ComputeKaldiPitch which is called by
|
||||||
|
// compute-kaldi-pitch-feats, and only relevant if frames_per_chunk is
|
||||||
|
// nonzero. If true, it will query the features as soon as they are
|
||||||
|
// available, which simulates the first-pass features you would get in online
|
||||||
|
// decoding. If false, the features you will get will be the same as those
|
||||||
|
// available at the end of the utterance, after InputFinished() has been
|
||||||
|
// called: e.g. during lattice rescoring.
|
||||||
|
bool simulate_first_pass_online = false;
|
||||||
|
|
||||||
|
// Only relevant for online operation or when emulating online operation
|
||||||
|
// (e.g. when setting frames_per_chunk). This is the frame-index on which we
|
||||||
|
// recompute the NCCF (e.g. frame-index 500 = after 5 seconds); if the
|
||||||
|
// segment ends before this we do it when the segment ends. We do this by
|
||||||
|
// re-computing the signal average energy, which affects the NCCF via the
|
||||||
|
// "ballast term", scaling the resampled NCCF by a factor derived from the
|
||||||
|
// average change in the "ballast term", and re-doing the backtrace
|
||||||
|
// computation. Making this infinity would be the most exact, but would
|
||||||
|
// introduce unwanted latency at the end of long utterances, for little
|
||||||
|
// benefit.
|
||||||
|
int32_t recompute_frame = 500;
|
||||||
|
|
||||||
|
// This is a "hidden config" used only for testing the online pitch
|
||||||
|
// extraction. If true, we compute the signal root-mean-squared for the
|
||||||
|
// ballast term, only up to the current frame, rather than the end of the
|
||||||
|
// current chunk of signal. This makes the output insensitive to the
|
||||||
|
// chunking, which is useful for testing purposes.
|
||||||
|
bool nccf_ballast_online = false;
|
||||||
|
bool snip_edges = true;
|
||||||
|
|
||||||
|
torch::Device device{"cpu"};
|
||||||
|
|
||||||
|
PitchExtractionOptions() = default;
|
||||||
|
|
||||||
|
/// Returns the window-size in samples, after resampling. This is the
|
||||||
|
/// "basic window size", not the full window size after extending by max-lag.
|
||||||
|
// Because of floating point representation, it is more reliable to divide
|
||||||
|
// by 1000 instead of multiplying by 0.001, but it is a bit slower.
|
||||||
|
int32_t NccfWindowSize() const {
|
||||||
|
return static_cast<int32_t>(resample_freq * frame_length_ms / 1000.0);
|
||||||
|
}
|
||||||
|
/// Returns the window-shift in samples, after resampling.
|
||||||
|
int32_t NccfWindowShift() const {
|
||||||
|
return static_cast<int32_t>(resample_freq * frame_shift_ms / 1000.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string ToString() const {
|
||||||
|
std::ostringstream os;
|
||||||
|
os << "samp_freq: " << samp_freq << "\n";
|
||||||
|
os << "frame_shift_ms: " << frame_shift_ms << "\n";
|
||||||
|
os << "frame_length_ms: " << frame_length_ms << "\n";
|
||||||
|
os << "preemph_coeff: " << preemph_coeff << "\n";
|
||||||
|
os << "min_f0: " << min_f0 << "\n";
|
||||||
|
os << "max_f0: " << max_f0 << "\n";
|
||||||
|
os << "soft_min_f0: " << soft_min_f0 << "\n";
|
||||||
|
os << "penalty_factor: " << penalty_factor << "\n";
|
||||||
|
os << "lowpass_cutoff: " << lowpass_cutoff << "\n";
|
||||||
|
os << "resample_freq: " << resample_freq << "\n";
|
||||||
|
os << "delta_pitch: " << delta_pitch << "\n";
|
||||||
|
os << "nccf_ballast: " << nccf_ballast << "\n";
|
||||||
|
os << "lowpass_filter_width: " << lowpass_filter_width << "\n";
|
||||||
|
os << "upsample_filter_width: " << upsample_filter_width << "\n";
|
||||||
|
os << "max_frames_latency: " << max_frames_latency << "\n";
|
||||||
|
os << "frames_per_chunk: " << frames_per_chunk << "\n";
|
||||||
|
os << "simulate_first_pass_online: " << simulate_first_pass_online << "\n";
|
||||||
|
os << "recompute_frame: " << recompute_frame << "\n";
|
||||||
|
os << "nccf_ballast_online: " << nccf_ballast_online << "\n";
|
||||||
|
os << "snip_edges: " << snip_edges << "\n";
|
||||||
|
os << "device: " << device << "\n";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace kaldifeat
|
||||||
|
|
||||||
|
#endif // KALDIFEAT_CSRC_FEATURE_PITCH_FUNCTIONS_H_
|
Loading…
x
Reference in New Issue
Block a user