Check embedding zero via simd

This commit is contained in:
Barzan Hayati 2025-09-10 20:27:58 +00:00
parent 9e9b645b95
commit fbc5a1ff96
7 changed files with 153 additions and 31 deletions

View File

@ -15,6 +15,17 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")# Enable all features your current CPU supports
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -march=native")
# Force AVX2
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -mavx2")
# Or force AVX-512
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -mavx512f")
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
# For larger projects
@ -149,4 +160,5 @@ target_link_libraries(${PROJECT_NAME} nvdsgst_infer nvds_meta nvds_inferutils
nvdsgst_meta nvds_utils nvdsgst_helper
prometheus-cpp-core prometheus-cpp-pull # prometheus-cpp-exposer nvdsgst_metnvdsa
microhttpd
nvdsgst_nvmultiurisrcbin)
nvdsgst_nvmultiurisrcbin
nvds_batch_jpegenc)

View File

@ -65,5 +65,6 @@
"redis_broker_host": "localhost",
"redis_broker_port": 6379,
"topic_redis": "redis_stream"
}
},
"compression_coefficient": 0.125
}

View File

@ -45,7 +45,7 @@ FaceNvInferServerManager::FaceNvInferServerManager() {
// FACE_NET_HEIGHT = config["PGIE_NET_HEIGHT"];
MUXER_OUTPUT_WIDTH = config["MUXER_OUTPUT_WIDTH"];
MUXER_OUTPUT_HEIGHT = config["MUXER_OUTPUT_HEIGHT"];
threshold_face_detection = config["threshold_body_detection"];
threshold_face_detection = config["threshold_face_detection"];
}
bool FaceNvInferServerManager::create_face_nv_infer_server(int num_sources) {
@ -1148,6 +1148,58 @@ void FaceNvInferServerManager::add_face_body(int object_id, float face_score) {
return;
}
/**
 * @brief Checks whether the first `size` floats of `data` are all zero.
 *
 * Both +0.0f and -0.0f count as zero. NaN counts as non-zero in every code
 * path, so the answer does not depend on whether the NaN lands in a SIMD
 * chunk or in the scalar tail.
 *
 * When the translation unit is compiled with AVX enabled, 8 floats are
 * tested per iteration using unaligned loads; otherwise only the scalar loop
 * runs.
 *
 * @param data Pointer to at least `size` readable floats (no alignment
 *             requirement).
 * @param size Number of floats to inspect; 0 yields true.
 * @return true if no inspected element compares unequal to 0.0f.
 */
bool FaceNvInferServerManager::allZeroAVX(const float *data, size_t size) {
    size_t i = 0;

#if defined(__AVX__)
    const __m256 zero = _mm256_setzero_ps();  // 8 floats of 0.0
    for (; i + 8 <= size; i += 8) {
        const __m256 v = _mm256_loadu_ps(&data[i]);  // load 8 floats
        // _CMP_NEQ_UQ is the unordered "not equal": it is true for NaN lanes,
        // matching the scalar `!=` below. The ordered variant (_CMP_NEQ_OQ)
        // is false for NaN and would silently report NaN lanes as zero.
        const __m256 cmp = _mm256_cmp_ps(v, zero, _CMP_NEQ_UQ);
        if (_mm256_movemask_ps(cmp)) return false;  // some lane != 0
    }
#endif

    // leftover elements (or everything, when AVX is not enabled)
    for (; i < size; ++i) {
        if (data[i] != 0.0f) return false;
    }
    return true;
}
/**
 * @brief Checks whether the first `size` floats of `data` are all zero,
 *        using the widest SIMD instruction set enabled at compile time.
 *
 * Dispatch is done with preprocessor checks: AVX-512F (16 floats per
 * iteration), then AVX (8 floats), then SSE (4 floats). Whatever does not
 * fill a whole vector is handled by a scalar loop.
 *
 * Both +0.0f and -0.0f count as zero. NaN counts as non-zero in every path:
 * the vector compares use the unordered "not equal" predicate so they agree
 * with the scalar `!=` in the tail loop and with `_mm_cmpneq_ps` in the SSE
 * path.
 *
 * @param data Pointer to at least `size` readable floats (unaligned loads
 *             are used, so no alignment is required).
 * @param size Number of floats to inspect; 0 yields true.
 * @return true if no inspected element compares unequal to 0.0f.
 */
bool FaceNvInferServerManager::allZero(const float *data, size_t size) {
    size_t i = 0;

#if defined(__AVX512F__)
    // 16 floats per iteration
    const __m512 zero512 = _mm512_setzero_ps();
    for (; i + 16 <= size; i += 16) {
        const __m512 v = _mm512_loadu_ps(&data[i]);
        // Unordered predicate: NaN lanes are reported as "not equal".
        const __mmask16 cmp = _mm512_cmp_ps_mask(v, zero512, _CMP_NEQ_UQ);
        if (cmp) return false;
    }
#elif defined(__AVX__)
    // 8 floats per iteration. Only AVX intrinsics are used here, so plain
    // AVX is sufficient (AVX2 builds define __AVX__ as well).
    const __m256 zero256 = _mm256_setzero_ps();
    for (; i + 8 <= size; i += 8) {
        const __m256 v = _mm256_loadu_ps(&data[i]);
        // Unordered predicate: NaN lanes are reported as "not equal".
        const __m256 cmp = _mm256_cmp_ps(v, zero256, _CMP_NEQ_UQ);
        if (_mm256_movemask_ps(cmp)) return false;
    }
#elif defined(__SSE__)
    // 4 floats per iteration
    const __m128 zero128 = _mm_setzero_ps();
    for (; i + 4 <= size; i += 4) {
        const __m128 v = _mm_loadu_ps(&data[i]);
        const __m128 cmp = _mm_cmpneq_ps(v, zero128);
        if (_mm_movemask_ps(cmp)) return false;
    }
#endif

    // leftover elements
    for (; i < size; ++i) {
        if (data[i] != 0.0f) return false;
    }
    return true;
}
/* This is the buffer probe function that we have registered on the sink pad
* of the tiler element. All SGIE infer elements in the pipeline shall attach
* their NvDsInferTensorMeta to each object's metadata of each frame, here we
@ -1156,11 +1208,22 @@ void FaceNvInferServerManager::add_face_body(int object_id, float face_score) {
* metadata.
*/
GstPadProbeReturn FaceNvInferServerManager::sgie_pad_buffer_probe(
GstPad *pad, GstPadProbeInfo *info, gpointer u_data) {
GstPad *pad, GstPadProbeInfo *info, gpointer ctx) {
(void)ctx;
GstBuffer *buf = (GstBuffer *)info->data;
GstMapInfo inmap = GST_MAP_INFO_INIT;
if (!gst_buffer_map(buf, &inmap, GST_MAP_READ)) {
GST_ERROR("input buffer mapinfo failed");
return GST_PAD_PROBE_DROP;
}
NvBufSurface *ip_surf = (NvBufSurface *)inmap.data;
gst_buffer_unmap(buf, &inmap);
(void)ip_surf;
(void)pad;
// static guint use_device_mem = 0;
gboolean *use_new_mux = (gboolean *)u_data;
(void)use_new_mux;
// gboolean *use_new_mux = (gboolean *)u_data;
// (void)use_new_mux;
static NvDsInferNetworkInfo networkInfo{FACE_NET_WIDTH, FACE_NET_HEIGHT, 3};
// (void)networkInfo;
@ -1168,6 +1231,7 @@ GstPadProbeReturn FaceNvInferServerManager::sgie_pad_buffer_probe(
NvDsBatchMeta *batch_meta =
gst_buffer_get_nvds_batch_meta(GST_BUFFER(info->data));
if (!batch_meta) return GST_PAD_PROBE_OK;
bool is_zero_embedding_vector;
/* Iterate each frame metadata in batch */
for (NvDsMetaList *l_frame = batch_meta->frame_meta_list; l_frame != NULL;
@ -1280,8 +1344,7 @@ GstPadProbeReturn FaceNvInferServerManager::sgie_pad_buffer_probe(
unsigned int numDims = layer.inferDims.numDims;
unsigned int numElements = layer.inferDims.numElements;
// (void)numElements;
// (void)numDims;
(void)numElements;
// std::cout << "Layer " << jkl << " (" << layer.layerName
// << "):\n";
@ -1298,11 +1361,12 @@ GstPadProbeReturn FaceNvInferServerManager::sgie_pad_buffer_probe(
float *data_face = static_cast<float *>(layer.buffer);
if (!(strcmp(layer.layerName, "embedding") == 0)) {
for (unsigned int xyz = 0; xyz < numElements; xyz++) {
// for (unsigned int xyz = 0; xyz < numElements; xyz++)
// {
// std::cout << "data_face[" << xyz
// << "]= " << data_face[xyz] <<
// std::endl;
}
// }
if ((strcmp(layer.layerName, "bbox") == 0)) {
for (int l = 0; l < 4; l++) {
face_location[l] = data_face[l];
@ -1313,21 +1377,27 @@ GstPadProbeReturn FaceNvInferServerManager::sgie_pad_buffer_probe(
}
if ((strcmp(layer.layerName, "score") == 0)) {
score_face = data_face[0];
// std::cout << "score_face= " << score_face <<
// std::endl; if (score_face>0.9){
// std::cout << "score_face= " << score_face
// <<std::endl; if (score_face>0.9){
// high_confidence_faces++;
// std::cout << "high_confidence_faces= " <<
// high_confidence_faces << std::endl;
// std::quick_exit(0);
// }
}
}
// else{
// for (int l = 0; l < 512; l++) {
} else {
is_zero_embedding_vector = allZero(data_face, 512);
// std::cout<<"is_zero_embedding_vector =
// "<<is_zero_embedding_vector<<std::endl; for (int l =
// 0; l < 512; l++) {
// std::cout << "face_location[" << l
// << "]= " << data_face[l] << std::endl;
// }
// }
}
}
// std::quick_exit(0);
if (is_zero_embedding_vector == 0) {
;
}
if (score_face > threshold_face_detection) {

View File

@ -7,11 +7,12 @@
#include "gstnvdsmeta.h"
#include "nvds_version.h"
// #include "nvdsinfer_custom_impl.h"
#include <immintrin.h> // for AVX intrinsics
#include "config_manager.hpp"
#include "custom_gstnvdsinfer.hpp"
#include "nvdsmeta.h"
#include "nvdsmeta_schema.h"
#include "custom_gstnvdsinfer.hpp"
#include "config_manager.hpp"
class FaceNvInferServerManager {
private:
@ -51,8 +52,7 @@ class FaceNvInferServerManager {
// static gpointer copy_user_meta(gpointer, gpointer);
// static void release_user_meta(gpointer, gpointer);
static GstPadProbeReturn sgie_pad_buffer_probe(GstPad *, GstPadProbeInfo
*,
static GstPadProbeReturn sgie_pad_buffer_probe(GstPad *, GstPadProbeInfo *,
gpointer);
// static GstPadProbeReturn osd_sink_pad_buffer_probe_new(GstPad *,
// GstPadProbeInfo
@ -63,4 +63,6 @@ class FaceNvInferServerManager {
static NvOSD_RectParams *allign_postprocess(NvOSD_RectParams &, float *);
static float numpy_clip(float, float, float);
static void add_face_body(int, float);
static bool allZeroAVX(const float *, size_t);
static bool allZero(const float *, size_t);
};

View File

@ -30,7 +30,7 @@ bool NvOsdManager::create_nv_osd() {
}
// Attach probe to a pad in the pipeline
void NvOsdManager::attach_probe_to_element() {
void NvOsdManager::attach_probe_to_src_nvosd() {
GstPad *src_pad = gst_element_get_static_pad(nvosd, "src");
if (!src_pad) {
std::cerr << "Unable to get nvosd src pad\n";
@ -119,3 +119,22 @@ GstPadProbeReturn NvOsdManager::osd_src_pad_buffer_probe(GstPad *pad,
frame_number++;
return GST_PAD_PROBE_OK;
}
// Attach a buffer probe to the "sink" pad of the nvosd element.
// Looks up the static sink pad, registers osd_sink_pad_buffer_probe on it
// (no user data, no destroy notify) and then drops the pad reference.
// If the pad cannot be obtained, an error is printed and nothing is attached.
void NvOsdManager::attach_probe_to_sink_nvosd() {
    GstPad *sink_pad = gst_element_get_static_pad(nvosd, "sink");
    if (!sink_pad) {
        // Message names the sink pad (it previously said "src", copied from
        // the src-pad variant, which made the log misleading).
        std::cerr << "Unable to get nvosd sink pad\n";
        return;
    }

    gst_pad_add_probe(sink_pad, GST_PAD_PROBE_TYPE_BUFFER,
                      osd_sink_pad_buffer_probe, NULL, NULL);
    // Release the reference returned by gst_element_get_static_pad.
    gst_object_unref(sink_pad);
}
// Probe callback registered on the nvosd sink pad.
// Currently a pass-through stub: all three arguments are unnamed and unused,
// and the callback always returns GST_PAD_PROBE_OK.
GstPadProbeReturn NvOsdManager::osd_sink_pad_buffer_probe(GstPad *,
GstPadProbeInfo *,
gpointer) {
return GST_PAD_PROBE_OK;
}

View File

@ -13,8 +13,12 @@ class NvOsdManager {
bool create_nv_osd();
~NvOsdManager();
static gint frame_number;
void attach_probe_to_element();
void attach_probe_to_src_nvosd();
static GstPadProbeReturn osd_src_pad_buffer_probe(GstPad *,
GstPadProbeInfo *,
gpointer);
void attach_probe_to_sink_nvosd();
static GstPadProbeReturn osd_sink_pad_buffer_probe(GstPad *,
GstPadProbeInfo *,
gpointer);
};

View File

@ -1,4 +1,5 @@
#include "pipeline_manager.hpp"
#define GPU_ID 0
double PipelineManager::fps_buffer_probe = 0;
double PipelineManager::fps_probe = 0;
@ -595,6 +596,18 @@ bool PipelineManager::create_pipeline_elements(int num_sources,
sink_manager->create_sink(prop, rtsp_streaming_manager->host,
rtsp_streaming_manager->updsink_port_num);
sink_manager->create_fake_sink();
// Create Context for Object Encoding.
// Creates and initializes an object encoder context.
// This context manages resources such as GPU memory, encoders, and
// parameters (resolution, quality, scaling, etc.) needed for encoding
// objects into images. create this once per pipeline.
NvDsObjEncCtxHandle obj_ctx_handle = nvds_obj_enc_create_context(GPU_ID);
if (!obj_ctx_handle) {
g_print("Unable to create context\n");
return -1;
}
nv_infer_server_manager->create_nv_infer_server(num_sources);
// GstElement *nvinfer = gst_bin_get_by_name(GST_BIN(pipeline),
@ -611,7 +624,7 @@ bool PipelineManager::create_pipeline_elements(int num_sources,
face_nv_infer_server_manager->create_face_nv_infer_server(num_sources);
nv_osd_manager
->attach_probe_to_element(); // nvinfer Or use "nvtracker" if after
->attach_probe_to_src_nvosd(); // nvinfer Or use "nvtracker" if after
message_handling->create_message_handler(pipeline, g_run_forever, loop);
setup_pipeline();
@ -639,7 +652,7 @@ bool PipelineManager::create_pipeline_elements(int num_sources,
face_nv_infer_server_manager->face_detector, "src");
gst_pad_add_probe(sgie_src_pad, GST_PAD_PROBE_TYPE_BUFFER,
face_nv_infer_server_manager->sgie_pad_buffer_probe,
&use_new_mux, NULL);
(gpointer)obj_ctx_handle, NULL);
auto start = std::chrono::system_clock::now();
status_playing = playing_pipeline(num_sources, url_camera);
@ -676,6 +689,7 @@ bool PipelineManager::create_pipeline_elements(int num_sources,
/* Out of the main loop, clean up nicely */
g_print("Returned, stopping playback \n");
nvds_obj_enc_destroy_context(obj_ctx_handle);
/* Release the request pads from the tee, and unref them */
gst_element_release_request_pad(tee_manager->tee, tee_manager->tee_msg_pad);
gst_element_release_request_pad(tee_manager->tee,