[FFmpeg-devel] [PATCH 2/2] libavfilter/vf_dnn_detect: Add two outputs ssd support

Wed Dec 27 06:16:58 EET 2023

From: Wenbin Chen <wenbin.chen at intel.com>

For this kind of model, we can use its output directly as the final result,
just like the ssd model. The difference is that the output is split into two
tensors: [x_min, y_min, x_max, y_max, confidence] and [label_id].

For a model example, refer to: https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/intel/person-detection-0106

Signed-off-by: Wenbin Chen <wenbin.chen at intel.com>
---
 libavfilter/vf_dnn_detect.c | 64 +++++++++++++++++++++++++++++--------
 1 file changed, 50 insertions(+), 14 deletions(-)

diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
index 88865c8a8e..249cbba0f7 100644
--- a/libavfilter/vf_dnn_detect.c
+++ b/libavfilter/vf_dnn_detect.c
@@ -359,24 +359,48 @@ static int dnn_detect_post_proc_yolov3(AVFrame *frame, DNNData *output,
     return 0;
 }
 
-static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
+static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, int nb_outputs,
+                                    AVFilterContext *filter_ctx)
 {
     DnnDetectContext *ctx = filter_ctx->priv;
     float conf_threshold = ctx->confidence;
-    int proposal_count = output->height;
-    int detect_size = output->width;
-    float *detections = output->data;
+    int proposal_count = 0;
+    int detect_size = 0;
+    float *detections = NULL, *labels = NULL;
     int nb_bboxes = 0;
     AVDetectionBBoxHeader *header;
     AVDetectionBBox *bbox;
-
-    if (output->width != 7) {
+    int scale_w = ctx->scale_width;
+    int scale_h = ctx->scale_height;
+
+    if (nb_outputs == 1 && output->width == 7) {
+        proposal_count = output->height;
+        detect_size = output->width;
+        detections = output->data;
+    } else if (nb_outputs == 2 && output[0].width == 5) {
+        proposal_count = output[0].height;
+        detect_size = output[0].width;
+        detections = output[0].data;
+        labels = output[1].data;
+    } else if (nb_outputs == 2 && output[1].width == 5) {
+        proposal_count = output[1].height;
+        detect_size = output[1].width;
+        detections = output[1].data;
+        labels = output[0].data;
+    } else {
         av_log(filter_ctx, AV_LOG_ERROR, "Model output shape doesn't match ssd requirement.\n");
         return AVERROR(EINVAL);
     }
 
+    if (proposal_count == 0)
+        return 0;
+
     for (int i = 0; i < proposal_count; ++i) {
-        float conf = detections[i * detect_size + 2];
+        float conf;
+        if (nb_outputs == 1)
+            conf = detections[i * detect_size + 2];
+        else
+            conf = detections[i * detect_size + 4];
         if (conf < conf_threshold) {
             continue;
         }
@@ -398,12 +422,24 @@ static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterCon
 
     for (int i = 0; i < proposal_count; ++i) {
         int av_unused image_id = (int)detections[i * detect_size + 0];
-        int label_id = (int)detections[i * detect_size + 1];
-        float conf   =      detections[i * detect_size + 2];
-        float x0     =      detections[i * detect_size + 3];
-        float y0     =      detections[i * detect_size + 4];
-        float x1     =      detections[i * detect_size + 5];
-        float y1     =      detections[i * detect_size + 6];
+        int label_id;
+        float conf, x0, y0, x1, y1;
+
+        if (nb_outputs == 1) {
+            label_id = (int)detections[i * detect_size + 1];
+            conf = detections[i * detect_size + 2];
+            x0   = detections[i * detect_size + 3];
+            y0   = detections[i * detect_size + 4];
+            x1   = detections[i * detect_size + 5];
+            y1   = detections[i * detect_size + 6];
+        } else {
+            label_id = (int)labels[i];
+            x0     =      detections[i * detect_size] / scale_w;
+            y0     =      detections[i * detect_size + 1] / scale_h;
+            x1     =      detections[i * detect_size + 2] / scale_w;
+            y1     =      detections[i * detect_size + 3] / scale_h;
+            conf   =      detections[i * detect_size + 4];
+        }
 
         if (conf < conf_threshold) {
             continue;
@@ -447,7 +483,7 @@ static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outpu
 
     switch (ctx->model_type) {
     case DDMT_SSD:
-        ret = dnn_detect_post_proc_ssd(frame, output, filter_ctx);
+        ret = dnn_detect_post_proc_ssd(frame, output, nb_outputs, filter_ctx);
         if (ret < 0)
             return ret;
         break;
-- 
2.34.1