[FFmpeg-cvslog] libavfilter/vf_dnn_detect: Add two outputs ssd support
    Wenbin Chen 
    git at videolan.org
       
    Sat Dec 30 06:52:15 EET 2023
    
    
  
ffmpeg | branch: master | Wenbin Chen <wenbin.chen at intel.com> | Wed Dec 27 12:16:58 2023 +0800| [56c5930ec36be3bc63a27913d6f0a199912220c5] | committer: Guo Yejun
libavfilter/vf_dnn_detect: Add two outputs ssd support
For this kind of model, we can directly use its output as final result
just like ssd model. The difference is that it splits output into two
tensors. [x_min, y_min, x_max, y_max, confidence] and [lable_id].
Model example refer to: https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/intel/person-detection-0106
Signed-off-by: Wenbin Chen <wenbin.chen at intel.com>
Reviewed-by: Guo Yejun <yejun.guo at intel.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=56c5930ec36be3bc63a27913d6f0a199912220c5
---
 libavfilter/vf_dnn_detect.c | 64 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 14 deletions(-)
diff --git a/libavfilter/vf_dnn_detect.c b/libavfilter/vf_dnn_detect.c
index 88865c8a8e..249cbba0f7 100644
--- a/libavfilter/vf_dnn_detect.c
+++ b/libavfilter/vf_dnn_detect.c
@@ -359,24 +359,48 @@ static int dnn_detect_post_proc_yolov3(AVFrame *frame, DNNData *output,
     return 0;
 }
 
-static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterContext *filter_ctx)
+static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, int nb_outputs,
+                                    AVFilterContext *filter_ctx)
 {
     DnnDetectContext *ctx = filter_ctx->priv;
     float conf_threshold = ctx->confidence;
-    int proposal_count = output->height;
-    int detect_size = output->width;
-    float *detections = output->data;
+    int proposal_count = 0;
+    int detect_size = 0;
+    float *detections = NULL, *labels = NULL;
     int nb_bboxes = 0;
     AVDetectionBBoxHeader *header;
     AVDetectionBBox *bbox;
-
-    if (output->width != 7) {
+    int scale_w = ctx->scale_width;
+    int scale_h = ctx->scale_height;
+
+    if (nb_outputs == 1 && output->width == 7) {
+        proposal_count = output->height;
+        detect_size = output->width;
+        detections = output->data;
+    } else if (nb_outputs == 2 && output[0].width == 5) {
+        proposal_count = output[0].height;
+        detect_size = output[0].width;
+        detections = output[0].data;
+        labels = output[1].data;
+    } else if (nb_outputs == 2 && output[1].width == 5) {
+        proposal_count = output[1].height;
+        detect_size = output[1].width;
+        detections = output[1].data;
+        labels = output[0].data;
+    } else {
         av_log(filter_ctx, AV_LOG_ERROR, "Model output shape doesn't match ssd requirement.\n");
         return AVERROR(EINVAL);
     }
 
+    if (proposal_count == 0)
+        return 0;
+
     for (int i = 0; i < proposal_count; ++i) {
-        float conf = detections[i * detect_size + 2];
+        float conf;
+        if (nb_outputs == 1)
+            conf = detections[i * detect_size + 2];
+        else
+            conf = detections[i * detect_size + 4];
         if (conf < conf_threshold) {
             continue;
         }
@@ -398,12 +422,24 @@ static int dnn_detect_post_proc_ssd(AVFrame *frame, DNNData *output, AVFilterCon
 
     for (int i = 0; i < proposal_count; ++i) {
         int av_unused image_id = (int)detections[i * detect_size + 0];
-        int label_id = (int)detections[i * detect_size + 1];
-        float conf   =      detections[i * detect_size + 2];
-        float x0     =      detections[i * detect_size + 3];
-        float y0     =      detections[i * detect_size + 4];
-        float x1     =      detections[i * detect_size + 5];
-        float y1     =      detections[i * detect_size + 6];
+        int label_id;
+        float conf, x0, y0, x1, y1;
+
+        if (nb_outputs == 1) {
+            label_id = (int)detections[i * detect_size + 1];
+            conf = detections[i * detect_size + 2];
+            x0   = detections[i * detect_size + 3];
+            y0   = detections[i * detect_size + 4];
+            x1   = detections[i * detect_size + 5];
+            y1   = detections[i * detect_size + 6];
+        } else {
+            label_id = (int)labels[i];
+            x0     =      detections[i * detect_size] / scale_w;
+            y0     =      detections[i * detect_size + 1] / scale_h;
+            x1     =      detections[i * detect_size + 2] / scale_w;
+            y1     =      detections[i * detect_size + 3] / scale_h;
+            conf   =      detections[i * detect_size + 4];
+        }
 
         if (conf < conf_threshold) {
             continue;
@@ -447,7 +483,7 @@ static int dnn_detect_post_proc_ov(AVFrame *frame, DNNData *output, int nb_outpu
 
     switch (ctx->model_type) {
     case DDMT_SSD:
-        ret = dnn_detect_post_proc_ssd(frame, output, filter_ctx);
+        ret = dnn_detect_post_proc_ssd(frame, output, nb_outputs, filter_ctx);
         if (ret < 0)
             return ret;
         break;
    
    
More information about the ffmpeg-cvslog
mailing list