[FFmpeg-devel] [PATCH 2/3] libavfilter/dnn: Add scale and mean preprocess to openvino backend

Wed Sep 20 05:26:08 EEST 2023

From: Wenbin Chen <wenbin.chen at intel.com>

Dnn models has different data preprocess requirements. Scale and mean
parameters are added to preprocess input data.

Signed-off-by: Wenbin Chen <wenbin.chen at intel.com>
---
 libavfilter/dnn/dnn_backend_openvino.c | 43 ++++++++++++--
 libavfilter/dnn/dnn_io_proc.c          | 82 +++++++++++++++++++++-----
 libavfilter/dnn_interface.h            |  2 +
 3 files changed, 108 insertions(+), 19 deletions(-)

diff --git a/libavfilter/dnn/dnn_backend_openvino.c b/libavfilter/dnn/dnn_backend_openvino.c
index 3ba5f5331a..4224600f94 100644
--- a/libavfilter/dnn/dnn_backend_openvino.c
+++ b/libavfilter/dnn/dnn_backend_openvino.c
@@ -46,6 +46,8 @@ typedef struct OVOptions{
     int batch_size;
     int input_resizable;
     DNNLayout layout;
+    float scale;
+    float mean;
 } OVOptions;
 
 typedef struct OVContext {
@@ -105,6 +107,8 @@ static const AVOption dnn_openvino_options[] = {
         { "none",  "none", 0, AV_OPT_TYPE_CONST, { .i64 = DL_NONE }, 0, 0, FLAGS, "layout"},
         { "nchw",  "nchw", 0, AV_OPT_TYPE_CONST, { .i64 = DL_NCHW }, 0, 0, FLAGS, "layout"},
         { "nhwc",  "nhwc", 0, AV_OPT_TYPE_CONST, { .i64 = DL_NHWC }, 0, 0, FLAGS, "layout"},
+    { "scale", "Add scale preprocess operation. Divide each element of input by specified value.", OFFSET(options.scale), AV_OPT_TYPE_FLOAT, { .dbl = 0 }, INT_MIN, INT_MAX, FLAGS},
+    { "mean",  "Add mean preprocess operation. Subtract specified value from each element of input.", OFFSET(options.mean),  AV_OPT_TYPE_FLOAT, { .dbl = 0 }, INT_MIN, INT_MAX, FLAGS},
     { NULL }
 };
 
@@ -209,6 +213,7 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request)
     ie_blob_t *input_blob = NULL;
 #endif
 
+    memset(&input, 0, sizeof(input));
     lltask = ff_queue_peek_front(ov_model->lltask_queue);
     av_assert0(lltask);
     task = lltask->task;
@@ -274,6 +279,9 @@ static int fill_model_input_ov(OVModel *ov_model, OVRequestItem *request)
     // all models in openvino open model zoo use BGR as input,
     // change to be an option when necessary.
     input.order = DCO_BGR;
+    // We use preprocess_steps to scale input data, so disable scale and mean here.
+    input.scale = 1;
+    input.mean = 0;
 
     for (int i = 0; i < ctx->options.batch_size; ++i) {
         lltask = ff_queue_pop_front(ov_model->lltask_queue);
@@ -343,6 +351,7 @@ static void infer_completion_callback(void *args)
     ov_shape_t output_shape = {0};
     ov_element_type_e precision;
 
+    memset(&output, 0, sizeof(output));
     status = ov_infer_request_get_output_tensor_by_index(request->infer_request, 0, &output_tensor);
     if (status != OK) {
         av_log(ctx, AV_LOG_ERROR,
@@ -409,6 +418,8 @@ static void infer_completion_callback(void *args)
 #endif
     output.dt       = precision_to_datatype(precision);
     output.layout   = ctx->options.layout;
+    output.scale    = ctx->options.scale;
+    output.mean     = ctx->options.mean;
 
     av_assert0(request->lltask_count >= 1);
     for (int i = 0; i < request->lltask_count; ++i) {
@@ -542,7 +553,9 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
     ie_config_t config = {NULL, NULL, NULL};
     char *all_dev_names = NULL;
 #endif
-
+    // We scale pixel by default when do frame processing.
+    if (fabsf(ctx->options.scale) < 1e-6f)
+        ctx->options.scale = ov_model->model->func_type == DFT_PROCESS_FRAME ? 255 : 1;
     // batch size
     if (ctx->options.batch_size <= 0) {
         ctx->options.batch_size = 1;
@@ -609,15 +622,37 @@ static int init_model_ov(OVModel *ov_model, const char *input_name, const char *
         goto err;
     }
 
+    status = ov_preprocess_input_tensor_info_set_element_type(input_tensor_info, U8);
     if (ov_model->model->func_type != DFT_PROCESS_FRAME)
-        //set precision only for detect and classify
-        status = ov_preprocess_input_tensor_info_set_element_type(input_tensor_info, U8);
-    status |= ov_preprocess_output_set_element_type(output_tensor_info, F32);
+        status |= ov_preprocess_output_set_element_type(output_tensor_info, F32);
+    else if (fabsf(ctx->options.scale - 1) > 1e-6f || fabsf(ctx->options.mean) > 1e-6f)
+        status |= ov_preprocess_output_set_element_type(output_tensor_info, F32);
+    else
+        status |= ov_preprocess_output_set_element_type(output_tensor_info, U8);
     if (status != OK) {
         av_log(ctx, AV_LOG_ERROR, "Failed to set input/output element type\n");
         ret = ov2_map_error(status, NULL);
         goto err;
     }
+    // set preprocess steps.
+    if (fabsf(ctx->options.scale - 1) > 1e-6f || fabsf(ctx->options.mean) > 1e-6f) {
+        ov_preprocess_preprocess_steps_t* input_process_steps = NULL;
+        status = ov_preprocess_input_info_get_preprocess_steps(ov_model->input_info, &input_process_steps);
+        if (status != OK) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to get preprocess steps\n");
+            ret = ov2_map_error(status, NULL);
+            goto err;
+        }
+        status = ov_preprocess_preprocess_steps_convert_element_type(input_process_steps, F32);
+        status |= ov_preprocess_preprocess_steps_mean(input_process_steps, ctx->options.mean);
+        status |= ov_preprocess_preprocess_steps_scale(input_process_steps, ctx->options.scale);
+        if (status != OK) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to set preprocess steps\n");
+            ret = ov2_map_error(status, NULL);
+            goto err;
+        }
+        ov_preprocess_preprocess_steps_free(input_process_steps);
+    }
 
     //update model
     if(ov_model->ov_model)
diff --git a/libavfilter/dnn/dnn_io_proc.c b/libavfilter/dnn/dnn_io_proc.c
index dfa0d5e5da..ab656e8ed7 100644
--- a/libavfilter/dnn/dnn_io_proc.c
+++ b/libavfilter/dnn/dnn_io_proc.c
@@ -24,6 +24,20 @@
 #include "libavutil/avassert.h"
 #include "libavutil/detection_bbox.h"
 
+static int get_datatype_size(DNNDataType dt)
+{
+    switch (dt)
+    {
+    case DNN_FLOAT:
+        return sizeof(float);
+    case DNN_UINT8:
+        return sizeof(uint8_t);
+    default:
+        av_assert0(!"not supported yet.");
+        return 1;
+    }
+}
+
 int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx)
 {
     struct SwsContext *sws_ctx;
@@ -33,14 +47,26 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx)
     void *middle_data = NULL;
     uint8_t *planar_data[4] = { 0 };
     int plane_size = frame->width * frame->height * sizeof(uint8_t);
+    enum AVPixelFormat src_fmt = AV_PIX_FMT_NONE;
+    int src_datatype_size = get_datatype_size(output->dt);
+
     int bytewidth = av_image_get_linesize(frame->format, frame->width, 0);
     if (bytewidth < 0) {
         return AVERROR(EINVAL);
     }
-    if (output->dt != DNN_FLOAT) {
-        avpriv_report_missing_feature(log_ctx, "data type rather than DNN_FLOAT");
+    /* scale == 1 and mean == 0 and dt == UINT8: passthrough */
+    if (fabsf(output->scale - 1) < 1e-6f && fabsf(output->mean) < 1e-6 && output->dt == DNN_UINT8)
+        src_fmt = AV_PIX_FMT_GRAY8;
+    /* (scale == 255 or scale == 0) and mean == 0 and dt == FLOAT: normalization */
+    else if ((fabsf(output->scale - 255) < 1e-6f || fabsf(output->scale) < 1e-6f) &&
+             fabsf(output->mean) < 1e-6 && output->dt == DNN_FLOAT)
+        src_fmt = AV_PIX_FMT_GRAYF32;
+    else {
+        av_log(log_ctx, AV_LOG_ERROR, "dnn_process output data doesn't type: UINT8 "
+                                      "scale: %f, mean: %f\n", output->scale, output->mean);
         return AVERROR(ENOSYS);
     }
+
     dst_data = (void **)frame->data;
     linesize[0] = frame->linesize[0];
     if (output->layout == DL_NCHW) {
@@ -58,7 +84,7 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx)
     case AV_PIX_FMT_BGR24:
         sws_ctx = sws_getContext(frame->width * 3,
                                  frame->height,
-                                 AV_PIX_FMT_GRAYF32,
+                                 src_fmt,
                                  frame->width * 3,
                                  frame->height,
                                  AV_PIX_FMT_GRAY8,
@@ -66,13 +92,13 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx)
         if (!sws_ctx) {
             av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion "
                 "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n",
-                av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32), frame->width * 3, frame->height,
+                av_get_pix_fmt_name(src_fmt), frame->width * 3, frame->height,
                 av_get_pix_fmt_name(AV_PIX_FMT_GRAY8),   frame->width * 3, frame->height);
             ret = AVERROR(EINVAL);
             goto err;
         }
         sws_scale(sws_ctx, (const uint8_t *[4]){(const uint8_t *)output->data, 0, 0, 0},
-                           (const int[4]){frame->width * 3 * sizeof(float), 0, 0, 0}, 0, frame->height,
+                           (const int[4]){frame->width * 3 * src_datatype_size, 0, 0, 0}, 0, frame->height,
                            (uint8_t * const*)dst_data, linesize);
         sws_freeContext(sws_ctx);
         // convert data from planar to packed
@@ -131,13 +157,13 @@ int ff_proc_from_dnn_to_frame(AVFrame *frame, DNNData *output, void *log_ctx)
         if (!sws_ctx) {
             av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion "
                 "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n",
-                av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32), frame->width, frame->height,
+                av_get_pix_fmt_name(src_fmt), frame->width, frame->height,
                 av_get_pix_fmt_name(AV_PIX_FMT_GRAY8),   frame->width, frame->height);
             ret = AVERROR(EINVAL);
             goto err;
         }
         sws_scale(sws_ctx, (const uint8_t *[4]){(const uint8_t *)output->data, 0, 0, 0},
-                           (const int[4]){frame->width * sizeof(float), 0, 0, 0}, 0, frame->height,
+                           (const int[4]){frame->width * src_datatype_size, 0, 0, 0}, 0, frame->height,
                            (uint8_t * const*)frame->data, frame->linesize);
         sws_freeContext(sws_ctx);
         break;
@@ -161,12 +187,22 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx)
     void *middle_data = NULL;
     uint8_t *planar_data[4] = { 0 };
     int plane_size = frame->width * frame->height * sizeof(uint8_t);
+    enum AVPixelFormat dst_fmt = AV_PIX_FMT_NONE;
+    int dst_datatype_size = get_datatype_size(input->dt);
     int bytewidth = av_image_get_linesize(frame->format, frame->width, 0);
     if (bytewidth < 0) {
         return AVERROR(EINVAL);
     }
-    if (input->dt != DNN_FLOAT) {
-        avpriv_report_missing_feature(log_ctx, "data type rather than DNN_FLOAT");
+    /* scale == 1 and mean == 0 and dt == UINT8: passthrough */
+    if (fabsf(input->scale - 1) < 1e-6f && fabsf(input->mean) < 1e-6 && input->dt == DNN_UINT8)
+        dst_fmt = AV_PIX_FMT_GRAY8;
+    /* (scale == 255 or scale == 0) and mean == 0 and dt == FLOAT: normalization */
+    else if ((fabsf(input->scale - 255) < 1e-6f || fabsf(input->scale) < 1e-6f) &&
+             fabsf(input->mean) < 1e-6 && input->dt == DNN_FLOAT)
+        dst_fmt = AV_PIX_FMT_GRAYF32;
+    else {
+        av_log(log_ctx, AV_LOG_ERROR, "dnn_process input data doesn't support type: UINT8 "
+                                      "scale: %f, mean: %f\n", input->scale, input->mean);
         return AVERROR(ENOSYS);
     }
 
@@ -223,20 +259,20 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx)
                                  AV_PIX_FMT_GRAY8,
                                  frame->width * 3,
                                  frame->height,
-                                 AV_PIX_FMT_GRAYF32,
+                                 dst_fmt,
                                  0, NULL, NULL, NULL);
         if (!sws_ctx) {
             av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion "
                 "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n",
                 av_get_pix_fmt_name(AV_PIX_FMT_GRAY8),  frame->width * 3, frame->height,
-                av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32),frame->width * 3, frame->height);
+                av_get_pix_fmt_name(dst_fmt),frame->width * 3, frame->height);
             ret = AVERROR(EINVAL);
             goto err;
         }
         sws_scale(sws_ctx, (const uint8_t **)src_data,
                            linesize, 0, frame->height,
                            (uint8_t * const [4]){input->data, 0, 0, 0},
-                           (const int [4]){frame->width * 3 * sizeof(float), 0, 0, 0});
+                           (const int [4]){frame->width * 3 * dst_datatype_size, 0, 0, 0});
         sws_freeContext(sws_ctx);
         break;
     case AV_PIX_FMT_GRAYF32:
@@ -256,20 +292,20 @@ int ff_proc_from_frame_to_dnn(AVFrame *frame, DNNData *input, void *log_ctx)
                                  AV_PIX_FMT_GRAY8,
                                  frame->width,
                                  frame->height,
-                                 AV_PIX_FMT_GRAYF32,
+                                 dst_fmt,
                                  0, NULL, NULL, NULL);
         if (!sws_ctx) {
             av_log(log_ctx, AV_LOG_ERROR, "Impossible to create scale context for the conversion "
                 "fmt:%s s:%dx%d -> fmt:%s s:%dx%d\n",
                 av_get_pix_fmt_name(AV_PIX_FMT_GRAY8),  frame->width, frame->height,
-                av_get_pix_fmt_name(AV_PIX_FMT_GRAYF32),frame->width, frame->height);
+                av_get_pix_fmt_name(dst_fmt),frame->width, frame->height);
             ret = AVERROR(EINVAL);
             goto err;
         }
         sws_scale(sws_ctx, (const uint8_t **)frame->data,
                            frame->linesize, 0, frame->height,
                            (uint8_t * const [4]){input->data, 0, 0, 0},
-                           (const int [4]){frame->width * sizeof(float), 0, 0, 0});
+                           (const int [4]){frame->width * dst_datatype_size, 0, 0, 0});
         sws_freeContext(sws_ctx);
         break;
     default:
@@ -315,6 +351,14 @@ int ff_frame_to_dnn_classify(AVFrame *frame, DNNData *input, uint32_t bbox_index
     AVFrameSideData *sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
     av_assert0(sd);
 
+    /* (scale != 1 and scale != 0) or mean != 0 */
+    if ((fabsf(input->scale - 1) > 1e-6f && fabsf(input->scale) > 1e-6f) ||
+        fabsf(input->mean) > 1e-6f) {
+        av_log(log_ctx, AV_LOG_ERROR, "dnn_classify input data doesn't support "
+                                      "scale: %f, mean: %f\n", input->scale, input->mean);
+        return AVERROR(ENOSYS);
+    }
+
     if (input->layout == DL_NCHW) {
         av_log(log_ctx, AV_LOG_ERROR, "dnn_classify input data doesn't support layout: NCHW\n");
         return AVERROR(ENOSYS);
@@ -373,6 +417,14 @@ int ff_frame_to_dnn_detect(AVFrame *frame, DNNData *input, void *log_ctx)
     int ret = 0;
     enum AVPixelFormat fmt = get_pixel_format(input);
 
+    /* (scale != 1 and scale != 0) or mean != 0 */
+    if ((fabsf(input->scale - 1) > 1e-6f && fabsf(input->scale) > 1e-6f) ||
+        fabsf(input->mean) > 1e-6f) {
+        av_log(log_ctx, AV_LOG_ERROR, "dnn_detect input data doesn't support "
+                                      "scale: %f, mean: %f\n", input->scale, input->mean);
+        return AVERROR(ENOSYS);
+    }
+
     if (input->layout == DL_NCHW) {
         av_log(log_ctx, AV_LOG_ERROR, "dnn_detect input data doesn't support layout: NCHW\n");
         return AVERROR(ENOSYS);
diff --git a/libavfilter/dnn_interface.h b/libavfilter/dnn_interface.h
index 956a63443a..183d8418b2 100644
--- a/libavfilter/dnn_interface.h
+++ b/libavfilter/dnn_interface.h
@@ -69,6 +69,8 @@ typedef struct DNNData{
     DNNDataType dt;
     DNNColorOrder order;
     DNNLayout layout;
+    float scale;
+    float mean;
 } DNNData;
 
 typedef struct DNNExecBaseParams {
-- 
2.34.1