[FFmpeg-devel] [PATCH] libavfilter: Add more operation supports in FFmpeg dnn native mode.

Sun Apr 28 13:28:21 EEST 2019

On Sun, Apr 28, 2019 at 5:27 PM <xwmeng at pku.edu.cn> wrote:
>
> This patch supports the derain filter project in GSoC. It adds support for the following operations:
>
>
>
>
>  (1) Conv padding method: "SAME" and "VALID"
>
>  (2) Dilation
>
>  (3) Activation: "NONE" and "LEAKY_RELU"
>
>
>
>
> These operations are all needed by the derain filter. If the dnn native mode in FFmpeg is modified in this way, the generation process of the Super Resolution model should be changed accordingly, e.g. by adding a padding method parameter (= 0) and a dilation parameter (= 1).
>
>
>
>
> In addition, I have a question about the Super Resolution implementation. The model training process of SR uses the "VALID" method. According to my understanding of "VALID" mode in TensorFlow, the output image should be smaller than it is in the current SR design, because pixels near the boundary are not processed in "VALID" mode. However, in the current dnn native mode these unprocessed pixels are filled with adjacent pixels. I wonder why it is done this way.
>
>
>
>
> From 4d92ef21a5acf064122c51f442d0e2f5437b3343 Mon Sep 17 00:00:00 2001
> From: Xuewei Meng <xwmeng at pku.edu.cn>
> Date: Sun, 28 Apr 2019 17:21:35 +0800
> Subject: [PATCH] Add operation supports in dnn_native
>
> Signed-off-by: Xuewei Meng <xwmeng at pku.edu.cn>
> ---
>  libavfilter/dnn_backend_native.c | 36 +++++++++++++++++++++-----------
>  libavfilter/dnn_backend_native.h |  6 +++++-
>  2 files changed, 29 insertions(+), 13 deletions(-)
>
> diff --git a/libavfilter/dnn_backend_native.c b/libavfilter/dnn_backend_native.c
> index 70d857f5f2..0e3ef5d64d 100644
> --- a/libavfilter/dnn_backend_native.c
> +++ b/libavfilter/dnn_backend_native.c
> @@ -157,13 +157,15 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
>                  ff_dnn_free_model_native(&model);
>                  return NULL;
>              }
> +            conv_params->dilation = (int32_t)avio_rl32(model_file_context);
> +            conv_params->padding_method = (int32_t)avio_rl32(model_file_context);
>              conv_params->activation = (int32_t)avio_rl32(model_file_context);
>              conv_params->input_num = (int32_t)avio_rl32(model_file_context);
>              conv_params->output_num = (int32_t)avio_rl32(model_file_context);
>              conv_params->kernel_size = (int32_t)avio_rl32(model_file_context);
>              kernel_size = conv_params->input_num * conv_params->output_num *
>                            conv_params->kernel_size * conv_params->kernel_size;
> -            dnn_size += 16 + (kernel_size + conv_params->output_num << 2);
> +            dnn_size += 24 + (kernel_size + conv_params->output_num << 2);
Could you add a comment explaining the numbers 16 and 24?
>              if (dnn_size > file_size || conv_params->input_num <= 0 ||
>                  conv_params->output_num <= 0 || conv_params->kernel_size <= 0){
>                  avio_closep(&model_file_context);
> @@ -221,23 +223,28 @@ DNNModel *ff_dnn_load_model_native(const char *model_filename)
>
>  static void convolve(const float *input, float *output, const ConvolutionalParams *conv_params, int width, int height)
>  {
> -    int y, x, n_filter, ch, kernel_y, kernel_x;
Why remove this declaration line?
>      int radius = conv_params->kernel_size >> 1;
>      int src_linesize = width * conv_params->input_num;
>      int filter_linesize = conv_params->kernel_size * conv_params->input_num;
>      int filter_size = conv_params->kernel_size * filter_linesize;
> +    int pad_size = (conv_params->padding_method == VALID) ? (conv_params->kernel_size - 1) / 2 * conv_params->dilation : 0;
>
> -    for (y = 0; y < height; ++y){
> -        for (x = 0; x < width; ++x){
> -            for (n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
> +    for (int y = pad_size; y < height - pad_size; ++y){
> +        for (int x = pad_size; x < width - pad_size; ++x){
> +            for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter){
>                  output[n_filter] = conv_params->biases[n_filter];
> -                for (ch = 0; ch < conv_params->input_num; ++ch){
> -                    for (kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
> -                        for (kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
> -                            output[n_filter] += input[CLAMP_TO_EDGE(y + kernel_y - radius, height) * src_linesize +
> -                                                      CLAMP_TO_EDGE(x + kernel_x - radius, width) * conv_params->input_num + ch] *
> -                                                conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> -                                                                    kernel_x * conv_params->input_num + ch];
> +
> +                for (int ch = 0; ch < conv_params->input_num; ++ch){
> +                    for (int kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y){
> +                        for (int kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x){
> +                            int y_pos = y + (kernel_y - radius) * conv_params->dilation;
> +                            int x_pos = x + (kernel_x - radius) * conv_params->dilation;
> +
> +                            float input_pel = (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) ? 0.0 :
> +                                               input[y_pos * src_linesize + x_pos * conv_params->input_num + ch];
> +
> +                            output[n_filter] += input_pel * conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
> +                                                                                kernel_x * conv_params->input_num + ch];
>                          }
>                      }
>                  }
> @@ -250,6 +257,11 @@ static void convolve(const float *input, float *output, const ConvolutionalParam
>                      break;
>                  case SIGMOID:
>                      output[n_filter] = 1.0f / (1.0f + exp(-output[n_filter]));
> +                    break;
> +                case NONE:
> +                    break;
> +                case LEAKY_RELU:
> +                    output[n_filter] = FFMAX(output[n_filter], 0.0) + 0.2 * FFMIN(output[n_filter], 0.0);
>                  }
>              }
>              output += conv_params->output_num;
> diff --git a/libavfilter/dnn_backend_native.h b/libavfilter/dnn_backend_native.h
> index 51d4cac955..f7d4eb823b 100644
> --- a/libavfilter/dnn_backend_native.h
> +++ b/libavfilter/dnn_backend_native.h
> @@ -32,7 +32,9 @@
>
>  typedef enum {INPUT, CONV, DEPTH_TO_SPACE} DNNLayerType;
>
> -typedef enum {RELU, TANH, SIGMOID} DNNActivationFunc;
> +typedef enum {RELU, TANH, SIGMOID, NONE, LEAKY_RELU} DNNActivationFunc;
> +
> +typedef enum {VALID, SAME} DNNPaddingFunc;
>
>  typedef struct Layer{
>      DNNLayerType type;
> @@ -43,6 +45,8 @@ typedef struct Layer{
>  typedef struct ConvolutionalParams{
>      int32_t input_num, output_num, kernel_size;
>      DNNActivationFunc activation;
> +    DNNPaddingFunc padding_method;
> +    int32_t dilation;
>      float *kernel;
>      float *biases;
>  } ConvolutionalParams;
> --
> 2.17.1