[FFmpeg-devel] [PATCH 2/3][GSoC] Add x86-sse4 optimization for dnn_execute_layer_conv2d
xujunzz at sjtu.edu.cn
Mon Aug 31 20:03:42 EEST 2020
From: Xu Jun <xujunzz at sjtu.edu.cn>
Can be tested with command "./ffmpeg_g -i input.png -vf \
format=yuvj420p,dnn_processing=dnn_backend=native:model=\
espcn.model:input=x:output=y -y sr_native.jpg -benchmark \
-cpuflags 0x100"
before patch: utime=20.817s stime=0.047s rtime=1.051s
after patch: utime=3.744s stime=0.037s rtime=0.252s
Signed-off-by: Xu Jun <xujunzz at sjtu.edu.cn>
---
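Note for reviewers (illustration only, not part of the patch): the core of
the new SSE4 path is a per-channel multiply-accumulate over four floats at a
time, mirroring the movups/mulps/addps/haddps sequence in the asm. A rough C
intrinsics equivalent of one (kernel_y, kernel_x) tap could look like the
sketch below; it assumes input_num is a multiple of 4 (the asm channel loop
also steps by 4), and note that the real asm keeps a single accumulator
across all taps of a filter and reduces it only once per output value.

#include <smmintrin.h> /* SSE intrinsics; _mm_hadd_ps comes from SSE3 */

static float conv2d_tap_sse4_sketch(const float *input_pel,
                                    const float *weights, int input_num)
{
    __m128 acc = _mm_setzero_ps();
    for (int ch = 0; ch < input_num; ch += 4) {
        __m128 in = _mm_loadu_ps(input_pel + ch); /* 4 consecutive channels    */
        __m128 kw = _mm_loadu_ps(weights + ch);   /* 4 matching filter weights */
        acc = _mm_add_ps(acc, _mm_mul_ps(in, kw));
    }
    acc = _mm_hadd_ps(acc, acc); /* horizontal sum of the 4 partial products */
    acc = _mm_hadd_ps(acc, acc);
    return _mm_cvtss_f32(acc);
}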
libavfilter/dnn/Makefile | 1 +
.../dnn/dnn_backend_native_layer_conv2d.c | 123 ++++++++--
.../dnn_backend_native_layer_conv2d_x86.asm | 214 ++++++++++++++++++
3 files changed, 314 insertions(+), 24 deletions(-)
create mode 100644 libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm
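Note for reviewers (illustration only, not part of the patch): the asm file
added below reaches the members of execute_data through hard-coded byte
offsets (the %define block plus the three pointer loads at 14*4, 14*4 + 8
and 14*4 + 2*8). Assuming the usual x86-64 ABI layout (14 ints followed by
three 8-byte-aligned pointers), that dependency could be documented with
compile-time checks such as:

#include <stddef.h>

/* struct copied from the patch so the checks are self-contained */
typedef struct execute_data {
    int thread_start, thread_end, input_num, output_num, kernel_size, padding_method, dilation;
    int pad_size, width, height, radius, src_linesize, filter_size, filter_linesize;
    float *input;
    float *output;
    float *kernel;
} execute_data;

/* offsets the asm assumes: input at byte 56, output at 64, kernel at 72 */
_Static_assert(offsetof(execute_data, input)  == 14 * 4,      "asm expects input at 14*4");
_Static_assert(offsetof(execute_data, output) == 14 * 4 + 8,  "asm expects output at 14*4 + 8");
_Static_assert(offsetof(execute_data, kernel) == 14 * 4 + 16, "asm expects kernel at 14*4 + 2*8");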
diff --git a/libavfilter/dnn/Makefile b/libavfilter/dnn/Makefile
index e0957073ee..bdd334b192 100644
--- a/libavfilter/dnn/Makefile
+++ b/libavfilter/dnn/Makefile
@@ -8,6 +8,7 @@ OBJS-$(CONFIG_DNN) += dnn/dnn_backend_native_layer_dep
OBJS-$(CONFIG_DNN) += dnn/dnn_backend_native_layer_maximum.o
OBJS-$(CONFIG_DNN) += dnn/dnn_backend_native_layer_mathbinary.o
OBJS-$(CONFIG_DNN) += dnn/dnn_backend_native_layer_mathunary.o
+OBJS-$(CONFIG_DNN) += dnn/dnn_backend_native_layer_conv2d_x86.o
DNN-OBJS-$(CONFIG_LIBTENSORFLOW) += dnn/dnn_backend_tf.o
DNN-OBJS-$(CONFIG_LIBOPENVINO) += dnn/dnn_backend_openvino.o
diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
index 570b974052..92cc5313dc 100644
--- a/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d.c
@@ -21,6 +21,7 @@
#include "libavutil/avassert.h"
#include "libavutil/thread.h"
#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
#include "dnn_backend_native_layer_conv2d.h"
#define CLAMP_TO_EDGE(x, w) ((x) < 0 ? 0 : ((x) >= (w) ? (w - 1) : (x)))
@@ -34,8 +35,20 @@ typedef struct thread_data{
NativeContext *ctx;
int32_t thread_num;
int32_t thread_index;
+ int step;
} thread_data;
+typedef struct execute_data{
+ int thread_start, thread_end, input_num, output_num, kernel_size, padding_method, dilation;
+ int pad_size, width, height, radius, src_linesize, filter_size, filter_linesize;
+ float *input;
+ float *output;
+ float *kernel;
+} execute_data;
+
+void ff_dnn_execute_layer_conv2d_sse4(execute_data *execute_data);
+void ff_dnn_execute_layer_conv2d_c(execute_data *execute_data);
+
int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int file_size, int operands_num)
{
ConvolutionalParams *conv_params;
@@ -101,6 +114,56 @@ int dnn_load_layer_conv2d(Layer *layer, AVIOContext *model_file_context, int fil
return dnn_size;
}
+void ff_dnn_execute_layer_conv2d_c(execute_data *execute_data){
+ int thread_start = execute_data->thread_start;
+ int thread_end = execute_data->thread_end;
+ float *input = execute_data->input;
+ float *output = execute_data->output;
+ float *kernel = execute_data->kernel;
+ int input_num = execute_data->input_num;
+ int output_num = execute_data->output_num;
+ int kernel_size = execute_data->kernel_size;
+ int padding_method = execute_data->padding_method;
+ int dilation = execute_data->dilation;
+ int pad_size = execute_data->pad_size;
+ int width = execute_data->width;
+ int height = execute_data->height;
+ int radius = execute_data->radius;
+ int src_linesize = execute_data->src_linesize;
+ int filter_size = execute_data->filter_size;
+ int filter_linesize = execute_data->filter_linesize;
+
+ for (int y = thread_start; y < thread_end; ++y) {
+ for (int x = pad_size; x < width - pad_size; ++x) {
+ for (int n_filter = 0; n_filter < output_num; ++n_filter) {
+ output[n_filter] = 0.0f;
+ for (int ch = 0; ch < input_num; ++ch) {
+ for (int kernel_y = 0; kernel_y < kernel_size; ++kernel_y) {
+ for (int kernel_x = 0; kernel_x < kernel_size; ++kernel_x) {
+ float input_pel;
+ if (padding_method == SAME_CLAMP_TO_EDGE) {
+ int y_pos = CLAMP_TO_EDGE(y + (kernel_y - radius) * dilation, height);
+ int x_pos = CLAMP_TO_EDGE(x + (kernel_x - radius) * dilation, width);
+ input_pel = input[y_pos * src_linesize + x_pos * input_num + ch];
+ } else {
+ int y_pos = y + (kernel_y - radius) * dilation;
+ int x_pos = x + (kernel_x - radius) * dilation;
+ input_pel = (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) ? 0.0 :
+ input[y_pos * src_linesize + x_pos * input_num + ch];
+ }
+
+
+ output[n_filter] += input_pel * kernel[n_filter * filter_size + kernel_y * filter_linesize +
+ kernel_x * input_num + ch];
+ }
+ }
+ }
+ }
+ output += output_num;
+ }
+ }
+}
+
static void * dnn_execute_layer_conv2d_thread(void *threadarg)
{
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
@@ -160,35 +223,40 @@ static void * dnn_execute_layer_conv2d_thread(void *threadarg)
av_assert0(channel == conv_params->input_num);
+ struct execute_data *execute_data;
+ execute_data = av_malloc(sizeof(*execute_data));
+ execute_data->thread_start = thread_start;
+ execute_data->thread_end = thread_end;
+ execute_data->input = input;
+ execute_data->output = output;
+ execute_data->kernel = conv_params->kernel;
+ execute_data->input_num = conv_params->input_num;
+ execute_data->output_num = conv_params->output_num;
+ execute_data->kernel_size = conv_params->kernel_size;
+ execute_data->padding_method = conv_params->padding_method;
+ execute_data->dilation = conv_params->dilation;
+ execute_data->pad_size = pad_size;
+ execute_data->width = width;
+ execute_data->height = height;
+ execute_data->radius = radius;
+ execute_data->src_linesize = src_linesize;
+ execute_data->filter_size = filter_size;
+ execute_data->filter_linesize = filter_linesize;
+ if ((thread_data->step >= 4) && (conv_params->input_num >= 4)) {
+ ff_dnn_execute_layer_conv2d_sse4(execute_data);
+ }
+ else {
+ ff_dnn_execute_layer_conv2d_c(execute_data);
+ }
+
+ output = output_operand->data;
+ output += (conv_params->output_num) * (width - 2 * pad_size) * (thread_start - pad_size);
for (int y = thread_start; y < thread_end; ++y) {
for (int x = pad_size; x < width - pad_size; ++x) {
for (int n_filter = 0; n_filter < conv_params->output_num; ++n_filter) {
if (conv_params->has_bias)
- output[n_filter] = conv_params->biases[n_filter];
- else
- output[n_filter] = 0.f;
+ output[n_filter] += conv_params->biases[n_filter];
- for (int ch = 0; ch < conv_params->input_num; ++ch) {
- for (int kernel_y = 0; kernel_y < conv_params->kernel_size; ++kernel_y) {
- for (int kernel_x = 0; kernel_x < conv_params->kernel_size; ++kernel_x) {
- float input_pel;
- if (conv_params->padding_method == SAME_CLAMP_TO_EDGE) {
- int y_pos = CLAMP_TO_EDGE(y + (kernel_y - radius) * conv_params->dilation, height);
- int x_pos = CLAMP_TO_EDGE(x + (kernel_x - radius) * conv_params->dilation, width);
- input_pel = input[y_pos * src_linesize + x_pos * conv_params->input_num + ch];
- } else {
- int y_pos = y + (kernel_y - radius) * conv_params->dilation;
- int x_pos = x + (kernel_x - radius) * conv_params->dilation;
- input_pel = (x_pos < 0 || x_pos >= width || y_pos < 0 || y_pos >= height) ? 0.0 :
- input[y_pos * src_linesize + x_pos * conv_params->input_num + ch];
- }
-
-
- output[n_filter] += input_pel * conv_params->kernel[n_filter * filter_size + kernel_y * filter_linesize +
- kernel_x * conv_params->input_num + ch];
- }
- }
- }
switch (conv_params->activation){
case RELU:
output[n_filter] = FFMAX(output[n_filter], 0.0);
@@ -208,6 +276,7 @@ static void * dnn_execute_layer_conv2d_thread(void *threadarg)
output += conv_params->output_num;
}
}
+
return (void *)0;
}
@@ -231,6 +300,12 @@ int dnn_execute_layer_conv2d(DnnOperand *operands, const int32_t *input_operand_
thread_data->ctx = ctx;
thread_data->thread_num = thread_num;
thread_data->thread_index = 0;
+ thread_data->step = 1;
+ #if ARCH_X86_64
+ int cpu_flags = av_get_cpu_flags();
+ if (EXTERNAL_SSE4(cpu_flags))
+ thread_data->step = 4;
+ #endif
//create threads
for (int i = 0; i < thread_num; i++){
diff --git a/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm
new file mode 100644
index 0000000000..dc781d42e5
--- /dev/null
+++ b/libavfilter/dnn/dnn_backend_native_layer_conv2d_x86.asm
@@ -0,0 +1,214 @@
+;*****************************************************************************
+;* x86-optimized functions for dnn native backend convolution
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+%macro COUNT_INPUT 0
+ mov tmp1d, padding_method
+ cmp tmp1d, SAME_CLAMP_TO_EDGE
+ je .clamp
+
+ cmp y_posd, 0
+ jl .out_of_th
+ mov tmp2d, height
+ cmp y_posd, tmp2d
+ jge .out_of_th
+
+ cmp x_posd, 0
+ jl .out_of_th
+ mov tmp2d, width
+ cmp x_posd, tmp2d
+ jge .out_of_th
+
+ mov tmp1d, y_posd
+ imul tmp1d, src_linesize
+ mov tmp2d, x_posd
+ imul tmp2d, input_num
+ add tmp1d, tmp2d
+ jmp .count_end
+
+ .out_of_th:
+ mov tmp1d, -1
+ jmp .count_end
+
+ .clamp:
+ cmp y_posd, 0
+ jl .y_clamp_zero
+ mov tmp1d, height
+ cmp y_posd, tmp1d
+ jge .y_clamp_height
+ mov tmp1d, y_posd
+ jmp .y_normal
+
+ .y_clamp_zero:
+ xor tmp1d, tmp1d
+ jmp .y_normal
+
+ .y_clamp_height:
+ sub tmp1d, 1
+
+ .y_normal:
+
+ cmp x_posd, 0
+ jl .x_clamp_zero
+ mov tmp2d, width
+ cmp x_posd, tmp2d
+ jge .x_clamp_width
+ mov tmp2d, x_posd
+ jmp .x_normal
+
+ .x_clamp_zero:
+ xor tmp2d, tmp2d
+ jmp .x_normal
+
+ .x_clamp_width:
+ sub tmp2d, 1
+
+ .x_normal:
+
+ imul tmp1d, src_linesize
+ imul tmp2d, input_num
+ add tmp1d, tmp2d
+
+ .count_end:
+%endmacro
+
+; void ff_dnn_execute_layer_conv2d_sse4(execute_data *execute_data);
+
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal dnn_execute_layer_conv2d, 8, 15, 3, execute_data,\
+ x, y, n_filter, cha, kernel_x, kernel_y, x_pos, y_pos, kernel_pos,\
+ input, output, kernel, tmp1, tmp2
+
+%define thread_start [execute_dataq]
+%define thread_end [execute_dataq + 1 * 4]
+%define input_num [execute_dataq + 2 * 4]
+%define output_num [execute_dataq + 3 * 4]
+%define kernel_size [execute_dataq + 4 * 4]
+%define padding_method [execute_dataq + 5 * 4]
+%define dilation [execute_dataq + 6 * 4]
+%define pad_size [execute_dataq + 7 * 4]
+%define width [execute_dataq + 8 * 4]
+%define height [execute_dataq + 9 * 4]
+%define radius [execute_dataq + 10 * 4]
+%define src_linesize [execute_dataq + 11 * 4]
+%define filter_size [execute_dataq + 12 * 4]
+%define filter_linesize [execute_dataq + 13 * 4]
+%define SAME_CLAMP_TO_EDGE 2
+
+ mov inputq, [execute_dataq + 14 * 4]
+ mov outputq, [execute_dataq + 14 * 4 + 8]
+ mov kernelq, [execute_dataq + 14 * 4 + 2 * 8]
+
+ mov yd, thread_start
+.loop_y:
+ mov xd, pad_size
+ .loop_x:
+ xor n_filterd, n_filterd
+ xor kernel_posq, kernel_posq
+ .loop_filter:
+ xorps m2, m2
+ xor kernel_yd, kernel_yd
+
+ mov tmp1d, kernel_yd
+ sub tmp1d, radius
+ mov y_posd, dilation
+ imul y_posd, tmp1d
+ add y_posd, yd
+
+ .loop_kery:
+ xor kernel_xd, kernel_xd
+
+ mov tmp1d, kernel_xd
+ sub tmp1d, radius
+ mov x_posd, dilation
+ imul x_posd, tmp1d
+ add x_posd, xd
+
+ .loop_kerx:
+ COUNT_INPUT
+ xor chad, chad
+ .loop_ch:
+ cmp tmp1d, -1
+ je .out
+
+ movsxdifnidn tmp1q, tmp1d
+ movups m0, [inputq + tmp1q * 4]
+ add tmp1d, 4
+ jmp .load_end
+
+ .out:
+ xorps m0, m0
+
+ .load_end:
+
+ movups m1, [kernelq + kernel_posq * 4]
+ add kernel_posq, 4
+
+ mulps m0, m1
+ addps m2, m0
+
+ add chad, 4
+ mov tmp2d, input_num
+ cmp chad, tmp2d
+ jl .loop_ch
+
+ add x_posd, dilation
+ add kernel_xd, 1
+ mov tmp1d, kernel_size
+ cmp kernel_xd, tmp1d
+ jl .loop_kerx
+
+ add y_posd, dilation
+ add kernel_yd, 1
+ mov tmp1d, kernel_size
+ cmp kernel_yd, tmp1d
+ jl .loop_kery
+
+ haddps m2, m2
+ haddps m2, m2
+ movsxdifnidn n_filterq, n_filterd
+ movss [outputq + n_filterq * 4], m2
+
+ add n_filterd, 1
+ mov tmp1d, output_num
+ cmp n_filterd, tmp1d
+ jl .loop_filter
+
+ mov tmp1d, output_num
+ movsxdifnidn tmp1q, tmp1d
+ shl tmp1d, 2
+ add outputq, tmp1q
+ add xd, 1
+ mov tmp2d, width
+ sub tmp2d, pad_size
+ cmp xd, tmp2d
+ jl .loop_x
+
+ add yd, 1
+ mov tmp1d, thread_end
+ cmp yd, tmp1d
+ jl .loop_y
+
+ RET
+%endif
--
2.27.0