[FFmpeg-devel] [PATCH] avfilter/vf_convolution: add x86 SIMD for filter_column()
xujunzz at sjtu.edu.cn
xujunzz at sjtu.edu.cn
Wed Nov 27 17:13:54 EET 2019
From: Xu Jun <xujunzz at sjtu.edu.cn>
Tested using a simple command:
./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 1000 -f null /dev/null
The frame rate increases from 284 fps to 693 fps on my local machine.
Signed-off-by: Xu Jun <xujunzz at sjtu.edu.cn>
---
libavfilter/x86/vf_convolution.asm | 129 ++++++++++++++++++++++++++
libavfilter/x86/vf_convolution_init.c | 7 ++
2 files changed, 136 insertions(+)
diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
index b71e9720fb..49dfbab9c0 100755
--- a/libavfilter/x86/vf_convolution.asm
+++ b/libavfilter/x86/vf_convolution.asm
@@ -258,3 +258,132 @@ sub widthq, rq
.end:
RET
%endif
+
+; void ff_filter_column16_sse4(uint8_t *dst, int height, float rdiv, float bias,
+;         const int *const matrix, const uint8_t *c[], int length, int radius,
+;         int dstride, int stride);
+; dst[y*dstride + x] = clip_u8((int)(rdiv * sum_i(matrix[i] * c[i][y*stride + x]) + bias + 0.5)), i = 0..2*radius, x < length, y < height
+
+%if ARCH_X86_64
+INIT_XMM sse4
+%if UNIX64
+cglobal filter_column16, 8, 15, 7, dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r ; rdiv/bias are floats and are not in the name list; they are read from m0/m1 below
+%else
+cglobal filter_column16, 8, 15, 7, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r ; names include rdiv/bias so the later arguments line up with their Win64 slots
+%endif
+
+%if WIN64
+ SWAP m0, m2 ; Win64 passes the 3rd argument (rdiv) in xmm2; make it m0 as on UNIX64
+ SWAP m1, m3 ; Win64 passes the 4th argument (bias) in xmm3; make it m1 as on UNIX64
+ mov r2q, matrixmp ; reload each integer argument into the register index it has in the UNIX64 name list
+ mov r3q, ptrmp
+ mov r4q, widthmp
+ mov r5q, radmp
+ mov r6q, dstridemp
+ mov r7q, stridemp
+ DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, i, ci, dst_off, off16, c_off, sum, r ; rebind the names to match the UNIX64 layout from here on
+%endif
+
+movsxdifnidn widthq, widthd ; the int arguments are used as 64-bit offsets/counters below, so widen them
+movsxdifnidn radq, radd
+movsxdifnidn dstrideq, dstrided
+movsxdifnidn strideq, strided
+sal radq, 1
+add radq, 1 ; rad = 2*radius+1 = number of taps (bound of the i loops)
+movsxdifnidn heightq, heightd
+VBROADCASTSS m0, m0 ; m0 = rdiv in every lane
+VBROADCASTSS m1, m1 ; m1 = bias in every lane
+pxor m6, m6 ; m6 = 0, used as the zero half when unpacking bytes to dwords
+movss m5, [half] ; 'half' is defined outside this hunk; it is used as the 0.5 rounding term
+VBROADCASTSS m5, m5
+
+xor dst_offq, dst_offq ; dst_off = byte offset of the current output pixel in dst
+xor c_offq, c_offq ; c_off = byte offset of the current input pixel in every c[i]
+
+.loopy: ; one iteration per row; entered without testing height first, so the body runs at least once
+ xor off16q, off16q ; off16 = x position inside the current row
+ cmp widthq, mmsize/4
+ jl .loopr ; row narrower than one vector step: scalar path only
+
+ mov rq, widthq
+ and rq, mmsize/4-1 ; r = width mod (mmsize/4): pixels left over for the scalar tail
+ sub widthq, rq ; width temporarily holds the vector-loop bound (restored after .loop16)
+
+ .loop16: ; vector path: mmsize/4 pixels per iteration (4 bytes are loaded and 4 bytes stored)
+ pxor m4, m4 ; m4 = per-pixel 32-bit accumulators
+ xor iq, iq
+ .loopi:
+ movss m2, [matrixq + 4*iq] ; matrix[i]
+ VBROADCASTSS m2, m2
+ mov ciq, [ptrq + iq * gprsize] ; ci = c[i]
+ movss m3, [ciq + c_offq] ; 4 source bytes at c[i][y*stride + off16]
+ punpcklbw m3, m6 ; zero-extend bytes -> words
+ punpcklwd m3, m6 ; zero-extend words -> dwords
+ pmulld m2, m3 ; matrix[i] * pixel, 32-bit low product per lane
+ paddd m4, m2 ; sum += product
+
+ add iq, 1
+ cmp iq, radq
+ jl .loopi
+
+ cvtdq2ps m4, m4 ; integer sums -> float
+ mulps m4, m0 ; sum *= rdiv
+ addps m4, m1 ; sum += bias
+ addps m4, m5 ; sum += 0.5
+ cvttps2dq m4, m4 ; truncate back to int32
+ packssdw m4, m4 ; saturate int32 -> int16
+ packuswb m4, m4 ; saturate int16 -> uint8
+ movss [dstq + dst_offq], m4 ; store the low 4 bytes = 4 output pixels
+ add c_offq, mmsize/4
+ add dst_offq, mmsize/4
+
+ add off16q, mmsize/4
+ cmp off16q, widthq
+ jl .loop16
+
+ add widthq, rq ; restore the full row width
+ cmp off16q, widthq
+ jge .paraend ; width was a multiple of the vector step: no scalar tail
+
+ .loopr: ; scalar path: one pixel per iteration; the bound is tested after the body
+ xor sumd, sumd
+ xor iq, iq
+ .loopr_i:
+ mov ciq, [ptrq + iq * gprsize] ; ci = c[i]
+ movzx rd, byte [ciq + c_offq] ; r is reused as a scratch register here: r = c[i][y*stride + off16]
+ imul rd, [matrixq + 4*iq] ; r *= matrix[i]
+ add sumd, rd
+
+ add iq, 1
+ cmp iq, radq
+ jl .loopr_i
+
+ pxor m4, m4 ; clear all lanes so only lane 0 carries data
+ cvtsi2ss m4, sumd
+ mulss m4, m0 ; sum *= rdiv
+ addss m4, m1 ; sum += bias
+ addss m4, m5 ; sum += 0.5
+ cvttps2dq m4, m4 ; packed convert; only lane 0 is used afterwards
+ packssdw m4, m4 ; saturate int32 -> int16
+ packuswb m4, m4 ; saturate int16 -> uint8
+ movd sumd, m4
+ mov [dstq + dst_offq], sumb ; store the single output byte
+ add c_offq, 1
+ add dst_offq, 1
+ add off16q, 1
+ cmp off16q, widthq
+ jl .loopr
+
+ .paraend: ; end of row
+ sub c_offq, widthq ; rewind both offsets to the start of the row ...
+ sub dst_offq, widthq
+ add c_offq, strideq ; ... then step to the next row of the source
+ add dst_offq, dstrideq ; ... and of the destination
+
+ sub heightq, 1
+ cmp heightq, 0
+ jg .loopy ; continue while rows remain
+
+.end: ; not referenced by any jump in this function
+ RET
+%endif
\ No newline at end of file
diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c
index 6b1c2f0e9f..d9e93296b9 100644
--- a/libavfilter/x86/vf_convolution_init.c
+++ b/libavfilter/x86/vf_convolution_init.c
@@ -56,6 +56,11 @@ static void filter_column16(uint8_t *dst, int height,
}
+void ff_filter_column16_sse4(uint8_t *dst, int height,
+ float rdiv, float bias, const int *const matrix,
+ const uint8_t *c[], int length, int radius,
+ int dstride, int stride); /* SSE4 asm; 2nd argument is the row count */
+
av_cold void ff_convolution_init_x86(ConvolutionContext *s)
{
#if ARCH_X86_64
@@ -74,6 +79,8 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s)
}
if (s->mode[i] == MATRIX_COLUMN)
s->filter[i] = filter_column16;
+ if (s->mode[i] == MATRIX_COLUMN && EXTERNAL_SSE4(cpu_flags)) /* the brace-less if above does not cover this statement, so test the mode here too */
+ s->filter[i] = ff_filter_column16_sse4;
}
#endif
}
--
2.17.1
More information about the ffmpeg-devel
mailing list