[FFmpeg-devel] [PATCH v2 3/3] avfilter/vf_convolution: Add X86 SIMD optimizations for filter_column()
chen
chenm003 at 163.com
Mon Dec 23 06:59:17 EET 2019
comments inlined
At 2019-12-22 16:37:03, xujunzz at sjtu.edu.cn wrote:
>From: Xu Jun <xujunzz at sjtu.edu.cn>
>
>Performance improves about 10% compared to v1.
>
>Tested using this command:
>./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 -f null /dev/null -benchmark
>
>after patch:
>frame= 4317 fps=600 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed= 24x
>video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
>bench: utime=21.540s stime=2.091s rtime=7.197s
>
>before patch:
>frame= 4317 fps=263 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=10.5x
>video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
>bench: utime=74.377s stime=1.880s rtime=16.420s
>
>Signed-off-by: Xu Jun <xujunzz at sjtu.edu.cn>
>---
> libavfilter/x86/vf_convolution.asm | 202 ++++++++++++++++++++++++++
> libavfilter/x86/vf_convolution_init.c | 9 ++
> 2 files changed, 211 insertions(+)
>
>diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
>index 2a09374b00..4c700656d6 100755
>--- a/libavfilter/x86/vf_convolution.asm
>+++ b/libavfilter/x86/vf_convolution.asm
>@@ -22,6 +22,8 @@
>
> SECTION_RODATA
> half: dd 0.5
>+shuf_init: ddq 0x80808003808080028080800180808000
TBD
ps: defining the constant as bytes (db) or words (dw) is more readable than a single 128-bit literal; since you use it with pshufb, bytes are the natural choice.
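Untested, but written as bytes the layout reads directly: each dword lane picks one source byte, and the 0x80 entries zero the remaining lanes under pshufb:

    shuf_init: db 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80
               db 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80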
>+shuf_step: ddq 0x00000004000000040000000400000004
>
> SECTION .text
>
>@@ -285,3 +287,203 @@ sub widthq, rq
> .end:
> RET
> %endif
>+
>+; void filter_column(uint8_t *dst, int height,
>+; float rdiv, float bias, const int *const matrix,
>+; const uint8_t *c[], int length, int radius,
>+; int dstride, int stride);
>+
>+%macro COMPUTE_4COL 1
>+ pshufb m7, m6, m4 ; get 4 uint8s from the 16 uint8s
Unnecessary; see the PMOVZXBD comment below.
>+ pmulld m7, m5
>+ paddd m1%1, m7
Not an error, but in general this accumulation builds a serial dependency chain across loop iterations, which may stall the pipeline; I suggest summing into several registers in parallel and merging them after the loop.
In this case I am not sure about the dynamic range of the matrix, so I cannot say whether accumulating (2 * radius + 1) products per lane stays within int32 or can overflow.
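Untested sketch of what I mean, with two independent accumulators (mA/mB stand for whatever registers are free) so the two paddd chains run in parallel and only merge once; the odd tap count needs one peeled iteration:

    pxor   mA, mA
    pxor   mB, mB
.loopi:                       ; two taps per iteration
    ...                       ; product for tap i     -> m7
    paddd  mA, m7
    ...                       ; product for tap i - 1 -> m7 (renamed, no stall)
    paddd  mB, m7
    sub    iq, 2
    jns    .loopi
    paddd  mA, mB             ; merge the chains once after the loop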
>+%endmacro
>+
>+%macro CVT_PACK_COL 1
>+ cvtdq2ps m1%1, m1%1
>+ mulps m1%1, m0 ; sum *= rdiv
>+ addps m1%1, m1 ; sum += bias
>+ addps m1%1, m3 ; sum += 0.5
>+ cvttps2dq m1%1, m1%1
>+ packssdw m1%1, m1%1
>+ packuswb m1%1, m1%1
>+%endmacro
>+
>+%if ARCH_X86_64
>+INIT_XMM sse4
>+%if UNIX64
>+cglobal filter_column, 8, 14, 14, dst, height, matrix, ptr, width, rad, dstride, stride, \
>+i, ci, ystride, sum, r, off16
>+%else
>+cglobal filter_column, 8, 14, 14, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, \
>+i, ci, ystride, sum, r, off16
>+%endif
>+
>+%if WIN64
>+ SWAP m0, m2
>+ SWAP m1, m3
>+ mov r2q, matrixmp
>+ mov r3q, ptrmp
>+ mov r4q, widthmp
>+ mov r5q, radmp
>+ mov r6q, dstridemp
>+ mov r7q, stridemp
>+ DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, \
>+ i, ci, ystride, sum, r, off16
>+%endif
>+
>+movsxdifnidn widthq, widthd
>+movsxdifnidn radq, radd
>+lea radq, [radq * 2 + 1]
>+movsxdifnidn dstrideq, dstrided
>+movsxdifnidn strideq, strided
>+movsxdifnidn heightq, heightd
>+
>+VBROADCASTSS m0, m0 ; rdiv
>+VBROADCASTSS m1, m1 ; bias
>+pxor m2, m2 ; zero
>+movss m3, [half]
>+VBROADCASTSS m3, m3 ; 0.5
>+movdqu m8, [shuf_init] ; shuffle initialization
TBD
>+movdqu m9, [shuf_step] ; shuffle step
>+
>+xor ystrideq, ystrideq ; y*stride
>+
>+cmp widthq, mmsize ;if width<16 run loopr, width=16 run 16 parallel
>+jl .less16
>+
>+.equal16:
>+ pxor m10, m10
>+ pxor m11, m11
>+ pxor m12, m12
>+ pxor m13, m13
>+ ; m10-13 hold sums
Not an error, but using m0-m7 saves a one-byte REX instruction prefix; in the inner loop that adds up to a small performance improvement.
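For the record, the encodings (if I read them right): an xmm8-xmm15 destination forces the REX prefix:

    paddd m7, m5    ; 66 0F FE FD       (4 bytes)
    paddd m10, m5   ; 66 44 0F FE D5    (5 bytes, REX.R)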
>+
>+ lea iq, [radq - 1]
>+ .loopi:
>+ movd m5, [matrixq + 4*iq] ; matrix[i]
>+ VBROADCASTSS m5, m5
Since you claim SSE4, PSHUFD may be better here; it is not a problem, though, if you intend to upgrade to AVX and above.
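i.e. something like:

    movd   m5, [matrixq + 4*iq]
    pshufd m5, m5, 0          ; splat matrix[i] into all 4 dwords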
>+ mov ciq, [ptrq + iq * gprsize]
>+ movdqu m6, [ciq + ystrideq] ; c[i][y*stride] 16 uint8s
SSE4 provides PMOVZXBD, which lets you drop the PSHUFB above together with the shuffle-constant loads.
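Untested, but roughly (register numbers are illustrative): one instruction loads and zero-extends 4 bytes to dwords, so shuf_init/shuf_step and the per-group pshufb all go away:

    pmovzxbd m7, [ciq + ystrideq]        ; bytes 0-3 -> 4 dwords
    pmovzxbd m8, [ciq + ystrideq + 4]    ; bytes 4-7, and so on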
>+
>+ ;m4 controls shuffle
>+ movdqa m4, m8
>+ COMPUTE_4COL 0 ; process 0-3 cols, sum in m10
>+ paddd m4, m9
>+ COMPUTE_4COL 1 ; process 4-7 cols, sum in m11
>+ paddd m4, m9
>+ COMPUTE_4COL 2 ; process 8-11 cols, sum in m12
>+ paddd m4, m9
>+ COMPUTE_4COL 3 ; process 12-15 cols, sum in m13
>+
>+ sub iq, 1
>+ jns .loopi
>+
>+ CVT_PACK_COL 0 ; process 0-3 cols, result in m10's low 32bit
>+ CVT_PACK_COL 1 ; process 4-7 cols, result in m11's low 32bit
>+ CVT_PACK_COL 2 ; process 8-11 cols, result in m12's low 32bit
>+ CVT_PACK_COL 3 ; process 12-15 cols, result in m13's low 32bit
>+ punpckldq m10, m11
>+ punpckldq m12, m13
>+ punpcklqdq m10, m12 ; pack 16 results in m10
>+ movdqu [dstq], m10
>+
>+ add dstq, dstrideq
>+ add ystrideq, strideq
>+ sub heightq, 1
>+ jnz .equal16
>+ jmp .end
>+
>+.less16:
>+ xor off16q, off16q
>+ cmp widthq, mmsize/4
>+ jl .loopr
>+
>+ mov rq, widthq
>+ and rq, mmsize/4-1
>+ sub widthq, rq
>+
>+ pxor m10, m10
>+ pxor m11, m11
>+ pxor m12, m12
>+
>+ lea iq, [radq - 1]
>+ .loopi_4:
>+ movd m5, [matrixq + 4*iq] ; matrix[i]
>+ VBROADCASTSS m5, m5
>+ mov ciq, [ptrq + iq * gprsize]
>+ movdqu m6, [ciq + ystrideq] ; c[i][y*stride] 16 uint8s
>+
>+ ;m4 controls shuffle
>+ movdqa m4, m8
>+ COMPUTE_4COL 0 ; process 0-3 cols, sum in m10
>+ cmp widthq, mmsize/4 ; width = 4
>+ je .i4_end
>+
>+ paddd m4, m9
>+ COMPUTE_4COL 1 ; process 4-7 cols, sum in m11
>+ cmp widthq, mmsize/2 ; width = 8
>+ je .i4_end
>+
>+ paddd m4, m9
>+ COMPUTE_4COL 2 ; process 8-11 cols, sum in m12
>+
>+ .i4_end:
>+ sub iq, 1
>+ jns .loopi_4
>+
>+ CVT_PACK_COL 0 ; process 0-3 cols, result in m10's low 32bit
>+ movd [dstq], m10
>+ cmp widthq, mmsize/4 ; width = 4
>+ je .cvt_end
>+
>+ CVT_PACK_COL 1 ; process 4-7 cols, result in m11's low 32bit
>+ movd [dstq + mmsize/4], m11
>+ cmp widthq, mmsize/2 ; width = 8
>+ je .cvt_end
>+
>+ CVT_PACK_COL 2 ; process 8-11 cols, result in m12's low 32bit
>+ movd [dstq + mmsize/2], m12
>+
>+ .cvt_end:
>+ cmp rq, 0
>+ je .loopr_end
>+ mov off16q, widthq
>+ add widthq, rq
>+
>+ .loopr:
>+ xor sumq, sumq
>+ lea iq, [radq - 1]
>+ .loopr_i:
>+ mov ciq, [ptrq + iq * gprsize]
>+ add ciq, ystrideq
>+ movzx rd, byte [ciq + off16q]
>+ imul rd, [matrixq + 4*iq]
>+ add sumd, rd
>+
>+ sub iq, 1
>+ jns .loopr_i
>+
>+ pxor m7, m7
>+ cvtsi2ss m7, sumd
>+ mulss m7, m0 ; sum *= rdiv
>+ addss m7, m1 ; sum += bias
>+ addss m7, m3 ; sum += 0.5
>+ cvttps2dq m7, m7
>+ packssdw m7, m7
>+ packuswb m7, m7
>+ movd sumd, m7
>+ mov [dstq + off16q], sumb
SSE4 provides PEXTRB, which stores the byte directly without the round-trip through a GPR.
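i.e. the two instructions collapse to one:

    pextrb [dstq + off16q], m7, 0   ; store the low byte straight to memory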
>+ add off16q, 1
>+ cmp off16q, widthq
>+ jl .loopr
>+
>+ .loopr_end:
>+ add dstq, dstrideq
>+ add ystrideq, strideq
>+ sub heightq, 1
>+ jnz .less16
JNZ is not a problem, but I would prefer JG; it avoids the risk of spinning forever if the value ever goes negative.
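i.e.:

    sub heightq, 1
    jg  .less16    ; signed compare also terminates if heightq goes negative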