[FFmpeg-devel] [PATCH v2 3/3] avfilter/vf_convolution: Add X86 SIMD optimizations for filter_column()

chen chenm003 at 163.com
Mon Dec 23 06:59:17 EET 2019


Comments inlined below.
At 2019-12-22 16:37:03, xujunzz at sjtu.edu.cn wrote:
>From: Xu Jun <xujunzz at sjtu.edu.cn>
>
>Performance improves about 10% compared to v1.
>
>Tested using this command:
>./ffmpeg_g -s 1280x720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 -f null /dev/null -benchmark
>
>after patch:
>frame= 4317 fps=600 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=  24x
>video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
>bench: utime=21.540s stime=2.091s rtime=7.197s
>
>before patch:
>frame= 4317 fps=263 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=10.5x
>video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown
>bench: utime=74.377s stime=1.880s rtime=16.420s
>
>Signed-off-by: Xu Jun <xujunzz at sjtu.edu.cn>
>---
> libavfilter/x86/vf_convolution.asm    | 202 ++++++++++++++++++++++++++
> libavfilter/x86/vf_convolution_init.c |   9 ++
> 2 files changed, 211 insertions(+)
>
>diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm
>index 2a09374b00..4c700656d6 100755
>--- a/libavfilter/x86/vf_convolution.asm
>+++ b/libavfilter/x86/vf_convolution.asm
>@@ -22,6 +22,8 @@
> 
> SECTION_RODATA
> half:   dd 0.5

>+shuf_init:   ddq 0x80808003808080028080800180808000
TBD
PS: constants are more readable when declared at the width they are used, i.e. as bytes (db) or words (dw); since you use this one with pshufb, declare it as bytes.
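For example, the same value spelled as bytes (little-endian, so each dword lane reads as index, 0x80, 0x80, 0x80):

    shuf_init: db 0, 0x80, 0x80, 0x80, 1, 0x80, 0x80, 0x80, 2, 0x80, 0x80, 0x80, 3, 0x80, 0x80, 0x80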


>+shuf_step: ddq 0x00000004000000040000000400000004
> 
> SECTION .text
> 
>@@ -285,3 +287,203 @@ sub widthq, rq
> .end:
>     RET
> %endif
>+
>+; void filter_column(uint8_t *dst, int height,
>+;                         float rdiv, float bias, const int *const matrix,
>+;                         const uint8_t *c[], int length, int radius,
>+;                         int dstride, int stride);
>+
>+%macro COMPUTE_4COL 1

>+    pshufb m7, m6, m4    ; get 4 uint8s from the 16 uint8s
Unnecessary; see the comment below.


>+    pmulld m7, m5

>+    paddd m1%1, m7
Not an error, but this running sum creates a serial dependency chain, which may stall the pipeline; I suggest accumulating into several registers in parallel and combining them at the end.
In this case I am not sure about the dynamic range of the matrix, so I cannot say whether summing (2 * radius + 1) elements this way is safe or can overflow.
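What I mean, roughly (untested sketch; register numbers are arbitrary, and the leftover odd iteration of the 2*radius+1 loop is not shown):

    pxor  m10, m10        ; accumulator A
    pxor  m11, m11        ; accumulator B
    .loopi:
        ; ... load/multiply even element into m7, odd element into m8 ...
        paddd m10, m7     ; chain A
        paddd m11, m8     ; chain B, independent of chain A
        sub   iq, 2
        jns   .loopi
    paddd m10, m11        ; combine the two chains once, after the loop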


>+%endmacro
>+
>+%macro CVT_PACK_COL 1
>+    cvtdq2ps m1%1, m1%1
>+    mulps m1%1, m0     ; sum *= rdiv
>+    addps m1%1, m1     ; sum += bias
>+    addps m1%1, m3     ; sum += 0.5
>+    cvttps2dq m1%1, m1%1
>+    packssdw m1%1, m1%1
>+    packuswb m1%1, m1%1
>+%endmacro
>+
>+%if ARCH_X86_64
>+INIT_XMM sse4
>+%if UNIX64
>+cglobal filter_column, 8, 14, 14, dst, height, matrix, ptr, width, rad, dstride, stride, \
>+i, ci, ystride, sum, r, off16
>+%else
>+cglobal filter_column, 8, 14, 14, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, \
>+i, ci, ystride, sum, r, off16
>+%endif
>+
>+%if WIN64
>+    SWAP m0, m2
>+    SWAP m1, m3
>+    mov r2q, matrixmp
>+    mov r3q, ptrmp
>+    mov r4q, widthmp
>+    mov r5q, radmp
>+    mov r6q, dstridemp
>+    mov r7q, stridemp
>+    DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, \
>+    i, ci, ystride, sum, r, off16
>+%endif
>+
>+movsxdifnidn widthq, widthd
>+movsxdifnidn radq, radd
>+lea radq, [radq * 2 + 1]
>+movsxdifnidn dstrideq, dstrided
>+movsxdifnidn strideq, strided
>+movsxdifnidn heightq, heightd
>+
>+VBROADCASTSS m0, m0    ; rdiv
>+VBROADCASTSS m1, m1    ; bias
>+pxor m2, m2    ; zero
>+movss m3, [half]
>+VBROADCASTSS m3, m3    ; 0.5

>+movdqu m8, [shuf_init]      ; shuffle initialization
TBD


>+movdqu m9, [shuf_step]    ; shuffle step
>+
>+xor ystrideq, ystrideq    ; y*stride
>+
>+cmp widthq, mmsize    ;if width<16 run loopr, width=16 run 16 parallel
>+jl .less16
>+
>+.equal16:
>+    pxor m10, m10
>+    pxor m11, m11
>+    pxor m12, m12
>+    pxor m13, m13

>+    ; m10-13 hold sums
Not an error, but using m0-m7 saves a one-byte REX instruction prefix; inside the inner loop that gives a small performance improvement.
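For example, if I have the legacy-SSE encodings right, the two forms differ by one byte:

    paddd m7, m5     ; 66 0F FE FD     (4 bytes, no REX)
    paddd m12, m5    ; 66 44 0F FE E5  (5 bytes, REX.R needed for xmm12)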


>+
>+    lea iq, [radq - 1]
>+    .loopi:
>+        movd m5, [matrixq + 4*iq]    ; matrix[i]

>+        VBROADCASTSS m5, m5
Since you target SSE4, PSHUFD may be better here; that said, it is not a problem if you intend to upgrade to AVX and above later.
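i.e. something like:

    movd   m5, [matrixq + 4*iq]    ; matrix[i]
    pshufd m5, m5, 0               ; broadcast dword 0 to all 4 lanes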


>+        mov ciq, [ptrq + iq * gprsize]

>+        movdqu m6, [ciq + ystrideq]    ; c[i][y*stride] 16 uint8s
SSE4 provides PMOVZXBD, which would let you drop the PSHUFB above along with its series of constant loads.
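Untested sketch of the idea:

    pmovzxbd m7, [ciq + ystrideq]        ; 4 uint8s zero-extended to 4 dwords
    pmulld   m7, m5
    paddd    m10, m7                     ; cols 0-3
    pmovzxbd m7, [ciq + ystrideq + 4]    ; cols 4-7, and so on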


>+
>+        ;m4 controls shuffle
>+        movdqa m4, m8
>+        COMPUTE_4COL 0    ; process 0-3 cols, sum in m10
>+        paddd m4, m9
>+        COMPUTE_4COL 1    ; process 4-7 cols, sum in m11
>+        paddd m4, m9
>+        COMPUTE_4COL 2    ; process 8-11 cols, sum in m12
>+        paddd m4, m9
>+        COMPUTE_4COL 3    ; process 12-15 cols, sum in m13
>+
>+        sub iq, 1
>+        jns .loopi
>+
>+    CVT_PACK_COL 0    ; process 0-3 cols, result in m10's low 32bit
>+    CVT_PACK_COL 1    ; process 4-7 cols, result in m11's low 32bit
>+    CVT_PACK_COL 2    ; process 8-11 cols, result in m12's low 32bit
>+    CVT_PACK_COL 3    ; process 12-15 cols, result in m13's low 32bit
>+    punpckldq m10, m11
>+    punpckldq m12, m13
>+    punpcklqdq m10, m12    ; pack 16 results in m10
>+    movdqu [dstq], m10
>+
>+    add dstq, dstrideq
>+    add ystrideq, strideq
>+    sub heightq, 1
>+    jnz .equal16
>+    jmp .end
>+
>+.less16:
>+    xor off16q, off16q
>+    cmp widthq, mmsize/4
>+    jl .loopr
>+
>+    mov   rq, widthq
>+    and   rq, mmsize/4-1
>+    sub   widthq, rq
>+
>+    pxor m10, m10
>+    pxor m11, m11
>+    pxor m12, m12
>+
>+    lea iq, [radq - 1]
>+    .loopi_4:
>+        movd m5, [matrixq + 4*iq]    ; matrix[i]
>+        VBROADCASTSS m5, m5
>+        mov ciq, [ptrq + iq * gprsize]
>+        movdqu m6, [ciq + ystrideq]    ; c[i][y*stride] 16 uint8s
>+
>+        ;m4 controls shuffle
>+        movdqa m4, m8
>+        COMPUTE_4COL 0    ; process 0-3 cols, sum in m10
>+        cmp widthq, mmsize/4 ; width = 4
>+        je .i4_end
>+
>+        paddd m4, m9
>+        COMPUTE_4COL 1    ; process 4-7 cols, sum in m11
>+        cmp widthq, mmsize/2 ; width = 8
>+        je .i4_end
>+
>+        paddd m4, m9
>+        COMPUTE_4COL 2    ; process 8-11 cols, sum in m12
>+
>+        .i4_end:
>+        sub iq, 1
>+        jns .loopi_4
>+
>+    CVT_PACK_COL 0    ; process 0-3 cols, result in m10's low 32bit
>+    movd [dstq], m10
>+    cmp widthq, mmsize/4 ; width = 4
>+    je .cvt_end
>+
>+    CVT_PACK_COL 1    ; process 4-7 cols, result in m11's low 32bit
>+    movd [dstq + mmsize/4], m11
>+    cmp widthq, mmsize/2 ; width = 8
>+    je .cvt_end
>+
>+    CVT_PACK_COL 2    ; process 8-11 cols, result in m12's low 32bit
>+    movd [dstq + mmsize/2], m12
>+
>+    .cvt_end:
>+    cmp rq, 0
>+    je .loopr_end
>+    mov off16q, widthq
>+    add widthq, rq
>+
>+    .loopr:
>+        xor sumq, sumq
>+        lea iq, [radq - 1]
>+        .loopr_i:
>+            mov ciq, [ptrq + iq * gprsize]
>+            add ciq, ystrideq
>+            movzx rd, byte [ciq + off16q]
>+            imul rd, [matrixq + 4*iq]
>+            add sumd, rd
>+
>+            sub iq, 1
>+            jns .loopr_i
>+
>+        pxor m7, m7
>+        cvtsi2ss m7, sumd
>+        mulss m7, m0     ; sum *= rdiv
>+        addss m7, m1     ; sum += bias
>+        addss m7, m3     ; sum += 0.5
>+        cvttps2dq m7, m7
>+        packssdw m7, m7
>+        packuswb m7, m7
>+        movd sumd, m7

>+        mov [dstq + off16q], sumb
SSE4 provides PEXTRB.
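i.e.

    pextrb [dstq + off16q], m7, 0    ; store the low byte directly, no GPR round trip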


>+        add off16q, 1
>+        cmp off16q, widthq
>+        jl .loopr
>+
>+    .loopr_end:
>+    add dstq, dstrideq
>+    add ystrideq, strideq
>+    sub heightq, 1

>+    jnz .less16
JNZ is not a problem, but I would prefer JG here; it avoids the risk of looping if the value ever goes negative.
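i.e.

    sub heightq, 1
    jg  .less16    ; continue only while height > 0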


