[FFmpeg-devel] [PATCH 1/2] swscale/x86/output: Fix yuv2planeX_16* with unaligned destination
Michael Niedermayer
michael at niedermayer.cc
Wed Feb 17 03:50:57 CET 2016
Signed-off-by: Michael Niedermayer <michael at niedermayer.cc>
---
libswscale/x86/output.asm | 160 ++++++++++++++++++++++++---------------------
1 file changed, 87 insertions(+), 73 deletions(-)
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index 9ea4af9..7db72f8 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -55,75 +55,8 @@ SECTION .text
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
-%macro yuv2planeX_fn 3
-
-%if ARCH_X86_32
-%define cntr_reg fltsizeq
-%define movsx mov
-%else
-%define cntr_reg r7
-%define movsx movsxd
-%endif
-
-cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
-%if %1 == 8 || %1 == 9 || %1 == 10
- pxor m6, m6
-%endif ; %1 == 8/9/10
-
-%if %1 == 8
-%if ARCH_X86_32
-%assign pad 0x2c - (stack_offset & 15)
- SUB rsp, pad
-%define m_dith m7
-%else ; x86-64
-%define m_dith m9
-%endif ; x86-32
-
- ; create registers holding dither
- movq m_dith, [ditherq] ; dither
- test offsetd, offsetd
- jz .no_rot
-%if mmsize == 16
- punpcklqdq m_dith, m_dith
-%endif ; mmsize == 16
- PALIGNR m_dith, m_dith, 3, m0
-.no_rot:
-%if mmsize == 16
- punpcklbw m_dith, m6
-%if ARCH_X86_64
- punpcklwd m8, m_dith, m6
- pslld m8, 12
-%else ; x86-32
- punpcklwd m5, m_dith, m6
- pslld m5, 12
-%endif ; x86-32/64
- punpckhwd m_dith, m6
- pslld m_dith, 12
-%if ARCH_X86_32
- mova [rsp+ 0], m5
- mova [rsp+16], m_dith
-%endif
-%else ; mmsize == 8
- punpcklbw m5, m_dith, m6
- punpckhbw m_dith, m6
- punpcklwd m4, m5, m6
- punpckhwd m5, m6
- punpcklwd m3, m_dith, m6
- punpckhwd m_dith, m6
- pslld m4, 12
- pslld m5, 12
- pslld m3, 12
- pslld m_dith, 12
- mova [rsp+ 0], m4
- mova [rsp+ 8], m5
- mova [rsp+16], m3
- mova [rsp+24], m_dith
-%endif ; mmsize == 8/16
-%endif ; %1 == 8
-
- xor r5, r5
-
-.pixelloop:
+%macro yuv2planeX_mainloop 2
+.pixelloop_%2:
%assign %%i 0
; the rep here is for the 8bit output mmx case, where dither covers
; 8 pixels but we can only handle 2 pixels per register, and thus 4
@@ -150,7 +83,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
mova m2, m1
%endif ; %1 == 8/9/10/16
movsx cntr_reg, fltsizem
-.filterloop_ %+ %%i:
+.filterloop_%2_ %+ %%i:
; input pixels
mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
@@ -197,7 +130,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%endif ; %1 == 8/9/10/16
sub cntr_reg, 2
- jg .filterloop_ %+ %%i
+ jg .filterloop_%2_ %+ %%i
%if %1 == 16
psrad m2, 31 - %1
@@ -224,7 +157,7 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%endif ; mmxext/sse2/sse4/avx
pminsw m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
- mova [dstq+r5*2], m2
+ mov%2 [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16
add r5, mmsize/2
@@ -232,7 +165,88 @@ cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%assign %%i %%i+2
%endrep
- jg .pixelloop
+ jg .pixelloop_%2
+%endmacro
+
+%macro yuv2planeX_fn 3
+
+%if ARCH_X86_32
+%define cntr_reg fltsizeq
+%define movsx mov
+%else
+%define cntr_reg r7
+%define movsx movsxd
+%endif
+
+cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
+%if %1 == 8 || %1 == 9 || %1 == 10
+ pxor m6, m6
+%endif ; %1 == 8/9/10
+; xor r0,r0
+; mov r0,[r0]
+%if %1 == 8
+%if ARCH_X86_32
+%assign pad 0x2c - (stack_offset & 15)
+ SUB rsp, pad
+%define m_dith m7
+%else ; x86-64
+%define m_dith m9
+%endif ; x86-32
+
+ ; create registers holding dither
+ movq m_dith, [ditherq] ; dither
+ test offsetd, offsetd
+ jz .no_rot
+%if mmsize == 16
+ punpcklqdq m_dith, m_dith
+%endif ; mmsize == 16
+ PALIGNR m_dith, m_dith, 3, m0
+.no_rot:
+%if mmsize == 16
+ punpcklbw m_dith, m6
+%if ARCH_X86_64
+ punpcklwd m8, m_dith, m6
+ pslld m8, 12
+%else ; x86-32
+ punpcklwd m5, m_dith, m6
+ pslld m5, 12
+%endif ; x86-32/64
+ punpckhwd m_dith, m6
+ pslld m_dith, 12
+%if ARCH_X86_32
+ mova [rsp+ 0], m5
+ mova [rsp+16], m_dith
+%endif
+%else ; mmsize == 8
+ punpcklbw m5, m_dith, m6
+ punpckhbw m_dith, m6
+ punpcklwd m4, m5, m6
+ punpckhwd m5, m6
+ punpcklwd m3, m_dith, m6
+ punpckhwd m_dith, m6
+ pslld m4, 12
+ pslld m5, 12
+ pslld m3, 12
+ pslld m_dith, 12
+ mova [rsp+ 0], m4
+ mova [rsp+ 8], m5
+ mova [rsp+16], m3
+ mova [rsp+24], m_dith
+%endif ; mmsize == 8/16
+%endif ; %1 == 8
+
+ xor r5, r5
+
+%if mmsize == 8 || %1 == 8
+ yuv2planeX_mainloop %1, a
+%else ; mmsize == 16
+ test dstq, 15
+ jnz .unaligned
+ yuv2planeX_mainloop %1, a
+ REP_RET
+.unaligned:
+ yuv2planeX_mainloop %1, u
+%endif ; mmsize == 8/16
%if %1 == 8
%if ARCH_X86_32
--
1.7.9.5
More information about the ffmpeg-devel
mailing list