[FFmpeg-devel] [PATCH 3/4] x86: xvid_idct: merged idct_put SSE2 versions
Christophe Gisquet
christophe.gisquet at gmail.com
Wed Mar 11 00:11:53 CET 2015
---
libavcodec/x86/xvididct.asm | 202 ++++++++++++++++++++++++++++-------------
libavcodec/x86/xvididct_init.c | 8 +-
2 files changed, 140 insertions(+), 70 deletions(-)
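
(Note, not part of the patch: the point of the merge is that the SSE2 column
pass can clamp and store the result straight to dest, so the separate C
wrapper around ff_xvid_idct_sse2 is no longer needed for the put case. A
rough C sketch of the before/after dispatch; prototypes are approximated
from the hunks below and from idctdsp.h, exact qualifiers may differ:)

#include <stddef.h>
#include <stdint.h>

/* Approximate declarations, for illustration only. */
void ff_xvid_idct_sse2(int16_t *block);
void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
                           int line_size);
void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, int16_t *block);

/* Before: idct_put ran the IDCT in place, then a second pass over the
 * block clamped the 16-bit coefficients and stored them to dest. */
static void xvid_idct_sse2_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_xvid_idct_sse2(block);
    ff_put_pixels_clamped(block, dest, line_size);
}

/* After: ff_xvid_idct_put_sse2 does the clamp-and-store inside the asm
 * column pass, so idct_put points directly at the asm symbol and the
 * extra pass over the coefficients goes away. */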
diff --git a/libavcodec/x86/xvididct.asm b/libavcodec/x86/xvididct.asm
index 4c52bf1..58ffb11 100644
--- a/libavcodec/x86/xvididct.asm
+++ b/libavcodec/x86/xvididct.asm
@@ -292,13 +292,13 @@ SECTION .text
%define TAN3 xmm13
%define TAN1 xmm14
%else
-%define ROW0 [r0 + 0*16]
+%define ROW0 [BLOCK + 0*16]
%define REG0 xmm4
-%define ROW2 [r0 + 2*16]
+%define ROW2 [BLOCK + 2*16]
%define REG2 xmm4
-%define ROW4 [r0 + 4*16]
+%define ROW4 [BLOCK + 4*16]
%define REG4 xmm6
-%define ROW6 [r0 + 6*16]
+%define ROW6 [BLOCK + 6*16]
%define REG6 xmm6
%define XMMS xmm2
%define SREG2 xmm7
@@ -369,8 +369,71 @@ SECTION .text
movdqa TAN1, [tan1]
%endmacro
+%macro FIRST_HALF 2 ; %1=dct %2=type(normal,put,add)
+ psraw xmm5, 6
+ psraw REG0, 6
+ psraw TAN3, 6
+ psraw xmm3, 6
+ ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+ movdqa [%1+1*16], TAN3
+ movdqa [%1+2*16], xmm3
+ movdqa [%1+5*16], REG0
+ movdqa [%1+6*16], xmm5
+%else
+ ; The args must be loaded now, as the GPRs are no longer needed for the masks
+ ; DEST is the register holding the address of dest
+ %if ARCH_X86_32
+ %xdefine DEST r2q ; BLOCK is r0, stride r1
+ movifnidn DEST, destm
+ movifnidn strideq, stridem
+ %else
+ %xdefine DEST r0q
+ %endif
+ lea r3q, [3*strideq]
+ %if %2 == 1
+ packuswb TAN3, xmm3
+ packuswb xmm5, REG0
+ movq [DEST + strideq], TAN3
+ movhps [DEST + 2*strideq], TAN3
+ ; REG0 and TAN3 are now available (and likely used in second half)
+ %else
+ %warning Unimplemented
+ %endif
+%endif
+%endmacro
+
+%macro SECOND_HALF 6 ; %1=dct %2=type(normal,put,add) %3-%6: xmm regs
+ psraw %3, 6
+ psraw %4, 6
+ psraw %5, 6
+ psraw %6, 6
+ ; dct coeffs must still be written for AC prediction
+%if %2 == 0
+ movdqa [%1+0*16], %3
+ movdqa [%1+3*16], %5
+ movdqa [%1+4*16], %6
+ movdqa [%1+7*16], %4
+%elif %2 == 1
+ packuswb %3, %5
+ packuswb %6, %4
+ ; DEST (address of dest) was set up in FIRST_HALF
+ movq [DEST], %3
+ movhps [DEST + r3q], %3
+ lea DEST, [DEST + 4*strideq]
+ movq [DEST], %6
+ movhps [DEST + r3q], %6
+ ; and now write remainder of first half
+ movq [DEST + 2*strideq], xmm5
+ movhps [DEST + strideq], xmm5
+%elif %2 == 2
+%warning Unimplemented
+%endif
+%endmacro
+
+
; IDCT pass on columns.
-%macro iLLM_PASS 1 ;dct
+%macro iLLM_PASS 2 ; %1=dct %2=type(normal,put,add)
movdqa xmm1, TAN3
movdqa xmm3, TAN1
pmulhw TAN3, xmm4
@@ -407,7 +470,7 @@ SECTION .text
psubsw xmm5, REG6
MOV32 ROW0, REG0
MOV32 ROW4, REG4
- MOV32 TAN1, [r0]
+ MOV32 TAN1, [BLOCK]
movdqa XMMS, REG0
psubsw REG0, REG4
paddsw REG4, XMMS
@@ -423,33 +486,22 @@ SECTION .text
movdqa XMMS, REG0
psubsw REG0, xmm3
paddsw xmm3, XMMS
- MOV32 [r0], TAN1
- psraw xmm5, 6
- psraw REG0, 6
- psraw TAN3, 6
- psraw xmm3, 6
- movdqa [%1+1*16], TAN3
- movdqa [%1+2*16], xmm3
- movdqa [%1+5*16], REG0
- movdqa [%1+6*16], xmm5
+ MOV32 [BLOCK], TAN1
+
+ FIRST_HALF %1, %2
+
movdqa xmm0, xmm7
movdqa xmm4, REG4
psubsw xmm7, xmm1
psubsw REG4, TAN1
paddsw xmm1, xmm0
paddsw TAN1, xmm4
- psraw xmm1, 6
- psraw xmm7, 6
- psraw TAN1, 6
- psraw REG4, 6
- movdqa [%1+0*16], xmm1
- movdqa [%1+3*16], TAN1
- movdqa [%1+4*16], REG4
- movdqa [%1+7*16], xmm7
+
+ SECOND_HALF %1, %2, xmm1, xmm7, TAN1, REG4
%endmacro
; IDCT pass on columns, assuming rows 4-7 are zero
-%macro iLLM_PASS_SPARSE 1 ;dct
+%macro iLLM_PASS_SPARSE 2 ; %1=dct %2=type(normal,put,add)
pmulhw TAN3, xmm4
paddsw TAN3, xmm4
movdqa xmm3, xmm6
@@ -475,7 +527,7 @@ SECTION .text
movdqa xmm6, REG0
psubsw xmm6, SREG2
paddsw SREG2, REG0
- MOV32 TAN1, [r0]
+ MOV32 TAN1, [BLOCK]
movdqa XMMS, REG0
psubsw REG0, xmm5
paddsw xmm5, XMMS
@@ -485,70 +537,92 @@ SECTION .text
movdqa XMMS, REG0
psubsw REG0, xmm3
paddsw xmm3, XMMS
- MOV32 [r0], TAN1
- psraw xmm5, 6
- psraw REG0, 6
- psraw TAN3, 6
- psraw xmm3, 6
- movdqa [%1+1*16], TAN3
- movdqa [%1+2*16], xmm3
- movdqa [%1+5*16], REG0
- movdqa [%1+6*16], xmm5
+ MOV32 [BLOCK], TAN1
+
+ FIRST_HALF %1, %2
+
movdqa xmm0, SREG2
movdqa xmm4, xmm6
psubsw SREG2, xmm1
psubsw xmm6, TAN1
paddsw xmm1, xmm0
paddsw TAN1, xmm4
- psraw xmm1, 6
- psraw SREG2, 6
- psraw TAN1, 6
- psraw xmm6, 6
- movdqa [%1+0*16], xmm1
- movdqa [%1+3*16], TAN1
- movdqa [%1+4*16], xmm6
- movdqa [%1+7*16], SREG2
+
+ SECOND_HALF %1, %2, xmm1, SREG2, TAN1, xmm6
%endmacro
-INIT_XMM sse2
-cglobal xvid_idct, 1, 5, 8+7*ARCH_X86_64, block
+%macro IDCT_SSE2 1 ; 0=normal 1=put 2=add
+%if %1 == 0 || ARCH_X86_32
+ %define GPR0 r1d
+ %define GPR1 r2d
+ %define GPR2 r3d
+ %define GPR3 r4d
+ %define NUM_GPRS 5
+%else
+ %define GPR0 r3d
+ %define GPR1 r4d
+ %define GPR2 r5d
+ %define GPR3 r6d
+ %define NUM_GPRS 7
+%endif
+%if %1 == 0
+cglobal xvid_idct, 1, NUM_GPRS, 8+7*ARCH_X86_64, block
+%xdefine BLOCK blockq
+%else
+ %if %1 == 1
+cglobal xvid_idct_put, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+ %else
+cglobal xvid_idct_add, 0, NUM_GPRS, 8+7*ARCH_X86_64, dest, stride, block
+ %endif
+ %if ARCH_X86_64
+ %xdefine BLOCK blockq
+ %else
+ mov r0q, blockm
+ %xdefine BLOCK r0q
+ %endif
+%endif
movq mm0, [pb_127]
- iMTX_MULT r0 + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
- iMTX_MULT r0 + 1*16, iTab2, PUT_ODD, ROW1, 1*16
- iMTX_MULT r0 + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
+ iMTX_MULT BLOCK + 0*16, iTab1, PUT_EVEN, ROW0, 0*16
+ iMTX_MULT BLOCK + 1*16, iTab2, PUT_ODD, ROW1, 1*16
+ iMTX_MULT BLOCK + 2*16, iTab3, PUT_EVEN, ROW2, 2*16
- TEST_TWO_ROWS r0 + 3*16, r0 + 4*16, r1d, r2d, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
- JZ r1d, col1
- iMTX_MULT r0 + 3*16, iTab4, PUT_ODD, ROW3, 3*16
+ TEST_TWO_ROWS BLOCK + 3*16, BLOCK + 4*16, GPR0, GPR1, CLEAR_ODD, ROW3, CLEAR_EVEN, ROW4 ; a, c
+ JZ GPR0, col1
+ iMTX_MULT BLOCK + 3*16, iTab4, PUT_ODD, ROW3, 3*16
.col1:
- TEST_TWO_ROWS r0 + 5*16, r0 + 6*16, r1d, r3d, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
- TEST_ONE_ROW r0 + 7*16, r4d, CLEAR_ODD, ROW7 ; esi
+ TEST_TWO_ROWS BLOCK + 5*16, BLOCK + 6*16, GPR0, GPR2, CLEAR_ODD, ROW5, CLEAR_EVEN, ROW6 ; a, d
+ TEST_ONE_ROW BLOCK + 7*16, GPR3, CLEAR_ODD, ROW7 ; esi
iLLM_HEAD
- JNZ r2d, 2
- JNZ r1d, 3
- JNZ r3d, 4
- JNZ r4d, 5
- iLLM_PASS_SPARSE r0
+ JNZ GPR1, 2
+ JNZ GPR0, 3
+ JNZ GPR2, 4
+ JNZ GPR3, 5
+ iLLM_PASS_SPARSE BLOCK, %1
jmp .6
.2:
- iMTX_MULT r0 + 4*16, iTab1, PUT_EVEN, ROW4
+ iMTX_MULT BLOCK + 4*16, iTab1, PUT_EVEN, ROW4
.3:
- iMTX_MULT r0 + 5*16, iTab4, PUT_ODD, ROW5, 4*16
- JZ r3d, col2
+ iMTX_MULT BLOCK + 5*16, iTab4, PUT_ODD, ROW5, 4*16
+ JZ GPR2, col2
.4:
- iMTX_MULT r0 + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
+ iMTX_MULT BLOCK + 6*16, iTab3, PUT_EVEN, ROW6, 5*16
.col2:
- JZ r4d, col3
+ JZ GPR3, col3
.5:
- iMTX_MULT r0 + 7*16, iTab2, PUT_ODD, ROW7, 5*16
+ iMTX_MULT BLOCK + 7*16, iTab2, PUT_ODD, ROW7, 5*16
.col3:
%if ARCH_X86_32
iLLM_HEAD
%endif
- iLLM_PASS r0
+ iLLM_PASS BLOCK, %1
.6:
RET
+%endmacro
+
+INIT_XMM sse2
+IDCT_SSE2 0
+IDCT_SSE2 1
%if ARCH_X86_32
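
(Illustration, not part of the patch: for the put path (%2 == 1), FIRST_HALF
and SECOND_HALF shift each finished column-pass row right by 6, saturate the
words to unsigned bytes with packuswb, and store 8 pixels per row at the
corresponding stride offset from dest. A scalar C equivalent of that store
step, for reference only:)

#include <stddef.h>
#include <stdint.h>

/* What psraw by 6 + packuswb + movq/movhps amount to for one 8-pixel row. */
static void store_row(uint8_t *dest, ptrdiff_t stride, int row,
                      const int16_t coeff[8])
{
    for (int i = 0; i < 8; i++) {
        int v = coeff[i] >> 6;               /* psraw xmm, 6        */
        if (v < 0)   v = 0;                  /* packuswb saturates  */
        if (v > 255) v = 255;                /*   to [0, 255]       */
        dest[row * stride + i] = (uint8_t)v; /* movq/movhps to dest */
    }
}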
diff --git a/libavcodec/x86/xvididct_init.c b/libavcodec/x86/xvididct_init.c
index b429032..2530d7a 100644
--- a/libavcodec/x86/xvididct_init.c
+++ b/libavcodec/x86/xvididct_init.c
@@ -26,11 +26,7 @@
#include "idctdsp.h"
#include "xvididct.h"
-static void xvid_idct_sse2_put(uint8_t *dest, int line_size, short *block)
-{
- ff_xvid_idct_sse2(block);
- ff_put_pixels_clamped(block, dest, line_size);
-}
+void ff_xvid_idct_put_sse2(uint8_t *dest, int line_size, short *block);
static void xvid_idct_sse2_add(uint8_t *dest, int line_size, short *block)
{
@@ -91,7 +87,7 @@ av_cold void ff_xvid_idct_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
- c->idct_put = xvid_idct_sse2_put;
+ c->idct_put = ff_xvid_idct_put_sse2;
c->idct_add = xvid_idct_sse2_add;
c->idct = ff_xvid_idct_sse2;
c->perm_type = FF_IDCT_PERM_SSE2;
--
1.9.2.msysgit.0