[FFmpeg-devel] [PATCH 2/6] x86: huffyuvdsp: add SSE2 median prediction
Christophe Gisquet
christophe.gisquet at gmail.com
Thu May 29 11:10:37 CEST 2014
From 5010c to 4566 on lagarith YUY2.
---
libavcodec/x86/huffyuvdsp.asm | 98 ++++++++++++++++++++++++----------------
libavcodec/x86/huffyuvdsp_init.c | 7 +++
2 files changed, 67 insertions(+), 38 deletions(-)
diff --git a/libavcodec/x86/huffyuvdsp.asm b/libavcodec/x86/huffyuvdsp.asm
index 7acab87..9806fed 100644
--- a/libavcodec/x86/huffyuvdsp.asm
+++ b/libavcodec/x86/huffyuvdsp.asm
@@ -33,64 +33,86 @@ SECTION_TEXT
; void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
; const uint8_t *diff, int w,
; int *left, int *left_top)
-INIT_MMX mmxext
-cglobal add_hfyu_median_pred, 6,6,0, dst, top, diff, w, left, left_top
- movq mm0, [topq]
- movq mm2, mm0
- movd mm4, [left_topq]
- psllq mm2, 8
- movq mm1, mm0
- por mm4, mm2
- movd mm3, [leftq]
- psubb mm0, mm4 ; t-tl
+%macro LSHIFT 2
+%if cpuflag(sse2)
+ pslldq %1, %2
+%else
+ psllq %1, 8*(%2)
+%endif
+%endmacro
+
+%macro RSHIFT 2
+%if cpuflag(sse2)
+ psrldq %1, %2
+%else
+ psrlq %1, 8*(%2)
+%endif
+%endmacro
+
+%macro HFYU_MEDIAN 0
+cglobal add_hfyu_median_pred, 6,6,8, dst, top, diff, w, left, left_top
+ movu m0, [topq]
+ mova m2, m0
+ movd m4, [left_topq]
+ LSHIFT m2, 1
+ mova m1, m0
+ por m4, m2
+ movd m3, [leftq]
+ psubb m0, m4 ; t-tl
add dstq, wq
add topq, wq
add diffq, wq
neg wq
jmp .skip
.loop:
- movq mm4, [topq+wq]
- movq mm0, mm4
- psllq mm4, 8
- por mm4, mm1
- movq mm1, mm0 ; t
- psubb mm0, mm4 ; t-tl
+ movu m4, [topq+wq]
+ mova m0, m4
+ LSHIFT m4, 1
+ por m4, m1
+ mova m1, m0 ; t
+ psubb m0, m4 ; t-tl
.skip:
- movq mm2, [diffq+wq]
+ movu m2, [diffq+wq]
%assign i 0
-%rep 8
- movq mm4, mm0
- paddb mm4, mm3 ; t-tl+l
- movq mm5, mm3
- pmaxub mm3, mm1
- pminub mm5, mm1
- pminub mm3, mm4
- pmaxub mm3, mm5 ; median
- paddb mm3, mm2 ; +residual
+%rep mmsize
+ mova m4, m0
+ paddb m4, m3 ; t-tl+l
+ mova m5, m3
+ pmaxub m3, m1
+ pminub m5, m1
+ pminub m3, m4
+ pmaxub m3, m5 ; median
+ paddb m3, m2 ; +residual
%if i==0
- movq mm7, mm3
- psllq mm7, 56
+ mova m7, m3
+ LSHIFT m7, mmsize-1
%else
- movq mm6, mm3
- psrlq mm7, 8
- psllq mm6, 56
- por mm7, mm6
+ mova m6, m3
+ RSHIFT m7, 1
+ LSHIFT m6, mmsize-1
+ por m7, m6
%endif
-%if i<7
- psrlq mm0, 8
- psrlq mm1, 8
- psrlq mm2, 8
+%if i<mmsize-1
+ RSHIFT m0, 1
+ RSHIFT m1, 1
+ RSHIFT m2, 1
%endif
%assign i i+1
%endrep
- movq [dstq+wq], mm7
- add wq, 8
+ movu [dstq+wq], m7
+ add wq, mmsize
jl .loop
movzx r2d, byte [dstq-1]
mov [leftq], r2d
movzx r2d, byte [topq-1]
mov [left_topq], r2d
RET
+%endmacro
+
+INIT_MMX mmxext
+HFYU_MEDIAN
+INIT_XMM sse2
+HFYU_MEDIAN
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
diff --git a/libavcodec/x86/huffyuvdsp_init.c b/libavcodec/x86/huffyuvdsp_init.c
index 184c2ce..1a42b87 100644
--- a/libavcodec/x86/huffyuvdsp_init.c
+++ b/libavcodec/x86/huffyuvdsp_init.c
@@ -31,6 +31,9 @@ void ff_add_hfyu_median_pred_cmov(uint8_t *dst, const uint8_t *top,
void ff_add_hfyu_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top);
+void ff_add_hfyu_median_pred_sse2(uint8_t *dst, const uint8_t *top,
+ const uint8_t *diff, int w,
+ int *left, int *left_top);
int ff_add_hfyu_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
int w, int left);
@@ -55,6 +58,10 @@ av_cold void ff_huffyuvdsp_init_x86(HuffYUVDSPContext *c)
c->add_hfyu_median_pred = ff_add_hfyu_median_pred_mmxext;
}
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->add_hfyu_median_pred = ff_add_hfyu_median_pred_sse2;
+ }
+
if (EXTERNAL_SSSE3(cpu_flags)) {
c->add_hfyu_left_pred = ff_add_hfyu_left_pred_ssse3;
if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
--
1.8.0.msysgit.0
More information about the ffmpeg-devel mailing list