[FFmpeg-devel] [PATCH] x86/motion_est: port mmxext and sse2 sad functions to yasm
James Almer
jamrial at gmail.com
Fri Sep 12 00:51:40 CEST 2014
Also add sse2 versions of sad16_x2, sad16_y2 and sad16_xy2, and add the
c->pix_abs[0][0] initialization that was missing for the sse2 sad16.
Signed-off-by: James Almer <jamrial at gmail.com>
---
Not benched.
TODO: Port mmx.
libavcodec/x86/me_cmp.asm | 229 +++++++++++++++++++++++++++++++++++++++++++
libavcodec/x86/me_cmp_init.c | 194 +++++++++---------------------------
2 files changed, 278 insertions(+), 145 deletions(-)
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index b0741f3..3e6bfef 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -23,6 +23,10 @@
%include "libavutil/x86/x86util.asm"
+SECTION_RODATA
+
+cextern pb_1 ; defined outside this file; SAD_XY2 loads it and psubusb's it from averaged rows (presumably every byte = 1 -- confirm at its definition)
+
SECTION .text
%macro DIFF_PIXELS_1 4
@@ -465,3 +469,228 @@ cglobal hf_noise%1, 3,3,0, pix1, lsize, h
INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16
+
+;-------------------------------------------------------------------------------------------
+;int ff_sad<8|16>_<opt>(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1, int stride, int h);
+;-------------------------------------------------------------------------------------------
+%macro SAD 1 ; %1 = block width in bytes (8 or 16); returns the sum of |pix2 - pix1| over h rows
+cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h ; v is never read; pix1 = blk2, pix2 = blk1
+    movsxdifnidn strideq, strided ; stride is a C int: sign-extend it before using it in 64-bit addressing
+%if %1 == mmsize ; a whole row fits in one register
+    shr       hd, 1 ; two rows are handled per iteration, so run h/2 iterations
+%define STRIDE strideq
+%else ; 16-byte rows on 8-byte MMX registers: one row per iteration, as two 8-byte halves
+%define STRIDE 8
+%endif
+    pxor      m2, m2 ; m2 = running SAD accumulator
+align 16
+.loop:
+    movu      m0, [pix2q] ; pix2 is loaded with unaligned moves
+    movu      m1, [pix2q+STRIDE]
+    psadbw    m0, [pix1q] ; per 64-bit lane: sum of absolute byte differences
+    psadbw    m1, [pix1q+STRIDE]
+    paddw     m2, m0
+    paddw     m2, m1
+%if %1 == mmsize
+    lea       pix1q, [pix1q+strideq*2] ; advance both pointers by the two rows just consumed
+    lea       pix2q, [pix2q+strideq*2]
+%else
+    add       pix1q, strideq ; advance both pointers by the single row just consumed
+    add       pix2q, strideq
+%endif
+    dec       hd
+    jg .loop ; signed test: the loop also ends if the count started out <= 0
+%if mmsize == 16
+    movhlps   m0, m2 ; fold the partial sum of the high 64-bit lane into the low one
+    paddw     m2, m0
+%endif
+    movd      eax, m2 ; return the accumulated SAD
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD 8
+SAD 16
+INIT_XMM sse2
+SAD 16
+
+;----------------------------------------------------------------------------------------------
+;int ff_sad<8|16>_x2_<opt>(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1, int stride, int h);
+;----------------------------------------------------------------------------------------------
+%macro SAD_X2 1 ; %1 = block width in bytes (8 or 16); SAD of pix1 against pavgb(pix2[x], pix2[x+1])
+cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h ; v is never read; pix1 = blk2, pix2 = blk1
+    movsxdifnidn strideq, strided ; stride is a C int: sign-extend it before using it in 64-bit addressing
+%if %1 == mmsize ; a whole row fits in one register
+    shr       hd, 1 ; two rows are handled per iteration, so run h/2 iterations
+%define STRIDE strideq
+%else ; 16-byte rows on 8-byte MMX registers: one row per iteration, as two 8-byte halves
+%define STRIDE 8
+%endif
+    pxor      m0, m0 ; m0 = running SAD accumulator
+align 16
+.loop:
+    movu      m1, [pix2q]
+    movu      m2, [pix2q+STRIDE]
+%if cpuflag(sse2) ; an xmm memory operand must be 16-byte aligned, which pix2+1 is not guaranteed to be
+    movu      m3, [pix2q+1]
+    movu      m4, [pix2q+STRIDE+1]
+    pavgb     m1, m3 ; rounded average of each byte with its right-hand neighbour
+    pavgb     m2, m4
+%else ; MMX memory operands have no alignment requirement
+    pavgb     m1, [pix2q+1] ; rounded average of each byte with its right-hand neighbour
+    pavgb     m2, [pix2q+STRIDE+1]
+%endif
+    psadbw    m1, [pix1q] ; per 64-bit lane: sum of absolute byte differences
+    psadbw    m2, [pix1q+STRIDE]
+    paddw     m0, m1
+    paddw     m0, m2
+%if %1 == mmsize
+    lea       pix1q, [pix1q+2*strideq] ; advance both pointers by the two rows just consumed
+    lea       pix2q, [pix2q+2*strideq]
+%else
+    add       pix1q, strideq ; advance both pointers by the single row just consumed
+    add       pix2q, strideq
+%endif
+    dec       hd
+    jg .loop ; signed test (as in the SAD macro): a halved count of 0 must not wrap around
+%if mmsize == 16
+    movhlps   m1, m0 ; fold the partial sum of the high 64-bit lane into the low one
+    paddw     m0, m1
+%endif
+    movd      eax, m0 ; return the accumulated SAD
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_X2 8
+SAD_X2 16
+INIT_XMM sse2
+SAD_X2 16
+
+;----------------------------------------------------------------------------------------------
+;int ff_sad<8|16>_y2_<opt>(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1, int stride, int h);
+;----------------------------------------------------------------------------------------------
+%macro SAD_Y2 1 ; %1 = block width in bytes (8 or 16); SAD of pix1 against pavgb(pix2 row, pix2 row below)
+cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h ; v is never read; pix1 = blk2, pix2 = blk1
+    movsxdifnidn strideq, strided ; stride is a C int: sign-extend it before using it in 64-bit addressing
+    shr       hd, 1 ; two output rows per iteration, so run h/2 iterations
+%if %1 != mmsize ; 16-byte rows on 8-byte MMX registers: m4..m6 mirror m1..m3 for the right half
+    movu      m4, [pix2q+8] ; first row, right half
+%endif
+    movu      m1, [pix2q] ; first row (left half); m1 always holds the row above the next output row
+    pxor      m0, m0 ; m0 = running SAD accumulator
+    add       pix2q, strideq ; pix2 now points at the second source row
+align 16
+.loop:
+    movu      m2, [pix2q]
+    movu      m3, [pix2q+strideq]
+    pavgb     m1, m2 ; rounded average of the previous row and this row
+    pavgb     m2, m3 ; rounded average of this row and the next row
+    psadbw    m1, [pix1q] ; per 64-bit lane: sum of absolute byte differences
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+    mova      m1, m3 ; the last row loaded becomes the "previous row" of the next iteration
+%if %1 != mmsize ; same steps for the right 8 bytes of each row
+    movu      m5, [pix2q+8]
+    movu      m6, [pix2q+strideq+8]
+    pavgb     m4, m5
+    pavgb     m5, m6
+    psadbw    m4, [pix1q+8]
+    psadbw    m5, [pix1q+strideq+8]
+    paddw     m0, m4
+    paddw     m0, m5
+    mova      m4, m6
+%endif
+    lea       pix1q, [pix1q+2*strideq] ; advance both pointers by two rows
+    lea       pix2q, [pix2q+2*strideq]
+    dec       hd
+    jg .loop ; signed test (as in the SAD macro): a halved count of 0 must not wrap around
+%if mmsize == 16
+    movhlps   m1, m0 ; fold the partial sum of the high 64-bit lane into the low one
+    paddw     m0, m1
+%endif
+    movd      eax, m0 ; return value
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_Y2 8
+SAD_Y2 16
+INIT_XMM sse2
+SAD_Y2 16
+
+;-----------------------------------------------------------------------------------------------
+;int ff_sad<8|16>_xy2_<opt>(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1, int stride, int h);
+;-----------------------------------------------------------------------------------------------
+%macro SAD_XY2 1 ; %1 = block width in bytes (8 or 16); SAD of pix1 against nested pavgb averages of pix2 in x and y
+cglobal sad%1_xy2, 5, 5, 7, v, pix1, pix2, stride, h ; v is never read; pix1 = blk2, pix2 = blk1
+    movsxdifnidn strideq, strided ; stride is a C int: sign-extend it before using it in 64-bit addressing
+    shr       hd, 1 ; two output rows per iteration, so run h/2 iterations
+    pxor      m0, m0 ; m0 = running SAD accumulator
+    mova      m4, [pb_1] ; m4 = constant subtracted (with unsigned saturation) from one averaged row per iteration
+    movu      m1, [pix2q] ; m1 = horizontal average of the first row; carried across iterations
+%if cpuflag(sse2) ; an xmm memory operand must be 16-byte aligned, which pix2+1 is not guaranteed to be
+    movu      m2, [pix2q+1]
+    pavgb     m1, m2
+%else
+    pavgb     m1, [pix2q+1]
+%endif
+%if %1 != mmsize ; 16-byte rows on 8-byte MMX registers: m5..m7 mirror m1..m3 for the right half
+    movu      m5, [pix2q+8]
+    pavgb     m5, [pix2q+8+1]
+%endif
+    add       pix2q, strideq ; pix2 now points at the second source row
+
+align 16
+.loop:
+    movu      m2, [pix2q]
+    movu      m3, [pix2q+strideq]
+%if cpuflag(sse2) ; m5/m6 are free as temporaries here: the right-half path is not assembled for sse2
+    movu      m5, [pix2q+1]
+    movu      m6, [pix2q+strideq+1]
+    pavgb     m2, m5 ; horizontal average of this row
+    pavgb     m3, m6 ; horizontal average of the next row
+%else
+    pavgb     m2, [pix2q+1] ; horizontal average of this row
+    pavgb     m3, [pix2q+strideq+1] ; horizontal average of the next row
+%endif
+    psubusb   m2, m4 ; bias this row down before the vertical averages; the init code only installs xy2 without CODEC_FLAG_BITEXACT
+    pavgb     m1, m2 ; vertical average: previous row with this row
+    pavgb     m2, m3 ; vertical average: this row with the next row
+    psadbw    m1, [pix1q] ; per 64-bit lane: sum of absolute byte differences
+    psadbw    m2, [pix1q+strideq]
+    paddw     m0, m1
+    paddw     m0, m2
+    mova      m1, m3 ; the last averaged row becomes the "previous row" of the next iteration
+%if %1 != mmsize ; same steps for the right 8 bytes of each row
+    movu      m6, [pix2q+8]
+    movu      m7, [pix2q+strideq+8]
+    pavgb     m6, [pix2q+8+1]
+    pavgb     m7, [pix2q+strideq+8+1]
+    psubusb   m6, m4
+    pavgb     m5, m6
+    pavgb     m6, m7
+    psadbw    m5, [pix1q+8]
+    psadbw    m6, [pix1q+strideq+8]
+    paddw     m0, m5
+    paddw     m0, m6
+    mova      m5, m7
+%endif
+    lea       pix1q, [pix1q+2*strideq] ; advance both pointers by two rows
+    lea       pix2q, [pix2q+2*strideq]
+    dec       hd
+    jg .loop ; signed test (as in the SAD macro): a halved count of 0 must not wrap around
+%if mmsize == 16
+    movhlps   m1, m0 ; fold the partial sum of the high 64-bit lane into the low one
+    paddw     m0, m1
+%endif
+    movd      eax, m0 ; return the accumulated SAD
+    RET
+%endmacro
+
+INIT_MMX mmxext
+SAD_XY2 8
+SAD_XY2 16
+INIT_XMM sse2
+SAD_XY2 16
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 21db221..2ff0c80 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -41,6 +41,30 @@ int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
int line_size, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);
+int ff_sad8_mmxext(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
+int ff_sad16_mmxext(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
+int ff_sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
+int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
+int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
+int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
+int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
+int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
+int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
+int ff_sad8_xy2_mmxext(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
+int ff_sad16_xy2_mmxext(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
+int ff_sad16_xy2_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
+ int stride, int h);
#define hadamard_func(cpu) \
int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
@@ -345,8 +369,6 @@ DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
0x0002000200020002ULL,
};
-DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
-
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
x86_reg len = -(x86_reg)stride * h;
@@ -382,130 +404,6 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
: "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
}
-static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
- int stride, int h)
-{
- __asm__ volatile (
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" ((x86_reg) stride));
-}
-
-static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
- int stride, int h)
-{
- int ret;
- __asm__ volatile (
- "pxor %%xmm2, %%xmm2 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movdqu (%1), %%xmm0 \n\t"
- "movdqu (%1, %4), %%xmm1 \n\t"
- "psadbw (%2), %%xmm0 \n\t"
- "psadbw (%2, %4), %%xmm1 \n\t"
- "paddw %%xmm0, %%xmm2 \n\t"
- "paddw %%xmm1, %%xmm2 \n\t"
- "lea (%1,%4,2), %1 \n\t"
- "lea (%2,%4,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- "movhlps %%xmm2, %%xmm0 \n\t"
- "paddw %%xmm0, %%xmm2 \n\t"
- "movd %%xmm2, %3 \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
- : "r" ((x86_reg) stride));
- return ret;
-}
-
-static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
- int stride, int h)
-{
- __asm__ volatile (
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "pavgb 1(%1), %%mm0 \n\t"
- "pavgb 1(%1, %3), %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" ((x86_reg) stride));
-}
-
-static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
- int stride, int h)
-{
- __asm__ volatile (
- "movq (%1), %%mm0 \n\t"
- "add %3, %1 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "pavgb %%mm1, %%mm0 \n\t"
- "pavgb %%mm2, %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2, %3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" ((x86_reg) stride));
-}
-
-static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
- int stride, int h)
-{
- __asm__ volatile (
- "movq "MANGLE(bone)", %%mm5 \n\t"
- "movq (%1), %%mm0 \n\t"
- "pavgb 1(%1), %%mm0 \n\t"
- "add %3, %1 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1), %%mm1 \n\t"
- "movq (%1,%3), %%mm2 \n\t"
- "pavgb 1(%1), %%mm1 \n\t"
- "pavgb 1(%1,%3), %%mm2 \n\t"
- "psubusb %%mm5, %%mm1 \n\t"
- "pavgb %%mm1, %%mm0 \n\t"
- "pavgb %%mm2, %%mm1 \n\t"
- "psadbw (%2), %%mm0 \n\t"
- "psadbw (%2,%3), %%mm1 \n\t"
- "paddw %%mm0, %%mm6 \n\t"
- "paddw %%mm1, %%mm6 \n\t"
- "movq %%mm2, %%mm0 \n\t"
- "lea (%1,%3,2), %1 \n\t"
- "lea (%2,%3,2), %2 \n\t"
- "sub $2, %0 \n\t"
- " jg 1b \n\t"
- : "+r" (h), "+r" (blk1), "+r" (blk2)
- : "r" ((x86_reg) stride)
- NAMED_CONSTRAINTS_ADD(bone));
-}
-
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
int stride, int h)
{
@@ -750,7 +648,6 @@ static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
} \
PIX_SAD(mmx)
-PIX_SAD(mmxext)
#endif /* HAVE_INLINE_ASM */
@@ -782,29 +679,11 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
if (INLINE_MMXEXT(cpu_flags)) {
c->vsad[4] = vsad_intra16_mmxext;
- c->pix_abs[0][0] = sad16_mmxext;
- c->pix_abs[1][0] = sad8_mmxext;
-
- c->sad[0] = sad16_mmxext;
- c->sad[1] = sad8_mmxext;
-
- c->pix_abs[0][1] = sad16_x2_mmxext;
- c->pix_abs[0][2] = sad16_y2_mmxext;
- c->pix_abs[1][1] = sad8_x2_mmxext;
- c->pix_abs[1][2] = sad8_y2_mmxext;
-
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
- c->pix_abs[0][3] = sad16_xy2_mmxext;
- c->pix_abs[1][3] = sad8_xy2_mmxext;
-
c->vsad[0] = vsad16_mmxext;
}
}
- if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
- c->sad[0] = sad16_sse2;
- }
-
#endif /* HAVE_INLINE_ASM */
if (EXTERNAL_MMX(cpu_flags)) {
@@ -823,6 +702,21 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
+
+ c->sad[0] = ff_sad16_mmxext;
+ c->sad[1] = ff_sad8_mmxext;
+
+ c->pix_abs[0][0] = ff_sad16_mmxext;
+ c->pix_abs[0][1] = ff_sad16_x2_mmxext;
+ c->pix_abs[0][2] = ff_sad16_y2_mmxext;
+ c->pix_abs[1][0] = ff_sad8_mmxext;
+ c->pix_abs[1][1] = ff_sad8_x2_mmxext;
+ c->pix_abs[1][2] = ff_sad8_y2_mmxext;
+
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+ c->pix_abs[0][3] = ff_sad16_xy2_mmxext;
+ c->pix_abs[1][3] = ff_sad8_xy2_mmxext;
+ }
}
if (EXTERNAL_SSE2(cpu_flags)) {
@@ -833,6 +727,16 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
+ if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
+ c->sad[0] = ff_sad16_sse2;
+ c->pix_abs[0][0] = ff_sad16_sse2;
+ c->pix_abs[0][1] = ff_sad16_x2_sse2;
+ c->pix_abs[0][2] = ff_sad16_y2_sse2;
+
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+ c->pix_abs[0][3] = ff_sad16_xy2_sse2;
+ }
+ }
}
if (EXTERNAL_SSSE3(cpu_flags)) {
--
1.8.5.5
More information about the ffmpeg-devel
mailing list