[MPlayer-dev-eng] [PATCH] yadif SSE2/SSSE3 optimization
Michael Niedermayer
michaelni at gmx.at
Sat Nov 15 01:05:45 CET 2008
On Thu, Nov 13, 2008 at 08:22:50PM +0800, Zhou, Zongyi wrote:
> Hi all,
>
> I made this patch from ffdshow-mt branch. The original SSE2/SSSE3 codes are written by h.yamagata.
> I ported those codes back to mplayer and replaced all movdqa/movdqu with movaps/movups.
>
> My tests show that on Intel CPUs (except Yonah), SSE2 is ~20% faster than MMX2 and SSSE3 is 30% faster than MMX2.
> However on AMD CPUs (except Socket 754 Semprons), SSE2 is ~5% slower than MMX2.
> So now SSE2 function is used only on Intel CPUs.
>
> Regards,
>
> ZZ
> Index: cpudetect.c
> ===================================================================
> --- cpudetect.c (revision 27905)
> +++ cpudetect.c (working copy)
> @@ -140,10 +140,11 @@
> caps->cpuStepping=regs2[0] & 0xf;
>
> // general feature flags:
> - caps->hasTSC = (regs2[3] & (1 << 8 )) >> 8; // 0x0000010
> - caps->hasMMX = (regs2[3] & (1 << 23 )) >> 23; // 0x0800000
> - caps->hasSSE = (regs2[3] & (1 << 25 )) >> 25; // 0x2000000
> - caps->hasSSE2 = (regs2[3] & (1 << 26 )) >> 26; // 0x4000000
> + caps->hasTSC = (regs2[3] & (1 << 8 )) >> 8; // 0x00000100
> + caps->hasMMX = (regs2[3] & (1 << 23 )) >> 23; // 0x00800000
> + caps->hasSSE = (regs2[3] & (1 << 25 )) >> 25; // 0x02000000
> + caps->hasSSE2 = (regs2[3] & (1 << 26 )) >> 26; // 0x04000000
> + caps->hasSSSE3 = (regs2[3] & (1 << 9 )) >> 9; // 0x00000200
> caps->hasMMX2 = caps->hasSSE; // SSE cpus supports mmxext too
> cl_size = ((regs2[1] >> 8) & 0xFF)*8;
> if(cl_size) caps->cl_size = cl_size;
This mixes cosmetics, functional changes and bugs.
[...]
> Index: libmpcodecs/vf_yadif.c
> ===================================================================
> --- libmpcodecs/vf_yadif.c (revision 27905)
> +++ libmpcodecs/vf_yadif.c (working copy)
> @@ -281,6 +281,268 @@
> #undef CHECK2
> #undef FILTER
>
> +// sse2 & ssse3 templates
> +#define LOAD8(mem,dst) \
> + "movq "mem", "#dst" \n\t"\
> + "punpcklbw %%xmm7, "#dst" \n\t"
> +
> +#define CHECK(pj,mj) \
> + "movups "#pj"(%[cur],%[mrefs]), %%xmm2 \n\t" /* cur[x-refs-1+j] */\
> + "movups "#mj"(%[cur],%[prefs]), %%xmm3 \n\t" /* cur[x+refs-1-j] */\
> + "movaps %%xmm2, %%xmm4 \n\t"\
> + "movaps %%xmm2, %%xmm5 \n\t"\
> + "pxor %%xmm3, %%xmm4 \n\t"\
> + "pavgb %%xmm3, %%xmm5 \n\t"\
> + "pand %[pb1], %%xmm4 \n\t"\
> + "psubusb %%xmm4, %%xmm5 \n\t"\
> + "psrldq $1, %%xmm5 \n\t"\
> + "punpcklbw %%xmm7, %%xmm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
> + "movaps %%xmm2, %%xmm4 \n\t"\
> + "psubusb %%xmm3, %%xmm2 \n\t"\
> + "psubusb %%xmm4, %%xmm3 \n\t"\
> + "pmaxub %%xmm3, %%xmm2 \n\t"\
> + "movaps %%xmm2, %%xmm3 \n\t"\
> + "movaps %%xmm2, %%xmm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
> + "psrldq $1, %%xmm3 \n\t" /* ABS(cur[x-refs +j] - cur[x+refs -j]) */\
> + "psrldq $2, %%xmm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
> + "punpcklbw %%xmm7, %%xmm2 \n\t"\
> + "punpcklbw %%xmm7, %%xmm3 \n\t"\
> + "punpcklbw %%xmm7, %%xmm4 \n\t"\
> + "paddw %%xmm3, %%xmm2 \n\t"\
> + "paddw %%xmm4, %%xmm2 \n\t" /* score */
> +
> +#define CHECK1 \
> + "movaps %%xmm0, %%xmm3 \n\t"\
> + "pcmpgtw %%xmm2, %%xmm3 \n\t" /* if(score < spatial_score) */\
> + "pminsw %%xmm2, %%xmm0 \n\t" /* spatial_score= score; */\
> + "movaps %%xmm3, %%xmm6 \n\t"\
> + "pand %%xmm3, %%xmm5 \n\t"\
> + "pandn %%xmm1, %%xmm3 \n\t"\
> + "por %%xmm5, %%xmm3 \n\t"\
> + "movaps %%xmm3, %%xmm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
> +
> +#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
> + hurts both quality and speed, but matches the C version. */\
> + "paddw %[pw1], %%xmm6 \n\t"\
> + "psllw $14, %%xmm6 \n\t"\
> + "paddsw %%xmm6, %%xmm2 \n\t"\
> + "movaps %%xmm0, %%xmm3 \n\t"\
> + "pcmpgtw %%xmm2, %%xmm3 \n\t"\
> + "pminsw %%xmm2, %%xmm0 \n\t"\
> + "pand %%xmm3, %%xmm5 \n\t"\
> + "pandn %%xmm1, %%xmm3 \n\t"\
> + "por %%xmm5, %%xmm3 \n\t"\
> + "movaps %%xmm3, %%xmm1 \n\t"
> +
> +#define FILTER\
> + for(x=0; x<w; x+=8){\
> + __asm__ volatile(\
> + "pxor %%xmm7, %%xmm7 \n\t"\
> + LOAD8("(%[cur],%[mrefs])", %%xmm0) /* c = cur[x-refs] */\
> + LOAD8("(%[cur],%[prefs])", %%xmm1) /* e = cur[x+refs] */\
> + LOAD8("(%["prev2"])", %%xmm2) /* prev2[x] */\
> + LOAD8("(%["next2"])", %%xmm3) /* next2[x] */\
> + "movaps %%xmm3, %%xmm4 \n\t"\
> + "paddw %%xmm2, %%xmm3 \n\t"\
> + "psraw $1, %%xmm3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
> + "movaps %%xmm0, %[tmp0] \n\t" /* c */\
> + "movaps %%xmm3, %[tmp1] \n\t" /* d */\
> + "movaps %%xmm1, %[tmp2] \n\t" /* e */\
> + "psubw %%xmm4, %%xmm2 \n\t"\
> + PABS( %%xmm4, %%xmm2) /* temporal_diff0 */\
> + LOAD8("(%[prev],%[mrefs])", %%xmm3) /* prev[x-refs] */\
> + LOAD8("(%[prev],%[prefs])", %%xmm4) /* prev[x+refs] */\
> + "psubw %%xmm0, %%xmm3 \n\t"\
> + "psubw %%xmm1, %%xmm4 \n\t"\
> + PABS( %%xmm5, %%xmm3)\
> + PABS( %%xmm5, %%xmm4)\
> + "paddw %%xmm4, %%xmm3 \n\t" /* temporal_diff1 */\
> + "psrlw $1, %%xmm2 \n\t"\
> + "psrlw $1, %%xmm3 \n\t"\
> + "pmaxsw %%xmm3, %%xmm2 \n\t"\
> + LOAD8("(%[next],%[mrefs])", %%xmm3) /* next[x-refs] */\
> + LOAD8("(%[next],%[prefs])", %%xmm4) /* next[x+refs] */\
> + "psubw %%xmm0, %%xmm3 \n\t"\
> + "psubw %%xmm1, %%xmm4 \n\t"\
> + PABS( %%xmm5, %%xmm3)\
> + PABS( %%xmm5, %%xmm4)\
> + "paddw %%xmm4, %%xmm3 \n\t" /* temporal_diff2 */\
> + "psrlw $1, %%xmm3 \n\t"\
> + "pmaxsw %%xmm3, %%xmm2 \n\t"\
> + "movaps %%xmm2, %[tmp3] \n\t" /* diff */\
> +\
> + "paddw %%xmm0, %%xmm1 \n\t"\
> + "paddw %%xmm0, %%xmm0 \n\t"\
> + "psubw %%xmm1, %%xmm0 \n\t"\
> + "psrlw $1, %%xmm1 \n\t" /* spatial_pred */\
> + PABS( %%xmm2, %%xmm0) /* ABS(c-e) */\
> +\
> + "movups -1(%[cur],%[mrefs]), %%xmm2 \n\t" /* cur[x-refs-1] */\
> + "movups -1(%[cur],%[prefs]), %%xmm3 \n\t" /* cur[x+refs-1] */\
> + "movaps %%xmm2, %%xmm4 \n\t"\
> + "psubusb %%xmm3, %%xmm2 \n\t"\
> + "psubusb %%xmm4, %%xmm3 \n\t"\
> + "pmaxub %%xmm3, %%xmm2 \n\t"\
> + "pshuflw $9,%%xmm2, %%xmm3 \n\t"\
> + "pshufhw $9,%%xmm2, %%xmm3 \n\t"\
> + "punpcklbw %%xmm7, %%xmm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
> + "punpcklbw %%xmm7, %%xmm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
> + "paddw %%xmm2, %%xmm0 \n\t"\
> + "paddw %%xmm3, %%xmm0 \n\t"\
> + "psubw %[pw1], %%xmm0 \n\t" /* spatial_score */\
> +\
> + CHECK(-2,0)\
> + CHECK1\
> + CHECK(-3,1)\
> + CHECK2\
> + CHECK(0,-2)\
> + CHECK1\
> + CHECK(1,-3)\
> + CHECK2\
> +\
> + /* if(p->mode<2) ... */\
> + "movaps %[tmp3], %%xmm6 \n\t" /* diff */\
> + "cmp $2, %[mode] \n\t"\
> + "jge 1f \n\t"\
> + LOAD8("(%["prev2"],%[mrefs],2)", %%xmm2) /* prev2[x-2*refs] */\
> + LOAD8("(%["next2"],%[mrefs],2)", %%xmm4) /* next2[x-2*refs] */\
> + LOAD8("(%["prev2"],%[prefs],2)", %%xmm3) /* prev2[x+2*refs] */\
> + LOAD8("(%["next2"],%[prefs],2)", %%xmm5) /* next2[x+2*refs] */\
> + "paddw %%xmm4, %%xmm2 \n\t"\
> + "paddw %%xmm5, %%xmm3 \n\t"\
> + "psrlw $1, %%xmm2 \n\t" /* b */\
> + "psrlw $1, %%xmm3 \n\t" /* f */\
> + "movaps %[tmp0], %%xmm4 \n\t" /* c */\
> + "movaps %[tmp1], %%xmm5 \n\t" /* d */\
> + "movaps %[tmp2], %%xmm7 \n\t" /* e */\
> + "psubw %%xmm4, %%xmm2 \n\t" /* b-c */\
> + "psubw %%xmm7, %%xmm3 \n\t" /* f-e */\
> + "movaps %%xmm5, %%xmm0 \n\t"\
> + "psubw %%xmm4, %%xmm5 \n\t" /* d-c */\
> + "psubw %%xmm7, %%xmm0 \n\t" /* d-e */\
> + "movaps %%xmm2, %%xmm4 \n\t"\
> + "pminsw %%xmm3, %%xmm2 \n\t"\
> + "pmaxsw %%xmm4, %%xmm3 \n\t"\
> + "pmaxsw %%xmm5, %%xmm2 \n\t"\
> + "pminsw %%xmm5, %%xmm3 \n\t"\
> + "pmaxsw %%xmm0, %%xmm2 \n\t" /* max */\
> + "pminsw %%xmm0, %%xmm3 \n\t" /* min */\
> + "pxor %%xmm4, %%xmm4 \n\t"\
> + "pmaxsw %%xmm3, %%xmm6 \n\t"\
> + "psubw %%xmm2, %%xmm4 \n\t" /* -max */\
> + "pmaxsw %%xmm4, %%xmm6 \n\t" /* diff= MAX3(diff, min, -max); */\
> + "1: \n\t"\
> +\
> + "movaps %[tmp1], %%xmm2 \n\t" /* d */\
> + "movaps %%xmm2, %%xmm3 \n\t"\
> + "psubw %%xmm6, %%xmm2 \n\t" /* d-diff */\
> + "paddw %%xmm6, %%xmm3 \n\t" /* d+diff */\
> + "pmaxsw %%xmm2, %%xmm1 \n\t"\
> + "pminsw %%xmm3, %%xmm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
> + "packuswb %%xmm1, %%xmm1 \n\t"\
> +\
> + :[tmp0]"=m"(tmp0),\
> + [tmp1]"=m"(tmp1),\
> + [tmp2]"=m"(tmp2),\
> + [tmp3]"=m"(tmp3)\
> + :[prev] "r"(prev),\
> + [cur] "r"(cur),\
> + [next] "r"(next),\
> + [prefs]"r"((long)refs),\
> + [mrefs]"r"((long)-refs),\
> + [pw1] "m"(*pw_1),\
> + [pb1] "m"(*pb_1),\
> + [mode] "g"(mode)\
> + );\
> + __asm__ volatile("movq %%xmm1, %0" :"=m"(*dst));\
> + dst += 8;\
> + prev+= 8;\
> + cur += 8;\
> + next+= 8;\
> + }
This code is the same as the MMX code, just with SSE2 registers and
instructions.
Thus it qualifies as code duplication and should be factorized into common
code/macros.
> +
> +static void __attribute__((force_align_arg_pointer)) filter_line_sse2(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity){
> + const int mode = p->mode;
> + DECLARE_ALIGNED(16, uint8_t, tmp0[16]);
> + DECLARE_ALIGNED(16, uint8_t, tmp1[16]);
> + DECLARE_ALIGNED(16, uint8_t, tmp2[16]);
> + DECLARE_ALIGNED(16, uint8_t, tmp3[16]);
Put these in the context; that avoids the need for force_align_arg_pointer.
> + int x;
> + static DECLARE_ALIGNED(16, const unsigned short, pw_1[]) =
> + {
> + 0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001
> + };
> +
> + static DECLARE_ALIGNED(16, const unsigned short, pb_1[]) =
> + {
> + 0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101
> + };
[...]
> + static DECLARE_ALIGNED(16, const unsigned short, pw_1[]) =
> + {
> + 0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001
> + };
> +
> + static DECLARE_ALIGNED(16, const unsigned short, pb_1[]) =
> + {
> + 0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101
> + };
duplicate
[...]
> @@ -364,7 +626,7 @@
> }
> }
> #if defined(HAVE_MMX) && defined(NAMED_ASM_ARGS)
> - if(gCpuCaps.hasMMX2) __asm__ volatile("emms \n\t" : : : "memory");
> + if(filter_line_mmx2 == filter_line) __asm__ volatile("emms \n\t" : : : "memory");
> #endif
This isn't worth the check.
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
Complexity theory is the science of finding the exact solution to an
approximation. Benchmarking OTOH is finding an approximation of the exact
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/mplayer-dev-eng/attachments/20081115/8aff273c/attachment.pgp>
More information about the MPlayer-dev-eng
mailing list