[MPlayer-dev-eng] [PATCH] yadif SSE2/SSSE3 optimization

Sat Nov 15 01:05:45 CET 2008

On Thu, Nov 13, 2008 at 08:22:50PM +0800, Zhou, Zongyi wrote:
> Hi all, 
> 
> I made this patch from the ffdshow-mt branch. The original SSE2/SSSE3 code was written by h.yamagata.
> I ported that code back to mplayer and replaced all movdqa/movdqu with movaps/movups.
> 
> My tests show that on Intel CPUs (except Yonah), SSE2 is ~20% faster than MMX2 and SSSE3 is 30% faster than MMX2.
> However on AMD CPUs (except Socket 754 Semprons), SSE2 is ~5% slower than MMX2.
> So now the SSE2 function is used only on Intel CPUs.
> 
> Regards,
> 
> ZZ

> Index: cpudetect.c
> ===================================================================
> --- cpudetect.c	(revision 27905)
> +++ cpudetect.c	(working copy)
> @@ -140,10 +140,11 @@
>  		caps->cpuStepping=regs2[0] & 0xf;
>  
>  		// general feature flags:
> -		caps->hasTSC  = (regs2[3] & (1 << 8  )) >>  8; // 0x0000010
> -		caps->hasMMX  = (regs2[3] & (1 << 23 )) >> 23; // 0x0800000
> -		caps->hasSSE  = (regs2[3] & (1 << 25 )) >> 25; // 0x2000000
> -		caps->hasSSE2 = (regs2[3] & (1 << 26 )) >> 26; // 0x4000000
> +		caps->hasTSC  = (regs2[3] & (1 << 8  )) >>  8; // 0x00000100
> +		caps->hasMMX  = (regs2[3] & (1 << 23 )) >> 23; // 0x00800000
> +		caps->hasSSE  = (regs2[3] & (1 << 25 )) >> 25; // 0x02000000
> +		caps->hasSSE2 = (regs2[3] & (1 << 26 )) >> 26; // 0x04000000
> +		caps->hasSSSE3 = (regs2[3] & (1 << 9 )) >>  9; // 0x00000200
>  		caps->hasMMX2 = caps->hasSSE; // SSE cpus supports mmxext too
>  		cl_size = ((regs2[1] >> 8) & 0xFF)*8;
>  		if(cl_size) caps->cl_size = cl_size;

mixing cosmetics, functional changes and bugs

[...]
> Index: libmpcodecs/vf_yadif.c
> ===================================================================
> --- libmpcodecs/vf_yadif.c	(revision 27905)
> +++ libmpcodecs/vf_yadif.c	(working copy)
> @@ -281,6 +281,268 @@
>  #undef CHECK2
>  #undef FILTER
>  
> +// sse2 & ssse3 templates
> +#define LOAD8(mem,dst) \
> +            "movq      "mem", "#dst" \n\t"\
> +            "punpcklbw %%xmm7, "#dst" \n\t"
> +
> +#define CHECK(pj,mj) \
> +            "movups "#pj"(%[cur],%[mrefs]), %%xmm2 \n\t" /* cur[x-refs-1+j] */\
> +            "movups "#mj"(%[cur],%[prefs]), %%xmm3 \n\t" /* cur[x+refs-1-j] */\
> +            "movaps      %%xmm2, %%xmm4 \n\t"\
> +            "movaps      %%xmm2, %%xmm5 \n\t"\
> +            "pxor        %%xmm3, %%xmm4 \n\t"\
> +            "pavgb       %%xmm3, %%xmm5 \n\t"\
> +            "pand        %[pb1], %%xmm4 \n\t"\
> +            "psubusb     %%xmm4, %%xmm5 \n\t"\
> +            "psrldq      $1,    %%xmm5 \n\t"\
> +            "punpcklbw   %%xmm7, %%xmm5 \n\t" /* (cur[x-refs+j] + cur[x+refs-j])>>1 */\
> +            "movaps      %%xmm2, %%xmm4 \n\t"\
> +            "psubusb     %%xmm3, %%xmm2 \n\t"\
> +            "psubusb     %%xmm4, %%xmm3 \n\t"\
> +            "pmaxub      %%xmm3, %%xmm2 \n\t"\
> +            "movaps      %%xmm2, %%xmm3 \n\t"\
> +            "movaps      %%xmm2, %%xmm4 \n\t" /* ABS(cur[x-refs-1+j] - cur[x+refs-1-j]) */\
> +            "psrldq      $1,   %%xmm3 \n\t" /* ABS(cur[x-refs  +j] - cur[x+refs  -j]) */\
> +            "psrldq      $2,   %%xmm4 \n\t" /* ABS(cur[x-refs+1+j] - cur[x+refs+1-j]) */\
> +            "punpcklbw   %%xmm7, %%xmm2 \n\t"\
> +            "punpcklbw   %%xmm7, %%xmm3 \n\t"\
> +            "punpcklbw   %%xmm7, %%xmm4 \n\t"\
> +            "paddw       %%xmm3, %%xmm2 \n\t"\
> +            "paddw       %%xmm4, %%xmm2 \n\t" /* score */
> +
> +#define CHECK1 \
> +            "movaps      %%xmm0, %%xmm3 \n\t"\
> +            "pcmpgtw     %%xmm2, %%xmm3 \n\t" /* if(score < spatial_score) */\
> +            "pminsw      %%xmm2, %%xmm0 \n\t" /* spatial_score= score; */\
> +            "movaps      %%xmm3, %%xmm6 \n\t"\
> +            "pand        %%xmm3, %%xmm5 \n\t"\
> +            "pandn       %%xmm1, %%xmm3 \n\t"\
> +            "por         %%xmm5, %%xmm3 \n\t"\
> +            "movaps      %%xmm3, %%xmm1 \n\t" /* spatial_pred= (cur[x-refs+j] + cur[x+refs-j])>>1; */
> +
> +#define CHECK2 /* pretend not to have checked dir=2 if dir=1 was bad.\
> +                  hurts both quality and speed, but matches the C version. */\
> +            "paddw       %[pw1], %%xmm6 \n\t"\
> +            "psllw       $14,   %%xmm6 \n\t"\
> +            "paddsw      %%xmm6, %%xmm2 \n\t"\
> +            "movaps      %%xmm0, %%xmm3 \n\t"\
> +            "pcmpgtw     %%xmm2, %%xmm3 \n\t"\
> +            "pminsw      %%xmm2, %%xmm0 \n\t"\
> +            "pand        %%xmm3, %%xmm5 \n\t"\
> +            "pandn       %%xmm1, %%xmm3 \n\t"\
> +            "por         %%xmm5, %%xmm3 \n\t"\
> +            "movaps      %%xmm3, %%xmm1 \n\t"
> +
> +#define FILTER\
> +    for(x=0; x<w; x+=8){\
> +        __asm__ volatile(\
> +            "pxor        %%xmm7, %%xmm7 \n\t"\
> +            LOAD8("(%[cur],%[mrefs])", %%xmm0) /* c = cur[x-refs] */\
> +            LOAD8("(%[cur],%[prefs])", %%xmm1) /* e = cur[x+refs] */\
> +            LOAD8("(%["prev2"])", %%xmm2) /* prev2[x] */\
> +            LOAD8("(%["next2"])", %%xmm3) /* next2[x] */\
> +            "movaps      %%xmm3, %%xmm4 \n\t"\
> +            "paddw       %%xmm2, %%xmm3 \n\t"\
> +            "psraw       $1,    %%xmm3 \n\t" /* d = (prev2[x] + next2[x])>>1 */\
> +            "movaps      %%xmm0, %[tmp0] \n\t" /* c */\
> +            "movaps      %%xmm3, %[tmp1] \n\t" /* d */\
> +            "movaps      %%xmm1, %[tmp2] \n\t" /* e */\
> +            "psubw       %%xmm4, %%xmm2 \n\t"\
> +            PABS(        %%xmm4, %%xmm2) /* temporal_diff0 */\
> +            LOAD8("(%[prev],%[mrefs])", %%xmm3) /* prev[x-refs] */\
> +            LOAD8("(%[prev],%[prefs])", %%xmm4) /* prev[x+refs] */\
> +            "psubw       %%xmm0, %%xmm3 \n\t"\
> +            "psubw       %%xmm1, %%xmm4 \n\t"\
> +            PABS(        %%xmm5, %%xmm3)\
> +            PABS(        %%xmm5, %%xmm4)\
> +            "paddw       %%xmm4, %%xmm3 \n\t" /* temporal_diff1 */\
> +            "psrlw       $1,    %%xmm2 \n\t"\
> +            "psrlw       $1,    %%xmm3 \n\t"\
> +            "pmaxsw      %%xmm3, %%xmm2 \n\t"\
> +            LOAD8("(%[next],%[mrefs])", %%xmm3) /* next[x-refs] */\
> +            LOAD8("(%[next],%[prefs])", %%xmm4) /* next[x+refs] */\
> +            "psubw       %%xmm0, %%xmm3 \n\t"\
> +            "psubw       %%xmm1, %%xmm4 \n\t"\
> +            PABS(        %%xmm5, %%xmm3)\
> +            PABS(        %%xmm5, %%xmm4)\
> +            "paddw       %%xmm4, %%xmm3 \n\t" /* temporal_diff2 */\
> +            "psrlw       $1,    %%xmm3 \n\t"\
> +            "pmaxsw      %%xmm3, %%xmm2 \n\t"\
> +            "movaps      %%xmm2, %[tmp3] \n\t" /* diff */\
> +\
> +            "paddw       %%xmm0, %%xmm1 \n\t"\
> +            "paddw       %%xmm0, %%xmm0 \n\t"\
> +            "psubw       %%xmm1, %%xmm0 \n\t"\
> +            "psrlw       $1,    %%xmm1 \n\t" /* spatial_pred */\
> +            PABS(        %%xmm2, %%xmm0)      /* ABS(c-e) */\
> +\
> +            "movups      -1(%[cur],%[mrefs]), %%xmm2 \n\t" /* cur[x-refs-1] */\
> +            "movups      -1(%[cur],%[prefs]), %%xmm3 \n\t" /* cur[x+refs-1] */\
> +            "movaps      %%xmm2, %%xmm4 \n\t"\
> +            "psubusb     %%xmm3, %%xmm2 \n\t"\
> +            "psubusb     %%xmm4, %%xmm3 \n\t"\
> +            "pmaxub      %%xmm3, %%xmm2 \n\t"\
> +            "pshuflw      $9,%%xmm2, %%xmm3 \n\t"\
> +            "pshufhw      $9,%%xmm2, %%xmm3 \n\t"\
> +            "punpcklbw   %%xmm7, %%xmm2 \n\t" /* ABS(cur[x-refs-1] - cur[x+refs-1]) */\
> +            "punpcklbw   %%xmm7, %%xmm3 \n\t" /* ABS(cur[x-refs+1] - cur[x+refs+1]) */\
> +            "paddw       %%xmm2, %%xmm0 \n\t"\
> +            "paddw       %%xmm3, %%xmm0 \n\t"\
> +            "psubw       %[pw1], %%xmm0 \n\t" /* spatial_score */\
> +\
> +            CHECK(-2,0)\
> +            CHECK1\
> +            CHECK(-3,1)\
> +            CHECK2\
> +            CHECK(0,-2)\
> +            CHECK1\
> +            CHECK(1,-3)\
> +            CHECK2\
> +\
> +            /* if(p->mode<2) ... */\
> +            "movaps      %[tmp3], %%xmm6 \n\t" /* diff */\
> +            "cmp         $2, %[mode] \n\t"\
> +            "jge         1f \n\t"\
> +            LOAD8("(%["prev2"],%[mrefs],2)", %%xmm2) /* prev2[x-2*refs] */\
> +            LOAD8("(%["next2"],%[mrefs],2)", %%xmm4) /* next2[x-2*refs] */\
> +            LOAD8("(%["prev2"],%[prefs],2)", %%xmm3) /* prev2[x+2*refs] */\
> +            LOAD8("(%["next2"],%[prefs],2)", %%xmm5) /* next2[x+2*refs] */\
> +            "paddw       %%xmm4, %%xmm2 \n\t"\
> +            "paddw       %%xmm5, %%xmm3 \n\t"\
> +            "psrlw       $1,    %%xmm2 \n\t" /* b */\
> +            "psrlw       $1,    %%xmm3 \n\t" /* f */\
> +            "movaps      %[tmp0], %%xmm4 \n\t" /* c */\
> +            "movaps      %[tmp1], %%xmm5 \n\t" /* d */\
> +            "movaps      %[tmp2], %%xmm7 \n\t" /* e */\
> +            "psubw       %%xmm4, %%xmm2 \n\t" /* b-c */\
> +            "psubw       %%xmm7, %%xmm3 \n\t" /* f-e */\
> +            "movaps      %%xmm5, %%xmm0 \n\t"\
> +            "psubw       %%xmm4, %%xmm5 \n\t" /* d-c */\
> +            "psubw       %%xmm7, %%xmm0 \n\t" /* d-e */\
> +            "movaps      %%xmm2, %%xmm4 \n\t"\
> +            "pminsw      %%xmm3, %%xmm2 \n\t"\
> +            "pmaxsw      %%xmm4, %%xmm3 \n\t"\
> +            "pmaxsw      %%xmm5, %%xmm2 \n\t"\
> +            "pminsw      %%xmm5, %%xmm3 \n\t"\
> +            "pmaxsw      %%xmm0, %%xmm2 \n\t" /* max */\
> +            "pminsw      %%xmm0, %%xmm3 \n\t" /* min */\
> +            "pxor        %%xmm4, %%xmm4 \n\t"\
> +            "pmaxsw      %%xmm3, %%xmm6 \n\t"\
> +            "psubw       %%xmm2, %%xmm4 \n\t" /* -max */\
> +            "pmaxsw      %%xmm4, %%xmm6 \n\t" /* diff= MAX3(diff, min, -max); */\
> +            "1: \n\t"\
> +\
> +            "movaps      %[tmp1], %%xmm2 \n\t" /* d */\
> +            "movaps      %%xmm2, %%xmm3 \n\t"\
> +            "psubw       %%xmm6, %%xmm2 \n\t" /* d-diff */\
> +            "paddw       %%xmm6, %%xmm3 \n\t" /* d+diff */\
> +            "pmaxsw      %%xmm2, %%xmm1 \n\t"\
> +            "pminsw      %%xmm3, %%xmm1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
> +            "packuswb    %%xmm1, %%xmm1 \n\t"\
> +\
> +            :[tmp0]"=m"(tmp0),\
> +             [tmp1]"=m"(tmp1),\
> +             [tmp2]"=m"(tmp2),\
> +             [tmp3]"=m"(tmp3)\
> +            :[prev] "r"(prev),\
> +             [cur]  "r"(cur),\
> +             [next] "r"(next),\
> +             [prefs]"r"((long)refs),\
> +             [mrefs]"r"((long)-refs),\
> +             [pw1]  "m"(*pw_1),\
> +             [pb1]  "m"(*pb_1),\
> +             [mode] "g"(mode)\
> +        );\
> +        __asm__ volatile("movq %%xmm1, %0" :"=m"(*dst));\
> +        dst += 8;\
> +        prev+= 8;\
> +        cur += 8;\
> +        next+= 8;\
> +    }

This code is the same as the MMX code, just with SSE2 registers and
instructions.
Thus it qualifies as code duplication and should be factored out into common
code or a macro.

> +

> +static void __attribute__((force_align_arg_pointer)) filter_line_sse2(struct vf_priv_s *p, uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int refs, int parity){
> +    const int mode = p->mode;
> +    DECLARE_ALIGNED(16, uint8_t, tmp0[16]);
> +    DECLARE_ALIGNED(16, uint8_t, tmp1[16]);
> +    DECLARE_ALIGNED(16, uint8_t, tmp2[16]);
> +    DECLARE_ALIGNED(16, uint8_t, tmp3[16]);

put these in the context; that avoids the need for force_align_arg_pointer

> +    int x;
> +    static DECLARE_ALIGNED(16, const unsigned short, pw_1[]) =
> +    {
> +        0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001
> +    };
> +
> +    static DECLARE_ALIGNED(16, const unsigned short, pb_1[]) =
> +    {
> +        0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101
> +    };
[...]
> +    static DECLARE_ALIGNED(16, const unsigned short, pw_1[]) =
> +    {
> +        0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001
> +    };
> +
> +    static DECLARE_ALIGNED(16, const unsigned short, pb_1[]) =
> +    {
> +        0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101,0x0101
> +    };

duplicate

[...]
> @@ -364,7 +626,7 @@
>          }
>      }
>  #if defined(HAVE_MMX) && defined(NAMED_ASM_ARGS)
> -    if(gCpuCaps.hasMMX2) __asm__ volatile("emms \n\t" : : : "memory");
> +    if(filter_line_mmx2 == filter_line) __asm__ volatile("emms \n\t" : : : "memory");
>  #endif

this isn't worth the check

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Complexity theory is the science of finding the exact solution to an
approximation. Benchmarking OTOH is finding an approximation of the exact
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/mplayer-dev-eng/attachments/20081115/8aff273c/attachment.pgp>