[FFmpeg-devel] [PATCH] H.264: SSE2 weight/biweight functions

Michael Niedermayer michaelni
Thu Jan 8 01:48:15 CET 2009


On Mon, Jan 05, 2009 at 05:39:21PM -0500, Jason Garrett-Glaser wrote:
> On Mon, Jan 5, 2009 at 5:31 PM, Jason Garrett-Glaser
> <darkshikari at gmail.com> wrote:
> > On Mon, Jan 5, 2009 at 5:25 PM, Diego Biurrun <diego at biurrun.de> wrote:
> >> On Mon, Jan 05, 2009 at 05:17:45PM -0500, Jason Garrett-Glaser wrote:
> >>> On Mon, Jan 5, 2009 at 4:55 PM, Jason Garrett-Glaser
> >>> <darkshikari at gmail.com> wrote:
> >>> > $subject
> >>>
> >>> The bad news: 10L in the above patch.
> >>>
> >>> The good news: now includes SSSE3 support.
> >>
> >> some nits..
> >>
> >>> --- libavcodec/x86/dsputil_mmx.c      (revision 16439)
> >>> +++ libavcodec/x86/dsputil_mmx.c      (working copy)
> >>> @@ -2852,6 +2852,18 @@
> >>> +
> >>> +            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
> >>> +            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
> >>> +            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
> >>> +            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
> >>> +            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
> >>> +
> >>> +            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
> >>> +            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
> >>> +            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
> >>> +            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
> >>> +            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
> >>
> >> This could be more readable with a space before the =.
> >>
> >>> @@ -2873,6 +2885,11 @@
> >>>              c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_ssse3;
> >>>              c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_ssse3;
> >>>              c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
> >>> +            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
> >>> +            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
> >>> +            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
> >>> +            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
> >>> +            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
> >>
> >> ditto
> >>
> >>> --- libavcodec/x86/h264dsp_mmx.c      (revision 16439)
> >>> +++ libavcodec/x86/h264dsp_mmx.c      (working copy)
> >>> @@ -2239,6 +2239,47 @@
> >>>
> >>> +        "movd    %0, %%xmm4        \n\t"
> >>> +        "movd    %1, %%xmm5        \n\t"
> >>> +        "movd    %2, %%xmm6        \n\t"
> >>> +        "pshuflw  $0, %%xmm4, %%xmm4 \n\t"
> >>> +        "pshuflw  $0, %%xmm5, %%xmm5 \n\t"
> >>> +        "punpcklqdq  %%xmm4, %%xmm4 \n\t"
> >>> +        "punpcklqdq  %%xmm5, %%xmm5 \n\t"
> >>> +        "pxor    %%xmm7, %%xmm7     \n\t"
> >>
> >> This could be aligned for better readability, same below...
> >>
> >> Diego
> >> _______________________________________________
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel at mplayerhq.hu
> >> https://lists.mplayerhq.hu/mailman/listinfo/ffmpeg-devel
> >>
> >
> > Done.
> >
> > Dark Shikari
> >
> 
> Seems I forgot about weight sse2.  Fixed as well.
[...]
> Index: libavcodec/x86/h264dsp_mmx.c
> ===================================================================
> --- libavcodec/x86/h264dsp_mmx.c	(revision 16439)
> +++ libavcodec/x86/h264dsp_mmx.c	(working copy)
> @@ -2239,6 +2239,47 @@
>      }
>  }
>  
> +static inline void ff_h264_weight_WxH_sse2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
> +{
> +    int x, y;
> +    offset <<= log2_denom;
> +    offset += (1 << log2_denom) >> 1;

considering that the c code does
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \

i think, this could be done once per slice instead of repeatly for
each block

then we pass
h->luma_weight[list][refn], h->luma_offset[list][refn]
currently, which is silly, a single table containing weight and offset
interleaved would mean passing just one pointer instead of 2 dereferenced
ones.
Also it may (but doesnt have to be) faster to do the pshuflw/punpcklqdq
also ahead per slice and store the 16byte repeated variant in the table.



> +    __asm__ volatile(
> +        "movd    %0, %%xmm4         \n\t"
> +        "movd    %1, %%xmm5         \n\t"
> +        "movd    %2, %%xmm6         \n\t"
> +        "pshuflw $0, %%xmm4, %%xmm4 \n\t"
> +        "pshuflw $0, %%xmm5, %%xmm5 \n\t"
> +        "punpcklqdq  %%xmm4, %%xmm4 \n\t"
> +        "punpcklqdq  %%xmm5, %%xmm5 \n\t"
> +        "pxor    %%xmm7, %%xmm7     \n\t"
> +        :: "g"(weight), "g"(offset), "g"(log2_denom)
> +    );





> +    for(y=0; y<h; y+=2){
> +        for(x=0; x<w; x+=8){
> +            __asm__ volatile(
> +                "movq      %0,     %%xmm0 \n\t"
> +                "movq      %1,     %%xmm1 \n\t"
> +                "punpcklbw %%xmm7, %%xmm0 \n\t"
> +                "punpcklbw %%xmm7, %%xmm1 \n\t"
> +                "pmullw    %%xmm4, %%xmm0 \n\t"
> +                "pmullw    %%xmm4, %%xmm1 \n\t"
> +                "paddsw    %%xmm5, %%xmm0 \n\t"
> +                "paddsw    %%xmm5, %%xmm1 \n\t"
> +                "psraw     %%xmm6, %%xmm0 \n\t"
> +                "psraw     %%xmm6, %%xmm1 \n\t"
> +                "packuswb  %%xmm7, %%xmm0 \n\t"
> +                "packuswb  %%xmm7, %%xmm1 \n\t"
> +                "movq      %%xmm0, %0     \n\t"
> +                "movq      %%xmm1, %1     \n\t"
> +                : "+m"(*(uint64_t*)(dst+x)),
> +                  "+m"(*(uint64_t*)(dst+x+stride))
> +            );
> +        }
> +        dst += 2*stride;
> +    }

the loops should be done in asm, i dont trust gcc


> +}
> +
>  static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
>  {
>      int x, y;

> @@ -2277,6 +2318,84 @@
>      }
>  }
>  
> +static inline void ff_h264_biweight_WxH_sse2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
> +{
> +    int x, y;
> +    offset = ((offset + 1) | 1) << log2_denom;
> +    __asm__ volatile(
> +        "movd    %0, %%xmm3         \n\t"
> +        "movd    %1, %%xmm4         \n\t"
> +        "movd    %2, %%xmm5         \n\t"
> +        "movd    %3, %%xmm6         \n\t"
> +        "pshuflw $0, %%xmm3, %%xmm3 \n\t"
> +        "pshuflw $0, %%xmm4, %%xmm4 \n\t"
> +        "pshuflw $0, %%xmm5, %%xmm5 \n\t"
> +        "punpcklqdq  %%xmm3, %%xmm3 \n\t"
> +        "punpcklqdq  %%xmm4, %%xmm4 \n\t"
> +        "punpcklqdq  %%xmm5, %%xmm5 \n\t"
> +        "pxor    %%xmm7, %%xmm7     \n\t"
> +        :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
> +    );

this stuff similarly could be calculated per slice and stored in a
[ref0][ref1] table (for example the implicit weight table ...)
i think at least ...
(of course only if it is faster ...)


[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

No human being will ever know the Truth, for even if they happen to say it
by chance, they would not even known they had done so. -- Xenophanes
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090108/8c8eba12/attachment.pgp>



More information about the ffmpeg-devel mailing list