[FFmpeg-devel] [PATCH 01/10] diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped

Thu Jun 23 21:57:18 CEST 2016

On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote:
> Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
> ---
>  libavcodec/x86/diracdsp.asm    | 47 ++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/diracdsp_init.c |  6 ++++++
>  2 files changed, 53 insertions(+)
> 
> diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> index a042413..9db7b67 100644
> --- a/libavcodec/x86/diracdsp.asm
> +++ b/libavcodec/x86/diracdsp.asm
> @@ -22,6 +22,8 @@
>  
>  SECTION_RODATA
>  pw_7: times 8 dw 7
> +convert_to_unsigned_10bit: times 4 dd 0x200
> +clip_10bit:                times 8 dw 0x3ff
>  
>  cextern pw_3
>  cextern pw_16
> @@ -172,6 +174,48 @@ cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w,
>      RET
>  %endm
>  
> +%macro PUT_RECT_10 0
> +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
> +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h

This is x86_64 only, since it asks for nine general-purpose registers
and uses r6-r8. Either add the relevant preprocessor checks here and
to the init file, or make the necessary changes to make it work on
x86_32.
Look at the 8-bit version of put_signed_rect_clamped for an example of
how to deal with this using the stack.

> +
> +    neg      wq
> +    neg      hq

Why? You're not using these as part of effective addresses, just as
counters. Keep them as they are and use sub instead of add in the
loops below.
For that matter, since width and height are declared as int, you'd
need to sign-extend them with movsxd before negating the full 64-bit
registers, or change the prototype and make them ptrdiff_t instead
of int.

> +    mov      r6, srcq
> +    mov      r7, dstq
> +    mov      r8, wq
> +    pxor     m2, m2
> +    mova     m3, [clip_10bit]
> +    mova     m4, [convert_to_unsigned_10bit]
> +
> +    .loop_h:
> +    mov      srcq, r6
> +    mov      dstq, r7
> +    mov      wq,   r8
> +
> +    .loop_w:
> +    movu     m0, [srcq+0*mmsize]
> +    movu     m1, [srcq+1*mmsize]
> +
> +    paddd    m0, m4
> +    paddd    m1, m4
> +    packusdw m0, m0, m1
> +    CLIPW    m0, m2, m3 ; packusdw saturates so it's fine

Would be nice if you could make this work with SSE2 as well.
There are some examples of packusdw SSE2 emulation in the codebase.

> +
> +    movu     [dstq], m0
> +
> +    add      srcq, 2*mmsize
> +    add      dstq, 1*mmsize
> +    add      wq, 8
> +    jl       .loop_w
> +
> +    add      r6, src_strideq
> +    add      r7, dst_strideq
> +    add      hq, 1

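Make sure to do "sub wd, 8" and "sub hd, 1" after removing the above
negs if you don't change the prototype. The counters then count down
towards zero, so the loop branches need to become jg instead of jl.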

> +    jl       .loop_h
> +
> +    RET
> +%endm
> +
>  %macro ADD_RECT 1
>  ; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
>  cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
> @@ -263,3 +307,6 @@ ADD_RECT sse2
>  HPEL_FILTER sse2
>  ADD_OBMC 32, sse2
>  ADD_OBMC 16, sse2
> +
> +INIT_XMM sse4
> +PUT_RECT_10

No need to make it a macro if it's going to be a single version.
If you add an SSE2 one, then this would make sense.

> diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
> index 5fae798..4786eea 100644
> --- a/libavcodec/x86/diracdsp_init.c
> +++ b/libavcodec/x86/diracdsp_init.c
> @@ -46,6 +46,8 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src,
>  void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
>  void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
>  
> +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
> +
>  #if HAVE_YASM
>  
>  #define HPEL_FILTER(MMSIZE, EXT)                                                             \
> @@ -184,4 +186,8 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
>          c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
>          c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
>      }
> +
> +    if (EXTERNAL_SSE4(mm_flags)) {
> +        c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
> +    }
>  }
>