[FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

Thu Jun 23 22:01:05 CEST 2016

On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote:
> Currently unused, to be used in the following commits.
> 
> Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
> ---
>  libavcodec/diracdsp.c          | 24 ++++++++++++++++++++++++
>  libavcodec/diracdsp.h          |  4 ++++
>  libavcodec/x86/diracdsp.asm    | 41 +++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/diracdsp_init.c |  4 +++-
>  4 files changed, 72 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
> index ab8d149..d0cfd00 100644
> --- a/libavcodec/diracdsp.c
> +++ b/libavcodec/diracdsp.c
> @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const uint16_t *src, int stride,
>      }
>  }
>  
> +#define DEQUANT_SUBBAND(PX)                                                                        \
> +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst, ptrdiff_t stride,             \
> +                                         const int qf, const int qs, int64_t tot_v, int64_t tot_h) \

Shouldn't tot_v and tot_h be int (or ptrdiff_t)? Seeing that they are int
in the SliceCoeffs struct introduced by patch 6, I don't see why they
should be int64_t here. Unless I'm missing something.

> +{                                                                                                  \
> +    int i, y;                                                                                      \
> +    for (y = 0; y < tot_v; y++) {                                                                  \
> +        PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;                                        \
> +        for (i = 0; i < tot_h; i++) {                                                              \
> +            c = *src_r++;                                                                          \
> +            sign = FFSIGN(c)*(!!c);                                                                \
> +            c = (FFABS(c)*qf + qs) >> 2;                                                           \
> +            *dst_r++ = c*sign;                                                                     \
> +        }                                                                                          \
> +        src += tot_h << (sizeof(PX) >> 1);                                                         \
> +        dst += stride;                                                                             \
> +    }                                                                                              \
> +}
> +
> +DEQUANT_SUBBAND(int16_t)
> +DEQUANT_SUBBAND(int32_t)
> +
>  #define PIXFUNC(PFX, WIDTH)                                             \
>      c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _c; \
>      c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ## _dirac_pixels ## WIDTH ## _l2_c; \
> @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
>      c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
>      c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
>  
> +    c->dequant_subband[0] = c->dequant_subband[2] = dequant_subband_int16_t_c;
> +    c->dequant_subband[1] = c->dequant_subband[3] = dequant_subband_int32_t_c;
> +
>      PIXFUNC(put, 8);
>      PIXFUNC(put, 16);
>      PIXFUNC(put, 32);
> diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
> index 25a872d..c0ac56b 100644
> --- a/libavcodec/diracdsp.h
> +++ b/libavcodec/diracdsp.h
> @@ -22,6 +22,7 @@
>  #define AVCODEC_DIRACDSP_H
>  
>  #include <stdint.h>
> +#include <stddef.h>
>  
>  typedef void (*dirac_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int h);
>  typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src, int stride, int log2_denom, int weightd, int weights, int h);
> @@ -46,6 +47,9 @@ typedef struct {
>      void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int idwt_stride, int width, int height/*mod 2*/);
>      void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
>  
> +    /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
> +    void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int64_t tot_v, int64_t tot_h);
> +
>      dirac_weight_func weight_dirac_pixels_tab[3];
>      dirac_biweight_func biweight_dirac_pixels_tab[3];
>  } DiracDSPContext;
> diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> index 9db7b67..f743363 100644
> --- a/libavcodec/x86/diracdsp.asm
> +++ b/libavcodec/x86/diracdsp.asm
> @@ -289,6 +289,46 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
>      RET
>  %endm
>  
> +%macro DEQUANT_SUBBAND_32 0
> +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int64_t tot_v, int64_t tot_h)
> +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v, tot_h

Again, this is x86_64 only as is (it requests 9 GPRs).

> +
> +    movd   m2, qfd
> +    movd   m3, qsd
> +    SPLATD m2
> +    SPLATD m3
> +    neg    tot_vq
> +    neg    tot_hq

Same as with put_signed_rect_clamped_10, no reason to neg these.

> +    mov    r7, dstq
> +    mov    r8, tot_hq

You have qf and qs free once they have been splatted into m2 and m3, so
there's no need to use two extra registers (r7 and r8) here.
This and changing tot_v and tot_h to int/ptrdiff_t should make it work
on x86_32 without extra work.

> +
> +    .loop_v:
> +    mov    dstq,   r7
> +    mov    tot_hq, r8
> +
> +    .loop_h:
> +    movu   m0, [srcq]
> +
> +    pabsd  m1, m0
> +    pmulld m1, m2
> +    paddd  m1, m3
> +    psrld  m1,  2
> +    psignd m1, m0
> +
> +    movu   [dstq], m1
> +
> +    add    srcq, mmsize
> +    add    dstq, mmsize
> +    add    tot_hq, 4
> +    jl     .loop_h
> +
> +    add    r7, strideq
> +    add    tot_vq, 1
> +    jl     .loop_v
> +
> +    RET
> +%endm
> +
>  INIT_MMX
>  %if ARCH_X86_64 == 0
>  PUT_RECT mmx
> @@ -310,3 +350,4 @@ ADD_OBMC 16, sse2
>  
>  INIT_XMM sse4
>  PUT_RECT_10
> +DEQUANT_SUBBAND_32

No reason to make it a macro. It's a single function.

> diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
> index 4786eea..8541eb3 100644
> --- a/libavcodec/x86/diracdsp_init.c
> +++ b/libavcodec/x86/diracdsp_init.c
> @@ -45,9 +45,10 @@ void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, i
>  void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
>  void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
>  void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
> -
>  void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
>  
> +void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int64_t tot_v, int64_t tot_h);
> +
>  #if HAVE_YASM
>  
>  #define HPEL_FILTER(MMSIZE, EXT)                                                             \
> @@ -188,6 +189,7 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
>      }
>  
>      if (EXTERNAL_SSE4(mm_flags)) {
> +        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
>          c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
>      }
>  }
>