[FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

Mon Jun 27 15:09:42 CEST 2016

On Mon, Jun 27, 2016 at 12:53:47PM +0100, Rostislav Pehlivanov wrote:
> On 24 June 2016 at 16:38, James Almer <jamrial at gmail.com> wrote:
> 
> > On 6/24/2016 8:43 AM, Rostislav Pehlivanov wrote:
> > > From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001
> > > From: Rostislav Pehlivanov <rpehlivanov at ob-encoder.com>
> > > Date: Thu, 23 Jun 2016 18:06:56 +0100
> > > Subject: [PATCH 2/2] diracdsp: add dequantization SIMD
> > >
> > > Currently unused, to be used in the following commits.
> > >
> > > Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
> > > ---
> > >  libavcodec/diracdsp.c          | 24 ++++++++++++++++++++++++
> > >  libavcodec/diracdsp.h          |  4 ++++
> > >  libavcodec/x86/diracdsp.asm    | 36 ++++++++++++++++++++++++++++++++++++
> > >  libavcodec/x86/diracdsp_init.c |  2 ++
> > >  4 files changed, 66 insertions(+)
> > >
> > > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
> > > index ab8d149..cd1209e 100644
> > > --- a/libavcodec/diracdsp.c
> > > +++ b/libavcodec/diracdsp.c
> > > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const
> > uint16_t *src, int stride,
> > >      }
> > >  }
> > >
> > > +#define DEQUANT_SUBBAND(PX)
> >                     \
> > > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst,
> > ptrdiff_t stride,     \
> > > +                                         const int qf, const int qs,
> > int tot_v, int tot_h) \
> > > +{
> >                     \
> > > +    int i, y;
> >                     \
> > > +    for (y = 0; y < tot_v; y++) {
> >                     \
> > > +        PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;
> >                     \
> > > +        for (i = 0; i < tot_h; i++) {
> >                     \
> > > +            c = *src_r++;
> >                     \
> > > +            sign = FFSIGN(c)*(!!c);
> >                     \
> > > +            c = (FFABS(c)*qf + qs) >> 2;
> >                    \
> > > +            *dst_r++ = c*sign;
> >                    \
> > > +        }
> >                     \
> > > +        src += tot_h << (sizeof(PX) >> 1);
> >                    \
> > > +        dst += stride;
> >                    \
> > > +    }
> >                     \
> > > +}
> > > +
> > > +DEQUANT_SUBBAND(int16_t)
> > > +DEQUANT_SUBBAND(int32_t)
> > > +
> > >  #define PIXFUNC(PFX, WIDTH)
> >  \
> > >      c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ##
> > _dirac_pixels ## WIDTH ## _c; \
> > >      c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ##
> > _dirac_pixels ## WIDTH ## _l2_c; \
> > > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
> > >      c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
> > >      c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
> > >
> > > +    c->dequant_subband[0] = c->dequant_subband[2] =
> > dequant_subband_int16_t_c;
> > > +    c->dequant_subband[1] = c->dequant_subband[3] =
> > dequant_subband_int32_t_c;
> > > +
> > >      PIXFUNC(put, 8);
> > >      PIXFUNC(put, 16);
> > >      PIXFUNC(put, 32);
> > > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
> > > index 25a872d..224828d 100644
> > > --- a/libavcodec/diracdsp.h
> > > +++ b/libavcodec/diracdsp.h
> > > @@ -22,6 +22,7 @@
> > >  #define AVCODEC_DIRACDSP_H
> > >
> > >  #include <stdint.h>
> > > +#include <stddef.h>
> > >
> > >  typedef void (*dirac_weight_func)(uint8_t *block, int stride, int
> > log2_denom, int weight, int h);
> > >  typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src,
> > int stride, int log2_denom, int weightd, int weights, int h);
> > > @@ -46,6 +47,9 @@ typedef struct {
> > >      void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t
> > *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int
> > idwt_stride, int width, int height/*mod 2*/);
> > >      void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int
> > stride, const uint8_t *obmc_weight, int yblen);
> > >
> > > +    /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
> > > +    void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t
> > stride, const int qf, const int qs, int tot_v, int tot_h);
> > > +
> > >      dirac_weight_func weight_dirac_pixels_tab[3];
> > >      dirac_biweight_func biweight_dirac_pixels_tab[3];
> > >  } DiracDSPContext;
> > > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> > > index a0d6788..a764706 100644
> > > --- a/libavcodec/x86/diracdsp.asm
> > > +++ b/libavcodec/x86/diracdsp.asm
> > > @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst,
> > dst_stride, src, src_stride, w
> > >
> > >      RET
> > >
> > > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
> > const int qf, const int qs, int tot_v, int tot_h)
> > > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v,
> > tot_h
> > > +
> > > +    movd   m2, qfd
> > > +    movd   m3, qsd
> > > +    SPLATD m2
> > > +    SPLATD m3
> > > +    mov    r7, dstq
> > > +    mov    r8, tot_hq
> >
> > Replace every r7 and r8 with r3 and r4, make the cglobal line 7, 7, 4
> > and the function will work on x86_32.
> >
> > > +
> > > +    .loop_v:
> > > +    mov    dstq,   r7
> > > +    mov    tot_hq, r8
> > > +
> > > +    .loop_h:
> > > +    movu   m0, [srcq]
> > > +
> > > +    pabsd  m1, m0
> > > +    pmulld m1, m2
> > > +    paddd  m1, m3
> > > +    psrld  m1,  2
> > > +    psignd m1, m0
> > > +
> > > +    movu   [dstq], m1
> > > +
> > > +    add    srcq, mmsize
> > > +    add    dstq, mmsize
> > > +    sub    tot_hq, 4
> > > +    jl     .loop_h
> >
> > Jump if greater. Also use tot_hd, or change the prototypes.
> >
> > > +
> > > +    add    r7, strideq
> > > +    sub    tot_vq, 1
> > > +    jl     .loop_v
> >
> > Ditto.
> >
> > > +
> > > +    RET
> > > +
> > >  %endif
> > > diff --git a/libavcodec/x86/diracdsp_init.c
> > b/libavcodec/x86/diracdsp_init.c
> > > index 7fa554e..a1bab9c 100644
> > > --- a/libavcodec/x86/diracdsp_init.c
> > > +++ b/libavcodec/x86/diracdsp_init.c
> > > @@ -48,6 +48,7 @@ void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int
> > dst_stride, const int16_t
> > >
> > >  #if ARCH_X86_64
> > >  void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride,
> > const uint8_t *src, int src_stride, int width, int height);
> > > +void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t
> > stride, const int qf, const int qs, int tot_v, int tot_h);
> > >  #endif
> > >
> > >  #if HAVE_YASM
> > > @@ -191,6 +192,7 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
> > >
> > >  #if ARCH_X86_64
> > >      if (EXTERNAL_SSE4(mm_flags)) {
> > > +        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
> > >          c->put_signed_rect_clamped[1] =
> > ff_put_signed_rect_clamped_10_sse4;
> > >      }
> > >  #endif
> > > -- 2.8.1.369.geae769a
> >
> > _______________________________________________
> > ffmpeg-devel mailing list
> > ffmpeg-devel at ffmpeg.org
> > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> 
> I've attached another patch which should work fine now.
> I did this after the put_signed_rect so it does require the first patch,
> but if this patch is okay I'll amend and tidy things before I push.
> For some reason changing dstq to be stored at r4 or r3 broke it and I've no
> idea why. Neither is used after loading m2 and m3. Should work on x86_32
> now, but I'm wondering why I can't save that register.

on x86_32:
YASM    libavcodec/x86/diracdsp.o
src/libavcodec/x86/diracdsp.asm:279: error: undefined symbol `r7' (first use)
src/libavcodec/x86/diracdsp.asm:279: error:  (Each undefined symbol is reported only once.)
make: *** [libavcodec/x86/diracdsp.o] Error 1

btw, you can easily test x86_32 on x86_64
with something like this:
./configure   --cc='ccache gcc' --arch=x86_32 --target-os=linux --extra-cflags=-m32 --extra-ldflags=-m32  --enable-cross-compile
should be all that's needed

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Those who are best at talking, realize last or never when they are wrong.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160627/4f392a14/attachment.sig>