[FFmpeg-devel] [PATCH 2/2] x86/idctdsp: port {put, add}_pixels_clamped to yasm

Thu Sep 25 00:08:37 CEST 2014

On 24/09/14 6:54 PM, Michael Niedermayer wrote:
> On Wed, Sep 24, 2014 at 05:44:17PM -0300, James Almer wrote:
>> Also add sse2 versions for both.
>> put_pixels_clamped port and sse2 version originally written by Timothy Gu.
>>
>> Signed-off-by: James Almer <jamrial at gmail.com>
>> ---
>>  libavcodec/x86/Makefile       |   3 +-
>>  libavcodec/x86/idctdsp.asm    | 103 ++++++++++++++++++++++++++++++++
>>  libavcodec/x86/idctdsp.h      |   4 ++
>>  libavcodec/x86/idctdsp_init.c |   7 ++-
>>  libavcodec/x86/idctdsp_mmx.c  | 133 ------------------------------------------
>>  5 files changed, 112 insertions(+), 138 deletions(-)
>>  delete mode 100644 libavcodec/x86/idctdsp_mmx.c
>>
>> diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
>> index 7bf0e82..9f34abd 100644
>> --- a/libavcodec/x86/Makefile
>> +++ b/libavcodec/x86/Makefile
>> @@ -66,8 +66,7 @@ OBJS-$(CONFIG_WEBP_DECODER)            += x86/vp8dsp_init.o
>>  # subsystems
>>  MMX-OBJS-$(CONFIG_DIRAC_DECODER)       += x86/dirac_dwt.o
>>  MMX-OBJS-$(CONFIG_FDCTDSP)             += x86/fdct.o
>> -MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/idctdsp_mmx.o             \
>> -                                          x86/simple_idct.o
>> +MMX-OBJS-$(CONFIG_IDCTDSP)             += x86/simple_idct.o
>>  
>>  # decoders/encoders
>>  MMX-OBJS-$(CONFIG_MPEG4_DECODER)       += x86/xvididct_mmx.o            \
>> diff --git a/libavcodec/x86/idctdsp.asm b/libavcodec/x86/idctdsp.asm
>> index 44a1a6e..b816e84 100644
>> --- a/libavcodec/x86/idctdsp.asm
>> +++ b/libavcodec/x86/idctdsp.asm
>> @@ -78,3 +78,106 @@ INIT_MMX mmx
>>  PUT_SIGNED_PIXELS_CLAMPED 0
>>  INIT_XMM sse2
>>  PUT_SIGNED_PIXELS_CLAMPED 3
>> +
>> +;--------------------------------------------------------------------------
>> +; void ff_put_pixels_clamped(const int16_t *block, uint8_t *pixels,
>> +;                            int line_size);
>> +;--------------------------------------------------------------------------
>> +; %1 = block offset
>> +%macro PUT_PIXELS_CLAMPED_HALF 1
>> +    mova     m0, [blockq+mmsize*0+%1]
>> +    mova     m1, [blockq+mmsize*2+%1]
>> +%if mmsize == 8
>> +    mova     m2, [blockq+mmsize*4+%1]
>> +    mova     m3, [blockq+mmsize*6+%1]
>> +%endif
>> +    packuswb m0, [blockq+mmsize*1+%1]
>> +    packuswb m1, [blockq+mmsize*3+%1]
>> +%if mmsize == 8
>> +    packuswb m2, [blockq+mmsize*5+%1]
>> +    packuswb m3, [blockq+mmsize*7+%1]
>> +    movq           [pixelsq], m0
>> +    movq    [lsizeq+pixelsq], m1
>> +    movq  [2*lsizeq+pixelsq], m2
>> +    movq   [lsize3q+pixelsq], m3
>> +%else
>> +    movq           [pixelsq], m0
>> +    movhps  [lsizeq+pixelsq], m0
>> +    movq  [2*lsizeq+pixelsq], m1
>> +    movhps [lsize3q+pixelsq], m1
>> +%endif
>> +%endmacro
>> +
>> +%macro PUT_PIXELS_CLAMPED 0
>> +cglobal put_pixels_clamped, 3, 4, 2, block, pixels, lsize, lsize3
>> +    lea lsize3q, [lsizeq*3]
>> +    PUT_PIXELS_CLAMPED_HALF 0
> 
> This doesn't match the prototype:
> line_size is 32-bit in the prototype, but the code treats it as 64-bit.
> This will crash if it's negative.

Would this also apply to put_signed_pixels_clamped above? That one has been 
in the tree as is for quite some time.

I'll change the prototypes then. It's better to make line_size a ptrdiff_t than
to add a movsxdifnidn to every function.