[FFmpeg-devel] [PATCH] NEON: VC1 no_rnd chroma MC

Thu Apr 16 22:39:00 CEST 2009

On Apr 16, 2009, at 4:29 PM, Måns Rullgård wrote:

> David Conrad <lessen42 at gmail.com> writes:
>
>> Hi,
>>
>> This extends the h264_chroma_mc8 macro so that it can also generate
>> the no_rnd variants needed for VC1.
>> This gives a 10-15% overall decode speedup, depending on the source.
>>
>> diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/ 
>> h264dsp_neon.S
>> index 44a1373..0f1c467 100644
>> --- a/libavcodec/arm/h264dsp_neon.S
>> +++ b/libavcodec/arm/h264dsp_neon.S
>> @@ -56,13 +56,16 @@
>>         .endm
>>
>> /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x,  
>> int y) */
>> -        .macro  h264_chroma_mc8 type
>> -function ff_\type\()_h264_chroma_mc8_neon, export=1
>> +        .macro  h264_chroma_mc8 type name=h264 vshrn=vrshrn.u16  
>> no_rnd=0
>> +function ff_\type\()_\name\()_chroma_mc8_neon, export=1
>>         push            {r4-r7, lr}
>>         ldrd            r4,  [sp, #20]
>> .ifc \type,avg
>>         mov             lr,  r0
>> .endif
>> +.if \no_rnd
>> +        vmov.u16        q15, #28
>> +.endif
>>         pld             [r1]
>>         pld             [r1, r2]
>>
>> @@ -100,10 +103,14 @@ function ff_\type\()_h264_chroma_mc8_neon,  
>> export=1
>>         vmlal.u8        q9,  d7,  d1
>>         vmlal.u8        q9,  d4,  d2
>>         vmlal.u8        q9,  d5,  d3
>> -        vrshrn.u16      d16, q8,  #6
>> +.if \no_rnd
>> +        vadd.u16        q8,  q8,  q15
>> +        vadd.u16        q9,  q9,  q15
>> +.endif
>
> This will stall waiting for q9.
>
>> +        \vshrn          d16, q8,  #6
>>         vld1.64         {d6, d7}, [r5], r4
>>         pld             [r1]
>> -        vrshrn.u16      d17, q9,  #6
>> +        \vshrn          d17, q9,  #6
>> .ifc \type,avg
>>         vld1.64         {d20}, [lr,:64], r2
>>         vld1.64         {d21}, [lr,:64], r2
>> @@ -135,8 +142,12 @@ function ff_\type\()_h264_chroma_mc8_neon,  
>> export=1
>>         vmull.u8        q9,  d6,  d0
>>         vmlal.u8        q9,  d4,  d1
>>         vld1.64         {d6}, [r5], r4
>> -        vrshrn.u16      d16, q8,  #6
>> -        vrshrn.u16      d17, q9,  #6
>> +.if \no_rnd
>> +        vadd.u16        q8,  q8,  q15
>> +        vadd.u16        q9,  q9,  q15
>> +.endif
>> +        \vshrn          d16, q8,  #6
>> +        \vshrn          d17, q9,  #6
>
> Ditto.
>
>> .ifc \type,avg
>>         vld1.64         {d20}, [lr,:64], r2
>>         vld1.64         {d21}, [lr,:64], r2
>> @@ -162,10 +173,14 @@ function ff_\type\()_h264_chroma_mc8_neon,  
>> export=1
>>         vld1.64         {d4, d5}, [r1], r2
>>         vmull.u8        q9,  d6,  d0
>>         vmlal.u8        q9,  d7,  d1
>> +.if \no_rnd
>> +        vadd.u16        q8,  q8,  q15
>> +        vadd.u16        q9,  q9,  q15
>> +.endif
>
> Ditto.
>
> Is there no way to move those adds down a bit?

The first one could be moved after a load and the third after a vext,
but anything more than that would require more drastic alterations to
the entire function (and probably a completely separate function, so as
not to hurt H.264), which IMO isn't worth it.