[FFmpeg-devel] Fix bug for POWER LE: libswscale/ppc/swscale_altivec.c

Fri Oct 17 17:26:58 CEST 2014

On Thu, Oct 16, 2014 at 10:48:14AM +0800, rongyan wrote:
> Hi,
>  I created a patch to fix the bug in file libswscale/ppc/swscale_altivec.c for POWER LE. The fixed functions include 'hScale_altivec_real()', 'yuv2planeX_16_altivec()', and 'yuv2planeX_8()'. The FATE test results can be found on http://fate.ffmpeg.org/ by searching for "ibmcrl"; they are also attached here to facilitate the review:


[...]
> +#if !HAVE_BIGENDIAN
> +#define yuv2planeX_8(d1, d2, l1, src, x, filter) do {     \
> +        vector signed int   i1  = vec_mule(filter, l1);         \
> +        vector signed int   i2  = vec_mulo(filter, l1);         \
> +        vector signed int   vf1 = vec_mergel(i2, i1);           \
> +        vector signed int   vf2 = vec_mergeh(i2, i1);           \
> +        d1 = vec_add(d1, vf1);                                  \
> +        d2 = vec_add(d2, vf2);                                  \
> +    } while (0)
> +#else
>  #define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {     \
>          vector signed short l2  = vec_ld(((x) << 1) + 16, src); \
>          vector signed short ls  = vec_perm(l1, l2, perm);       \
> @@ -44,11 +54,49 @@
>          d2 = vec_add(d2, vf2);                                  \
>          l1 = l2;                                                \
>      } while (0)
> +#endif
>  
>  static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
>                                    const int16_t **src, uint8_t *dest,
>                                    const uint8_t *dither, int offset, int x)
>  {
> +#if !HAVE_BIGENDIAN
> +    register int i, j;
> +    DECLARE_ALIGNED(16, int, val)[16];
> +    vector signed int vo1, vo2, vo3, vo4;
> +    vector unsigned short vs1, vs2;
> +    vector unsigned char vf;
> +    vector unsigned int altivec_vectorShiftInt19 =
> +        vec_add(vec_splat_u32(10), vec_splat_u32(9));
> +
> +    for (i = 0; i < 16; i++)
> +        val[i] = dither[(x + i + offset) & 7] << 12;
> +
> +    vo1 = vec_ld(0,  val);
> +    vo2 = vec_ld(16, val);
> +    vo3 = vec_ld(32, val);
> +    vo4 = vec_ld(48, val);
> +
> +    for (j = 0; j < filterSize; j++) {
> +        vector signed short l1, l2, vLumFilter = vec_vsx_ld(j << 1, filter);
> +        vLumFilter = vec_splat(vLumFilter, 0); // lumFilter[j] is loaded 8 times in vLumFilter
> +
> +        l1  = vec_vsx_ld(x << 1, src[j]);
> +        l2  = vec_vsx_ld(((x) << 1) + 16, src[j]);
> +
> +        yuv2planeX_8(vo1, vo2, l1, src[j], x, vLumFilter);
> +        yuv2planeX_8(vo3, vo4, l2, src[j], x + 8, vLumFilter);
> +    }
> +
> +    vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
> +    vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
> +    vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
> +    vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
> +    vs1 = vec_packsu(vo1, vo2);
> +    vs2 = vec_packsu(vo3, vo4);
> +    vf  = vec_packsu(vs1, vs2);
> +    vec_vsx_st(vf, 0, dest);
> +#else /* else of #if !HAVE_BIGENDIAN */

>      register int i, j;
>      DECLARE_ALIGNED(16, int, val)[16];
>      vector signed int vo1, vo2, vo3, vo4;

Code duplication: this is identical to the code in the #if branch.


> @@ -86,6 +134,7 @@ static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
>      vs2 = vec_packsu(vo3, vo4);
>      vf  = vec_packsu(vs1, vs2);
>      vec_st(vf, 0, dest);

This is identical as well, except for vec_st; the following avoids more
code duplication:

#if HAVE_VSX
#    define VEC_ST vec_vsx_st
#else
#    define VEC_ST vec_st
#endif

Similarly for the other functions; please don't duplicate code
unless there is a reason.

[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Into a blind darkness they enter who follow after the Ignorance,
they as if into a greater darkness enter who devote themselves
to the Knowledge alone. -- Isha Upanishad
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20141017/2c82a1bd/attachment.asc>