[Ffmpeg-devel] [PATCH] idct8 in Altivec for H.264 decoding
Guillaume POIRIER
poirierg
Mon Oct 9 11:06:07 CEST 2006
On 10/9/06, Guillaume POIRIER <poirierg at gmail.com> wrote:
> On 10/9/06, Michael Niedermayer <michaelni at gmx.at> wrote:
> > On Mon, Oct 09, 2006 at 12:05:30AM +0200, Guillaume POIRIER wrote:
> > > This patch also carries some macros that are useful in Altivec
> > > programming. They are taken from the x264 project, and I have permission
> > > from the author to relicense them under the LGPL.
> >
> > could you send a separate patch for the TRANSPOSE move and these?
>
> Yes, please find them attached to this mail.
> I shall make an updated patch with my idct8 implementation when I have
> improved it.
Ahem....
--
With DADVSI (http://en.wikipedia.org/wiki/DADVSI), France finally has
a lead on the USA in selling out individuals' rights to corporations!
Vive la France!
-------------- next part --------------
Index: libavcodec/ppc/dsputil_altivec.h
===================================================================
--- libavcodec/ppc/dsputil_altivec.h (révision 6597)
+++ libavcodec/ppc/dsputil_altivec.h (copie de travail)
@@ -67,6 +67,57 @@
#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
#endif
+
+/***********************************************************************
+ * Vector types
+ **********************************************************************/
+#define vec_u8_t vector unsigned char
+#define vec_s8_t vector signed char
+#define vec_u16_t vector unsigned short
+#define vec_s16_t vector signed short
+#define vec_u32_t vector unsigned int
+#define vec_s32_t vector signed int
+
+/***********************************************************************
+ * Null vector
+ **********************************************************************/
+#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
+
+#define zero_u8v (vec_u8_t) zerov
+#define zero_s8v (vec_s8_t) zerov
+#define zero_u16v (vec_u16_t) zerov
+#define zero_s16v (vec_s16_t) zerov
+#define zero_u32v (vec_u32_t) zerov
+#define zero_s32v (vec_s32_t) zerov
+
+/***********************************************************************
+* VEC_DIFF_H_8BYTE_ALIGNED
+***********************************************************************
+* p1, p2: u8 *
+* i1, i2, n: int
+* d: s16v
+*
+* Loads n bytes from p1 and p2, computes the difference of the high
+* elements into d, then increments p1 and p2 by i1 and i2.
+* Slightly faster when we know we are loading/diffing 8bytes which
+* are 8 byte aligned. Reduces need for two loads and two vec_lvsl()'s
+**********************************************************************/
+#define PREP_DIFF_8BYTEALIGNED \
+LOAD_ZERO; \
+vec_s16_t pix1v, pix2v; \
+vec_u8_t permPix1, permPix2; \
+permPix1 = vec_lvsl(0, pix1); \
+permPix2 = vec_lvsl(0, pix2); \
+
+#define VEC_DIFF_H_8BYTE_ALIGNED(p1,i1,p2,i2,n,d) \
+pix1v = vec_perm(vec_ld(0,p1), zero_u8v, permPix1); \
+pix2v = vec_perm(vec_ld(0, p2), zero_u8v, permPix2); \
+pix1v = vec_u8_to_s16( pix1v ); \
+pix2v = vec_u8_to_s16( pix2v ); \
+d = vec_sub( pix1v, pix2v); \
+p1 += i1; \
+p2 += i2;
+
#else /* HAVE_ALTIVEC */
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
#error "I can't use ALTIVEC_USE_REFERENCE_C_CODE if I don't use HAVE_ALTIVEC"
-------------- next part --------------
Index: libavcodec/ppc/dsputil_altivec.h
===================================================================
--- libavcodec/ppc/dsputil_altivec.h (révision 6597)
+++ libavcodec/ppc/dsputil_altivec.h (copie de travail)
@@ -67,6 +67,40 @@
#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
#endif
+// Transpose 8x8 matrix of 16-bit elements. Borrowed from mpegvideo_altivec.c
+#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
+do { \
+ vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
+ vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
+ \
+ A1 = vec_mergeh (a, e); \
+ B1 = vec_mergel (a, e); \
+ C1 = vec_mergeh (b, f); \
+ D1 = vec_mergel (b, f); \
+ E1 = vec_mergeh (c, g); \
+ F1 = vec_mergel (c, g); \
+ G1 = vec_mergeh (d, h); \
+ H1 = vec_mergel (d, h); \
+ \
+ A2 = vec_mergeh (A1, E1); \
+ B2 = vec_mergel (A1, E1); \
+ C2 = vec_mergeh (B1, F1); \
+ D2 = vec_mergel (B1, F1); \
+ E2 = vec_mergeh (C1, G1); \
+ F2 = vec_mergel (C1, G1); \
+ G2 = vec_mergeh (D1, H1); \
+ H2 = vec_mergel (D1, H1); \
+ \
+ a = vec_mergeh (A2, E2); \
+ b = vec_mergel (A2, E2); \
+ c = vec_mergeh (B2, F2); \
+ d = vec_mergel (B2, F2); \
+ e = vec_mergeh (C2, G2); \
+ f = vec_mergel (C2, G2); \
+ g = vec_mergeh (D2, H2); \
+ h = vec_mergel (D2, H2); \
+} while (0)
+
#else /* HAVE_ALTIVEC */
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
#error "I can't use ALTIVEC_USE_REFERENCE_C_CODE if I don't use HAVE_ALTIVEC"
Index: libavcodec/ppc/vc1dsp_altivec.c
===================================================================
--- libavcodec/ppc/vc1dsp_altivec.c (révision 6597)
+++ libavcodec/ppc/vc1dsp_altivec.c (copie de travail)
@@ -26,40 +26,6 @@
#include "dsputil_altivec.h"
-// Transpose 8x8 matrix of 16-bit elements. Borrowed from mpegvideo_altivec.c
-#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
-do { \
- vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
- vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
- \
- A1 = vec_mergeh (a, e); \
- B1 = vec_mergel (a, e); \
- C1 = vec_mergeh (b, f); \
- D1 = vec_mergel (b, f); \
- E1 = vec_mergeh (c, g); \
- F1 = vec_mergel (c, g); \
- G1 = vec_mergeh (d, h); \
- H1 = vec_mergel (d, h); \
- \
- A2 = vec_mergeh (A1, E1); \
- B2 = vec_mergel (A1, E1); \
- C2 = vec_mergeh (B1, F1); \
- D2 = vec_mergel (B1, F1); \
- E2 = vec_mergeh (C1, G1); \
- F2 = vec_mergel (C1, G1); \
- G2 = vec_mergeh (D1, H1); \
- H2 = vec_mergel (D1, H1); \
- \
- a = vec_mergeh (A2, E2); \
- b = vec_mergel (A2, E2); \
- c = vec_mergeh (B2, F2); \
- d = vec_mergel (B2, F2); \
- e = vec_mergeh (C2, G2); \
- f = vec_mergel (C2, G2); \
- g = vec_mergeh (D2, H2); \
- h = vec_mergel (D2, H2); \
-} while (0)
-
// main steps of 8x8 transform
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
do { \
Index: libavcodec/ppc/mpegvideo_altivec.c
===================================================================
--- libavcodec/ppc/mpegvideo_altivec.c (révision 6597)
+++ libavcodec/ppc/mpegvideo_altivec.c (copie de travail)
@@ -52,40 +52,7 @@
d = vec_mergel(_trans_acl, _trans_bdl); \
} while (0)
-#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
-do { \
- __typeof__(a) _A1, _B1, _C1, _D1, _E1, _F1, _G1, _H1; \
- __typeof__(a) _A2, _B2, _C2, _D2, _E2, _F2, _G2, _H2; \
- \
- _A1 = vec_mergeh (a, e); \
- _B1 = vec_mergel (a, e); \
- _C1 = vec_mergeh (b, f); \
- _D1 = vec_mergel (b, f); \
- _E1 = vec_mergeh (c, g); \
- _F1 = vec_mergel (c, g); \
- _G1 = vec_mergeh (d, h); \
- _H1 = vec_mergel (d, h); \
- \
- _A2 = vec_mergeh (_A1, _E1); \
- _B2 = vec_mergel (_A1, _E1); \
- _C2 = vec_mergeh (_B1, _F1); \
- _D2 = vec_mergel (_B1, _F1); \
- _E2 = vec_mergeh (_C1, _G1); \
- _F2 = vec_mergel (_C1, _G1); \
- _G2 = vec_mergeh (_D1, _H1); \
- _H2 = vec_mergel (_D1, _H1); \
- \
- a = vec_mergeh (_A2, _E2); \
- b = vec_mergel (_A2, _E2); \
- c = vec_mergeh (_B2, _F2); \
- d = vec_mergel (_B2, _F2); \
- e = vec_mergeh (_C2, _G2); \
- f = vec_mergel (_C2, _G2); \
- g = vec_mergeh (_D2, _H2); \
- h = vec_mergel (_D2, _H2); \
-} while (0)
-
// Loads a four-byte value (int or float) from the target address
// into every element in the target vector. Only works if the
// target address is four-byte aligned (which should be always).
More information about the ffmpeg-devel
mailing list