[Ffmpeg-devel] [PATCH] idct8 in Altivec for H.264 decoding

Mon Oct 9 11:06:07 CEST 2006

On 10/9/06, Guillaume POIRIER <poirierg at gmail.com> wrote:
> On 10/9/06, Michael Niedermayer <michaelni at gmx.at> wrote:
> > On Mon, Oct 09, 2006 at 12:05:30AM +0200, Guillaume POIRIER wrote:
> > > This patch also carries some macros that are useful in Altivec
> > > programming. They are taken from x264 project, and I have permission
> > > from the author to re-licence them in LGPL.
> >
> > could you send a separate patch for the TRANSPOSE move and these?
>
> Yes, please find them attached to this mail.
> I shall make an updated patch with my idct8 implementation when I have
> improved it.

Ahem....

-- 
With DADVSI (http://en.wikipedia.org/wiki/DADVSI), France finally has
a lead on the USA in selling out individuals' rights to corporations!
Vive la France!
-------------- next part --------------
Index: libavcodec/ppc/dsputil_altivec.h
===================================================================

--- libavcodec/ppc/dsputil_altivec.h	(révision 6597)
+++ libavcodec/ppc/dsputil_altivec.h	(copie de travail)
@@ -67,6 +67,57 @@
 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
 #endif
 
+
+/***********************************************************************
+ * Vector types
+ **********************************************************************/
+#define vec_u8_t  vector unsigned char
+#define vec_s8_t  vector signed char
+#define vec_u16_t vector unsigned short
+#define vec_s16_t vector signed short
+#define vec_u32_t vector unsigned int
+#define vec_s32_t vector signed int
+
+/***********************************************************************
+ * Null vector
+ **********************************************************************/
+#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
+
+#define zero_u8v  (vec_u8_t)  zerov
+#define zero_s8v  (vec_s8_t)  zerov
+#define zero_u16v (vec_u16_t) zerov
+#define zero_s16v (vec_s16_t) zerov
+#define zero_u32v (vec_u32_t) zerov
+#define zero_s32v (vec_s32_t) zerov
+
+/***********************************************************************
+* VEC_DIFF_H_8BYTE_ALIGNED
+***********************************************************************
+* p1, p2:    u8 *
+* i1, i2, n: int
+* d:         s16v
+*
+* Loads n bytes from p1 and p2, computes the difference of the high
+* elements into d, and increments p1 and p2 by i1 and i2.
+* Slightly faster when we know we are loading/diffing 8 bytes which
+* are 8-byte aligned. Avoids the need for two loads and two vec_lvsl()'s
+**********************************************************************/
+#define PREP_DIFF_8BYTEALIGNED \
+LOAD_ZERO;                     \
+vec_s16_t pix1v, pix2v;        \
+vec_u8_t permPix1, permPix2;   \
+permPix1 = vec_lvsl(0, pix1);  \
+permPix2 = vec_lvsl(0, pix2);  \
+
+#define VEC_DIFF_H_8BYTE_ALIGNED(p1,i1,p2,i2,n,d)    \
+pix1v = vec_perm(vec_ld(0,p1), zero_u8v, permPix1);  \
+pix2v = vec_perm(vec_ld(0, p2), zero_u8v, permPix2); \
+pix1v = vec_u8_to_s16( pix1v );                      \
+pix2v = vec_u8_to_s16( pix2v );                      \
+d = vec_sub( pix1v, pix2v);                          \
+p1 += i1;                                            \
+p2 += i2;
+
 #else /* HAVE_ALTIVEC */
 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 #error "I can't use ALTIVEC_USE_REFERENCE_C_CODE if I don't use HAVE_ALTIVEC"
-------------- next part --------------
Index: libavcodec/ppc/dsputil_altivec.h
===================================================================
--- libavcodec/ppc/dsputil_altivec.h	(révision 6597)
+++ libavcodec/ppc/dsputil_altivec.h	(copie de travail)
@@ -67,6 +67,40 @@
 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
 #endif
 
+// Transpose 8x8 matrix of 16-bit elements. Borrowed from mpegvideo_altivec.c
+#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
+do { \
+    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
+    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
+ \
+    A1 = vec_mergeh (a, e); \
+    B1 = vec_mergel (a, e); \
+    C1 = vec_mergeh (b, f); \
+    D1 = vec_mergel (b, f); \
+    E1 = vec_mergeh (c, g); \
+    F1 = vec_mergel (c, g); \
+    G1 = vec_mergeh (d, h); \
+    H1 = vec_mergel (d, h); \
+ \
+    A2 = vec_mergeh (A1, E1); \
+    B2 = vec_mergel (A1, E1); \
+    C2 = vec_mergeh (B1, F1); \
+    D2 = vec_mergel (B1, F1); \
+    E2 = vec_mergeh (C1, G1); \
+    F2 = vec_mergel (C1, G1); \
+    G2 = vec_mergeh (D1, H1); \
+    H2 = vec_mergel (D1, H1); \
+ \
+    a = vec_mergeh (A2, E2); \
+    b = vec_mergel (A2, E2); \
+    c = vec_mergeh (B2, F2); \
+    d = vec_mergel (B2, F2); \
+    e = vec_mergeh (C2, G2); \
+    f = vec_mergel (C2, G2); \
+    g = vec_mergeh (D2, H2); \
+    h = vec_mergel (D2, H2); \
+} while (0)
+
 #else /* HAVE_ALTIVEC */
 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 #error "I can't use ALTIVEC_USE_REFERENCE_C_CODE if I don't use HAVE_ALTIVEC"
Index: libavcodec/ppc/vc1dsp_altivec.c
===================================================================
--- libavcodec/ppc/vc1dsp_altivec.c	(révision 6597)
+++ libavcodec/ppc/vc1dsp_altivec.c	(copie de travail)
@@ -26,40 +26,6 @@
 
 #include "dsputil_altivec.h"
 
-// Transpose 8x8 matrix of 16-bit elements. Borrowed from mpegvideo_altivec.c
-#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
-do { \
-    vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \
-    vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \
- \
-    A1 = vec_mergeh (a, e); \
-    B1 = vec_mergel (a, e); \
-    C1 = vec_mergeh (b, f); \
-    D1 = vec_mergel (b, f); \
-    E1 = vec_mergeh (c, g); \
-    F1 = vec_mergel (c, g); \
-    G1 = vec_mergeh (d, h); \
-    H1 = vec_mergel (d, h); \
- \
-    A2 = vec_mergeh (A1, E1); \
-    B2 = vec_mergel (A1, E1); \
-    C2 = vec_mergeh (B1, F1); \
-    D2 = vec_mergel (B1, F1); \
-    E2 = vec_mergeh (C1, G1); \
-    F2 = vec_mergel (C1, G1); \
-    G2 = vec_mergeh (D1, H1); \
-    H2 = vec_mergel (D1, H1); \
- \
-    a = vec_mergeh (A2, E2); \
-    b = vec_mergel (A2, E2); \
-    c = vec_mergeh (B2, F2); \
-    d = vec_mergel (B2, F2); \
-    e = vec_mergeh (C2, G2); \
-    f = vec_mergel (C2, G2); \
-    g = vec_mergeh (D2, H2); \
-    h = vec_mergel (D2, H2); \
-} while (0)
-
 // main steps of 8x8 transform
 #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
 do { \
Index: libavcodec/ppc/mpegvideo_altivec.c
===================================================================
--- libavcodec/ppc/mpegvideo_altivec.c	(révision 6597)
+++ libavcodec/ppc/mpegvideo_altivec.c	(copie de travail)
@@ -52,40 +52,7 @@
   d = vec_mergel(_trans_acl, _trans_bdl); \
 } while (0)
 
-#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
-do { \
-    __typeof__(a)  _A1, _B1, _C1, _D1, _E1, _F1, _G1, _H1; \
-    __typeof__(a)  _A2, _B2, _C2, _D2, _E2, _F2, _G2, _H2; \
- \
-    _A1 = vec_mergeh (a, e); \
-    _B1 = vec_mergel (a, e); \
-    _C1 = vec_mergeh (b, f); \
-    _D1 = vec_mergel (b, f); \
-    _E1 = vec_mergeh (c, g); \
-    _F1 = vec_mergel (c, g); \
-    _G1 = vec_mergeh (d, h); \
-    _H1 = vec_mergel (d, h); \
- \
-    _A2 = vec_mergeh (_A1, _E1); \
-    _B2 = vec_mergel (_A1, _E1); \
-    _C2 = vec_mergeh (_B1, _F1); \
-    _D2 = vec_mergel (_B1, _F1); \
-    _E2 = vec_mergeh (_C1, _G1); \
-    _F2 = vec_mergel (_C1, _G1); \
-    _G2 = vec_mergeh (_D1, _H1); \
-    _H2 = vec_mergel (_D1, _H1); \
- \
-    a = vec_mergeh (_A2, _E2); \
-    b = vec_mergel (_A2, _E2); \
-    c = vec_mergeh (_B2, _F2); \
-    d = vec_mergel (_B2, _F2); \
-    e = vec_mergeh (_C2, _G2); \
-    f = vec_mergel (_C2, _G2); \
-    g = vec_mergeh (_D2, _H2); \
-    h = vec_mergel (_D2, _H2); \
-} while (0)
 
-
 // Loads a four-byte value (int or float) from the target address
 // into every element in the target vector.  Only works if the
 // target address is four-byte aligned (which should be always).