[FFmpeg-devel] [PATCH] Dsputilize some functions from APE decode 1/2 - Altivec implementation

Tue Jul 8 23:18:12 CEST 2008

Entirely untested (I don't have a PPC machine), but this looks like it should 
be faster. Your other functions would benefit from a similar change.
For that matter, a whole lot of dsp functions compute lvsl inside the loop 
when its result is loop-invariant and could be hoisted (assuming stride%16==0).

--Loren Merritt
-------------- next part --------------
Index: ppc/int_altivec.c
===================================================================

--- libavcodec/ppc/int_altivec.c	(revision 14094)
+++ libavcodec/ppc/int_altivec.c	(working copy)
@@ -79,14 +79,22 @@
 static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
 {
     int i;
-    register vec_s16_t vec, *pv;
+    vec_s16_t *pv1 = (vec_s16_t*)v1;
+    vec_s16_t *pv2 = (vec_s16_t*)v2;
+    /* The lvsl mask depends only on v2's offset within a 16-byte block,
+     * which does not change as we step by whole vectors: compute it once. */
+    register vec_u8_t perm = vec_lvsl(0, v2);
+    register vec_s16_t t0 = pv2[0], t1;
 
-    for(i = 0; i < order; i += 8){
-        pv = (vec_s16_t*)v2;
-        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
-        vec_st(vec_add(vec_ld(0, v1), vec), 0, v1);
-        v1 += 8;
-        v2 += 8;
+    /* Two vectors (16 elements) per iteration; t0 and t1 alternate so each
+     * vector loaded from v2 is reused by the following permute. */
+    for(i = 0; i < order; i += 16){
+        t1 = pv2[1];
+        pv1[0] = vec_add(pv1[0], vec_perm(t0, t1, perm));
+        t0 = pv2[2];
+        pv1[1] = vec_add(pv1[1], vec_perm(t1, t0, perm));
+        pv1 += 2;
+        pv2 += 2;
     }
 }
 
Index: dsputil.h
===================================================================
--- libavcodec/dsputil.h	(revision 14094)
+++ libavcodec/dsputil.h	(working copy)
@@ -454,7 +454,7 @@
     /* ape functions */
     /**
      * Add contents of the second vector to the first one.
-     * @param len length of vectors, should be multiple of 8
+     * @param len length of vectors, should be multiple of 16
      */
     void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
     /**