[FFmpeg-devel] Mixed data type in SIMD code?
Loren Merritt
lorenm
Wed Mar 5 03:36:42 CET 2008
On Tue, 4 Mar 2008, Michael Niedermayer wrote:
> On Mon, Mar 03, 2008 at 04:30:08PM -0700, Loren Merritt wrote:
>> On Mon, 3 Mar 2008, Michael Niedermayer wrote:
>>>
>>> Also i doubt we use or ever will use packed double.
>>
>> flac encoder does. Single precision isn't enough for a linear sum of
>> up to 16k elements. Reordering the sum to a tree gave it half-way
>> decent precision, but also made it as slow as double.
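For reference, a minimal scalar sketch of that tree reordering
(illustrative only, not the code that was benchmarked):

/* pairwise ("tree") summation: rounding error grows roughly with
 * log2(n) rather than n for a plain linear float sum */
static float tree_sum(const float *x, int n)
{
    float s = 0.0f;
    int i;
    if (n > 4)
        return tree_sum(x, n/2) + tree_sum(x + n/2, n - n/2);
    for (i = 0; i < n; i++)
        s += x[i];
    return s;
}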
>
> What about something like:
>
> for(i=0; i<16000;){
>     float sum=0;
>     do{
>         sum+= whatever[i++];
>     }while(i&127);
>     double_sum += sum;
> }
done.
core2:
2039632 dezicycles in autocorr_double_c, 65536 runs, 0 skips
771026 dezicycles in autocorr_double_sse2, 65536 runs, 0 skips
524713 dezicycles in autocorr_float_sse1, 65536 runs, 0 skips
500609 dezicycles in autocorr_float_sse2, 65534 runs, 2 skips
432458 dezicycles in autocorr_float_ssse3, 65535 runs, 1 skips
overall: 4.8%
k8:
1776170 dezicycles in autocorr_double_c, 65534 runs, 2 skips
1062022 dezicycles in autocorr_double_sse2, 65535 runs, 1 skips
932452 dezicycles in autocorr_float_sse1, 65533 runs, 3 skips
911259 dezicycles in autocorr_float_sse2, 65534 runs, 2 skips
overall: 2.5%
Presumably a CPU without SSE2 would gain more.
Cost: some settings don't notice the reduced precision; others lose up to
0.09% bitrate. This doesn't vary much with the length of the
single-precision loop.
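For comparison with the asm in the attached patch, a plain-C sketch of the
blocked accumulation (block length 128 taken from the quoted sketch; the
SSE loops in the patch instead fold their packed float sums into doubles
every float_iterations inner iterations):

static double blocked_sum(const float *x, int n)
{
    double total = 0.0;
    int i = 0;
    while (i < n) {
        /* bound the rounding error by keeping each float partial
         * sum to at most 128 terms before folding it into a double */
        float partial = 0.0f;
        int end = i + 128 < n ? i + 128 : n;
        do {
            partial += x[i++];
        } while (i < end);
        total += partial;
    }
    return total;
}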
> Or maybe even using int32_t ?
While we only need about 16 msbs of the sum, the elements have about 42
bits of dynamic range, so that would require an extra pass to determine
the appropriate shift.
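For illustration, a hedged sketch of what that extra pass might look like
(the helper name, the int64_t element type and the headroom policy are
assumptions, not anything in the attached patch):

#include <stdint.h>

/* find how far the elements must be shifted right so that each one
 * fits in int32 with enough headroom left to accumulate n of them */
static int autocorr_shift(const int64_t *elem, int n, int headroom_bits)
{
    int64_t max = 0;
    int bits = 0, i;
    for (i = 0; i < n; i++) {
        int64_t a = elem[i] < 0 ? -elem[i] : elem[i];
        if (a > max)
            max = a;
    }
    while (max >> bits)
        bits++;                       /* bits needed per element */
    bits += headroom_bits;            /* room for summing n terms */
    return bits > 31 ? bits - 31 : 0; /* right shift to stay in int32 */
}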
--Loren Merritt
-------------- next part --------------
--- libavcodec/i386/dsputil_mmx.c (revision 12228)
+++ libavcodec/i386/dsputil_mmx.c (working copy)
@@ -68,8 +68,11 @@
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
+DECLARE_ALIGNED_16(const float, ff_ps_1[4]) = { 1.0, 1.0, 1.0, 1.0 };
+DECLARE_ALIGNED_16(const float, ff_ps_4[4]) = { 4.0, 4.0, 4.0, 4.0 };
+DECLARE_ALIGNED_16(const float, ff_ps_1234[4]) = { 1.0, 2.0, 3.0, 4.0 };
+
DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
-DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
#define JUMPALIGN() asm volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd) asm volatile ("pxor %%" #regd ", %%" #regd ::)
--- libavcodec/i386/dsputil_mmx.h (revision 12228)
+++ libavcodec/i386/dsputil_mmx.h (working copy)
@@ -53,8 +53,11 @@
extern const uint64_t ff_pb_A1;
extern const uint64_t ff_pb_FC;
+extern const float ff_ps_1[4];
+extern const float ff_ps_4[4];
+extern const float ff_ps_1234[4];
+
extern const double ff_pd_1[2];
-extern const double ff_pd_2[2];
/* in/out: mma=mma+mmb, mmb=mmb-mma */
#define SUMSUB_BA( a, b ) \
--- libavcodec/i386/dsputilenc_mmx.c (revision 12228)
+++ libavcodec/i386/dsputilenc_mmx.c (working copy)
@@ -1330,8 +1330,12 @@
/* FLAC specific */
+void ff_flac_compute_autocorr_sse(const int32_t *data, int len, int lag,
+ double *autoc);
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
double *autoc);
+void ff_flac_compute_autocorr_ssse3(const int32_t *data, int len, int lag,
+ double *autoc);
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
@@ -1390,6 +1394,11 @@
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
}
+ if(mm_flags & MM_SSE){
+ if (ENABLE_FLAC_ENCODER)
+ c->flac_compute_autocorr = ff_flac_compute_autocorr_sse;
+ }
+
if(mm_flags & MM_SSE2){
c->sum_abs_dctelem= sum_abs_dctelem_sse2;
c->hadamard8_diff[0]= hadamard8_diff16_sse2;
@@ -1407,6 +1416,8 @@
c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
c->hadamard8_diff[1]= hadamard8_diff_ssse3;
+ if (ENABLE_FLAC_ENCODER)
+ c->flac_compute_autocorr = ff_flac_compute_autocorr_ssse3;
}
#endif
--- libavcodec/flacenc.c (revision 12228)
+++ libavcodec/flacenc.c (working copy)
@@ -84,7 +84,7 @@
int32_t coefs[MAX_LPC_ORDER];
int shift;
RiceContext rc;
- int32_t samples[FLAC_MAX_BLOCKSIZE];
+ DECLARE_ALIGNED_16(int32_t, samples[FLAC_MAX_BLOCKSIZE]);
int32_t residual[FLAC_MAX_BLOCKSIZE+1];
} FlacSubframe;
--- libavcodec/dsputil.h (revision 12228)
+++ libavcodec/dsputil.h (working copy)
@@ -335,8 +335,7 @@
/* assume len is a multiple of 4, and arrays are 16-byte aligned */
void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
- /* no alignment needed */
- void (*flac_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
+ void (*flac_compute_autocorr)(const int32_t *data/*align 16*/, int len, int lag, double *autoc);
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
void (*vector_fmul)(float *dst, const float *src, int len);
void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
--- libavcodec/i386/flacdsp_mmx.c (revision 12228)
+++ libavcodec/i386/flacdsp_mmx.c (working copy)
@@ -21,118 +21,200 @@
#include "dsputil_mmx.h"
-static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
-{
- double c = 2.0 / (len-1.0);
- int n2 = len>>1;
- long i = -n2*sizeof(int32_t);
- long j = n2*sizeof(int32_t);
- asm volatile(
- "movsd %0, %%xmm7 \n\t"
- "movapd %1, %%xmm6 \n\t"
- "movapd %2, %%xmm5 \n\t"
- "movlhps %%xmm7, %%xmm7 \n\t"
- "subpd %%xmm5, %%xmm7 \n\t"
- "addsd %%xmm6, %%xmm7 \n\t"
- ::"m"(c), "m"(*ff_pd_1), "m"(*ff_pd_2)
- );
-#define WELCH(MOVPD, offset)\
- asm volatile(\
- "1: \n\t"\
- "movapd %%xmm7, %%xmm1 \n\t"\
- "mulpd %%xmm1, %%xmm1 \n\t"\
- "movapd %%xmm6, %%xmm0 \n\t"\
- "subpd %%xmm1, %%xmm0 \n\t"\
- "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
- "cvtpi2pd (%3,%0), %%xmm2 \n\t"\
- "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
- "mulpd %%xmm0, %%xmm2 \n\t"\
- "mulpd %%xmm1, %%xmm3 \n\t"\
- "movapd %%xmm2, (%2,%0,2) \n\t"\
- MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
- "subpd %%xmm5, %%xmm7 \n\t"\
- "sub $8, %1 \n\t"\
- "add $8, %0 \n\t"\
- "jl 1b \n\t"\
- :"+&r"(i), "+&r"(j)\
- :"r"(w_data+n2), "r"(data+n2)\
- );
- if(len&1)
- WELCH("movupd", -1)
- else
- WELCH("movapd", -2)
-#undef WELCH
-}
-
-void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
- double *autoc)
-{
- double tmp[len + lag + 2];
- double *data1 = tmp + lag;
- int j;
-
- if((long)data1 & 15)
- data1++;
-
- apply_welch_window_sse2(data, len, data1);
-
- for(j=0; j<lag; j++)
- data1[j-lag]= 0.0;
- data1[len] = 0.0;
-
- for(j=0; j<lag; j+=2){
- long i = -len*sizeof(double);
- if(j == lag-2) {
- asm volatile(
- "movsd %6, %%xmm0 \n\t"
- "movsd %6, %%xmm1 \n\t"
- "movsd %6, %%xmm2 \n\t"
- "1: \n\t"
- "movapd (%4,%0), %%xmm3 \n\t"
- "movupd -8(%5,%0), %%xmm4 \n\t"
- "movapd (%5,%0), %%xmm5 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm5 \n\t"
- "mulpd -16(%5,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm5, %%xmm0 \n\t"
- "addpd %%xmm3, %%xmm2 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "movhlps %%xmm2, %%xmm5 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "addsd %%xmm5, %%xmm2 \n\t"
- "movsd %%xmm0, %1 \n\t"
- "movsd %%xmm1, %2 \n\t"
- "movsd %%xmm2, %3 \n\t"
- :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2])
- :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
- );
- } else {
- asm volatile(
- "movsd %5, %%xmm0 \n\t"
- "movsd %5, %%xmm1 \n\t"
- "1: \n\t"
- "movapd (%3,%0), %%xmm3 \n\t"
- "movupd -8(%4,%0), %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd (%4,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm3, %%xmm0 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "movsd %%xmm0, %1 \n\t"
- "movsd %%xmm1, %2 \n\t"
- :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
- :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
- );
- }
- }
-}
+void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
+
+#define PI2PS_SSE2\
+ "pshufd $0x1b, %%xmm0, %%xmm1 \n\t"\
+ "cvtdq2ps (%3,%0), %%xmm2 \n\t"\
+ "cvtdq2ps (%3,%1), %%xmm3 \n\t"
+
+#define PI2PS_SSE1\
+ "cvtpi2ps (%3,%0), %%xmm2 \n\t"\
+ "cvtpi2ps 8(%3,%0), %%xmm4 \n\t"\
+ "movaps %%xmm0, %%xmm1 \n\t"\
+ "movlhps %%xmm4, %%xmm2 \n\t"\
+ "cvtpi2ps (%3,%1), %%xmm3 \n\t"\
+ "cvtpi2ps 8(%3,%1), %%xmm4 \n\t"\
+ "shufps $0x1b, %%xmm1, %%xmm1 \n\t"\
+ "movlhps %%xmm4, %%xmm3 \n\t"
+
+#define WELCH(CPU, PI2PS) \
+static av_noinline void apply_welch_window_##CPU(const int32_t *data, int len, float *w_data)\
+{\
+ float c = 2.0 / (len-1.0);\
+ int n2 = len>>1;\
+ long i = -n2*sizeof(int32_t);\
+ long j = (n2-4)*sizeof(int32_t);\
+ asm volatile(\
+ "movss %0, %%xmm7 \n\t"\
+ "movaps %1, %%xmm6 \n\t"\
+ "shufps $0, %%xmm7, %%xmm7 \n\t"\
+ "movaps %2, %%xmm5 \n\t"\
+ "subps %3, %%xmm7 \n\t"\
+ ::"m"(c), "m"(*ff_ps_1), "m"(*ff_ps_4), "m"(*ff_ps_1234)\
+ :"xmm5", "xmm6", "xmm7"\
+ );\
+ asm volatile(\
+ "1: \n\t"\
+ "movaps %%xmm7, %%xmm1 \n\t"\
+ "mulps %%xmm1, %%xmm1 \n\t"\
+ "movaps %%xmm6, %%xmm0 \n\t"\
+ "subps %%xmm1, %%xmm0 \n\t"\
+ PI2PS\
+ "mulps %%xmm0, %%xmm2 \n\t"\
+ "mulps %%xmm1, %%xmm3 \n\t"\
+ "movaps %%xmm2, (%2,%0) \n\t"\
+ "movaps %%xmm3, (%2,%1) \n\t"\
+ "subps %%xmm5, %%xmm7 \n\t"\
+ "sub $16, %1 \n\t"\
+ "add $16, %0 \n\t"\
+ "jl 1b \n\t"\
+ :"+&r"(i), "+&r"(j)\
+ :"r"(w_data+n2), "r"(data+n2)\
+ :"memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"\
+ );\
+}
+
+#define float_iterations 64 // how long to accumulate single-precision before upgrading
+
+#define OP2(op, s0,d0, s1,d1)\
+ #op" %%xmm"#s0", %%xmm"#d0" \n\t"\
+ #op" %%xmm"#s1", %%xmm"#d1" \n\t"
+
+#define CORR2_LOOP_SSE2(MULT,step)\
+ asm volatile(\
+ "movsd %5, %%xmm6 \n\t"\
+ "movapd %%xmm6, %%xmm7 \n\t"\
+ "2: \n\t"\
+ OP2(xorps, 4,4, 5,5)\
+ "1: \n\t"\
+ MULT\
+ OP2(addps, 1,4, 2,5)\
+ "add $"#step"*16, %0 \n\t"\
+ "sub $"#step", %1 \n\t"\
+ "jg 1b \n\t"\
+ OP2(movhlps, 4,1, 5,2)\
+ OP2(addps, 1,4, 2,5)\
+ OP2(cvtps2pd, 4,4, 5,5)\
+ OP2(addpd, 4,6, 5,7)\
+ "mov %6, %1 \n\t"\
+ "cmp $0, %0 \n\t"\
+ "jl 2b \n\t"\
+ OP2(movhlps, 6,0, 7,1)\
+ OP2(addsd, 6,0, 7,1)\
+ "movsd %%xmm0, %2 \n\t"\
+ "movsd %%xmm1, 8+%2 \n\t"\
+ :"+&r"(j), "+&r"(k), "=m"(autoc_buf[i])\
+ :"r"(data1+len), "r"(data1+len-i), "m"(*ff_pd_1), "i"(float_iterations)\
+ :"memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"\
+ );
+
+#define CORR2_LOOP_SSE1(MULT,step) {\
+ double s0=1.0, s1=1.0;\
+ DECLARE_ALIGNED_8( struct {float x[4];}, sumf );\
+ while(j<0) {\
+ asm volatile(\
+ OP2(xorps, 4,4, 5,5)\
+ "1: \n\t"\
+ MULT\
+ OP2(addps, 1,4, 2,5)\
+ "add $"#step"*16, %0 \n\t"\
+ "sub $"#step", %1 \n\t"\
+ "jg 1b \n\t"\
+ OP2(movhlps, 4,1, 5,2)\
+ OP2(addps, 1,4, 2,5)\
+ "movlps %%xmm4, %2 \n\t"\
+ "movlps %%xmm5, 8+%2 \n\t"\
+ :"+&r"(j), "+&r"(k), "=m"(sumf)\
+ :"r"(data1+len), "r"(data1+len-i)\
+ :"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"\
+ );\
+ k = float_iterations;\
+ s0 += (double)sumf.x[0] + (double)sumf.x[1];\
+ s1 += (double)sumf.x[2] + (double)sumf.x[3];\
+ }\
+ autoc_buf[i] = s0;\
+ autoc_buf[i+1] = s1;\
+}
+
+#define MULT_EVEN_SSSE3\
+ "movaps (%4,%0), %%xmm1 \n\t"\
+ "movaps %%xmm1, %%xmm2 \n\t"\
+ "movaps (%3,%0), %%xmm0 \n\t"\
+ "palignr $12, -16(%4,%0), %%xmm2 \n\t"\
+ OP2(mulps, 0,1, 0,2)
+
+#define MULT_ODD_SSSE3\
+ "movaps 8(%4,%0), %%xmm1 \n\t"\
+ "movaps -8(%4,%0), %%xmm3 \n\t"\
+ "movaps %%xmm1, %%xmm2 \n\t"\
+ "movaps (%3,%0), %%xmm0 \n\t"\
+ "palignr $8, %%xmm3, %%xmm1 \n\t"\
+ "palignr $4, %%xmm3, %%xmm2 \n\t"\
+ OP2(mulps, 0,1, 0,2)
+
+#define MULT_EVEN_SSE1\
+ "movaps (%4,%0), %%xmm1 \n\t"\
+ "movaps -16(%4,%0), %%xmm2 \n\t"\
+ "movaps (%3,%0), %%xmm0 \n\t"\
+ "movss %%xmm1, %%xmm2 \n\t"\
+ "shufps $0x93, %%xmm1, %%xmm2 \n\t"\
+ OP2(mulps, 0,1, 0,2)
+
+#define MULT_ODD_SSE1\
+ "movaps -8(%4,%0), %%xmm1 \n\t"\
+ "movaps 8(%4,%0), %%xmm3 \n\t"\
+ "movaps %%xmm1, %%xmm2 \n\t"\
+ "movaps (%3,%0), %%xmm0 \n\t"\
+ "movss %%xmm3, %%xmm2 \n\t"\
+ "shufps $0x4e, %%xmm3, %%xmm1 \n\t"\
+ "shufps $0x39, %%xmm2, %%xmm2 \n\t"\
+ OP2(mulps, 0,1, 0,2)
+
+#define MULT_LAST\
+ "movaps (%4,%0), %%xmm1 \n\t"\
+ "movaps 16(%4,%0), %%xmm2 \n\t"\
+ "mulps (%3,%0), %%xmm1 \n\t"\
+ "mulps 16(%3,%0), %%xmm2 \n\t"
+
+#define AUTOCORR(CPU, CORR2_LOOP, EVEN, ODD) \
+void ff_flac_compute_autocorr_##CPU(const int32_t *data, int len, int lag,\
+ double *autoc)\
+{\
+ double autoc_buf[lag + 2];\
+ float tmp[len + lag + 7];\
+ float *data1 = tmp + lag + 3;\
+ int i;\
+ if(len&3) {\
+ ff_flac_compute_autocorr(data, len, lag, autoc);\
+ return;\
+ }\
+ data1 -= ((long)data1&15)>>2;\
+ apply_welch_window_##CPU(data, len, data1);\
+ for(i=0; i<lag; i++)\
+ data1[i-lag]= 0.0;\
+ for(i=0; i<4; i++)\
+ data1[len+i]= 0.0;\
+ for(i=0; i<=lag; i+=2){\
+ long j = -len*sizeof(float);\
+ int k = (((len>>2)-1)&(float_iterations-1))+1;\
+ if(i&2)\
+ CORR2_LOOP(ODD, 1)\
+ else if(i==lag) {\
+ CORR2_LOOP(MULT_LAST, 2)\
+ autoc_buf[i] += autoc_buf[i+1] - 1.0;\
+ } else\
+ CORR2_LOOP(EVEN, 1)\
+ }\
+ memcpy(autoc, autoc_buf, (lag+1)*sizeof(double));\
+}
+
+WELCH(sse, PI2PS_SSE1)
+WELCH(sse2, PI2PS_SSE2)
+#define apply_welch_window_ssse3 apply_welch_window_sse2
+AUTOCORR(sse, CORR2_LOOP_SSE1, MULT_EVEN_SSE1, MULT_ODD_SSE1)
+AUTOCORR(sse2, CORR2_LOOP_SSE2, MULT_EVEN_SSE1, MULT_ODD_SSE1)
+#ifdef HAVE_SSSE3
+AUTOCORR(ssse3, CORR2_LOOP_SSE2, MULT_EVEN_SSSE3, MULT_ODD_SSSE3)
+#endif
+