[FFmpeg-devel] Mixed data type in SIMD code?
Loren Merritt
lorenm
Wed Mar 5 03:36:42 CET 2008
On Tue, 4 Mar 2008, Michael Niedermayer wrote:
> On Mon, Mar 03, 2008 at 04:30:08PM -0700, Loren Merritt wrote:
>> On Mon, 3 Mar 2008, Michael Niedermayer wrote:
>>>
>>> Also i doubt we use or ever will use packed double.
>>
>> flac encoder does. Single precision isn't enough for a linear sum of
>> up to 16k elements. Reordering the sum to a tree gave it half-way
>> decent precision, but also made it as slow as double.
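For reference, a minimal scalar sketch of that tree reordering
(illustrative only, not the code that was benchmarked):

/* pairwise ("tree") summation: rounding error grows roughly with
 * log2(n) rather than n for a plain linear float sum */
static float tree_sum(const float *x, int n)
{
    float s = 0.0f;
    int i;
    if (n > 4)
        return tree_sum(x, n/2) + tree_sum(x + n/2, n - n/2);
    for (i = 0; i < n; i++)
        s += x[i];
    return s;
}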
>
> What about something like:
>
> for(i=0; i<16000;){
>     float sum=0;
>     do{
>         sum+= whatever[i++];
>     }while(i&127);
>     double_sum += sum;
> }
done.
core2:
2039632 dezicycles in autocorr_double_c, 65536 runs, 0 skips
771026 dezicycles in autocorr_double_sse2, 65536 runs, 0 skips
524713 dezicycles in autocorr_float_sse1, 65536 runs, 0 skips
500609 dezicycles in autocorr_float_sse2, 65534 runs, 2 skips
432458 dezicycles in autocorr_float_ssse3, 65535 runs, 1 skips
overall: 4.8%
k8:
1776170 dezicycles in autocorr_double_c, 65534 runs, 2 skips
1062022 dezicycles in autocorr_double_sse2, 65535 runs, 1 skips
932452 dezicycles in autocorr_float_sse1, 65533 runs, 3 skips
911259 dezicycles in autocorr_float_sse2, 65534 runs, 2 skips
overall: 2.5%
Presumably a CPU without SSE2 would gain more.
Cost: some settings don't notice the reduced precision; others lose up to
0.09% bitrate. This doesn't vary much with the length of the
single-precision loop.
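For comparison with the asm in the attached patch, a plain-C sketch of the
blocked accumulation (block length 128 taken from the quoted sketch; the
SSE loops in the patch instead fold their packed float sums into doubles
every float_iterations inner iterations):

static double blocked_sum(const float *x, int n)
{
    double total = 0.0;
    int i = 0;
    while (i < n) {
        /* bound the rounding error by keeping each float partial
         * sum to at most 128 terms before folding it into a double */
        float partial = 0.0f;
        int end = i + 128 < n ? i + 128 : n;
        do {
            partial += x[i++];
        } while (i < end);
        total += partial;
    }
    return total;
}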
> Or maybe even using int32_t ?
While we only need about 16 msbs of the sum, the elements have about 42
bits of dynamic range, so that would require an extra pass to determine
the appropriate shift.
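For illustration, a hedged sketch of what that extra pass might look like
(the helper name, the int64_t element type and the headroom policy are
assumptions, not anything in the attached patch):

#include <stdint.h>

/* find how far the elements must be shifted right so that each one
 * fits in int32 with enough headroom left to accumulate n of them */
static int autocorr_shift(const int64_t *elem, int n, int headroom_bits)
{
    int64_t max = 0;
    int bits = 0, i;
    for (i = 0; i < n; i++) {
        int64_t a = elem[i] < 0 ? -elem[i] : elem[i];
        if (a > max)
            max = a;
    }
    while (max >> bits)
        bits++;                       /* bits needed per element */
    bits += headroom_bits;            /* room for summing n terms */
    return bits > 31 ? bits - 31 : 0; /* right shift to stay in int32 */
}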
--Loren Merritt
-------------- next part --------------
--- libavcodec/i386/dsputil_mmx.c (revision 12228)
+++ libavcodec/i386/dsputil_mmx.c (working copy)
@@ -68,8 +68,11 @@
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
+DECLARE_ALIGNED_16(const float, ff_ps_1[4]) = { 1.0, 1.0, 1.0, 1.0 };
+DECLARE_ALIGNED_16(const float, ff_ps_4[4]) = { 4.0, 4.0, 4.0, 4.0 };
+DECLARE_ALIGNED_16(const float, ff_ps_1234[4]) = { 1.0, 2.0, 3.0, 4.0 };
+
DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
-DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
#define JUMPALIGN() asm volatile (ASMALIGN(3)::)
#define MOVQ_ZERO(regd) asm volatile ("pxor %%" #regd ", %%" #regd ::)
--- libavcodec/i386/dsputil_mmx.h (revision 12228)
+++ libavcodec/i386/dsputil_mmx.h (working copy)
@@ -53,8 +53,11 @@
extern const uint64_t ff_pb_A1;
extern const uint64_t ff_pb_FC;
+extern const float ff_ps_1[4];
+extern const float ff_ps_4[4];
+extern const float ff_ps_1234[4];
+
extern const double ff_pd_1[2];
-extern const double ff_pd_2[2];
/* in/out: mma=mma+mmb, mmb=mmb-mma */
#define SUMSUB_BA( a, b ) \
--- libavcodec/i386/dsputilenc_mmx.c (revision 12228)
+++ libavcodec/i386/dsputilenc_mmx.c (working copy)
@@ -1330,8 +1330,12 @@
/* FLAC specific */
+void ff_flac_compute_autocorr_sse(const int32_t *data, int len, int lag,
+ double *autoc);
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
double *autoc);
+void ff_flac_compute_autocorr_ssse3(const int32_t *data, int len, int lag,
+ double *autoc);
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
@@ -1390,6 +1394,11 @@
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
}
+ if(mm_flags & MM_SSE){
+ if (ENABLE_FLAC_ENCODER)
+ c->flac_compute_autocorr = ff_flac_compute_autocorr_sse;
+ }
+
if(mm_flags & MM_SSE2){
c->sum_abs_dctelem= sum_abs_dctelem_sse2;
c->hadamard8_diff[0]= hadamard8_diff16_sse2;
@@ -1407,6 +1416,8 @@
c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
c->hadamard8_diff[1]= hadamard8_diff_ssse3;
+ if (ENABLE_FLAC_ENCODER)
+ c->flac_compute_autocorr = ff_flac_compute_autocorr_ssse3;
}
#endif
--- libavcodec/flacenc.c (revision 12228)
+++ libavcodec/flacenc.c (working copy)
@@ -84,7 +84,7 @@
int32_t coefs[MAX_LPC_ORDER];
int shift;
RiceContext rc;
- int32_t samples[FLAC_MAX_BLOCKSIZE];
+ DECLARE_ALIGNED_16(int32_t, samples[FLAC_MAX_BLOCKSIZE]);
int32_t residual[FLAC_MAX_BLOCKSIZE+1];
} FlacSubframe;
--- libavcodec/dsputil.h (revision 12228)
+++ libavcodec/dsputil.h (working copy)
@@ -335,8 +335,7 @@
/* assume len is a multiple of 4, and arrays are 16-byte aligned */
void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
- /* no alignment needed */
- void (*flac_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
+ void (*flac_compute_autocorr)(const int32_t *data/*align 16*/, int len, int lag, double *autoc);
/* assume len is a multiple of 8, and arrays are 16-byte aligned */
void (*vector_fmul)(float *dst, const float *src, int len);
void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
--- libavcodec/i386/flacdsp_mmx.c (revision 12228)
+++ libavcodec/i386/flacdsp_mmx.c (working copy)
@@ -21,118 +21,200 @@
#include "dsputil_mmx.h"
-static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
-{
- double c = 2.0 / (len-1.0);
- int n2 = len>>1;
- long i = -n2*sizeof(int32_t);
- long j = n2*sizeof(int32_t);
- asm volatile(
- "movsd %0, %%xmm7 \n\t"
- "movapd %1, %%xmm6 \n\t"
- "movapd %2, %%xmm5 \n\t"
- "movlhps %%xmm7, %%xmm7 \n\t"
- "subpd %%xmm5, %%xmm7 \n\t"
- "addsd %%xmm6, %%xmm7 \n\t"
- ::"m"(c), "m"(*ff_pd_1), "m"(*ff_pd_2)
- );
-#define WELCH(MOVPD, offset)\
- asm volatile(\
- "1: \n\t"\
- "movapd %%xmm7, %%xmm1 \n\t"\
- "mulpd %%xmm1, %%xmm1 \n\t"\
- "movapd %%xmm6, %%xmm0 \n\t"\
- "subpd %%xmm1, %%xmm0 \n\t"\
- "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
- "cvtpi2pd (%3,%0), %%xmm2 \n\t"\
- "cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
- "mulpd %%xmm0, %%xmm2 \n\t"\
- "mulpd %%xmm1, %%xmm3 \n\t"\
- "movapd %%xmm2, (%2,%0,2) \n\t"\
- MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
- "subpd %%xmm5, %%xmm7 \n\t"\
- "sub $8, %1 \n\t"\
- "add $8, %0 \n\t"\
- "jl 1b \n\t"\
- :"+&r"(i), "+&r"(j)\
- :"r"(w_data+n2), "r"(data+n2)\
- );
- if(len&1)
- WELCH("movupd", -1)
- else
- WELCH("movapd", -2)
-#undef WELCH
-}
-
-void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
- double *autoc)
-{
- double tmp[len + lag + 2];
- double *data1 = tmp + lag;
- int j;
-
- if((long)data1 & 15)
- data1++;
-
- apply_welch_window_sse2(data, len, data1);
-
- for(j=0; j<lag; j++)
- data1[j-lag]= 0.0;
- data1[len] = 0.0;
-
- for(j=0; j<lag; j+=2){
- long i = -len*sizeof(double);
- if(j == lag-2) {
- asm volatile(
- "movsd %6, %%xmm0 \n\t"
- "movsd %6, %%xmm1 \n\t"
- "movsd %6, %%xmm2 \n\t"
- "1: \n\t"
- "movapd (%4,%0), %%xmm3 \n\t"
- "movupd -8(%5,%0), %%xmm4 \n\t"
- "movapd (%5,%0), %%xmm5 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm5 \n\t"
- "mulpd -16(%5,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm5, %%xmm0 \n\t"
- "addpd %%xmm3, %%xmm2 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "movhlps %%xmm2, %%xmm5 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "addsd %%xmm5, %%xmm2 \n\t"
- "movsd %%xmm0, %1 \n\t"
- "movsd %%xmm1, %2 \n\t"
- "movsd %%xmm2, %3 \n\t"
- :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2])
- :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
- );
- } else {
- asm volatile(
- "movsd %5, %%xmm0 \n\t"
- "movsd %5, %%xmm1 \n\t"
- "1: \n\t"
- "movapd (%3,%0), %%xmm3 \n\t"
- "movupd -8(%4,%0), %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd (%4,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm3, %%xmm0 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "movsd %%xmm0, %1 \n\t"
- "movsd %%xmm1, %2 \n\t"
- :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
- :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
- );
- }
- }
-}
+void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
+
+#define PI2PS_SSE2\
+ "pshufd $0x1b, %%xmm0, %%xmm1 \n\t"\
+ "cvtdq2ps (%3,%0), %%xmm2 \n\t"\
+ "cvtdq2ps (%3,%1), %%xmm3 \n\t"
+
+#define PI2PS_SSE1\
+ "cvtpi2ps (%3,%0), %%xmm2 \n\t"\
+ "cvtpi2ps 8(%3,%0), %%xmm4 \n\t"\
+ "movaps %%xmm0, %%xmm1 \n\t"\
+ "movlhps %%xmm4, %%xmm2 \n\t"\
+ "cvtpi2ps (%3,%1), %%xmm3 \n\t"\
+ "cvtpi2ps 8(%3,%1), %%xmm4 \n\t"\
+ "shufps $0x1b, %%xmm1, %%xmm1 \n\t"\
+ "movlhps %%xmm4, %%xmm3 \n\t"
+
+#define WELCH(CPU, PI2PS) \
+static av_noinline void apply_welch_window_##CPU(const int32_t *data, int len, float *w_data)\
+{\
+ float c = 2.0 / (len-1.0);\
+ int n2 = len>>1;\
+ long i = -n2*sizeof(int32_t);\
+ long j = (n2-4)*sizeof(int32_t);\
+ asm volatile(\
+ "movss %0, %%xmm7 \n\t"\
+ "movaps %1, %%xmm6 \n\t"\
+ "shufps $0, %%xmm7, %%xmm7 \n\t"\
+ "movaps %2, %%xmm5 \n\t"\
+ "subps %3, %%xmm7 \n\t"\
+ ::"m"(c), "m"(*ff_ps_1), "m"(*ff_ps_4), "m"(*ff_ps_1234)\
+ :"xmm5", "xmm6", "xmm7"\
+ );\
+ asm volatile(\
+ "1: \n\t"\
+ "movaps %%xmm7, %%xmm1 \n\t"\
+ "mulps %%xmm1, %%xmm1 \n\t"\
+ "movaps %%xmm6, %%xmm0 \n\t"\
+ "subps %%xmm1, %%xmm0 \n\t"\
+ PI2PS\
+ "mulps %%xmm0, %%xmm2 \n\t"\
+ "mulps %%xmm1, %%xmm3 \n\t"\
+ "movaps %%xmm2, (%2,%0) \n\t"\
+ "movaps %%xmm3, (%2,%1) \n\t"\
+ "subps %%xmm5, %%xmm7 \n\t"\
+ "sub $16, %1 \n\t"\
+ "add $16, %0 \n\t"\
+ "jl 1b \n\t"\
+ :"+&r"(i), "+&r"(j)\
+ :"r"(w_data+n2), "r"(data+n2)\
+ :"memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"\
+ );\
+}
+
+#define float_iterations 64 // how long to accumulate single-precision before upgrading
+
+#define OP2(op, s0,d0, s1,d1)\
+ #op" %%xmm"#s0", %%xmm"#d0" \n\t"\
+ #op" %%xmm"#s1", %%xmm"#d1" \n\t"
+
+#define CORR2_LOOP_SSE2(MULT,step)\
+ asm volatile(\
+ "movsd %5, %%xmm6 \n\t"\
+ "movapd %%xmm6, %%xmm7 \n\t"\
+ "2: \n\t"\
+ OP2(xorps, 4,4, 5,5)\
+ "1: \n\t"\
+ MULT\
+ OP2(addps, 1,4, 2,5)\
+ "add $"#step"*16, %0 \n\t"\
+ "sub $"#step", %1 \n\t"\
+ "jg 1b \n\t"\
+ OP2(movhlps, 4,1, 5,2)\
+ OP2(addps, 1,4, 2,5)\
+ OP2(cvtps2pd, 4,4, 5,5)\
+ OP2(addpd, 4,6, 5,7)\
+ "mov %6, %1 \n\t"\
+ "cmp $0, %0 \n\t"\
+ "jl 2b \n\t"\
+ OP2(movhlps, 6,0, 7,1)\
+ OP2(addsd, 6,0, 7,1)\
+ "movsd %%xmm0, %2 \n\t"\
+ "movsd %%xmm1, 8+%2 \n\t"\
+ :"+&r"(j), "+&r"(k), "=m"(autoc_buf[i])\
+ :"r"(data1+len), "r"(data1+len-i), "m"(*ff_pd_1), "i"(float_iterations)\
+ :"memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"\
+ );
+
+#define CORR2_LOOP_SSE1(MULT,step) {\
+ double s0=1.0, s1=1.0;\
+ DECLARE_ALIGNED_8( struct {float x[4];}, sumf );\
+ while(j<0) {\
+ asm volatile(\
+ OP2(xorps, 4,4, 5,5)\
+ "1: \n\t"\
+ MULT\
+ OP2(addps, 1,4, 2,5)\
+ "add $"#step"*16, %0 \n\t"\
+ "sub $"#step", %1 \n\t"\
+ "jg 1b \n\t"\
+ OP2(movhlps, 4,1, 5,2)\
+ OP2(addps, 1,4, 2,5)\
+ "movlps %%xmm4, %2 \n\t"\
+ "movlps %%xmm5, 8+%2 \n\t"\
+ :"+&r"(j), "+&r"(k), "=m"(sumf)\
+ :"r"(data1+len), "r"(data1+len-i)\
+ :"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"\
+ );\
+ k = float_iterations;\
+ s0 += (double)sumf.x[0] + (double)sumf.x[1];\
+ s1 += (double)sumf.x[2] + (double)sumf.x[3];\
+ }\
+ autoc_buf[i] = s0;\
+ autoc_buf[i+1] = s1;\
+}
+
+#define MULT_EVEN_SSSE3\
+ "movaps (%4,%0), %%xmm1 \n\t"\
+ "movaps %%xmm1, %%xmm2 \n\t"\
+ "movaps (%3,%0), %%xmm0 \n\t"\
+ "palignr $12, -16(%4,%0), %%xmm2 \n\t"\
+ OP2(mulps, 0,1, 0,2)
+
+#define MULT_ODD_SSSE3\
+ "movaps 8(%4,%0), %%xmm1 \n\t"\
+ "movaps -8(%4,%0), %%xmm3 \n\t"\
+ "movaps %%xmm1, %%xmm2 \n\t"\
+ "movaps (%3,%0), %%xmm0 \n\t"\
+ "palignr $8, %%xmm3, %%xmm1 \n\t"\
+ "palignr $4, %%xmm3, %%xmm2 \n\t"\
+ OP2(mulps, 0,1, 0,2)
+
+#define MULT_EVEN_SSE1\
+ "movaps (%4,%0), %%xmm1 \n\t"\
+ "movaps -16(%4,%0), %%xmm2 \n\t"\
+ "movaps (%3,%0), %%xmm0 \n\t"\
+ "movss %%xmm1, %%xmm2 \n\t"\
+ "shufps $0x93, %%xmm1, %%xmm2 \n\t"\
+ OP2(mulps, 0,1, 0,2)
+
+#define MULT_ODD_SSE1\
+ "movaps -8(%4,%0), %%xmm1 \n\t"\
+ "movaps 8(%4,%0), %%xmm3 \n\t"\
+ "movaps %%xmm1, %%xmm2 \n\t"\
+ "movaps (%3,%0), %%xmm0 \n\t"\
+ "movss %%xmm3, %%xmm2 \n\t"\
+ "shufps $0x4e, %%xmm3, %%xmm1 \n\t"\
+ "shufps $0x39, %%xmm2, %%xmm2 \n\t"\
+ OP2(mulps, 0,1, 0,2)
+
+#define MULT_LAST\
+ "movaps (%4,%0), %%xmm1 \n\t"\
+ "movaps 16(%4,%0), %%xmm2 \n\t"\
+ "mulps (%3,%0), %%xmm1 \n\t"\
+ "mulps 16(%3,%0), %%xmm2 \n\t"
+
+#define AUTOCORR(CPU, CORR2_LOOP, EVEN, ODD) \
+void ff_flac_compute_autocorr_##CPU(const int32_t *data, int len, int lag,\
+ double *autoc)\
+{\
+ double autoc_buf[lag + 2];\
+ float tmp[len + lag + 7];\
+ float *data1 = tmp + lag + 3;\
+ int i;\
+ if(len&3) {\
+ ff_flac_compute_autocorr(data, len, lag, autoc);\
+ return;\
+ }\
+ data1 -= ((long)data1&15)>>2;\
+ apply_welch_window_##CPU(data, len, data1);\
+ for(i=0; i<lag; i++)\
+ data1[i-lag]= 0.0;\
+ for(i=0; i<4; i++)\
+ data1[len+i]= 0.0;\
+ for(i=0; i<=lag; i+=2){\
+ long j = -len*sizeof(float);\
+ int k = (((len>>2)-1)&(float_iterations-1))+1;\
+ if(i&2)\
+ CORR2_LOOP(ODD, 1)\
+ else if(i==lag) {\
+ CORR2_LOOP(MULT_LAST, 2)\
+ autoc_buf[i] += autoc_buf[i+1] - 1.0;\
+ } else\
+ CORR2_LOOP(EVEN, 1)\
+ }\
+ memcpy(autoc, autoc_buf, (lag+1)*sizeof(double));\
+}
+
+WELCH(sse, PI2PS_SSE1)
+WELCH(sse2, PI2PS_SSE2)
+#define apply_welch_window_ssse3 apply_welch_window_sse2
+AUTOCORR(sse, CORR2_LOOP_SSE1, MULT_EVEN_SSE1, MULT_ODD_SSE1)
+AUTOCORR(sse2, CORR2_LOOP_SSE2, MULT_EVEN_SSE1, MULT_ODD_SSE1)
+#ifdef HAVE_SSSE3
+AUTOCORR(ssse3, CORR2_LOOP_SSE2, MULT_EVEN_SSSE3, MULT_ODD_SSSE3)
+#endif
+