[Ffmpeg-devel] [RFC] svq1 very slow encoding
Loren Merritt
lorenm
Fri Mar 30 00:18:58 CEST 2007
On Thu, 29 Mar 2007, Baptiste Coudurier wrote:
> Mike Melanson wrote:
>> Baptiste Coudurier wrote:
>>>
>>> I just noticed that svq1 encoding was slow as hell (around 2fps here on
>>> p4 2.4ghz), if someone had spare time/motivation to speed it up, it
>>> would be great.
>>
>> SVQ1 stands for Sorenson Vector Quantizer #1. Are you sure you
>> understand how vector quantizers operate? By nature, they are
>> excruciatingly slow to encode, yet relatively blindingly fast to decode.
>
> Well, I don't know, it seems quicktime player here explodes ffmpeg
> encoder performance (561s for 760 frame long movie, which is less than
> 2fps).
>
> Quicktime takes about 4 minutes and achieves 4mbit/s; quality is clearly
> lower of course, but it's clearly faster.
65% of the CPU time was spent on one line. Clearly a candidate for SIMD.
Patch makes the encode 2.3x faster on an Athlon 64. Additional speedups I
tried but didn't include here: using inline instead of dsp adds another
10%, and 3DNow! adds 3%.
--Loren Merritt
-------------- next part --------------
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
--- libavcodec/i386/dsputil_mmx.c (revision 8549)
+++ libavcodec/i386/dsputil_mmx.c (working copy)
@@ -1730,6 +1730,38 @@
WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
+
+static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){ // MMX: returns the sum over size elements of (pix2[i]-pix1[i])^2
+ int sum; // written by the final movd (operand %1)
+ long i=size; // counts down by 8 per pass; NOTE(review): presumably size must be a positive multiple of 8 -- confirm with callers
+ asm volatile(
+ "pxor %%mm4, %%mm4 \n" // mm4 = 0: accumulator holding two 32-bit partial sums
+ "1: \n"
+ "sub $8, %0 \n" // i -= 8; the jg at the bottom of the loop tests the flags set here
+ "movq (%2,%0), %%mm2 \n" // mm2 = the 8 bytes pix1[i..i+7]
+ "movq (%3,%0,2), %%mm0 \n" // mm0 = pix2[i..i+3] (index scaled by 2 bytes per int16_t)
+ "movq 8(%3,%0,2), %%mm1 \n" // mm1 = pix2[i+4..i+7]
+ "punpckhbw %%mm2, %%mm3 \n" // high byte of each mm3 word = pix1[i+4..i+7]; low bytes are leftover mm3 contents
+ "punpcklbw %%mm2, %%mm2 \n" // each mm2 word = a pix1[i..i+3] byte duplicated in both halves
+ "psraw $8, %%mm3 \n" // arithmetic shift: mm3 words = sign-extended pix1[i+4..i+7], leftover low bytes discarded
+ "psraw $8, %%mm2 \n" // mm2 words = sign-extended pix1[i..i+3]
+ "psubw %%mm3, %%mm1 \n" // mm1 = pix2 - pix1 for elements i+4..i+7
+ "psubw %%mm2, %%mm0 \n" // mm0 = pix2 - pix1 for elements i..i+3
+ "pmaddwd %%mm1, %%mm1 \n" // square each word and add adjacent pairs -> two 32-bit sums
+ "pmaddwd %%mm0, %%mm0 \n" // same for the lower four differences
+ "paddd %%mm1, %%mm4 \n" // accumulate into mm4
+ "paddd %%mm0, %%mm4 \n" // accumulate into mm4
+ "jg 1b \n" // repeat while i > 0
+ "movq %%mm4, %%mm3 \n" // copy the accumulator
+ "psrlq $32, %%mm3 \n" // move its upper 32-bit lane down
+ "paddd %%mm3, %%mm4 \n" // low lane of mm4 = total of both lanes
+ "movd %%mm4, %1 \n" // sum = low 32 bits of mm4
+ :"+r"(i), "=r"(sum) // outputs: %0 = i (read-write), %1 = sum
+ :"r"(pix1), "r"(pix2) // inputs: %2 = pix1, %3 = pix2
+ ); // NOTE(review): no clobber list and no emms in this asm -- confirm that matches the conventions of the rest of the file
+ return sum;
+}
+
#endif //CONFIG_ENCODERS
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
@@ -3215,6 +3247,8 @@
}
c->add_8x8basis= add_8x8basis_mmx;
+ c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
+
#endif //CONFIG_ENCODERS
c->h263_v_loop_filter= h263_v_loop_filter_mmx;
Index: libavcodec/svq1.c
===================================================================
--- libavcodec/svq1.c (revision 8549)
+++ libavcodec/svq1.c (working copy)
@@ -992,15 +992,10 @@
for(i=0; i<16; i++){
int sum= codebook_sum[stage*16 + i];
- int sqr=0;
- int diff, mean, score;
+ int sqr, diff, mean, score;
vector = codebook + stage*size*16 + i*size;
-
- for(j=0; j<size; j++){
- int v= vector[j];
- sqr += (v - block[stage][j])*(v - block[stage][j]);
- }
+ sqr = s->dsp.ssd_int8_vs_int16(vector, block[stage], size);
diff= block_sum[stage] - sum;
mean= (diff + (size>>1)) >> (level+3);
assert(mean >-300 && mean<300);
Index: libavcodec/dsputil.c
===================================================================
--- libavcodec/dsputil.c (revision 8549)
+++ libavcodec/dsputil.c (working copy)
@@ -3694,6 +3694,14 @@
return score;
}
+static int ssd_int8_vs_int16_c(int8_t *pix1, int16_t *pix2, int size){ // C version: returns the sum over i<size of (pix1[i]-pix2[i])^2
+ int score=0; // running total of squared differences
+ long i;
+ for(i=0; i<size; i++)
+ score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); // both operands are promoted to int before the subtraction
+ return score;
+}
+
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
@@ -4076,6 +4084,8 @@
c->w97[1]= w97_8_c;
#endif
+ c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
+
c->add_bytes= add_bytes_c;
c->diff_bytes= diff_bytes_c;
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
Index: libavcodec/dsputil.h
===================================================================
--- libavcodec/dsputil.h (revision 8549)
+++ libavcodec/dsputil.h (working copy)
@@ -200,6 +200,8 @@
me_cmp_func ildct_cmp[5]; //only width 16 used
me_cmp_func frame_skip_cmp[5]; //only width 8 used
+ int (*ssd_int8_vs_int16)(int8_t *pix1, int16_t *pix2, int size); // sum of squared differences between an int8_t array and an int16_t array of size elements
+
/**
* Halfpel motion compensation with rounding (a+b+1)>>1.
* this is an array[4][4] of motion compensation funcions for 4
More information about the ffmpeg-devel
mailing list