[FFmpeg-devel] [PATCH] Make VP3/Theora Decoder Much Faster
Loren Merritt
lorenm
Mon Dec 7 17:28:24 CET 2009
On Mon, 7 Dec 2009, Mike Melanson wrote:
> I'm a little surprised to realize that this functionality doesn't already
> exist (been a long time since I wrote the decoder). The original VP3 decoder
> had IDCTs for 1- and 3-element fragments in addition to the full flavor IDCT.
> I think perhaps I tried to bring them over but someone convinced me that
> those other cases don't occur often enough to make it worthwhile. Have you
> found a lot of fragments with 1-3 non-zero coeffs?
I've never examined a Theora bitstream, and I'm not about to start now.
However, if Theora doesn't have lots of DC-only blocks, it's either very
different from every other inter-predicted DCT codec out there, or you're
encoding at a ridiculously high bitrate.
I don't remember why I never committed such a change to mpegvideo, but
it's not that it didn't help. Maybe this isn't bitexact and I never
bothered to figure out why?
--Loren Merritt
-------------- next part --------------
Index: i386/dsputil_mmx.c
===================================================================
--- i386/dsputil_mmx.c (revision 11552)
+++ i386/dsputil_mmx.c (working copy)
@@ -37,6 +37,8 @@
extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);
+extern void ff_xvid_idct_dc_add(uint8_t *dst, int stride, DCTELEM *block);
+extern void ff_xvid_idct_dc_put(uint8_t *dst, int stride, DCTELEM *block);
int mm_flags; /* multimedia extension flags */
@@ -3190,6 +3192,8 @@
c->idct_add= ff_simple_idct_add_mmx;
c->idct = ff_simple_idct_mmx;
c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
+ if(mm_flags & MM_MMXEXT)
+ c->idct_dc_add= ff_simple_idct_dc_add_mmx2;
#ifdef CONFIG_GPL
}else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
if(mm_flags & MM_MMXEXT){
@@ -3226,11 +3230,14 @@
c->idct_put= ff_idct_xvid_mmx2_put;
c->idct_add= ff_idct_xvid_mmx2_add;
c->idct = ff_idct_xvid_mmx2;
+ c->idct_dc_add= ff_xvid_idct_dc_add_mmx2;
}else{
c->idct_put= ff_idct_xvid_mmx_put;
c->idct_add= ff_idct_xvid_mmx_add;
c->idct = ff_idct_xvid_mmx;
+ c->idct_dc_add= ff_xvid_idct_dc_add;
}
+ c->idct_dc_put= ff_xvid_idct_dc_put;
}
}
Index: i386/h264dsp_mmx.c
===================================================================
--- i386/h264dsp_mmx.c (revision 11552)
+++ i386/h264dsp_mmx.c (working copy)
@@ -253,9 +253,8 @@
);
}
-static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+static inline void idct8_dc_add_mmx2(uint8_t *dst, int stride, int dc)
{
- int dc = (block[0] + 32) >> 6;
int y;
asm volatile(
"movd %0, %%mm0 \n\t"
@@ -264,7 +263,7 @@
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
+ ::"g"(dc)
);
for(y=2; y--; dst += 4*stride){
asm volatile(
@@ -292,7 +291,22 @@
}
}
+static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
+{
+ idct8_dc_add_mmx2(dst, stride, (block[0] + 32) >> 6);
+}
+static void ff_simple_idct_dc_add_mmx2(uint8_t *dst, int stride, DCTELEM *block)
+{
+ idct8_dc_add_mmx2(dst, stride, (block[0]*16383 + (1<<16)) >> 17);
+}
+
+static void ff_xvid_idct_dc_add_mmx2(uint8_t *dst, int stride, DCTELEM *block)
+{
+ idct8_dc_add_mmx2(dst, stride, (block[0] + 4) >> 3);
+}
+
+
/***********************************/
/* deblocking */
Index: mpegvideo.c
===================================================================
--- mpegvideo.c (revision 11552)
+++ mpegvideo.c (working copy)
@@ -1792,7 +1792,10 @@
DCTELEM *block, int i, uint8_t *dest, int line_size, int qscale)
{
s->dct_unquantize_intra(s, block, i, qscale);
- s->dsp.idct_put (dest, line_size, block);
+ if (s->block_last_index[i] > 0)
+ s->dsp.idct_put(dest, line_size, block);
+ else
+ s->dsp.idct_dc_put(dest, line_size, block);
}
/* add block[] to dest[] */
@@ -1800,7 +1803,10 @@
DCTELEM *block, int i, uint8_t *dest, int line_size)
{
if (s->block_last_index[i] >= 0) {
- s->dsp.idct_add (dest, line_size, block);
+ if (s->block_last_index[i] > 0)
+ s->dsp.idct_add(dest, line_size, block);
+ else
+ s->dsp.idct_dc_add(dest, line_size, block);
}
}
@@ -1810,7 +1816,10 @@
if (s->block_last_index[i] >= 0) {
s->dct_unquantize_inter(s, block, i, qscale);
- s->dsp.idct_add (dest, line_size, block);
+ if (s->block_last_index[i] > 0)
+ s->dsp.idct_add(dest, line_size, block);
+ else
+ s->dsp.idct_dc_add(dest, line_size, block);
}
}
Index: simple_idct.h
===================================================================
--- simple_idct.h (revision 11552)
+++ simple_idct.h (working copy)
@@ -44,4 +44,8 @@
void ff_simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block);
void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_simple_idct_dc_add(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_simple_idct_dc_put(uint8_t *dest, int line_size, DCTELEM *block);
+void ff_simple_idct_dc_put_mmx2(uint8_t *dest, int line_size, DCTELEM *block);
+
#endif /* FFMPEG_SIMPLE_IDCT_H */
Index: dsputil.c
===================================================================
--- dsputil.c (revision 11552)
+++ dsputil.c (working copy)
@@ -4028,6 +4028,8 @@
c->idct_put= ff_simple_idct_put;
c->idct_add= ff_simple_idct_add;
c->idct = ff_simple_idct;
+ c->idct_dc_put= ff_simple_idct_dc_put;
+ c->idct_dc_add= ff_simple_idct_dc_add;
c->idct_permutation_type= FF_NO_IDCT_PERM;
}
}
@@ -4298,6 +4300,11 @@
c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
}
+ if(!c->idct_dc_put)
+ c->idct_dc_put= c->idct_put;
+ if(!c->idct_dc_add)
+ c->idct_dc_add= c->idct_add;
+
switch(c->idct_permutation_type){
case FF_NO_IDCT_PERM:
for(i=0; i<64; i++)
Index: dsputil.h
===================================================================
--- dsputil.h (revision 11552)
+++ dsputil.h (working copy)
@@ -364,6 +364,10 @@
*/
void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+ /* As idct_put/idct_add, but for DC-only blocks: assumes all AC coefficients are zero. */
+ void (*idct_dc_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+ void (*idct_dc_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
+
/**
* idct input permutation.
* several optimized IDCTs need a permutated input (relative to the normal order of the reference
Index: h264idct.c
===================================================================
--- h264idct.c (revision 11552)
+++ h264idct.c (working copy)
@@ -154,10 +154,9 @@
}
}
-void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
+static inline void idct8_dc_add(uint8_t *dst, long stride, long dc){
int i, j;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
- int dc = (block[0] + 32) >> 6;
for( j = 0; j < 8; j++ )
{
for( i = 0; i < 8; i++ )
@@ -165,3 +164,31 @@
dst += stride;
}
}
+
+static inline void idct8_dc_put(uint8_t *dst, long stride, long dc){
+ int i;
+ uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+ uint64_t row = cm[dc] * 0x0101010101010101ULL;
+ for(i=0; i<8; i++)
+ *(uint64_t*)(dst+i*stride) = row;
+}
+
+void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
+ idct8_dc_add(dst, stride, (block[0] + 32) >> 6);
+}
+
+void ff_simple_idct_dc_add(uint8_t *dst, int stride, DCTELEM *block){
+ idct8_dc_add(dst, stride, (block[0]*16383 + (1<<16)) >> 17);
+}
+
+void ff_simple_idct_dc_put(uint8_t *dst, int stride, DCTELEM *block){
+ idct8_dc_put(dst, stride, (block[0]*16383 + (1<<16)) >> 17);
+}
+
+void ff_xvid_idct_dc_add(uint8_t *dst, int stride, DCTELEM *block){
+ idct8_dc_add(dst, stride, (block[0] + 4) >> 3);
+}
+
+void ff_xvid_idct_dc_put(uint8_t *dst, int stride, DCTELEM *block){
+ idct8_dc_put(dst, stride, (block[0] + 4) >> 3);
+}
More information about the ffmpeg-devel mailing list