[Ffmpeg-devel] [PATCH] snow cached halfpel
Loren Merritt
lorenm
Mon Mar 13 08:02:48 CET 2006
Note: this patch is not ready for inclusion; I am just posting it as-is
for anyone who cares.
Stores halfpel interpolated frames, so that each call to hpel mc is just a
copy, and each qpel mc is just a pavgb. This makes encoding with obme+qpel
9% faster, and obme+hpel 6% faster. But due to the extra branch in
pred_block, decoding is 1% slower.
I expect that even better encoding speed gains can be had using the same
method in mpeg4+qpel and snow+epzs, but that will involve much more
invasive changes.
--Loren Merritt
-------------- next part --------------
Index: snow.c
===================================================================
RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/snow.c,v
retrieving revision 1.91
diff -u -r1.91 snow.c
--- snow.c 13 Mar 2006 01:27:13 -0000 1.91
+++ snow.c 13 Mar 2006 06:29:00 -0000
@@ -445,7 +445,7 @@
AVFrame new_picture;
AVFrame input_picture; ///< new_picture with the internal linesizes
AVFrame current_picture;
- AVFrame last_picture;
+ Picture last_picture;
AVFrame mconly_picture;
// uint8_t q_context[16];
uint8_t header_state[32];
@@ -2526,7 +2526,31 @@
assert(tab_index>=0 && tab_index<4 || b_w==32);
if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
- else if(b_w==32){
+ else if(s->last_picture.interpolated[0] && plane_index==0 && src!=tmp+MB_SIZE){
+ const uint8_t *hpels[4] = {
+ s->last_picture.data[0],
+ s->last_picture.interpolated[0],
+ s->last_picture.interpolated[1],
+ s->last_picture.interpolated[2]};
+ static const int h0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+ static const int h1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+ int offset = (sx+2) + (sy+2)*stride;
+ uint8_t *src0 = hpels[h0[dy+(dx>>2)]] + offset + (dy>=12)*stride;
+ uint8_t *src1 = hpels[h1[dy+(dx>>2)]] + offset + (dx>=12);
+ int qpel = (dx|dy)&7;
+ if(b_w==32){
+ s->dsp.put_pixels_tab[0][0](dst, src0, stride, b_h);
+ s->dsp.put_pixels_tab[0][0](dst+16, src0+16, stride, b_h);
+ if(qpel){
+ s->dsp.avg_pixels_tab[0][0](dst, src1, stride, b_h);
+ s->dsp.avg_pixels_tab[0][0](dst+16, src1+16, stride, b_h);
+ }
+ }else{
+ s->dsp.put_pixels_tab[tab_index][0](dst, src0, stride, b_h);
+ if(qpel)
+ s->dsp.avg_pixels_tab[tab_index][0](dst, src1, stride, b_h);
+ }
+ }else if(b_w==32){
int y;
for(y=0; y<b_h; y+=16){
s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
@@ -3839,7 +3863,7 @@
s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
mc_block_hpel ## dx ## dy ## 8;
- mcfh(0, 0)
+// mcfh(0, 0)
mcfh(8, 0)
mcfh(0, 8)
mcfh(8, 8)
@@ -4012,6 +4036,19 @@
s->avctx->get_buffer(s->avctx, &s->input_picture);
+ //FIXME hpel caching would help with any ME method, but requires more changes in motion_est.c
+ if(s->avctx->me_method == ME_ITER && !(avctx->flags&CODEC_FLAG_EMU_EDGE))
+ {
+ int size, offset, i;
+ int w= avctx->width;
+ int h= avctx->height;
+ avcodec_align_dimensions(avctx, &w, &h);
+ w+= EDGE_WIDTH*2;
+ h+= EDGE_WIDTH*2;
+ for(i=0; i<3; i++)
+ s->last_picture.interpolated[i]= av_malloc(w*h) + (w+1)*EDGE_WIDTH;
+ }
+
return 0;
}
@@ -4026,10 +4063,26 @@
draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
}
- tmp= s->last_picture;
- s->last_picture= s->current_picture;
+ tmp= *(AVFrame*)&s->last_picture;
+ *(AVFrame*)&s->last_picture= s->current_picture;
s->current_picture= tmp;
+ if(s->last_picture.data[0] && s->last_picture.interpolated[0]){
+ int x,y,i;
+ int stride = s->last_picture.linesize[0];
+ uint8_t **dst = s->last_picture.interpolated;
+ assert(EDGE_WIDTH >= 12);
+ for(y=-8; y<h+4; y+=8)
+ for(x=-8; x<w+4; x+=8){
+ uint8_t *src= s->last_picture.data[0]+x+y*stride;
+ s->dsp.put_h264_qpel_pixels_tab[1][ 2](dst[0]+x+y*stride, src, stride);
+ s->dsp.put_h264_qpel_pixels_tab[1][ 8](dst[1]+x+y*stride, src, stride);
+ s->dsp.put_h264_qpel_pixels_tab[1][10](dst[2]+x+y*stride, src, stride);
+ }
+ for(i=0; i<3; i++)
+ draw_edges(dst[i]-3-3*stride, stride, w+6, h+6, EDGE_WIDTH-3);
+ }
+
s->current_picture.reference= 1;
if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
@@ -4221,7 +4274,7 @@
}
if(s->last_picture.data[0])
- avctx->release_buffer(avctx, &s->last_picture);
+ avctx->release_buffer(avctx, (AVFrame*)&s->last_picture);
s->current_picture.coded_picture_number = avctx->frame_number;
s->current_picture.pict_type = pict->pict_type;
@@ -4264,6 +4317,13 @@
}
}
}
+
+ if(s->last_picture.interpolated[0] && s->last_picture.linesize[0])
+ {
+ int i;
+ for(i=0; i<3; i++)
+ av_free(s->last_picture.interpolated[i] - (s->last_picture.linesize[0]+1)*EDGE_WIDTH);
+ }
}
static int encode_end(AVCodecContext *avctx)
@@ -4429,7 +4489,7 @@
emms_c();
if(s->last_picture.data[0])
- avctx->release_buffer(avctx, &s->last_picture);
+ avctx->release_buffer(avctx, (AVFrame*)&s->last_picture);
if(!(s->avctx->debug&2048))
*picture= s->current_picture;
More information about the ffmpeg-devel
mailing list