[FFmpeg-devel] [RFC] snow SSE2 optimizations (was: Re: [FFmpeg-cvslog] r10223 - in trunk/libavcodec/i386: dsputil_mmx.c snowdsp_mmx.c)
Reimar Döffinger
Reimar.Doeffinger
Tue Aug 28 00:07:02 CEST 2007
Hello,
On Mon, Aug 27, 2007 at 11:34:44PM +0200, Michael Niedermayer wrote:
> > > also there's some shift by 4 missing here
> >
> > I don't think so, there is a "psraw $4, %%xmm0 \n\t"
> > further down. And I know the code is an unreadable mess. I'll try to
> > reimplement it sometime if no one else will do it...
>
> the data after obmc is 16bit unsigned, the data after the IDWT is 13bit
> signed; the white point differs by a factor of 16, so a shift by 4 is needed
> to get them on the same level before adding ...
Right, right, I just missed a few lines of code while reading the C
version, thus the confusion.
Since the diff is unreadable, do you think the following is better than
the current code (I mean visually, it does decode correctly after all ;-),
though it is not measurably faster than the mmx code on my PC):
/*
 * Asm fragment: load two rows of 8 pixels from one prediction block.
 *
 * Reads the pointer stored at index `block` of the pointer array addressed
 * by REG_a into REG_d (REG_d is overwritten), then loads 8 bytes from
 * (REG_d) into dst1 and 8 bytes from (REG_d + REG_c) into dst2.
 * Both results are interleaved byte-wise with xmm7 via punpcklbw; with
 * xmm7 == 0 this widens the 8 bytes to 8 unsigned 16-bit words.
 * `block` is stringified, dst1/dst2 are register-name string literals.
 */
#define load_block_twolines(block, dst1, dst2) \
"mov "PTR_SIZE"*"#block"(%%"REG_a"), %%"REG_d" \n\t"\
"movq (%%"REG_d" ), "dst1" \n\t"\
"movq (%%"REG_d", %%"REG_c"), "dst2" \n\t"\
"punpcklbw %%xmm7, "dst1" \n\t"\
"punpcklbw %%xmm7, "dst2" \n\t"
/*
 * Asm fragment: load two rows of 8 weight bytes relative to REG_S.
 *
 * Loads 8 bytes from offset(REG_S) into dst1 and 8 bytes from
 * stride+offset(REG_S) into dst2, then interleaves each byte-wise with xmm7
 * via punpcklbw; with xmm7 == 0 this widens the bytes to unsigned 16-bit
 * words. `offset` and `stride` are stringified into the displacement
 * expression, dst1/dst2 are register-name string literals.
 */
#define load_obmc_twolines(offset, stride, dst1, dst2) \
"movq "#offset"(%%"REG_S"), "dst1" \n\t"\
"movq "#stride"+"#offset"(%%"REG_S"), "dst2" \n\t"\
"punpcklbw %%xmm7, "dst1" \n\t"\
"punpcklbw %%xmm7, "dst2" \n\t"
/*
 * Asm fragment: add `delta` to each of the four pointer-sized slots
 * (indices 3, 2, 1, 0) of the array addressed by REG_a.
 * The slots are modified in memory, so the array itself is changed.
 * `delta` is an operand string literal (e.g. a register name).
 */
#define inc_block(delta) \
"add "delta", "PTR_SIZE"*3(%%"REG_a") \n\t"\
"add "delta", "PTR_SIZE"*2(%%"REG_a") \n\t"\
"add "delta", "PTR_SIZE"*1(%%"REG_a") \n\t"\
"add "delta", (%%"REG_a") \n\t"
/**
 * SSE2 inner loop: weight four 8-pixel-wide blocks with OBMC weights, add the
 * result to the slice-buffer lines and store the clipped bytes to dst8.
 * Two rows are processed per loop iteration.
 *
 * Per output pixel the code computes, in 16-bit arithmetic:
 *   sum = block[3]*w(0) + block[2]*w(8) + block[1]*w(128) + block[0]*w(136)
 *         (pmullw products, accumulated with unsigned saturation)
 *   out = clip_uint8(((sum >> 4) + 8 + line[x]) >> 4)
 * where w(k) is the weight byte at obmc + k (next row at +16), the first
 * shift is logical, the second arithmetic, and line[x] is the 16-bit word
 * read from sb->line[src_y + row] at byte offset src_x << 1.
 * The slice-buffer lines are only read; nothing is written back to them.
 *
 * @param obmc        weight table; rows are assumed to be 16 bytes apart
 *                    (the row stride and the 8/128/136 offsets are hard-coded)
 * @param obmc_stride not referenced by this implementation
 * @param block       array of four source row pointers; every entry is
 *                    advanced by 2*src_stride per iteration, i.e. the
 *                    caller's array is modified
 * @param b_w         not referenced by this implementation
 * @param b_h         number of rows; must be even and non-zero, since the
 *                    loop body runs before the counter is decremented by 2
 *                    and tested for zero
 * @param src_x       horizontal element offset into each slice-buffer line
 * @param src_y       index of the first slice-buffer line used
 * @param src_stride  byte distance between rows, used both for the block
 *                    sources and for dst8
 * @param sb          slice buffer providing the line pointers (sb->line)
 * @param add         not referenced by this implementation
 * @param dst8        destination for the packed 8-bit pixels
 */
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t
*obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
int src_x, int src_y, long src_stride,
slice_buffer * sb, int add, uint8_t * dst8){
    IDWTELEM **dst_array = sb->line + src_y;
    long tmp;
    asm volatile (
        "mov %7, %%"REG_c" \n\t"              /* REG_c = src_stride */
        "mov %6, %2 \n\t"                     /* tmp = b_h, the row counter */
        "mov %4, %%"REG_S" \n\t"              /* REG_S = obmc */
        "pxor %%xmm7, %%xmm7 \n\t"            /* xmm7 = 0, used for byte->word unpacking */
        "pcmpeqd %%xmm3, %%xmm3 \n\t"         /* all ones ... */
        "psllw $15, %%xmm3 \n\t"              /* ... 0x8000 per word ... */
        "psrlw $12, %%xmm3 \n\t"              /* ... 0x0008 per word: rounding constant */
        "1: \n\t"
        /* REG_D = dst_array[0] + (src_x << 1): first slice-buffer line */
        "mov %1, %%"REG_D" \n\t"
        "mov (%%"REG_D"), %%"REG_D" \n\t"
        "add %3, %%"REG_D" \n\t"
        /* xmm1 / xmm5 accumulate the weighted sum for row 0 / row 1 */
        load_block_twolines(3, "%%xmm1", "%%xmm5")
        load_obmc_twolines (0, 16, "%%xmm0", "%%xmm4")
        "pmullw %%xmm0, %%xmm1 \n\t"
        "pmullw %%xmm4, %%xmm5 \n\t"
        load_block_twolines(2, "%%xmm2", "%%xmm6")
        load_obmc_twolines (8, 16, "%%xmm0", "%%xmm4")
        "pmullw %%xmm0, %%xmm2 \n\t"
        "pmullw %%xmm4, %%xmm6 \n\t"
        "paddusw %%xmm2, %%xmm1 \n\t"
        "paddusw %%xmm6, %%xmm5 \n\t"
        load_block_twolines(1, "%%xmm2", "%%xmm6")
        load_obmc_twolines (128, 16, "%%xmm0", "%%xmm4")
        "pmullw %%xmm0, %%xmm2 \n\t"
        "pmullw %%xmm4, %%xmm6 \n\t"
        "paddusw %%xmm2, %%xmm1 \n\t"
        "paddusw %%xmm6, %%xmm5 \n\t"
        load_block_twolines(0, "%%xmm2", "%%xmm6")
        load_obmc_twolines (136, 16, "%%xmm0", "%%xmm4")
        "pmullw %%xmm0, %%xmm2 \n\t"
        "pmullw %%xmm4, %%xmm6 \n\t"
        "paddusw %%xmm2, %%xmm1 \n\t"
        "paddusw %%xmm6, %%xmm5 \n\t"
        /* REG_d was scratch in load_block_twolines; reload it with dst8 */
        "mov %0, %%"REG_d" \n\t"
        /* row 0: bring the sum to the line's scale, round, add the line */
        "movdqu (%%"REG_D"), %%xmm0 \n\t"
        "psrlw $4, %%xmm1 \n\t"
        "paddw %%xmm3, %%xmm1 \n\t"
        "paddw %%xmm1, %%xmm0 \n\t"
        /* REG_D = dst_array[1] + (src_x << 1): second slice-buffer line */
        "mov %1, %%"REG_D" \n\t"
        "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D" \n\t"
        "add %3, %%"REG_D" \n\t"
        /* row 1: same as row 0 */
        "movdqu (%%"REG_D"), %%xmm4 \n\t"
        "psrlw $4, %%xmm5 \n\t"
        "paddw %%xmm3, %%xmm5 \n\t"
        "paddw %%xmm5, %%xmm4 \n\t"
        /* final signed shift, then pack both rows to bytes with unsigned saturation */
        "psraw $4, %%xmm0 \n\t"
        "psraw $4, %%xmm4 \n\t"
        "packuswb %%xmm4, %%xmm0 \n\t"
        "movq %%xmm0, (%%"REG_d") \n\t"               /* row 0 -> dst8 */
        "movhpd %%xmm0, (%%"REG_d",%%"REG_c") \n\t"   /* row 1 -> dst8 + src_stride */
        /* advance everything by two rows; REG_c temporarily holds 2*src_stride */
        "sal $1, %%"REG_c" \n\t"
        "add $"PTR_SIZE"*2, %1 \n\t"
        "add $16*2, %%"REG_S" \n\t"
        "add %%"REG_c", %0 \n\t"
        inc_block("%%"REG_c)
        "sar $1, %%"REG_c" \n\t"
        "sub $2, %2 \n\t"
        "jnz 1b \n\t"
        : "+m"(dst8), "+m"(dst_array), "=&r"(tmp)
        : "rm"((long)(src_x<<1)), "m"(obmc), "a"(block), "m"((long)b_h), "m"((long)src_stride)
        /* "memory": the asm stores to the dst8 buffer and to block[], and
         * reads the obmc/block/line buffers, none of which are operands. */
        : "%"REG_c,"%"REG_S,"%"REG_D,"%"REG_d, "memory");
}
Greetings,
Reimar Döffinger
More information about the ffmpeg-devel
mailing list