[Ffmpeg-devel] [PATCH] Snow mmx+sse2 asm optimizations
Michael Niedermayer
Mon Mar 6 02:06:01 CET 2006
Hi
On Sun, Mar 05, 2006 at 06:09:09PM -0500, Robert Edele wrote:
[...]
> With the help of ods15, we have done the following:
> - the asm code now resides entirely in dsputil_mmx.c.
> - snow_mmx_sse2.h is now gone
> - code previously in snow.c and all of snow_mmx_sse2.h is now in
> dsputil_mmx.c, dsputil.c, and dsputil.h.
> - snow calls the asm via dsputil function pointers.
>
> If you have any further issues with this code, please let me know.
it looks much better than before, but
please move the stuff from dsputil_mmx.c to snowdsp_mmx.c
this should be just a copy&paste + Makefile update
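something like this in the libavcodec Makefile should do (untested sketch,
check the exact variable/conditional against the current Makefile):

    ifeq ($(TARGET_MMX),yes)
    OBJS += i386/snowdsp_mmx.o
    endif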
[...]
> -static void vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width){
> +void ff_snow_vertical_compose97i(void *vb0, void *vb1, void *vb2, void *vb3, void *vb4, void *vb5, int width){
> + DWTELEM *b0 = vb0;
> + DWTELEM *b1 = vb1;
> + DWTELEM *b2 = vb2;
> + DWTELEM *b3 = vb3;
> + DWTELEM *b4 = vb4;
> + DWTELEM *b5 = vb5;
move DWTELEM to dsputil.h or anywhere else, but please not that mess
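i.e. roughly this in dsputil.h (untested sketch, using a typedef instead of
the #define the patch adds), so the prototype can take real pointers:

    typedef int DWTELEM;

    void ff_snow_vertical_compose97i(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
                                     DWTELEM *b3, DWTELEM *b4, DWTELEM *b5,
                                     int width);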
[...]
> @@ -2545,6 +2620,41 @@
> }
> }
>
> +void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> + int src_x, int src_y, int src_stride, void * vsb, int add, uint8_t * dst8){
> + slice_buffer * sb = vsb;
uhm...
put
typedef struct slice_buffer_s slice_buffer;
in dsputil.c or wherever it's needed, and
struct slice_buffer_s { ... }; in snow.c
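untested sketch of what i mean (pointers to incomplete types are fine in
prototypes, so the header never needs the struct body):

    /* dsputil.h: opaque forward declaration, enough for the prototypes */
    typedef struct slice_buffer_s slice_buffer;

    void ff_snow_inner_add_yblock(uint8_t *obmc, const int obmc_stride,
                                  uint8_t **block, int b_w, int b_h,
                                  int src_x, int src_y, int src_stride,
                                  slice_buffer *sb, int add, uint8_t *dst8);

    /* snow.c: the one real definition */
    struct slice_buffer_s {
        /* ... fields as they are today ... */
    };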
[...]
> Index: i386/dsputil_mmx.c
> ===================================================================
> RCS file: /cvsroot/ffmpeg/ffmpeg/libavcodec/i386/dsputil_mmx.c,v
> retrieving revision 1.111
> diff -u -r1.111 dsputil_mmx.c
> --- i386/dsputil_mmx.c 10 Feb 2006 06:55:25 -0000 1.111
> +++ i386/dsputil_mmx.c 5 Mar 2006 17:31:12 -0000
> @@ -2564,6 +2564,1518 @@
> }
> #endif
>
> +/* snow wavelet */
> +#define DWTELEM int
> +#define W_AM 3
> +#define W_AO 0
> +#define W_AS 1
> +
> +#define W_BM 1
> +#define W_BO 8
> +#define W_BS 4
> +
> +#define W_CM 1
> +#define W_CO 0
> +#define W_CS 0
> +
> +#define W_DM 3
> +#define W_DO 4
> +#define W_DS 3
> +
> +#ifdef ARCH_X86_64
> +#define PTR_SIZE "8"
> +#else
> +#define PTR_SIZE "4"
> +#endif
> +
> +/** Used to minimize the amount of memory used in order to optimize cache performance. **/
> +typedef struct {
> + DWTELEM * * line; ///< For use by idwt and predict_slices.
> + DWTELEM * * data_stack; ///< Used for internal purposes.
> + int data_stack_top;
> + int line_count;
> + int line_width;
> + int data_count;
> + DWTELEM * base_buffer; ///< Buffer that this structure is caching.
> +} slice_buffer;
duplicating #defines and structs is not acceptable, these should be in a
common header
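for the lifting coefficients that could be a shared header, e.g. a snow.h
(sketch, the file name is just a suggestion):

    #ifndef SNOW_H
    #define SNOW_H

    #define W_AM 3
    #define W_AO 0
    #define W_AS 1

    #define W_BM 1
    #define W_BO 8
    #define W_BS 4

    #define W_CM 1
    #define W_CO 0
    #define W_CS 0

    #define W_DM 3
    #define W_DO 4
    #define W_DS 3

    #endif /* SNOW_H */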
> +
> +#define snow_interleave_line_header(low,b,width)\
> + int i = (width) - 2;\
> + \
> + if ((width) & 1)\
> + {\
> + (b)[i+1] = (low)[(i+1)>>1];\
> + i--;\
> + }
> +
> +#define snow_interleave_line_footer(low,high,b)\
> + for (; i>=0; i-=2){\
> + (b)[i+1] = (high)[i>>1];\
> + (b)[i] = (low)[i>>1];\
> + }
these should be inline functions
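one possible shape, untested, keeping the split so asm can still sit between
the two halves (the running index is passed around explicitly instead of
leaking out of a macro):

    static inline int snow_interleave_line_header(DWTELEM *low, DWTELEM *b,
                                                  int width){
        int i = width - 2;
        if (width & 1) {
            b[i + 1] = low[(i + 1) >> 1];
            i--;
        }
        return i;
    }

    static inline void snow_interleave_line_footer(int i, DWTELEM *low,
                                                   DWTELEM *high, DWTELEM *b){
        for (; i >= 0; i -= 2) {
            b[i + 1] = high[i >> 1];
            b[i]     = low[i >> 1];
        }
    }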
> +
> +static void horizontal_compose97i_sse2(void *vb, int width){
> + DWTELEM *b = vb;
> + const int w2= (width+1)>>1;
> + // SSE2 code runs faster with pointers aligned on a 32-byte boundary.
> + DWTELEM temp_buf[width>>1];
> + DWTELEM * const temp = temp_buf + 4 - (((int)temp_buf & 0xF) / 4);
replace the /4 with >>2 or make the type unsigned; dividing a signed int by 4 is slow
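that is (the masked value is always 0..15, so the shift is safe here):

    DWTELEM * const temp = temp_buf + 4 - (((int)temp_buf & 0xF) >> 2);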
[...]
> + for(; i<w_l; i++){
> + b[i] = b[i] - ((W_DM * (ref[i] + ref[i + 1]) + W_DO) >> W_DS);
> + }
> +
> + if(width&1){
> + b[w_l] = b[w_l] - ((W_DM * 2 * ref[w_l] + W_DO) >> W_DS);
> + }
[...]
> + for(; i<w_r; i++){
> + dst[i] = dst[i] - (b[i] + b[i + 1]);
> + }
> +
> + if(!(width&1)){
> + dst[w_r] = dst[w_r] - (2 * b[w_r]);
> + }
[...]
> + for(; i<w_l; i++){
> + b[i] = b[i] - (((-(ref[i] + ref[(i+1)])+W_BO) - 4*b[i])>>W_BS);
> + }
> +
> + if(width&1){
> + b[w_l] = b[w_l] - (((-2 * ref[w_l] + W_BO) - 4 * b[w_l]) >> W_BS);
> + }
...
replace this with a function; see the lift() function in snow.c for how to do
it, if it's not obvious
the same applies to the other such cases
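rough sketch of such a helper for the W_D-style tail above (hypothetical name,
modeled loosely on lift() in snow.c, untested; the other cases need the extra
multiplier/update parameters which lift() already has):

    static inline void snow_horizontal_lift_tail(DWTELEM *b, DWTELEM *ref,
                                                 int i, int w, int width,
                                                 int mul, int add, int shift){
        for (; i < w; i++)
            b[i] -= (mul * (ref[i] + ref[i + 1]) + add) >> shift;
        if (width & 1)   /* mirrored right edge */
            b[w] -= (mul * 2 * ref[w] + add) >> shift;
    }

the first case above then becomes
snow_horizontal_lift_tail(b, ref, i, w_l, width, W_DM, W_DO, W_DS);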
[...]
> +static void vertical_compose97i_sse2(void *vb0, void *vb1, void *vb2, void *vb3, void *vb4, void *vb5, int width){
> + DWTELEM *b0 = vb0;
> + DWTELEM *b1 = vb1;
> + DWTELEM *b2 = vb2;
> + DWTELEM *b3 = vb3;
> + DWTELEM *b4 = vb4;
> + DWTELEM *b5 = vb5;
> + int i;
> + int end_w2 = width >> 4; /* Needed because GCC does something totally brain dead and mis-loads end_w into the asm code if I use end_w directly.*/
> +
> + asm volatile (
> + "sal $4, %%"REG_d" \n\t"
> + "jmp 2f \n\t"
> + "1: \n\t"
> +
> + "mov %5, %%"REG_a" \n\t"
> + "mov %3, %%"REG_b" \n\t"
> +
> + "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0 \n\t"
> + "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2 \n\t"
> + "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4 \n\t"
> + "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6 \n\t"
> +
> + "paddd (%%"REG_a",%%"REG_d",4), %%xmm0 \n\t"
> + "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2 \n\t"
> + "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4 \n\t"
> + "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6 \n\t"
> +
> + "movdqa %%xmm0, %%xmm1 \n\t"
> + "movdqa %%xmm2, %%xmm3 \n\t"
> + "movdqa %%xmm4, %%xmm5 \n\t"
> + "movdqa %%xmm6, %%xmm7 \n\t"
> +
> + "pslld $1, %%xmm0 \n\t"
> + "pslld $1, %%xmm2 \n\t"
> + "pslld $1, %%xmm4 \n\t"
> + "pslld $1, %%xmm6 \n\t"
> +
> + "paddd %%xmm1, %%xmm0 \n\t"
> + "paddd %%xmm3, %%xmm2 \n\t"
> + "paddd %%xmm5, %%xmm4 \n\t"
> + "paddd %%xmm7, %%xmm6 \n\t"
> +
> + "pcmpeqd %%xmm1, %%xmm1 \n\t"
> + "pslld $31, %%xmm1 \n\t"
> + "psrld $29, %%xmm1 \n\t"
> + "mov %4, %%"REG_a" \n\t"
> +
> + "paddd %%xmm1, %%xmm0 \n\t"
> + "paddd %%xmm1, %%xmm2 \n\t"
> + "paddd %%xmm1, %%xmm4 \n\t"
> + "paddd %%xmm1, %%xmm6 \n\t"
> +
> + "psrad $3, %%xmm0 \n\t"
> + "psrad $3, %%xmm2 \n\t"
> + "psrad $3, %%xmm4 \n\t"
> + "psrad $3, %%xmm6 \n\t"
> +
> + "movdqa (%%"REG_a",%%"REG_d",4), %%xmm1 \n\t"
> + "movdqa 16(%%"REG_a",%%"REG_d",4), %%xmm3 \n\t"
> + "movdqa 32(%%"REG_a",%%"REG_d",4), %%xmm5 \n\t"
> + "movdqa 48(%%"REG_a",%%"REG_d",4), %%xmm7 \n\t"
> +
> + "psubd %%xmm0, %%xmm1 \n\t"
> + "psubd %%xmm2, %%xmm3 \n\t"
> + "psubd %%xmm4, %%xmm5 \n\t"
> + "psubd %%xmm6, %%xmm7 \n\t"
> +
> + "movdqa %%xmm1, (%%"REG_a",%%"REG_d",4) \n\t"
> + "movdqa %%xmm3, 16(%%"REG_a",%%"REG_d",4) \n\t"
> + "movdqa %%xmm5, 32(%%"REG_a",%%"REG_d",4) \n\t"
> + "movdqa %%xmm7, 48(%%"REG_a",%%"REG_d",4) \n\t"
> +
> + "mov %2, %%"REG_c" \n\t"
> +
> + "paddd (%%"REG_c",%%"REG_d",4), %%xmm1 \n\t"
> + "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm3 \n\t"
> + "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm5 \n\t"
> + "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm7 \n\t"
> +
> + "movdqa (%%"REG_b",%%"REG_d",4), %%xmm0 \n\t"
> + "movdqa 16(%%"REG_b",%%"REG_d",4), %%xmm2 \n\t"
> + "movdqa 32(%%"REG_b",%%"REG_d",4), %%xmm4 \n\t"
> + "movdqa 48(%%"REG_b",%%"REG_d",4), %%xmm6 \n\t"
> +
> + "psubd %%xmm1, %%xmm0 \n\t"
> + "psubd %%xmm3, %%xmm2 \n\t"
> + "psubd %%xmm5, %%xmm4 \n\t"
> + "psubd %%xmm7, %%xmm6 \n\t"
> +
> + "movdqa %%xmm0, (%%"REG_b",%%"REG_d",4) \n\t"
> + "movdqa %%xmm2, 16(%%"REG_b",%%"REG_d",4) \n\t"
> + "movdqa %%xmm4, 32(%%"REG_b",%%"REG_d",4) \n\t"
> + "movdqa %%xmm6, 48(%%"REG_b",%%"REG_d",4) \n\t"
> +
> + "mov %1, %%"REG_a" \n\t"
> +
> + "paddd (%%"REG_a",%%"REG_d",4), %%xmm0 \n\t"
> + "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2 \n\t"
> + "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4 \n\t"
> + "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6 \n\t"
> +
> + "movdqa (%%"REG_c",%%"REG_d",4), %%xmm1 \n\t"
> + "movdqa 16(%%"REG_c",%%"REG_d",4), %%xmm3 \n\t"
> + "movdqa 32(%%"REG_c",%%"REG_d",4), %%xmm5 \n\t"
> + "movdqa 48(%%"REG_c",%%"REG_d",4), %%xmm7 \n\t"
> +
> + "pslld $2, %%xmm1 \n\t"
> + "pslld $2, %%xmm3 \n\t"
> + "pslld $2, %%xmm5 \n\t"
> + "pslld $2, %%xmm7 \n\t"
> +
> + "paddd %%xmm1, %%xmm0 \n\t"
> + "paddd %%xmm3, %%xmm2 \n\t"
> + "paddd %%xmm5, %%xmm4 \n\t"
> + "paddd %%xmm7, %%xmm6 \n\t"
> +
> + "pcmpeqd %%xmm1, %%xmm1 \n\t"
> + "pslld $31, %%xmm1 \n\t"
> + "psrld $28, %%xmm1 \n\t"
> + "mov %0, %%"REG_b" \n\t"
> +
> + "paddd %%xmm1, %%xmm0 \n\t"
> + "paddd %%xmm1, %%xmm2 \n\t"
> + "paddd %%xmm1, %%xmm4 \n\t"
> + "paddd %%xmm1, %%xmm6 \n\t"
> +
> + "psrad $4, %%xmm0 \n\t"
> + "psrad $4, %%xmm2 \n\t"
> + "psrad $4, %%xmm4 \n\t"
> + "psrad $4, %%xmm6 \n\t"
> +
> + "paddd (%%"REG_c",%%"REG_d",4), %%xmm0 \n\t"
> + "paddd 16(%%"REG_c",%%"REG_d",4), %%xmm2 \n\t"
> + "paddd 32(%%"REG_c",%%"REG_d",4), %%xmm4 \n\t"
> + "paddd 48(%%"REG_c",%%"REG_d",4), %%xmm6 \n\t"
> +
> + "movdqa %%xmm0, (%%"REG_c",%%"REG_d",4) \n\t"
> + "movdqa %%xmm2, 16(%%"REG_c",%%"REG_d",4) \n\t"
> + "movdqa %%xmm4, 32(%%"REG_c",%%"REG_d",4) \n\t"
> + "movdqa %%xmm6, 48(%%"REG_c",%%"REG_d",4) \n\t"
> +
> + "paddd (%%"REG_b",%%"REG_d",4), %%xmm0 \n\t"
> + "paddd 16(%%"REG_b",%%"REG_d",4), %%xmm2 \n\t"
> + "paddd 32(%%"REG_b",%%"REG_d",4), %%xmm4 \n\t"
> + "paddd 48(%%"REG_b",%%"REG_d",4), %%xmm6 \n\t"
> +
> + "movdqa %%xmm0, %%xmm1 \n\t"
> + "movdqa %%xmm2, %%xmm3 \n\t"
> + "movdqa %%xmm4, %%xmm5 \n\t"
> + "movdqa %%xmm6, %%xmm7 \n\t"
> +
> + "pslld $1, %%xmm0 \n\t"
> + "pslld $1, %%xmm2 \n\t"
> + "pslld $1, %%xmm4 \n\t"
> + "pslld $1, %%xmm6 \n\t"
> +
> + "paddd %%xmm1, %%xmm0 \n\t"
> + "paddd %%xmm3, %%xmm2 \n\t"
> + "paddd %%xmm5, %%xmm4 \n\t"
> + "paddd %%xmm7, %%xmm6 \n\t"
> +
> + "psrad $1, %%xmm0 \n\t"
> + "psrad $1, %%xmm2 \n\t"
> + "psrad $1, %%xmm4 \n\t"
> + "psrad $1, %%xmm6 \n\t"
> +
> + "paddd (%%"REG_a",%%"REG_d",4), %%xmm0 \n\t"
> + "paddd 16(%%"REG_a",%%"REG_d",4), %%xmm2 \n\t"
> + "paddd 32(%%"REG_a",%%"REG_d",4), %%xmm4 \n\t"
> + "paddd 48(%%"REG_a",%%"REG_d",4), %%xmm6 \n\t"
> +
> + "movdqa %%xmm0, (%%"REG_a",%%"REG_d",4) \n\t"
> + "movdqa %%xmm2, 16(%%"REG_a",%%"REG_d",4) \n\t"
> + "movdqa %%xmm4, 32(%%"REG_a",%%"REG_d",4) \n\t"
> + "movdqa %%xmm6, 48(%%"REG_a",%%"REG_d",4) \n\t"
> +
> + "2: \n\t"
> + "sub $16, %%"REG_d" \n\t"
> + "jge 1b \n\t"
> + ::
> + "m"(b0),"m"(b1),"m"(b2),"m"(b3),"m"(b4),"m"(b5),"d"(end_w2):
> + "%"REG_a"","%"REG_b"","%"REG_c"");
this code is not valid: REG_d is changed but is neither an output nor on the clobber list
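minimal sketch of the fix, untested (making end_w2 an in/out operand shifts
the operand numbers, so %0..%5 in the body need renumbering accordingly):

    asm volatile (
        /* ... body as above, operands renumbered ... */
        : "+d"(end_w2)                       /* REG_d is modified */
        : "m"(b0), "m"(b1), "m"(b2), "m"(b3), "m"(b4), "m"(b5)
        : "%"REG_a"", "%"REG_b"", "%"REG_c"");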
[...]
> +static inline void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
> + int y, x;
> + DWTELEM * dst;
> + DWTELEM * * dst_array = sb->line + src_y;
> +
> + asm volatile(
> + "mov %5, %%ebx \n\t"
> + "mov %3, %%"REG_S" \n\t"
> + "pcmpeqd %%xmm4, %%xmm4 \n\t"
> + "pslld $31, %%xmm4 \n\t"
> + "pxor %%xmm7, %%xmm7 \n\t" /* 0 */
> + "psrld $24, %%xmm4 \n\t" /* FRAC_BITS >> 1 */
> +
> + "1: \n\t"
> + "movq (%%"REG_S"), %%xmm0 \n\t"
> + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> + "punpcklbw %%xmm7, %%xmm0 \n\t"
> + "movq 8(%%"REG_S"), %%xmm1 \n\t"
> + "punpcklbw %%xmm7, %%xmm1 \n\t"
> + "movq (%%"REG_d"), %%xmm5 \n\t"
> + "mov %1, %%"REG_D" \n\t"
> + "punpcklbw %%xmm7, %%xmm5 \n\t"
> + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> + "movq (%%"REG_d"), %%xmm6 \n\t"
> + "pmullw %%xmm0, %%xmm5 \n\t"
> + "punpcklbw %%xmm7, %%xmm6 \n\t"
> + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> + "mov (%%"REG_D"), %%"REG_D" \n\t"
> +
> + "movq 128(%%"REG_S"), %%xmm0 \n\t"
> + "pmullw %%xmm1, %%xmm6 \n\t"
> + "punpcklbw %%xmm7, %%xmm0 \n\t"
> + "movq 136(%%"REG_S"), %%xmm1 \n\t"
> + "add %2, %%"REG_D" \n\t"
> + "punpcklbw %%xmm7, %%xmm1 \n\t"
> + "movq (%%"REG_d"), %%xmm2 \n\t"
> + "punpcklbw %%xmm7, %%xmm2 \n\t"
> + "mov (%%"REG_a"), %%"REG_d" \n\t"
> + "paddusw %%xmm5, %%xmm6 \n\t"
> + "pmullw %%xmm0, %%xmm2 \n\t"
> + "movq (%%"REG_d"), %%xmm3 \n\t"
> + "mov %0, %%"REG_d" \n\t"
> + "punpcklbw %%xmm7, %%xmm3 \n\t"
> + "paddusw %%xmm2, %%xmm6 \n\t"
> + "pmullw %%xmm1, %%xmm3 \n\t"
> + "paddusw %%xmm3, %%xmm6 \n\t"
> +
> + "movdqa (%%"REG_D"), %%xmm3 \n\t"
> + "movdqa %%xmm6, %%xmm0 \n\t"
> + "movdqa 16(%%"REG_D"), %%xmm5 \n\t"
> + "punpckhwd %%xmm7, %%xmm6 \n\t"
> + "movq 24(%%"REG_S"), %%xmm1 \n\t"
> + "punpcklwd %%xmm7, %%xmm0 \n\t"
> + "paddd %%xmm0, %%xmm3 \n\t"
> + "paddd %%xmm6, %%xmm5 \n\t"
> + "punpcklbw %%xmm7, %%xmm1 \n\t"
> + "paddd %%xmm4, %%xmm3 \n\t"
> + "paddd %%xmm4, %%xmm5 \n\t"
> + "movq 16(%%"REG_S"), %%xmm0 \n\t"
> + "psrad $8, %%xmm3 \n\t" /* FRAC_BITS. */
> + "psrad $8, %%xmm5 \n\t" /* FRAC_BITS. */
> +
> + "packssdw %%xmm5, %%xmm3 \n\t"
> + "mov %1, %%"REG_D" \n\t"
> + "packuswb %%xmm7, %%xmm3 \n\t"
> +
> + "movq %%xmm3, (%%"REG_d") \n\t"
> +
> +
> + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> + "punpcklbw %%xmm7, %%xmm0 \n\t"
> + "movq (%%"REG_d",%%"REG_c"), %%xmm5; \n\t"
> + "punpcklbw %%xmm7, %%xmm5 \n\t"
> + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> + "movq (%%"REG_d",%%"REG_c"), %%xmm6; \n\t"
> + "pmullw %%xmm0, %%xmm5 \n\t"
> + "punpcklbw %%xmm7, %%xmm6 \n\t"
> +
> + "movq 144(%%"REG_S"), %%xmm0 \n\t"
> + "pmullw %%xmm1, %%xmm6 \n\t"
> + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> + "punpcklbw %%xmm7, %%xmm0 \n\t"
> + "movq 152(%%"REG_S"), %%xmm1 \n\t"
> + "punpcklbw %%xmm7, %%xmm1 \n\t"
> + "movq (%%"REG_d",%%"REG_c"), %%xmm2;\n\t"
> + "punpcklbw %%xmm7, %%xmm2 \n\t"
> + "mov (%%"REG_a"), %%"REG_d" \n\t"
> + "paddusw %%xmm5, %%xmm6 \n\t"
> + "pmullw %%xmm0, %%xmm2 \n\t"
> + "movq (%%"REG_d",%%"REG_c"), %%xmm3;\n\t"
> + "punpcklbw %%xmm7, %%xmm3 \n\t"
> + "paddusw %%xmm2, %%xmm6 \n\t"
> + "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
> + "pmullw %%xmm1, %%xmm3 \n\t"
> + "sal $1, %%"REG_c" \n\t"
> + "add %2, %%"REG_D" \n\t"
> + "paddusw %%xmm3, %%xmm6 \n\t"
> + "mov %0, %%"REG_d" \n\t"
> +
> + "movdqa (%%"REG_D"), %%xmm3 \n\t"
> + "movdqa %%xmm6, %%xmm0 \n\t"
> + "movdqa 16(%%"REG_D"), %%xmm5 \n\t"
> + "punpckhwd %%xmm7, %%xmm6 \n\t"
> + "punpcklwd %%xmm7, %%xmm0 \n\t"
> + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
> + "paddd %%xmm0, %%xmm3 \n\t"
> + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
> + "paddd %%xmm6, %%xmm5 \n\t"
> + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
> + "paddd %%xmm4, %%xmm3 \n\t"
> + "add %%"REG_c", (%%"REG_a") \n\t"
> + "paddd %%xmm4, %%xmm5 \n\t"
> + "psrad $8, %%xmm3 \n\t" /* FRAC_BITS. */
> + "add $"PTR_SIZE"*2, %1 \n\t"
> + "psrad $8, %%xmm5 \n\t" /* FRAC_BITS. */
> + "add $32, %%"REG_S" \n\t"
> +
> + "packssdw %%xmm5, %%xmm3 \n\t"
> + "add %%"REG_c", %0 \n\t"
> + "packuswb %%xmm7, %%xmm3 \n\t"
> +
> + "sar $1, %%"REG_c" \n\t"
> + "movq %%xmm3, (%%"REG_d",%%"REG_c");\n\t"
> +
> + "sub $2, %%"REG_b" \n\t"
> + "jnz 1b \n\t"
> + :
> + :
> + "m"(dst8),"m"(dst_array),"rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"(b_h),"c"(src_stride):
> + "%"REG_b"","%"REG_S"","%"REG_D"","%"REG_d"");
a minor issue: don't use ebx please, it causes PIC fanboys to flame us
and a major one: REG_c is changed but is neither an output nor on the clobber list
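untested sketch of both fixes (hypothetical local names; the loop counter gets
a compiler-chosen register instead of ebx, which is the GOT pointer under PIC,
and src_stride becomes an in/out operand because the code shifts it):

    long cnt    = b_h;          /* loop counter, was hardwired to ebx */
    long stride = src_stride;   /* shifted left and back in the loop  */
    asm volatile(
        /* ... body as above: %%ebx uses become %0,
           remaining operands renumbered ... */
        : "+r"(cnt), "+c"(stride)
        : "m"(dst8), "m"(dst_array), "rm"((long)(src_x<<2)),
          "m"(obmc), "a"(block)
        : "%"REG_S"", "%"REG_D"", "%"REG_d"");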
[...]
> +
> +static inline void inner_add_yblock_bw_16_obmc_32_mmx(uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
> + int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
> + int y, x;
> + DWTELEM * dst;
> + DWTELEM * * dst_array = sb->line + src_y;
> +
> + asm volatile(
> + "mov %5, %%ebx \n\t"
> + "mov %3, %%"REG_S" \n\t"
> + "pcmpeqd %%mm4, %%mm4 \n\t"
> + "pslld $31, %%mm4 \n\t"
> + "pxor %%mm7, %%mm7 \n\t" /* 0 */
> + "psrld $24, %%mm4 \n\t" /* FRAC_BITS >> 1 */
> +
> + "1: \n\t"
> + "movd (%%"REG_S"), %%mm0 \n\t"
> + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> + "punpcklbw %%mm7, %%mm0 \n\t"
> + "movd 16(%%"REG_S"), %%mm1 \n\t"
> + "punpcklbw %%mm7, %%mm1 \n\t"
> + "movd (%%"REG_d"), %%mm5 \n\t"
> + "mov %1, %%"REG_D" \n\t"
> + "punpcklbw %%mm7, %%mm5 \n\t"
> + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> + "movd (%%"REG_d"), %%mm6 \n\t"
> + "pmullw %%mm0, %%mm5 \n\t"
> + "punpcklbw %%mm7, %%mm6 \n\t"
> + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> +
> + "movd 512(%%"REG_S"), %%mm0 \n\t"
> + "pmullw %%mm1, %%mm6 \n\t"
> + "punpcklbw %%mm7, %%mm0 \n\t"
> + "movd 528(%%"REG_S"), %%mm1 \n\t"
> + "punpcklbw %%mm7, %%mm1 \n\t"
> + "movd (%%"REG_d"), %%mm2 \n\t"
> + "punpcklbw %%mm7, %%mm2 \n\t"
> + "mov (%%"REG_a"), %%"REG_d" \n\t"
> + "paddusw %%mm5, %%mm6 \n\t"
> + "mov (%%"REG_D"), %%"REG_D" \n\t"
> + "pmullw %%mm0, %%mm2 \n\t"
> + "movd (%%"REG_d"), %%mm3 \n\t"
> + "mov %0, %%"REG_d" \n\t"
> + "punpcklbw %%mm7, %%mm3 \n\t"
> + "add %2, %%"REG_D" \n\t"
> + "paddusw %%mm2, %%mm6 \n\t"
> + "pmullw %%mm1, %%mm3 \n\t"
> + "paddusw %%mm3, %%mm6 \n\t"
> +
> + "movq (%%"REG_D"), %%mm3 \n\t"
> + "movq %%mm6, %%mm0 \n\t"
> + "movq 8(%%"REG_D"), %%mm5 \n\t"
> + "punpckhwd %%mm7, %%mm6 \n\t"
> + "movd 20(%%"REG_S"), %%mm1 \n\t"
> + "punpcklwd %%mm7, %%mm0 \n\t"
> + "paddd %%mm0, %%mm3 \n\t"
> + "paddd %%mm6, %%mm5 \n\t"
> + "punpcklbw %%mm7, %%mm1 \n\t"
> + "paddd %%mm4, %%mm3 \n\t"
> + "paddd %%mm4, %%mm5 \n\t"
> + "movd 4(%%"REG_S"), %%mm0 \n\t"
> + "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
> + "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
> +
> + "packssdw %%mm5, %%mm3 \n\t"
> + "packuswb %%mm7, %%mm3 \n\t"
> +
> + "movd %%mm3, (%%"REG_d") \n\t"
> +
> +
> + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> + "punpcklbw %%mm7, %%mm0 \n\t"
> + "movd 4(%%"REG_d"), %%mm5 \n\t"
> + "punpcklbw %%mm7, %%mm5 \n\t"
> + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> + "movd 4(%%"REG_d"), %%mm6 \n\t"
> + "pmullw %%mm0, %%mm5 \n\t"
> + "punpcklbw %%mm7, %%mm6 \n\t"
> +
> + "movd 516(%%"REG_S"), %%mm0 \n\t"
> + "pmullw %%mm1, %%mm6 \n\t"
> + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> + "punpcklbw %%mm7, %%mm0 \n\t"
> + "movd 532(%%"REG_S"), %%mm1 \n\t"
> + "punpcklbw %%mm7, %%mm1 \n\t"
> + "movd 4(%%"REG_d"), %%mm2 \n\t"
> + "punpcklbw %%mm7, %%mm2 \n\t"
> + "mov (%%"REG_a"), %%"REG_d" \n\t"
> + "paddusw %%mm5, %%mm6 \n\t"
> + "pmullw %%mm0, %%mm2 \n\t"
> + "movd 4(%%"REG_d"), %%mm3 \n\t"
> + "punpcklbw %%mm7, %%mm3 \n\t"
> + "paddusw %%mm2, %%mm6 \n\t"
> + "pmullw %%mm1, %%mm3 \n\t"
> + "paddusw %%mm3, %%mm6 \n\t"
> + "mov %0, %%"REG_d" \n\t"
> +
> + "movq 16(%%"REG_D"), %%mm3 \n\t"
> + "movq %%mm6, %%mm0 \n\t"
> + "movq 24(%%"REG_D"), %%mm5 \n\t"
> + "punpckhwd %%mm7, %%mm6 \n\t"
> + "punpcklwd %%mm7, %%mm0 \n\t"
> + "paddd %%mm0, %%mm3 \n\t"
> + "paddd %%mm6, %%mm5 \n\t"
> + "paddd %%mm4, %%mm3 \n\t"
> + "paddd %%mm4, %%mm5 \n\t"
> + "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
> + "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
> +
> + "packssdw %%mm5, %%mm3 \n\t"
> + "packuswb %%mm7, %%mm3 \n\t"
> +
> + "movd %%mm3, 4(%%"REG_d") \n\t"
> +
> +
> +
> + "movd 8(%%"REG_S"), %%mm0 \n\t"
> + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> + "punpcklbw %%mm7, %%mm0 \n\t"
> + "movd 24(%%"REG_S"), %%mm1 \n\t"
> + "punpcklbw %%mm7, %%mm1 \n\t"
> + "movd 8(%%"REG_d"), %%mm5 \n\t"
> + "punpcklbw %%mm7, %%mm5 \n\t"
> + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> + "movd 8(%%"REG_d"), %%mm6 \n\t"
> + "pmullw %%mm0, %%mm5 \n\t"
> + "punpcklbw %%mm7, %%mm6 \n\t"
> + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> +
> + "movd 520(%%"REG_S"), %%mm0 \n\t"
> + "pmullw %%mm1, %%mm6 \n\t"
> + "punpcklbw %%mm7, %%mm0 \n\t"
> + "movd 536(%%"REG_S"), %%mm1 \n\t"
> + "punpcklbw %%mm7, %%mm1 \n\t"
> + "movd 8(%%"REG_d"), %%mm2 \n\t"
> + "punpcklbw %%mm7, %%mm2 \n\t"
> + "mov (%%"REG_a"), %%"REG_d" \n\t"
> + "paddusw %%mm5, %%mm6 \n\t"
> + "pmullw %%mm0, %%mm2 \n\t"
> + "movd 8(%%"REG_d"), %%mm3 \n\t"
> + "mov %0, %%"REG_d" \n\t"
> + "punpcklbw %%mm7, %%mm3 \n\t"
> + "paddusw %%mm2, %%mm6 \n\t"
> + "pmullw %%mm1, %%mm3 \n\t"
> + "paddusw %%mm3, %%mm6 \n\t"
> +
> + "movq 32(%%"REG_D"), %%mm3 \n\t"
> + "movq %%mm6, %%mm0 \n\t"
> + "movq 40(%%"REG_D"), %%mm5 \n\t"
> + "punpckhwd %%mm7, %%mm6 \n\t"
> + "movd 28(%%"REG_S"), %%mm1 \n\t"
> + "punpcklwd %%mm7, %%mm0 \n\t"
> + "paddd %%mm0, %%mm3 \n\t"
> + "paddd %%mm6, %%mm5 \n\t"
> + "punpcklbw %%mm7, %%mm1 \n\t"
> + "paddd %%mm4, %%mm3 \n\t"
> + "paddd %%mm4, %%mm5 \n\t"
> + "movd 12(%%"REG_S"), %%mm0 \n\t"
> + "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
> + "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
> +
> + "packssdw %%mm5, %%mm3 \n\t"
> + "packuswb %%mm7, %%mm3 \n\t"
> +
> + "movd %%mm3, 8(%%"REG_d") \n\t"
> +
> +
> + "mov "PTR_SIZE"*3(%%"REG_a"), %%"REG_d";\n\t"
> + "punpcklbw %%mm7, %%mm0 \n\t"
> + "movd 12(%%"REG_d"), %%mm5 \n\t"
> + "punpcklbw %%mm7, %%mm5 \n\t"
> + "mov "PTR_SIZE"*2(%%"REG_a"), %%"REG_d";\n\t"
> + "movd 12(%%"REG_d"), %%mm6 \n\t"
> + "pmullw %%mm0, %%mm5 \n\t"
> + "punpcklbw %%mm7, %%mm6 \n\t"
> +
> + "movd 524(%%"REG_S"), %%mm0 \n\t"
> + "pmullw %%mm1, %%mm6 \n\t"
> + "mov "PTR_SIZE"*1(%%"REG_a"), %%"REG_d";\n\t"
> + "punpcklbw %%mm7, %%mm0 \n\t"
> + "movd 540(%%"REG_S"), %%mm1 \n\t"
> + "punpcklbw %%mm7, %%mm1 \n\t"
> + "movd 12(%%"REG_d"), %%mm2 \n\t"
> + "punpcklbw %%mm7, %%mm2 \n\t"
> + "mov (%%"REG_a"), %%"REG_d" \n\t"
> + "paddusw %%mm5, %%mm6 \n\t"
> + "pmullw %%mm0, %%mm2 \n\t"
> + "movd 12(%%"REG_d"), %%mm3 \n\t"
> + "punpcklbw %%mm7, %%mm3 \n\t"
> + "paddusw %%mm2, %%mm6 \n\t"
> + "pmullw %%mm1, %%mm3 \n\t"
> + "paddusw %%mm3, %%mm6 \n\t"
> + "mov %0, %%"REG_d" \n\t"
> +
> + "movq 48(%%"REG_D"), %%mm3 \n\t"
> + "movq %%mm6, %%mm0 \n\t"
> + "movq 56(%%"REG_D"), %%mm5 \n\t"
> + "punpckhwd %%mm7, %%mm6 \n\t"
> + "punpcklwd %%mm7, %%mm0 \n\t"
> + "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"
> + "paddd %%mm0, %%mm3 \n\t"
> + "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"
> + "paddd %%mm6, %%mm5 \n\t"
> + "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"
> + "paddd %%mm4, %%mm3 \n\t"
> + "add %%"REG_c", (%%"REG_a") \n\t"
> + "paddd %%mm4, %%mm5 \n\t"
> + "psrad $8, %%mm3 \n\t" /* FRAC_BITS. */
> + "add $"PTR_SIZE"*1, %1 \n\t"
> + "psrad $8, %%mm5 \n\t" /* FRAC_BITS. */
> + "add $32, %%"REG_S" \n\t"
> +
> + "packssdw %%mm5, %%mm3 \n\t"
> + "add %%"REG_c", %0 \n\t"
> + "packuswb %%mm7, %%mm3 \n\t"
> +
> + "movd %%mm3, 12(%%"REG_d") \n\t"
> +
> + "dec %%"REG_b" \n\t"
> + "jnz 1b \n\t"
> + "emms \t\t"
is the emms here really needed?
[...]
--
Michael