[FFmpeg-devel] [PATCH] M68K: Optimized MUL64/MULH/MULL functions for 68060

Sat Aug 1 23:21:34 CEST 2009

ami_stuff <ami_stuff at o2.pl> writes:

> 35% faster MP3 decoding on 68060.
>
> /*
>  * Copyright (c) 2009 Matthew Hey ( matthey7 gmail com )
>  * Copyright (c) 2009 Piotr Bandurski ( ami_stuff o2 pl )
>  *
>  * This file is part of FFmpeg.
>  *
>  * FFmpeg is free software; you can redistribute it and/or
>  * modify it under the terms of the GNU Lesser General Public
>  * License as published by the Free Software Foundation; either
>  * version 2.1 of the License, or (at your option) any later version.
>  *
>  * FFmpeg is distributed in the hope that it will be useful,
>  * but WITHOUT ANY WARRANTY; without even the implied warranty of
>  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>  * Lesser General Public License for more details.
>  *
>  * You should have received a copy of the GNU Lesser General Public
>  * License along with FFmpeg; if not, write to the Free Software
>  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>  */
>
> #ifndef AVCODEC_M68K_MATHOPS_H
> #define AVCODEC_M68K_MATHOPS_H
>
> #if defined (__mc68060__)
>
> #define MULH MULH
> static inline av_const int MULH(int a, int b)
> {
>     int lo, hi;
>     __asm__(
>     "move.l %0, d5      \n\t"
>     "move.l %0, d4      \n\t"
>     "bge.b  0f          \n\t"
>     "neg.l  %0          \n\t"
>     "neg.l  d4          \n\t"
>     "0:                 \n\t"
>     "eor.l  %1, d5      \n\t"
>     "move.l %1, d3      \n\t"
>     "bge.b  1f          \n\t"
>     "neg.l  %1          \n\t"
>     "neg.l  d3          \n\t"
>     "1:                 \n\t"
>     "move.w #16, d5     \n\t"
>     "move.l %0, d2      \n\t"
>     "mulu.w %1,%0       \n\t"
>     "lsr.l  d5, d3      \n\t"
>     "lsr.l  d5, d4      \n\t"
>     "mulu.w d3, d2      \n\t"
>     "mulu.w d4, %1      \n\t"
>     "mulu.w d4, d3      \n\t"
>     "move.l d2, d4      \n\t"
>     "lsr.l  d5, d2      \n\t"
>     "add.w  %1, d4      \n\t"
>     "addx.l d2, d3      \n\t"
>     "lsl.l  d5, d4      \n\t"
>     "lsr.l  d5, %1      \n\t"
>     "add.l  d4, %0      \n\t"
>     "addx.l d3, %1      \n\t"
>     "tst.l  d5          \n\t"
>     "bpl.b  2f          \n\t"
>     "neg.l  %0          \n\t"
>     "negx.l %1          \n\t"
>     "2:                 \n\t"
>     :"=d"(lo), "=d"(hi)

Those should be marked early-clobber (&).

>     :"0"(a), "1"(b)

Do these have to be the same regs?  Allowing different registers
theoretically gives the compiler better room for optimal register
allocation.  On the other hand, it gives the compiler more room to
mess up.

>     :"d2", "d3", "d4", "d5");

Avoid using hardcoded registers, and prefer explicitly declared temp
variables.

>     return hi;
> }

Out of interest, what does gcc do when left to its own devices?

> #define MUL64 MUL64
> static inline av_const int64_t MUL64(int a, int b)
> {
>     union { uint64_t x; unsigned hl[2]; } x;
>     __asm__(
>     "move.l %0, d5      \n\t"
>     "move.l %0, d4      \n\t"
>     "bge.b  0f          \n\t"
>     "neg.l  %0          \n\t"
>     "neg.l  d4          \n\t"
>     "0:                 \n\t"
>     "eor.l  %1, d5      \n\t"
>     "move.l %1, d3      \n\t"
>     "bge.b  1f          \n\t"
>     "neg.l  %1          \n\t"
>     "neg.l  d3          \n\t"
>     "1:                 \n\t"
>     "move.w #16, d5     \n\t"
>     "move.l %0, d2      \n\t"
>     "mulu.w %1,%0       \n\t"
>     "lsr.l  d5, d3      \n\t"
>     "lsr.l  d5, d4      \n\t"
>     "mulu.w d3, d2      \n\t"
>     "mulu.w d4, %1      \n\t"
>     "mulu.w d4, d3      \n\t"
>     "move.l d2, d4      \n\t"
>     "lsr.l  d5, d2      \n\t"
>     "add.w  %1, d4      \n\t"
>     "addx.l d2, d3      \n\t"
>     "lsl.l  d5, d4      \n\t"
>     "lsr.l  d5, %1      \n\t"
>     "add.l  d4, %0      \n\t"
>     "addx.l d3, %1      \n\t"
>     "tst.l  d5          \n\t"
>     "bpl.b  2f          \n\t"
>     "neg.l  %0          \n\t"
>     "negx.l %1          \n\t"
>     "2:                 \n\t"
>     :"=d"(x.hl[1]), "=d"(x.hl[0])
>     :"0"(a), "1"(b)
>     :"d2", "d3", "d4", "d5");
>     return x.x;
> }

Same comments as above.

> #define MULL(a,b,s)	(MUL64(a, b) >> s)

Can gcc really be trusted with this?

-- 
Måns Rullgård
mans at mansr.com