[FFmpeg-devel] [PATCH v4] SH4 mpegaudio decoder optimizations

Wed Feb 2 09:10:08 CET 2011

Thanks for the review

On Tue, 1 Feb 2011, M?ns Rullg?rd wrote:

> Guennadi Liakhovetski <g.liakhovetski at gmx.de> writes:
> 
> > Hi all
> >
> > This is version 4 of the following patch:
> >
> > This patch implements several mpegaudio optimizations for SH4 FPU-enabled 
> > SoCs. Verified to provide more than 45% acceleration, when decoding a 
> > 128kbps stereo mp3 audio file to /dev/null.
> > ---
> >
> > Yes, I know, the .S file is all indented with "evil" TABs... Please, let's 
> > review it this way, I'll convert it to spaces once accepted. Would have 
> > been too much trouble for me to develop with spaces...
> >
> > v4: switched to a out-of-line .S assembly implementation
> >
> > v3, v2, v1: were inline, changelog irrelevant
> >
> > Thanks to all who commented!
> >
> > diff --git a/libavcodec/mpc.c b/libavcodec/mpc.c
> > index d9a1fb7..33b1692 100644
> > --- a/libavcodec/mpc.c
> > +++ b/libavcodec/mpc.c
> > @@ -51,7 +51,7 @@ static void mpc_synth(MPCContext *c, int16_t *out, int channels)
> >      for(ch = 0;  ch < channels; ch++){
> >          samples_ptr = samples + ch;
> >          for(i = 0; i < SAMPLES_PER_BAND; i++) {
> > -            ff_mpa_synth_filter(c->synth_buf[ch], &(c->synth_buf_offset[ch]),
> > +            ff_mpa_synth_filter(NULL, c->synth_buf[ch], &(c->synth_buf_offset[ch]),
> >                                  ff_mpa_synth_window, &dither_state,
> >                                  samples_ptr, channels,
> >                                  c->sb_samples[ch][i]);
> 
> I want to rework the C plumbing of these functions in general.  Review
> of those parts omitted.

So, this means, until then I'kk just have to wait?

> 
> > diff --git a/libavcodec/sh4/dsputil_sh4.c b/libavcodec/sh4/dsputil_sh4.c
> > index ec06e24..37efdf0 100644
> > --- a/libavcodec/sh4/dsputil_sh4.c
> > +++ b/libavcodec/sh4/dsputil_sh4.c
> > @@ -20,8 +20,11 @@
> >   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> >   */
> >
> > +#undef CONFIG_FLOAT
> 
> What purpose does that serve?

Documentation mainly, to show, that there is a dependencie in one of 
headers included below on this macro. Can remove if unwanted.

> >  #include "libavcodec/avcodec.h"
> >  #include "libavcodec/dsputil.h"
> > +#include "libavcodec/mpegaudio.h"
> >  #include "dsputil_sh4.h"
> >  #include "sh4.h"
> >
> > @@ -102,3 +105,32 @@ void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx)
> >                  c->idct_permutation_type= FF_NO_IDCT_PERM;
> >          }
> >  }
> > +
> > +void mp3_win_loop_sh4(int64_t *sum, MPA_INT *synth_buf, MPA_INT *w,
> > +		      OUT_INT *samples, int incr);
> > +int round_sample_sh4(int64_t *sum);
> > +void sum8_macs_sh4(int64_t *sum, int32_t *p, int32_t *w);
> > +
> > +static void apply_window_mp3_sh4(MPA_INT *synth_buf, MPA_INT *window,
> > +                                 int *dither_state, OUT_INT *samples, int incr)
> > +{
> > +    int64_t sum, sum2 = 0;
> > +
> > +    /* copy to avoid wrap */
> > +    memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
> > +
> > +    sum = *dither_state;
> > +    sum8_macs_sh4(&sum, synth_buf + 16, window);
> > +    sum8_macs_sh4(&sum2, synth_buf + 48, window + 32);
> > +    sum -= sum2;
> > +    *samples = round_sample_sh4(&sum);
> > +
> > +    mp3_win_loop_sh4(&sum, synth_buf + 32, window + 1, samples + incr, incr);
> > +
> > +    *dither_state = sum;
> > +}
> 
> Why are you doing this in C?  Doing it all in asm doesn't seem very hard.

Keeping dither_state in C avoids adding one more parameter to asm, the 4 
lines above the function - yes, could be moved to asm too, but I don't 
expect a huge improvement from that.

> > +void ff_mpegaudiodec_init_sh4(MPADecodeContext *s)
> > +{
> > +    s->apply_window_mp3 = apply_window_mp3_sh4;
> > +}
> > diff --git a/libavcodec/sh4/mpegaudiodec_sh4.S b/libavcodec/sh4/mpegaudiodec_sh4.S
> > new file mode 100644
> > index 0000000..ea3b95c
> > --- /dev/null
> > +++ b/libavcodec/sh4/mpegaudiodec_sh4.S
> > @@ -0,0 +1,244 @@
> > +#define MAXSHORT (1 << 15) - 1
> > +#define MINSHORT -(1 << 15)
> > +
> > +#define FRAC_BITS   23   /* fractional bits for sb_samples and dct */
> > +#define WFRAC_BITS  16   /* fractional bits for window */
> > +
> > +#define OUT_MAX MAXSHORT
> > +#define OUT_MIN MINSHORT
> > +#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
> 
> Something should be done to make sure these match the C code.  That
> might mean splitting some headers.

Yes, that would be cleaner, sure. Should I propose a patch for that or 
would you do it?

> > +	.macro sh_macs, wp, pp, tmp
> > +	add	\tmp, \wp
> > +	add	\tmp, \pp
> > +	mac.l	@\wp+, @\pp+
> > +	.endm
> > +
> > +	.macro sh_sum8_macs, sum, wp, pp, tmp
> > +	mov.l	@\sum+, \tmp
> > +	lds	\tmp, macl
> > +	mov.l	@\sum+, \tmp
> > +	lds	\tmp, mach
> 
> You could use the lds.l @reg+, mach/l instruction here,
> 
> > +	mov	#126, \tmp
> > +	shll	\tmp			/* 63 * 4 */
> > +	mac.l	@\wp+, @\pp+		/* 0 */
> > +	sh_macs	\wp, \pp, \tmp		/* 1 */
> > +	sh_macs	\wp, \pp, \tmp		/* 2 */
> > +	sh_macs	\wp, \pp, \tmp		/* 3 */
> > +	sh_macs	\wp, \pp, \tmp		/* 4 */
> > +	sh_macs	\wp, \pp, \tmp		/* 5 */
> > +	sh_macs	\wp, \pp, \tmp		/* 6 */
> > +	sh_macs	\wp, \pp, \tmp		/* 7 */
> > +	sts	mach, \tmp
> > +	mov.l	\tmp, @-\sum
> > +	sts	macl, \tmp
> > +	mov.l	\tmp, @-\sum
> 
> and sts.l mach/l, @-reg here.

Nice, I missed those two, thanks!

> 
> > +	.endm
> 
> You could probably improve pipelining using two pointer pairs,
> doubling the stride, and interleaving things a bit.  I don't remember
> the scheduling details, and they vary between implementations.

The sh_macs macro consists if two additions and one multiplication, both 
these commands belong to the "EX" instruction group and thus cannot be 
executed in parallel.

> > +	/* void sum8_macs(int64_t *sum, int32_t *p, int32_t *w) */
> > +
> > +	.globl sum8_macs_sh4
> > +	.balign 4
> > +	.type   sum8_macs_sh4, @function
> > +sum8_macs_sh4:
> > +	sh_sum8_macs	r4, r6, r5, r3
> > +	rts
> > +	nop
> > +	.size sum8_macs_sh4, .-sum8_macs_sh4
> > +
> > +	.macro round_sample, sum, tmp1, tmp2
> > +	mov.l	@\sum, \tmp1
> > +	mov.l	m23, \tmp2
> > +	and	\tmp1, \tmp2		/* hi &= (1 << 24) - 1 */
> > +	mov.l	\tmp2, @\sum
> > +	shlr16	\tmp1
> > +	shlr8	\tmp1			/* lo >>= 24 */
> > +	mov.l	@(4, \sum), \tmp2
> > +	shll8	\tmp2			/* hi <<= 8 */
> > +	or	\tmp2, \tmp1		/* lo |= hi */
> > +	mov	#0, \tmp2
> > +	mov.l	\tmp2, @(4, \sum)
> > +	/* av_clip */
> > +	mov.l	max, \tmp2
> > +	cmp/ge	\tmp2, \tmp1
> > +	bt	1f
> > +	mov.l	min, \tmp2
> > +	cmp/gt	\tmp2, \tmp1
> > +	bt	2f
> > +1:	mov	\tmp2, \tmp1
> > +2:	
> 
> These branches could make use of the delay slot.

Don't immediately see how - they depend on previous results.

> > +	.endm
> > +
> > +	/* int round_sample_asm(int64_t *sum) */
> > +
> > +	.globl round_sample_sh4
> > +	.balign 4
> > +	.type   round_sample_sh4, @function
> > +round_sample_sh4:
> > +	round_sample r4, r0, r1
> > +	rts
> > +	nop
> > +	.size round_sample_sh4, .-round_sample_sh4
> > +
> > +	.balign 4
> > +
> > +	/* void mp3_win_loop_sh4(&sum, synth_buf + 32, window + 1, samples + incr, incr) */
> > +	.globl mp3_win_loop_sh4
> > +	.balign 4
> > +	.type   mp3_win_loop_sh4, @function
> > +mp3_win_loop_sh4:
> 
> You might want to adapt some of the macros from arm/asm.S.

You mean stuff like their "function" macro? Well, sorry, I think, I don't 
have too many of them to really benefit from those and, IMHO, if anything, 
they would make the code more obscured in my case.

> > +	mov	r6, r0
> > +	add	#120, r0	/* w2 = w + 30 (* 4) */
> > +	mov	#60, r1		/* j = r1 - loop counter: 15 * 4 */
> > +
> > +	mov.l	@r15, r2	/* r2 = incr */
> > +	shll	r2		/* r2 *= 2 */
> > +
> > +	mov.l	r8, @-r15
> > +	mov.l	r9, @-r15
> > +	mov.l	r10, @-r15
> > +	mov.l	r11, @-r15
> > +	sts.l	pr, @-r15
> > +
> > +	mov	#30, r11
> > +	muls.w	r2, r11
> > +	sts	macl, r11
> > +	add	r7, r11		/* samples2 = samples + 30 * incr (* 2) */
> > +
> > +4:	sub	r1, r5		/* p = synth_buf - j (* 4) */
> > +
> > +	/* We'll use stack for 64-bit variables:
> > +	 * tmp = -sum2 */
> > +
> > +	/* SUM8P2_MACS_MLSS(sum, sum2, w, w2, p) */
> > +
> > +	mov	r5, r8
> > +	mov	r6, r9
> > +	sh_sum8_macs r4, r6, r5, r3	/* sum, window, pp, tmp */
> > +	mov	r8, r5		/* restore p */
> > +	mov	r9, r6		/* restore w */
> > +
> > +	mov	#0, r10
> > +	mov.l	r10, @-r15	/* sum2 = 0 */
> > +	mov.l	r10, @-r15
> > +	mov	r15, r10	/* r10 = &sum2 */
> > +
> > +	mov	r0, r9
> > +	sh_sum8_macs r10, r0, r5, r3	/* sum2 is now at the top of the stack */
> > +	mov	r9, r0		/* restore w2 */
> > +	mov	r8, r5		/* restore p */
> > +
> > +	/* SUM8P2_MLSS_MLSS(sum, sum2, w + 32, w2 + 32, p) */
> > +
> > +	add	r1, r5
> > +	add	r1, r5		/* p += 2 * j (* 4) */
> > +
> > +	mov	r5, r8		/* save r5 */
> > +	mov	r6, r9		/* save r6 before incrementing */
> > +
> > +	add	#64, r6
> > +	add	#64, r6		/* w += 32 (* 4): 128 < 0 in 8-bit arith. */
> > +	mov	#0, r10
> > +	mov.l	r10, @-r15	/* int64_t tmp = 0 */
> > +	mov.l	r10, @-r15
> > +
> > +	mov	r15, r10	/* r10 = &tmp */
> > +
> > +	sh_sum8_macs r10, r6, r5, r3	/* tmp is now at the top of the stack */
> > +	mov	r9, r6		/* restore w */
> > +	mov	r8, r5		/* restore p */
> 
> Why do you keep the sum on the stack?  Are there not enough registers?

Well, I think, I would need the same number of regsters actually, I 
wouldn't need the \tmp then. I could try to switch, yes.

> > +	clrt
> > +
> > +	/* sum -= tmp - 64-bit signed */
> > +	mov.l	@r15+, r8	/* low 32-bits of the tmp-sum */
> > +	mov.l	@r4, r10	/* low 32-bits of the sum */
> > +	subc	r8, r10
> > +	mov.l	r10, @r4
> > +	mov.l	@r15+, r8	/* high 32-bits of the tmp-sum */
> > +	mov.l	@(4, r4), r10	/* high 32-bits of the sum */
> > +	subc	r8, r10
> > +	mov.l	r10, @(4, r4)
> > +
> > +	mov	r5, r8
> > +	mov	r0, r10
> > +	add	#64, r0
> > +	add	#64, r0		/* w2 += 32 (* 4): 128 < 0 in 8-bit arith. */
> > +
> > +	mov	r15, r9		/* r9 = &sum2 */
> > +	sh_sum8_macs r9, r0, r5, r3	/* tmp = -sum2 is now at the top of the stack */
> > +	mov	r8, r5		/* restore p */
> > +
> > +	round_sample r4, r0, r3
> > +
> > +	mov.w	r0, @r7		/* *samples = round_samples() */
> > +	add	r2, r7		/* samples += incr (* 2) */
> > +
> > +	clrt
> > +
> > +	mov.l	@r15+, r0	/* low 32-bits */
> > +	mov.l	@r4, r3
> > +	subc	r0, r3
> > +	mov.l	r3, @r4
> > +	mov.l	@r15+, r0	/* high 32-bits */
> > +	mov.l	@(4, r4), r3
> > +	subc	r0, r3
> > +
> > +	mov.l	r3, @(4, r4)
> > +	round_sample r4, r0, r3
> > +
> > +	mov.w	r0, @r11	/* *samples2 = round_samples() */
> > +	sub	r2, r11		/* samples2 -= incr (* 2) */
> > +
> > +	sub	r1, r5		/* synth_buf = p - j */
> > +	mov	r10, r0		/* restore w2 */
> > +	add	#-4, r0		/* w2-- */
> > +
> > +	add	#-4, r1
> > +	cmp/pl	r1
> > +	bf	5f
> > +	bra	4b
> > +	/* CAREFUL: The below w++ is still in the loop - it's in the delay window */
> > +5:	add	#4, r6		/* w++ */
> 
> Most of this could be scheduled better.

I'm 100% certain my asm is not state of the art, sorry, it is not my 
primary programming language, so, I wouldn't want to spend days trying to 
make this perfect. I think, we should make reasonably good and commit, and 
any improvements can come afterwards.

> > +
> > +	mov #0, r10 mov.l r10, @-r15 /* int64_t tmp = 0 */ mov.l r10,
> > + at -r15 mov r15, r10 /* r10 = &tmp */
> > +
> > +	add	#64, r6
> > +	add	#64, r6		/* w += 32 (* 4): 128 < 0 in 8-bit arith. */
> > +
> > +	mov	r15, r10	/* r10 = &tmp */
> > +	sh_sum8_macs r10, r6, r5, r3	/* tmp = -sum2 is now at the top of the stack */
> > +
> > +	clrt
> > +
> > +	/* sum -= tmp - 64-bit signed */
> > +	mov.l	@r15+, r8	/* low 32-bits of the tmp-sum */
> > +	mov.l	@r4, r9		/* low 32-bits of the sum */
> > +	subc	r8, r9
> > +	mov.l	r9, @r4
> > +	mov.l	@r15+, r8	/* high 32-bits of the tmp-sum */
> > +	mov.l	@(4, r4), r9	/* high 32-bits of the sum */
> > +	subc	r8, r9
> > +
> > +	mov.l	r9, @(4, r4)
> > +	round_sample r4, r0, r1
> > +
> > +	mov.w	r0, @r7		/* *samples = round_samples() */
> > +
> > +	lds.l	@r15+, pr
> > +	mov.l	@r15+, r11
> > +	mov.l	@r15+, r10
> > +	mov.l	@r15+, r9
> > +
> > +	rts
> > +	mov.l	@r15+, r8	/* delay window */
> > +
> > +	.size mp3_win_loop_sh4, .-mp3_win_loop_sh4
> > +
> > +	.align 4
> > +max:	.long	OUT_MAX
> > +min:	.long	OUT_MIN
> > +m23:	.long	(1 << OUT_SHIFT) - 1
> >
> 
> -- 
> M?ns Rullg?rd
> mans at mansr.com

Thanks
Guennadi
---
Guennadi Liakhovetski, Ph.D.
Freelance Open-Source Software Developer
http://www.open-technology.de/