[FFmpeg-devel] [PATCH v4] SH4 mpegaudio decoder optimizations

Tue Feb 1 20:26:11 CET 2011

Guennadi Liakhovetski <g.liakhovetski at gmx.de> writes:

> Hi all
>
> This is version 4 of the following patch:
>
> This patch implements several mpegaudio optimizations for SH4 FPU-enabled 
> SoCs. Verified to provide more than 45% acceleration, when decoding a 
> 128kbps stereo mp3 audio file to /dev/null.
> ---
>
> Yes, I know, the .S file is all indented with "evil" TABs... Please, let's 
> review it this way, I'll convert it to spaces once accepted. Would have 
> been too much trouble for me to develop with spaces...
>
> v4: switched to a out-of-line .S assembly implementation
>
> v3, v2, v1: were inline, changelog irrelevant
>
> Thanks to all who commented!
>
> diff --git a/libavcodec/mpc.c b/libavcodec/mpc.c
> index d9a1fb7..33b1692 100644
> --- a/libavcodec/mpc.c
> +++ b/libavcodec/mpc.c
> @@ -51,7 +51,7 @@ static void mpc_synth(MPCContext *c, int16_t *out, int channels)
>      for(ch = 0;  ch < channels; ch++){
>          samples_ptr = samples + ch;
>          for(i = 0; i < SAMPLES_PER_BAND; i++) {
> -            ff_mpa_synth_filter(c->synth_buf[ch], &(c->synth_buf_offset[ch]),
> +            ff_mpa_synth_filter(NULL, c->synth_buf[ch], &(c->synth_buf_offset[ch]),
>                                  ff_mpa_synth_window, &dither_state,
>                                  samples_ptr, channels,
>                                  c->sb_samples[ch][i]);

I want to rework the C plumbing of these functions in general.  Review
of those parts omitted.

> diff --git a/libavcodec/sh4/dsputil_sh4.c b/libavcodec/sh4/dsputil_sh4.c
> index ec06e24..37efdf0 100644
> --- a/libavcodec/sh4/dsputil_sh4.c
> +++ b/libavcodec/sh4/dsputil_sh4.c
> @@ -20,8 +20,11 @@
>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>   */
>
> +#undef CONFIG_FLOAT

What purpose does that serve?

>  #include "libavcodec/avcodec.h"
>  #include "libavcodec/dsputil.h"
> +#include "libavcodec/mpegaudio.h"
>  #include "dsputil_sh4.h"
>  #include "sh4.h"
>
> @@ -102,3 +105,32 @@ void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx)
>                  c->idct_permutation_type= FF_NO_IDCT_PERM;
>          }
>  }
> +
> +void mp3_win_loop_sh4(int64_t *sum, MPA_INT *synth_buf, MPA_INT *w,
> +		      OUT_INT *samples, int incr);
> +int round_sample_sh4(int64_t *sum);
> +void sum8_macs_sh4(int64_t *sum, int32_t *p, int32_t *w);
> +
> +static void apply_window_mp3_sh4(MPA_INT *synth_buf, MPA_INT *window,
> +                                 int *dither_state, OUT_INT *samples, int incr)
> +{
> +    int64_t sum, sum2 = 0;
> +
> +    /* copy to avoid wrap */
> +    memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
> +
> +    sum = *dither_state;
> +    sum8_macs_sh4(&sum, synth_buf + 16, window);
> +    sum8_macs_sh4(&sum2, synth_buf + 48, window + 32);
> +    sum -= sum2;
> +    *samples = round_sample_sh4(&sum);
> +
> +    mp3_win_loop_sh4(&sum, synth_buf + 32, window + 1, samples + incr, incr);
> +
> +    *dither_state = sum;
> +}

Why are you doing this in C?  Doing it all in asm doesn't seem very hard.

> +void ff_mpegaudiodec_init_sh4(MPADecodeContext *s)
> +{
> +    s->apply_window_mp3 = apply_window_mp3_sh4;
> +}
> diff --git a/libavcodec/sh4/mpegaudiodec_sh4.S b/libavcodec/sh4/mpegaudiodec_sh4.S
> new file mode 100644
> index 0000000..ea3b95c
> --- /dev/null
> +++ b/libavcodec/sh4/mpegaudiodec_sh4.S
> @@ -0,0 +1,244 @@
> +#define MAXSHORT (1 << 15) - 1
> +#define MINSHORT -(1 << 15)
> +
> +#define FRAC_BITS   23   /* fractional bits for sb_samples and dct */
> +#define WFRAC_BITS  16   /* fractional bits for window */
> +
> +#define OUT_MAX MAXSHORT
> +#define OUT_MIN MINSHORT
> +#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)

Something should be done to make sure these match the C code.  That
might mean splitting some headers.

> +	.macro sh_macs, wp, pp, tmp
> +	add	\tmp, \wp
> +	add	\tmp, \pp
> +	mac.l	@\wp+, @\pp+
> +	.endm
> +
> +	.macro sh_sum8_macs, sum, wp, pp, tmp
> +	mov.l	@\sum+, \tmp
> +	lds	\tmp, macl
> +	mov.l	@\sum+, \tmp
> +	lds	\tmp, mach

You could use the lds.l @reg+, mach/l instruction here,

> +	mov	#126, \tmp
> +	shll	\tmp			/* 63 * 4 */
> +	mac.l	@\wp+, @\pp+		/* 0 */
> +	sh_macs	\wp, \pp, \tmp		/* 1 */
> +	sh_macs	\wp, \pp, \tmp		/* 2 */
> +	sh_macs	\wp, \pp, \tmp		/* 3 */
> +	sh_macs	\wp, \pp, \tmp		/* 4 */
> +	sh_macs	\wp, \pp, \tmp		/* 5 */
> +	sh_macs	\wp, \pp, \tmp		/* 6 */
> +	sh_macs	\wp, \pp, \tmp		/* 7 */
> +	sts	mach, \tmp
> +	mov.l	\tmp, @-\sum
> +	sts	macl, \tmp
> +	mov.l	\tmp, @-\sum

and sts.l mach/l, @-reg here.

> +	.endm

You could probably improve pipelining using two pointer pairs,
doubling the stride, and interleaving things a bit.  I don't remember
the scheduling details, and they vary between implementations.

> +	/* void sum8_macs(int64_t *sum, int32_t *p, int32_t *w) */
> +
> +	.globl sum8_macs_sh4
> +	.balign 4
> +	.type   sum8_macs_sh4, @function
> +sum8_macs_sh4:
> +	sh_sum8_macs	r4, r6, r5, r3
> +	rts
> +	nop
> +	.size sum8_macs_sh4, .-sum8_macs_sh4
> +
> +	.macro round_sample, sum, tmp1, tmp2
> +	mov.l	@\sum, \tmp1
> +	mov.l	m23, \tmp2
> +	and	\tmp1, \tmp2		/* hi &= (1 << 24) - 1 */
> +	mov.l	\tmp2, @\sum
> +	shlr16	\tmp1
> +	shlr8	\tmp1			/* lo >>= 24 */
> +	mov.l	@(4, \sum), \tmp2
> +	shll8	\tmp2			/* hi <<= 8 */
> +	or	\tmp2, \tmp1		/* lo |= hi */
> +	mov	#0, \tmp2
> +	mov.l	\tmp2, @(4, \sum)
> +	/* av_clip */
> +	mov.l	max, \tmp2
> +	cmp/ge	\tmp2, \tmp1
> +	bt	1f
> +	mov.l	min, \tmp2
> +	cmp/gt	\tmp2, \tmp1
> +	bt	2f
> +1:	mov	\tmp2, \tmp1
> +2:	

These branches could make use of the delay slot.

> +	.endm
> +
> +	/* int round_sample_asm(int64_t *sum) */
> +
> +	.globl round_sample_sh4
> +	.balign 4
> +	.type   round_sample_sh4, @function
> +round_sample_sh4:
> +	round_sample r4, r0, r1
> +	rts
> +	nop
> +	.size round_sample_sh4, .-round_sample_sh4
> +
> +	.balign 4
> +
> +	/* void mp3_win_loop_sh4(&sum, synth_buf + 32, window + 1, samples + incr, incr) */
> +	.globl mp3_win_loop_sh4
> +	.balign 4
> +	.type   mp3_win_loop_sh4, @function
> +mp3_win_loop_sh4:

You might want to adapt some of the macros from arm/asm.S.

> +	mov	r6, r0
> +	add	#120, r0	/* w2 = w + 30 (* 4) */
> +	mov	#60, r1		/* j = r1 - loop counter: 15 * 4 */
> +
> +	mov.l	@r15, r2	/* r2 = incr */
> +	shll	r2		/* r2 *= 2 */
> +
> +	mov.l	r8, @-r15
> +	mov.l	r9, @-r15
> +	mov.l	r10, @-r15
> +	mov.l	r11, @-r15
> +	sts.l	pr, @-r15
> +
> +	mov	#30, r11
> +	muls.w	r2, r11
> +	sts	macl, r11
> +	add	r7, r11		/* samples2 = samples + 30 * incr (* 2) */
> +
> +4:	sub	r1, r5		/* p = synth_buf - j (* 4) */
> +
> +	/* We'll use stack for 64-bit variables:
> +	 * tmp = -sum2 */
> +
> +	/* SUM8P2_MACS_MLSS(sum, sum2, w, w2, p) */
> +
> +	mov	r5, r8
> +	mov	r6, r9
> +	sh_sum8_macs r4, r6, r5, r3	/* sum, window, pp, tmp */
> +	mov	r8, r5		/* restore p */
> +	mov	r9, r6		/* restore w */
> +
> +	mov	#0, r10
> +	mov.l	r10, @-r15	/* sum2 = 0 */
> +	mov.l	r10, @-r15
> +	mov	r15, r10	/* r10 = &sum2 */
> +
> +	mov	r0, r9
> +	sh_sum8_macs r10, r0, r5, r3	/* sum2 is now at the top of the stack */
> +	mov	r9, r0		/* restore w2 */
> +	mov	r8, r5		/* restore p */
> +
> +	/* SUM8P2_MLSS_MLSS(sum, sum2, w + 32, w2 + 32, p) */
> +
> +	add	r1, r5
> +	add	r1, r5		/* p += 2 * j (* 4) */
> +
> +	mov	r5, r8		/* save r5 */
> +	mov	r6, r9		/* save r6 before incrementing */
> +
> +	add	#64, r6
> +	add	#64, r6		/* w += 32 (* 4): 128 < 0 in 8-bit arith. */
> +	mov	#0, r10
> +	mov.l	r10, @-r15	/* int64_t tmp = 0 */
> +	mov.l	r10, @-r15
> +
> +	mov	r15, r10	/* r10 = &tmp */
> +
> +	sh_sum8_macs r10, r6, r5, r3	/* tmp is now at the top of the stack */
> +	mov	r9, r6		/* restore w */
> +	mov	r8, r5		/* restore p */

Why do you keep the sum on the stack?  Are there not enough registers?

> +	clrt
> +
> +	/* sum -= tmp - 64-bit signed */
> +	mov.l	@r15+, r8	/* low 32-bits of the tmp-sum */
> +	mov.l	@r4, r10	/* low 32-bits of the sum */
> +	subc	r8, r10
> +	mov.l	r10, @r4
> +	mov.l	@r15+, r8	/* high 32-bits of the tmp-sum */
> +	mov.l	@(4, r4), r10	/* high 32-bits of the sum */
> +	subc	r8, r10
> +	mov.l	r10, @(4, r4)
> +
> +	mov	r5, r8
> +	mov	r0, r10
> +	add	#64, r0
> +	add	#64, r0		/* w2 += 32 (* 4): 128 < 0 in 8-bit arith. */
> +
> +	mov	r15, r9		/* r9 = &sum2 */
> +	sh_sum8_macs r9, r0, r5, r3	/* tmp = -sum2 is now at the top of the stack */
> +	mov	r8, r5		/* restore p */
> +
> +	round_sample r4, r0, r3
> +
> +	mov.w	r0, @r7		/* *samples = round_samples() */
> +	add	r2, r7		/* samples += incr (* 2) */
> +
> +	clrt
> +
> +	mov.l	@r15+, r0	/* low 32-bits */
> +	mov.l	@r4, r3
> +	subc	r0, r3
> +	mov.l	r3, @r4
> +	mov.l	@r15+, r0	/* high 32-bits */
> +	mov.l	@(4, r4), r3
> +	subc	r0, r3
> +
> +	mov.l	r3, @(4, r4)
> +	round_sample r4, r0, r3
> +
> +	mov.w	r0, @r11	/* *samples2 = round_samples() */
> +	sub	r2, r11		/* samples2 -= incr (* 2) */
> +
> +	sub	r1, r5		/* synth_buf = p - j */
> +	mov	r10, r0		/* restore w2 */
> +	add	#-4, r0		/* w2-- */
> +
> +	add	#-4, r1
> +	cmp/pl	r1
> +	bf	5f
> +	bra	4b
> +	/* CAREFUL: The below w++ is still in the loop - it's in the delay window */
> +5:	add	#4, r6		/* w++ */

Most of this could be scheduled better.

> +
> +	mov #0, r10 mov.l r10, @-r15 /* int64_t tmp = 0 */ mov.l r10,
> + at -r15 mov r15, r10 /* r10 = &tmp */
> +
> +	add	#64, r6
> +	add	#64, r6		/* w += 32 (* 4): 128 < 0 in 8-bit arith. */
> +
> +	mov	r15, r10	/* r10 = &tmp */
> +	sh_sum8_macs r10, r6, r5, r3	/* tmp = -sum2 is now at the top of the stack */
> +
> +	clrt
> +
> +	/* sum -= tmp - 64-bit signed */
> +	mov.l	@r15+, r8	/* low 32-bits of the tmp-sum */
> +	mov.l	@r4, r9		/* low 32-bits of the sum */
> +	subc	r8, r9
> +	mov.l	r9, @r4
> +	mov.l	@r15+, r8	/* high 32-bits of the tmp-sum */
> +	mov.l	@(4, r4), r9	/* high 32-bits of the sum */
> +	subc	r8, r9
> +
> +	mov.l	r9, @(4, r4)
> +	round_sample r4, r0, r1
> +
> +	mov.w	r0, @r7		/* *samples = round_samples() */
> +
> +	lds.l	@r15+, pr
> +	mov.l	@r15+, r11
> +	mov.l	@r15+, r10
> +	mov.l	@r15+, r9
> +
> +	rts
> +	mov.l	@r15+, r8	/* delay window */
> +
> +	.size mp3_win_loop_sh4, .-mp3_win_loop_sh4
> +
> +	.align 4
> +max:	.long	OUT_MAX
> +min:	.long	OUT_MIN
> +m23:	.long	(1 << OUT_SHIFT) - 1
>

-- 
M?ns Rullg?rd
mans at mansr.com