[FFmpeg-devel] [PATCH v4] SH4 mpegaudio decoder optimizations
Måns Rullgård
mans
Tue Feb 1 20:26:11 CET 2011
Guennadi Liakhovetski <g.liakhovetski at gmx.de> writes:
> Hi all
>
> This is version 4 of the following patch:
>
> This patch implements several mpegaudio optimizations for SH4 FPU-enabled
> SoCs. Verified to provide more than 45% acceleration, when decoding a
> 128kbps stereo mp3 audio file to /dev/null.
> ---
>
> Yes, I know, the .S file is all indented with "evil" TABs... Please, let's
> review it this way, I'll convert it to spaces once accepted. Would have
> been too much trouble for me to develop with spaces...
>
> v4: switched to a out-of-line .S assembly implementation
>
> v3, v2, v1: were inline, changelog irrelevant
>
> Thanks to all who commented!
>
> diff --git a/libavcodec/mpc.c b/libavcodec/mpc.c
> index d9a1fb7..33b1692 100644
> --- a/libavcodec/mpc.c
> +++ b/libavcodec/mpc.c
> @@ -51,7 +51,7 @@ static void mpc_synth(MPCContext *c, int16_t *out, int channels)
> for(ch = 0; ch < channels; ch++){
> samples_ptr = samples + ch;
> for(i = 0; i < SAMPLES_PER_BAND; i++) {
> - ff_mpa_synth_filter(c->synth_buf[ch], &(c->synth_buf_offset[ch]),
> + ff_mpa_synth_filter(NULL, c->synth_buf[ch], &(c->synth_buf_offset[ch]),
> ff_mpa_synth_window, &dither_state,
> samples_ptr, channels,
> c->sb_samples[ch][i]);
I want to rework the C plumbing of these functions in general. Review
of those parts omitted.
> diff --git a/libavcodec/sh4/dsputil_sh4.c b/libavcodec/sh4/dsputil_sh4.c
> index ec06e24..37efdf0 100644
> --- a/libavcodec/sh4/dsputil_sh4.c
> +++ b/libavcodec/sh4/dsputil_sh4.c
> @@ -20,8 +20,11 @@
> * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> */
>
> +#undef CONFIG_FLOAT
What purpose does that serve?
> #include "libavcodec/avcodec.h"
> #include "libavcodec/dsputil.h"
> +#include "libavcodec/mpegaudio.h"
> #include "dsputil_sh4.h"
> #include "sh4.h"
>
> @@ -102,3 +105,32 @@ void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx)
> c->idct_permutation_type= FF_NO_IDCT_PERM;
> }
> }
> +
> +void mp3_win_loop_sh4(int64_t *sum, MPA_INT *synth_buf, MPA_INT *w,
> + OUT_INT *samples, int incr);
> +int round_sample_sh4(int64_t *sum);
> +void sum8_macs_sh4(int64_t *sum, int32_t *p, int32_t *w);
> +
> +static void apply_window_mp3_sh4(MPA_INT *synth_buf, MPA_INT *window,
> + int *dither_state, OUT_INT *samples, int incr)
> +{
> + int64_t sum, sum2 = 0;
> +
> + /* copy to avoid wrap */
> + memcpy(synth_buf + 512, synth_buf, 32 * sizeof(*synth_buf));
> +
> + sum = *dither_state;
> + sum8_macs_sh4(&sum, synth_buf + 16, window);
> + sum8_macs_sh4(&sum2, synth_buf + 48, window + 32);
> + sum -= sum2;
> + *samples = round_sample_sh4(&sum);
> +
> + mp3_win_loop_sh4(&sum, synth_buf + 32, window + 1, samples + incr, incr);
> +
> + *dither_state = sum;
> +}
Why are you doing this in C? Doing it all in asm doesn't seem very hard.
> +void ff_mpegaudiodec_init_sh4(MPADecodeContext *s)
> +{
> + s->apply_window_mp3 = apply_window_mp3_sh4;
> +}
> diff --git a/libavcodec/sh4/mpegaudiodec_sh4.S b/libavcodec/sh4/mpegaudiodec_sh4.S
> new file mode 100644
> index 0000000..ea3b95c
> --- /dev/null
> +++ b/libavcodec/sh4/mpegaudiodec_sh4.S
> @@ -0,0 +1,244 @@
> +#define MAXSHORT (1 << 15) - 1
> +#define MINSHORT -(1 << 15)
> +
> +#define FRAC_BITS 23 /* fractional bits for sb_samples and dct */
> +#define WFRAC_BITS 16 /* fractional bits for window */
> +
> +#define OUT_MAX MAXSHORT
> +#define OUT_MIN MINSHORT
> +#define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
Something should be done to make sure these match the C code. That
might mean splitting some headers.
> + .macro sh_macs, wp, pp, tmp
> + add \tmp, \wp
> + add \tmp, \pp
> + mac.l @\wp+, @\pp+
> + .endm
> +
> + .macro sh_sum8_macs, sum, wp, pp, tmp
> + mov.l @\sum+, \tmp
> + lds \tmp, macl
> + mov.l @\sum+, \tmp
> + lds \tmp, mach
You could use the lds.l @reg+, mach/l instruction here,
> + mov #126, \tmp
> + shll \tmp /* 63 * 4 */
> + mac.l @\wp+, @\pp+ /* 0 */
> + sh_macs \wp, \pp, \tmp /* 1 */
> + sh_macs \wp, \pp, \tmp /* 2 */
> + sh_macs \wp, \pp, \tmp /* 3 */
> + sh_macs \wp, \pp, \tmp /* 4 */
> + sh_macs \wp, \pp, \tmp /* 5 */
> + sh_macs \wp, \pp, \tmp /* 6 */
> + sh_macs \wp, \pp, \tmp /* 7 */
> + sts mach, \tmp
> + mov.l \tmp, @-\sum
> + sts macl, \tmp
> + mov.l \tmp, @-\sum
and sts.l mach/l, @-reg here.
> + .endm
You could probably improve pipelining using two pointer pairs,
doubling the stride, and interleaving things a bit. I don't remember
the scheduling details, and they vary between implementations.
> + /* void sum8_macs(int64_t *sum, int32_t *p, int32_t *w) */
> +
> + .globl sum8_macs_sh4
> + .balign 4
> + .type sum8_macs_sh4, @function
> +sum8_macs_sh4:
> + sh_sum8_macs r4, r6, r5, r3
> + rts
> + nop
> + .size sum8_macs_sh4, .-sum8_macs_sh4
> +
> + .macro round_sample, sum, tmp1, tmp2
> + mov.l @\sum, \tmp1
> + mov.l m23, \tmp2
> + and \tmp1, \tmp2 /* hi &= (1 << 24) - 1 */
> + mov.l \tmp2, @\sum
> + shlr16 \tmp1
> + shlr8 \tmp1 /* lo >>= 24 */
> + mov.l @(4, \sum), \tmp2
> + shll8 \tmp2 /* hi <<= 8 */
> + or \tmp2, \tmp1 /* lo |= hi */
> + mov #0, \tmp2
> + mov.l \tmp2, @(4, \sum)
> + /* av_clip */
> + mov.l max, \tmp2
> + cmp/ge \tmp2, \tmp1
> + bt 1f
> + mov.l min, \tmp2
> + cmp/gt \tmp2, \tmp1
> + bt 2f
> +1: mov \tmp2, \tmp1
> +2:
These branches could make use of the delay slot.
> + .endm
> +
> + /* int round_sample_asm(int64_t *sum) */
> +
> + .globl round_sample_sh4
> + .balign 4
> + .type round_sample_sh4, @function
> +round_sample_sh4:
> + round_sample r4, r0, r1
> + rts
> + nop
> + .size round_sample_sh4, .-round_sample_sh4
> +
> + .balign 4
> +
> + /* void mp3_win_loop_sh4(&sum, synth_buf + 32, window + 1, samples + incr, incr) */
> + .globl mp3_win_loop_sh4
> + .balign 4
> + .type mp3_win_loop_sh4, @function
> +mp3_win_loop_sh4:
You might want to adapt some of the macros from arm/asm.S.
> + mov r6, r0
> + add #120, r0 /* w2 = w + 30 (* 4) */
> + mov #60, r1 /* j = r1 - loop counter: 15 * 4 */
> +
> + mov.l @r15, r2 /* r2 = incr */
> + shll r2 /* r2 *= 2 */
> +
> + mov.l r8, @-r15
> + mov.l r9, @-r15
> + mov.l r10, @-r15
> + mov.l r11, @-r15
> + sts.l pr, @-r15
> +
> + mov #30, r11
> + muls.w r2, r11
> + sts macl, r11
> + add r7, r11 /* samples2 = samples + 30 * incr (* 2) */
> +
> +4: sub r1, r5 /* p = synth_buf - j (* 4) */
> +
> + /* We'll use stack for 64-bit variables:
> + * tmp = -sum2 */
> +
> + /* SUM8P2_MACS_MLSS(sum, sum2, w, w2, p) */
> +
> + mov r5, r8
> + mov r6, r9
> + sh_sum8_macs r4, r6, r5, r3 /* sum, window, pp, tmp */
> + mov r8, r5 /* restore p */
> + mov r9, r6 /* restore w */
> +
> + mov #0, r10
> + mov.l r10, @-r15 /* sum2 = 0 */
> + mov.l r10, @-r15
> + mov r15, r10 /* r10 = &sum2 */
> +
> + mov r0, r9
> + sh_sum8_macs r10, r0, r5, r3 /* sum2 is now at the top of the stack */
> + mov r9, r0 /* restore w2 */
> + mov r8, r5 /* restore p */
> +
> + /* SUM8P2_MLSS_MLSS(sum, sum2, w + 32, w2 + 32, p) */
> +
> + add r1, r5
> + add r1, r5 /* p += 2 * j (* 4) */
> +
> + mov r5, r8 /* save r5 */
> + mov r6, r9 /* save r6 before incrementing */
> +
> + add #64, r6
> + add #64, r6 /* w += 32 (* 4): 128 < 0 in 8-bit arith. */
> + mov #0, r10
> + mov.l r10, @-r15 /* int64_t tmp = 0 */
> + mov.l r10, @-r15
> +
> + mov r15, r10 /* r10 = &tmp */
> +
> + sh_sum8_macs r10, r6, r5, r3 /* tmp is now at the top of the stack */
> + mov r9, r6 /* restore w */
> + mov r8, r5 /* restore p */
Why do you keep the sum on the stack? Are there not enough registers?
> + clrt
> +
> + /* sum -= tmp - 64-bit signed */
> + mov.l @r15+, r8 /* low 32-bits of the tmp-sum */
> + mov.l @r4, r10 /* low 32-bits of the sum */
> + subc r8, r10
> + mov.l r10, @r4
> + mov.l @r15+, r8 /* high 32-bits of the tmp-sum */
> + mov.l @(4, r4), r10 /* high 32-bits of the sum */
> + subc r8, r10
> + mov.l r10, @(4, r4)
> +
> + mov r5, r8
> + mov r0, r10
> + add #64, r0
> + add #64, r0 /* w2 += 32 (* 4): 128 < 0 in 8-bit arith. */
> +
> + mov r15, r9 /* r9 = &sum2 */
> + sh_sum8_macs r9, r0, r5, r3 /* tmp = -sum2 is now at the top of the stack */
> + mov r8, r5 /* restore p */
> +
> + round_sample r4, r0, r3
> +
> + mov.w r0, @r7 /* *samples = round_samples() */
> + add r2, r7 /* samples += incr (* 2) */
> +
> + clrt
> +
> + mov.l @r15+, r0 /* low 32-bits */
> + mov.l @r4, r3
> + subc r0, r3
> + mov.l r3, @r4
> + mov.l @r15+, r0 /* high 32-bits */
> + mov.l @(4, r4), r3
> + subc r0, r3
> +
> + mov.l r3, @(4, r4)
> + round_sample r4, r0, r3
> +
> + mov.w r0, @r11 /* *samples2 = round_samples() */
> + sub r2, r11 /* samples2 -= incr (* 2) */
> +
> + sub r1, r5 /* synth_buf = p - j */
> + mov r10, r0 /* restore w2 */
> + add #-4, r0 /* w2-- */
> +
> + add #-4, r1
> + cmp/pl r1
> + bf 5f
> + bra 4b
> + /* CAREFUL: The below w++ is still in the loop - it's in the delay window */
> +5: add #4, r6 /* w++ */
Most of this could be scheduled better.
> +
> + mov #0, r10 mov.l r10, @-r15 /* int64_t tmp = 0 */ mov.l r10,
> + at -r15 mov r15, r10 /* r10 = &tmp */
> +
> + add #64, r6
> + add #64, r6 /* w += 32 (* 4): 128 < 0 in 8-bit arith. */
> +
> + mov r15, r10 /* r10 = &tmp */
> + sh_sum8_macs r10, r6, r5, r3 /* tmp = -sum2 is now at the top of the stack */
> +
> + clrt
> +
> + /* sum -= tmp - 64-bit signed */
> + mov.l @r15+, r8 /* low 32-bits of the tmp-sum */
> + mov.l @r4, r9 /* low 32-bits of the sum */
> + subc r8, r9
> + mov.l r9, @r4
> + mov.l @r15+, r8 /* high 32-bits of the tmp-sum */
> + mov.l @(4, r4), r9 /* high 32-bits of the sum */
> + subc r8, r9
> +
> + mov.l r9, @(4, r4)
> + round_sample r4, r0, r1
> +
> + mov.w r0, @r7 /* *samples = round_samples() */
> +
> + lds.l @r15+, pr
> + mov.l @r15+, r11
> + mov.l @r15+, r10
> + mov.l @r15+, r9
> +
> + rts
> + mov.l @r15+, r8 /* delay window */
> +
> + .size mp3_win_loop_sh4, .-mp3_win_loop_sh4
> +
> + .align 4
> +max: .long OUT_MAX
> +min: .long OUT_MIN
> +m23: .long (1 << OUT_SHIFT) - 1
>
--
M?ns Rullg?rd
mans at mansr.com
More information about the ffmpeg-devel
mailing list