[MPlayer-dev-eng] [PATCH] Add NEON optimizations to some critical audio functions.
Reimar Döffinger
Reimar.Doeffinger at gmx.de
Fri Oct 25 21:58:08 CEST 2013
One big issue is that lrintf is ridiculously slow on most (all?)
ARM Linux distributions, which makes the format conversion take
more time than the audio decoding.
The patch uses intrinsics because I was too lazy to learn the inline asm
syntax, and for these trivial cases gcc doesn't seem to be able to
mess it up. In theory, using intrinsics should also allow the compiler
to do loop unrolling when it makes sense.
---
configure | 17 +++++++++++++++++
libaf/af_format.c | 21 ++++++++++++++++++++-
libmpcodecs/ad_ffmpeg.c | 33 ++++++++++++++++++++++++++++++++-
3 files changed, 69 insertions(+), 2 deletions(-)
diff --git a/configure b/configure
index b464adf..b1b5e12 100755
--- a/configure
+++ b/configure
@@ -578,6 +578,7 @@ Advanced options:
--enable-armvfp enable ARM VFP (ARM) [autodetect]
--enable-vfpv3 enable ARM VFPV3 (ARM) [autodetect]
--enable-neon enable NEON (ARM) [autodetect]
+ --enable-neon-intrin enable NEON intrinsics (ARM) [autodetect]
--enable-thumb enable THUMB (ARM) [autodetect]
--enable-iwmmxt enable iWMMXt (ARM) [autodetect]
--disable-fastmemcpy disable 3DNow!/SSE/MMX optimized memcpy [enable]
@@ -636,6 +637,7 @@ _armv6t2=auto
_armvfp=auto
vfpv3=auto
neon=auto
+neon_intrin=auto
armthumb=auto
_iwmmxt=auto
_mtrr=auto
@@ -1439,6 +1441,8 @@ for ac_option do
--disable-vfpv3) vfpv3=no ;;
--enable-neon) neon=yes ;;
--disable-neon) neon=no ;;
+ --enable-neon-intrin) neon_intrin=yes ;;
+ --disable-neon-intrin) neon_intrin=no ;;
--enable-thumb) armthumb=yes ;;
--disable-thumb) armthumb=no ;;
--enable-iwmmxt) _iwmmxt=yes ;;
@@ -3105,6 +3109,18 @@ if arm ; then
fi
echores "$neon"
+ echocheck "ARM NEON intrinsics"
+ if test $neon_intrin = "auto" ; then
+ neon_intrin=no
+ statement_check 'arm_neon.h' 'float in[4] = {0}; float32x4_t tmpf = vld1q_f32(in);' && neon_intrin=yes
+ fi
+ echores "$neon_intrin"
+ if test $neon_intrin = "yes" ; then
+ def_neon_intrin='#define NEON_INTRIN 1'
+ else
+ def_neon_intrin='#define NEON_INTRIN 0'
+ fi
+
echocheck "ARM THUMB"
if test $armthumb = "auto" ; then
armthumb=no
@@ -8716,6 +8732,7 @@ $def_altivec_h
$def_malloc_h
$def_mman_h
$def_mman_has_map_failed
+$def_neon_intrin
$def_soundcard_h
$def_sys_soundcard_h
$def_sys_sysinfo_h
diff --git a/libaf/af_format.c b/libaf/af_format.c
index 797b011..38f1642 100644
--- a/libaf/af_format.c
+++ b/libaf/af_format.c
@@ -20,14 +20,18 @@
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
+#include "config.h"
+
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <limits.h>
#include <math.h>
+#if NEON_INTRIN
+#include <arm_neon.h>
+#endif
-#include "config.h"
#include "af.h"
#include "mp_msg.h"
#include "mpbswap.h"
@@ -494,8 +498,23 @@ static void float2int(const float* in, void* out, int len, int bps)
((int8_t *)out)[i] = av_clip_int8(lrintf(128.0f * in[i]));
break;
case(2):
+#if NEON_INTRIN
+ for (i = 0; i + 4 <= len; i += 4) {
+ float32x4_t tmpf = vld1q_f32(in + i);
+ int32x4_t tmp32 = vcvtq_n_s32_f32(tmpf, 16);
+ int16x4_t tmp16 = vqmovn_s32(tmp32);
+ vst1_s16((int16_t *)out + i, tmp16);
+ }
+ for (; i < len; i++) {
+ float32x4_t tmpf = vld1q_dup_f32(in + i);
+ int32x4_t tmp32 = vcvtq_n_s32_f32(tmpf, 16);
+ int16x4_t tmp16 = vqmovn_s32(tmp32);
+ vst1_lane_s16((int16_t *)out + i, tmp16, 0);
+ }
+#else
for(i=0;i<len;i++)
((int16_t*)out)[i] = av_clip_int16(lrintf(32768.0f * in[i]));
+#endif
break;
case(3):
for(i=0;i<len;i++){
diff --git a/libmpcodecs/ad_ffmpeg.c b/libmpcodecs/ad_ffmpeg.c
index 11c502a..dcf9661 100644
--- a/libmpcodecs/ad_ffmpeg.c
+++ b/libmpcodecs/ad_ffmpeg.c
@@ -17,11 +17,15 @@
*/
#define AVCODEC_MAX_AUDIO_FRAME_SIZE 192000
+#include "config.h"
+
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
+#if NEON_INTRIN
+#include <arm_neon.h>
+#endif
-#include "config.h"
#include "mp_msg.h"
#include "help_mp.h"
@@ -220,6 +224,33 @@ static av_always_inline void copy_samples_planar(size_t bps,
{
size_t s, c, o = 0;
+#if NEON_INTRIN
+ if (nb_channels == 2 && bps == 4) {
+ for (s = 0; s + 4 <= nb_samples; s += 4) {
+ const uint32_t *src0 = (const uint32_t *)(src[0] + o);
+ const uint32_t *src1 = (const uint32_t *)(src[1] + o);
+ uint32x4x2_t tmp;
+ tmp.val[0] = vld1q_u32(src0);
+ tmp.val[1] = vld1q_u32(src1);
+ vst2q_u32((uint32_t *)dst, tmp);
+ o += 4*bps;
+ dst += 4*bps*nb_channels;
+ }
+ nb_samples -= s;
+ } else if (nb_channels == 2 && bps == 2) {
+ for (s = 0; s + 8 <= nb_samples; s += 8) {
+ const uint16_t *src0 = (const uint16_t *)(src[0] + o);
+ const uint16_t *src1 = (const uint16_t *)(src[1] + o);
+ uint16x8x2_t tmp;
+ tmp.val[0] = vld1q_u16(src0);
+ tmp.val[1] = vld1q_u16(src1);
+ vst2q_u16((uint16_t *)dst, tmp);
+ o += 8*bps;
+ dst += 8*bps*nb_channels;
+ }
+ nb_samples -= s;
+ }
+#endif
for (s = 0; s < nb_samples; s++) {
for (c = 0; c < nb_channels; c++) {
memcpy(dst, src[c] + o, bps);
--
1.8.4.rc3
More information about the MPlayer-dev-eng
mailing list