[MPlayer-dev-eng] [PATCH] Add NEON optimizations to some critical audio functions.

Reimar Döffinger Reimar.Doeffinger at gmx.de
Fri Oct 25 21:58:08 CEST 2013


One big issue is that lrintf is ridiculously slow on most (all?)
ARM Linux distributions, which makes the format conversion take
more time than the audio decoding itself.
This patch uses intrinsics because I was too lazy to learn the inline
asm syntax, and for these trivial cases gcc doesn't seem to be able to
mess it up. In theory it should also allow the compiler to do loop
unrolling when it makes sense.
---
 configure               | 17 +++++++++++++++++
 libaf/af_format.c       | 21 ++++++++++++++++++++-
 libmpcodecs/ad_ffmpeg.c | 33 ++++++++++++++++++++++++++++++++-
 3 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index b464adf..b1b5e12 100755
--- a/configure
+++ b/configure
@@ -578,6 +578,7 @@ Advanced options:
   --enable-armvfp           enable ARM VFP (ARM) [autodetect]
   --enable-vfpv3            enable ARM VFPV3 (ARM) [autodetect]
   --enable-neon             enable NEON (ARM) [autodetect]
+  --enable-neon-intrin      enable NEON intrinsics (ARM) [autodetect]
   --enable-thumb            enable THUMB (ARM) [autodetect]
   --enable-iwmmxt           enable iWMMXt (ARM) [autodetect]
   --disable-fastmemcpy      disable 3DNow!/SSE/MMX optimized memcpy [enable]
@@ -636,6 +637,7 @@ _armv6t2=auto
 _armvfp=auto
 vfpv3=auto
 neon=auto
+neon_intrin=auto
 armthumb=auto
 _iwmmxt=auto
 _mtrr=auto
@@ -1439,6 +1441,8 @@ for ac_option do
   --disable-vfpv3) vfpv3=no ;;
   --enable-neon) neon=yes ;;
   --disable-neon) neon=no ;;
+  --enable-neon-intrin) neon_intrin=yes ;;
+  --disable-neon-intrin) neon_intrin=no ;;
   --enable-thumb) armthumb=yes ;;
   --disable-thumb) armthumb=no ;;
   --enable-iwmmxt) _iwmmxt=yes ;;
@@ -3105,6 +3109,18 @@ if arm ; then
   fi
   echores "$neon"
 
+  echocheck "ARM NEON intrinsics"
+  if test $neon_intrin = "auto" ; then
+    neon_intrin=no
+    statement_check 'arm_neon.h' 'float in[4] = {0}; float32x4_t tmpf = vld1q_f32(in);' && neon_intrin=yes
+  fi
+  echores "$neon_intrin"
+  if test $neon_intrin = "yes" ; then
+    def_neon_intrin='#define NEON_INTRIN 1'
+  else
+    def_neon_intrin='#define NEON_INTRIN 0'
+  fi
+
   echocheck "ARM THUMB"
   if test $armthumb = "auto" ; then
     armthumb=no
@@ -8716,6 +8732,7 @@ $def_altivec_h
 $def_malloc_h
 $def_mman_h
 $def_mman_has_map_failed
+$def_neon_intrin
 $def_soundcard_h
 $def_sys_soundcard_h
 $def_sys_sysinfo_h
diff --git a/libaf/af_format.c b/libaf/af_format.c
index 797b011..38f1642 100644
--- a/libaf/af_format.c
+++ b/libaf/af_format.c
@@ -20,14 +20,18 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include "config.h"
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <inttypes.h>
 #include <limits.h>
 #include <math.h>
+#if NEON_INTRIN
+#include <arm_neon.h>
+#endif
 
-#include "config.h"
 #include "af.h"
 #include "mp_msg.h"
 #include "mpbswap.h"
@@ -494,8 +498,23 @@ static void float2int(const float* in, void* out, int len, int bps)
       ((int8_t *)out)[i] = av_clip_int8(lrintf(128.0f * in[i]));
     break;
   case(2):
+#if NEON_INTRIN
+    for (i = 0; i + 4 <= len; i += 4) {
+      float32x4_t tmpf = vld1q_f32(in + i);
+      int32x4_t tmp32 = vcvtq_n_s32_f32(tmpf, 16);
+      int16x4_t tmp16 = vqmovn_s32(tmp32);
+      vst1_s16((int16_t *)out + i, tmp16);
+    }
+    for (; i < len; i++) {
+      float32x4_t tmpf = vld1q_dup_f32(in + i);
+      int32x4_t tmp32 = vcvtq_n_s32_f32(tmpf, 16);
+      int16x4_t tmp16 = vqmovn_s32(tmp32);
+      vst1_lane_s16((int16_t *)out + i, tmp16, 0);
+    }
+#else
     for(i=0;i<len;i++)
       ((int16_t*)out)[i] = av_clip_int16(lrintf(32768.0f * in[i]));
+#endif
     break;
   case(3):
     for(i=0;i<len;i++){
diff --git a/libmpcodecs/ad_ffmpeg.c b/libmpcodecs/ad_ffmpeg.c
index 11c502a..dcf9661 100644
--- a/libmpcodecs/ad_ffmpeg.c
+++ b/libmpcodecs/ad_ffmpeg.c
@@ -17,11 +17,15 @@
  */
 #define AVCODEC_MAX_AUDIO_FRAME_SIZE 192000
 
+#include "config.h"
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
+#if NEON_INTRIN
+#include <arm_neon.h>
+#endif
 
-#include "config.h"
 #include "mp_msg.h"
 #include "help_mp.h"
 
@@ -220,6 +224,33 @@ static av_always_inline void copy_samples_planar(size_t bps,
 {
     size_t s, c, o = 0;
 
+#if NEON_INTRIN
+    if (nb_channels == 2 && bps == 4) {
+        for (s = 0; s + 4 <= nb_samples; s += 4) {
+           const uint32_t *src0 = (const uint32_t *)(src[0] + o);
+           const uint32_t *src1 = (const uint32_t *)(src[1] + o);
+           uint32x4x2_t tmp;
+           tmp.val[0] = vld1q_u32(src0);
+           tmp.val[1] = vld1q_u32(src1);
+           vst2q_u32((uint32_t *)dst, tmp);
+           o += 4*bps;
+           dst += 4*bps*nb_channels;
+        }
+        nb_samples -= s;
+    } else if (nb_channels == 2 && bps == 2) {
+        for (s = 0; s + 8 <= nb_samples; s += 8) {
+           const uint16_t *src0 = (const uint16_t *)(src[0] + o);
+           const uint16_t *src1 = (const uint16_t *)(src[1] + o);
+           uint16x8x2_t tmp;
+           tmp.val[0] = vld1q_u16(src0);
+           tmp.val[1] = vld1q_u16(src1);
+           vst2q_u16((uint16_t *)dst, tmp);
+           o += 8*bps;
+           dst += 8*bps*nb_channels;
+        }
+        nb_samples -= s;
+    }
+#endif
     for (s = 0; s < nb_samples; s++) {
         for (c = 0; c < nb_channels; c++) {
             memcpy(dst, src[c] + o, bps);
-- 
1.8.4.rc3



More information about the MPlayer-dev-eng mailing list