[MPlayer-cvslog] r36489 - trunk/libaf/af_format.c

Sat Oct 26 11:17:28 CEST 2013

Author: reimar
Date: Sat Oct 26 11:17:28 2013
New Revision: 36489

Log:
ARM NEON optimization for float->int conversion.

Not optimal, but since lrintf is incredibly slow on
ARM it is > 10x faster than the old code.
A fallback solution that (incorrectly) defines lrintf(x)
as (int)(x) might make sense to avoid these kinds of issues
for the pure C code, however we would still need to know whether
lrintf is slow or not.
Though maybe the better solution would be for all architectures to provide
a non-braindead implementation of lrintf.

Modified:
   trunk/libaf/af_format.c

Modified: trunk/libaf/af_format.c
==============================================================================

--- trunk/libaf/af_format.c	Sat Oct 26 10:30:29 2013	(r36488)
+++ trunk/libaf/af_format.c	Sat Oct 26 11:17:28 2013	(r36489)
@@ -494,8 +494,34 @@ static void float2int(const float* in, v
       ((int8_t *)out)[i] = av_clip_int8(lrintf(128.0f * in[i]));
     break;
   case(2):
+#if HAVE_NEON
+    {
+    const float *in_end = in + len;
+    while (in < in_end - 7) {
+      __asm__(
+          "vld1.32 {q0,q1}, [%0]!\n\t"
+          "vcvt.s32.f32 q0, q0, #31\n\t"
+          "vqrshrn.s32  d0, q0, #15\n\t"
+          "vcvt.s32.f32 q1, q1, #31\n\t"
+          "vqrshrn.s32  d1, q1, #15\n\t"
+          "vst1.16 {q0}, [%1]!\n\t"
+      : "+r"(in), "+r"(out)
+      :: "q0", "q1", "memory");
+    }
+    while (in < in_end) {
+      __asm__(
+          "vld1.32 {d0[0]}, [%0]!\n\t"
+          "vcvt.s32.f32 d0, d0, #31\n\t"
+          "vqrshrn.s32  d0, q0, #15\n\t"
+          "vst1.16 {d0[0]}, [%1]!\n\t"
+      : "+r"(in), "+r"(out)
+      :: "d0", "memory");
+    }
+    }
+#else
     for(i=0;i<len;i++)
       ((int16_t*)out)[i] = av_clip_int16(lrintf(32768.0f * in[i]));
+#endif
     break;
   case(3):
     for(i=0;i<len;i++){