[FFmpeg-devel] [PATCH] ARM: NEON optimised float_to_int16

Mon Aug 25 05:06:40 CEST 2008

---
 libavcodec/armv4l/dsputil_neon.c   |    6 +++
 libavcodec/armv4l/dsputil_neon_s.S |   69 ++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 0 deletions(-)

diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c
index 6dbe835..b584e5b 100644
--- a/libavcodec/armv4l/dsputil_neon.c
+++ b/libavcodec/armv4l/dsputil_neon.c
@@ -91,6 +91,9 @@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
 void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
 
+void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_interleave_neon(int16_t *dst, const float **src, long len, int channels);
+
 void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
 {
     c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
@@ -158,4 +161,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
 
     c->h264_idct_add = ff_h264_idct_add_neon;
     c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
+
+    c->float_to_int16 = ff_float_to_int16_neon;
+    c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
 }
diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S
index b8e1520..8e1ee6d 100644
--- a/libavcodec/armv4l/dsputil_neon_s.S
+++ b/libavcodec/armv4l/dsputil_neon_s.S
@@ -252,3 +252,72 @@
         defun2 put_pixels8_x2,   _no_rnd, vhadd.u8
         defun2 put_pixels8_y2,   _no_rnd, vhadd.u8
         defun2 put_pixels8_xy2,  _no_rnd, vshrn.u16, 1
+
+extern ff_float_to_int16_neon           @ float -> int16 with saturation; r0 = int16_t *dst, r1 = const float *src, r2 = long len
+        dmb                             @ data memory barrier; NOTE(review): motivation is not visible here - confirm it is needed
+1:      vld1.64       {d0-d3}, [r1,:128]!  @ load 8 floats; :128 requires a 16-byte aligned src; post-increment r1
+        vcvt.s32.f32  q2, q0            @ float -> int32 for samples 0-3 (this vcvt form rounds toward zero)
+        vcvt.s32.f32  q3, q1            @ float -> int32 for samples 4-7
+        subs          r2, r2, #8        @ 8 samples consumed; flags are used by the bgt below
+        vqmovn.s32    d4, q2            @ saturating narrow int32 -> int16, samples 0-3
+        vqmovn.s32    d5, q3            @ saturating narrow int32 -> int16, samples 4-7
+        vst1.64       {d4-d5}, [r0,:128]!  @ store 8 int16; :128 requires a 16-byte aligned dst; post-increment r0
+        bgt           1b                @ repeat while samples remain; the body always runs once, so len is assumed > 0 (and presumably a multiple of 8)
+        bx            lr
+        .endfunc
+
+extern ff_float_to_int16_interleave_neon  @ r0 = int16_t *dst, r1 = const float **src, r2 = long len, r3 = int channels
+        cmp           r3, #2
+        ldrlt         r1, [r1]          @ channels < 2: r1 = src[0] ...
+        blt           ff_float_to_int16_neon  @ ... and tail-call the non-interleaving converter
+        bne           2f                @ channels > 2: generic strided path at 2:
+
+        ldr           ip, [r1]          @ channels == 2: ip = src[0]
+        ldr           r1, [r1, #4]      @ r1 = src[1]
+        vld1.64       {d0-d3}, [ip,:128]!  @ preload 8 floats of channel 0 (16-byte aligned)
+        vld1.64       {d4-d7}, [r1,:128]!  @ preload 8 floats of channel 1 (16-byte aligned)
+        dmb                             @ data memory barrier; NOTE(review): motivation is not visible here - confirm
+1:      vcvt.s32.f32  q8,  q0           @ float -> int32 (this vcvt form rounds toward zero), channel 0
+        vcvt.s32.f32  q9,  q1
+        vcvt.s32.f32  q10, q2           @ float -> int32, channel 1
+        vcvt.s32.f32  q11, q3
+        subs          r2, r2, #8        @ 8 samples per channel done; Z is set when none remain
+        vqmovn.s32    d16, q8           @ saturating narrow to int16; d16-d17 = channel 0
+        vqmovn.s32    d17, q9
+        vqmovn.s32    d18, q10          @ d18-d19 = channel 1
+        vqmovn.s32    d19, q11
+        beq           1f                @ last group: skip the loads so nothing past the input is read
+        vld1.64       {d0-d3}, [ip,:128]!  @ fetch the next group before storing the current one
+        vld1.64       {d4-d7}, [r1,:128]!
+        vst2.16       {d16-d19}, [r0,:64]!  @ interleaved store: ch0, ch1, ch0, ch1, ... (8 pairs)
+        b             1b
+1:      vst2.16       {d16-d19}, [r0,:64]!  @ store the final group
+        bx            lr
+
+2:      push          {r4,r5,lr}
+        lsls          r4, r3, #1        @ r4 = output stride in bytes (channels * 2); result is non-zero, so Z is clear at 4:
+        dmb                             @ data memory barrier; NOTE(review): motivation is not visible here - confirm
+        b             4f                @ enter at the per-channel setup
+3:      vld1.64       {d0-d3}, [ip,:128]!  @ load 8 floats of the current channel (16-byte aligned)
+        vcvt.s32.f32  q2, q0            @ float -> int32 (rounds toward zero)
+        vcvt.s32.f32  q3, q1
+        subs          lr, lr, #8        @ 8 samples of this channel consumed; flags are used by bgt below
+        vqmovn.s32    d4, q2            @ saturating narrow to int16
+        vqmovn.s32    d5, q3
+        vst1.16       {d4[0]}, [r5,:16], r4  @ scatter one sample per frame, advancing by the stride
+        vst1.16       {d4[1]}, [r5,:16], r4
+        vst1.16       {d4[2]}, [r5,:16], r4
+        vst1.16       {d4[3]}, [r5,:16], r4
+        vst1.16       {d5[0]}, [r5,:16], r4
+        vst1.16       {d5[1]}, [r5,:16], r4
+        vst1.16       {d5[2]}, [r5,:16], r4
+        vst1.16       {d5[3]}, [r5,:16], r4
+        bgt           3b                @ more samples in this channel
+        subs          r3, r3, #1        @ one channel finished; Z is set after the last one
+4:      ldrne         ip, [r1], #4      @ next channel pointer, only while a channel remains (avoids reading src[channels])
+        mov           lr, r2            @ lr = per-channel sample counter
+        mov           r5, r0            @ r5 = write cursor for this channel
+        add           r0, r0, #2        @ the next channel starts one int16 further into each frame
+        bne           3b                @ flags still come from lsls (entry) or subs r3 (fall-through)
+        pop           {r4,r5,pc}
+        .endfunc
-- 
1.6.0