[FFmpeg-devel] [PATCH] ARM: NEON optimised float_to_int16
Mans Rullgard
mans
Mon Aug 25 05:06:40 CEST 2008
---
libavcodec/armv4l/dsputil_neon.c | 6 +++
libavcodec/armv4l/dsputil_neon_s.S | 69 ++++++++++++++++++++++++++++++++++++
2 files changed, 75 insertions(+), 0 deletions(-)
diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c
index 6dbe835..b584e5b 100644
--- a/libavcodec/armv4l/dsputil_neon.c
+++ b/libavcodec/armv4l/dsputil_neon.c
@@ -91,6 +91,9 @@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_float_to_int16_neon(int16_t *, const float *, long); /* args: dst, src, sample count; implemented in dsputil_neon_s.S */
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); /* args: dst, array of per-channel src pointers, samples per channel, channel count; implemented in dsputil_neon_s.S */
+
void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
{
c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
@@ -158,4 +161,7 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
+
+ c->float_to_int16 = ff_float_to_int16_neon;
+ c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
}
diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S
index b8e1520..8e1ee6d 100644
--- a/libavcodec/armv4l/dsputil_neon_s.S
+++ b/libavcodec/armv4l/dsputil_neon_s.S
@@ -252,3 +252,72 @@
defun2 put_pixels8_x2, _no_rnd, vhadd.u8
defun2 put_pixels8_y2, _no_rnd, vhadd.u8
defun2 put_pixels8_xy2, _no_rnd, vshrn.u16, 1
+
+extern ff_float_to_int16_neon  @ converts floats to int16, 8 per iteration; r0 = dst, r1 = src, r2 = sample count (prototype argument order)
+ dmb  @ data memory barrier; the reason for it is not evident from this patch
+1: vld1.64 {d0-d3}, [r1,:128]!  @ load 8 floats from src (128-bit alignment hint) and advance r1
+ vcvt.s32.f32 q2, q0  @ floats 0-3 -> signed 32-bit integers
+ vcvt.s32.f32 q3, q1  @ floats 4-7 -> signed 32-bit integers
+ subs r2, r2, #8  @ 8 samples consumed; the flags set here drive the bgt below (NEON ops in between leave them alone)
+ vqmovn.s32 d4, q2  @ saturating narrow of q2 to 4 x int16 in d4
+ vqmovn.s32 d5, q3  @ saturating narrow of q3 to 4 x int16 in d5
+ vst1.64 {d4-d5}, [r0,:128]!  @ store 8 int16 samples to dst (128-bit alignment hint) and advance r0
+ bgt 1b  @ repeat while the remaining count is positive; the body always runs at least once, so 8 samples are written even for a count of 0
+ bx lr
+ .endfunc
+
+extern ff_float_to_int16_interleave_neon  @ converts planar float channels to interleaved int16; r0 = dst, r1 = array of source pointers, r2 = samples per channel, r3 = channel count
+ cmp r3, #2  @ three-way dispatch: fewer than 2, exactly 2, or more than 2 channels
+ ldrlt r1, [r1]  @ fewer than 2: replace r1 with the first source pointer
+ blt ff_float_to_int16_neon  @ ...and branch (no link) to the plain converter, which returns straight to our caller
+ bne 2f  @ more than 2 channels: generic strided path
+
+ ldr ip, [r1]  @ exactly 2 channels: ip = first source pointer
+ ldr r1, [r1, #4]  @ r1 = second source pointer (the pointer array itself is no longer needed)
+ vld1.64 {d0-d3}, [ip,:128]!  @ preload 8 floats of the first channel
+ vld1.64 {d4-d7}, [r1,:128]!  @ preload 8 floats of the second channel
+ dmb  @ data memory barrier; the reason for it is not evident from this patch
+1: vcvt.s32.f32 q8, q0  @ first channel, floats 0-3 -> signed 32-bit integers
+ vcvt.s32.f32 q9, q1  @ first channel, floats 4-7
+ vcvt.s32.f32 q10, q2  @ second channel, floats 0-3
+ vcvt.s32.f32 q11, q3  @ second channel, floats 4-7
+ subs r2, r2, #8  @ 8 samples per channel consumed; flags are tested by the beq below
+ vqmovn.s32 d16, q8  @ saturating narrow: first channel samples 0-3 -> int16
+ vqmovn.s32 d17, q9  @ first channel samples 4-7
+ vqmovn.s32 d18, q10  @ second channel samples 0-3
+ vqmovn.s32 d19, q11  @ second channel samples 4-7
+ beq 1f  @ count reached exactly zero: skip the next preload and do the final store. NOTE(review): this is the only exit, so the count is presumably a positive multiple of 8 - confirm against callers
+ vld1.64 {d0-d3}, [ip,:128]!  @ preload the next 8 floats of the first channel before storing the current results
+ vld1.64 {d4-d7}, [r1,:128]!  @ preload the next 8 floats of the second channel
+ vst2.16 {d16-d19}, [r0,:64]!  @ interleaving store of 16 int16 values: elements of d16-d17 alternate with elements of d18-d19
+ b 1b
+1: vst2.16 {d16-d19}, [r0,:64]!  @ final interleaving store for the last group of 8 samples per channel
+ bx lr
+
+2: push {r4,r5,lr}  @ generic path: r4, r5 and lr are used as scratch below
+ lsls r4, r3, #1  @ r4 = channel count * 2 = byte distance between consecutive samples of one channel in dst; also sets the flags tested by the first bne at 4:
+ dmb  @ data memory barrier; the reason for it is not evident from this patch
+ b 4f  @ enter the loop at the per-channel setup
+3: vld1.64 {d0-d3}, [ip,:128]!  @ load 8 floats of the current channel and advance ip
+ vcvt.s32.f32 q2, q0  @ floats 0-3 -> signed 32-bit integers
+ vcvt.s32.f32 q3, q1  @ floats 4-7 -> signed 32-bit integers
+ subs lr, lr, #8  @ 8 samples of this channel consumed; flags are tested by the bgt after the stores
+ vqmovn.s32 d4, q2  @ saturating narrow to int16, samples 0-3
+ vqmovn.s32 d5, q3  @ saturating narrow to int16, samples 4-7
+ vst1.16 {d4[0]}, [r5,:16], r4  @ store one int16 lane, then step r5 by the interleave stride in r4
+ vst1.16 {d4[1]}, [r5,:16], r4
+ vst1.16 {d4[2]}, [r5,:16], r4
+ vst1.16 {d4[3]}, [r5,:16], r4
+ vst1.16 {d5[0]}, [r5,:16], r4
+ vst1.16 {d5[1]}, [r5,:16], r4
+ vst1.16 {d5[2]}, [r5,:16], r4
+ vst1.16 {d5[3]}, [r5,:16], r4
+ bgt 3b  @ more samples left in this channel
+ subs r3, r3, #1  @ one channel finished; flags are tested by the bne below
+4: ldr ip, [r1], #4  @ ip = next source pointer, advance r1. NOTE(review): this also executes after the last channel (r3 already 0), reading one pointer past the last one used
+ mov lr, r2  @ lr = sample counter for this channel
+ mov r5, r0  @ r5 = dst write cursor for this channel
+ add r0, r0, #2  @ the next channel starts one int16 further into dst
+ bne 3b  @ flags come from lsls (first pass) or subs r3 (later passes); mov and add above leave them untouched
+ pop {r4,r5,pc}  @ restore scratch registers and return
+ .endfunc
--
1.6.0
More information about the ffmpeg-devel
mailing list