[FFmpeg-devel] [PATCH] ARM: NEON optimised vector_fmul_window
Mans Rullgard
mans
Mon Aug 25 05:06:42 CEST 2008
---
libavcodec/armv4l/dsputil_neon.c | 6 +++++
libavcodec/armv4l/dsputil_neon_s.S | 43 ++++++++++++++++++++++++++++++++++++
2 files changed, 49 insertions(+), 0 deletions(-)
diff --git a/libavcodec/armv4l/dsputil_neon.c b/libavcodec/armv4l/dsputil_neon.c
index f9d32c0..6c44940 100644
--- a/libavcodec/armv4l/dsputil_neon.c
+++ b/libavcodec/armv4l/dsputil_neon.c
@@ -91,6 +91,10 @@ void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
+void ff_vector_fmul_window_neon(float *dst, const float *src0,
+ const float *src1, const float *win,
+ float add_bias, int len);
+
void ff_float_to_int16_neon(int16_t *, const float *, long);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
@@ -164,6 +168,8 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
+ c->vector_fmul_window = ff_vector_fmul_window_neon;
+
c->float_to_int16 = ff_float_to_int16_neon;
c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
diff --git a/libavcodec/armv4l/dsputil_neon_s.S b/libavcodec/armv4l/dsputil_neon_s.S
index 5ccb034..e4b809e 100644
--- a/libavcodec/armv4l/dsputil_neon_s.S
+++ b/libavcodec/armv4l/dsputil_neon_s.S
@@ -324,6 +324,49 @@ extern ff_float_to_int16_interleave_neon
pop {r4,r5,pc}
.endfunc
+extern ff_vector_fmul_window_neon  /* Windowed multiply-add, 4 floats per pass: reads ascending from r1 and r3, descending from r2 and r4; writes ascending at r0 and descending at ip. NOTE(review): presumably r0=dst, r1=src0, r2=src1, r3=win per the C prototype and the ARM procedure call standard - confirm */
+ vld1.32 {d16[],d17[]}, [sp,:32]  /* q8 = the 32-bit word at [sp] replicated into all four lanes (presumably add_bias passed on the stack - confirm ABI) */
+ push {r4,r5,lr}  /* save r4, r5 and the return address; sp moves down by 12 bytes */
+ ldr lr, [sp, #16]  /* lr = word at (entry sp + 4), presumably len; lr is the loop counter from here on */
+ sub r2, r2, #8  /* together with the add two lines below: r2 = r2 + (len-4)*4 bytes */
+ sub r5, lr, #2  /* r5 = len - 2, scratch for the pointer setup */
+ add r2, r2, r5, lsl #2  /* r2 now points at the last 4 floats of a len-float array */
+ add r4, r3, r5, lsl #3  /* r4 = r3 + (2*len-4)*4 bytes: last 4 floats of a 2*len-float array */
+ add ip, r0, r5, lsl #3  /* ip = r0 + (2*len-4)*4 bytes: descending output pointer */
+ mov r5, #-16  /* post-index step of -16 bytes (one q register) for the descending pointers */
+ dmb  /* NOTE(review): data memory barrier; nothing visible in this routine needs ordering against another observer - confirm whether this is a leftover */
+ vld1.64 {d0,d1}, [r1,:128]!  /* q0 = 4 floats at r1, then r1 += 16 (:128 asserts a 16-byte aligned address) */
+ vld1.64 {d2,d3}, [r2,:128], r5  /* q1 = 4 floats at r2, then r2 -= 16 */
+ vld1.64 {d4,d5}, [r3,:128]!  /* q2 = 4 floats at r3, then r3 += 16 */
+ vld1.64 {d6,d7}, [r4,:128], r5  /* q3 = 4 floats at r4, then r4 -= 16 */
+1: vmov q10, q8  /* loop top: q10 accumulator starts at the bias vector */
+ vmov q11, q8  /* q11 accumulator starts at the bias vector */
+ vmla.f32 q11, q0, q2  /* q11 += q0 * q2 */
+ vrev64.32 q3, q3  /* swap the two floats inside each half of q3 ... */
+ vswp d6, d7  /* ... then swap the halves: q3 is now fully lane-reversed */
+ vmla.f32 q10, q0, q3  /* q10 += q0 * reversed(q3) */
+ vrev64.32 q1, q1  /* same two-step lane reversal for q1 */
+ vswp d2, d3
+ subs lr, lr, #4  /* 4 elements done; sets the flags tested by the beq below */
+ vmla.f32 q11, q1, q3  /* q11 += reversed(q1) * reversed(q3) */
+ vmls.f32 q10, q1, q2  /* q10 -= reversed(q1) * q2 */
+ beq 2f  /* counter hit exactly 0: skip the preload and do the final store; the loop only exits on an exact 0, so the count must be a positive multiple of 4 */
+ vld1.64 {d0,d1}, [r1,:128]!  /* preload the next pass inputs before storing this pass results */
+ vld1.64 {d2,d3}, [r2,:128], r5
+ vld1.64 {d4,d5}, [r3,:128]!
+ vld1.64 {d6,d7}, [r4,:128], r5
+ vrev64.32 q11, q11  /* lane-reverse q11 back into memory order for the descending store */
+ vswp d22, d23
+ vst1.64 {d20,d21}, [r0,:128]!  /* store q10 at r0, then r0 += 16 */
+ vst1.64 {d22,d23}, [ip,:128], r5  /* store q11 at ip, then ip -= 16 */
+ b 1b
+2: vrev64.32 q11, q11  /* final pass: same reverse-and-store sequence as in the loop body */
+ vswp d22, d23
+ vst1.64 {d20,d21}, [r0,:128]!
+ vst1.64 {d22,d23}, [ip,:128], r5
+ pop {r4,r5,pc}  /* restore r4, r5 and return by popping the saved lr into pc */
+ .endfunc
+
#ifdef CONFIG_VORBIS_DECODER
extern ff_vorbis_inverse_coupling_neon
vmov.i32 q10, #(1<<31)
--
1.6.0
More information about the ffmpeg-devel
mailing list