[FFmpeg-devel] [PATCH 3/3] VP8: begin adding idct functions
Rob Clark
Thu Sep 16 20:28:33 CEST 2010
added:
+ vp8_idct_dc_add_neon
+ vp8_idct_dc_add4uv_neon
+ vp8_idct_dc_add4y_neon
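These are the DC-only transform + add paths, used for 4x4 blocks where only the DC
coefficient is non-zero: dc = (block[0] + 4) >> 3 is added to every pixel of the block
with unsigned-byte saturation, and the coefficient is cleared. vp8_idct_dc_add4y_neon
covers four such blocks laid out side by side across 16 luma pixels;
vp8_idct_dc_add4uv_neon covers four blocks in a 2x2 arrangement over an 8x8 chroma
area. A rough scalar sketch of the per-block behaviour (illustration only, not the C
reference in vp8dsp.c; int16_t stands in for DCTELEM):

    #include <stdint.h>

    /* Hypothetical per-block helper, for illustration only. */
    static void idct_dc_add_sketch(uint8_t *dst, int16_t block[16], int stride)
    {
        int x, y;
        int dc = (block[0] + 4) >> 3;   /* rounded, scaled DC value */
        block[0] = 0;                   /* coefficient is consumed */
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                int v = dst[x] + dc;
                dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;  /* saturate to [0,255] */
            }
            dst += stride;
        }
    }

The NEON versions below do the same work a full row at a time: the per-block dc values
are duplicated across vector lanes, added to 8 or 16 widened pixels with vaddw.u8, and
narrowed back with unsigned saturation.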
---
libavcodec/arm/vp8dsp_init_arm.c | 8 ++
libavcodec/arm/vp8dsp_neon.S | 143 ++++++++++++++++++++++++++++++++++++++
2 files changed, 151 insertions(+), 0 deletions(-)
diff --git a/libavcodec/arm/vp8dsp_init_arm.c b/libavcodec/arm/vp8dsp_init_arm.c
index a51aad7..09bb656 100644
--- a/libavcodec/arm/vp8dsp_init_arm.c
+++ b/libavcodec/arm/vp8dsp_init_arm.c
@@ -22,6 +22,10 @@
#include "libavcodec/vp8dsp.h"
+void vp8_idct_dc_add_neon(uint8_t * dst, DCTELEM block[16], int stride);
+void vp8_idct_dc_add4uv_neon(uint8_t * dst, DCTELEM block[4][16], int stride);
+void vp8_idct_dc_add4y_neon(uint8_t * dst, DCTELEM block[4][16], int stride);
+
void vp8_v_loop_filter16y_neon(uint8_t *dst, int stride,
int flim_E, int flim_I, int hev_thresh);
void vp8_h_loop_filter16y_neon(uint8_t *dst, int stride,
@@ -102,6 +106,10 @@ void put_vp8_epel8_h4v6_neon(uint8_t * dst, int dststride, uint8_t * src,
av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp)
{
if (HAVE_NEON) {
+ dsp->vp8_idct_dc_add = vp8_idct_dc_add_neon;
+ dsp->vp8_idct_dc_add4y = vp8_idct_dc_add4y_neon;
+ dsp->vp8_idct_dc_add4uv = vp8_idct_dc_add4uv_neon;
+
dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16y_neon;
dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16y_neon;
dsp->vp8_v_loop_filter8uv = vp8_v_loop_filter8uv_neon;
diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S
index d741bbd..4ea3195 100644
--- a/libavcodec/arm/vp8dsp_neon.S
+++ b/libavcodec/arm/vp8dsp_neon.S
@@ -23,6 +23,149 @@
#include "asm.S"
+@ void vp8_idct_dc_add_neon(uint8_t * dst, DCTELEM block[16], int stride)
+@{
+function vp8_idct_dc_add_neon, export=1
+ mov r3, #0
+ ldrsh r12, [r1] @ dc = block[0]
+ strh r3, [r1] @ block[0] = 0
+ add r12, r12, #4 @ dc += 4
+ asr r12, r12, #3 @ dc >>= 3 (arithmetic, dc is signed)
+ vdup.16 q1, r12
+ vld1.32 {d0[0]}, [r0], r2
+ vld1.32 {d0[1]}, [r0], r2
+ vld1.32 {d1[0]}, [r0], r2
+ vld1.32 {d1[1]}, [r0], r2
+ sub r0, r0, r2, lsl #2 @ rewind dst to the first row
+ vaddw.u8 q2, q1, d0 @ add dc to the widened pixels
+ vaddw.u8 q3, q1, d1
+ vqmovun.s16 d0, q2 @ saturate back to u8
+ vqmovun.s16 d1, q3
+ vst1.32 {d0[0]}, [r0], r2
+ vst1.32 {d0[1]}, [r0], r2
+ vst1.32 {d1[0]}, [r0], r2
+ vst1.32 {d1[1]}, [r0], r2
+ bx lr
+endfunc
+@}
+
+@ void vp8_idct_dc_add4uv_neon(uint8_t * dst, DCTELEM block[4][16], int stride)
+@{
+function vp8_idct_dc_add4uv_neon, export=1
+ mov r3, #0
+ vmov.u16 q10, #4
+ ldrsh r12, [r1, #0] @ block[0][0]
+ strh r3, [r1, #0]
+ vdup.16 d16, r12
+ ldrsh r12, [r1, #32] @ block[1][0]
+ strh r3, [r1, #32]
+ vdup.16 d17, r12
+ ldrsh r12, [r1, #64] @ block[2][0]
+ strh r3, [r1, #64]
+ vdup.16 d18, r12
+ ldrsh r12, [r1, #96] @ block[3][0]
+ strh r3, [r1, #96]
+ vdup.16 d19, r12
+ vadd.s16 q8, q8, q10 @ dc += 4
+ vadd.s16 q9, q9, q10
+ vshr.s16 q8, q8, #3 @ dc >>= 3
+ vshr.s16 q9, q9, #3
+ add r3, r0, r2, lsl #2 @ dst2 = dst + 4 * stride
+
+ vld1.8 {d0}, [r0]
+ vld1.8 {d1}, [r3]
+ vaddw.u8 q1, q8, d0
+ vaddw.u8 q2, q9, d1
+ vqmovun.s16 d0, q1 @ saturate back to u8
+ vqmovun.s16 d1, q2
+ vst1.8 {d0}, [r0], r2
+ vst1.8 {d1}, [r3], r2
+
+ vld1.8 {d0}, [r0]
+ vld1.8 {d1}, [r3]
+ vaddw.u8 q1, q8, d0
+ vaddw.u8 q2, q9, d1
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.8 {d0}, [r0], r2
+ vst1.8 {d1}, [r3], r2
+
+ vld1.8 {d0}, [r0]
+ vld1.8 {d1}, [r3]
+ vaddw.u8 q1, q8, d0
+ vaddw.u8 q2, q9, d1
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.8 {d0}, [r0], r2
+ vst1.8 {d1}, [r3], r2
+
+ vld1.8 {d0}, [r0]
+ vld1.8 {d1}, [r3]
+ vaddw.u8 q1, q8, d0
+ vaddw.u8 q2, q9, d1
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.8 {d0}, [r0], r2
+ vst1.8 {d1}, [r3], r2
+
+ bx lr
+endfunc
+@}
+
+@ void vp8_idct_dc_add4y_neon(uint8_t * dst, DCTELEM block[4][16], int stride)
+@{
+function vp8_idct_dc_add4y_neon, export=1
+ mov r3, #0
+ vmov.u16 q10, #4
+ ldrsh r12, [r1, #0] @ block[0][0]
+ strh r3, [r1, #0]
+ vdup.16 d16, r12
+ ldrsh r12, [r1, #32] @ block[1][0]
+ strh r3, [r1, #32]
+ vdup.16 d17, r12
+ ldrsh r12, [r1, #64] @ block[2][0]
+ strh r3, [r1, #64]
+ vdup.16 d18, r12
+ ldrsh r12, [r1, #96] @ block[3][0]
+ strh r3, [r1, #96]
+ vdup.16 d19, r12
+ vadd.s16 q8, q8, q10 @ dc += 4
+ vadd.s16 q9, q9, q10
+ vshr.s16 q8, q8, #3 @ dc >>= 3
+ vshr.s16 q9, q9, #3
+
+ vld1.8 {q0}, [r0]
+ vaddw.u8 q1, q8, d0
+ vaddw.u8 q2, q9, d1
+ vqmovun.s16 d0, q1 @ saturate back to u8
+ vqmovun.s16 d1, q2
+ vst1.8 {q0}, [r0], r2
+
+ vld1.8 {q0}, [r0]
+ vaddw.u8 q1, q8, d0
+ vaddw.u8 q2, q9, d1
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.8 {q0}, [r0], r2
+
+ vld1.8 {q0}, [r0]
+ vaddw.u8 q1, q8, d0
+ vaddw.u8 q2, q9, d1
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.8 {q0}, [r0], r2
+
+ vld1.8 {q0}, [r0]
+ vaddw.u8 q1, q8, d0
+ vaddw.u8 q2, q9, d1
+ vqmovun.s16 d0, q1
+ vqmovun.s16 d1, q2
+ vst1.8 {q0}, [r0], r2
+
+ bx lr
+endfunc
+@}
+
@ Register layout:
@ P3..Q3 -> q0..q7
@ flim_E -> q14
--
1.7.1.1