[FFmpeg-devel] [PATCH 3/6] avcodec/hevcdsp: Add NEON optimization for whole-pixel interpolation
Shengbin Meng
shengbinmeng at gmail.com
Wed Nov 22 13:12:03 EET 2017
New code is written for qpel, and the qpel code is then reused for epel,
because whole-pixel interpolation is identical for qpel and epel.
Signed-off-by: Shengbin Meng <shengbinmeng at gmail.com>
---
libavcodec/arm/hevcdsp_init_neon.c | 106 ++++++++++++++++++++++
libavcodec/arm/hevcdsp_qpel_neon.S | 177 +++++++++++++++++++++++++++++++++++++
2 files changed, 283 insertions(+)
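
Note for reviewers: the sketch below is a minimal scalar C rendering of the
8-bit whole-pixel (mx == my == 0) formulas the new NEON code is intended to
mirror. The function and helper names, the standalone clip routine, and the
convention that strides are counted in elements of the respective type are
illustrative only, not the actual FFmpeg template code.

    #include <stdint.h>
    #include <stddef.h>

    static inline uint8_t clip_u8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* bi, unweighted: combine the 8-bit src with the 16-bit src2 plane */
    static void put_pixels_bi_c(uint8_t *dst, ptrdiff_t dststride,
                                const uint8_t *src, ptrdiff_t srcstride,
                                const int16_t *src2, ptrdiff_t src2stride,
                                int width, int height)
    {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++)
                dst[x] = clip_u8(((src[x] << 6) + src2[x] + 64) >> 7);
            dst  += dststride;
            src  += srcstride;
            src2 += src2stride;
        }
    }

    /* uni, weighted: scale by wx, rounding-shift by denom + 6, add ox */
    static void put_pixels_uni_w_c(uint8_t *dst, ptrdiff_t dststride,
                                   const uint8_t *src, ptrdiff_t srcstride,
                                   int width, int height,
                                   int denom, int wx, int ox)
    {
        const int shift = denom + 6;
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++)
                dst[x] = clip_u8((((src[x] << 6) * wx +
                                   (1 << (shift - 1))) >> shift) + ox);
            dst += dststride;
            src += srcstride;
        }
    }

    /* bi, weighted: both planes weighted, offsets folded in before the shift */
    static void put_pixels_bi_w_c(uint8_t *dst, ptrdiff_t dststride,
                                  const uint8_t *src, ptrdiff_t srcstride,
                                  const int16_t *src2, ptrdiff_t src2stride,
                                  int width, int height, int denom,
                                  int wx0, int wx1, int ox0, int ox1)
    {
        const int shift = denom + 6;
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++)
                dst[x] = clip_u8(((src[x] << 6) * wx1 + src2[x] * wx0 +
                                  ((ox0 + ox1 + 1) << shift)) >> (shift + 1));
            dst  += dststride;
            src  += srcstride;
            src2 += src2stride;
        }
    }

The NEON versions widen (src << 6) and src2 to 32 bits for the weighted
cases, presumably so the multiplications and the (denom + 6)-bit shifts
cannot overflow before the final saturating narrow back to 8 bits.
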
diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
index 9d885a62a9..6171863113 100644
--- a/libavcodec/arm/hevcdsp_init_neon.c
+++ b/libavcodec/arm/hevcdsp_init_neon.c
@@ -71,6 +71,10 @@ static void (*put_hevc_epel_uw_neon[8][8])(uint8_t *dst, ptrdiff_t dststride, ui
int width, int height, int16_t* src2, ptrdiff_t src2stride);
static void (*put_hevc_epel_wt_neon[8][8])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
int width, int height, int denom, int wx1, int ox1, int wx0, int ox0, int16_t* src2, ptrdiff_t src2stride);
+static void (*put_hevc_qpel_bi_uw_pixels_neon[1])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int width, int height, int16_t* src2, ptrdiff_t src2stride);
+static void (*put_hevc_qpel_wt_pixels_neon[1])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
+ int width, int height, int denom, int wx1, int ox1, int wx0, int ox0, int16_t* src2, ptrdiff_t src2stride);
void ff_hevc_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
int height, intptr_t mx, intptr_t my, int width);
void ff_hevc_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
@@ -101,6 +105,17 @@ void ff_hevc_put_epel_bi_w_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8
int16_t *src2,
int height, int denom, int wx0, int wx1,
int ox0, int ox1, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width);
#define QPEL_FUNC(name) \
void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
@@ -154,6 +169,7 @@ QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h2v3_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v1_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v2_neon_8);
QPEL_FUNC_UW(ff_hevc_put_qpel_uw_h3v3_neon_8);
+QPEL_FUNC_UW(ff_hevc_put_qpel_bi_uw_pixels_neon_8);
#undef QPEL_FUNC_UW
#define QPEL_FUNC_WT(name) \
@@ -174,6 +190,7 @@ QPEL_FUNC_WT(ff_hevc_put_qpel_wt_h2v3_neon_8);
QPEL_FUNC_WT(ff_hevc_put_qpel_wt_h3v1_neon_8);
QPEL_FUNC_WT(ff_hevc_put_qpel_wt_h3v2_neon_8);
QPEL_FUNC_WT(ff_hevc_put_qpel_wt_h3v3_neon_8);
+QPEL_FUNC_WT(ff_hevc_put_qpel_wt_pixels_neon_8);
#undef QPEL_FUNC_WT
@@ -441,6 +458,26 @@ void ff_hevc_put_epel_bi_w_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8
put_hevc_epel_wt_neon[my][mx](dst, dststride, src, srcstride, width, height, denom, wx1, ox1, wx0, ox0, src2, MAX_PB_SIZE);
}
+void ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2,
+ int height, intptr_t mx, intptr_t my, int width) {
+ put_hevc_qpel_bi_uw_pixels_neon[0](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+}
+
+void ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper(uint8_t *dst, ptrdiff_t dststride,
+ uint8_t *src, ptrdiff_t srcstride,
+ int height, int denom, int wx, int ox,
+ intptr_t mx, intptr_t my, int width) {
+ put_hevc_qpel_wt_pixels_neon[0](dst, dststride, src, srcstride, width, height, denom, wx, ox, 0, 0, NULL, 0);
+}
+
+void ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int16_t *src2,
+ int height, int denom, int wx0, int wx1,
+ int ox0, int ox1, intptr_t mx, intptr_t my, int width) {
+ put_hevc_qpel_wt_pixels_neon[0](dst, dststride, src, srcstride, width, height, denom, wx1, ox1, wx0, ox0, src2, MAX_PB_SIZE);
+}
+
av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
{
if (bit_depth == 8) {
@@ -505,6 +542,8 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
put_hevc_qpel_wt_neon[3][1] = ff_hevc_put_qpel_wt_h1v3_neon_8;
put_hevc_qpel_wt_neon[3][2] = ff_hevc_put_qpel_wt_h2v3_neon_8;
put_hevc_qpel_wt_neon[3][3] = ff_hevc_put_qpel_wt_h3v3_neon_8;
+ put_hevc_qpel_wt_pixels_neon[0] = ff_hevc_put_qpel_wt_pixels_neon_8;
+ put_hevc_qpel_bi_uw_pixels_neon[0] = ff_hevc_put_qpel_bi_uw_pixels_neon_8;
put_hevc_epel_neon[1][0] = ff_hevc_put_epel_v1_neon_8;
put_hevc_epel_neon[2][0] = ff_hevc_put_epel_v2_neon_8;
put_hevc_epel_neon[3][0] = ff_hevc_put_epel_v3_neon_8;
@@ -745,5 +784,72 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth)
c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_qpel_uw_pixels_w32_neon_8;
c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+
+ c->put_hevc_qpel_bi[1][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi[3][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+
+ c->put_hevc_qpel_uni_w[1][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_uni_w[3][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_uni_w[5][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_uni_w[6][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_uni_w[7][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_uni_w[8][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_uni_w[9][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+
+ c->put_hevc_qpel_bi_w[1][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi_w[3][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi_w[5][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi_w[6][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi_w[7][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi_w[8][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_qpel_bi_w[9][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+
+ c->put_hevc_epel[0][0][0] = ff_hevc_put_pixels_w2_neon_8;
+ c->put_hevc_epel[1][0][0] = ff_hevc_put_pixels_w4_neon_8;
+ c->put_hevc_epel[2][0][0] = ff_hevc_put_pixels_w6_neon_8;
+ c->put_hevc_epel[3][0][0] = ff_hevc_put_pixels_w8_neon_8;
+ c->put_hevc_epel[4][0][0] = ff_hevc_put_pixels_w12_neon_8;
+ c->put_hevc_epel[5][0][0] = ff_hevc_put_pixels_w16_neon_8;
+ c->put_hevc_epel[6][0][0] = ff_hevc_put_pixels_w24_neon_8;
+ c->put_hevc_epel[7][0][0] = ff_hevc_put_pixels_w32_neon_8;
+ c->put_hevc_epel[8][0][0] = ff_hevc_put_pixels_w48_neon_8;
+ c->put_hevc_epel[9][0][0] = ff_hevc_put_pixels_w64_neon_8;
+
+ c->put_hevc_epel_uni[1][0][0] = ff_hevc_put_qpel_uw_pixels_w4_neon_8;
+ c->put_hevc_epel_uni[3][0][0] = ff_hevc_put_qpel_uw_pixels_w8_neon_8;
+ c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_qpel_uw_pixels_w16_neon_8;
+ c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_qpel_uw_pixels_w24_neon_8;
+ c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_qpel_uw_pixels_w32_neon_8;
+ c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8;
+ c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8;
+
+ c->put_hevc_epel_bi[1][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_epel_bi[3][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+ c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_qpel_bi_uw_pixels_neon_wrapper;
+
+ c->put_hevc_epel_uni_w[1][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_uni_w[3][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_uni_w[5][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_uni_w[6][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_uni_w[7][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_uni_w[8][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_uni_w[9][0][0] = ff_hevc_put_qpel_uni_wt_pixels_neon_wrapper;
+
+ c->put_hevc_epel_bi_w[1][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_bi_w[3][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_bi_w[5][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_bi_w[6][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_bi_w[7][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_bi_w[8][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
+ c->put_hevc_epel_bi_w[9][0][0] = ff_hevc_put_qpel_bi_wt_pixels_neon_wrapper;
}
}
diff --git a/libavcodec/arm/hevcdsp_qpel_neon.S b/libavcodec/arm/hevcdsp_qpel_neon.S
index e188b215ba..71ecc00b6e 100644
--- a/libavcodec/arm/hevcdsp_qpel_neon.S
+++ b/libavcodec/arm/hevcdsp_qpel_neon.S
@@ -1506,3 +1506,180 @@ put_qpel_uw_pixels 24, d0-d2, d3-d5, d16-d18, d19-d21
put_qpel_uw_pixels 32, q0-q1, q2-q3, q8-q9, q10-q11
put_qpel_uw_pixels_m 48, q0-q1, q2, q8-q9, q10
put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11
+
+function ff_hevc_put_qpel_bi_uw_pixels_neon_8, export=1
+ push {r4-r10}
+ ldr r5, [sp, #28] // width
+ ldr r4, [sp, #32] // height
+ ldr r8, [sp, #36] // src2
+ ldr r9, [sp, #40] // src2stride
+ vpush {d8-d15}
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ lsl r9, #1
+ mov r10, r8
+0: pld [r2]
+ vld1.8 {d8}, [r2], r3 // load 8x8bit src
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ vshll.u8 q7 , d8, #6 // src[x] << 6 and move long to 8x16bit
+ vld1.16 {q0}, [r8], r9 // load 8x16bit src2
+ vqadd.s16 q0, q7 // ((src << 6) + src2) on 8x16bit operation
+ vqrshrun.s16 d0, q0, #7 // (((src << 6) + src2) + offset) >> 7 narrow to 8x8bit
+ vst1.8 d0, [r0], r1
+ vld1.8 {d8}, [r2], r3 // load 8x8bit src
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r10, #16
+ mov r8, r10
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ vshll.u8 q7 , d8, #6 // src[x] << 6 and move long to 8x16bit
+ vld1.16 d0, [r8], r9
+ vqadd.s16 d0, d14
+ vqrshrun.s16 d0, q0, #7
+ vst1.32 d0[0], [r0], r1
+ vld1.32 {d8[0]}, [r2], r3
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4-r10}
+ bx lr
+endfunc
+
+function ff_hevc_put_qpel_wt_pixels_neon_8, export=1
+ push {r4-r12}
+ ldr r5, [sp, #36] // width
+ ldr r4, [sp, #40] // height
+ ldr r8, [sp, #44] // denom
+ ldr r9, [sp, #48] // wx
+ ldr r10,[sp, #52] // ox
+ ldr r11,[sp, #64] // src2
+ vpush {d8-d15}
+ mov r12, r4
+ mov r6, r0
+ mov r7, r2
+ add r8, #6 // weight shift = denom + 6
+ vdup.32 q5, r8 // the shift is done on 32-bit lanes
+ vneg.s32 q4, q5 // q4 = -q5
+ vdup.32 q6, r9 // q6 wx
+ vdup.32 q5, r10 // q5 ox
+ cmp r11, #0
+ bne .bi
+0: pld [r2]
+ vld1.8 {d16}, [r2], r3
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ vshll.u8 q7 , d16, #6 // src[x] << 6 and move long to 8x16bit
+ vmovl.u16 q12, d14 // extending unsigned 4x16bit data to 4x32 bit
+ vmovl.u16 q13, d15
+ vmul.u32 q14, q12, q6
+ vmul.u32 q15, q13, q6
+ vqrshl.u32 q12, q14, q4
+ vqrshl.u32 q13, q15, q4
+ vadd.u32 q14, q12, q5
+ vadd.u32 q15, q13, q5
+ vqmovun.s32 d2, q14 // narrow
+ vqmovun.s32 d3, q15 // narrow
+ vqmovn.u16 d0, q1
+ vst1.8 d0, [r0], r1 // store 8 pixels
+ vld1.8 {d16}, [r2], r3
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ vshll.u8 q7 , d16, #6 // src[x] << 6 and move long to 8x16bit
+ vmovl.u16 q12, d14 // extending unsigned 4x16bit data to 4x32 bit
+ vmul.u32 q14, q12, q6
+ vqrshl.u32 q12, q14, q4
+ vadd.u32 q14, q12, q5
+ vqmovun.s32 d14, q14
+ vqmovn.u16 d0, q7
+ vst1.32 d0[0], [r0], r1
+ vld1.32 {d16[0]}, [r2], r3
+ bne 4b
+ b 99f
+.bi:
+ ldr r8, [sp, #120] // wx0
+ vdup.32 q1, r8 // q1 wx0
+ ldr r8, [sp, #124] // ox0
+ vdup.32 q2, r8 // q2 ox0
+ vadd.s32 q2, q5 // q2 = ox0 + ox1
+ vmov.s32 q10, #1
+ vadd.s32 q2, q10 // q2 = ox0 + ox1 + 1
+ vneg.s32 q15, q4 // q15 = -q4
+ vqrshl.s32 q3, q2, q15 // q3 = (ox0 + ox1 + 1)<<shift
+ vsub.s32 q4, q10
+ ldr r9, [sp, #132] // src2stride
+ lsl r9, #1
+ mov r10, r11 // r10 holds the start of src2
+0: pld [r2]
+ vld1.8 {d16}, [r2], r3
+ cmp r5, #4
+ beq 4f
+8: subs r4, #1
+ vshll.u8 q7, d16, #6 // src[x] << 6 and move long to 8x16bit
+ vmovl.s16 q12, d14 // extending signed 4x16bit data to 4x32 bit
+ vmovl.s16 q13, d15
+ vmul.s32 q14, q12, q6 // src * w1
+ vmul.s32 q15, q13, q6 // src * w1
+ vld1.16 {q0}, [r11], r9 // load 8x16 bit pixels from src2 to q0
+ vmovl.s16 q2, d0 // extend signed 4x16bit to 4x32 bit
+ vmovl.s16 q5, d1
+ vmul.s32 q2, q1 // src2 * w0
+ vmul.s32 q5, q1 // src2 * w0
+ vadd.s32 q14, q2 // src * w1 + src2 * w0
+ vadd.s32 q15, q5 // src * w1 + src2 * w0
+ vadd.s32 q14, q3 // src * w1 + src2 * w0 + ((ox0 + ox1 + 1) << shift)
+ vadd.s32 q15, q3
+ vqshl.s32 q12, q14, q4 // shift
+ vqshl.s32 q13, q15, q4 // shift
+ vqmovun.s32 d28, q12 // narrow
+ vqmovun.s32 d29, q13 // narrow
+ vqmovn.u16 d0, q14 // narrow
+ vst1.8 d0, [r0], r1
+ vld1.8 {d16}, [r2], r3
+ bne 8b
+ subs r5, #8
+ beq 99f
+ mov r4, r12
+ add r6, #8
+ mov r0, r6
+ add r10, #16
+ mov r11, r10
+ add r7, #8
+ mov r2, r7
+ b 0b
+4: subs r4, #1
+ vshll.u8 q7, d16, #6 // src[x] << 6 and move long to 8x16bit
+ vmovl.s16 q12, d14 // extending signed 4x16bit data to 4x32 bit
+ vmul.s32 q14, q12, q6 // src * w1
+ vld1.16 d0, [r11], r9 // load 8x16 bit pixels from src2 to q0
+ vmovl.s16 q2, d0 // extend signed 4x16bit to 4x32 bit
+ vmul.s32 q2, q1 // src2 * w0
+ vadd.s32 q14, q2 // src * w1 + src2 * w0
+ vadd.s32 q14, q3 // src * w1 + src2 * w0 + ((ox0 + ox1 + 1) << shift)
+ vqshl.s32 q12, q14, q4 // shift
+ vqmovun.s32 d28, q12 // narrow
+ vqmovn.u16 d0, q14 // narrow
+ vst1.32 d0[0], [r0], r1
+ vld1.32 {d16[0]}, [r2], r3
+ bne 4b
+99: vpop {d8-d15}
+ pop {r4-r12}
+ bx lr
+endfunc
--
2.13.6 (Apple Git-96)