[FFmpeg-devel] [PATCH 4/4] avcodec/mips: MSA (MIPS-SIMD-Arch) optimizations for VP9 intra functions
shivraj.patil at imgtec.com
shivraj.patil at imgtec.com
Thu Jul 9 15:15:29 CEST 2015
From: Shivraj Patil <shivraj.patil at imgtec.com>
This patch adds MSA (MIPS-SIMD-Arch) optimizations for the VP9 intra prediction functions in the new file vp9_intra_msa.c.
Signed-off-by: Shivraj Patil <shivraj.patil at imgtec.com>
---
libavcodec/mips/Makefile | 3 +-
libavcodec/mips/vp9_intra_msa.c | 880 +++++++++++++++++++++++++++++++++++++
libavcodec/mips/vp9dsp_init_mips.c | 23 +
libavcodec/mips/vp9dsp_mips.h | 73 +++
4 files changed, 978 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/mips/vp9_intra_msa.c
diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 7dca55d..b71d2c4 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -44,7 +44,8 @@ MSA-OBJS-$(CONFIG_HEVC_DECODER) += mips/hevcdsp_msa.o \
mips/hevcpred_msa.o
MSA-OBJS-$(CONFIG_VP9_DECODER) += mips/vp9_mc_msa.o \
mips/vp9_lpf_msa.o \
- mips/vp9_idct_msa.o
+ mips/vp9_idct_msa.o \
+ mips/vp9_intra_msa.o
MSA-OBJS-$(CONFIG_H264DSP) += mips/h264dsp_msa.o \
mips/h264idct_msa.o
MSA-OBJS-$(CONFIG_H264QPEL) += mips/h264qpel_msa.o
diff --git a/libavcodec/mips/vp9_intra_msa.c b/libavcodec/mips/vp9_intra_msa.c
new file mode 100644
index 0000000..e29e727
--- /dev/null
+++ b/libavcodec/mips/vp9_intra_msa.c
@@ -0,0 +1,880 @@
+/*
+ * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil at imgtec.com)
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/vp9dsp.h"
+#include "libavutil/mips/generic_macros_msa.h"
+#include "vp9dsp_mips.h"
+/* In-place unsigned saturating halfword subtract (__msa_subs_u_h): out0 -= in0, out1 -= in1. */
+#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
+{ \
+ out0 = __msa_subs_u_h(out0, in0); \
+ out1 = __msa_subs_u_h(out1, in1); \
+}
+/* Vertical prediction, 4x4: the word read from src is written to four dst rows. */
+static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride)
+{
+ uint32_t src_data;
+ /* LW/SW4 come from generic_macros_msa.h; presumably a 32-bit load and four 32-bit stores dst_stride apart - confirm there. */
+ src_data = LW(src);
+
+ SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
+}
+/* Vertical prediction, 8x8: the two words read at src and src + 4 are written to each of 8 dst rows. */
+static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride)
+{
+ uint32_t row;
+ uint32_t src_data1, src_data2;
+
+ src_data1 = LW(src);
+ src_data2 = LW(src + 4);
+ /* row counts down from 8; the post-decrement test ends the loop after 8 iterations. */
+ for (row = 8; row--;) {
+ SW(src_data1, dst);
+ SW(src_data2, (dst + 4));
+ dst += dst_stride;
+ }
+}
+/* Vertical prediction, 16x16: one vector read from src is stored to each of 16 dst rows. */
+static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride)
+{
+ uint32_t row;
+ v16u8 src0;
+ /* LD_UB/ST_UB are project macros; presumably a 16-byte vector load/store - confirm in generic_macros_msa.h. */
+ src0 = LD_UB(src);
+
+ for (row = 16; row--;) {
+ ST_UB(src0, dst);
+ dst += dst_stride;
+ }
+}
+/* Vertical prediction, 32x32: the vectors read at src and src + 16 are stored to each of 32 dst rows. */
+static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
+ int32_t dst_stride)
+{
+ uint32_t row;
+ v16u8 src1, src2;
+
+ src1 = LD_UB(src);
+ src2 = LD_UB(src + 16);
+ /* ST_UB2(a, b, p, 16) presumably stores a at p and b at p + 16 - confirm in generic_macros_msa.h. */
+ for (row = 32; row--;) {
+ ST_UB2(src1, src2, dst, 16);
+ dst += dst_stride;
+ }
+}
+/* Horizontal prediction, 4x4: row i of dst is the byte src[i * src_stride] replicated into a 32-bit word. */
+static void intra_predict_horiz_4x4_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride)
+{
+ uint32_t out0, out1, out2, out3;
+ /* U suffix: the byte promotes to int, and 255 * 0x01010101 as signed int would overflow (undefined behaviour). */
+ out0 = src[0 * src_stride] * 0x01010101U;
+ out1 = src[1 * src_stride] * 0x01010101U;
+ out2 = src[2 * src_stride] * 0x01010101U;
+ out3 = src[3 * src_stride] * 0x01010101U;
+ /* SW4 is a project macro; presumably four 32-bit stores dst_stride apart - confirm in generic_macros_msa.h. */
+ SW4(out0, out1, out2, out3, dst, dst_stride);
+}
+/* Horizontal prediction, 8x8: row i of dst is the byte src[i * src_stride] replicated into a 64-bit value. */
+static void intra_predict_horiz_8x8_msa(const uint8_t *src, int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride)
+{
+ uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
+ /* ULL suffix: the unsuffixed constant has a signed 64-bit type, so bytes >= 128 would overflow it (undefined behaviour). */
+ out0 = src[0 * src_stride] * 0x0101010101010101ULL;
+ out1 = src[1 * src_stride] * 0x0101010101010101ULL;
+ out2 = src[2 * src_stride] * 0x0101010101010101ULL;
+ out3 = src[3 * src_stride] * 0x0101010101010101ULL;
+ out4 = src[4 * src_stride] * 0x0101010101010101ULL;
+ out5 = src[5 * src_stride] * 0x0101010101010101ULL;
+ out6 = src[6 * src_stride] * 0x0101010101010101ULL;
+ out7 = src[7 * src_stride] * 0x0101010101010101ULL;
+ /* SD4 is a project macro; presumably four 64-bit stores dst_stride apart - confirm in generic_macros_msa.h. */
+ SD4(out0, out1, out2, out3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ SD4(out4, out5, out6, out7, dst, dst_stride);
+}
+/* Horizontal prediction, 16x16: each dst row is one src byte splatted across a vector; src advances by src_stride per row. */
+static void intra_predict_horiz_16x16_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride)
+{
+ uint32_t row;
+ uint8_t inp0, inp1, inp2, inp3;
+ v16u8 src0, src1, src2, src3;
+ /* 4 iterations, each producing 4 rows. */
+ for (row = 4; row--;) {
+ inp0 = src[0];
+ src += src_stride;
+ inp1 = src[0];
+ src += src_stride;
+ inp2 = src[0];
+ src += src_stride;
+ inp3 = src[0];
+ src += src_stride;
+
+ src0 = (v16u8) __msa_fill_b(inp0);
+ src1 = (v16u8) __msa_fill_b(inp1);
+ src2 = (v16u8) __msa_fill_b(inp2);
+ src3 = (v16u8) __msa_fill_b(inp3);
+ /* ST_UB4 presumably stores the four vectors to four consecutive rows dst_stride apart - confirm in generic_macros_msa.h. */
+ ST_UB4(src0, src1, src2, src3, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+/* Horizontal prediction, 32x32: each dst row is one src byte splatted across two vectors; src advances by src_stride per row. */
+static void intra_predict_horiz_32x32_msa(const uint8_t *src,
+ int32_t src_stride,
+ uint8_t *dst, int32_t dst_stride)
+{
+ uint32_t row;
+ uint8_t inp0, inp1;
+ v16u8 src0, src1;
+ /* 16 iterations, each producing 2 rows. */
+ for (row = 16; row--;) {
+ inp0 = src[0];
+ src += src_stride;
+ inp1 = src[0];
+ src += src_stride;
+
+ src0 = (v16u8) __msa_fill_b(inp0);
+ src1 = (v16u8) __msa_fill_b(inp1);
+ /* The same vector is passed twice; ST_UB2(…, dst, 16) presumably writes it at dst and dst + 16 - confirm. */
+ ST_UB2(src0, src0, dst, 16);
+ dst += dst_stride;
+ ST_UB2(src1, src1, dst, 16);
+ dst += dst_stride;
+ }
+}
+/* DC prediction, 4x4: fills dst with the rounded mean of the top row and/or left column, or with 128 when neither flag is set. */
+static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
+ const uint8_t *src_left,
+ int32_t src_stride_left,
+ uint8_t *dst, int32_t dst_stride,
+ uint8_t is_above, uint8_t is_left)
+{
+ uint32_t row;
+ uint32_t out, addition = 0;
+ v16u8 src_above, store;
+ v8u16 sum_above;
+ v4u32 sum;
+ /* NOTE(review): LD_UB presumably loads 16 bytes from src_top although only the first 4 reach word 0 of the sum - confirm the over-read is safe for callers. */
+ if (is_left && is_above) {
+ src_above = LD_UB(src_top);
+ /* Two pairwise widening adds leave the sum of bytes 0..3 in word 0. */
+ sum_above = __msa_hadd_u_h(src_above, src_above);
+ sum = __msa_hadd_u_w(sum_above, sum_above);
+ addition = __msa_copy_u_w((v4i32) sum, 0);
+
+ for (row = 0; row < 4; row++) {
+ addition += src_left[row * src_stride_left];
+ }
+ /* 4 top + 4 left samples: add 4 and shift by 3 for a rounded mean. */
+ addition = (addition + 4) >> 3;
+ store = (v16u8) __msa_fill_b(addition);
+ } else if (is_left) {
+ for (row = 0; row < 4; row++) {
+ addition += src_left[row * src_stride_left];
+ }
+ /* 4 left samples: add 2 and shift by 2 for a rounded mean. */
+ addition = (addition + 2) >> 2;
+ store = (v16u8) __msa_fill_b(addition);
+ } else if (is_above) {
+ src_above = LD_UB(src_top);
+ /* srari_w is the MSA rounding right shift; byte 0 of the result is then splatted. */
+ sum_above = __msa_hadd_u_h(src_above, src_above);
+ sum = __msa_hadd_u_w(sum_above, sum_above);
+ sum = (v4u32) __msa_srari_w((v4i32) sum, 2);
+ store = (v16u8) __msa_splati_b((v16i8) sum, 0);
+ } else {
+ store = (v16u8) __msa_ldi_b(128);
+ }
+ /* Every byte of store is the same value, so word 0 is one 4-pixel row. */
+ out = __msa_copy_u_w((v4i32) store, 0);
+
+ for (row = 4; row--;) {
+ SW(out, dst);
+ dst += dst_stride;
+ }
+}
+/* DC prediction, 8x8: fills dst with the rounded mean of the top row and/or left column, or with 128 when neither flag is set. */
+static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
+ const uint8_t *src_left,
+ int32_t src_stride_left,
+ uint8_t *dst, int32_t dst_stride,
+ uint8_t is_above, uint8_t is_left)
+{
+ uint32_t row;
+ uint32_t out, addition = 0;
+ v16u8 src_above, store;
+ v8u16 sum_above;
+ v4u32 sum_top;
+ v2u64 sum;
+ /* NOTE(review): LD_UB presumably loads 16 bytes from src_top although only the first 8 reach doubleword 0 of the sum - confirm the over-read is safe. */
+ if (is_left && is_above) {
+ src_above = LD_UB(src_top);
+ /* Three pairwise widening adds leave the sum of bytes 0..7 in doubleword 0; its low word is extracted. */
+ sum_above = __msa_hadd_u_h(src_above, src_above);
+ sum_top = __msa_hadd_u_w(sum_above, sum_above);
+ sum = __msa_hadd_u_d(sum_top, sum_top);
+ addition = __msa_copy_u_w((v4i32) sum, 0);
+
+ for (row = 0; row < 8; row++) {
+ addition += src_left[row * src_stride_left];
+ }
+ /* 8 top + 8 left samples: add 8 and shift by 4 for a rounded mean. */
+ addition = (addition + 8) >> 4;
+ store = (v16u8) __msa_fill_b(addition);
+ } else if (is_left) {
+ for (row = 0; row < 8; row++) {
+ addition += src_left[row * src_stride_left];
+ }
+ /* 8 left samples: add 4 and shift by 3 for a rounded mean. */
+ addition = (addition + 4) >> 3;
+ store = (v16u8) __msa_fill_b(addition);
+ } else if (is_above) {
+ src_above = LD_UB(src_top);
+ /* srari_d is the MSA rounding right shift; byte 0 of the result is then splatted. */
+ sum_above = __msa_hadd_u_h(src_above, src_above);
+ sum_top = __msa_hadd_u_w(sum_above, sum_above);
+ sum = __msa_hadd_u_d(sum_top, sum_top);
+ sum = (v2u64) __msa_srari_d((v2i64) sum, 3);
+ store = (v16u8) __msa_splati_b((v16i8) sum, 0);
+ } else {
+ store = (v16u8) __msa_ldi_b(128);
+ }
+ /* Every byte of store is the same value; word 0 is written twice per row to cover 8 pixels. */
+ out = __msa_copy_u_w((v4i32) store, 0);
+
+ for (row = 8; row--;) {
+ SW(out, dst);
+ SW(out, (dst + 4));
+ dst += dst_stride;
+ }
+}
+/* DC prediction, 16x16: fills dst with the rounded mean of the top row and/or left column, or with 128 when neither flag is set. */
+static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
+ const uint8_t *src_left,
+ int32_t src_stride_left,
+ uint8_t *dst, int32_t dst_stride,
+ uint8_t is_above, uint8_t is_left)
+{
+ uint32_t row;
+ uint32_t addition = 0;
+ v16u8 src_above, store;
+ v8u16 sum_above;
+ v4u32 sum_top;
+ v2u64 sum;
+
+ if (is_left && is_above) {
+ src_above = LD_UB(src_top);
+ /* Pairwise adds give one partial sum per doubleword; pckev_w moves both low words side by side so a final hadd_u_d totals all 16 bytes. */
+ sum_above = __msa_hadd_u_h(src_above, src_above);
+ sum_top = __msa_hadd_u_w(sum_above, sum_above);
+ sum = __msa_hadd_u_d(sum_top, sum_top);
+ sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+ sum = __msa_hadd_u_d(sum_top, sum_top);
+ addition = __msa_copy_u_w((v4i32) sum, 0);
+
+ for (row = 0; row < 16; row++) {
+ addition += src_left[row * src_stride_left];
+ }
+ /* 16 top + 16 left samples: add 16 and shift by 5 for a rounded mean. */
+ addition = (addition + 16) >> 5;
+ store = (v16u8) __msa_fill_b(addition);
+ } else if (is_left) {
+ for (row = 0; row < 16; row++) {
+ addition += src_left[row * src_stride_left];
+ }
+ /* 16 left samples: add 8 and shift by 4 for a rounded mean. */
+ addition = (addition + 8) >> 4;
+ store = (v16u8) __msa_fill_b(addition);
+ } else if (is_above) {
+ src_above = LD_UB(src_top);
+ /* Same reduction as above, then a rounding shift (srari_d) by 4 and a splat of byte 0. */
+ sum_above = __msa_hadd_u_h(src_above, src_above);
+ sum_top = __msa_hadd_u_w(sum_above, sum_above);
+ sum = __msa_hadd_u_d(sum_top, sum_top);
+ sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+ sum = __msa_hadd_u_d(sum_top, sum_top);
+ sum = (v2u64) __msa_srari_d((v2i64) sum, 4);
+ store = (v16u8) __msa_splati_b((v16i8) sum, 0);
+ } else {
+ store = (v16u8) __msa_ldi_b(128);
+ }
+ /* One vector store per row, 16 rows. */
+ for (row = 16; row--;) {
+ ST_UB(store, dst);
+ dst += dst_stride;
+ }
+}
+/* DC prediction, 32x32: fills dst with the rounded mean of the top row and/or left column, or with 128 when neither flag is set. */
+static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
+ const uint8_t *src_left,
+ int32_t src_stride_left,
+ uint8_t *dst, int32_t dst_stride,
+ uint8_t is_above, uint8_t is_left)
+{
+ uint32_t row;
+ uint32_t addition = 0;
+ v16u8 src_above1, src_above2, store;
+ v8u16 sum_above1, sum_above2, sum_above;
+ v4u32 sum_top;
+ v2u64 sum;
+
+ if (is_left && is_above) {
+ src_above1 = LD_UB(src_top);
+ src_above2 = LD_UB(src_top + 16);
+ /* HADD_UB2_UH is a project macro; presumably a pairwise byte-to-halfword add of each input - confirm in generic_macros_msa.h. */
+ HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
+ /* The two halves of the top row are merged before the reduction to a single total. */
+ sum_above = sum_above1 + sum_above2;
+ sum_top = __msa_hadd_u_w(sum_above, sum_above);
+ sum = __msa_hadd_u_d(sum_top, sum_top);
+ sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+ sum = __msa_hadd_u_d(sum_top, sum_top);
+ addition = __msa_copy_u_w((v4i32) sum, 0);
+
+ for (row = 0; row < 32; row++) {
+ addition += src_left[row * src_stride_left];
+ }
+ /* 32 top + 32 left samples: add 32 and shift by 6 for a rounded mean. */
+ addition = (addition + 32) >> 6;
+ store = (v16u8) __msa_fill_b(addition);
+ } else if (is_left) {
+ for (row = 0; row < 32; row++) {
+ addition += src_left[row * src_stride_left];
+ }
+ /* 32 left samples: add 16 and shift by 5 for a rounded mean. */
+ addition = (addition + 16) >> 5;
+ store = (v16u8) __msa_fill_b(addition);
+ } else if (is_above) {
+ src_above1 = LD_UB(src_top);
+ src_above2 = LD_UB(src_top + 16);
+
+ HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
+ /* Same reduction as above, then a rounding shift (srari_d) by 5 and a splat of byte 0. */
+ sum_above = sum_above1 + sum_above2;
+ sum_top = __msa_hadd_u_w(sum_above, sum_above);
+ sum = __msa_hadd_u_d(sum_top, sum_top);
+ sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
+ sum = __msa_hadd_u_d(sum_top, sum_top);
+ sum = (v2u64) __msa_srari_d((v2i64) sum, 5);
+ store = (v16u8) __msa_splati_b((v16i8) sum, 0);
+ } else {
+ store = (v16u8) __msa_ldi_b(128);
+ }
+ for (row = 32; row--;) {
+ ST_UB2(store, store, dst, 16);
+ dst += dst_stride;
+ }
+}
+/* Generates intra_predict_<val>dc_4x4_msa(): fills a 4x4 block with the constant byte val (instantiated for 127 and 129). */
+#define INTRA_PREDICT_VALDC_4X4_MSA(val) \
+static void intra_predict_##val##dc_4x4_msa(uint8_t *dst, \
+ int32_t dst_stride) \
+{ \
+ uint32_t row, out; \
+ v16i8 store; \
+ \
+ store = __msa_ldi_b(val); \
+ out = __msa_copy_u_w((v4i32) store, 0); \
+ \
+ for (row = 4; row--;) \
+ { \
+ SW(out, dst); \
+ dst += dst_stride; \
+ } \
+}
+
+INTRA_PREDICT_VALDC_4X4_MSA(127);
+INTRA_PREDICT_VALDC_4X4_MSA(129);
+/* Generates intra_predict_<val>dc_8x8_msa(): fills an 8x8 block with the constant byte val (instantiated for 127 and 129). */
+#define INTRA_PREDICT_VALDC_8X8_MSA(val) \
+static void intra_predict_##val##dc_8x8_msa(uint8_t *dst, \
+ int32_t dst_stride) \
+{ \
+ uint32_t row, out; \
+ v16i8 store; \
+ \
+ store = __msa_ldi_b(val); \
+ out = __msa_copy_u_w((v4i32) store, 0); \
+ \
+ for (row = 8; row--;) \
+ { \
+ SW(out, dst); \
+ SW(out, (dst + 4)); \
+ dst += dst_stride; \
+ } \
+}
+
+INTRA_PREDICT_VALDC_8X8_MSA(127);
+INTRA_PREDICT_VALDC_8X8_MSA(129);
+/* Generates intra_predict_<val>dc_16x16_msa(): fills a 16x16 block with the constant byte val (instantiated for 127 and 129). */
+#define INTRA_PREDICT_VALDC_16X16_MSA(val) \
+static void intra_predict_##val##dc_16x16_msa(uint8_t *dst, \
+ int32_t dst_stride) \
+{ \
+ uint32_t row; \
+ v16u8 store; \
+ \
+ store = (v16u8) __msa_ldi_b(val); \
+ \
+ for (row = 16; row--;) \
+ { \
+ ST_UB(store, dst); \
+ dst += dst_stride; \
+ } \
+}
+
+INTRA_PREDICT_VALDC_16X16_MSA(127);
+INTRA_PREDICT_VALDC_16X16_MSA(129);
+/* Generates intra_predict_<val>dc_32x32_msa(): fills a 32x32 block with the constant byte val (instantiated for 127 and 129). */
+#define INTRA_PREDICT_VALDC_32X32_MSA(val) \
+static void intra_predict_##val##dc_32x32_msa(uint8_t *dst, \
+ int32_t dst_stride) \
+{ \
+ uint32_t row; \
+ v16u8 store; \
+ \
+ store = (v16u8) __msa_ldi_b(val); \
+ \
+ for (row = 32; row--;) \
+ { \
+ ST_UB2(store, store, dst, 16); \
+ dst += dst_stride; \
+ } \
+}
+
+INTRA_PREDICT_VALDC_32X32_MSA(127);
+INTRA_PREDICT_VALDC_32X32_MSA(129);
+/* TM prediction, 4x4: per pixel, left[row] + top[col] - top_left, where top_left is src_top_ptr[-1] and the subtraction saturates at 0. */
+static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr,
+ const uint8_t *src_left,
+ int32_t src_left_stride,
+ uint8_t *dst, int32_t dst_stride)
+{
+ uint8_t top_left = src_top_ptr[-1];
+ v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
+ v16u8 src0, src1, src2, src3;
+ v8u16 src_top_left, vec0, vec1, vec2, vec3;
+ /* NOTE(review): LD_SB presumably loads 16 bytes from src_top_ptr although a 4-wide block needs only 4 - confirm the over-read is safe. */
+ src_top_left = (v8u16) __msa_fill_h(top_left);
+ src_top = LD_SB(src_top_ptr);
+ /* One splatted vector per row, walking src_left by src_left_stride. */
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left1 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left2 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left3 = __msa_fill_b(src_left[0]);
+ /* Presumably: interleave left/top bytes, pairwise-add them to halfwords (left + top), subtract top_left, clamp via SAT_UH4_UH(…, 7), pack to bytes - macro semantics to be confirmed in generic_macros_msa.h. */
+ ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+ src_left3, src_top, src0, src1, src2, src3);
+ HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+ SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+ ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+}
+/* TM prediction, 8x8: per pixel, left[row] + top[col] - top_left, where top_left is src_top_ptr[-1] and the subtraction saturates at 0. */
+static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr,
+ const uint8_t *src_left,
+ int32_t src_left_stride,
+ uint8_t *dst,
+ int32_t dst_stride)
+{
+ uint8_t top_left = src_top_ptr[-1];
+ uint32_t loop_cnt;
+ v16i8 src_top, src_left0, src_left1, src_left2, src_left3, tmp0, tmp1;
+ v8u16 src_top_left, vec0, vec1, vec2, vec3;
+ v16u8 src0, src1, src2, src3;
+
+ src_top = LD_SB(src_top_ptr);
+ src_top_left = (v8u16) __msa_fill_h(top_left);
+ /* 2 iterations, each producing 4 rows. */
+ for (loop_cnt = 2; loop_cnt--;) {
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left1 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left2 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left3 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ /* Presumably: interleave left/top bytes, pairwise-add to halfwords, subtract top_left, clamp, pack to bytes - macro semantics to be confirmed in generic_macros_msa.h. */
+ ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top,
+ src_left3, src_top, src0, src1, src2, src3);
+ HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
+ SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
+ PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
+ ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ dst += (4 * dst_stride);
+ }
+}
+/* TM prediction, 16x16: per pixel, left[row] + top[col] - top_left, where top_left is src_top_ptr[-1] and the subtraction saturates at 0. */
+static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr,
+ const uint8_t *src_left,
+ int32_t src_left_stride,
+ uint8_t *dst,
+ int32_t dst_stride)
+{
+ uint8_t top_left = src_top_ptr[-1];
+ uint32_t loop_cnt;
+ v16i8 src_top, src_left0, src_left1, src_left2, src_left3;
+ v8u16 src_top_left, res_r, res_l;
+
+ src_top = LD_SB(src_top_ptr);
+ src_top_left = (v8u16) __msa_fill_h(top_left);
+ /* 4 iterations, each producing 4 rows; every row repeats the same interleave / add / subtract / clamp / pack-and-store sequence. */
+ for (loop_cnt = 4; loop_cnt--;) {
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left1 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left2 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left3 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ /* res_r / res_l presumably hold the right and left 8-pixel halves of the row - confirm ILVRL_B2_UH in generic_macros_msa.h. */
+ ILVRL_B2_UH(src_left0, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+
+ ILVRL_B2_UH(src_left1, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+
+ ILVRL_B2_UH(src_left2, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+
+ ILVRL_B2_UH(src_left3, src_top, res_r, res_l);
+ HADD_UB2_UH(res_r, res_l, res_r, res_l);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l);
+ SAT_UH2_UH(res_r, res_l, 7);
+ PCKEV_ST_SB(res_r, res_l, dst);
+ dst += dst_stride;
+ }
+}
+/* TM prediction, 32x32: per pixel, left[row] + top[col] - top_left, where top_left is src_top[-1] and the subtraction saturates at 0. */
+static void intra_predict_tm_32x32_msa(const uint8_t *src_top,
+ const uint8_t *src_left,
+ int32_t src_left_stride,
+ uint8_t *dst,
+ int32_t dst_stride)
+{
+ uint8_t top_left = src_top[-1];
+ uint32_t loop_cnt;
+ v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3;
+ v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1;
+ /* The top row is held as two vectors: bytes 0..15 and 16..31. */
+ src_top0 = LD_SB(src_top);
+ src_top1 = LD_SB(src_top + 16);
+ src_top_left = (v8u16) __msa_fill_h(top_left);
+ /* 8 iterations, each producing 4 rows; each row is stored as two halves at dst and dst + 16. */
+ for (loop_cnt = 8; loop_cnt--;) {
+ src_left0 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left1 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left2 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ src_left3 = __msa_fill_b(src_left[0]);
+ src_left += src_left_stride;
+ /* Interleave / pairwise add / subtract top_left / clamp / pack-and-store; macro semantics to be confirmed in generic_macros_msa.h. */
+ ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+ res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+
+ ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+ res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+
+ ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+ res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+
+ ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1);
+ ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1);
+ HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1,
+ res_l1);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0);
+ IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1);
+ SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7);
+ PCKEV_ST_SB(res_r0, res_l0, dst);
+ PCKEV_ST_SB(res_r1, res_l1, dst + 16);
+ dst += dst_stride;
+ }
+}
+/* Exported 4x4 vertical predictor: forwards top, dst and stride to intra_predict_vert_4x4_msa(); left is unused. */
+void ff_vert_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_vert_4x4_msa(top, dst, stride);
+}
+/* Exported 8x8 vertical predictor: forwards top, dst and stride to intra_predict_vert_8x8_msa(); left is unused. */
+void ff_vert_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_vert_8x8_msa(top, dst, stride);
+}
+/* Exported 16x16 vertical predictor: forwards top, dst and stride to intra_predict_vert_16x16_msa(); left is unused. */
+void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_vert_16x16_msa(top, dst, stride);
+}
+/* Exported 32x32 vertical predictor: forwards top, dst and stride to intra_predict_vert_32x32_msa(); left is unused. */
+void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_vert_32x32_msa(top, dst, stride);
+}
+/* Exported 4x4 horizontal predictor: passes left + 3 with stride -1, so rows 0..3 use left[3] down to left[0]; top is unused. */
+void ff_hor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_horiz_4x4_msa(left + 3, -1, dst, stride);
+}
+/* Exported 8x8 horizontal predictor: passes left + 7 with stride -1, so rows 0..7 use left[7] down to left[0]; top is unused. */
+void ff_hor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_horiz_8x8_msa(left + 7, -1, dst, stride);
+}
+/* Exported 16x16 horizontal predictor: passes left + 15 with stride -1, so rows 0..15 use left[15] down to left[0]; top is unused. */
+void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_horiz_16x16_msa(left + 15, -1, dst, stride);
+}
+/* Exported 32x32 horizontal predictor: passes left + 31 with stride -1, so rows 0..31 use left[31] down to left[0]; top is unused. */
+void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_horiz_32x32_msa(left + 31, -1, dst, stride);
+}
+/* Exported 4x4 DC predictor: calls intra_predict_dc_4x4_msa() with left stride 1 and both is_above and is_left set. */
+void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_dc_4x4_msa(top, left, 1, dst, stride, 1, 1);
+}
+/* Exported 8x8 DC predictor: calls intra_predict_dc_8x8_msa() with left stride 1 and both is_above and is_left set. */
+void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_dc_8x8_msa(top, left, 1, dst, stride, 1, 1);
+}
+/* Exported 16x16 DC predictor: calls intra_predict_dc_16x16_msa() with left stride 1 and both is_above and is_left set. */
+void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_dc_16x16_msa(top, left, 1, dst, stride, 1, 1);
+}
+/* Exported 32x32 DC predictor: calls intra_predict_dc_32x32_msa() with left stride 1 and both is_above and is_left set. */
+void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_dc_32x32_msa(top, left, 1, dst, stride, 1, 1);
+}
+/* Exported 4x4 left-only DC predictor: calls intra_predict_dc_4x4_msa() with is_above = 0, is_left = 1. */
+void ff_dc_left_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_dc_4x4_msa(top, left, 1, dst, stride, 0, 1);
+}
+/* Exported 8x8 left-only DC predictor: calls intra_predict_dc_8x8_msa() with is_above = 0, is_left = 1. */
+void ff_dc_left_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_dc_8x8_msa(top, left, 1, dst, stride, 0, 1);
+}
+/* Exported 16x16 left-only DC predictor: calls intra_predict_dc_16x16_msa() with is_above = 0, is_left = 1. */
+void ff_dc_left_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_dc_16x16_msa(top, left, 1, dst, stride, 0, 1);
+}
+/* Exported 32x32 left-only DC predictor: calls intra_predict_dc_32x32_msa() with is_above = 0, is_left = 1. */
+void ff_dc_left_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_dc_32x32_msa(top, left, 1, dst, stride, 0, 1);
+}
+/* Exported 4x4 top-only DC predictor: calls intra_predict_dc_4x4_msa() with is_above = 1, is_left = 0. */
+void ff_dc_top_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_dc_4x4_msa(top, left, 1, dst, stride, 1, 0);
+}
+/* Exported 8x8 top-only DC predictor: calls intra_predict_dc_8x8_msa() with is_above = 1, is_left = 0. */
+void ff_dc_top_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_dc_8x8_msa(top, left, 1, dst, stride, 1, 0);
+}
+/* Exported 16x16 top-only DC predictor: calls intra_predict_dc_16x16_msa() with is_above = 1, is_left = 0. */
+void ff_dc_top_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_dc_16x16_msa(top, left, 1, dst, stride, 1, 0);
+}
+/* Exported 32x32 top-only DC predictor: calls intra_predict_dc_32x32_msa() with is_above = 1, is_left = 0. */
+void ff_dc_top_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_dc_32x32_msa(top, left, 1, dst, stride, 1, 0);
+}
+/* Exported 4x4 DC-128 predictor: calls intra_predict_dc_4x4_msa() with both flags 0, which takes its constant-128 fill branch. */
+void ff_dc_128_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_dc_4x4_msa(top, left, 1, dst, stride, 0, 0);
+}
+/* Exported 8x8 DC-128 predictor: calls intra_predict_dc_8x8_msa() with both flags 0, which takes its constant-128 fill branch. */
+void ff_dc_128_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_dc_8x8_msa(top, left, 1, dst, stride, 0, 0);
+}
+/* Exported 16x16 DC-128 predictor: calls intra_predict_dc_16x16_msa() with both flags 0, which takes its constant-128 fill branch. */
+void ff_dc_128_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_dc_16x16_msa(top, left, 1, dst, stride, 0, 0);
+}
+/* Exported 32x32 DC-128 predictor: calls intra_predict_dc_32x32_msa() with both flags 0, which takes its constant-128 fill branch. */
+void ff_dc_128_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_dc_32x32_msa(top, left, 1, dst, stride, 0, 0);
+}
+/* Exported 4x4 DC-127 predictor: constant fill via intra_predict_127dc_4x4_msa(); left and top are unused. */
+void ff_dc_127_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_127dc_4x4_msa(dst, stride);
+}
+/* Exported 8x8 DC-127 predictor: constant fill via intra_predict_127dc_8x8_msa(); left and top are unused. */
+void ff_dc_127_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_127dc_8x8_msa(dst, stride);
+}
+/* Exported 16x16 DC-127 predictor: constant fill via intra_predict_127dc_16x16_msa(); left and top are unused. */
+void ff_dc_127_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_127dc_16x16_msa(dst, stride);
+}
+/* Exported 32x32 DC-127 predictor: constant fill via intra_predict_127dc_32x32_msa(); left and top are unused. */
+void ff_dc_127_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_127dc_32x32_msa(dst, stride);
+}
+/* Exported 4x4 DC-129 predictor: constant fill via intra_predict_129dc_4x4_msa(); left and top are unused. */
+void ff_dc_129_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_129dc_4x4_msa(dst, stride);
+}
+/* Exported 8x8 DC-129 predictor: constant fill via intra_predict_129dc_8x8_msa(); left and top are unused. */
+void ff_dc_129_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top)
+{
+ intra_predict_129dc_8x8_msa(dst, stride);
+}
+/* Exported 16x16 DC-129 predictor: constant fill via intra_predict_129dc_16x16_msa(); left and top are unused. */
+void ff_dc_129_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_129dc_16x16_msa(dst, stride);
+}
+/* Exported 32x32 DC-129 predictor: constant fill via intra_predict_129dc_32x32_msa(); left and top are unused. */
+void ff_dc_129_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_129dc_32x32_msa(dst, stride);
+}
+/* Exported 4x4 TM predictor: passes top plus left + 3 with stride -1, so rows 0..3 use left[3] down to left[0]. */
+void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_tm_4x4_msa(top, left + 3, -1, dst, stride);
+}
+/* Exported 8x8 TM predictor: passes top plus left + 7 with stride -1, so rows 0..7 use left[7] down to left[0]. */
+void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_tm_8x8_msa(top, left + 7, -1, dst, stride);
+}
+/* Exported 16x16 TM predictor: passes top plus left + 15 with stride -1, so rows 0..15 use left[15] down to left[0]. */
+void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_tm_16x16_msa(top, left + 15, -1, dst, stride);
+}
+/* Exported 32x32 TM predictor: passes top plus left + 31 with stride -1, so rows 0..31 use left[31] down to left[0]. */
+void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top)
+{
+ intra_predict_tm_32x32_msa(top, left + 31, -1, dst, stride);
+}
diff --git a/libavcodec/mips/vp9dsp_init_mips.c b/libavcodec/mips/vp9dsp_init_mips.c
index 79cafb8..6c03601 100644
--- a/libavcodec/mips/vp9dsp_init_mips.c
+++ b/libavcodec/mips/vp9dsp_init_mips.c
@@ -24,6 +24,28 @@
#include "vp9dsp_mips.h"
#if HAVE_MSA
+static av_cold void vp9dsp_intrapred_init_msa(VP9DSPContext *dsp, int bpp)
+{ /* Installs the MSA intra predictors into dsp->intra_pred; nothing is changed unless bpp == 8. */
+ if (bpp == 8) {
+#define init_intra_pred_msa(tx, sz) \
+ dsp->intra_pred[tx][VERT_PRED] = ff_vert_##sz##_msa; \
+ dsp->intra_pred[tx][HOR_PRED] = ff_hor_##sz##_msa; \
+ dsp->intra_pred[tx][DC_PRED] = ff_dc_##sz##_msa; \
+ dsp->intra_pred[tx][LEFT_DC_PRED] = ff_dc_left_##sz##_msa; \
+ dsp->intra_pred[tx][TOP_DC_PRED] = ff_dc_top_##sz##_msa; \
+ dsp->intra_pred[tx][DC_128_PRED] = ff_dc_128_##sz##_msa; \
+ dsp->intra_pred[tx][DC_127_PRED] = ff_dc_127_##sz##_msa; \
+ dsp->intra_pred[tx][DC_129_PRED] = ff_dc_129_##sz##_msa; \
+ dsp->intra_pred[tx][TM_VP8_PRED] = ff_tm_##sz##_msa; \
+
+ init_intra_pred_msa(TX_4X4, 4x4); /* one expansion per transform size; each sets nine prediction modes */
+ init_intra_pred_msa(TX_8X8, 8x8);
+ init_intra_pred_msa(TX_16X16, 16x16);
+ init_intra_pred_msa(TX_32X32, 32x32);
+#undef init_intra_pred_msa
+ }
+}
+
static av_cold void vp9dsp_itxfm_init_msa(VP9DSPContext *dsp, int bpp)
{
if (bpp == 8) {
@@ -129,6 +151,7 @@ static av_cold void vp9dsp_loopfilter_init_msa(VP9DSPContext *dsp, int bpp)
static av_cold void vp9dsp_init_msa(VP9DSPContext *dsp, int bpp)
{
+ vp9dsp_intrapred_init_msa(dsp, bpp);
vp9dsp_itxfm_init_msa(dsp, bpp);
vp9dsp_mc_init_msa(dsp, bpp);
vp9dsp_loopfilter_init_msa(dsp, bpp);
diff --git a/libavcodec/mips/vp9dsp_mips.h b/libavcodec/mips/vp9dsp_mips.h
index 9f59d45..1df8fa9 100644
--- a/libavcodec/mips/vp9dsp_mips.h
+++ b/libavcodec/mips/vp9dsp_mips.h
@@ -150,4 +150,77 @@ void ff_idct_iadst_16x16_add_msa(uint8_t *pu8Dest, ptrdiff_t stride,
void ff_iwht_iwht_4x4_add_msa(uint8_t *dst, ptrdiff_t stride,
int16_t *block, int eob);
+void ff_vert_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_vert_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_vert_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_vert_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_hor_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_hor_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_hor_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_hor_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_left_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_left_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_left_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top);
+void ff_dc_left_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top);
+void ff_dc_top_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_top_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_top_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top);
+void ff_dc_top_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top);
+void ff_dc_128_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_128_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_128_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top);
+void ff_dc_128_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top);
+void ff_dc_127_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_127_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_127_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top);
+void ff_dc_127_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top);
+void ff_dc_129_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_129_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_dc_129_16x16_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top);
+void ff_dc_129_32x32_msa(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, const uint8_t *top);
+void ff_tm_4x4_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_tm_16x16_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+void ff_tm_32x32_msa(uint8_t *dst, ptrdiff_t stride, const uint8_t *left,
+ const uint8_t *top);
+
#endif // #ifndef AVCODEC_MIPS_VP9DSP_MIPS_H
--
2.3.7
More information about the ffmpeg-devel
mailing list