[FFmpeg-devel] [PATCH] PPC64: Add versions of functions in libswscale/input.c optimized for POWER8 VSX SIMD.
Dan Parrot
dan.parrot at mail.com
Wed Jul 6 05:37:04 EEST 2016
Finish providing POWER8 VSX SIMD versions of the functions in libswscale/input.c. This should allow trac ticket #5570 to be closed.

The speedup factor measured for each function (values below 1.0 are slowdowns) is:
abgrToA_c 1.19
bgr24ToUV_c 1.23
bgr24ToUV_half_c 1.37
bgr24ToY_c 1.43
nv12ToUV_c 1.05
nv21ToUV_c 1.06
planar_rgb_to_uv 1.25
planar_rgb_to_y 1.26
rgb24ToUV_c 1.11
rgb24ToUV_half_c 1.10
rgb24ToY_c 0.92
rgbaToA_c 0.88
uyvyToUV_c 1.05
uyvyToY_c 1.15
yuy2ToUV_c 1.07
yuy2ToY_c 1.17
yvy2ToUV_c 1.05
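
All of the new functions share the same loop decomposition: the bulk of
each row is processed eight samples at a time with VSX intrinsics, and a
scalar tail loop with the same arithmetic as the generic C code in
libswscale/input.c picks up the remainder. As a rough sketch (for
illustration only, not part of the patch):

    width_adj = (width >> 3) << 3;     /* samples covered by the vector loop */
    frag_len  = width - width_adj;     /* 0..7 leftover samples              */
    for (i = 0; i < width_adj; i += 8)
        ;                              /* VSX path, 8 samples per iteration  */
    for (i = width_adj; i < width_adj + frag_len; i++)
        ;                              /* scalar path, as in input.c         */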
---
libswscale/ppc/input_vsx.c | 1021 +++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 1017 insertions(+), 4 deletions(-)
diff --git a/libswscale/ppc/input_vsx.c b/libswscale/ppc/input_vsx.c
index d977a32..35edd5e 100644
--- a/libswscale/ppc/input_vsx.c
+++ b/libswscale/ppc/input_vsx.c
@@ -54,6 +55,7 @@ static void abgrToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unus
for ( i = 0; i < width_adj; i += 8) {
vector int v_rd0 = vec_vsx_ld(0, (int *)src_addr);
vector int v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ vector int v_dst;
v_rd0 = vec_and(v_rd0, vec_splats(0x0ff));
v_rd1 = vec_and(v_rd1, vec_splats(0x0ff));
@@ -61,8 +63,8 @@ static void abgrToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unus
v_rd0 = vec_sl(v_rd0, vec_splats((unsigned)6));
v_rd1 = vec_sl(v_rd1, vec_splats((unsigned)6));
- vector int v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
- {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
src_addr += 32;
@@ -91,6 +93,7 @@ static void rgbaToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unus
for ( i = 0; i < width_adj; i += 8) {
vector int v_rd0 = vec_vsx_ld(0, (int *)src_addr);
vector int v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ vector int v_dst;
v_rd0 = vec_sld(v_rd0, v_rd0, 13);
v_rd1 = vec_sld(v_rd1, v_rd1, 13);
@@ -101,8 +104,8 @@ static void rgbaToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unus
v_rd0 = vec_sl(v_rd0, vec_splats((unsigned)6));
v_rd1 = vec_sl(v_rd1, vec_splats((unsigned)6));
- vector int v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
- {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
src_addr += 32;
@@ -114,6 +117,175 @@ static void rgbaToA_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unus
}
}
+static void monoblack2Y_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int i, j, width_adj, frag_len;
+
+ vector unsigned char v_rd;
+ vector signed short v_din, v_d, v_dst;
+ vector unsigned short v_opr;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width = (width + 7) >> 3;
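+    // width is now a count of input bytes (8 packed samples per byte), rounded up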
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width >> 3;
+ width_adj = width_adj << 3;
+ frag_len = width - width_adj;
+
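+    // per-lane shift counts: lane j shifts right by 7-j so that, after masking
+    // with 1, lane j holds bit (7-j) of the splatted input byte (MSB first)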
+ v_opr = (vector unsigned short) {7, 6, 5, 4, 3, 2, 1, 0};
+
+ for (i = 0; i < width_adj; i += 8) {
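+        // one 16-byte load supplies two iterations; on the second pass the
+        // remaining half of v_rd is rotated into position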
+ if (i & 0x0f) {
+ v_rd = vec_sld(v_rd, v_rd, 8);
+ } else {
+ v_rd = vec_vsx_ld(0, (unsigned char *)src_addr);
+ src_addr += 16;
+ }
+
+ v_din = vec_unpackh((vector signed char)v_rd);
+ v_din = vec_and(v_din, vec_splats((short)0x00ff));
+
+ for (j = 0; j < 8; j++) {
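+            // vec_splat() needs a compile-time constant element index, hence the switch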
+ switch(j) {
+ case 0:
+ v_d = vec_splat(v_din, 0);
+ break;
+ case 1:
+ v_d = vec_splat(v_din, 1);
+ break;
+ case 2:
+ v_d = vec_splat(v_din, 2);
+ break;
+ case 3:
+ v_d = vec_splat(v_din, 3);
+ break;
+ case 4:
+ v_d = vec_splat(v_din, 4);
+ break;
+ case 5:
+ v_d = vec_splat(v_din, 5);
+ break;
+ case 6:
+ v_d = vec_splat(v_din, 6);
+ break;
+ case 7:
+ v_d = vec_splat(v_din, 7);
+ break;
+ }
+
+ v_dst = vec_sr(v_d, v_opr);
+ v_dst = vec_and(v_dst, vec_splats((short)1));
+ v_dst = v_dst * vec_splats((short)16383);
+
+ vec_vsx_st(v_dst, 0, (short *)dst_addr);
+ dst_addr += 16;
+ }
+ }
+
+    for (i = width_adj; i < width_adj + frag_len; i++) {
+        int d = src[i];
+        for (j = 0; j < 8; j++)
+            dst[8*i+j] = ((d >> (7-j)) & 1) * 16383;
+    }
+
+    i = width;
+    if (width & 7) {
+        int d = src[i];
+        for (j = 0; j < (width & 7); j++)
+            dst[8*i+j] = ((d >> (7-j)) & 1) * 16383;
+    }
+}
+
+static void monowhite2Y_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int i, j, width_adj, frag_len;
+
+ vector unsigned char v_rd;
+ vector signed short v_din, v_d, v_dst;
+ vector unsigned short v_opr;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width = (width + 7) >> 3;
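+    // width is now a count of input bytes (8 packed samples per byte), rounded up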
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width >> 3;
+ width_adj = width_adj << 3;
+ frag_len = width - width_adj;
+
+ v_opr = (vector unsigned short) {7, 6, 5, 4, 3, 2, 1, 0};
+
+ for (i = 0; i < width_adj; i += 8) {
+ if (i & 0x0f) {
+ v_rd = vec_sld(v_rd, v_rd, 8);
+ } else {
+ v_rd = vec_vsx_ld(0, (unsigned char *)src_addr);
+ src_addr += 16;
+ }
+
+ v_din = vec_unpackh((vector signed char)v_rd);
+ v_din = vec_and(v_din, vec_splats((short)0x00ff));
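+    // MONOWHITE is inverted monochrome: flip all bits so the same
+    // extract-and-scale path as monoblack2Y_c_vsx can be used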
+ v_din = vec_xor(v_din, vec_splats((short)0xffff));
+
+ for (j = 0; j < 8; j++) {
+ switch(j) {
+ case 0:
+ v_d = vec_splat(v_din, 0);
+ break;
+ case 1:
+ v_d = vec_splat(v_din, 1);
+ break;
+ case 2:
+ v_d = vec_splat(v_din, 2);
+ break;
+ case 3:
+ v_d = vec_splat(v_din, 3);
+ break;
+ case 4:
+ v_d = vec_splat(v_din, 4);
+ break;
+ case 5:
+ v_d = vec_splat(v_din, 5);
+ break;
+ case 6:
+ v_d = vec_splat(v_din, 6);
+ break;
+ case 7:
+ v_d = vec_splat(v_din, 7);
+ break;
+ }
+
+ v_dst = vec_sr(v_d, v_opr);
+ v_dst = vec_and(v_dst, vec_splats((short)1));
+ v_dst = v_dst * vec_splats((short)16383);
+
+ vec_vsx_st(v_dst, 0, (short *)dst_addr);
+ dst_addr += 16;
+ }
+ }
+
+    for (i = width_adj; i < width_adj + frag_len; i++) {
+        int d = ~src[i];
+        for (j = 0; j < 8; j++)
+            dst[8*i+j] = ((d >> (7-j)) & 1) * 16383;
+    }
+
+    i = width;
+    if (width & 7) {
+        int d = ~src[i];
+        for (j = 0; j < (width & 7); j++)
+            dst[8*i+j] = ((d >> (7-j)) & 1) * 16383;
+    }
+}
+
static void yuy2ToY_c_vsx(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
uint32_t *unused)
{
@@ -380,6 +552,806 @@ static void nv21ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
nvXXtoUV_c_vsx(dstV, dstU, src1, width);
}
+static void bgr24ToY_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *rgb2yuv)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+ int i, j, width_adj, frag_len;
+
+ vector unsigned char v_rd0, v_rd1, v_tmpb, v_tmpg, v_tmpr;
+
+ vector short v_tmp_s, v_dst;
+ vector int v_r, v_g, v_b, v_rslt;
+
+ vector int v_ry = vec_splats((int)ry);
+ vector int v_gy = vec_splats((int)gy);
+ vector int v_by = vec_splats((int)by);
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width >> 3;
+ width_adj = width_adj << 3;
+ frag_len = width - width_adj;
+
+ for (i = 0; i < width_adj; i += 8) {
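+        // 8 packed BGR pixels occupy 24 bytes; load two 16-byte vectors
+        // and advance the source pointer by 24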
+ v_rd0 = vec_vsx_ld(0, (unsigned char *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned char *)(src_addr + 16));
+ src_addr += 24;
+
+        for (j = 0; j < 2; j++) {
+            vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT-1));
+            vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-7));
+            vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-6));
+
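+            // gather the B, G and R bytes of four pixels each into separate vectors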
+ v_tmpb = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {0, 3, 6, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpg = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {1, 4, 7, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpr = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {2, 5, 8, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+
+ v_rd0 = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 0, 0, 0, 0}));
+
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpb);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_b = vec_unpackh(v_tmp_s);
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpg);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_g = vec_unpackh(v_tmp_s);
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpr);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_r = vec_unpackh(v_tmp_s);
+
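+            // bias and round exactly as the scalar fallback below:
+            // (sum + (32 << (S-1)) + (1 << (S-7))) >> (S-6), with S = RGB2YUV_SHIFT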
+ v_rslt = v_ry*v_r + v_gy*v_g + v_by*v_b;
+ v_rslt += vec_sl(vec_splats((int)32), v_opr1);
+ v_rslt += vec_sl(vec_splats((int)1), v_opr2);
+ v_rslt = vec_sr(v_rslt, v_opr3);
+
+ v_tmp_s = vec_pack(v_rslt, v_rslt);
+ v_dst = vec_sld(v_dst, v_tmp_s, 8);
+ }
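+        // put the two accumulated 8-byte half-results into memory order before the store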
+ v_dst = vec_sld(v_dst, v_dst, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width_adj + frag_len; i++) {
+ int b = src[i * 3 + 0];
+ int g = src[i * 3 + 1];
+ int r = src[i * 3 + 2];
+
+ dst[i] = ((ry*r + gy*g + by*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
+ }
+}
+
+static void bgr24ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+
+ int i, j, width_adj, frag_len;
+
+ vector unsigned char v_rd0, v_rd1, v_tmpb, v_tmpg, v_tmpr;
+
+ vector short v_tmp_s, v_dstu, v_dstv;
+ vector int v_r, v_g, v_b, v_rslt;
+
+ vector int v_ru = vec_splats((int)ru);
+ vector int v_gu = vec_splats((int)gu);
+ vector int v_bu = vec_splats((int)bu);
+
+ vector int v_rv = vec_splats((int)rv);
+ vector int v_gv = vec_splats((int)gv);
+ vector int v_bv = vec_splats((int)bv);
+
+ uintptr_t src1_addr = (uintptr_t)src1;
+ uintptr_t dstu_addr = (uintptr_t)dstU;
+ uintptr_t dstv_addr = (uintptr_t)dstV;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width >> 3;
+ width_adj = width_adj << 3;
+ frag_len = width - width_adj;
+
+ for (i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned char *)src1_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 16));
+ src1_addr += 24;
+
+        for (j = 0; j < 2; j++) {
+            vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT-1));
+            vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-7));
+            vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-6));
+
+ v_tmpb = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {0, 3, 6, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpg = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {1, 4, 7, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpr = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {2, 5, 8, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+
+ v_rd0 = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 0, 0, 0, 0}));
+
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpb);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_b = vec_unpackh(v_tmp_s);
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpg);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_g = vec_unpackh(v_tmp_s);
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpr);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_r = vec_unpackh(v_tmp_s);
+
+ v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b;
+ v_rslt += vec_sl(vec_splats((int)256), v_opr1);
+ v_rslt += vec_sl(vec_splats((int)1), v_opr2);
+ v_rslt = vec_sr(v_rslt, v_opr3);
+
+ v_tmp_s = vec_pack(v_rslt, v_rslt);
+ v_dstu = vec_sld(v_dstu, v_tmp_s, 8);
+
+ v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b;
+ v_rslt += vec_sl(vec_splats((int)256), v_opr1);
+ v_rslt += vec_sl(vec_splats((int)1), v_opr2);
+ v_rslt = vec_sr(v_rslt, v_opr3);
+
+ v_tmp_s = vec_pack(v_rslt, v_rslt);
+ v_dstv = vec_sld(v_dstv, v_tmp_s, 8);
+ }
+ v_dstu = vec_sld(v_dstu, v_dstu, 8);
+ v_dstv = vec_sld(v_dstv, v_dstv, 8);
+ vec_vsx_st((vector unsigned char)v_dstu, 0, (unsigned char *)dstu_addr);
+ vec_vsx_st((vector unsigned char)v_dstv, 0, (unsigned char *)dstv_addr);
+ dstu_addr += 16;
+ dstv_addr += 16;
+ }
+
+ for (i = width_adj; i < width_adj + frag_len; i++) {
+ int b = src1[3 * i + 0];
+ int g = src1[3 * i + 1];
+ int r = src1[3 * i + 2];
+
+ dstU[i] = (ru*r + gu*g + bu*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ dstV[i] = (rv*r + gv*g + bv*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ }
+ av_assert1(src1 == src2);
+}
+
+static void bgr24ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+
+ int i, j, width_adj, frag_len;
+
+ vector unsigned char v_rd0, v_rd1, v_rd2, v_tmpb[2], v_tmpg[2], v_tmpr[2];
+
+ vector short v_tmp_s[2], v_dstu, v_dstv;
+ vector int v_r, v_g, v_b, v_rslt;
+
+ vector int v_ru = vec_splats((int)ru);
+ vector int v_gu = vec_splats((int)gu);
+ vector int v_bu = vec_splats((int)bu);
+
+ vector int v_rv = vec_splats((int)rv);
+ vector int v_gv = vec_splats((int)gv);
+ vector int v_bv = vec_splats((int)bv);
+
+ uintptr_t src1_addr = (uintptr_t)src1;
+ uintptr_t dstu_addr = (uintptr_t)dstU;
+ uintptr_t dstv_addr = (uintptr_t)dstV;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width >> 3;
+ width_adj = width_adj << 3;
+ frag_len = width - width_adj;
+
+ for (i = 0; i < width_adj; i += 8) {
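+        // "half" horizontal subsampling: each chroma output averages two
+        // adjacent pixels, so 8 outputs consume 16 BGR pixels (48 bytes)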
+ v_rd0 = vec_vsx_ld(0, (unsigned char *)src1_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 16));
+ v_rd2 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 32));
+ src1_addr += 48;
+
+        for (j = 0; j < 2; j++) {
+            vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT));
+            vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-6));
+            vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-5));
+
+ v_tmpb[0] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {0, 6, 12, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpb[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {3, 9, 15, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpg[0] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {1, 7, 13, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpg[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {4, 10, 16, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpr[0] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {2, 8, 14, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpr[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {5, 11, 17, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+
+ v_rd0 = vec_perm(v_rd1, v_rd2, ((vector unsigned char)
+ {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}));
+ v_rd1 = vec_perm(v_rd2, v_rd2, ((vector unsigned char)
+ {8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0}));
+
+ v_tmp_s[0] = vec_unpackh((vector signed char)v_tmpb[0]);
+ v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff));
+ v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpb[1]);
+ v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff));
+ v_b = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]);
+
+ v_tmp_s[0] = vec_unpackh((vector signed char)v_tmpg[0]);
+ v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff));
+ v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpg[1]);
+ v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff));
+ v_g = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]);
+
+ v_tmp_s[0] = vec_unpackh((vector signed char)v_tmpr[0]);
+ v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff));
+ v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpr[1]);
+ v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff));
+ v_r = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]);
+
+ v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b;
+ v_rslt += vec_sl(vec_splats((int)256), v_opr1);
+ v_rslt += vec_sl(vec_splats((int)1), v_opr2);
+ v_rslt = vec_sr(v_rslt, v_opr3);
+
+ v_tmp_s[0] = vec_pack(v_rslt, v_rslt);
+ v_dstu = vec_sld(v_dstu, v_tmp_s[0], 8);
+
+ v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b;
+ v_rslt += vec_sl(vec_splats((int)256), v_opr1);
+ v_rslt += vec_sl(vec_splats((int)1), v_opr2);
+ v_rslt = vec_sr(v_rslt, v_opr3);
+
+ v_tmp_s[0] = vec_pack(v_rslt, v_rslt);
+ v_dstv = vec_sld(v_dstv, v_tmp_s[0], 8);
+ }
+ v_dstu = vec_sld(v_dstu, v_dstu, 8);
+ v_dstv = vec_sld(v_dstv, v_dstv, 8);
+ vec_vsx_st((vector unsigned char)v_dstu, 0, (unsigned char *)dstu_addr);
+ vec_vsx_st((vector unsigned char)v_dstv, 0, (unsigned char *)dstv_addr);
+ dstu_addr += 16;
+ dstv_addr += 16;
+ }
+
+    for (i = width_adj; i < width_adj + frag_len; i++) {
+ int b = src1[6 * i + 0] + src1[6 * i + 3];
+ int g = src1[6 * i + 1] + src1[6 * i + 4];
+ int r = src1[6 * i + 2] + src1[6 * i + 5];
+
+ dstU[i] = (ru*r + gu*g + bu*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+ dstV[i] = (rv*r + gv*g + bv*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+ }
+ av_assert1(src1 == src2);
+}
+
+static void rgb24ToY_c_vsx(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *rgb2yuv)
+{
+ int16_t *dst = (int16_t *)_dst;
+ int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+ int i, j, width_adj, frag_len;
+
+ vector unsigned char v_rd0, v_rd1, v_tmpb, v_tmpg, v_tmpr;
+
+ vector short v_tmp_s, v_dst;
+ vector int v_r, v_g, v_b, v_rslt;
+
+ vector int v_ry = vec_splats((int)ry);
+ vector int v_gy = vec_splats((int)gy);
+ vector int v_by = vec_splats((int)by);
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width >> 3;
+ width_adj = width_adj << 3;
+ frag_len = width - width_adj;
+
+ for (i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned char *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned char *)(src_addr + 16));
+ src_addr += 24;
+
+        for (j = 0; j < 2; j++) {
+            vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT-1));
+            vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-7));
+            vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-6));
+
+ v_tmpr = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {0, 3, 6, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpg = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {1, 4, 7, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpb = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {2, 5, 8, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+
+ v_rd0 = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 0, 0, 0, 0}));
+
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpr);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_r = vec_unpackh(v_tmp_s);
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpg);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_g = vec_unpackh(v_tmp_s);
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpb);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_b = vec_unpackh(v_tmp_s);
+
+ v_rslt = v_ry*v_r + v_gy*v_g + v_by*v_b;
+ v_rslt += vec_sl(vec_splats((int)32), v_opr1);
+ v_rslt += vec_sl(vec_splats((int)1), v_opr2);
+ v_rslt = vec_sr(v_rslt, v_opr3);
+
+ v_tmp_s = vec_pack(v_rslt, v_rslt);
+ v_dst = vec_sld(v_dst, v_tmp_s, 8);
+ }
+ v_dst = vec_sld(v_dst, v_dst, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width_adj + frag_len; i++) {
+ int r = src[i * 3 + 0];
+ int g = src[i * 3 + 1];
+ int b = src[i * 3 + 2];
+
+ dst[i] = ((ry*r + gy*g + by*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
+ }
+}
+
+static void rgb24ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+
+ int i, j, width_adj, frag_len;
+
+ vector unsigned char v_rd0, v_rd1, v_tmpb, v_tmpg, v_tmpr;
+
+ vector short v_tmp_s, v_dstu, v_dstv;
+ vector int v_r, v_g, v_b, v_rslt;
+
+ vector int v_ru = vec_splats((int)ru);
+ vector int v_gu = vec_splats((int)gu);
+ vector int v_bu = vec_splats((int)bu);
+
+ vector int v_rv = vec_splats((int)rv);
+ vector int v_gv = vec_splats((int)gv);
+ vector int v_bv = vec_splats((int)bv);
+
+ uintptr_t src1_addr = (uintptr_t)src1;
+ uintptr_t dstu_addr = (uintptr_t)dstU;
+ uintptr_t dstv_addr = (uintptr_t)dstV;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width >> 3;
+ width_adj = width_adj << 3;
+ frag_len = width - width_adj;
+
+ av_assert1(src1 == src2);
+ for (i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned char *)src1_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 16));
+ src1_addr += 24;
+
+        for (j = 0; j < 2; j++) {
+            vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT-1));
+            vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-7));
+            vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-6));
+
+ v_tmpr = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {0, 3, 6, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpg = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {1, 4, 7, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpb = vec_perm(v_rd0, v_rd0, ((vector unsigned char)
+ {2, 5, 8, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+
+ v_rd0 = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 0, 0, 0, 0}));
+
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpr);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_r = vec_unpackh(v_tmp_s);
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpg);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_g = vec_unpackh(v_tmp_s);
+ v_tmp_s = vec_unpackh((vector signed char)v_tmpb);
+ v_tmp_s = vec_and(v_tmp_s, vec_splats((short)0x0ff));
+ v_b = vec_unpackh(v_tmp_s);
+
+ v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b;
+ v_rslt += vec_sl(vec_splats((int)256), v_opr1);
+ v_rslt += vec_sl(vec_splats((int)1), v_opr2);
+ v_rslt = vec_sr(v_rslt, v_opr3);
+
+ v_tmp_s = vec_pack(v_rslt, v_rslt);
+ v_dstu = vec_sld(v_dstu, v_tmp_s, 8);
+
+ v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b;
+ v_rslt += vec_sl(vec_splats((int)256), v_opr1);
+ v_rslt += vec_sl(vec_splats((int)1), v_opr2);
+ v_rslt = vec_sr(v_rslt, v_opr3);
+
+ v_tmp_s = vec_pack(v_rslt, v_rslt);
+ v_dstv = vec_sld(v_dstv, v_tmp_s, 8);
+ }
+ v_dstu = vec_sld(v_dstu, v_dstu, 8);
+ v_dstv = vec_sld(v_dstv, v_dstv, 8);
+ vec_vsx_st((vector unsigned char)v_dstu, 0, (unsigned char *)dstu_addr);
+ vec_vsx_st((vector unsigned char)v_dstv, 0, (unsigned char *)dstv_addr);
+ dstu_addr += 16;
+ dstv_addr += 16;
+ }
+
+ for (i = width_adj; i < width_adj + frag_len; i++) {
+ int r = src1[3 * i + 0];
+ int g = src1[3 * i + 1];
+ int b = src1[3 * i + 2];
+
+ dstU[i] = (ru*r + gu*g + bu*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ dstV[i] = (rv*r + gv*g + bv*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ }
+}
+
+static void rgb24ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+
+ int i, j, width_adj, frag_len;
+
+ vector unsigned char v_rd0, v_rd1, v_rd2, v_tmpb[2], v_tmpg[2], v_tmpr[2];
+
+ vector short v_tmp_s[2], v_dstu, v_dstv;
+ vector int v_r, v_g, v_b, v_rslt;
+
+ vector int v_ru = vec_splats((int)ru);
+ vector int v_gu = vec_splats((int)gu);
+ vector int v_bu = vec_splats((int)bu);
+
+ vector int v_rv = vec_splats((int)rv);
+ vector int v_gv = vec_splats((int)gv);
+ vector int v_bv = vec_splats((int)bv);
+
+ uintptr_t src1_addr = (uintptr_t)src1;
+ uintptr_t dstu_addr = (uintptr_t)dstU;
+ uintptr_t dstv_addr = (uintptr_t)dstV;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width >> 3;
+ width_adj = width_adj << 3;
+ frag_len = width - width_adj;
+
+    av_assert1(src1 == src2);
+    for (i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned char *)src1_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 16));
+ v_rd2 = vec_vsx_ld(0, (unsigned char *)(src1_addr + 32));
+ src1_addr += 48;
+
+        for (j = 0; j < 2; j++) {
+            vector unsigned v_opr1 = vec_splats((unsigned)(RGB2YUV_SHIFT));
+            vector unsigned v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-6));
+            vector unsigned v_opr3 = vec_splats((unsigned)(RGB2YUV_SHIFT-5));
+
+ v_tmpr[0] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {0, 6, 12, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpr[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {3, 9, 15, 21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpg[0] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {1, 7, 13, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpg[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {4, 10, 16, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpb[0] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {2, 8, 14, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+ v_tmpb[1] = vec_perm(v_rd0, v_rd1, ((vector unsigned char)
+ {5, 11, 17, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+
+ v_rd0 = vec_perm(v_rd1, v_rd2, ((vector unsigned char)
+ {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}));
+ v_rd1 = vec_perm(v_rd2, v_rd2, ((vector unsigned char)
+ {8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0}));
+
+ v_tmp_s[0] = vec_unpackh((vector signed char)v_tmpr[0]);
+ v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff));
+ v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpr[1]);
+ v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff));
+ v_r = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]);
+
+ v_tmp_s[0] = vec_unpackh((vector signed char)v_tmpg[0]);
+ v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff));
+ v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpg[1]);
+ v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff));
+ v_g = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]);
+
+ v_tmp_s[0] = vec_unpackh((vector signed char)v_tmpb[0]);
+ v_tmp_s[0] = vec_and(v_tmp_s[0], vec_splats((short)0x0ff));
+ v_tmp_s[1] = vec_unpackh((vector signed char)v_tmpb[1]);
+ v_tmp_s[1] = vec_and(v_tmp_s[1], vec_splats((short)0x0ff));
+ v_b = vec_unpackh(v_tmp_s[0]) + vec_unpackh(v_tmp_s[1]);
+
+ v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b;
+ v_rslt += vec_sl(vec_splats((int)256), v_opr1);
+ v_rslt += vec_sl(vec_splats((int)1), v_opr2);
+ v_rslt = vec_sr(v_rslt, v_opr3);
+
+ v_tmp_s[0] = vec_pack(v_rslt, v_rslt);
+ v_dstu = vec_sld(v_dstu, v_tmp_s[0], 8);
+
+ v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b;
+ v_rslt += vec_sl(vec_splats((int)256), v_opr1);
+ v_rslt += vec_sl(vec_splats((int)1), v_opr2);
+ v_rslt = vec_sr(v_rslt, v_opr3);
+
+ v_tmp_s[0] = vec_pack(v_rslt, v_rslt);
+ v_dstv = vec_sld(v_dstv, v_tmp_s[0], 8);
+ }
+ v_dstu = vec_sld(v_dstu, v_dstu, 8);
+ v_dstv = vec_sld(v_dstv, v_dstv, 8);
+ vec_vsx_st((vector unsigned char)v_dstu, 0, (unsigned char *)dstu_addr);
+ vec_vsx_st((vector unsigned char)v_dstv, 0, (unsigned char *)dstv_addr);
+ dstu_addr += 16;
+ dstv_addr += 16;
+ }
+
+    for (i = width_adj; i < width_adj + frag_len; i++) {
+ int r = src1[6 * i + 0] + src1[6 * i + 3];
+ int g = src1[6 * i + 1] + src1[6 * i + 4];
+ int b = src1[6 * i + 2] + src1[6 * i + 5];
+
+ dstU[i] = (ru*r + gu*g + bu*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+ dstV[i] = (rv*r + gv*g + bv*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
+ }
+}
+
+static void planar_rgb_to_y_vsx(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *rgb2yuv)
+{
+ uint16_t *dst = (uint16_t *)_dst;
+ int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+ int i, width_adj, frag_len;
+
+ vector unsigned char v_rd[3];
+ vector short v_din[3], v_dst;
+ vector int v_r, v_g, v_b, v_rslt;
+ vector unsigned v_opr1, v_opr2;
+
+    vector int v_ry = vec_splats((int)ry);
+    vector int v_gy = vec_splats((int)gy);
+    vector int v_by = vec_splats((int)by);
+
+    uintptr_t src0_addr, src1_addr, src2_addr, dst_addr;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width >> 3;
+ width_adj = width_adj << 3;
+ frag_len = width - width_adj;
+
+ src0_addr = (uintptr_t)src[0];
+ src1_addr = (uintptr_t)src[1];
+ src2_addr = (uintptr_t)src[2];
+ dst_addr = (uintptr_t)dst;
+
+ v_opr1 = vec_splats((unsigned)0x801);
+ v_opr1 = vec_sl(v_opr1, vec_splats((unsigned)(RGB2YUV_SHIFT-7)));
+ v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-6));
+
+ for (i = 0; i < width_adj; i += 8) {
+ if (i & 0x0f) {
+ v_din[0] = vec_unpackl((vector signed char)v_rd[0]);
+ v_din[1] = vec_unpackl((vector signed char)v_rd[1]);
+ v_din[2] = vec_unpackl((vector signed char)v_rd[2]);
+ } else {
+ v_rd[0] = vec_vsx_ld(0, (unsigned char *)src0_addr);
+ v_rd[1] = vec_vsx_ld(0, (unsigned char *)src1_addr);
+ v_rd[2] = vec_vsx_ld(0, (unsigned char *)src2_addr);
+ src0_addr += 16;
+ src1_addr += 16;
+ src2_addr += 16;
+ v_din[0] = vec_unpackh((vector signed char)v_rd[0]);
+ v_din[1] = vec_unpackh((vector signed char)v_rd[1]);
+ v_din[2] = vec_unpackh((vector signed char)v_rd[2]);
+ }
+
+ v_din[0] = v_din[0] & vec_splats((short)0x00ff);
+ v_din[1] = v_din[1] & vec_splats((short)0x00ff);
+ v_din[2] = v_din[2] & vec_splats((short)0x00ff);
+
+ v_g = vec_unpackh(v_din[0]);
+ v_b = vec_unpackh(v_din[1]);
+ v_r = vec_unpackh(v_din[2]);
+
+ v_rslt = v_ry*v_r + v_gy*v_g + v_by*v_b;
+ v_rslt += v_opr1;
+ v_rslt = vec_sr(v_rslt, v_opr2);
+ v_dst = vec_sld(vec_pack(v_rslt, v_rslt), v_dst, 8);
+
+ v_g = vec_unpackl(v_din[0]);
+ v_b = vec_unpackl(v_din[1]);
+ v_r = vec_unpackl(v_din[2]);
+
+ v_rslt = v_ry*v_r + v_gy*v_g + v_by*v_b;
+ v_rslt += v_opr1;
+ v_rslt = vec_sr(v_rslt, v_opr2);
+ v_dst = vec_sld(vec_pack(v_rslt, v_rslt), v_dst, 8);
+
+ vec_vsx_st(v_dst, 0, (short *)dst_addr);
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width_adj + frag_len; i++) {
+ int g = src[0][i];
+ int b = src[1][i];
+ int r = src[2][i];
+
+ dst[i] = (ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+ }
+}
+
+static void planar_rgb_to_a_vsx(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *unused)
+{
+ uint16_t *dst = (uint16_t *)_dst;
+ int i, width_adj, frag_len;
+
+ vector unsigned char v_rd;
+ vector short v_din, v_dst;
+
+ uintptr_t src_addr, dst_addr;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width >> 3;
+ width_adj = width_adj << 3;
+ frag_len = width - width_adj;
+
+ src_addr = (uintptr_t)src[3];
+ dst_addr = (uintptr_t)dst;
+
+ for (i = 0; i < width_adj; i += 8) {
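+        // one 16-byte load supplies two 8-sample iterations: high half first,
+        // low half on the following pass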
+ if (i & 0x0f) {
+ v_din = vec_unpackl((vector signed char)v_rd);
+ } else {
+ v_rd = vec_vsx_ld(0, (unsigned char *)src_addr);
+ v_din = vec_unpackh((vector signed char)v_rd);
+ src_addr += 16;
+ }
+
+ v_dst = v_din & vec_splats((short)0x00ff);
+ v_dst = v_dst << vec_splats((unsigned short)6);
+
+ vec_vsx_st(v_dst, 0, (short *)dst_addr);
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width_adj + frag_len; i++)
+ dst[i] = src[3][i] << 6;
+}
+
+static void planar_rgb_to_uv_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width, int32_t *rgb2yuv)
+{
+ uint16_t *dstU = (uint16_t *)_dstU;
+ uint16_t *dstV = (uint16_t *)_dstV;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ int i, width_adj, frag_len;
+
+ vector unsigned char v_rd[3];
+ vector short v_din[3], v_dstu, v_dstv;
+ vector int v_r, v_g, v_b, v_rslt;
+ vector unsigned v_opr1, v_opr2;
+
+    vector int v_ru = vec_splats((int)ru);
+    vector int v_gu = vec_splats((int)gu);
+    vector int v_bu = vec_splats((int)bu);
+
+    vector int v_rv = vec_splats((int)rv);
+    vector int v_gv = vec_splats((int)gv);
+    vector int v_bv = vec_splats((int)bv);
+
+    uintptr_t src0_addr, src1_addr, src2_addr, dstu_addr, dstv_addr;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width >> 3;
+ width_adj = width_adj << 3;
+ frag_len = width - width_adj;
+
+ src0_addr = (uintptr_t)src[0];
+ src1_addr = (uintptr_t)src[1];
+ src2_addr = (uintptr_t)src[2];
+ dstu_addr = (uintptr_t)dstU;
+ dstv_addr = (uintptr_t)dstV;
+
+ v_opr1 = vec_splats((unsigned)0x4001);
+ v_opr1 = vec_sl(v_opr1, vec_splats((unsigned)(RGB2YUV_SHIFT-7)));
+ v_opr2 = vec_splats((unsigned)(RGB2YUV_SHIFT-6));
+
+ for (i = 0; i < width_adj; i += 8) {
+ if (i & 0x0f) {
+ v_din[0] = vec_unpackl((vector signed char)v_rd[0]);
+ v_din[1] = vec_unpackl((vector signed char)v_rd[1]);
+ v_din[2] = vec_unpackl((vector signed char)v_rd[2]);
+ } else {
+ v_rd[0] = vec_vsx_ld(0, (unsigned char *)src0_addr);
+ v_rd[1] = vec_vsx_ld(0, (unsigned char *)src1_addr);
+ v_rd[2] = vec_vsx_ld(0, (unsigned char *)src2_addr);
+ src0_addr += 16;
+ src1_addr += 16;
+ src2_addr += 16;
+ v_din[0] = vec_unpackh((vector signed char)v_rd[0]);
+ v_din[1] = vec_unpackh((vector signed char)v_rd[1]);
+ v_din[2] = vec_unpackh((vector signed char)v_rd[2]);
+ }
+
+ v_din[0] = v_din[0] & vec_splats((short)0x00ff);
+ v_din[1] = v_din[1] & vec_splats((short)0x00ff);
+ v_din[2] = v_din[2] & vec_splats((short)0x00ff);
+
+ v_g = vec_unpackh(v_din[0]);
+ v_b = vec_unpackh(v_din[1]);
+ v_r = vec_unpackh(v_din[2]);
+
+ v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b;
+ v_rslt += v_opr1;
+ v_rslt = vec_sr(v_rslt, v_opr2);
+ v_dstu = vec_sld(vec_pack(v_rslt, v_rslt), v_dstu, 8);
+
+ v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b;
+ v_rslt += v_opr1;
+ v_rslt = vec_sr(v_rslt, v_opr2);
+ v_dstv = vec_sld(vec_pack(v_rslt, v_rslt), v_dstv, 8);
+
+ v_g = vec_unpackl(v_din[0]);
+ v_b = vec_unpackl(v_din[1]);
+ v_r = vec_unpackl(v_din[2]);
+
+ v_rslt = v_ru*v_r + v_gu*v_g + v_bu*v_b;
+ v_rslt += v_opr1;
+ v_rslt = vec_sr(v_rslt, v_opr2);
+ v_dstu = vec_sld(vec_pack(v_rslt, v_rslt), v_dstu, 8);
+
+ v_rslt = v_rv*v_r + v_gv*v_g + v_bv*v_b;
+ v_rslt += v_opr1;
+ v_rslt = vec_sr(v_rslt, v_opr2);
+ v_dstv = vec_sld(vec_pack(v_rslt, v_rslt), v_dstv, 8);
+
+ vec_vsx_st(v_dstu, 0, (short *)dstu_addr);
+ vec_vsx_st(v_dstv, 0, (short *)dstv_addr);
+ dstu_addr += 16;
+ dstv_addr += 16;
+ }
+
+ for (i = width_adj; i < width_adj + frag_len; i++) {
+ int g = src[0][i];
+ int b = src[1][i];
+ int r = src[2][i];
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+ dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+ }
+}
+
#endif /* HAVE_VSX */
av_cold void ff_sws_init_input_funcs_vsx(SwsContext *c)
@@ -404,9 +1376,38 @@ av_cold void ff_sws_init_input_funcs_vsx(SwsContext *c)
case AV_PIX_FMT_NV21:
c->chrToYV12 = nv21ToUV_c_vsx;
break;
+ case AV_PIX_FMT_GBRAP:
+ case AV_PIX_FMT_GBRP:
+ c->readChrPlanar = planar_rgb_to_uv_vsx;
+ break;
+ }
+
+ if (c->chrSrcHSubSample) {
+ switch (srcFormat) {
+ case AV_PIX_FMT_BGR24:
+ c->chrToYV12 = bgr24ToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->chrToYV12 = rgb24ToUV_half_c_vsx;
+ break;
+ }
+ } else {
+ switch (srcFormat) {
+ case AV_PIX_FMT_BGR24:
+ c->chrToYV12 = bgr24ToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->chrToYV12 = rgb24ToUV_c_vsx;
+ break;
+ }
}
switch (srcFormat) {
+ case AV_PIX_FMT_GBRAP:
+ c->readAlpPlanar = planar_rgb_to_a_vsx;
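+        /* fall through */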
+ case AV_PIX_FMT_GBRP:
+ c->readLumPlanar = planar_rgb_to_y_vsx;
+ break;
case AV_PIX_FMT_YUYV422:
case AV_PIX_FMT_YVYU422:
case AV_PIX_FMT_YA8:
@@ -415,6 +1416,18 @@ av_cold void ff_sws_init_input_funcs_vsx(SwsContext *c)
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = uyvyToY_c_vsx;
break;
+ case AV_PIX_FMT_BGR24:
+ c->lumToYV12 = bgr24ToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->lumToYV12 = rgb24ToY_c_vsx;
+ break;
+ case AV_PIX_FMT_MONOBLACK:
+ c->lumToYV12 = monoblack2Y_c_vsx;
+ break;
+ case AV_PIX_FMT_MONOWHITE:
+ c->lumToYV12 = monowhite2Y_c_vsx;
+ break;
}
if (c->needAlpha) {
--
2.7.4