[FFmpeg-devel] [PATCH] PPC64: Add IBM POWER8 SIMD Implementation
Dan Parrot
dan.parrot at mail.com
Wed Jun 15 06:25:11 CEST 2016
This is the first commit addressing Trac ticket #5570. Each function defined in
libswscale/input.c that is covered by this patch has a corresponding definition
in libswscale/ppc/input_vsx.h. The name of each corresponding function in the
latter carries the suffix "_vsx".
---
libswscale/input.c | 44 +--
libswscale/ppc/input_vsx.h | 831 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 853 insertions(+), 22 deletions(-)
create mode 100644 libswscale/ppc/input_vsx.h
diff --git a/libswscale/input.c b/libswscale/input.c
index 14ab5ab..de4347e 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
@@ -40,6 +40,13 @@
#define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? b_r : r_b)
#define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? r_b : b_r)
+/*
+ * RENAME_SIMD(fname) selects which template implementation the wrapper
+ * macros below call: fname ## _vsx when the VSX versions from
+ * ppc/input_vsx.h are compiled in, plain fname otherwise.
+ *
+ * Test the value of HAVE_VSX, not merely whether it is defined, so that a
+ * build where it is defined as 0 still takes the plain C path; the fallback
+ * branch must be #else, since #elif requires a controlling expression.
+ */
+#if HAVE_VSX
+#include "ppc/input_vsx.h"
+#define RENAME_SIMD(fname) fname ## _vsx
+#else
+#define RENAME_SIMD(fname) fname
+#endif
+
static av_always_inline void
rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
enum AVPixelFormat origin, int32_t *rgb2yuv)
@@ -99,7 +106,7 @@ static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src,
{ \
const uint16_t *src = (const uint16_t *) _src; \
uint16_t *dst = (uint16_t *) _dst; \
- rgb64ToY_c_template(dst, src, width, origin, rgb2yuv); \
+ RENAME_SIMD(rgb64ToY_c_template)(dst, src, width, origin, rgb2yuv); \
} \
\
static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
@@ -109,7 +116,7 @@ static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
const uint16_t *src1 = (const uint16_t *) _src1, \
*src2 = (const uint16_t *) _src2; \
uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
- rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+ RENAME_SIMD(rgb64ToUV_c_template)(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
} \
\
static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
@@ -119,7 +126,7 @@ static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV
const uint16_t *src1 = (const uint16_t *) _src1, \
*src2 = (const uint16_t *) _src2; \
uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
- rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+ RENAME_SIMD(rgb64ToUV_half_c_template)(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
}
rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE)
@@ -203,7 +210,7 @@ static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, \
{ \
const uint16_t *src = (const uint16_t *)_src; \
uint16_t *dst = (uint16_t *)_dst; \
- rgb48ToY_c_template(dst, src, width, origin, rgb2yuv); \
+ RENAME_SIMD(rgb48ToY_c_template)(dst, src, width, origin, rgb2yuv); \
} \
\
static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, \
@@ -218,7 +225,7 @@ static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, \
*src2 = (const uint16_t *)_src2; \
uint16_t *dstU = (uint16_t *)_dstU, \
*dstV = (uint16_t *)_dstV; \
- rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+ RENAME_SIMD(rgb48ToUV_c_template)(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
} \
\
static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, \
@@ -233,7 +240,7 @@ static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, \
*src2 = (const uint16_t *)_src2; \
uint16_t *dstU = (uint16_t *)_dstU, \
*dstV = (uint16_t *)_dstV; \
- rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+ RENAME_SIMD(rgb48ToUV_half_c_template)(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
}
rgb48funcs(rgb, LE, AV_PIX_FMT_RGB48LE)
@@ -273,7 +280,6 @@ static av_always_inline void rgb16_32ToY_c_template(int16_t *dst,
dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
}
}
-
static av_always_inline void rgb16_32ToUV_c_template(int16_t *dstU,
int16_t *dstV,
const uint8_t *src,
@@ -351,17 +357,17 @@ static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU,
static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, \
int width, uint32_t *tab) \
{ \
- rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, shr, shg, shb, shp, \
- maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \
+ RENAME_SIMD(rgb16_32ToY_c_template)((int16_t*)dst, src, width, fmt, shr, shg, shb, shp, \
+ maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \
} \
\
static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
int width, uint32_t *tab) \
{ \
- rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
- shr, shg, shb, shp, \
- maskr, maskg, maskb, rsh, gsh, bsh, S, tab);\
+ RENAME_SIMD(rgb16_32ToUV_c_template)((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
+ shr, shg, shb, shp, \
+ maskr, maskg, maskb, rsh, gsh, bsh, S, tab);\
} \
\
static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
@@ -369,10 +375,10 @@ static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
const uint8_t *dummy, \
int width, uint32_t *tab) \
{ \
- rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
- shr, shg, shb, shp, \
- maskr, maskg, maskb, \
- rsh, gsh, bsh, S, tab); \
+ RENAME_SIMD(rgb16_32ToUV_half_c_template)((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
+ shr, shg, shb, shp, \
+ maskr, maskg, maskb, \
+ rsh, gsh, bsh, S, tab); \
}
rgb16_32_wrapper(AV_PIX_FMT_BGR32, bgr32, 16, 0, 0, 0, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT + 8)
@@ -978,7 +984,6 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
case AV_PIX_FMT_GBRP9LE:
c->readChrPlanar = planar_rgb9le_to_uv;
break;
- case AV_PIX_FMT_GBRAP10LE:
case AV_PIX_FMT_GBRP10LE:
c->readChrPlanar = planar_rgb10le_to_uv;
break;
@@ -996,7 +1001,6 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
case AV_PIX_FMT_GBRP9BE:
c->readChrPlanar = planar_rgb9be_to_uv;
break;
- case AV_PIX_FMT_GBRAP10BE:
case AV_PIX_FMT_GBRP10BE:
c->readChrPlanar = planar_rgb10be_to_uv;
break;
@@ -1260,8 +1264,6 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
case AV_PIX_FMT_GBRP9LE:
c->readLumPlanar = planar_rgb9le_to_y;
break;
- case AV_PIX_FMT_GBRAP10LE:
- c->readAlpPlanar = planar_rgb10le_to_a;
case AV_PIX_FMT_GBRP10LE:
c->readLumPlanar = planar_rgb10le_to_y;
break;
@@ -1281,8 +1283,6 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
case AV_PIX_FMT_GBRP9BE:
c->readLumPlanar = planar_rgb9be_to_y;
break;
- case AV_PIX_FMT_GBRAP10BE:
- c->readAlpPlanar = planar_rgb10be_to_a;
case AV_PIX_FMT_GBRP10BE:
c->readLumPlanar = planar_rgb10be_to_y;
break;
diff --git a/libswscale/ppc/input_vsx.h b/libswscale/ppc/input_vsx.h
new file mode 100644
index 0000000..09fe8c1
--- /dev/null
+++ b/libswscale/ppc/input_vsx.h
@@ -0,0 +1,831 @@
+/*
+ * Copyright (C) 2016 Dan Parrot <dan.parrot at mail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * SIMD (IBM POWER8 VSX) version of rgb64ToY_c_template() from
+ * libswscale/input.c.
+ *
+ * For every pixel p in [0, width) the three 16-bit components at
+ * src[4*p + 0 .. 4*p + 2] are read through input_pixel() and
+ *   (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT
+ * is stored to dst[p]. `r` and `b` are the input.c macros that select
+ * r_b or b_r according to `origin`.
+ *
+ * Pixels are handled in groups of 8 (two 4-lane int vectors feed one
+ * 8-lane short store); the remaining width % 8 pixels use the scalar
+ * formula.
+ */
+static av_always_inline void
+rgb64ToY_c_template_vsx(uint16_t *dst, const uint16_t *src, int width,
+                        enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+    const int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    const int num_vec = width / 8;  // number of full 8-pixel groups
+    const int frag    = width % 8;  // leftover pixels for the scalar tail
+    const vector int v_ry  = vec_splats((int)ry);
+    const vector int v_gy  = vec_splats((int)gy);
+    const vector int v_by  = vec_splats((int)by);
+    const vector int v_rnd = vec_splats((int)(0x2001 << (RGB2YUV_SHIFT - 1)));
+    const vector unsigned int v_shift = vec_splats((unsigned int)RGB2YUV_SHIFT);
+    vector int v_r, v_g, v_b, v_tmp;
+    vector short v_dst;
+    int i, half, k;
+
+    for (i = 0; i < num_vec; i++) {
+        for (half = 0; half < 2; half++) {
+            // gather four pixels into the lanes of v_r / v_g / v_b
+            for (k = 0; k < 4; k++) {
+                const int p = i * 8 + half * 4 + k;
+                int r_b = input_pixel(&src[p * 4 + 0]);
+                int g   = input_pixel(&src[p * 4 + 1]);
+                int b_r = input_pixel(&src[p * 4 + 2]);
+
+                v_r[k] = r;
+                v_g[k] = g;
+                v_b[k] = b;
+            }
+
+            v_tmp = v_ry * v_r + v_gy * v_g + v_by * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+
+            // Narrow each 32-bit lane by value. Reinterpreting the vector as
+            // shorts and picking the even elements would only select the low
+            // halfwords on little-endian targets.
+            for (k = 0; k < 4; k++)
+                v_dst[half * 4 + k] = (short)v_tmp[k];
+        }
+        vec_vsx_st(v_dst, 0, (short *)&dst[i * 8]);
+    }
+
+    // Scalar tail: the pixel index (not just i) must be scaled by the
+    // 4-component stride, matching the indexing of the vector loop above.
+    for (i = 0; i < frag; i++) {
+        const int p = num_vec * 8 + i;
+        unsigned int r_b = input_pixel(&src[p * 4 + 0]);
+        unsigned int g   = input_pixel(&src[p * 4 + 1]);
+        unsigned int b_r = input_pixel(&src[p * 4 + 2]);
+
+        dst[p] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+/**
+ * SIMD (IBM POWER8 VSX) version of rgb64ToUV_c_template() from
+ * libswscale/input.c.
+ *
+ * For every pixel p in [0, width) the three 16-bit components at
+ * src1[4*p + 0 .. 4*p + 2] are read through input_pixel() and
+ *   (cu_r*r + cu_g*g + cu_b*b + (0x10001 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT
+ * is stored to dstU[p] (coefficients ru/gu/bu) and dstV[p] (rv/gv/bv).
+ * `r` and `b` are the input.c macros that select r_b or b_r according to
+ * `origin`. src2 is only checked to be equal to src1.
+ *
+ * Pixels are handled in groups of 8; the remaining width % 8 pixels use the
+ * scalar formula.
+ */
+static av_always_inline void
+rgb64ToUV_c_template_vsx(uint16_t *dstU, uint16_t *dstV,
+                         const uint16_t *src1, const uint16_t *src2,
+                         int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+    const int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    const int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    const int num_vec = width / 8;  // number of full 8-pixel groups
+    const int frag    = width % 8;  // leftover pixels for the scalar tail
+    const vector int v_ru  = vec_splats((int)ru);
+    const vector int v_gu  = vec_splats((int)gu);
+    const vector int v_bu  = vec_splats((int)bu);
+    const vector int v_rv  = vec_splats((int)rv);
+    const vector int v_gv  = vec_splats((int)gv);
+    const vector int v_bv  = vec_splats((int)bv);
+    const vector int v_rnd = vec_splats((int)(0x10001 << (RGB2YUV_SHIFT - 1)));
+    const vector unsigned int v_shift = vec_splats((unsigned int)RGB2YUV_SHIFT);
+    vector int v_r, v_g, v_b, v_tmp;
+    vector short v_dstu, v_dstv;
+    int i, half, k;
+
+    av_assert1(src1==src2);
+
+    for (i = 0; i < num_vec; i++) {
+        for (half = 0; half < 2; half++) {
+            // gather four pixels into the lanes of v_r / v_g / v_b
+            for (k = 0; k < 4; k++) {
+                const int p = i * 8 + half * 4 + k;
+                int r_b = input_pixel(&src1[p * 4 + 0]);
+                int g   = input_pixel(&src1[p * 4 + 1]);
+                int b_r = input_pixel(&src1[p * 4 + 2]);
+
+                v_r[k] = r;
+                v_g[k] = g;
+                v_b[k] = b;
+            }
+
+            // Narrow each 32-bit lane by value; picking even elements of a
+            // (vector short) reinterpretation is only right on little-endian.
+            v_tmp = v_ru * v_r + v_gu * v_g + v_bu * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstu[half * 4 + k] = (short)v_tmp[k];
+
+            v_tmp = v_rv * v_r + v_gv * v_g + v_bv * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstv[half * 4 + k] = (short)v_tmp[k];
+        }
+        vec_vsx_st(v_dstu, 0, (short *)&dstU[i * 8]);
+        vec_vsx_st(v_dstv, 0, (short *)&dstV[i * 8]);
+    }
+
+    // Scalar tail: the pixel index (not just i) must be scaled by the
+    // 4-component stride, matching the indexing of the vector loop above.
+    for (i = 0; i < frag; i++) {
+        const int p = num_vec * 8 + i;
+        int r_b = input_pixel(&src1[p * 4 + 0]);
+        int g   = input_pixel(&src1[p * 4 + 1]);
+        int b_r = input_pixel(&src1[p * 4 + 2]);
+
+        dstU[p] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[p] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+/**
+ * SIMD (IBM POWER8 VSX) version of rgb64ToUV_half_c_template() from
+ * libswscale/input.c.
+ *
+ * Each output sample p in [0, width) is computed from two adjacent source
+ * pixels: component c is (src1[8*p + c] + src1[8*p + 4 + c] + 1) >> 1,
+ * read through input_pixel(). The averaged components are then converted
+ * with
+ *   (cu_r*r + cu_g*g + cu_b*b + (0x10001 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT
+ * into dstU[p] (ru/gu/bu) and dstV[p] (rv/gv/bv). `r` and `b` are the
+ * input.c macros that select r_b or b_r according to `origin`. src2 is only
+ * checked to be equal to src1.
+ *
+ * Outputs are produced in groups of 8; the remaining width % 8 samples use
+ * the scalar formula.
+ */
+static av_always_inline void
+rgb64ToUV_half_c_template_vsx(uint16_t *dstU, uint16_t *dstV,
+                              const uint16_t *src1, const uint16_t *src2,
+                              int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+    const int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    const int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    const int num_vec = width / 8;  // number of full 8-sample groups
+    const int frag    = width % 8;  // leftover samples for the scalar tail
+    const vector int v_ru  = vec_splats((int)ru);
+    const vector int v_gu  = vec_splats((int)gu);
+    const vector int v_bu  = vec_splats((int)bu);
+    const vector int v_rv  = vec_splats((int)rv);
+    const vector int v_gv  = vec_splats((int)gv);
+    const vector int v_bv  = vec_splats((int)bv);
+    const vector int v_rnd = vec_splats((int)(0x10001 << (RGB2YUV_SHIFT - 1)));
+    const vector unsigned int v_shift = vec_splats((unsigned int)RGB2YUV_SHIFT);
+    vector int v_r, v_g, v_b, v_tmp;
+    vector short v_dstu, v_dstv;
+    int i, half, k;
+
+    av_assert1(src1==src2);
+
+    for (i = 0; i < num_vec; i++) {
+        for (half = 0; half < 2; half++) {
+            // Gather four averaged pixel pairs. Output sample p consumes
+            // 8 source components, so the whole sample index is scaled by 8.
+            for (k = 0; k < 4; k++) {
+                const int p = i * 8 + half * 4 + k;
+                int r_b = (input_pixel(&src1[8 * p + 0]) + input_pixel(&src1[8 * p + 4]) + 1) >> 1;
+                int g   = (input_pixel(&src1[8 * p + 1]) + input_pixel(&src1[8 * p + 5]) + 1) >> 1;
+                int b_r = (input_pixel(&src1[8 * p + 2]) + input_pixel(&src1[8 * p + 6]) + 1) >> 1;
+
+                v_r[k] = r;
+                v_g[k] = g;
+                v_b[k] = b;
+            }
+
+            // Narrow each 32-bit lane by value; picking even elements of a
+            // (vector short) reinterpretation is only right on little-endian.
+            v_tmp = v_ru * v_r + v_gu * v_g + v_bu * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstu[half * 4 + k] = (short)v_tmp[k];
+
+            v_tmp = v_rv * v_r + v_gv * v_g + v_bv * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstv[half * 4 + k] = (short)v_tmp[k];
+        }
+        vec_vsx_st(v_dstu, 0, (short *)&dstU[i * 8]);
+        vec_vsx_st(v_dstv, 0, (short *)&dstV[i * 8]);
+    }
+
+    // Scalar tail, using the same 8-components-per-sample indexing.
+    for (i = 0; i < frag; i++) {
+        const int p = num_vec * 8 + i;
+        int r_b = (input_pixel(&src1[8 * p + 0]) + input_pixel(&src1[8 * p + 4]) + 1) >> 1;
+        int g   = (input_pixel(&src1[8 * p + 1]) + input_pixel(&src1[8 * p + 5]) + 1) >> 1;
+        int b_r = (input_pixel(&src1[8 * p + 2]) + input_pixel(&src1[8 * p + 6]) + 1) >> 1;
+
+        dstU[p] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[p] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+/**
+ * SIMD (IBM POWER8 VSX) version of rgb48ToY_c_template() from
+ * libswscale/input.c.
+ *
+ * For every pixel p in [0, width) the three 16-bit components at
+ * src[3*p + 0 .. 3*p + 2] are read through input_pixel() and
+ *   (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT
+ * is stored to dst[p]. `r` and `b` are the input.c macros that select
+ * r_b or b_r according to `origin`.
+ *
+ * Pixels are handled in groups of 8; the remaining width % 8 pixels use the
+ * scalar formula.
+ */
+static av_always_inline void rgb48ToY_c_template_vsx(uint16_t *dst,
+                                                     const uint16_t *src, int width,
+                                                     enum AVPixelFormat origin,
+                                                     int32_t *rgb2yuv)
+{
+    const int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+    const int num_vec = width / 8;  // number of full 8-pixel groups
+    const int frag    = width % 8;  // leftover pixels for the scalar tail
+    const vector int v_ry  = vec_splats((int)ry);
+    const vector int v_gy  = vec_splats((int)gy);
+    const vector int v_by  = vec_splats((int)by);
+    const vector int v_rnd = vec_splats((int)(0x2001 << (RGB2YUV_SHIFT - 1)));
+    const vector unsigned int v_shift = vec_splats((unsigned int)RGB2YUV_SHIFT);
+    vector int v_r, v_g, v_b, v_tmp;
+    vector short v_dst;
+    int i, half, k;
+
+    for (i = 0; i < num_vec; i++) {
+        for (half = 0; half < 2; half++) {
+            // gather four pixels into the lanes of v_r / v_g / v_b
+            for (k = 0; k < 4; k++) {
+                const int p = i * 8 + half * 4 + k;
+                int r_b = input_pixel(&src[p * 3 + 0]);
+                int g   = input_pixel(&src[p * 3 + 1]);
+                int b_r = input_pixel(&src[p * 3 + 2]);
+
+                v_r[k] = r;
+                v_g[k] = g;
+                v_b[k] = b;
+            }
+
+            v_tmp = v_ry * v_r + v_gy * v_g + v_by * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+
+            // Narrow each 32-bit lane by value. Reinterpreting the vector as
+            // shorts and picking the even elements would only select the low
+            // halfwords on little-endian targets.
+            for (k = 0; k < 4; k++)
+                v_dst[half * 4 + k] = (short)v_tmp[k];
+        }
+        vec_vsx_st(v_dst, 0, (short *)&dst[i * 8]);
+    }
+
+    // Scalar tail for the last width % 8 pixels.
+    for (i = 0; i < frag; i++) {
+        const int p = num_vec * 8 + i;
+        int r_b = input_pixel(&src[p * 3 + 0]);
+        int g   = input_pixel(&src[p * 3 + 1]);
+        int b_r = input_pixel(&src[p * 3 + 2]);
+
+        dst[p] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+/**
+ * SIMD (IBM POWER8 VSX) version of rgb48ToUV_c_template() from
+ * libswscale/input.c.
+ *
+ * For every pixel p in [0, width) the three 16-bit components at
+ * src1[3*p + 0 .. 3*p + 2] are read through input_pixel() and
+ *   (cu_r*r + cu_g*g + cu_b*b + (0x10001 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT
+ * is stored to dstU[p] (coefficients ru/gu/bu) and dstV[p] (rv/gv/bv).
+ * `r` and `b` are the input.c macros that select r_b or b_r according to
+ * `origin`. src2 is only checked to be equal to src1.
+ *
+ * Pixels are handled in groups of 8; the remaining width % 8 pixels use the
+ * scalar formula.
+ */
+static av_always_inline void rgb48ToUV_c_template_vsx(uint16_t *dstU,
+                                                      uint16_t *dstV,
+                                                      const uint16_t *src1,
+                                                      const uint16_t *src2,
+                                                      int width,
+                                                      enum AVPixelFormat origin,
+                                                      int32_t *rgb2yuv)
+{
+    const int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    const int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    const int num_vec = width / 8;  // number of full 8-pixel groups
+    const int frag    = width % 8;  // leftover pixels for the scalar tail
+    const vector int v_ru  = vec_splats((int)ru);
+    const vector int v_gu  = vec_splats((int)gu);
+    const vector int v_bu  = vec_splats((int)bu);
+    const vector int v_rv  = vec_splats((int)rv);
+    const vector int v_gv  = vec_splats((int)gv);
+    const vector int v_bv  = vec_splats((int)bv);
+    const vector int v_rnd = vec_splats((int)(0x10001 << (RGB2YUV_SHIFT - 1)));
+    const vector unsigned int v_shift = vec_splats((unsigned int)RGB2YUV_SHIFT);
+    vector int v_r, v_g, v_b, v_tmp;
+    vector short v_dstu, v_dstv;
+    int i, half, k;
+
+    av_assert1(src1==src2);
+
+    for (i = 0; i < num_vec; i++) {
+        for (half = 0; half < 2; half++) {
+            // gather four pixels into the lanes of v_r / v_g / v_b
+            for (k = 0; k < 4; k++) {
+                const int p = i * 8 + half * 4 + k;
+                int r_b = input_pixel(&src1[p * 3 + 0]);
+                int g   = input_pixel(&src1[p * 3 + 1]);
+                int b_r = input_pixel(&src1[p * 3 + 2]);
+
+                v_r[k] = r;
+                v_g[k] = g;
+                v_b[k] = b;
+            }
+
+            // Narrow each 32-bit lane by value; picking even elements of a
+            // (vector short) reinterpretation is only right on little-endian.
+            v_tmp = v_ru * v_r + v_gu * v_g + v_bu * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstu[half * 4 + k] = (short)v_tmp[k];
+
+            v_tmp = v_rv * v_r + v_gv * v_g + v_bv * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstv[half * 4 + k] = (short)v_tmp[k];
+        }
+        vec_vsx_st(v_dstu, 0, (short *)&dstU[i * 8]);
+        vec_vsx_st(v_dstv, 0, (short *)&dstV[i * 8]);
+    }
+
+    // Scalar tail: the pixel index (not just i) must be scaled by the
+    // 3-component stride, matching the indexing of the vector loop above.
+    for (i = 0; i < frag; i++) {
+        const int p = num_vec * 8 + i;
+        int r_b = input_pixel(&src1[p * 3 + 0]);
+        int g   = input_pixel(&src1[p * 3 + 1]);
+        int b_r = input_pixel(&src1[p * 3 + 2]);
+
+        dstU[p] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[p] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+/**
+ * SIMD (IBM POWER8 VSX) version of rgb48ToUV_half_c_template() from
+ * libswscale/input.c.
+ *
+ * Each output sample p in [0, width) is computed from two adjacent source
+ * pixels: component c is (src1[6*p + c] + src1[6*p + 3 + c] + 1) >> 1,
+ * read through input_pixel(). The averaged components are then converted
+ * with
+ *   (cu_r*r + cu_g*g + cu_b*b + (0x10001 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT
+ * into dstU[p] (ru/gu/bu) and dstV[p] (rv/gv/bv). `r` and `b` are the
+ * input.c macros that select r_b or b_r according to `origin`. src2 is only
+ * checked to be equal to src1.
+ *
+ * Outputs are produced in groups of 8; the remaining width % 8 samples use
+ * the scalar formula.
+ */
+static av_always_inline void rgb48ToUV_half_c_template_vsx(uint16_t *dstU,
+                                                           uint16_t *dstV,
+                                                           const uint16_t *src1,
+                                                           const uint16_t *src2,
+                                                           int width,
+                                                           enum AVPixelFormat origin,
+                                                           int32_t *rgb2yuv)
+{
+    const int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+    const int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+    const int num_vec = width / 8;  // number of full 8-sample groups
+    const int frag    = width % 8;  // leftover samples for the scalar tail
+    const vector int v_ru  = vec_splats((int)ru);
+    const vector int v_gu  = vec_splats((int)gu);
+    const vector int v_bu  = vec_splats((int)bu);
+    const vector int v_rv  = vec_splats((int)rv);
+    const vector int v_gv  = vec_splats((int)gv);
+    const vector int v_bv  = vec_splats((int)bv);
+    const vector int v_rnd = vec_splats((int)(0x10001 << (RGB2YUV_SHIFT - 1)));
+    const vector unsigned int v_shift = vec_splats((unsigned int)RGB2YUV_SHIFT);
+    vector int v_r, v_g, v_b, v_tmp;
+    vector short v_dstu, v_dstv;
+    int i, half, k;
+
+    av_assert1(src1==src2);
+
+    for (i = 0; i < num_vec; i++) {
+        for (half = 0; half < 2; half++) {
+            // gather four averaged pixel pairs into v_r / v_g / v_b
+            for (k = 0; k < 4; k++) {
+                const int p = i * 8 + half * 4 + k;
+                int r_b = (input_pixel(&src1[6 * p + 0]) +
+                           input_pixel(&src1[6 * p + 3]) + 1) >> 1;
+                int g   = (input_pixel(&src1[6 * p + 1]) +
+                           input_pixel(&src1[6 * p + 4]) + 1) >> 1;
+                int b_r = (input_pixel(&src1[6 * p + 2]) +
+                           input_pixel(&src1[6 * p + 5]) + 1) >> 1;
+
+                v_r[k] = r;
+                v_g[k] = g;
+                v_b[k] = b;
+            }
+
+            // Narrow each 32-bit lane by value; picking even elements of a
+            // (vector short) reinterpretation is only right on little-endian.
+            v_tmp = v_ru * v_r + v_gu * v_g + v_bu * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstu[half * 4 + k] = (short)v_tmp[k];
+
+            v_tmp = v_rv * v_r + v_gv * v_g + v_bv * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstv[half * 4 + k] = (short)v_tmp[k];
+        }
+        vec_vsx_st(v_dstu, 0, (short *)&dstU[i * 8]);
+        vec_vsx_st(v_dstv, 0, (short *)&dstV[i * 8]);
+    }
+
+    // Scalar tail for the last width % 8 samples.
+    for (i = 0; i < frag; i++) {
+        const int p = num_vec * 8 + i;
+        int r_b = (input_pixel(&src1[6 * p + 0]) +
+                   input_pixel(&src1[6 * p + 3]) + 1) >> 1;
+        int g   = (input_pixel(&src1[6 * p + 1]) +
+                   input_pixel(&src1[6 * p + 4]) + 1) >> 1;
+        int b_r = (input_pixel(&src1[6 * p + 2]) +
+                   input_pixel(&src1[6 * p + 5]) + 1) >> 1;
+
+        dstU[p] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+        dstV[p] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+    }
+}
+
+#pragma push_macro("r") // save the including file's definition of r
+#pragma push_macro("b") // save the including file's definition of b
+#pragma push_macro("input_pixel") // save the including file's input_pixel
+
+#undef r // the rgb16_32 functions below use r as a local variable
+#undef b // the rgb16_32 functions below use b as a local variable
+#undef input_pixel // replaced by the index-based variant defined next
+
+#define input_pixel(i) ((origin == AV_PIX_FMT_RGBA || /* these four formats read 4 bytes per pixel */ \
+ origin == AV_PIX_FMT_BGRA || \
+ origin == AV_PIX_FMT_ARGB || \
+ origin == AV_PIX_FMT_ABGR) \
+ ? AV_RN32A(&src[(i) * 4]) \
+ : (isBE(origin) ? AV_RB16(&src[(i) * 2]) /* every other format reads 2 bytes per pixel */ \
+ : AV_RL16(&src[(i) * 2])))
+
+/**
+ * SIMD (IBM POWER8 VSX) version of rgb16_32ToY_c_template() from
+ * libswscale/input.c.
+ *
+ * For every pixel p in [0, width): px = input_pixel(p) >> shp, the
+ * components are extracted as (px & maskX) >> shX, and
+ *   (ry*r + gy*g + by*b + rnd) >> (S - 6)
+ * is stored to dst[p], where ry/gy/by are the rgb2yuv coefficients
+ * pre-shifted by rsh/gsh/bsh and rnd = (32 << (S-1)) + (1 << (S-7)).
+ *
+ * Pixels are handled in groups of 8; the remaining width % 8 pixels use the
+ * scalar formula.
+ */
+static av_always_inline void rgb16_32ToY_c_template_vsx(int16_t *dst,
+                                                        const uint8_t *src,
+                                                        int width,
+                                                        enum AVPixelFormat origin,
+                                                        int shr, int shg,
+                                                        int shb, int shp,
+                                                        int maskr, int maskg,
+                                                        int maskb, int rsh,
+                                                        int gsh, int bsh, int S,
+                                                        int32_t *rgb2yuv)
+{
+    const int ry = rgb2yuv[RY_IDX]<<rsh, gy = rgb2yuv[GY_IDX]<<gsh, by = rgb2yuv[BY_IDX]<<bsh;
+    const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
+    const int num_vec = width / 8;  // number of full 8-pixel groups
+    const int frag    = width % 8;  // leftover pixels for the scalar tail
+    const vector int v_ry  = vec_splats((int)ry);
+    const vector int v_gy  = vec_splats((int)gy);
+    const vector int v_by  = vec_splats((int)by);
+    const vector int v_rnd = vec_splats((int)rnd);
+    const vector unsigned int v_shift = vec_splats((unsigned int)((S)-6));
+    vector int v_r, v_g, v_b, v_tmp;
+    vector short v_dst;
+    int i, half, k;
+
+    for (i = 0; i < num_vec; i++) {
+        for (half = 0; half < 2; half++) {
+            // gather four pixels into the lanes of v_r / v_g / v_b
+            for (k = 0; k < 4; k++) {
+                int px = input_pixel(i * 8 + half * 4 + k) >> shp;
+                int b  = (px & maskb) >> shb;
+                int g  = (px & maskg) >> shg;
+                int r  = (px & maskr) >> shr;
+
+                v_r[k] = r;
+                v_g[k] = g;
+                v_b[k] = b;
+            }
+
+            v_tmp = v_ry * v_r + v_gy * v_g + v_by * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+
+            // Narrow each 32-bit lane by value. Reinterpreting the vector as
+            // shorts and picking the even elements would only select the low
+            // halfwords on little-endian targets.
+            for (k = 0; k < 4; k++)
+                v_dst[half * 4 + k] = (short)v_tmp[k];
+        }
+        vec_vsx_st(v_dst, 0, (short *)&dst[i * 8]);
+    }
+
+    // Scalar tail for the last width % 8 pixels.
+    for (i = 0; i < frag; i++) {
+        int px = input_pixel(num_vec * 8 + i) >> shp;
+        int b  = (px & maskb) >> shb;
+        int g  = (px & maskg) >> shg;
+        int r  = (px & maskr) >> shr;
+
+        dst[num_vec * 8 + i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
+    }
+}
+
+/**
+ * SIMD (IBM POWER8 VSX) version of rgb16_32ToUV_c_template() from
+ * libswscale/input.c.
+ *
+ * For every pixel p in [0, width): px = input_pixel(p) >> shp, the
+ * components are extracted as (px & maskX) >> shX, and
+ *   (cu_r*r + cu_g*g + cu_b*b + rnd) >> (S - 6)
+ * is stored to dstU[p] (ru/gu/bu) and dstV[p] (rv/gv/bv), where the
+ * coefficients are the rgb2yuv entries pre-shifted by rsh/gsh/bsh and
+ * rnd = (256u << (S-1)) + (1 << (S-7)).
+ *
+ * Pixels are handled in groups of 8; the remaining width % 8 pixels use the
+ * scalar formula.
+ */
+static av_always_inline void rgb16_32ToUV_c_template_vsx(int16_t *dstU,
+                                                         int16_t *dstV,
+                                                         const uint8_t *src,
+                                                         int width,
+                                                         enum AVPixelFormat origin,
+                                                         int shr, int shg,
+                                                         int shb, int shp,
+                                                         int maskr, int maskg,
+                                                         int maskb, int rsh,
+                                                         int gsh, int bsh, int S,
+                                                         int32_t *rgb2yuv)
+{
+    const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh, bu = rgb2yuv[BU_IDX] << bsh,
+              rv = rgb2yuv[RV_IDX] << rsh, gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh;
+    const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
+    const int num_vec = width / 8;  // number of full 8-pixel groups
+    const int frag    = width % 8;  // leftover pixels for the scalar tail
+    const vector int v_ru  = vec_splats((int)ru);
+    const vector int v_gu  = vec_splats((int)gu);
+    const vector int v_bu  = vec_splats((int)bu);
+    const vector int v_rv  = vec_splats((int)rv);
+    const vector int v_gv  = vec_splats((int)gv);
+    const vector int v_bv  = vec_splats((int)bv);
+    const vector int v_rnd = vec_splats((int)rnd);
+    const vector unsigned int v_shift = vec_splats((unsigned int)((S)-6));
+    vector int v_r, v_g, v_b, v_tmp;
+    vector short v_dstu, v_dstv;
+    int i, half, k;
+
+    for (i = 0; i < num_vec; i++) {
+        for (half = 0; half < 2; half++) {
+            // gather four pixels into the lanes of v_r / v_g / v_b
+            for (k = 0; k < 4; k++) {
+                int px = input_pixel(i * 8 + half * 4 + k) >> shp;
+                int b  = (px & maskb) >> shb;
+                int g  = (px & maskg) >> shg;
+                int r  = (px & maskr) >> shr;
+
+                v_r[k] = r;
+                v_g[k] = g;
+                v_b[k] = b;
+            }
+
+            // Narrow each 32-bit lane by value; picking even elements of a
+            // (vector short) reinterpretation is only right on little-endian.
+            v_tmp = v_ru * v_r + v_gu * v_g + v_bu * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstu[half * 4 + k] = (short)v_tmp[k];
+
+            v_tmp = v_rv * v_r + v_gv * v_g + v_bv * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstv[half * 4 + k] = (short)v_tmp[k];
+        }
+        vec_vsx_st(v_dstu, 0, (short *)&dstU[i * 8]);
+        vec_vsx_st(v_dstv, 0, (short *)&dstV[i * 8]);
+    }
+
+    // Scalar tail for the last width % 8 pixels.
+    for (i = 0; i < frag; i++) {
+        int px = input_pixel(num_vec * 8 + i) >> shp;
+        int b  = (px & maskb) >> shb;
+        int g  = (px & maskg) >> shg;
+        int r  = (px & maskr) >> shr;
+
+        dstU[num_vec * 8 + i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
+        dstV[num_vec * 8 + i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
+    }
+}
+
+/**
+ * SIMD (IBM POWER8 VSX) version of rgb16_32ToUV_half_c_template() from
+ * libswscale/input.c.
+ *
+ * Each output sample p in [0, width) combines the two source pixels
+ * input_pixel(2*p) and input_pixel(2*p + 1) (both shifted right by shp):
+ * the green field is summed separately using maskgx = ~(maskr | maskb),
+ * the remainder of the sum carries red and blue, and the component masks
+ * are widened by one bit to hold the two-pixel sums. The summed components
+ * are converted with
+ *   (cu_r*r + cu_g*g + cu_b*b + rnd) >> (S - 6 + 1)
+ * into dstU[p] (ru/gu/bu) and dstV[p] (rv/gv/bv), with
+ * rnd = (256U << S) + (1 << (S-6)).
+ *
+ * Outputs are produced in groups of 8; the remaining width % 8 samples use
+ * the scalar formula.
+ */
+static av_always_inline void rgb16_32ToUV_half_c_template_vsx(int16_t *dstU,
+                                                              int16_t *dstV,
+                                                              const uint8_t *src,
+                                                              int width,
+                                                              enum AVPixelFormat origin,
+                                                              int shr, int shg,
+                                                              int shb, int shp,
+                                                              int maskr, int maskg,
+                                                              int maskb, int rsh,
+                                                              int gsh, int bsh, int S,
+                                                              int32_t *rgb2yuv)
+{
+    const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh, bu = rgb2yuv[BU_IDX] << bsh,
+              rv = rgb2yuv[RV_IDX] << rsh, gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh,
+              maskgx = ~(maskr | maskb);  // computed from the un-widened masks
+    const unsigned rnd = (256U<<(S)) + (1<<(S-6));
+    const int num_vec = width / 8;  // number of full 8-sample groups
+    const int frag    = width % 8;  // leftover samples for the scalar tail
+    const vector int v_ru  = vec_splats((int)ru);
+    const vector int v_gu  = vec_splats((int)gu);
+    const vector int v_bu  = vec_splats((int)bu);
+    const vector int v_rv  = vec_splats((int)rv);
+    const vector int v_gv  = vec_splats((int)gv);
+    const vector int v_bv  = vec_splats((int)bv);
+    const vector int v_rnd = vec_splats((int)rnd);
+    const vector unsigned int v_shift = vec_splats((unsigned int)((S)-6+1));
+    vector int v_r, v_g, v_b, v_tmp;
+    vector short v_dstu, v_dstv;
+    int i, half, k;
+
+    // widen the masks by one bit: the fields now hold sums of two pixels
+    maskr |= maskr << 1;
+    maskb |= maskb << 1;
+    maskg |= maskg << 1;
+
+    for (i = 0; i < num_vec; i++) {
+        for (half = 0; half < 2; half++) {
+            // gather four summed pixel pairs into v_r / v_g / v_b
+            for (k = 0; k < 4; k++) {
+                const int p = i * 8 + half * 4 + k;
+                unsigned px0 = input_pixel(2 * p + 0) >> shp;
+                unsigned px1 = input_pixel(2 * p + 1) >> shp;
+                int b, r, g = (px0 & maskgx) + (px1 & maskgx);
+                int rb = px0 + px1 - g;
+
+                b = (rb & maskb) >> shb;
+                if (shp ||
+                    origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+                    origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE) {
+                    g >>= shg;
+                } else {
+                    g = (g & maskg) >> shg;
+                }
+                r = (rb & maskr) >> shr;
+
+                v_r[k] = r;
+                v_g[k] = g;
+                v_b[k] = b;
+            }
+
+            // Narrow each 32-bit lane by value; picking even elements of a
+            // (vector short) reinterpretation is only right on little-endian.
+            v_tmp = v_ru * v_r + v_gu * v_g + v_bu * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstu[half * 4 + k] = (short)v_tmp[k];
+
+            v_tmp = v_rv * v_r + v_gv * v_g + v_bv * v_b + v_rnd;
+            v_tmp = vec_sr(v_tmp, v_shift);
+            for (k = 0; k < 4; k++)
+                v_dstv[half * 4 + k] = (short)v_tmp[k];
+        }
+        vec_vsx_st(v_dstu, 0, (short *)&dstU[i * 8]);
+        vec_vsx_st(v_dstv, 0, (short *)&dstV[i * 8]);
+    }
+
+    // Scalar tail for the last width % 8 samples.
+    for (i = 0; i < frag; i++) {
+        const int p = num_vec * 8 + i;
+        unsigned px0 = input_pixel(2 * p + 0) >> shp;
+        unsigned px1 = input_pixel(2 * p + 1) >> shp;
+        int b, r, g = (px0 & maskgx) + (px1 & maskgx);
+        int rb = px0 + px1 - g;
+
+        b = (rb & maskb) >> shb;
+        if (shp ||
+            origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+            origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE) {
+            g >>= shg;
+        } else {
+            g = (g & maskg) >> shg;
+        }
+        r = (rb & maskr) >> shr;
+
+        dstU[p] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
+        dstV[p] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
+    }
+}
+
+#undef input_pixel // drop the index-based variant defined in this header
+
+#pragma pop_macro("r") // restore the definition saved by push_macro("r")
+#pragma pop_macro("b") // restore the definition saved by push_macro("b")
+#pragma pop_macro("input_pixel") // restore the including file's input_pixel
+
--
2.4.11
More information about the ffmpeg-devel
mailing list