[FFmpeg-devel] [PATCH] POWER8 VSX vectorization of libswscale/input.c
Vyacheslav Pestov
pestov.vyach at yandex.ru
Thu Aug 13 17:55:27 EEST 2020
---
 libswscale/ppc/Makefile       |    3 +-
 libswscale/ppc/input_vsx.c    | 4559 +++++++++++++++++++++++++++++++++
 libswscale/swscale.c          |    2 +
 libswscale/swscale_internal.h |    1 +
 4 files changed, 4564 insertions(+), 1 deletion(-)
 create mode 100644 libswscale/ppc/input_vsx.c
diff --git a/libswscale/ppc/Makefile b/libswscale/ppc/Makefile
index 0a31a3025b..6f4c2ebd18 100644
--- a/libswscale/ppc/Makefile
+++ b/libswscale/ppc/Makefile
@@ -1,4 +1,5 @@
 OBJS += ppc/swscale_altivec.o \
         ppc/yuv2rgb_altivec.o \
         ppc/yuv2yuv_altivec.o \
-        ppc/swscale_vsx.o
+        ppc/swscale_vsx.o \
+        ppc/input_vsx.o
diff --git a/libswscale/ppc/input_vsx.c b/libswscale/ppc/input_vsx.c
new file mode 100644
index 0000000000..177b8862f0
--- /dev/null
+++ b/libswscale/ppc/input_vsx.c
@@ -0,0 +1,4559 @@
+/*
+ * POWER8 VSX vectorization of libswscale/input.c
+ * Written by Vyacheslav Pestov.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * POWER8 VSX vectorization of libswscale/input.c
+ * @author Vyacheslav Pestov
+ */
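+
+/*
+ * Each conversion below follows the same pattern: a VSX main loop handles the
+ * width rounded down to a multiple of 8 pixels, and the remaining pixels fall
+ * through to the scalar code from libswscale/input.c.
+ */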
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+
+#include "libavutil/avutil.h"
+#include "libavutil/bswap.h"
+#include "libavutil/cpu.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mathematics.h"
+#include "libavutil/pixdesc.h"
+#include "libavutil/avassert.h"
+#include "libavutil/ppc/util_altivec.h"
+#include "libavutil/timer.h"
+#include "config.h"
+#include "../rgb2rgb.h"
+#include "../swscale.h"
+#include "../swscale_internal.h"
+
+
+
+
+#if HAVE_VSX
+#if !HAVE_BIGENDIAN
+
+
+//vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+//vector signed short v_8000 = vec_splats((signed short)0x8000);
+//vector signed short v_7FFF = vec_splats((signed short)0x7FFF);
+//vector signed short v_FFFF = vec_splats((signed short)0xFFFF);
+vector unsigned int v_000000FF = ((vector unsigned int){0xFF, 0xFF, 0xFF, 0xFF});
+
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+
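+/* Component-selection helpers mirroring the r/b macros in libswscale/input.c:
+ * for BGR-ordered 48/64-bit formats the first and third components swap roles,
+ * so the scalar r/b and the vector v_r1/v_b1 and v_r2/v_b2 names resolve to
+ * either r_b/b_r or b_r/r_b depending on the origin format. */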
+#define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+ origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+ ? b_r : r_b)
+#define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+ origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+ ? r_b : b_r)
+#define v_r1 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+ origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+ ? v_b_r1 : v_r_b1)
+#define v_b1 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+ origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+ ? v_r_b1 : v_b_r1)
+#define v_r2 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+ origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+ ? v_b_r2 : v_r_b2)
+#define v_b2 ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || \
+ origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) \
+ ? v_r_b2 : v_b_r2)
+
+static av_always_inline void
+rgb64ToY_c_template_vsx(uint16_t *dst, const uint16_t *src, int width,
+ enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+
+ int i, width_adj, is_BE;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3;
+ vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ vector signed int v_ry, v_gy, v_by;
+ int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x2001<<(RGB2YUV_SHIFT-1)));
+ shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+ v_ry = vec_splats((signed int)ry);
+ v_gy = vec_splats((signed int)gy);
+ v_by = vec_splats((signed int)by);
+ is_BE = isBE(origin);
+ }
+
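+ /* Main loop: 8 RGBA64 pixels (64 bytes) per iteration. vec_perm gathers the
+  * R, G and B words (byte-swapping them for big-endian input), vec_mergeh
+  * zero-extends them to 32 bits, and the weighted sum is rounded and shifted
+  * down by RGB2YUV_SHIFT before the eight 16-bit luma values are packed and
+  * stored. */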
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+ v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+ v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr+48));
+
+ if(is_BE){
+ v_r_b1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+ v_r_b2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+ v_g1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+ v_g2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+ v_b_r1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+ v_b_r2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+ }else{
+ v_r_b1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+ v_r_b2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+ v_g1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+ v_g2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+ v_b_r1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+ v_b_r2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+ }
+
+
+ v_r_b1 = vec_mergeh(v_r_b1, v_null);
+ v_g1 = vec_mergeh(v_g1, v_null);
+ v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+ v_r_b2 = vec_mergeh(v_r_b2, v_null);
+ v_g2 = vec_mergeh(v_g2, v_null);
+ v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gy ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_by ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ry);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g2, v_gy ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b2, v_by ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2, ((vector unsigned char)
+ {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr);
+
+ src_addr += 64;
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ unsigned int r_b = input_pixel(&src[i*4+0]);
+ unsigned int g = input_pixel(&src[i*4+1]);
+ unsigned int b_r = input_pixel(&src[i*4+2]);
+
+ dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ }
+
+}
+
+static av_always_inline void
+rgb64ToUV_c_template_vsx(uint16_t *dstU, uint16_t *dstV,
+ const uint16_t *src1, const uint16_t *src2,
+ int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+
+ int i, width_adj, is_BE;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3;
+ vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+ av_assert1(src1 == src2);
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x10001<<(RGB2YUV_SHIFT-1)));
+ shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+ v_ru = vec_splats((signed int)ru);
+ v_gu = vec_splats((signed int)gu);
+ v_bu = vec_splats((signed int)bu);
+ v_rv = vec_splats((signed int)rv);
+ v_gv = vec_splats((signed int)gv);
+ v_bv = vec_splats((signed int)bv);
+ is_BE = isBE(origin);
+ }
+
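+ /* Same component gathering as the luma path above, but two weighted sums
+  * (U and V coefficients) are computed per 8-pixel block and stored to
+  * dstU and dstV respectively. */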
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+ v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+ v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr+48));
+
+ if(is_BE){
+ v_r_b1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+ v_r_b2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){1, 0, 9, 8, 17, 16, 25, 24}));
+ v_g1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+ v_g2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){3, 2, 11, 10, 19, 18, 27, 26}));
+ v_b_r1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+ v_b_r2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){5, 4, 13, 12, 21, 20, 29, 28}));
+ }else{
+ v_r_b1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+ v_r_b2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25}));
+ v_g1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+ v_g2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27}));
+ v_b_r1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+ v_b_r2 = vec_perm(v_rd2, v_rd3,
+ ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29}));
+ }
+
+
+ v_r_b1 = vec_mergeh(v_r_b1, v_null);
+ v_g1 = vec_mergeh(v_g1, v_null);
+ v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+ v_r_b2 = vec_mergeh(v_r_b2, v_null);
+ v_g2 = vec_mergeh(v_g2, v_null);
+ v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gu ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_bu ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ru);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g2, v_gu ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b2, v_bu ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gv ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_bv ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_rv);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g2, v_gv ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b2, v_bv ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 64;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ int r_b = input_pixel(&src1[i*4+0]);
+ int g = input_pixel(&src1[i*4+1]);
+ int b_r = input_pixel(&src1[i*4+2]);
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ }
+
+}
+
+static av_always_inline void
+rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
+ const uint16_t *src1, const uint16_t *src2,
+ int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
+{
+ int i;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ av_assert1(src1==src2);
+ for (i = 0; i < width; i++) {
+ int r_b = (input_pixel(&src1[8 * i + 0]) + input_pixel(&src1[8 * i + 4]) + 1) >> 1;
+ int g = (input_pixel(&src1[8 * i + 1]) + input_pixel(&src1[8 * i + 5]) + 1) >> 1;
+ int b_r = (input_pixel(&src1[8 * i + 2]) + input_pixel(&src1[8 * i + 6]) + 1) >> 1;
+
+ dstU[i]= (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ dstV[i]= (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
+ }
+}
+
+#define rgb64funcs(pattern, BE_LE, origin) \
+static void pattern ## 64 ## BE_LE ## ToY_c_vsx(uint8_t *_dst, const uint8_t *_src, \
+ const uint8_t *unused0, const uint8_t *unused1, \
+ int width, uint32_t *rgb2yuv) \
+{ \
+ const uint16_t *src = (const uint16_t *) _src; \
+ uint16_t *dst = (uint16_t *) _dst; \
+ rgb64ToY_c_template_vsx(dst, src, width, origin, rgb2yuv); \
+} \
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV, \
+ const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+ int width, uint32_t *rgb2yuv) \
+{ \
+ const uint16_t *src1 = (const uint16_t *) _src1, \
+ *src2 = (const uint16_t *) _src2; \
+ uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+ rgb64ToUV_c_template_vsx(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+}\
+ \
+static void pattern ## 64 ## BE_LE ## ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, \
+ const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
+ int width, uint32_t *rgb2yuv) \
+{ \
+ const uint16_t *src1 = (const uint16_t *) _src1, \
+ *src2 = (const uint16_t *) _src2; \
+ uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
+ rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+}
+
+rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE)
+rgb64funcs(rgb, BE, AV_PIX_FMT_RGBA64BE)
+rgb64funcs(bgr, LE, AV_PIX_FMT_BGRA64LE)
+rgb64funcs(bgr, BE, AV_PIX_FMT_BGRA64BE)
+
+static av_always_inline void rgb48ToY_c_template_vsx(uint16_t *dst,
+ const uint16_t *src, int width,
+ enum AVPixelFormat origin,
+ int32_t *rgb2yuv)
+{
+
+ int i, width_adj, is_BE;
+ vector unsigned short v_rd0, v_rd1, v_rd2;
+ vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ vector signed int v_ry, v_gy, v_by;
+ int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x2001<<(RGB2YUV_SHIFT-1)));
+ shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+ v_ry = vec_splats((signed int)ry);
+ v_gy = vec_splats((signed int)gy);
+ v_by = vec_splats((signed int)by);
+ is_BE = isBE(origin);
+ }
+
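+ /* RGB48 carries 3 components per pixel, so 8 pixels span 48 bytes (three
+  * 16-byte loads); the permute masks pick R, G and B from across
+  * v_rd0..v_rd2 before the same weighted-sum/round/shift as the 64-bit path. */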
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+ v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+ if(is_BE){
+ v_r_b1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){1, 0, 7, 6, 13, 12, 19, 18}));
+ v_r_b2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){9, 8, 15, 14, 21, 20, 27, 26}));
+ v_g1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){3, 2, 9, 8, 15, 14, 21, 20}));
+ v_g2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){11, 10, 17, 16, 23, 22, 29, 28}));
+ v_b_r1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){5, 4, 11, 10, 17, 16, 23, 22}));
+ v_b_r2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){13, 12, 19, 18, 25, 24, 31, 30}));
+ }else{
+ v_r_b1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 1, 6, 7, 12, 13, 18, 19}));
+ v_r_b2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){8, 9, 14, 15, 20, 21, 26, 27}));
+ v_g1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){2, 3, 8, 9, 14, 15, 20, 21}));
+ v_g2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){10, 11, 16, 17, 22, 23, 28, 29}));
+ v_b_r1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){4, 5, 10, 11, 16, 17, 22, 23}));
+ v_b_r2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){12, 13, 18, 19, 24, 25, 30, 31}));
+ }
+
+
+ v_r_b1 = vec_mergeh(v_r_b1, v_null);
+ v_g1 = vec_mergeh(v_g1, v_null);
+ v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+ v_r_b2 = vec_mergeh(v_r_b2, v_null);
+ v_g2 = vec_mergeh(v_g2, v_null);
+ v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gy ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_by ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ry);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g2, v_gy ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b2, v_by ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr);
+
+ src_addr += 48;
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ unsigned int r_b = input_pixel(&src[i * 3 + 0]);
+ unsigned int g = input_pixel(&src[i * 3 + 1]);
+ unsigned int b_r = input_pixel(&src[i * 3 + 2]);
+
+ dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+ }
+
+}
+
+static av_always_inline void rgb48ToUV_c_template_vsx(uint16_t *dstU,
+ uint16_t *dstV,
+ const uint16_t *src1,
+ const uint16_t *src2,
+ int width,
+ enum AVPixelFormat origin,
+ int32_t *rgb2yuv)
+{
+
+ int i, width_adj, is_BE;
+ vector unsigned short v_rd0, v_rd1, v_rd2;
+ vector unsigned short v_b_r1, v_b_r2, v_r_b1, v_r_b2, v_g1, v_g2;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX],
+ rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+ av_assert1(src1 == src2);
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x10001<<(RGB2YUV_SHIFT-1)));
+ shift2 = vec_splats((unsigned int)RGB2YUV_SHIFT);
+ v_ru = vec_splats((signed int)ru);
+ v_gu = vec_splats((signed int)gu);
+ v_bu = vec_splats((signed int)bu);
+ v_rv = vec_splats((signed int)rv);
+ v_gv = vec_splats((signed int)gv);
+ v_bv = vec_splats((signed int)bv);
+ is_BE = isBE(origin);
+ }
+
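+ /* Chroma counterpart of rgb48ToY_c_template_vsx: identical component
+  * extraction, followed by the U and V weighted sums. */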
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+ v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+ if(is_BE){
+ v_r_b1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){1, 0, 7, 6, 13, 12, 19, 18}));
+ v_r_b2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){9, 8, 15, 14, 21, 20, 27, 26}));
+ v_g1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){3, 2, 9, 8, 15, 14, 21, 20}));
+ v_g2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){11, 10, 17, 16, 23, 22, 29, 28}));
+ v_b_r1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){5, 4, 11, 10, 17, 16, 23, 22}));
+ v_b_r2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){13, 12, 19, 18, 25, 24, 31, 30}));
+ }else{
+ v_r_b1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 1, 6, 7, 12, 13, 18, 19}));
+ v_r_b2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){8, 9, 14, 15, 20, 21, 26, 27}));
+ v_g1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){2, 3, 8, 9, 14, 15, 20, 21}));
+ v_g2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){10, 11, 16, 17, 22, 23, 28, 29}));
+ v_b_r1 = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){4, 5, 10, 11, 16, 17, 22, 23}));
+ v_b_r2 = vec_perm(v_rd1, v_rd2,
+ ((vector unsigned char){12, 13, 18, 19, 24, 25, 30, 31}));
+ }
+
+
+ v_r_b1 = vec_mergeh(v_r_b1, v_null);
+ v_g1 = vec_mergeh(v_g1, v_null);
+ v_b_r1 = vec_mergeh(v_b_r1, v_null);
+
+ v_r_b2 = vec_mergeh(v_r_b2, v_null);
+ v_g2 = vec_mergeh(v_g2, v_null);
+ v_b_r2 = vec_mergeh(v_b_r2, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gu ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_bu ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_ru);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g2, v_gu ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b2, v_bu ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gv ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_bv ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r2, v_rv);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g2, v_gv ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b2, v_bv ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 48;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ int r_b = input_pixel(&src1[i * 3 + 0]);
+ int g = input_pixel(&src1[i * 3 + 1]);
+ int b_r = input_pixel(&src1[i * 3 + 2]);
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+ dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+ }
+
+}
+
+static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
+ uint16_t *dstV,
+ const uint16_t *src1,
+ const uint16_t *src2,
+ int width,
+ enum AVPixelFormat origin,
+ int32_t *rgb2yuv)
+{
+
+ int i;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+
+ av_assert1(src1 == src2);
+ for (i = 0; i < width; i++) {
+ int r_b = (input_pixel(&src1[6 * i + 0]) +
+ input_pixel(&src1[6 * i + 3]) + 1) >> 1;
+ int g = (input_pixel(&src1[6 * i + 1]) +
+ input_pixel(&src1[6 * i + 4]) + 1) >> 1;
+ int b_r = (input_pixel(&src1[6 * i + 2]) +
+ input_pixel(&src1[6 * i + 5]) + 1) >> 1;
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+ dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
+ }
+}
+
+#undef r
+#undef b
+#undef v_r1
+#undef v_b1
+#undef v_r2
+#undef v_b2
+#undef input_pixel
+
+#define rgb48funcs(pattern, BE_LE, origin) \
+static void pattern ## 48 ## BE_LE ## ToY_c_vsx(uint8_t *_dst, \
+ const uint8_t *_src, \
+ const uint8_t *unused0, const uint8_t *unused1,\
+ int width, \
+ uint32_t *rgb2yuv) \
+{ \
+ const uint16_t *src = (const uint16_t *)_src; \
+ uint16_t *dst = (uint16_t *)_dst; \
+ rgb48ToY_c_template_vsx(dst, src, width, origin, rgb2yuv); \
+} \
+ \
+static void pattern ## 48 ## BE_LE ## ToUV_c_vsx(uint8_t *_dstU, \
+ uint8_t *_dstV, \
+ const uint8_t *unused0, \
+ const uint8_t *_src1, \
+ const uint8_t *_src2, \
+ int width, \
+ uint32_t *rgb2yuv) \
+{ \
+ const uint16_t *src1 = (const uint16_t *)_src1, \
+ *src2 = (const uint16_t *)_src2; \
+ uint16_t *dstU = (uint16_t *)_dstU, \
+ *dstV = (uint16_t *)_dstV; \
+ rgb48ToUV_c_template_vsx(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+} \
+ \
+static void pattern ## 48 ## BE_LE ## ToUV_half_c_vsx(uint8_t *_dstU, \
+ uint8_t *_dstV, \
+ const uint8_t *unused0, \
+ const uint8_t *_src1, \
+ const uint8_t *_src2, \
+ int width, \
+ uint32_t *rgb2yuv) \
+{ \
+ const uint16_t *src1 = (const uint16_t *)_src1, \
+ *src2 = (const uint16_t *)_src2; \
+ uint16_t *dstU = (uint16_t *)_dstU, \
+ *dstV = (uint16_t *)_dstV; \
+ rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
+}
+
+rgb48funcs(rgb, LE, AV_PIX_FMT_RGB48LE)
+rgb48funcs(rgb, BE, AV_PIX_FMT_RGB48BE)
+rgb48funcs(bgr, LE, AV_PIX_FMT_BGR48LE)
+rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE)
+
+#define input_pixel(i) ((origin == AV_PIX_FMT_RGBA || \
+ origin == AV_PIX_FMT_BGRA || \
+ origin == AV_PIX_FMT_ARGB || \
+ origin == AV_PIX_FMT_ABGR) \
+ ? AV_RN32A(&src[(i) * 4]) \
+ : (isBE(origin) ? AV_RB16(&src[(i) * 2]) \
+ : AV_RL16(&src[(i) * 2])))
+
+
+static av_always_inline void rgb16_32ToY_c_template_vsx(int16_t *dst,
+ const uint8_t *src,
+ int width,
+ enum AVPixelFormat origin,
+ int shr, int shg,
+ int shb, int shp,
+ int maskr, int maskg,
+ int maskb, int rsh,
+ int gsh, int bsh, int S,
+ int32_t *rgb2yuv)
+{
+
+ int i, width_adj, is_DW, is_BE;
+ vector signed short v_rd0, v_rd1, v_px;
+ vector signed short v_r1, v_r2, v_b1, v_b2, v_g1, v_g2;
+ vector signed int v_dst1, v_dst2;
+ vector signed int shift1;
+ vector signed int shift2;
+ const int ry = rgb2yuv[RY_IDX]<<rsh, gy = rgb2yuv[GY_IDX]<<gsh,
+ by = rgb2yuv[BY_IDX]<<bsh;
+ const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((signed int)rnd);
+ shift2 = vec_splats((signed int)((S)-6));
+ is_DW = (origin == AV_PIX_FMT_RGBA || origin == AV_PIX_FMT_BGRA ||
+ origin == AV_PIX_FMT_ARGB || origin == AV_PIX_FMT_ABGR);
+ if(!is_DW)
+ is_BE = isBE(origin);
+ }
+
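+ /* Packed 16/32-bit RGB: 32-bit formats (is_DW) are masked and shifted as
+  * whole 32-bit words, while 16-bit formats are masked and shifted in 16-bit
+  * lanes (after a byte swap for BE input) and then widened to 32 bits with
+  * vec_mergeh/vec_mergel. */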
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (signed short *)src_addr);
+
+ if(is_DW){
+ src_addr += 16;
+ v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+ v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0,
+ (vector unsigned int)vec_splats((signed int)shp));
+ v_b1 = (vector signed short)vec_and((vector signed int)v_rd0,
+ vec_splats((signed int)maskb));
+ v_b1 = (vector signed short)vec_sr((vector unsigned int)v_b1,
+ (vector unsigned int)vec_splats((signed int)shb));
+ v_g1 = (vector signed short)vec_and((vector signed int)v_rd0,
+ vec_splats((signed int)maskg));
+ v_g1 = (vector signed short)vec_sr((vector unsigned int)v_g1,
+ (vector unsigned int)vec_splats((signed int)shg));
+ v_r1 = (vector signed short)vec_and((vector signed int)v_rd0,
+ vec_splats((signed int)maskr));
+ v_r1 = (vector signed short)vec_sr((vector unsigned int)v_r1,
+ (vector unsigned int)vec_splats((signed int)shr));
+
+ v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1,
+ (vector unsigned int)vec_splats((signed int)shp));
+ v_b2 = (vector signed short)vec_and((vector signed int)v_rd1,
+ vec_splats((signed int)maskb));
+ v_b2 = (vector signed short)vec_sr((vector unsigned int)v_b2,
+ (vector unsigned int)vec_splats((signed int)shb));
+ v_g2 = (vector signed short)vec_and((vector signed int)v_rd1,
+ vec_splats((signed int)maskg));
+ v_g2 = (vector signed short)vec_sr((vector unsigned int)v_g2,
+ (vector unsigned int)vec_splats((signed int)shg));
+ v_r2 = (vector signed short)vec_and((vector signed int)v_rd1,
+ vec_splats((signed int)maskr));
+ v_r2 = (vector signed short)vec_sr((vector unsigned int)v_r2,
+ (vector unsigned int)vec_splats((signed int)shr));
+ }else{
+ if(is_BE){
+ v_rd0 = vec_perm(v_rd0, v_rd0,
+ ((vector unsigned char){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}));
+ }
+ v_px = (vector signed short)vec_sr((vector unsigned short)v_rd0,
+ (vector unsigned short)vec_splats((signed short)shp));
+ v_b1 = (vector signed short)vec_and(v_px,
+ vec_splats((signed short)maskb));
+ v_b1 = (vector signed short)vec_sr((vector unsigned short)v_b1,
+ (vector unsigned short)vec_splats((signed short)shb));
+ v_g1 = (vector signed short)vec_and(v_px,
+ vec_splats((signed short)maskg));
+ v_g1 = (vector signed short)vec_sr((vector unsigned short)v_g1,
+ (vector unsigned short)vec_splats((signed short)shg));
+ v_r1 = (vector signed short)vec_and(v_px,
+ vec_splats((signed short)maskr));
+ v_r1 = (vector signed short)vec_sr((vector unsigned short)v_r1,
+ (vector unsigned short)vec_splats((signed short)shr));
+
+
+ v_b2 = vec_mergel(v_b1, (vector signed short)v_null);
+ v_g2 = vec_mergel(v_g1, (vector signed short)v_null);
+ v_r2 = vec_mergel(v_r1, (vector signed short)v_null);
+ v_b1 = vec_mergeh(v_b1, (vector signed short)v_null);
+ v_g1 = vec_mergeh(v_g1, (vector signed short)v_null);
+ v_r1 = vec_mergeh(v_r1, (vector signed short)v_null);
+
+ }
+
+ v_dst1 = vec_mul((vector signed int)v_r1,
+ vec_splats((signed int)ry));
+ v_dst1 = vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1,
+ vec_splats((signed int)gy) ));
+ v_dst1 = vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1,
+ vec_splats((signed int)by) ));
+ v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+ v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+ v_dst2 = vec_mul((vector signed int)v_r2,
+ vec_splats((signed int)ry));
+ v_dst2 = vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g2,
+ vec_splats((signed int)gy) ));
+ v_dst2 = vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b2,
+ vec_splats((signed int)by) ));
+ v_dst2 = vec_add(v_dst2, (vector signed int)shift1);
+ v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr);
+
+ dst_addr += 16;
+ src_addr += 16;
+
+ }
+
+ for (i = width_adj; i < width; i++) {
+ int px = input_pixel(i) >> shp;
+ int b = (px & maskb) >> shb;
+ int g = (px & maskg) >> shg;
+ int r = (px & maskr) >> shr;
+ dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
+ }
+
+}
+
+
+static av_always_inline void rgb16_32ToUV_c_template_vsx(int16_t *dstU,
+ int16_t *dstV,
+ const uint8_t *src,
+ int width,
+ enum AVPixelFormat origin,
+ int shr, int shg,
+ int shb, int shp,
+ int maskr, int maskg,
+ int maskb, int rsh,
+ int gsh, int bsh, int S,
+ int32_t *rgb2yuv)
+{
+
+ int i, width_adj, is_DW, is_BE;
+ vector signed short v_rd0, v_rd1, v_px;
+ vector signed short v_r1, v_r2, v_b1, v_b2, v_g1, v_g2;
+ vector signed int v_dst1, v_dst2;
+ vector unsigned int shift1;
+ vector signed int shift2;
+ const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh,
+ bu = rgb2yuv[BU_IDX] << bsh, rv = rgb2yuv[RV_IDX] << rsh,
+ gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh;
+ const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)rnd);
+ shift2 = vec_splats((signed int)((S)-6));
+ is_DW = (origin == AV_PIX_FMT_RGBA || origin == AV_PIX_FMT_BGRA ||
+ origin == AV_PIX_FMT_ARGB || origin == AV_PIX_FMT_ABGR);
+ if(!is_DW)
+ is_BE = isBE(origin);
+ }
+
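+ /* Component extraction matches rgb16_32ToY_c_template_vsx; the loop then
+  * derives the U and V planes from the same r/g/b vectors. */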
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (signed short *)src_addr);
+
+ if(is_DW){
+ src_addr += 16;
+ v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+ v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0,
+ (vector unsigned int)vec_splats((signed int)shp));
+ v_b1 = (vector signed short)vec_and((vector signed int)v_rd0,
+ vec_splats((signed int)maskb));
+ v_b1 = (vector signed short)vec_sr((vector unsigned int)v_b1,
+ (vector unsigned int)vec_splats((signed int)shb));
+ v_g1 = (vector signed short)vec_and((vector signed int)v_rd0,
+ vec_splats((signed int)maskg));
+ v_g1 = (vector signed short)vec_sr((vector unsigned int)v_g1,
+ (vector unsigned int)vec_splats((signed int)shg));
+ v_r1 = (vector signed short)vec_and((vector signed int)v_rd0,
+ vec_splats((signed int)maskr));
+ v_r1 = (vector signed short)vec_sr((vector unsigned int)v_r1,
+ (vector unsigned int)vec_splats((signed int)shr));
+
+ v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1,
+ (vector unsigned int)vec_splats((signed int)shp));
+ v_b2 = (vector signed short)vec_and((vector signed int)v_rd1,
+ vec_splats((signed int)maskb));
+ v_b2 = (vector signed short)vec_sr((vector unsigned int)v_b2,
+ (vector unsigned int)vec_splats((signed int)shb));
+ v_g2 = (vector signed short)vec_and((vector signed int)v_rd1,
+ vec_splats((signed int)maskg));
+ v_g2 = (vector signed short)vec_sr((vector unsigned int)v_g2,
+ (vector unsigned int)vec_splats((signed int)shg));
+ v_r2 = (vector signed short)vec_and((vector signed int)v_rd1,
+ vec_splats((signed int)maskr));
+ v_r2 = (vector signed short)vec_sr((vector unsigned int)v_r2,
+ (vector unsigned int)vec_splats((signed int)shr));
+ }else{
+ if(is_BE){
+ v_rd0 = vec_perm(v_rd0, v_rd0,
+ ((vector unsigned char){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}));
+ }
+ v_px = (vector signed short)vec_sr((vector unsigned short)v_rd0,
+ (vector unsigned short)vec_splats((signed short)shp));
+ v_b1 = (vector signed short)vec_and(v_px,
+ vec_splats((signed short)maskb));
+ v_b1 = (vector signed short)vec_sr((vector unsigned short)v_b1,
+ (vector unsigned short)vec_splats((signed short)shb));
+ v_g1 = (vector signed short)vec_and(v_px,
+ vec_splats((signed short)maskg));
+ v_g1 = (vector signed short)vec_sr((vector unsigned short)v_g1,
+ (vector unsigned short)vec_splats((signed short)shg));
+ v_r1 = (vector signed short)vec_and(v_px,
+ vec_splats((signed short)maskr));
+ v_r1 = (vector signed short)vec_sr((vector unsigned short)v_r1,
+ (vector unsigned short)vec_splats((signed short)shr));
+
+
+ v_b2 = vec_mergel(v_b1, (vector signed short)v_null);
+ v_g2 = vec_mergel(v_g1, (vector signed short)v_null);
+ v_r2 = vec_mergel(v_r1, (vector signed short)v_null);
+ v_b1 = vec_mergeh(v_b1, (vector signed short)v_null);
+ v_g1 = vec_mergeh(v_g1, (vector signed short)v_null);
+ v_r1 = vec_mergeh(v_r1, (vector signed short)v_null);
+
+ }
+
+
+ v_dst1 = vec_mul((vector signed int)v_r1,
+ vec_splats((signed int)ru));
+ v_dst1 = vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1,
+ vec_splats((signed int)gu) ));
+ v_dst1 = vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1,
+ vec_splats((signed int)bu) ));
+ v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+ v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+ v_dst2 = (vector signed int)vec_mul((vector unsigned int)v_r2, vec_splats((unsigned int)ru));
+ v_dst2 = vec_add((vector signed int)v_dst2, vec_mul((vector signed int)v_g2, vec_splats((signed int)gu) ));
+ v_dst2 = vec_add((vector signed int)v_dst2, vec_mul((vector signed int)v_b2, vec_splats((signed int)bu) ));
+
+ v_dst2 = (vector signed int)vec_add((vector unsigned int)v_dst2, (vector unsigned int)shift1);
+
+ v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+ v_dst1 = vec_mul((vector signed int)v_r1,
+ vec_splats((signed int)rv));
+ v_dst1 = vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1,
+ vec_splats((signed int)gv) ));
+ v_dst1 = vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1,
+ vec_splats((signed int)bv) ));
+ v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+ v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+ v_dst2 = (vector signed int)vec_mul((vector unsigned int)v_r2, vec_splats((unsigned int)rv));
+ v_dst2 = vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g2,
+ vec_splats((signed int)gv) ));
+ v_dst2 = vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b2,
+ vec_splats((signed int)bv) ));
+
+ v_dst2 = (vector signed int)vec_add((vector unsigned int)v_dst2, (vector unsigned int)shift1);
+
+ v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+ dstU_addr += 16;
+ dstV_addr += 16;
+ src_addr += 16;
+
+ }
+
+ for (i = width_adj; i < width; i++) {
+ int px = input_pixel(i) >> shp;
+ int b = (px & maskb) >> shb;
+ int g = (px & maskg) >> shg;
+ int r = (px & maskr) >> shr;
+
+ dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
+ dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
+ }
+
+}
+
+static av_always_inline void rgb16_32ToUV_half_c_template_vsx(int16_t *dstU,
+ int16_t *dstV,
+ const uint8_t *src,
+ int width,
+ enum AVPixelFormat origin,
+ int shr, int shg,
+ int shb, int shp,
+ int maskr, int maskg,
+ int maskb, int rsh,
+ int gsh, int bsh, int S,
+ int32_t *rgb2yuv)
+{
+
+ int i, width_adj, is_DW, is_BE;
+ vector signed short v_rd0, v_rd1;
+ vector unsigned int v_px0, v_px1;
+ vector signed int v_r2, v_g2, v_b2, v_r1, v_g1, v_b1, v_rb;
+ vector signed int v_dst1, v_dst2;
+ vector unsigned int shift1;
+ vector signed int shift2;
+ const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh,
+ bu = rgb2yuv[BU_IDX] << bsh, rv = rgb2yuv[RV_IDX] << rsh,
+ gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh;
+ const int maskgx = ~(maskr | maskb);
+ const unsigned rnd = (256u<<(S)) + (1<<(S-6));
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+ maskr |= maskr << 1;
+ maskb |= maskb << 1;
+ maskg |= maskg << 1;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)rnd);
+ shift2 = vec_splats((signed int)((S)-6+1));
+ is_DW = (origin == AV_PIX_FMT_RGBA || origin == AV_PIX_FMT_BGRA ||
+ origin == AV_PIX_FMT_ARGB || origin == AV_PIX_FMT_ABGR);
+ if(!is_DW)
+ is_BE = isBE(origin);
+ }
+
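+ /* Horizontally subsampled chroma: adjacent pixel pairs are summed first,
+  * green through the maskgx mask and red/blue from the pair sum minus the
+  * green part, mirroring the scalar tail loop below. */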
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (signed short *)src_addr);
+ src_addr += 16;
+ v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+ if(is_DW){
+ v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0,
+ (vector unsigned int)vec_splats((signed int)shp));
+ v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1,
+ (vector unsigned int)vec_splats((signed int)shp));
+ v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}));
+ v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}));
+ v_g1 = (vector signed int)vec_and((vector unsigned int)v_px0, (vector unsigned int)vec_splats(maskgx));
+ v_g1 = (vector signed int)vec_add((vector signed int)v_g1, (vector signed int)vec_and((vector unsigned int)v_px1, (vector unsigned int)vec_splats(maskgx)));
+ v_rb = (vector signed int)vec_add(v_px0, v_px1);
+ v_rb = (vector signed int)vec_sub((vector signed int)v_rb, (vector signed int)v_g1);
+
+ v_b1 = vec_and((vector signed int)v_rb,
+ vec_splats((signed int)maskb));
+ v_b1 = (vector signed int)vec_sr((vector unsigned int)v_b1,
+ (vector unsigned int)vec_splats((signed int)shb));
+
+ if(shp ||
+ origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+ origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+ v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1,
+ (vector unsigned int)vec_splats((signed int)shg));
+ }else{
+ v_g1 = vec_and((vector signed int)v_g1,
+ vec_splats((signed int)maskg));
+ v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1,
+ (vector unsigned int)vec_splats((signed int)shg));
+ }
+ v_r1 = vec_and((vector signed int)v_rb,
+ vec_splats((signed int)maskr));
+ v_r1 = (vector signed int)vec_sr((vector unsigned int)v_r1,
+ (vector unsigned int)vec_splats((signed int)shr));
+
+ src_addr += 16;
+ v_rd0 = vec_vsx_ld(0, (signed short *)src_addr);
+ src_addr += 16;
+ v_rd1 = vec_vsx_ld(0, (signed short *)src_addr);
+
+ v_rd0 = (vector signed short)vec_sr((vector unsigned int)v_rd0,
+ (vector unsigned int)vec_splats((signed int)shp));
+ v_rd1 = (vector signed short)vec_sr((vector unsigned int)v_rd1,
+ (vector unsigned int)vec_splats((signed int)shp));
+ v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}));
+ v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}));
+ v_g2 = (vector signed int)vec_and(v_px0, (vector unsigned int)vec_splats(maskgx));
+ v_g2 = (vector signed int)vec_add((vector signed int)v_g2, (vector signed int)vec_and(v_px1, (vector unsigned int)vec_splats(maskgx)));
+ v_rb = (vector signed int)vec_add(v_px0, v_px1);
+ v_rb = (vector signed int)vec_sub((vector signed int)v_rb, (vector signed int)v_g2);
+
+ v_b2 = vec_and((vector signed int)v_rb,
+ vec_splats((signed int)maskb));
+ v_b2 = (vector signed int)vec_sr((vector unsigned int)v_b2,
+ (vector unsigned int)vec_splats((signed int)shb));
+ if(shp ||
+ origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+ origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+ v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2,
+ (vector unsigned int)vec_splats((signed int)shg));
+ }else{
+ v_g2 = vec_and((vector signed int)v_g2,
+ vec_splats((signed int)maskg));
+ v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2,
+ (vector unsigned int)vec_splats((signed int)shg));
+ }
+ v_r2 = vec_and((vector signed int)v_rb,
+ vec_splats((signed int)maskr));
+ v_r2 = (vector signed int)vec_sr((vector unsigned int)v_r2,
+ (vector unsigned int)vec_splats((signed int)shr));
+ }else{
+ if(is_BE){
+ v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd0,
+ ((vector unsigned char){1, 0, 0, 0, 5, 4, 0, 0, 9, 8, 0, 0, 13, 12, 0, 0}));
+ v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd0,
+ ((vector unsigned char){3, 2, 0, 0, 7, 6, 0, 0, 11, 10, 0, 0, 15, 14, 0, 0}));
+ }else{
+ v_px0 = (vector unsigned int)vec_perm(v_rd0, v_rd0,
+ ((vector unsigned char){0, 1, 0, 0, 4, 5, 0, 0, 8, 9, 0, 0, 12, 13, 0, 0}));
+ v_px1 = (vector unsigned int)vec_perm(v_rd0, v_rd0,
+ ((vector unsigned char){2, 3, 0, 0, 6, 7, 0, 0, 10, 11, 0, 0, 14, 15, 0, 0}));
+ }
+
+ v_px0 = vec_and(v_px0, vec_splats((unsigned int)0x0000FFFF));
+ v_px1 = vec_and(v_px1, vec_splats((unsigned int)0x0000FFFF));
+
+ v_px0 = (vector unsigned int)vec_sr(v_px0,
+ (vector unsigned int)vec_splats((signed int)shp));
+ v_px1 = (vector unsigned int)vec_sr(v_px1,
+ (vector unsigned int)vec_splats((signed int)shp));
+
+
+ v_g1 = (vector signed int)vec_and(v_px0, (vector unsigned int)vec_splats((unsigned int)maskgx));
+ v_g1 = (vector signed int)vec_add(v_g1, (vector signed int)vec_and(v_px1, (vector unsigned int)vec_splats((signed int)maskgx)));
+ v_rb = (vector signed int)vec_add(v_px0, v_px1);
+ v_rb = (vector signed int)vec_sub(v_rb, v_g1);
+
+
+
+ v_b1 = (vector signed int)vec_and(v_rb, vec_splats((signed int)maskb));
+ v_b1 = (vector signed int)vec_sr((vector unsigned int)v_b1,
+ (vector unsigned int)vec_splats((signed int)shb));
+
+ if(shp ||
+ origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+ origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+ v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1, (vector unsigned int)vec_splats((signed int)shg));
+ }else{
+ v_g1 = vec_and(v_g1, vec_splats((signed int)maskg));
+ v_g1 = (vector signed int)vec_sr((vector unsigned int)v_g1,
+ (vector unsigned int)vec_splats((signed int)shg));
+ }
+
+ v_r1 = (vector signed int)vec_and((vector signed int)v_rb,
+ vec_splats((signed int)maskr));
+ v_r1 = (vector signed int)vec_sr((vector unsigned int)v_r1,
+ (vector unsigned int)vec_splats((signed int)shr));
+ if(is_BE){
+ v_px0 = (vector unsigned int)vec_perm(v_rd1, v_rd1,
+ ((vector unsigned char){1, 0, 0, 0, 5, 4, 0, 0, 9, 8, 0, 0, 13, 12, 0, 0}));
+ v_px1 = (vector unsigned int)vec_perm(v_rd1, v_rd1,
+ ((vector unsigned char){3, 2, 0, 0, 7, 6, 0, 0, 11, 10, 0, 0, 15, 14, 0, 0}));
+ }else{
+ v_px0 = (vector unsigned int)vec_perm(v_rd1, v_rd1,
+ ((vector unsigned char){0, 1, 0, 0, 4, 5, 0, 0, 8, 9, 0, 0, 12, 13, 0, 0}));
+ v_px1 = (vector unsigned int)vec_perm(v_rd1, v_rd1,
+ ((vector unsigned char){2, 3, 0, 0, 6, 7, 0, 0, 10, 11, 0, 0, 14, 15, 0, 0}));
+ }
+
+ v_px0 = vec_and(v_px0, vec_splats((unsigned int)0x0000FFFF));
+ v_px1 = vec_and(v_px1, vec_splats((unsigned int)0x0000FFFF));
+
+ v_px0 = vec_sr((vector unsigned int)v_px0,
+ (vector unsigned int)vec_splats((signed int)shp));
+ v_px1 = vec_sr((vector unsigned int)v_px1,
+ (vector unsigned int)vec_splats((signed int)shp));
+
+
+ v_g2 = (vector signed int)vec_and(v_px0, (vector unsigned int)vec_splats((unsigned int)maskgx));
+ v_g2 = (vector signed int)vec_add(v_g2,
+ (vector signed int)vec_and(v_px1, (vector unsigned int)vec_splats((signed int)maskgx)));
+ v_rb = (vector signed int)vec_add(v_px0, v_px1);
+ v_rb = (vector signed int)vec_sub(v_rb, v_g2);
+
+ v_b2 = (vector signed int)vec_and(v_rb, vec_splats((signed int)maskb));
+ v_b2 = (vector signed int)vec_sr((vector unsigned int)v_b2,
+ (vector unsigned int)vec_splats((signed int)shb));
+ if(shp ||
+ origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+ origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE){
+ v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2, (vector unsigned int)vec_splats((signed int)shg));
+ }else{
+ v_g2 = vec_and(v_g2,
+ vec_splats((signed int)maskg));
+ v_g2 = (vector signed int)vec_sr((vector unsigned int)v_g2, (vector unsigned int)vec_splats((signed int)shg));
+ }
+ v_r2 = vec_and(v_rb, vec_splats((signed int)maskr));
+ v_r2 = (vector signed int)vec_sr((vector unsigned int)v_r2, (vector unsigned int)vec_splats((signed int)shr));
+ }
+
+ v_dst1 = vec_mul((vector signed int)v_r1,
+ vec_splats((signed int)ru));
+ v_dst1 = vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1,
+ vec_splats((signed int)gu) ));
+ v_dst1 = vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1,
+ vec_splats((signed int)bu) ));
+ v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+ v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+ v_dst2 = (vector signed int)vec_mul((vector unsigned int)v_r2, vec_splats((unsigned int)ru));
+ v_dst2 = vec_add((vector signed int)v_dst2, vec_mul((vector signed int)v_g2, vec_splats((signed int)gu) ));
+ v_dst2 = vec_add((vector signed int)v_dst2, vec_mul((vector signed int)v_b2, vec_splats((signed int)bu) ));
+
+ v_dst2 = (vector signed int)vec_add((vector unsigned int)v_dst2, (vector unsigned int)shift1);
+
+ v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+ v_dst1 = vec_mul((vector signed int)v_r1,
+ vec_splats((signed int)rv));
+ v_dst1 = vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1,
+ vec_splats((signed int)gv) ));
+ v_dst1 = vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1,
+ vec_splats((signed int)bv) ));
+ v_dst1 = vec_add(v_dst1, (vector signed int)shift1);
+ v_dst1 = (vector signed int)vec_sr((vector unsigned int)v_dst1, (vector unsigned int)shift2);
+
+ v_dst2 = (vector signed int)vec_mul((vector unsigned int)v_r2, vec_splats((unsigned int)rv));
+ v_dst2 = vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g2,
+ vec_splats((signed int)gv) ));
+ v_dst2 = vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b2,
+ vec_splats((signed int)bv) ));
+
+ v_dst2 = (vector signed int)vec_add((vector unsigned int)v_dst2, (vector unsigned int)shift1);
+
+ v_dst2 = (vector signed int)vec_sr((vector unsigned int)v_dst2, (vector unsigned int)shift2);
+
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+ dstU_addr += 16;
+ dstV_addr += 16;
+ src_addr += 16;
+
+ }
+
+ for (i = width_adj; i < width; i++) {
+
+ unsigned px0 = input_pixel(2 * i + 0) >> shp;
+ unsigned px1 = input_pixel(2 * i + 1) >> shp;
+ int b, r, g = (px0 & maskgx) + (px1 & maskgx);
+ int rb = px0 + px1 - g;
+
+ b = (rb & maskb) >> shb;
+ if (shp ||
+ origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
+ origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE) {
+ g >>= shg;
+ } else {
+ g = (g & maskg) >> shg;
+ }
+ r = (rb & maskr) >> shr;
+
+ dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
+ dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
+ }
+
+}
+
+#undef input_pixel
+
+#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
+ maskg, maskb, rsh, gsh, bsh, S) \
+static void name ## ToY_c_vsx(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, \
+ int width, uint32_t *tab) \
+{ \
+ rgb16_32ToY_c_template_vsx((int16_t*)dst, src, width, fmt, shr, shg, shb, shp, \
+ maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \
+} \
+ \
+static void name ## ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV, \
+ const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
+ int width, uint32_t *tab) \
+{ \
+ rgb16_32ToUV_c_template_vsx((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
+ shr, shg, shb, shp, \
+ maskr, maskg, maskb, rsh, gsh, bsh, S, tab);\
+} \
+ \
+static void name ## ToUV_half_c_vsx(uint8_t *dstU, uint8_t *dstV, \
+ const uint8_t *unused0, const uint8_t *src, \
+ const uint8_t *dummy, \
+ int width, uint32_t *tab) \
+{ \
+ rgb16_32ToUV_half_c_template_vsx((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
+ shr, shg, shb, shp, \
+ maskr, maskg, maskb, \
+ rsh, gsh, bsh, S, tab); \
+}
+
+rgb16_32_wrapper(AV_PIX_FMT_BGR32, bgr32, 16, 0, 0, 0, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR32_1, bgr321, 16, 0, 0, 8, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB32, rgb32, 0, 0, 16, 0, 0x00FF, 0xFF00, 0xFF0000, 8, 0, 8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB32_1, rgb321, 0, 0, 16, 8, 0x00FF, 0xFF00, 0xFF0000, 8, 0, 8, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR565LE, bgr16le, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR555LE, bgr15le, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_BGR444LE, bgr12le, 0, 0, 0, 0, 0x000F, 0x00F0, 0x0F00, 8, 4, 0, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_RGB565LE, rgb16le, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB555LE, rgb15le, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_RGB444LE, rgb12le, 0, 0, 0, 0, 0x0F00, 0x00F0, 0x000F, 0, 4, 8, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_BGR565BE, bgr16be, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_BGR555BE, bgr15be, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_BGR444BE, bgr12be, 0, 0, 0, 0, 0x000F, 0x00F0, 0x0F00, 8, 4, 0, RGB2YUV_SHIFT + 4)
+rgb16_32_wrapper(AV_PIX_FMT_RGB565BE, rgb16be, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT + 8)
+rgb16_32_wrapper(AV_PIX_FMT_RGB555BE, rgb15be, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT + 7)
+rgb16_32_wrapper(AV_PIX_FMT_RGB444BE, rgb12be, 0, 0, 0, 0, 0x0F00, 0x00F0, 0x000F, 0, 4, 8, RGB2YUV_SHIFT + 4)
+
+static void gbr24pToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
+ int width, uint32_t *rgb2yuv)
+{
+
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_rd00, v_rd01, v_rd02;
+ int i, width_adj;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ uint16_t *dstU = (uint16_t *)_dstU;
+ uint16_t *dstV = (uint16_t *)_dstV;
+ const int ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ const int rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+ uintptr_t gsrc_addr = (uintptr_t)gsrc;
+ uintptr_t bsrc_addr = (uintptr_t)bsrc;
+ uintptr_t rsrc_addr = (uintptr_t)rsrc;
+ uintptr_t dstU_addr = (uintptr_t)_dstU;
+ uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = ((vector unsigned int){(0x4001<<(RGB2YUV_SHIFT-6)),(0x4001<<(RGB2YUV_SHIFT-6)),
+ (0x4001<<(RGB2YUV_SHIFT-6)),(0x4001<<(RGB2YUV_SHIFT-6))} );
+ shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6+1));
+ }
+
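+ /* Planar GBR input: each 16-bit lane of a load holds two horizontally
+  * adjacent 8-bit samples, so masking with 0xFF and shifting right by 8
+  * separates the pair that is then summed for the 2:1 horizontal average. */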
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)gsrc_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)bsrc_addr);
+ v_rd2 = vec_vsx_ld(0, (unsigned short *)rsrc_addr);
+
+ v_rd00 = vec_sr(v_rd0, vec_splats((unsigned short)8));
+ v_rd01 = vec_sr(v_rd1, vec_splats((unsigned short)8));
+ v_rd02 = vec_sr(v_rd2, vec_splats((unsigned short)8));
+ v_rd0 = vec_and(v_rd0, vec_splats((unsigned short)0xFF));
+ v_rd1 = vec_and(v_rd1, vec_splats((unsigned short)0xFF));
+ v_rd2 = vec_and(v_rd2, vec_splats((unsigned short)0xFF));
+
+ v_rd0 = vec_add(v_rd0, v_rd00);
+ v_rd1 = vec_add(v_rd1, v_rd01);
+ v_rd2 = vec_add(v_rd2, v_rd02);
+
+ v_rd00 = vec_mergeh(v_rd0, v_null);
+ v_rd01 = vec_mergeh(v_rd1, v_null);
+ v_rd02 = vec_mergeh(v_rd2, v_null);
+ v_rd0 = vec_mergel(v_rd0, v_null);
+ v_rd1 = vec_mergel(v_rd1, v_null);
+ v_rd2 = vec_mergel(v_rd2, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd02,
+ ((vector signed int){ru,ru,ru,ru}));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd00,
+ ((vector signed int){gu,gu,gu,gu}) ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd01,
+ ((vector signed int){bu,bu,bu,bu})));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_rd2,
+ ((vector signed int){ru,ru,ru,ru}));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_rd0,
+ ((vector signed int){gu,gu,gu,gu})));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_rd1,
+ ((vector signed int){bu,bu,bu,bu})));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr); dstU_addr+=16;
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd02,
+ ((vector signed int){rv,rv,rv,rv}));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd00,
+ ((vector signed int){gv,gv,gv,gv})));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd01,
+ ((vector signed int){bv,bv,bv,bv})));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_rd2,
+ ((vector signed int){rv,rv,rv,rv}));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_rd0,
+ ((vector signed int){gv,gv,gv,gv})));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_rd1,
+ ((vector signed int){bv,bv,bv,bv})));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr); dstV_addr+=16;
+
+ gsrc_addr += 16;
+ bsrc_addr += 16;
+ rsrc_addr += 16;
+ }
+ for (i = width_adj; i < width; i++) {
+ unsigned int g = gsrc[2*i] + gsrc[2*i+1];
+ unsigned int b = bsrc[2*i] + bsrc[2*i+1];
+ unsigned int r = rsrc[2*i] + rsrc[2*i+1];
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+ dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
+ }
+
+}
+
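+/* Extract the 16-bit alpha component (fourth word) of each packed RGBA64LE pixel. */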
+static void rgba64leToA_c_vsx(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+ const uint8_t *unused2, int width, uint32_t *unused)
+{
+
+ int16_t *dst = (int16_t *)_dst;
+ const uint16_t *src = (const uint16_t *)_src;
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+
+ uintptr_t src_addr = (uintptr_t)_src;
+ uintptr_t dst_addr = (uintptr_t)_dst;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width & (~(int)0x07);
+
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+ v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr + 32));
+ v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr + 48));
+
+ v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char){6, 7, 14, 15, 22, 23, 30, 31}));
+ v_rd0 = vec_perm(v_rd2, v_rd3, ((vector unsigned char){6, 7, 14, 15, 22, 23, 30, 31}));
+ v_dst = vec_sld(v_dst, v_dst, 8);
+ v_dst = vec_sld(v_rd0, v_dst, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 64;
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ dst[i]= AV_RL16(src + 4*i + 3);
+ }
+
+}
+
+static void rgba64beToA_c_vsx(uint8_t *_dst, const uint8_t *_src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int16_t *dst = (int16_t *)_dst;
+ const uint16_t *src = (const uint16_t *)_src;
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+
+ uintptr_t src_addr = (uintptr_t)_src;
+ uintptr_t dst_addr = (uintptr_t)_dst;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width & (~(int)0x07);
+
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+ v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr + 32));
+ v_rd3 = vec_vsx_ld(0, (unsigned short *)(src_addr + 48));
+
+ v_dst = vec_perm(v_rd0, v_rd1, ((vector unsigned char){7, 6, 15, 14, 23, 22, 31, 30}));
+ v_rd0 = vec_perm(v_rd2, v_rd3, ((vector unsigned char){7, 6, 15, 14, 23, 22, 31, 30}));
+ v_dst = vec_sld(v_dst, v_dst, 8);
+ v_dst = vec_sld(v_rd0, v_dst, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 64;
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ dst[i]= AV_RB16(src + 4*i + 3);
+ }
+
+}
+
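+/* Alpha plane taken from the first byte of each 32-bit pixel (ABGR/ARGB),
+ * scaled from 8 to 14 bits (<<6). */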
+static void abgrToA_c_vsx(uint8_t *_dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int16_t *dst = (int16_t *)_dst;
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_dst;
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width & (~(int)0x07);
+
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)(src_addr));
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+ v_rd0 = vec_and(v_rd0, v_FF);
+ v_rd1 = vec_and(v_rd1, v_FF);
+
+ v_rd0 = vec_sl(v_rd0, vec_splats((unsigned short)6));
+ v_rd1 = vec_sl(v_rd1, vec_splats((unsigned short)6));
+
+ v_dst = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 32;
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ dst[i]= src[4*i]<<6;
+ }
+
+}
+
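+/* Alpha plane taken from the last byte of each 32-bit pixel (RGBA/BGRA),
+ * scaled from 8 to 14 bits (<<6). */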
+static void rgbaToA_c_vsx(uint8_t *_dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int16_t *dst = (int16_t *)_dst;
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_dst;
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ // compute integral number of vector-length items and length of final fragment
+ width_adj = width & (~(int)0x07);
+
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)(src_addr));
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+ v_rd0 = vec_sld(v_rd0, v_rd0, 13);
+ v_rd1 = vec_sld(v_rd1, v_rd1, 13);
+
+ v_rd0 = vec_and(v_rd0, v_FF);
+ v_rd1 = vec_and(v_rd1, v_FF);
+
+ v_rd0 = vec_sl(v_rd0, vec_splats((unsigned short)6));
+ v_rd1 = vec_sl(v_rd1, vec_splats((unsigned short)6));
+
+ v_dst = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 32;
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ dst[i]= src[4*i+3]<<6;
+ }
+
+}
+
+/*static void palToA_c_vsx(uint8_t *_dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *pal)
+{
+
+ int16_t *dst = (int16_t *)_dst;
+ int i, j, d, width_adj;
+ uint32_t _pal[8];
+
+ vector unsigned short v_dst;
+ vector unsigned int v_rd0, v_rd1, v_rd3, v_rd4;
+ vector unsigned char sample;
+ vector unsigned int shift1;
+ vector unsigned short shift2;
+
+ uintptr_t dst_addr = (uintptr_t)_dst;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+ shift1 = ((vector unsigned int){24, 24, 24, 24});
+ shift2 = vec_splats((unsigned short)6);
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ for( j=0; j<8; ++j)
+ _pal[j] = pal[src[j]];
+
+ v_rd0 = vec_vsx_ld(0, (unsigned int *)_pal);
+ v_rd1 = vec_vsx_ld(0, (unsigned int *)(&_pal[4]));
+ v_rd3 = vec_sr(v_rd0, shift1);
+ v_rd4 = vec_sr(v_rd1, shift1);
+ v_rd0 = vec_perm(v_rd3, v_rd4, sample);
+ v_dst = vec_sl((vector unsigned short)v_rd0, shift2);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src += 8;
+ dst_addr += 16;
+
+ }
+
+ for (i = width_adj; i < width; i++) {
+ d = *src;
+ dst[i]= (pal[d] >> 24)<<6;
+ ++src;
+ }
+
+}
+
+static void palToY_c_vsx(uint8_t *_dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *pal)
+{
+
+ int16_t *dst = (int16_t *)_dst;
+ int i, j, d, width_adj;
+ uint32_t _pal[8];
+
+ vector unsigned short v_dst;
+ vector unsigned int v_rd0, v_rd1, v_rd3, v_rd4;
+ vector unsigned char sample;
+ vector unsigned short shift;
+
+ uintptr_t dst_addr = (uintptr_t)_dst;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+ shift = vec_splats((unsigned short)6);
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ for( j=0; j<8; ++j)
+ _pal[j] = pal[src[j]];
+
+ v_rd0 = vec_vsx_ld(0, (unsigned int *)_pal);
+ v_rd1 = vec_vsx_ld(0, (unsigned int *)(&_pal[4]));
+ v_rd3 = vec_and(v_rd0, v_000000FF);
+ v_rd4 = vec_and(v_rd1, v_000000FF);
+ v_rd0 = vec_perm(v_rd3, v_rd4, sample);
+ v_dst = vec_sl((vector unsigned short)v_rd0, shift);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src += 8;
+ dst_addr += 16;
+
+ }
+
+ for (i = width_adj; i < width; i++) {
+ d= *src;
+ dst[i] = (pal[d] & 0xFF)<<6;
+ src++;
+ }
+
+}
+
+static void palToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *pal)
+{
+ av_assert1(src1 == src2);
+
+ uint16_t *dstU = (uint16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ int i, j, d, width_adj;
+ uint32_t _pal[8];
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ vector unsigned short v_dst, v_tmp0, v_tmp1;
+ vector unsigned int v_rd0, v_rd1, shift1, shift2;
+ vector unsigned char sample;
+ vector unsigned short shift3;
+
+ uintptr_t dstU_addr = (uintptr_t)_dstU;
+ uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+ shift1 = vec_splats((unsigned int)8);
+ shift2 = vec_splats((unsigned int)16);
+ shift3 = vec_splats((unsigned short)6);
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ for( j = 0; j < 8; ++j)
+ _pal[j] = pal[src1[j]];
+
+ v_rd0 = vec_vsx_ld(0, (unsigned int *)_pal);
+ v_rd1 = vec_vsx_ld(0, (unsigned int *)(&_pal[4]));
+
+ v_tmp0 = (vector unsigned short)vec_sr(v_rd0, shift1);
+ v_tmp1 = (vector unsigned short)vec_sr(v_rd1, shift1);
+ v_dst = (vector unsigned short)vec_perm(v_tmp0, v_tmp1, sample);
+ v_tmp0 = vec_and(v_dst, v_FF);
+ v_dst = vec_sl((vector unsigned short)v_tmp0, shift3);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+
+ v_tmp0 = (vector unsigned short)vec_sr(v_rd0, shift2);
+ v_tmp1 = (vector unsigned short)vec_sr(v_rd1, shift2);
+ v_dst = (vector unsigned short)vec_perm(v_tmp0, v_tmp1, sample);
+ v_tmp0 = vec_and(v_dst, v_FF);
+ v_dst = vec_sl((vector unsigned short)v_tmp0, shift3);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+ src1 += 8;
+ dstU_addr += 16;
+ dstV_addr += 16;
+
+ }
+
+ for (i = width_adj; i < width; i++) {
+        d = pal[*src1];
+ dstU[i] = (uint8_t)(d>> 8)<<6;
+ dstV[i] = (uint8_t)(d>>16)<<6;
+ src1++;
+ }
+
+}*/
+
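+/* Expand 1 bpp monochrome (white = 0) to 14-bit luma: each source byte holds
+ * eight pixels, MSB first; every bit becomes 0 or 16383. */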
+static void monowhite2Y_c_vsx(uint8_t *_dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+
+ int16_t *dst = (int16_t *)_dst;
+ int i, j;
+ vector unsigned short v_rd0, v_dst;
+
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width = (width + 7) >> 3;
+ for (i = 0; i < width; i++) {
+ v_rd0 = vec_splats((unsigned short)~src[i]);
+
+ v_dst = vec_sr(v_rd0, ((vector unsigned short){7, 6, 5, 4, 3, 2, 1, 0}));
+ v_dst = vec_and(v_dst, vec_splats((unsigned short)0x01));
+ v_dst = vec_mul(v_dst, vec_splats((unsigned short)16383));
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+ dst_addr += 16;
+ }
+ if(width&7){
+ int d= ~src[i];
+ for (j = 0; j < (width&7); j++)
+ dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+ }
+
+}
+
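+/* Same as monowhite2Y_c_vsx but without the inversion (black = 0). */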
+static void monoblack2Y_c_vsx(uint8_t *_dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int16_t *dst = (int16_t *)_dst;
+ int i, j;
+ vector unsigned short v_rd0, v_dst;
+
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width = (width + 7) >> 3;
+ for (i = 0; i < width; i++) {
+ v_rd0 = vec_splats((unsigned short)src[i]);
+
+ v_dst = vec_sr(v_rd0, ((vector unsigned short){7, 6, 5, 4, 3, 2, 1, 0}));
+ v_dst = vec_and(v_dst, vec_splats((unsigned short)0x01));
+ v_dst = vec_mul(v_dst, vec_splats((unsigned short)16383));
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+ dst_addr += 16;
+ }
+ if(width&7){
+ int d= src[i];
+ for (j = 0; j < (width&7); j++)
+ dst[8*i+j]= ((d>>(7-j))&1) * 16383;
+ }
+
+}
+
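+/* Luma plane from packed YUYV: keep every even byte. */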
+static void yuy2ToY_c_vsx(uint8_t *dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width_adj = width & (~(int)0x0F);
+
+ for ( i = 0; i < width_adj; i += 16) {
+ vector int v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ vector int v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+
+ vector int v_dst = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30}));
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 32;
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ dst[i] = src[2 * i];
+ }
+
+}
+
+static void yuy2ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+ vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+ vector unsigned char sample1, sample2;
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+ width_adj = width & (~(int)0x0F);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31});
+ sample2 = ((vector unsigned char){3, 7, 11, 15, 19, 23, 27, 31, 1, 5, 9, 13, 17, 21, 25, 29});
+ }
+ for ( i = 0; i < width_adj; i += 16) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_rd2 = vec_perm(v_rd0, v_rd1, sample1);
+ src_addr += 32;
+
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_rd3 = vec_perm(v_rd0, v_rd1, sample2);
+ v_dst = vec_sld(v_rd2, v_rd3, 8);
+ v_dst = vec_sld(v_dst, v_dst, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+ v_dst = vec_sld(v_rd3, v_rd2, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 32;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ dstU[i] = src1[4 * i + 1];
+ dstV[i] = src1[4 * i + 3];
+ }
+
+ av_assert1(src1 == src2);
+}
+
+static void yvy2ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+ vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+ vector unsigned char sample1, sample2;
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+ width_adj = width & (~(int)0x0F);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31});
+ sample2 = ((vector unsigned char){3, 7, 11, 15, 19, 23, 27, 31, 1, 5, 9, 13, 17, 21, 25, 29});
+ }
+ for ( i = 0; i < width_adj; i += 16) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_rd2 = vec_perm(v_rd0, v_rd1, sample1);
+ src_addr += 32;
+
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_rd3 = vec_perm(v_rd0, v_rd1, sample2);
+ v_dst = vec_sld(v_rd2, v_rd3, 8);
+ v_dst = vec_sld(v_dst, v_dst, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+ v_dst = vec_sld(v_rd3, v_rd2, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+
+ src_addr += 32;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ dstV[i] = src1[4 * i + 1];
+ dstU[i] = src1[4 * i + 3];
+ }
+
+ av_assert1(src1 == src2);
+}
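+
+/* Byte-swap 16-bit samples (used for opposite-endianness 16-bit inputs). */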
+static void bswap16Y_c_vsx(uint8_t *_dst, const uint8_t *_src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_dst, v_shift;
+
+ const uint16_t *src = (const uint16_t *)_src;
+ uint16_t *dst = (uint16_t *)_dst;
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj)
+ v_shift = (vector unsigned short)vec_splats((unsigned short)8);
+ for ( i = 0; i < width_adj; i += 8) {
+ v_dst = vec_vsx_ld(0, (unsigned short *)src_addr);
+
+ v_rd0 = vec_sl((vector unsigned short)v_dst, v_shift);
+ v_rd1 = vec_sr((vector unsigned short)v_dst, v_shift);
+ v_dst = vec_or(v_rd0, v_rd1);
+
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 16;
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ dst[i] = (src[i]>>8) | (src[i]<<8);
+ }
+
+}
+
+static void bswap16UV_c_vsx(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *unused0, const uint8_t *_src1,
+ const uint8_t *_src2, int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_dst, v_shift;
+
+ const uint16_t *src1 = (const uint16_t *)_src1,
+ *src2 = (const uint16_t *)_src2;
+ uint16_t *dstU = (uint16_t *)_dstU,
+ *dstV = (uint16_t *)_dstV;
+ uintptr_t src1_addr = (uintptr_t)_src1,
+ src2_addr = (uintptr_t)_src2;
+ uintptr_t dstU_addr = (uintptr_t)dstU,
+ dstV_addr = (uintptr_t)dstV;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj)
+ v_shift = (vector unsigned short)vec_splats((unsigned short)8);
+ for ( i = 0; i < width_adj; i += 8) {
+        // byte-swap src1 into dstU
+ v_dst = vec_vsx_ld(0, (unsigned short *)src1_addr);
+ v_rd0 = vec_sl((vector unsigned short)v_dst, v_shift);
+ v_rd1 = vec_sr((vector unsigned short)v_dst, v_shift);
+ v_dst = vec_or(v_rd0, v_rd1);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+        // byte-swap src2 into dstV
+ v_dst = vec_vsx_ld(0, (unsigned short *)src2_addr);
+ v_rd0 = vec_sl((vector unsigned short)v_dst, v_shift);
+ v_rd1 = vec_sr((vector unsigned short)v_dst, v_shift);
+ v_dst = vec_or(v_rd0, v_rd1);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+ //
+ src1_addr += 16;
+ src2_addr += 16;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+ for (i = width_adj; i < width; i++) {
+ dstU[i] = (src1[i]>>8) | (src1[i]<<8);
+ dstV[i] = (src2[i]>>8) | (src2[i]<<8);
+ }
+
+}
+
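+/* Gray (luma) plane from 16-bit little-endian gray+alpha: copy the first word
+ * of each 4-byte pair; the _alpha variant below copies the second. */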
+static void read_ya16le_gray_c_vsx(uint8_t *dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+ vector int v_rd0, v_rd1, v_dst;
+ vector unsigned char sample;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_dst = vec_perm(v_rd0, v_rd1, sample);
+
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 32;
+ dst_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dst + i * 2, AV_RL16(src + i * 4));
+ }
+
+}
+
+static void read_ya16le_alpha_c_vsx(uint8_t *dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+ vector int v_rd0, v_rd1, v_dst;
+ vector unsigned char sample;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample = ((vector unsigned char){2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31});
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_dst = vec_perm(v_rd0, v_rd1, sample);
+
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 32;
+ dst_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dst + i * 2, AV_RL16(src + i * 4 + 2));
+ }
+
+}
+
+static void read_ya16be_gray_c_vsx(uint8_t *dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+
+ int i, width_adj;
+
+ vector int v_rd0, v_rd1, v_dst;
+ vector unsigned char sample;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample = ((vector unsigned char){1, 0, 5, 4, 9, 8, 13, 12, 17, 16, 21, 20, 25, 24, 29, 28});
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_dst = vec_perm(v_rd0, v_rd1, sample);
+
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 32;
+ dst_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dst + i * 2, AV_RB16(src + i * 4));
+ }
+
+}
+
+static void read_ya16be_alpha_c_vsx(uint8_t *dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+
+ vector int v_rd0, v_rd1, v_dst;
+ vector unsigned char sample;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample = ((vector unsigned char){3, 2, 7, 6, 11, 10, 15, 14, 19, 18, 23, 22, 27, 26, 31, 30});
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_dst = vec_perm(v_rd0, v_rd1, sample);
+
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 32;
+ dst_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dst + i * 2, AV_RB16(src + i * 4 + 2));
+ }
+
+
+}
+
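+/* Packed 16-bit little-endian AYUV (8 bytes per pixel): the _Y/_UV/_A readers
+ * below pick out the Y (offset 2), U/V (offsets 4/6) and A (offset 0) words. */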
+static void read_ayuv64le_Y_c_vsx(uint8_t *dst, const uint8_t *src,
+ const uint8_t *unused0, const uint8_t *unused1,
+ int width, uint32_t *unused2)
+{
+
+ int i, width_adj;
+
+ vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+ vector unsigned char sample1, sample2;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 10, 11, 18, 19, 26, 27});
+ sample2 = ((vector unsigned char){2, 3, 10, 11, 18, 19, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0});
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+ v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+ v_rd0 = vec_perm(v_rd0, v_rd1, sample1);
+ v_rd2 = vec_perm(v_rd2, v_rd3, sample2);
+ v_dst = vec_sld(v_rd2, v_rd0, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 64;
+ dst_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dst + i * 2, AV_RL16(src + i * 8 + 2));
+ }
+
+}
+
+
+static void read_ayuv64le_UV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src,
+ const uint8_t *unused1, int width, uint32_t *unused2)
+{
+
+ int i, width_adj;
+
+ vector int v_rd0, v_rd1, v_rd2, v_rd3, v_rd4, v_rd5, v_dst;
+ vector unsigned char sample1, sample2;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){6, 7, 14, 15, 22, 23, 30, 31, 4, 5, 12, 13, 20, 21, 28, 29});
+ sample2 = ((vector unsigned char){4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31});
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+ v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+ v_rd4 = vec_perm(v_rd0, v_rd1, sample1);
+ v_rd5 = vec_perm(v_rd2, v_rd3, sample2);
+ v_dst = vec_sld(v_rd5, v_rd4, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+ v_dst = vec_sld(v_rd4, v_rd5, 8);
+ v_dst = vec_sld(v_dst, v_dst, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 64;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dstU + i * 2, AV_RL16(src + i * 8 + 4));
+ AV_WN16(dstV + i * 2, AV_RL16(src + i * 8 + 6));
+ }
+
+}
+
+static void read_ayuv64le_A_c_vsx(uint8_t *dst, const uint8_t *src,
+ const uint8_t *unused0, const uint8_t *unused1,
+ int width, uint32_t *unused2)
+{
+
+ int i, width_adj;
+
+ vector int v_rd0, v_rd1, v_rd2, v_rd3, v_dst;
+ vector unsigned char sample1, sample2;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 8, 9, 16, 17, 24, 25});
+ sample2 = ((vector unsigned char){0, 1, 8, 9, 16, 17, 24, 25, 0, 0, 0, 0, 0, 0, 0, 0});
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+ v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+ v_rd0 = vec_perm(v_rd0, v_rd1, sample1);
+ v_rd2 = vec_perm(v_rd2, v_rd3, sample2);
+ v_dst = vec_sld(v_rd2, v_rd0, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 64;
+ dst_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dst + i * 2, AV_RL16(src + i * 8));
+ }
+
+}
+
+/* This is almost identical to the previous one, and exists only because
+ * yuy2ToY/UV(dst, src + 1, ...) would have 100% unaligned accesses. */
+static void uyvyToY_c_vsx(uint8_t *dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+
+ vector int v_rd0, v_rd1, v_dst;
+    vector unsigned char sample1;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width_adj = width & (~(int)0x0F);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
+ }
+ for ( i = 0; i < width_adj; i += 16) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+
+ v_dst = vec_perm(v_rd0, v_rd1, sample1);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 32;
+ dst_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ dst[i] = src[2 * i + 1];
+ }
+
+}
+
+static void uyvyToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+
+ vector int v_rd0, v_rd1, v_rd2, v_rd3, v_rd4, v_rd5, v_dst;
+ vector unsigned char sample1, sample2;
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+ width_adj = width & (~(int)0x0F);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){2, 6, 10, 14, 18, 22, 26, 30, 0, 4, 8, 12, 16, 20, 24, 28});
+ sample2 = ((vector unsigned char){0, 4, 8, 12, 16, 20, 24, 28, 2, 6, 10, 14, 18, 22, 26, 30});
+ }
+ for ( i = 0; i < width_adj; i += 16) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+ v_rd2 = vec_vsx_ld(0, (int *)(src_addr+32));
+ v_rd3 = vec_vsx_ld(0, (int *)(src_addr + 48));
+
+ v_rd4 = vec_perm(v_rd0, v_rd1, sample1);
+ v_rd5 = vec_perm(v_rd2, v_rd3, sample2);
+ v_dst = vec_sld(v_rd5, v_rd4, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+ v_dst = vec_sld(v_rd4, v_rd5, 8);
+ v_dst = vec_sld(v_dst, v_dst, 8);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 64;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ dstU[i] = src1[4 * i + 0];
+ dstV[i] = src1[4 * i + 2];
+ }
+
+ av_assert1(src1 == src2);
+}
+
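+/* Deinterleave semi-planar chroma: even bytes to dst1, odd bytes to dst2.
+ * nv12ToUV/nv21ToUV below simply swap the destination planes. */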
+static av_always_inline void nvXXtoUV_c_vsx(uint8_t *dst1, uint8_t *dst2,
+ const uint8_t *src, int width)
+{
+
+ int i, width_adj;
+
+ vector int v_rd0, v_rd1, v_dst;
+ vector unsigned char sample1, sample2;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst1_addr = (uintptr_t)dst1;
+ uintptr_t dst2_addr = (uintptr_t)dst2;
+
+ width_adj = width & (~(int)0x0F);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+ sample2 = ((vector unsigned char){1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
+ }
+ for ( i = 0; i < width_adj; i += 16) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr + 16));
+
+ v_dst = vec_perm(v_rd0, v_rd1, sample1);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst1_addr);
+ v_dst = vec_perm(v_rd0, v_rd1, sample2);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst2_addr);
+
+ src_addr += 32;
+ dst1_addr += 16;
+ dst2_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ dst1[i] = src[2 * i + 0];
+ dst2[i] = src[2 * i + 1];
+ }
+
+}
+
+static void nv12ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *unused)
+{
+ nvXXtoUV_c_vsx(dstU, dstV, src1, width);
+}
+
+static void nv21ToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *unused)
+{
+ nvXXtoUV_c_vsx(dstV, dstU, src1, width);
+}
+
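+/* P010 keeps 10-bit samples in the upper bits of each 16-bit word, so the
+ * readers shift right by 6 (the BE variants byte-swap first). */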
+static void p010LEToY_c_vsx(uint8_t *dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_dst, shift;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj)
+ shift = vec_splats((unsigned short)6);
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+
+ v_dst = vec_sr(v_rd0, shift);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 16;
+ dst_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dst + i * 2, AV_RL16(src + i * 2) >> 6);
+ }
+
+}
+
+static void p010BEToY_c_vsx(uint8_t *dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_dst, shift;
+ vector unsigned char sample;
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)dst;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample = ((vector unsigned char){1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14});
+ shift = vec_splats((unsigned short)6);
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+
+ v_rd1 = vec_perm(v_rd0, v_rd0, sample);
+ v_dst = vec_sr(v_rd1, shift);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 16;
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dst + i * 2, AV_RB16(src + i * 2) >> 6);
+ }
+
+}
+
+static void p010LEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+
+ vector unsigned short v_rd0, v_rd1, v_dst;
+ vector unsigned char sample1, sample2;
+ vector unsigned short shift;
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+ sample2 = ((vector unsigned char){2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31});
+ shift = vec_splats((unsigned short)6);
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+ v_dst = vec_perm(v_rd0, v_rd1, sample1);
+ v_dst = vec_sr(v_dst, shift);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+ v_dst = vec_perm(v_rd0, v_rd1, sample2);
+ v_dst = vec_sr(v_dst, shift);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 32;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dstU + i * 2, AV_RL16(src1 + i * 4 + 0) >> 6);
+ AV_WN16(dstV + i * 2, AV_RL16(src1 + i * 4 + 2) >> 6);
+ }
+
+}
+
+static void p010BEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+
+ vector unsigned short v_rd0, v_rd1, v_dst;
+ vector unsigned char sample1, sample2;
+ vector unsigned short shift;
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){1, 0, 5, 4, 9, 8, 13, 12, 17, 16, 21, 20, 25, 24, 29, 28});
+ sample2 = ((vector unsigned char){3, 2, 7, 6, 11, 10, 15, 14, 19, 18, 23, 22, 27, 26, 31, 30});
+ shift = vec_splats((unsigned short)6);
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+ v_dst = vec_perm(v_rd0, v_rd1, sample1);
+ v_dst = vec_sr(v_dst, shift);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+ v_dst = vec_perm(v_rd0, v_rd1, sample2);
+ v_dst = vec_sr(v_dst, shift);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 32;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dstU + i * 2, AV_RB16(src1 + i * 4 + 0) >> 6);
+ AV_WN16(dstV + i * 2, AV_RB16(src1 + i * 4 + 2) >> 6);
+
+ }
+
+}
+
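+/* P016 carries full 16-bit chroma, so the samples are deinterleaved without shifting. */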
+static void p016LEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+
+ vector unsigned short v_rd0, v_rd1, v_dst;
+ vector unsigned char sample1, sample2;
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29});
+ sample2 = ((vector unsigned char){2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31});
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+ v_dst = vec_perm(v_rd0, v_rd1, sample1);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+ v_dst = vec_perm(v_rd0, v_rd1, sample2);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 32;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dstU + i * 2, AV_RL16(src1 + i * 4 + 0));
+ AV_WN16(dstV + i * 2, AV_RL16(src1 + i * 4 + 2));
+ }
+
+}
+
+static void p016BEToUV_c_vsx(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+
+ vector unsigned short v_rd0, v_rd1, v_dst;
+ vector unsigned char sample1, sample2;
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)dstU;
+ uintptr_t dstV_addr = (uintptr_t)dstV;
+
+ width_adj = width & (~(int)0x07);
+
+ if(width_adj){
+ sample1 = ((vector unsigned char){1, 0, 5, 4, 9, 8, 13, 12, 17, 16, 21, 20, 25, 24, 29, 28});
+ sample2 = ((vector unsigned char){3, 2, 7, 6, 11, 10, 15, 14, 19, 18, 23, 22, 27, 26, 31, 30});
+ }
+ for ( i = 0; i < width_adj; i += 8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr + 16));
+
+ v_dst = vec_perm(v_rd0, v_rd1, sample1);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstU_addr);
+ v_dst = vec_perm(v_rd0, v_rd1, sample2);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 32;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+
+ for (i = width_adj; i < width; i++) {
+ AV_WN16(dstU + i * 2, AV_RB16(src1 + i * 4 + 0));
+        AV_WN16(dstV + i * 2, AV_RB16(src1 + i * 4 + 2));
+ }
+
+}
+
+#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
+
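+/* Packed BGR24 readers: split the B/G/R bytes with vec_perm, widen to 32 bits,
+ * multiply by the rgb2yuv coefficients, then round and shift back to 16-bit output. */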
+static void bgr24ToY_c_vsx(uint8_t *_dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *rgb2yuv)
+{
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ int16_t *dst = (int16_t *)_dst;
+ vector signed int v_ry, v_gy, v_by;
+ int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x801<<(RGB2YUV_SHIFT-7)));
+ shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+ v_ry = vec_splats((signed int)ry);
+ v_gy = vec_splats((signed int)gy);
+ v_by = vec_splats((signed int)by);
+ }
+
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+ v_b = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+ v_g = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+ v_r = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+ v_b = vec_and(v_b, v_FF);
+ v_g = vec_and(v_g, v_FF);
+ v_r = vec_and(v_r, v_FF);
+
+ v_rd0 = vec_mergeh(v_b, v_null);
+ v_rd1 = vec_mergeh(v_g, v_null);
+ v_rd2 = vec_mergeh(v_r, v_null);
+
+ v_g = vec_mergel(v_g, v_null);
+ v_b = vec_mergel(v_b, v_null);
+ v_r = vec_mergel(v_r, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_ry);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd1, v_gy ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd0, v_by ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gy ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_by ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr);
+
+ src_addr += 24;
+ dst_addr += 16;
+ }
+ for (i = width_adj; i < width; i++) {
+ unsigned int b = src[3*i];
+ unsigned int g = src[3*i + 1];
+ unsigned int r = src[3*i + 2];
+
+ dst[i] = ((ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
+ }
+
+}
+
+
+static void bgr24ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)_dstU;
+ uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x4001<<(RGB2YUV_SHIFT-7)));
+ shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+ v_ru = vec_splats((signed int)ru);
+ v_gu = vec_splats((signed int)gu);
+ v_bu = vec_splats((signed int)bu);
+ v_rv = vec_splats((signed int)rv);
+ v_gv = vec_splats((signed int)gv);
+ v_bv = vec_splats((signed int)bv);
+ }
+
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+ v_b = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+ v_g = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+ v_r = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+ v_b = vec_and(v_b, v_FF);
+ v_g = vec_and(v_g, v_FF);
+ v_r = vec_and(v_r, v_FF);
+
+ v_rd0 = vec_mergeh(v_b, v_null);
+ v_rd1 = vec_mergeh(v_g, v_null);
+ v_rd2 = vec_mergeh(v_r, v_null);
+
+ v_g = vec_mergel(v_g, v_null);
+ v_b = vec_mergel(v_b, v_null);
+ v_r = vec_mergel(v_r, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_ru);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd1, v_gu ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd0, v_bu ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gu ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bu ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_rv);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd1, v_gv ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd0, v_bv ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gv ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bv ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 24;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+ for (i = width_adj; i < width; i++) {
+ int b = src1[3 * i + 0];
+ int g = src1[3 * i + 1];
+ int r = src1[3 * i + 2];
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ }
+
+ av_assert1(src1 == src2);
+}
+
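+/* BGR24 -> chroma with 2:1 horizontal averaging: adjacent pixels are summed
+ * before the U/V coefficients are applied, hence the doubled rounding bias and
+ * the extra shift. */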
+static void bgr24ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)_dstU;
+ uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x8002<<(RGB2YUV_SHIFT-7)));
+ shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-5));
+ v_ru = vec_splats((signed int)ru);
+ v_gu = vec_splats((signed int)gu);
+ v_bu = vec_splats((signed int)bu);
+ v_rv = vec_splats((signed int)rv);
+ v_gv = vec_splats((signed int)gv);
+ v_bv = vec_splats((signed int)bv);
+
+ }
+
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+ v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+ v_b = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30}));
+ v_g = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31}));
+ v_r = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){2, 5, 8, 11, 14, 17, 20, 23, 26, 29}));
+
+ v_b = vec_perm(v_b, v_rd2,
+ ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29}));
+ v_g = vec_perm(v_g, v_rd2,
+ ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30}));
+ v_r = vec_perm(v_r, v_rd2,
+ ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31}));
+
+ v_r = vec_add(vec_and(v_r, v_FF), vec_sr(v_r, vec_splats((unsigned short)8)));
+ v_g = vec_add(vec_and(v_g, v_FF), vec_sr(v_g, vec_splats((unsigned short)8)));
+ v_b = vec_add(vec_and(v_b, v_FF), vec_sr(v_b, vec_splats((unsigned short)8)));
+
+ v_rd0 = vec_mergeh(v_r, v_null);
+ v_rd1 = vec_mergeh(v_g, v_null);
+ v_rd2 = vec_mergeh(v_b, v_null);
+
+ v_r = vec_mergel(v_r, v_null);
+ v_g = vec_mergel(v_g, v_null);
+ v_b = vec_mergel(v_b, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_ru);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd1, v_gu ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd2, v_bu ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gu ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bu ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_rv);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd1, v_gv ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd2, v_bv ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gv ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bv ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 48;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ int b = src1[6 * i + 0] + src1[6 * i + 3];
+ int g = src1[6 * i + 1] + src1[6 * i + 4];
+ int r = src1[6 * i + 2] + src1[6 * i + 5];
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+ dstV[i] = (rv*r + gv*g + bv*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+ }
+
+ av_assert1(src1 == src2);
+}
+
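+/* Same as the bgr24 readers above with the R and B byte offsets swapped. */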
+static void rgb24ToY_c_vsx(uint8_t *_dst, const uint8_t *src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *rgb2yuv)
+{
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ int16_t *dst = (int16_t *)_dst;
+ vector signed int v_ry, v_gy, v_by;
+ int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ uintptr_t src_addr = (uintptr_t)src;
+ uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x801<<(RGB2YUV_SHIFT-7)));
+ shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+ v_ry = vec_splats((signed int)ry);
+ v_gy = vec_splats((signed int)gy);
+ v_by = vec_splats((signed int)by);
+ }
+
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+ v_r = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+ v_g = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+ v_b = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+ v_b = vec_and(v_b, v_FF);
+ v_g = vec_and(v_g, v_FF);
+ v_r = vec_and(v_r, v_FF);
+
+ v_rd0 = vec_mergeh(v_b, v_null);
+ v_rd1 = vec_mergeh(v_g, v_null);
+ v_rd2 = vec_mergeh(v_r, v_null);
+
+ v_g = vec_mergel(v_g, v_null);
+ v_b = vec_mergel(v_b, v_null);
+ v_r = vec_mergel(v_r, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd2, v_ry);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd1, v_gy ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd0, v_by ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gy ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_by ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr);
+
+ src_addr += 24;
+ dst_addr += 16;
+ }
+ for (i = width_adj; i < width; i++) {
+ unsigned int r = src[3*i];
+ unsigned int g = src[3*i + 1];
+ unsigned int b = src[3*i + 2];
+
+ dst[i] = ((ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
+ }
+
+}
+
+static void rgb24ToUV_c_vsx(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+ av_assert1(src1 == src2);
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)_dstU;
+ uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x4001<<(RGB2YUV_SHIFT-7)));
+ shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+ v_ru = vec_splats((signed int)ru);
+ v_gu = vec_splats((signed int)gu);
+ v_bu = vec_splats((signed int)bu);
+ v_rv = vec_splats((signed int)rv);
+ v_gv = vec_splats((signed int)gv);
+ v_bv = vec_splats((signed int)bv);
+
+ }
+
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+
+ v_r = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 0, 3, 0, 6, 0, 9, 0, 12, 0, 15, 0, 18, 0, 21, 0}));
+ v_g = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0}));
+ v_b = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){2, 0, 5, 0, 8, 0, 11, 0, 14, 0, 17, 0, 20, 0, 23, 0}));
+
+ v_r = vec_and(v_r, v_FF);
+ v_g = vec_and(v_g, v_FF);
+ v_b = vec_and(v_b, v_FF);
+
+ v_rd0 = vec_mergeh(v_r, v_null);
+ v_rd1 = vec_mergeh(v_g, v_null);
+ v_rd2 = vec_mergeh(v_b, v_null);
+
+ v_r = vec_mergel(v_r, v_null);
+ v_g = vec_mergel(v_g, v_null);
+ v_b = vec_mergel(v_b, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_ru);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd1, v_gu ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd2, v_bu ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gu ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bu ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_rv);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd1, v_gv ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd2, v_bv ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gv ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bv ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 24;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+ for (i = width_adj; i < width; i++) {
+ int r = src1[3 * i + 0];
+ int g = src1[3 * i + 1];
+ int b = src1[3 * i + 2];
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
+ }
+
+}
+
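+/*
+ * Horizontally subsampled chroma from packed RGB24: each output U/V sample
+ * is built from the sum of two adjacent pixels, as in the scalar
+ * rgb24ToUV_half_c (rounding constant 0x8002 << (RGB2YUV_SHIFT - 7),
+ * shift RGB2YUV_SHIFT - 5). Each loop iteration consumes 48 input bytes
+ * (16 pixels) and writes eight U and eight V samples.
+ */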
+static void rgb24ToUV_half_c_vsx(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
+ const uint8_t *src2, int width, uint32_t *rgb2yuv)
+{
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ int16_t *dstU = (int16_t *)_dstU;
+ int16_t *dstV = (int16_t *)_dstV;
+ vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ uintptr_t src_addr = (uintptr_t)src1;
+ uintptr_t dstU_addr = (uintptr_t)_dstU;
+ uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+ av_assert1(src1 == src2);
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x8002<<(RGB2YUV_SHIFT-7)));
+ shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-5));
+ v_ru = vec_splats((signed int)ru);
+ v_gu = vec_splats((signed int)gu);
+ v_bu = vec_splats((signed int)bu);
+ v_rv = vec_splats((signed int)rv);
+ v_gv = vec_splats((signed int)gv);
+ v_bv = vec_splats((signed int)bv);
+
+ }
+
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src_addr+16));
+ v_rd2 = vec_vsx_ld(0, (unsigned short *)(src_addr+32));
+
+ v_r = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30}));
+ v_g = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31}));
+ v_b = vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){2, 5, 8, 11, 14, 17, 20, 23, 26, 29}));
+
+ v_r = vec_perm(v_r, v_rd2,
+ ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29}));
+ v_g = vec_perm(v_g, v_rd2,
+ ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30}));
+ v_b = vec_perm(v_b, v_rd2,
+ ((vector unsigned char){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31}));
+
+ v_r = vec_add(vec_and(v_r, v_FF), vec_sr(v_r, vec_splats((unsigned short)8)));
+ v_g = vec_add(vec_and(v_g, v_FF), vec_sr(v_g, vec_splats((unsigned short)8)));
+ v_b = vec_add(vec_and(v_b, v_FF), vec_sr(v_b, vec_splats((unsigned short)8)));
+
+ v_rd0 = vec_mergeh(v_r, v_null);
+ v_rd1 = vec_mergeh(v_g, v_null);
+ v_rd2 = vec_mergeh(v_b, v_null);
+
+ v_r = vec_mergel(v_r, v_null);
+ v_g = vec_mergel(v_g, v_null);
+ v_b = vec_mergel(v_b, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_ru);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd1, v_gu ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd2, v_bu ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gu ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bu ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_rd0, v_rv);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd1, v_gv ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_rd2, v_bv ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gv ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bv ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+ src_addr += 48;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ int r = src1[6 * i + 0] + src1[6 * i + 3];
+ int g = src1[6 * i + 1] + src1[6 * i + 4];
+ int b = src1[6 * i + 2] + src1[6 * i + 5];
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+ dstV[i] = (rv*r + gv*g + bv*b + (0x8002<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-5);
+ }
+
+}
+
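+/*
+ * Planar 8-bit GBR to the 16-bit intermediate luma plane: plane 0 is G,
+ * plane 1 is B, plane 2 is R (see the scalar tail below). The bytes are
+ * widened to 32 bits, multiplied by ry/gy/by in two 4-element halves,
+ * rounded, shifted and packed back to eight 16-bit samples per iteration.
+ */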
+static void planar_rgb_to_y_vsx(uint8_t *_dst, const uint8_t *src[4],
+ int width, int32_t *rgb2yuv)
+{
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r, v_g1, v_b1, v_r1;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ int16_t *dst = (int16_t *)_dst;
+ vector signed int v_ry, v_gy, v_by;
+ int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x801<<(RGB2YUV_SHIFT-7)));
+ shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+ v_ry = vec_splats((signed int)ry);
+ v_gy = vec_splats((signed int)gy);
+ v_by = vec_splats((signed int)by);
+ }
+
+ for (i = 0; i < width_adj; i+=8) {
+ if(i&1){
+ v_rd0 = vec_sld(v_rd0, v_rd0, 8);
+ v_rd1 = vec_sld(v_rd1, v_rd1, 8);
+ v_rd2 = vec_sld(v_rd2, v_rd2, 8);
+ }else{
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src[0]);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src[1]));
+ v_rd2 = vec_vsx_ld(0, (unsigned short *)(src[2]));
+ }
+
+ v_g = vec_perm(v_rd0, v_rd0,
+ ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+ v_b = vec_perm(v_rd1, v_rd1,
+ ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+ v_r = vec_perm(v_rd2, v_rd2,
+ ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+
+ v_b = vec_and(v_b, v_FF);
+ v_g = vec_and(v_g, v_FF);
+ v_r = vec_and(v_r, v_FF);
+
+ v_b1 = vec_mergeh(v_b, v_null);
+ v_g1 = vec_mergeh(v_g, v_null);
+ v_r1 = vec_mergeh(v_r, v_null);
+
+ v_g = vec_mergel(v_g, v_null);
+ v_b = vec_mergel(v_b, v_null);
+ v_r = vec_mergel(v_r, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gy ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_by ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gy ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_by ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr);
+
+ src[0] += 8;
+ src[1] += 8;
+ src[2] += 8;
+ dst_addr += 16;
+ }
+
+ for (i = width_adj; i < width; i++) {
+ int g = src[0][0];
+ int b = src[1][0];
+ int r = src[2][0];
+ dst[i] = (ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+ ++src[0];
+ ++src[1];
+ ++src[2];
+ }
+
+}
+
+
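+/*
+ * Planar 8-bit alpha (plane 3): each byte is widened and shifted left by 6
+ * into the 16-bit intermediate range, eight samples per iteration.
+ */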
+static void planar_rgb_to_a_vsx(uint8_t *_dst, const uint8_t *src[4],
+ int width, int32_t *unused)
+{
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_a, v_dst;
+ int16_t *dst = (int16_t *)_dst;
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+ width_adj = width&(~(int)0x07);
+
+ for (i = 0; i < width_adj; i+=8) {
+ if(i&1)
+ v_rd0 = vec_sld(v_rd0, v_rd0, 8);
+ else
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src[3]);
+
+ v_a = vec_perm(v_rd0, v_rd0,
+ ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+ v_a = vec_and(v_a, v_FF);
+ v_dst = vec_sl(v_a, vec_splats((unsigned short)6));
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src[3] += 8;
+ dst_addr += 16;
+ }
+ for (i = width_adj; i < width; i++){
+ dst[i] = src[3][0] << 6;
+ ++src[3];
+ }
+
+}
+
+
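+/*
+ * Planar 8-bit GBR to chroma: same plane order and widening as
+ * planar_rgb_to_y_vsx above, using the U/V coefficient sets and the
+ * 0x4001 << (RGB2YUV_SHIFT - 7) rounding constant of the scalar code.
+ */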
+static void planar_rgb_to_uv_vsx(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *src[4], int width, int32_t *rgb2yuv)
+{
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_rd1, v_rd2, v_g, v_b, v_r, v_g1, v_b1, v_r1;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ uint16_t *dstU = (uint16_t *)_dstU;
+ uint16_t *dstV = (uint16_t *)_dstV;
+ vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+ vector unsigned short v_FF = vec_splats((unsigned short)0x00FF);
+
+ uintptr_t dstU_addr = (uintptr_t)_dstU;
+ uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(0x4001<<(RGB2YUV_SHIFT-7)));
+ shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT-6));
+ v_ru = vec_splats((signed int)ru);
+ v_gu = vec_splats((signed int)gu);
+ v_bu = vec_splats((signed int)bu);
+ v_rv = vec_splats((signed int)rv);
+ v_gv = vec_splats((signed int)gv);
+ v_bv = vec_splats((signed int)bv);
+ }
+
+ for (i = 0; i < width_adj; i+=8) {
+ if(i&1){
+ v_rd0 = vec_sld(v_rd0, v_rd0, 8);
+ v_rd1 = vec_sld(v_rd1, v_rd1, 8);
+ v_rd2 = vec_sld(v_rd2, v_rd2, 8);
+ }else{
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src[0]);
+ v_rd1 = vec_vsx_ld(0, (unsigned short *)(src[1]));
+ v_rd2 = vec_vsx_ld(0, (unsigned short *)(src[2]));
+ }
+
+ v_g = vec_perm(v_rd0, v_rd0,
+ ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+ v_b = vec_perm(v_rd1, v_rd1,
+ ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+ v_r = vec_perm(v_rd2, v_rd2,
+ ((vector unsigned char){0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0}));
+
+ v_b = vec_and(v_b, v_FF);
+ v_g = vec_and(v_g, v_FF);
+ v_r = vec_and(v_r, v_FF);
+
+ v_b1 = vec_mergeh(v_b, v_null);
+ v_g1 = vec_mergeh(v_g, v_null);
+ v_r1 = vec_mergeh(v_r, v_null);
+
+ v_g = vec_mergel(v_g, v_null);
+ v_b = vec_mergel(v_b, v_null);
+ v_r = vec_mergel(v_r, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gu ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_bu ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gu ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bu ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gv ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_bv ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gv ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bv ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+ src[0] += 8;
+ src[1] += 8;
+ src[2] += 8;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+ for (i = width_adj; i < width; i++) {
+ int g = src[0][0];
+ int b = src[1][0];
+ int r = src[2][0];
+
+ dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+ dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
+ ++src[0];
+ ++src[1];
+ ++src[2];
+
+ }
+
+}
+
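+/*
+ * 9-16 bit planar GBR readers. Big-endian input is byte-swapped per 16-bit
+ * lane with vec_perm before the multiply-accumulate; the scalar tails use
+ * the rdpx() macro (AV_RB16/AV_RL16) for the leftover pixels.
+ */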
+#define rdpx(src) \
+ is_be ? AV_RB16(src) : AV_RL16(src)
+static av_always_inline
+void planar_rgb16_to_y_vsx(uint8_t *_dst, const uint8_t *_src[4],
+ int width, int bpc, int is_be, int32_t *rgb2yuv)
+{
+
+ int i, width_adj;
+ vector unsigned short v_g, v_b, v_r, v_g1, v_b1, v_r1;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ int16_t *dst = (int16_t *)_dst;
+ const uint16_t **src = (const uint16_t **)_src;
+ vector signed int v_ry, v_gy, v_by;
+ int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+ int sh = bpc < 16 ? bpc : 14;
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+ uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(33 << (RGB2YUV_SHIFT + bpc - 9)));
+ shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT + sh - 14));
+ v_ry = vec_splats((signed int)ry);
+ v_gy = vec_splats((signed int)gy);
+ v_by = vec_splats((signed int)by);
+ }
+
+ for (i = 0; i < width_adj; i+=8) {
+
+ v_g = vec_vsx_ld(0, (unsigned short *)src[0]);
+ v_b = vec_vsx_ld(0, (unsigned short *)(src[1]));
+ v_r = vec_vsx_ld(0, (unsigned short *)(src[2]));
+ if(is_be){
+ v_g = vec_perm(v_g, v_g,
+ ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+ v_b = vec_perm(v_b, v_b,
+ ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+ v_r = vec_perm(v_r, v_r,
+ ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+ }
+
+
+ v_b1 = vec_mergeh(v_b, v_null);
+ v_g1 = vec_mergeh(v_g, v_null);
+ v_r1 = vec_mergeh(v_r, v_null);
+
+ v_g = vec_mergel(v_g, v_null);
+ v_b = vec_mergel(v_b, v_null);
+ v_r = vec_mergel(v_r, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ry);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gy ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_by ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ry);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gy ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_by ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dst_addr);
+
+ src[0] += 8;
+ src[1] += 8;
+ src[2] += 8;
+ dst_addr += 16;
+ }
+ for (i = width_adj; i < width; i++) {
+ int g = rdpx(src[0]);
+ int b = rdpx(src[1]);
+ int r = rdpx(src[2]);
+
+ dst[i] = ((ry*r + gy*g + by*b +
+ (33 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + sh - 14));
+ ++src[0];
+ ++src[1];
+ ++src[2];
+ }
+
+}
+// TODO
+static av_always_inline
+void planar_rgb16_to_a_vsx(uint8_t *_dst, const uint8_t *_src[4],
+ int width, int bpc, int is_be, int32_t *rgb2yuv)
+{
+
+ int i, width_adj;
+ vector unsigned short v_rd0, v_dst, shift;
+ const uint16_t **src = (const uint16_t **)_src;
+ uint16_t *dst = (uint16_t *)_dst;
+ int sh = bpc < 16 ? bpc : 14;
+ uintptr_t dst_addr = (uintptr_t)_dst;
+
+
+ width_adj = width&(~(int)0x07);
+ if(width_adj){
+ shift = vec_splats((unsigned short)(14 - sh));
+ }
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (unsigned short *)src[3]);
+ if(is_be)
+ v_rd0 = vec_perm(v_rd0, v_rd0,
+ ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+ v_dst = vec_sl(v_rd0, shift);
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src[3] += 8;
+ dst_addr += 16;
+ }
+ for (i=width_adj; i< width; i++){
+ dst[i] = rdpx(src[3]) << (14 - sh);
+ ++src[3];
+ }
+
+}
+
+static av_always_inline
+void planar_rgb16_to_uv_vsx(uint8_t *_dstU, uint8_t *_dstV,
+ const uint8_t *_src[4], int width,
+ int bpc, int is_be, int32_t *rgb2yuv)
+{
+
+
+ int i, width_adj;
+ vector unsigned short v_g, v_b, v_r, v_g1, v_b1, v_r1;
+ vector unsigned int v_dst1, v_dst2;
+ vector unsigned int shift1, shift2;
+ const uint16_t **src = (const uint16_t **)_src;
+ uint16_t *dstU = (uint16_t *)_dstU;
+ uint16_t *dstV = (uint16_t *)_dstV;
+ vector signed int v_ru, v_gu, v_bu, v_rv, v_gv, v_bv;
+ int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+ int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
+ int sh = bpc < 16 ? bpc : 14;
+ vector unsigned short v_null = vec_splats((unsigned short)0x0000);
+
+ uintptr_t dstU_addr = (uintptr_t)_dstU;
+ uintptr_t dstV_addr = (uintptr_t)_dstV;
+
+
+ width_adj = width&(~(int)0x07);
+
+ if(width_adj){
+ shift1 = vec_splats((unsigned int)(257 << (RGB2YUV_SHIFT + bpc - 9)));
+ shift2 = vec_splats((unsigned int)(RGB2YUV_SHIFT + sh - 14));
+ v_ru = vec_splats((signed int)ru);
+ v_gu = vec_splats((signed int)gu);
+ v_bu = vec_splats((signed int)bu);
+ v_rv = vec_splats((signed int)rv);
+ v_gv = vec_splats((signed int)gv);
+ v_bv = vec_splats((signed int)bv);
+ }
+
+ for (i = 0; i < width_adj; i+=8) {
+ v_g = vec_vsx_ld(0, (unsigned short *)src[0]);
+ v_b = vec_vsx_ld(0, (unsigned short *)(src[1]));
+ v_r = vec_vsx_ld(0, (unsigned short *)(src[2]));
+ if(is_be){
+ v_g = vec_perm(v_g, v_g,
+ ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+ v_b = vec_perm(v_b, v_b,
+ ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+ v_r = vec_perm(v_r, v_r,
+ ((vector unsigned char){1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14}));
+ }
+
+
+
+ v_b1 = vec_mergeh(v_b, v_null);
+ v_g1 = vec_mergeh(v_g, v_null);
+ v_r1 = vec_mergeh(v_r, v_null);
+
+ v_g = vec_mergel(v_g, v_null);
+ v_b = vec_mergel(v_b, v_null);
+ v_r = vec_mergel(v_r, v_null);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_ru);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gu ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_bu ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_ru);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gu ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bu ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstU_addr);
+
+ v_dst1 = (vector unsigned int)vec_mul((vector signed int)v_r1, v_rv);
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_g1, v_gv ));
+ v_dst1 = (vector unsigned int)vec_add((vector signed int)v_dst1,
+ vec_mul((vector signed int)v_b1, v_bv ));
+ v_dst1 = vec_add(v_dst1, shift1);
+ v_dst1 = vec_sr(v_dst1, shift2);
+ v_dst2 = (vector unsigned int)vec_mul((vector signed int)v_r, v_rv);
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_g, v_gv ));
+ v_dst2 = (vector unsigned int)vec_add((vector signed int)v_dst2,
+ vec_mul((vector signed int)v_b, v_bv ));
+ v_dst2 = vec_add(v_dst2, shift1);
+ v_dst2 = vec_sr(v_dst2, shift2);
+ v_dst1 = vec_perm(v_dst1, v_dst2,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst1, 0, (unsigned char *)dstV_addr);
+
+ src[0] += 8;
+ src[1] += 8;
+ src[2] += 8;
+ dstU_addr += 16;
+ dstV_addr += 16;
+ }
+ for (i = width_adj; i < width; i++) {
+ int g = rdpx(src[0]);
+ int b = rdpx(src[1]);
+ int r = rdpx(src[2]);
+
+ dstU[i] = (ru*r + gu*g + bu*b +
+ (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + sh - 14);
+ dstV[i] = (rv*r + gv*g + bv*b +
+ (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + sh - 14);
+ ++src[0];
+ ++src[1];
+ ++src[2];
+ }
+
+}
+#undef rdpx
+
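+/*
+ * Float gray to 16 bit: plain scalar conversion for now; a draft VSX
+ * version is kept commented out below.
+ */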
+static av_always_inline void grayf32ToY16_c_vsx(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+ const uint8_t *unused2, int width, uint32_t *unused)
+{
+
+ int i;
+ const float *src = (const float *)_src;
+ uint16_t *dst = (uint16_t *)_dst;
+
+ for (i = 0; i < width; ++i){
+ dst[i] = av_clip_uint16(lrintf(65535.0f * src[i]));
+ }
+}
+
+static av_always_inline void grayf32ToY16_bswap_c_vsx(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
+ const uint8_t *unused2, int width, uint32_t *unused)
+{
+
+ int i;
+ const uint32_t *src = (const uint32_t *)_src;
+ uint16_t *dst = (uint16_t *)_dst;
+
+ for (i = 0; i < width; ++i){
+ dst[i] = av_clip_uint16(lrintf(65535.0f * av_int2float(av_bswap32(src[i]))));
+ }
+}
+
+/*static av_always_inline
+void grayf32ToY16_c_vsx(uint8_t *_dst, const uint8_t *_src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+ vector float v_rd0, v_rd1;
+ vector signed int v_rd00, v_rd01, v_rd02, v_rd03;
+ vector unsigned short v_dst;
+ const float *src = (const float *)_src;
+ uint16_t *dst = (uint16_t *)_dst;
+
+ uintptr_t dst_addr = (uintptr_t)_dst;
+ uintptr_t src_addr = (uintptr_t)_src;
+
+
+ width_adj = width&(~(int)0x07);
+
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (float *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (float *)(src_addr+16));
+
+ v_rd0 = vec_rint(vec_mul(v_rd0, vec_splats((float)65535.0f)));
+ v_rd1 = vec_rint(vec_mul(v_rd1, vec_splats((float)65535.0f)));
+ v_rd00 = (vector signed int)vec_cts(v_rd0, 0);
+ v_rd01 = (vector signed int)vec_cts(v_rd1, 0);
+ v_rd02 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd00,
+ vec_splats((unsigned int)0xFFFF));
+ v_rd03 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd01,
+ vec_splats((unsigned int)0xFFFF));
+ v_rd00 = vec_or(v_rd00, v_rd02);
+ v_rd01 = vec_or(v_rd01, v_rd03);
+
+ v_dst = (vector unsigned short)vec_perm(v_rd00, v_rd01,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 32;
+ dst_addr += 16;
+ }
+ for (i = width_adj; i < width; i++){
+ dst[i] = av_clip_uint16(lrintf(65535.0f * src[i]));
+ }
+
+}
+static av_always_inline
+void grayf32ToY16_bswap_c_vsx(uint8_t *_dst, const uint8_t *_src,
+ const uint8_t *unused1, const uint8_t *unused2,
+ int width, uint32_t *unused)
+{
+
+ int i, width_adj;
+ vector signed int v_rd0, v_rd1, v_rd2, v_rd3;
+ vector float v_rd00, v_rd01;
+ vector unsigned short v_dst;
+ const uint32_t *src = (const float *)_src;
+ uint16_t *dst = (uint16_t *)_dst;
+
+ uintptr_t dst_addr = (uintptr_t)_dst;
+ uintptr_t src_addr = (uintptr_t)_src;
+
+
+ width_adj = width&(~(int)0x07);
+
+ for (i = 0; i < width_adj; i+=8) {
+ v_rd0 = vec_vsx_ld(0, (int *)src_addr);
+ v_rd1 = vec_vsx_ld(0, (int *)(src_addr+16));
+
+ v_rd0 = vec_perm(v_rd0, v_rd0,
+ ((vector unsigned char){3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}));
+ v_rd1 = vec_perm(v_rd1, v_rd1,
+ ((vector unsigned char){3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}));
+ v_rd00 = vec_round(vec_mul((vector float)v_rd0, vec_splats((float)65535.0f)));
+ v_rd01 = vec_round(vec_mul((vector float)v_rd1, vec_splats((float)65535.0f)));
+
+
+ v_rd0 = vec_cts(v_rd00, 0);
+ v_rd1 = vec_cts(v_rd01, 0);
+ v_rd2 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd0,
+ vec_splats((unsigned int)0xFFFF));
+ v_rd3 = (vector signed int)vec_cmpgt((vector unsigned int)v_rd1,
+ vec_splats((unsigned int)0xFFFF));
+ v_rd0 = vec_or(v_rd0, v_rd2);
+ v_rd1 = vec_or(v_rd1, v_rd3);
+
+
+
+ v_dst = (vector unsigned short)vec_perm(v_rd0, v_rd1,
+ ((vector unsigned char){0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}));
+ vec_vsx_st((vector unsigned char)v_dst, 0, (unsigned char *)dst_addr);
+
+ src_addr += 32;
+ dst_addr += 16;
+ }
+ for (i = width_adj; i < width; i++){
+
+ dst[i] = av_clip_uint16(lrintf(65535.0f * av_int2float(av_bswap32(src[i]))));
+ }
+
+}*/
+
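+/*
+ * Instantiate the per-depth, per-endianness wrappers around the generic
+ * planar_rgb16_to_{y,uv,a}_vsx helpers, mirroring the macros in
+ * libswscale/input.c.
+ */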
+#define rgb9plus_planar_funcs_endian(nbits, endian_name, endian) \
+static void planar_rgb##nbits##endian_name##_to_y_vsx(uint8_t *dst, const uint8_t *src[4], \
+ int w, int32_t *rgb2yuv) \
+{ \
+ planar_rgb16_to_y_vsx(dst, src, w, nbits, endian, rgb2yuv); \
+} \
+static void planar_rgb##nbits##endian_name##_to_uv_vsx(uint8_t *dstU, uint8_t *dstV, \
+ const uint8_t *src[4], int w, int32_t *rgb2yuv) \
+{ \
+ planar_rgb16_to_uv_vsx(dstU, dstV, src, w, nbits, endian, rgb2yuv); \
+} \
+
+
+#define rgb9plus_planar_transparency_funcs(nbits) \
+static void planar_rgb##nbits##le_to_a_vsx(uint8_t *dst, const uint8_t *src[4], \
+ int w, int32_t *rgb2yuv) \
+{ \
+ planar_rgb16_to_a_vsx(dst, src, w, nbits, 0, rgb2yuv); \
+} \
+static void planar_rgb##nbits##be_to_a_vsx(uint8_t *dst, const uint8_t *src[4], \
+ int w, int32_t *rgb2yuv) \
+{ \
+ planar_rgb16_to_a_vsx(dst, src, w, nbits, 1, rgb2yuv); \
+}
+
+#define rgb9plus_planar_funcs(nbits) \
+ rgb9plus_planar_funcs_endian(nbits, le, 0) \
+ rgb9plus_planar_funcs_endian(nbits, be, 1)
+
+rgb9plus_planar_funcs(9)
+rgb9plus_planar_funcs(10)
+rgb9plus_planar_funcs(12)
+rgb9plus_planar_funcs(14)
+rgb9plus_planar_funcs(16)
+
+rgb9plus_planar_transparency_funcs(10)
+rgb9plus_planar_transparency_funcs(12)
+rgb9plus_planar_transparency_funcs(16)
+#endif //!HAVE_BIGENDIAN
+#endif //HAVE_VSX
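+/*
+ * Called from sws_init_swscale() after ff_sws_init_input_funcs(): overrides
+ * the selected input functions with VSX versions when the CPU reports
+ * AV_CPU_FLAG_VSX. Only little-endian builds are covered.
+ */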
+av_cold void ff_sws_init_input_funcs_vsx(SwsContext *c)
+{
+ if (!(av_get_cpu_flags() & AV_CPU_FLAG_VSX))
+ return;
+#if HAVE_VSX
+#if !HAVE_BIGENDIAN
+ enum AVPixelFormat srcFormat = c->srcFormat;
+
+ c->chrToYV12 = NULL;
+ switch (srcFormat) {
+ case AV_PIX_FMT_YUYV422:
+ c->chrToYV12 = yuy2ToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_YVYU422:
+ c->chrToYV12 = yvy2ToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_UYVY422:
+ c->chrToYV12 = uyvyToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_NV12:
+ case AV_PIX_FMT_NV24:
+ c->chrToYV12 = nv12ToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_NV21:
+ case AV_PIX_FMT_NV42:
+ c->chrToYV12 = nv21ToUV_c_vsx;
+ break;
+ /*case AV_PIX_FMT_RGB8:
+ case AV_PIX_FMT_BGR8:
+ case AV_PIX_FMT_PAL8:
+ case AV_PIX_FMT_BGR4_BYTE:
+ case AV_PIX_FMT_RGB4_BYTE:
+ c->chrToYV12 = palToUV_c_vsx;
+ break;*/
+ case AV_PIX_FMT_GBRP9LE:
+ c->readChrPlanar = planar_rgb9le_to_uv_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP10LE:
+ case AV_PIX_FMT_GBRP10LE:
+ c->readChrPlanar = planar_rgb10le_to_uv_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP12LE:
+ case AV_PIX_FMT_GBRP12LE:
+ c->readChrPlanar = planar_rgb12le_to_uv_vsx;
+ break;
+ case AV_PIX_FMT_GBRP14LE:
+ c->readChrPlanar = planar_rgb14le_to_uv_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP16LE:
+ case AV_PIX_FMT_GBRP16LE:
+ c->readChrPlanar = planar_rgb16le_to_uv_vsx;
+ break;
+ case AV_PIX_FMT_GBRP9BE:
+ c->readChrPlanar = planar_rgb9be_to_uv_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP10BE:
+ case AV_PIX_FMT_GBRP10BE:
+ c->readChrPlanar = planar_rgb10be_to_uv_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP12BE:
+ case AV_PIX_FMT_GBRP12BE:
+ c->readChrPlanar = planar_rgb12be_to_uv_vsx;
+ break;
+ case AV_PIX_FMT_GBRP14BE:
+ c->readChrPlanar = planar_rgb14be_to_uv_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP16BE:
+ case AV_PIX_FMT_GBRP16BE:
+ c->readChrPlanar = planar_rgb16be_to_uv_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP:
+ case AV_PIX_FMT_GBRP:
+ c->readChrPlanar = planar_rgb_to_uv_vsx;
+ break;
+ case AV_PIX_FMT_YUV420P9BE:
+ case AV_PIX_FMT_YUV422P9BE:
+ case AV_PIX_FMT_YUV444P9BE:
+ case AV_PIX_FMT_YUV420P10BE:
+ case AV_PIX_FMT_YUV422P10BE:
+ case AV_PIX_FMT_YUV440P10BE:
+ case AV_PIX_FMT_YUV444P10BE:
+ case AV_PIX_FMT_YUV420P12BE:
+ case AV_PIX_FMT_YUV422P12BE:
+ case AV_PIX_FMT_YUV440P12BE:
+ case AV_PIX_FMT_YUV444P12BE:
+ case AV_PIX_FMT_YUV420P14BE:
+ case AV_PIX_FMT_YUV422P14BE:
+ case AV_PIX_FMT_YUV444P14BE:
+ case AV_PIX_FMT_YUV420P16BE:
+ case AV_PIX_FMT_YUV422P16BE:
+ case AV_PIX_FMT_YUV444P16BE:
+
+ case AV_PIX_FMT_YUVA420P9BE:
+ case AV_PIX_FMT_YUVA422P9BE:
+ case AV_PIX_FMT_YUVA444P9BE:
+ case AV_PIX_FMT_YUVA420P10BE:
+ case AV_PIX_FMT_YUVA422P10BE:
+ case AV_PIX_FMT_YUVA444P10BE:
+ case AV_PIX_FMT_YUVA422P12BE:
+ case AV_PIX_FMT_YUVA444P12BE:
+ case AV_PIX_FMT_YUVA420P16BE:
+ case AV_PIX_FMT_YUVA422P16BE:
+ case AV_PIX_FMT_YUVA444P16BE:
+ c->chrToYV12 = bswap16UV_c_vsx;
+ break;
+ case AV_PIX_FMT_AYUV64LE:
+ c->chrToYV12 = read_ayuv64le_UV_c_vsx;
+ break;
+ case AV_PIX_FMT_P010LE:
+ c->chrToYV12 = p010LEToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_P010BE:
+ c->chrToYV12 = p010BEToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_P016LE:
+ c->chrToYV12 = p016LEToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_P016BE:
+ c->chrToYV12 = p016BEToUV_c_vsx;
+ break;
+ }
+ if (c->chrSrcHSubSample) {
+ switch (srcFormat) {
+ case AV_PIX_FMT_RGBA64BE:
+ c->chrToYV12 = rgb64BEToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGBA64LE:
+ c->chrToYV12 = rgb64LEToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGRA64BE:
+ c->chrToYV12 = bgr64BEToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGRA64LE:
+ c->chrToYV12 = bgr64LEToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB48BE:
+ c->chrToYV12 = rgb48BEToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB48LE:
+ c->chrToYV12 = rgb48LEToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR48BE:
+ c->chrToYV12 = bgr48BEToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR48LE:
+ c->chrToYV12 = bgr48LEToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB32:
+ c->chrToYV12 = bgr32ToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB32_1:
+ c->chrToYV12 = bgr321ToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->chrToYV12 = bgr24ToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR565LE:
+ c->chrToYV12 = bgr16leToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR565BE:
+ c->chrToYV12 = bgr16beToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR555LE:
+ c->chrToYV12 = bgr15leToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR555BE:
+ c->chrToYV12 = bgr15beToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP:
+ case AV_PIX_FMT_GBRP:
+ c->chrToYV12 = gbr24pToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR444LE:
+ c->chrToYV12 = bgr12leToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR444BE:
+ c->chrToYV12 = bgr12beToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR32:
+ c->chrToYV12 = rgb32ToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR32_1:
+ c->chrToYV12 = rgb321ToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->chrToYV12 = rgb24ToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB565LE:
+ c->chrToYV12 = rgb16leToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB565BE:
+ c->chrToYV12 = rgb16beToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB555LE:
+ c->chrToYV12 = rgb15leToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB555BE:
+ c->chrToYV12 = rgb15beToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB444LE:
+ c->chrToYV12 = rgb12leToUV_half_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB444BE:
+ c->chrToYV12 = rgb12beToUV_half_c_vsx;
+ break;
+ }
+ } else {
+ switch (srcFormat) {
+ case AV_PIX_FMT_RGBA64BE:
+ c->chrToYV12 = rgb64BEToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGBA64LE:
+ c->chrToYV12 = rgb64LEToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGRA64BE:
+ c->chrToYV12 = bgr64BEToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGRA64LE:
+ c->chrToYV12 = bgr64LEToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB48BE:
+ c->chrToYV12 = rgb48BEToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB48LE:
+ c->chrToYV12 = rgb48LEToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR48BE:
+ c->chrToYV12 = bgr48BEToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR48LE:
+ c->chrToYV12 = bgr48LEToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB32:
+ c->chrToYV12 = bgr32ToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB32_1:
+ c->chrToYV12 = bgr321ToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->chrToYV12 = bgr24ToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR565LE:
+ c->chrToYV12 = bgr16leToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR565BE:
+ c->chrToYV12 = bgr16beToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR555LE:
+ c->chrToYV12 = bgr15leToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR555BE:
+ c->chrToYV12 = bgr15beToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR444LE:
+ c->chrToYV12 = bgr12leToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR444BE:
+ c->chrToYV12 = bgr12beToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR32:
+ c->chrToYV12 = rgb32ToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR32_1:
+ c->chrToYV12 = rgb321ToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->chrToYV12 = rgb24ToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB565LE:
+ c->chrToYV12 = rgb16leToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB565BE:
+ c->chrToYV12 = rgb16beToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB555LE:
+ c->chrToYV12 = rgb15leToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB555BE:
+ c->chrToYV12 = rgb15beToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB444LE:
+ c->chrToYV12 = rgb12leToUV_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB444BE:
+ c->chrToYV12 = rgb12beToUV_c_vsx;
+ break;
+ }
+ }
+
+ c->lumToYV12 = NULL;
+ c->alpToYV12 = NULL;
+ switch (srcFormat) {
+ case AV_PIX_FMT_GBRP9LE:
+ c->readLumPlanar = planar_rgb9le_to_y_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP10LE:
+ c->readAlpPlanar = planar_rgb10le_to_a_vsx;
+ case AV_PIX_FMT_GBRP10LE:
+ c->readLumPlanar = planar_rgb10le_to_y_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP12LE:
+ c->readAlpPlanar = planar_rgb12le_to_a_vsx;
+ case AV_PIX_FMT_GBRP12LE:
+ c->readLumPlanar = planar_rgb12le_to_y_vsx;
+ break;
+ case AV_PIX_FMT_GBRP14LE:
+ c->readLumPlanar = planar_rgb14le_to_y_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP16LE:
+ c->readAlpPlanar = planar_rgb16le_to_a_vsx;
+ case AV_PIX_FMT_GBRP16LE:
+ c->readLumPlanar = planar_rgb16le_to_y_vsx;
+ break;
+ case AV_PIX_FMT_GBRP9BE:
+ c->readLumPlanar = planar_rgb9be_to_y_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP10BE:
+ c->readAlpPlanar = planar_rgb10be_to_a_vsx;
+ case AV_PIX_FMT_GBRP10BE:
+ c->readLumPlanar = planar_rgb10be_to_y_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP12BE:
+ c->readAlpPlanar = planar_rgb12be_to_a_vsx;
+ case AV_PIX_FMT_GBRP12BE:
+ c->readLumPlanar = planar_rgb12be_to_y_vsx;
+ break;
+ case AV_PIX_FMT_GBRP14BE:
+ c->readLumPlanar = planar_rgb14be_to_y_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP16BE:
+ c->readAlpPlanar = planar_rgb16be_to_a_vsx;
+ case AV_PIX_FMT_GBRP16BE:
+ c->readLumPlanar = planar_rgb16be_to_y_vsx;
+ break;
+ case AV_PIX_FMT_GBRAP:
+ c->readAlpPlanar = planar_rgb_to_a_vsx;
+ case AV_PIX_FMT_GBRP:
+ c->readLumPlanar = planar_rgb_to_y_vsx;
+ break;
+
+ case AV_PIX_FMT_YUV420P9BE:
+ case AV_PIX_FMT_YUV422P9BE:
+ case AV_PIX_FMT_YUV444P9BE:
+ case AV_PIX_FMT_YUV420P10BE:
+ case AV_PIX_FMT_YUV422P10BE:
+ case AV_PIX_FMT_YUV440P10BE:
+ case AV_PIX_FMT_YUV444P10BE:
+ case AV_PIX_FMT_YUV420P12BE:
+ case AV_PIX_FMT_YUV422P12BE:
+ case AV_PIX_FMT_YUV440P12BE:
+ case AV_PIX_FMT_YUV444P12BE:
+ case AV_PIX_FMT_YUV420P14BE:
+ case AV_PIX_FMT_YUV422P14BE:
+ case AV_PIX_FMT_YUV444P14BE:
+ case AV_PIX_FMT_YUV420P16BE:
+ case AV_PIX_FMT_YUV422P16BE:
+ case AV_PIX_FMT_YUV444P16BE:
+
+ case AV_PIX_FMT_GRAY9BE:
+ case AV_PIX_FMT_GRAY10BE:
+ case AV_PIX_FMT_GRAY12BE:
+ case AV_PIX_FMT_GRAY14BE:
+ case AV_PIX_FMT_GRAY16BE:
+
+ case AV_PIX_FMT_P016BE:
+ c->lumToYV12 = bswap16Y_c_vsx;
+ break;
+ case AV_PIX_FMT_YUVA420P9BE:
+ case AV_PIX_FMT_YUVA422P9BE:
+ case AV_PIX_FMT_YUVA444P9BE:
+ case AV_PIX_FMT_YUVA420P10BE:
+ case AV_PIX_FMT_YUVA422P10BE:
+ case AV_PIX_FMT_YUVA444P10BE:
+ case AV_PIX_FMT_YUVA422P12BE:
+ case AV_PIX_FMT_YUVA444P12BE:
+ case AV_PIX_FMT_YUVA420P16BE:
+ case AV_PIX_FMT_YUVA422P16BE:
+ case AV_PIX_FMT_YUVA444P16BE:
+ c->lumToYV12 = bswap16Y_c_vsx;
+ c->alpToYV12 = bswap16Y_c_vsx;
+ break;
+ case AV_PIX_FMT_YA16LE:
+ c->lumToYV12 = read_ya16le_gray_c_vsx;
+ break;
+ case AV_PIX_FMT_YA16BE:
+ c->lumToYV12 = read_ya16be_gray_c_vsx;
+ break;
+ case AV_PIX_FMT_AYUV64LE:
+ c->lumToYV12 = read_ayuv64le_Y_c_vsx;
+ break;
+ case AV_PIX_FMT_YUYV422:
+ case AV_PIX_FMT_YVYU422:
+ case AV_PIX_FMT_YA8:
+ c->lumToYV12 = yuy2ToY_c_vsx;
+ break;
+ case AV_PIX_FMT_UYVY422:
+ c->lumToYV12 = uyvyToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR24:
+ c->lumToYV12 = bgr24ToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR565LE:
+ c->lumToYV12 = bgr16leToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR565BE:
+ c->lumToYV12 = bgr16beToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR555LE:
+ c->lumToYV12 = bgr15leToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR555BE:
+ c->lumToYV12 = bgr15beToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR444LE:
+ c->lumToYV12 = bgr12leToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR444BE:
+ c->lumToYV12 = bgr12beToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB24:
+ c->lumToYV12 = rgb24ToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB565LE:
+ c->lumToYV12 = rgb16leToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB565BE:
+ c->lumToYV12 = rgb16beToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB555LE:
+ c->lumToYV12 = rgb15leToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB555BE:
+ c->lumToYV12 = rgb15beToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB444LE:
+ c->lumToYV12 = rgb12leToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB444BE:
+ c->lumToYV12 = rgb12beToY_c_vsx;
+ break;
+ /*case AV_PIX_FMT_RGB8:
+ case AV_PIX_FMT_BGR8:
+ case AV_PIX_FMT_PAL8:
+ case AV_PIX_FMT_BGR4_BYTE:
+ case AV_PIX_FMT_RGB4_BYTE:
+ c->lumToYV12 = palToY_c_vsx;
+ break;*/
+ case AV_PIX_FMT_MONOBLACK:
+ c->lumToYV12 = monoblack2Y_c_vsx;
+ break;
+ case AV_PIX_FMT_MONOWHITE:
+ c->lumToYV12 = monowhite2Y_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB32:
+ c->lumToYV12 = bgr32ToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB32_1:
+ c->lumToYV12 = bgr321ToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR32:
+ c->lumToYV12 = rgb32ToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR32_1:
+ c->lumToYV12 = rgb321ToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB48BE:
+ c->lumToYV12 = rgb48BEToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGB48LE:
+ c->lumToYV12 = rgb48LEToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR48BE:
+ c->lumToYV12 = bgr48BEToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGR48LE:
+ c->lumToYV12 = bgr48LEToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGBA64BE:
+ c->lumToYV12 = rgb64BEToY_c_vsx;
+ break;
+ case AV_PIX_FMT_RGBA64LE:
+ c->lumToYV12 = rgb64LEToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGRA64BE:
+ c->lumToYV12 = bgr64BEToY_c_vsx;
+ break;
+ case AV_PIX_FMT_BGRA64LE:
+ c->lumToYV12 = bgr64LEToY_c_vsx;
+ break;
+ case AV_PIX_FMT_P010LE:
+ c->lumToYV12 = p010LEToY_c_vsx;
+ break;
+ case AV_PIX_FMT_P010BE:
+ c->lumToYV12 = p010BEToY_c_vsx;
+ break;
+ case AV_PIX_FMT_GRAYF32LE:
+ c->lumToYV12 = grayf32ToY16_c_vsx;
+ break;
+ case AV_PIX_FMT_GRAYF32BE:
+ c->lumToYV12 = grayf32ToY16_bswap_c_vsx;
+ break;
+ }
+ if (c->needAlpha) {
+ if (is16BPS(srcFormat) || isNBPS(srcFormat)) {
+ if (HAVE_BIGENDIAN == !isBE(srcFormat) && !c->readAlpPlanar)
+ c->alpToYV12 = bswap16Y_c_vsx;
+ }
+ switch (srcFormat) {
+ case AV_PIX_FMT_BGRA64LE:
+ case AV_PIX_FMT_RGBA64LE: c->alpToYV12 = rgba64leToA_c_vsx; break;
+ case AV_PIX_FMT_BGRA64BE:
+ case AV_PIX_FMT_RGBA64BE: c->alpToYV12 = rgba64beToA_c_vsx; break;
+ case AV_PIX_FMT_BGRA:
+ case AV_PIX_FMT_RGBA:
+ c->alpToYV12 = rgbaToA_c_vsx;
+ break;
+ case AV_PIX_FMT_ABGR:
+ case AV_PIX_FMT_ARGB:
+ c->alpToYV12 = abgrToA_c_vsx;
+ break;
+ case AV_PIX_FMT_YA8:
+ c->alpToYV12 = uyvyToY_c_vsx;
+ break;
+ case AV_PIX_FMT_YA16LE:
+ c->alpToYV12 = read_ya16le_alpha_c_vsx;
+ break;
+ case AV_PIX_FMT_YA16BE:
+ c->alpToYV12 = read_ya16be_alpha_c_vsx;
+ break;
+ case AV_PIX_FMT_AYUV64LE:
+ c->alpToYV12 = read_ayuv64le_A_c_vsx;
+ break;
+ /*case AV_PIX_FMT_PAL8 :
+ c->alpToYV12 = palToA_c_vsx;
+ break;*/
+ }
+ }
+#endif //!HAVE_BIGENDIAN
+#endif //HAVE_VSX
+}
diff --git a/libswscale/swscale.c b/libswscale/swscale.c
index 9cb7e8f6ac..60dc8d6985 100644
--- a/libswscale/swscale.c
+++ b/libswscale/swscale.c
@@ -557,6 +557,8 @@ static av_cold void sws_init_swscale(SwsContext *c)
&c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
ff_sws_init_input_funcs(c);
+ if (ARCH_PPC)
+ ff_sws_init_input_funcs_vsx(c);
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index d207d3beff..61fb1e4dc2 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -872,6 +872,7 @@ void ff_get_unscaled_swscale_aarch64(SwsContext *c);
SwsFunc ff_getSwsFunc(SwsContext *c);
void ff_sws_init_input_funcs(SwsContext *c);
+void ff_sws_init_input_funcs_vsx(SwsContext *c);
void ff_sws_init_output_funcs(SwsContext *c,
yuv2planar1_fn *yuv2plane1,
yuv2planarX_fn *yuv2planeX,
--
2.17.1