[Ffmpeg-devel] [PATCH] put_mpeg4_qpel16_h_lowpass altivec, take 2
Brian Foley
bfoley
Sun Nov 26 03:11:17 CET 2006
On Mon, Nov 20, 2006 at 02:43:17AM +0100, Michael Niedermayer wrote:
> Hi
>
> On Sun, Nov 19, 2006 at 11:20:14PM +0000, Brian Foley wrote:
> > +static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
> code duplication, move copy_block17 to a common header like dsputil.h or
> dsputil_internal.h or whatever, don't copy and paste it
OK, done. I've moved all the copy_block* stuff into dsputil.h; it turns out
sh4 was using a copy of this too.
> > +static void put_pixels16_l2_altivec(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride,
> > + int src_stride1, int src_stride2, int h)
> > +{
>
> code duplication
I've removed the duplicated integer code, and replaced it with a
conditional unaligned store.
> > + for(i=0; i<h; i++) {
> > + src1v = vec_perm(vec_ld(0, src1), vec_ld(15, src1), vec_lvsl(0, src1));
> > + src2v = vec_perm(vec_ld(0, src2), vec_ld(15, src2), vec_lvsl(0, src2));
>
> one of the two is in many cases aligned
I'm not really sure of the best way to handle this. I could do an aligned
load inside an 'if (((int) src & 0xf) == 0)', but I suspect the branch would
hurt us quite badly (see the sketch below). The other approach is to have
three more variants of the function where we assert that src1, src2, or both
(and their strides) are aligned. I guess I should look at it with simg4 at
some point, but there are much more CPU-intensive functions that could be
sped up first.
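
For reference, this is the kind of branchy helper I mean. It's an untested
sketch, and load16 is just an illustrative name:

    static inline vector unsigned char load16(const uint8_t *p)
    {
        if (((int)p & 0xf) == 0)
            return vec_ld(0, p); /* aligned: one load, no permute */
        /* unaligned: merge the two quadwords straddling p */
        return vec_perm(vec_ld(0, p), vec_ld(15, p), vec_lvsl(0, p));
    }

The three-specialised-variants approach would avoid the per-iteration branch
entirely, at the cost of more code.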
> > +static void put_mpeg4_qpel16_h_lowpass_altivec(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
> src too is aligned in several cases where this function is called
As above.
> also isn't vec_lvsl(0, src) == vec_lvsl(16, src)?
D'oh! Yes it is; vec_lvsl depends only on the address modulo 16, so the two
calls return the same permute vector. Fixed.
> > + /* Positive columns */
> > + ph = vec_add(
> > + vec_add(vec_mulo(v1h, twenty), vec_mulo(v2h, twenty)),
> > + vec_add(vec_mulo(v5h, three), vec_mulo(v6h, three))
> > + );
> > + pl = vec_adds(
> > + vec_adds(vec_mulo(v1l, twenty), vec_mulo(v2l, twenty)),
> > + vec_adds(vec_mulo(v5l, three), vec_mulo(v6l, three)));
> > +
> > + /* Negative columns */
> > + mh = vec_adds(
> > + vec_add(vec_mulo(v3h, six), vec_mulo(v4h, six)),
> > + vec_add(v7h, v8h)
> > + );
> > + ml = vec_adds(
> > + vec_adds(vec_mulo(v3l, six), vec_mulo(v4l, six)),
> > + vec_adds(v7l, v8l)
> > + );
> > +
> > + /* Add the positive and negative components */
> > + tmph = vec_subs(ph, mh);
> > + tmpl = vec_subs(pl, ml);
> > +
> > + tmph = vec_sra(vec_add(tmph, sixteen), five);
> > + tmpl = vec_sra(vec_add(tmpl, sixteen), five);
> > +
> > + /* This store requires a 16-byte aligned dst! */
> > + vec_st(vec_packsu(tmph, tmpl), 0, dst);
>
> 16 add
> 12 mul
> 2 shift
> 1 pack
>
> maybe the following is faster (it replaces 12 muls with 10 adds)
> though maybe it's not, I don't know ppc well enough ...
[...]
Interesting. I implemented your suggestion, and it does work out a little
faster. It also saves setting up a couple of constant vectors, which is no
bad thing, and the other tweaks suggested here save reading another constant
vector from memory as well.
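
For the record, here's why the add-only version computes the same filter.
Writing A = a+h, B = b+g, C = c+f, D = d+e for the paired columns:

    C' = C - 4*D
    B' = B - 2*C'   = B - 2*C + 8*D
    3*B' - A - 4*D  = 3*B - 6*C + 20*D - A

which is exactly 20*(d+e) - 6*(c+f) + 3*(b+g) - (a+h).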
> > +static void put_mpeg4_qpel16_v_lowpass_altivec(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){
>
> this code is also duplicated more or less
Yes, this is duplicated too, but it's just a placeholder (which happens
to be required for the code to compile) until I sort out an AltiVec version
of it, which I plan to do ASAP. I didn't want to go messing with the
original in the PIXOP macro in dsputil.c.
I've tidied up the use of types so that it compiles cleanly with GCC 3.4 on
Linux/PPC. Hopefully it's about ready to commit now.
Cheers,
Brian.
-------------- next part --------------
Index: ppc/mpeg4_altivec.c
===================================================================
--- ppc/mpeg4_altivec.c (revision 0)
+++ ppc/mpeg4_altivec.c (revision 0)
@@ -0,0 +1,384 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "../dsputil.h"
+
+#include "gcc_fixes.h"
+
+#include "dsputil_altivec.h"
+
+static void put_pixels16_l2_altivec(uint8_t *dst, const uint8_t *src1,
+ const uint8_t *src2, int dst_stride, int src_stride1,
+ int src_stride2, int h)
+{
+ register vector unsigned char src1v, src2v, dstv;
+ register vector unsigned char tmp1, tmp2, mask, edges, align;
+ int i;
+
+ for(i=0; i<h; i++) {
+ /* Unaligned load */
+ src1v = vec_perm(
+ vec_ld(0, src1), vec_ld(15, src1), vec_lvsl(0, src1));
+ src2v = vec_perm(
+ vec_ld(0, src2), vec_ld(15, src2), vec_lvsl(0, src2));
+
+ /* AltiVec's vec_avg is exactly the (a+b+1)>>1 that we want */
+ dstv = vec_avg(src1v, src2v);
+
+ if ((int)dst & 0xf) {
+ /* Unaligned store */
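+ /*
+ * Read-modify-write: load the two quadwords covering dst, rotate
+ * dstv into position with lvsr, and store both halves back so the
+ * bytes outside dst are left untouched.
+ */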
+ tmp2 = vec_ld(15, dst);
+ tmp1 = vec_ld(0, dst);
+
+ mask = vec_lvsl(0, dst);
+ edges = vec_perm(tmp2, tmp1, mask);
+ align = vec_lvsr(0, dst);
+
+ tmp1 = vec_perm(edges, dstv, align);
+ tmp2 = vec_perm(dstv, edges, align);
+
+ vec_st(tmp2, 15, dst);
+ vec_st(tmp1, 0 , dst);
+ } else {
+ /* Aligned store */
+ vec_st(dstv, 0, dst);
+ }
+
+ src1 += src_stride1;
+ src2 += src_stride2;
+ dst += dst_stride;
+ }
+}
+
+static void put_mpeg4_qpel16_h_lowpass_altivec(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride, int height)
+{
+ POWERPC_PERF_DECLARE(put_mpeg4_qpel16_h_lowpass_altivec, 1);
+
+ #ifdef POWERPC_PERFORMANCE_REPORT
+ POWERPC_PERF_START_COUNT(put_mpeg4_qpel16_h_lowpass_altivec, 1);
+ #endif
+ int i;
+ vector unsigned char srcAv, srcBv;
+ const_vector unsigned char zero = (const_vector unsigned char)
+ vec_splat_u8(0);
+ const_vector unsigned short five = (const_vector unsigned short)
+ vec_splat_u16(5);
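+ /* splat immediates only reach 15, so 16 has to be built as 8 + 8 */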
+ const_vector short sixteen = (const_vector short)
+ vec_add(vec_splat_s16(8), vec_splat_s16(8));
+
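+ /*
+ * Permute masks picking the shifted filter columns out of the 32
+ * loaded source bytes; p3--p8 also encode the mirroring the scalar
+ * code applies at the block edges.
+ */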
+ const_vector unsigned char p2 = (const_vector unsigned char)
+ vec_lvsl(1, (unsigned char *)0);
+ const_vector unsigned char p3 = (const_vector unsigned char)
+ AVV( 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14);
+ const_vector unsigned char p4 = (const_vector unsigned char)
+ AVV( 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,16);
+ const_vector unsigned char p5 = (const_vector unsigned char)
+ AVV( 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13);
+ const_vector unsigned char p6 = (const_vector unsigned char)
+ AVV( 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,16,15);
+ const_vector unsigned char p7 = (const_vector unsigned char)
+ AVV( 2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12);
+ const_vector unsigned char p8 = (const_vector unsigned char)
+ AVV( 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,16,15,14);
+
+ vector unsigned char a, b, c, e, f, g, h;
+ vector short ah, bh, ch, dh, eh, fh;
+ vector short al, bl, cl, dl, el, fl;
+ vector short gh, hh, gl, hl;
+ vector short tmph, tmpl;
+
+ for(i=0; i<height; i++)
+ {
+ /* Read unaligned: srcAv = src[0]--src[15] srcBv = src[16]--src[31] */
+ vector unsigned char permute = vec_lvsl(0, src);
+ srcAv = vec_perm(vec_ld( 0, src), vec_ld(15, src), permute);
+ srcBv = vec_perm(vec_ld(16, src), vec_ld(31, src), permute);
+
+ /* a -- h are the src[...] columns in the C function */
+ e = vec_perm(srcAv, srcBv, p2);
+ c = vec_perm(srcAv, srcBv, p3);
+ f = vec_perm(srcAv, srcBv, p4);
+ b = vec_perm(srcAv, srcBv, p5);
+ g = vec_perm(srcAv, srcBv, p6);
+ a = vec_perm(srcAv, srcBv, p7);
+ h = vec_perm(srcAv, srcBv, p8);
+ /*
+ * Split each 16 x uint8_t vector into a pair of 8 x int16_t vectors,
+ * since we're going to do arithmetic that overflows 8 bits.
+ */
+ ah = (vector short) vec_mergeh(zero, a);
+ al = (vector short) vec_mergel(zero, a);
+ bh = (vector short) vec_mergeh(zero, b);
+ bl = (vector short) vec_mergel(zero, b);
+ ch = (vector short) vec_mergeh(zero, c);
+ cl = (vector short) vec_mergel(zero, c);
+ dh = (vector short) vec_mergeh(zero, srcAv);
+ dl = (vector short) vec_mergel(zero, srcAv);
+ eh = (vector short) vec_mergeh(zero, e);
+ el = (vector short) vec_mergel(zero, e);
+ fh = (vector short) vec_mergeh(zero, f);
+ fl = (vector short) vec_mergel(zero, f);
+ gh = (vector short) vec_mergeh(zero, g);
+ gl = (vector short) vec_mergel(zero, g);
+ hh = (vector short) vec_mergeh(zero, h);
+ hl = (vector short) vec_mergel(zero, h);
+
+ /*
+ * Michael's strength-reduced filter: with A = a+h, B = b+g, C = c+f
+ * and D = d+e, compute 20*D - 6*C + 3*B - A using only adds and
+ * subtracts.
+ */
+ dh = vec_add(dh, eh); dl = vec_add(dl, el); /* D = d + e */
+ ch = vec_add(ch, fh); cl = vec_add(cl, fl); /* C = c + f */
+ bh = vec_add(bh, gh); bl = vec_add(bl, gl); /* B = b + g */
+ ah = vec_add(ah, hh); al = vec_add(al, hl); /* A = a + h */
+ dh = vec_add(dh, dh); dl = vec_add(dl, dl); /* 2*D */
+ dh = vec_add(dh, dh); dl = vec_add(dl, dl); /* 4*D */
+ ch = vec_sub(ch, dh); cl = vec_sub(cl, dl); /* C' = C - 4*D */
+ bh = vec_sub(bh, vec_add(ch, ch)); /* B' = B - 2*C' */
+ bl = vec_sub(bl, vec_add(cl, cl));
+ bh = vec_add(bh, vec_add(bh, bh)); /* 3*B' */
+ bl = vec_add(bl, vec_add(bl, bl));
+ bh = vec_sub(bh, ah); bl = vec_sub(bl, al); /* 3*B' - A */
+ bh = vec_sub(bh, dh); bl = vec_sub(bl, dl); /* minus 4*D: the result */
+
+ /*
+ * Finally do cm[(x+16)>>5] and pack the 16 int16_t results back into
+ * 16 uint8_t. The intermediates fit comfortably in 16 bits, so there
+ * is no overflow to worry about; we do, however, need results >= 0x100
+ * clamped to 0xff and negative results clamped to 0, which is exactly
+ * what vec_packsu does.
+ */
+
+ /*
+ * When we get around to using a macro to generate put & avg
+ * variants of this, we just need to replace these two lines
+ */
+ tmph = vec_sra(vec_add(bh, sixteen), five);
+ tmpl = vec_sra(vec_add(bl, sixteen), five);
+
+ /* This store requires a 16-byte aligned dst! */
+ vec_st(vec_packsu(tmph, tmpl), 0, dst);
+
+ dst+=dstStride;
+ src+=srcStride;
+ }
+
+ POWERPC_PERF_STOP_COUNT(put_mpeg4_qpel16_h_lowpass_altivec, 1);
+}
+
+/*
+ * This is a placeholder copied from dsputil.c, needed for the rest of the
+ * code to build. It will be replaced with AltiVec soon, since it shows up
+ * as a hot spot when decoding qpel video.
+ */
+static void put_mpeg4_qpel16_v_lowpass_altivec(uint8_t *dst, uint8_t *src,
+ int dstStride, int srcStride) {
+ uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
+ int i;
+ const int w=16;
+ for(i=0; i<w; i++)
+ {
+ const int src0 = src[0*srcStride];
+ const int src1 = src[1*srcStride];
+ const int src2 = src[2*srcStride];
+ const int src3 = src[3*srcStride];
+ const int src4 = src[4*srcStride];
+ const int src5 = src[5*srcStride];
+ const int src6 = src[6*srcStride];
+ const int src7 = src[7*srcStride];
+ const int src8 = src[8*srcStride];
+ const int src9 = src[9*srcStride];
+ const int src10 = src[10*srcStride];
+ const int src11 = src[11*srcStride];
+ const int src12 = src[12*srcStride];
+ const int src13 = src[13*srcStride];
+ const int src14 = src[14*srcStride];
+ const int src15 = src[15*srcStride];
+ const int src16 = src[16*srcStride];
+
+ #define op_put(a, b) a = cm[((b) + 16)>>5]
+ op_put(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));
+ op_put(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));
+ op_put(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));
+ op_put(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));
+ op_put(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));
+ op_put(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));
+ op_put(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));
+ op_put(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));
+ op_put(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));
+ op_put(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));
+ op_put(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));
+ op_put(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));
+ op_put(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));
+ op_put(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));
+ op_put(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));
+ op_put(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));
+ #undef op_put
+ dst++;
+ src++;
+ }
+}
+
+static void put_qpel16_mc00_altivec(uint8_t *dst, uint8_t *src, int stride){
+ put_pixels16_altivec(dst, src, stride, 16);
+}
+
+static void put_qpel16_mc10_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) half[256];
+ put_mpeg4_qpel16_h_lowpass_altivec(half, src, 16, stride, 16);
+ put_pixels16_l2_altivec(dst, src, half, stride, stride, 16, 16);
+}
+
+static void put_qpel16_mc20_altivec(uint8_t *dst, uint8_t *src, int stride){
+ put_mpeg4_qpel16_h_lowpass_altivec(dst, src, stride, stride, 16);
+}
+
+static void put_qpel16_mc30_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) half[256];
+ put_mpeg4_qpel16_h_lowpass_altivec(half, src, 16, stride, 16);
+ put_pixels16_l2_altivec(dst, src+1, half, stride, stride, 16, 16);
+}
+
+static void put_qpel16_mc01_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) full[24*17];
+ uint8_t __attribute__((aligned(16))) half[256];
+ copy_block17(full, src, 24, stride, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(half, full, 16, 24);
+ put_pixels16_l2_altivec(dst, full, half, stride, 24, 16, 16);
+}
+
+static void put_qpel16_mc02_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) full[24*17];
+ copy_block17(full, src, 24, stride, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(dst, full, stride, 24);
+}
+
+static void put_qpel16_mc03_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) full[24*17];
+ uint8_t __attribute__((aligned(16))) half[256];
+ copy_block17(full, src, 24, stride, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(half, full, 16, 24);
+ put_pixels16_l2_altivec(dst, full+24, half, stride, 24, 16, 16);
+}
+
+static void put_qpel16_mc11_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) full[24*17];
+ uint8_t __attribute__((aligned(16))) halfH[272];
+ uint8_t __attribute__((aligned(16))) halfHV[256];
+ copy_block17(full, src, 24, stride, 17);
+ put_mpeg4_qpel16_h_lowpass_altivec(halfH, full, 16, 24, 17);
+ put_pixels16_l2_altivec(halfH, halfH, full, 16, 16, 24, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(halfHV, halfH, 16, 16);
+ put_pixels16_l2_altivec(dst, halfH, halfHV, stride, 16, 16, 16);
+}
+
+static void put_qpel16_mc31_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) full[24*17];
+ uint8_t __attribute__((aligned(16))) halfH[272];
+ uint8_t __attribute__((aligned(16))) halfHV[256];
+ copy_block17(full, src, 24, stride, 17);
+ put_mpeg4_qpel16_h_lowpass_altivec(halfH, full, 16, 24, 17);
+ put_pixels16_l2_altivec(halfH, halfH, full+1, 16, 16, 24, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(halfHV, halfH, 16, 16);
+ put_pixels16_l2_altivec(dst, halfH, halfHV, stride, 16, 16, 16);
+}
+
+static void put_qpel16_mc13_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) full[24*17];
+ uint8_t __attribute__((aligned(16))) halfH[272];
+ uint8_t __attribute__((aligned(16))) halfHV[256];
+ copy_block17(full, src, 24, stride, 17);
+ put_mpeg4_qpel16_h_lowpass_altivec(halfH, full, 16, 24, 17);
+ put_pixels16_l2_altivec(halfH, halfH, full, 16, 16, 24, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(halfHV, halfH, 16, 16);
+ put_pixels16_l2_altivec(dst, halfH+16, halfHV, stride, 16, 16, 16);
+}
+
+static void put_qpel16_mc33_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) full[24*17];
+ uint8_t __attribute__((aligned(16))) halfH[272];
+ uint8_t __attribute__((aligned(16))) halfHV[256];
+ copy_block17(full, src, 24, stride, 17);
+ put_mpeg4_qpel16_h_lowpass_altivec(halfH, full, 16, 24, 17);
+ put_pixels16_l2_altivec(halfH, halfH, full+1, 16, 16, 24, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(halfHV, halfH, 16, 16);
+ put_pixels16_l2_altivec(dst, halfH+16, halfHV, stride, 16, 16, 16);
+}
+
+static void put_qpel16_mc21_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) halfH[272];
+ uint8_t __attribute__((aligned(16))) halfHV[256];
+ put_mpeg4_qpel16_h_lowpass_altivec(halfH, src, 16, stride, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(halfHV, halfH, 16, 16);
+ put_pixels16_l2_altivec(dst, halfH, halfHV, stride, 16, 16, 16);
+}
+
+static void put_qpel16_mc23_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) halfH[272];
+ uint8_t __attribute__((aligned(16))) halfHV[256];
+ put_mpeg4_qpel16_h_lowpass_altivec(halfH, src, 16, stride, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(halfHV, halfH, 16, 16);
+ put_pixels16_l2_altivec(dst, halfH+16, halfHV, stride, 16, 16, 16);
+}
+
+static void put_qpel16_mc12_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) full[24*17];
+ uint8_t __attribute__((aligned(16))) halfH[272];
+ copy_block17(full, src, 24, stride, 17);
+ put_mpeg4_qpel16_h_lowpass_altivec(halfH, full, 16, 24, 17);
+ put_pixels16_l2_altivec(halfH, halfH, full, 16, 16, 24, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(dst, halfH, stride, 16);
+}
+
+static void put_qpel16_mc32_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) full[24*17];
+ uint8_t __attribute__((aligned(16))) halfH[272];
+ copy_block17(full, src, 24, stride, 17);
+ put_mpeg4_qpel16_h_lowpass_altivec(halfH, full, 16, 24, 17);
+ put_pixels16_l2_altivec(halfH, halfH, full+1, 16, 16, 24, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(dst, halfH, stride, 16);
+}
+
+static void put_qpel16_mc22_altivec(uint8_t *dst, uint8_t *src, int stride){
+ uint8_t __attribute__((aligned(16))) halfH[272];
+ put_mpeg4_qpel16_h_lowpass_altivec(halfH, src, 16, stride, 17);
+ put_mpeg4_qpel16_v_lowpass_altivec(dst, halfH, stride, 16);
+}
+
+void dsputil_mpeg4_init_altivec(DSPContext* c, AVCodecContext *avctx)
+{
+ if (has_altivec()) {
+ c->put_qpel_pixels_tab[0][ 0] = put_qpel16_mc00_altivec;
+ c->put_qpel_pixels_tab[0][ 1] = put_qpel16_mc10_altivec;
+ c->put_qpel_pixels_tab[0][ 2] = put_qpel16_mc20_altivec;
+ c->put_qpel_pixels_tab[0][ 3] = put_qpel16_mc30_altivec;
+ c->put_qpel_pixels_tab[0][ 4] = put_qpel16_mc01_altivec;
+ c->put_qpel_pixels_tab[0][ 5] = put_qpel16_mc11_altivec;
+ c->put_qpel_pixels_tab[0][ 6] = put_qpel16_mc21_altivec;
+ c->put_qpel_pixels_tab[0][ 7] = put_qpel16_mc31_altivec;
+ c->put_qpel_pixels_tab[0][ 8] = put_qpel16_mc02_altivec;
+ c->put_qpel_pixels_tab[0][ 9] = put_qpel16_mc12_altivec;
+ c->put_qpel_pixels_tab[0][10] = put_qpel16_mc22_altivec;
+ c->put_qpel_pixels_tab[0][11] = put_qpel16_mc32_altivec;
+ c->put_qpel_pixels_tab[0][12] = put_qpel16_mc03_altivec;
+ c->put_qpel_pixels_tab[0][13] = put_qpel16_mc13_altivec;
+ c->put_qpel_pixels_tab[0][14] = put_qpel16_mc23_altivec;
+ c->put_qpel_pixels_tab[0][15] = put_qpel16_mc33_altivec;
+ }
+}
Index: ppc/dsputil_ppc.c
===================================================================
--- ppc/dsputil_ppc.c (revision 7166)
+++ ppc/dsputil_ppc.c (working copy)
@@ -35,6 +35,7 @@
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx);
+void dsputil_mpeg4_init_altivec(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx);
void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx);
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx);
@@ -274,7 +275,8 @@
}
#ifdef HAVE_ALTIVEC
- if(ENABLE_H264_DECODER) dsputil_h264_init_ppc(c, avctx);
+ if (ENABLE_H264_DECODER) dsputil_h264_init_ppc(c, avctx);
+ if (ENABLE_MPEG4_DECODER) dsputil_mpeg4_init_altivec(c, avctx);
if (has_altivec()) {
mm_flags |= MM_ALTIVEC;
Index: dsputil.c
===================================================================
--- dsputil.c (revision 7166)
+++ dsputil.c (working copy)
@@ -1513,83 +1513,7 @@
}
}
-static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-{
- int i;
- for(i=0; i<h; i++)
- {
- ST16(dst , LD16(src ));
- dst+=dstStride;
- src+=srcStride;
- }
-}
-static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-{
- int i;
- for(i=0; i<h; i++)
- {
- ST32(dst , LD32(src ));
- dst+=dstStride;
- src+=srcStride;
- }
-}
-
-static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-{
- int i;
- for(i=0; i<h; i++)
- {
- ST32(dst , LD32(src ));
- ST32(dst+4 , LD32(src+4 ));
- dst+=dstStride;
- src+=srcStride;
- }
-}
-
-static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-{
- int i;
- for(i=0; i<h; i++)
- {
- ST32(dst , LD32(src ));
- ST32(dst+4 , LD32(src+4 ));
- ST32(dst+8 , LD32(src+8 ));
- ST32(dst+12, LD32(src+12));
- dst+=dstStride;
- src+=srcStride;
- }
-}
-
-static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-{
- int i;
- for(i=0; i<h; i++)
- {
- ST32(dst , LD32(src ));
- ST32(dst+4 , LD32(src+4 ));
- ST32(dst+8 , LD32(src+8 ));
- ST32(dst+12, LD32(src+12));
- dst[16]= src[16];
- dst+=dstStride;
- src+=srcStride;
- }
-}
-
-static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-{
- int i;
- for(i=0; i<h; i++)
- {
- ST32(dst , LD32(src ));
- ST32(dst+4 , LD32(src+4 ));
- dst[8]= src[8];
- dst+=dstStride;
- src+=srcStride;
- }
-}
-
-
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
Index: dsputil.h
===================================================================
--- dsputil.h (revision 7166)
+++ dsputil.h (working copy)
@@ -698,4 +698,81 @@
return score;\
}
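+
+/* Plain block copies shared by the qpel code on several architectures. */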
+static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+ int i;
+ for(i=0; i<h; i++)
+ {
+ ST16(dst , LD16(src ));
+ dst+=dstStride;
+ src+=srcStride;
+ }
+}
+
+static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+ int i;
+ for(i=0; i<h; i++)
+ {
+ ST32(dst , LD32(src ));
+ dst+=dstStride;
+ src+=srcStride;
+ }
+}
+
+static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+ int i;
+ for(i=0; i<h; i++)
+ {
+ ST32(dst , LD32(src ));
+ ST32(dst+4 , LD32(src+4 ));
+ dst+=dstStride;
+ src+=srcStride;
+ }
+}
+
+static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+ int i;
+ for(i=0; i<h; i++)
+ {
+ ST32(dst , LD32(src ));
+ ST32(dst+4 , LD32(src+4 ));
+ ST32(dst+8 , LD32(src+8 ));
+ ST32(dst+12, LD32(src+12));
+ dst+=dstStride;
+ src+=srcStride;
+ }
+}
+
+static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+ int i;
+ for(i=0; i<h; i++)
+ {
+ ST32(dst , LD32(src ));
+ ST32(dst+4 , LD32(src+4 ));
+ ST32(dst+8 , LD32(src+8 ));
+ ST32(dst+12, LD32(src+12));
+ dst[16]= src[16];
+ dst+=dstStride;
+ src+=srcStride;
+ }
+}
+
+static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
+{
+ int i;
+ for(i=0; i<h; i++)
+ {
+ ST32(dst , LD32(src ));
+ ST32(dst+4 , LD32(src+4 ));
+ dst[8]= src[8];
+ dst+=dstStride;
+ src+=srcStride;
+ }
+}
+
+
#endif
Index: Makefile
===================================================================
--- Makefile (revision 7166)
+++ Makefile (working copy)
@@ -383,6 +383,7 @@
sh4/dsputil_align.o \
OBJS-$(TARGET_ALTIVEC) += ppc/dsputil_altivec.o \
+ ppc/mpeg4_altivec.o \
ppc/mpegvideo_altivec.o \
ppc/idct_altivec.o \
ppc/fft_altivec.o \
Index: sh4/qpel.c
===================================================================
--- sh4/qpel.c (revision 7166)
+++ sh4/qpel.c (working copy)
@@ -564,73 +564,7 @@
#undef op_avg
#undef op_put
-/* not yet optimized */
-static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-{
- int i;
- for(i=0; i<h; i++)
- {
- ST32(dst , LD32(src ));
- dst+=dstStride;
- src+=srcStride;
- }
-}
-static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-{
- int i;
- for(i=0; i<h; i++)
- {
- ST32(dst , LD32(src ));
- ST32(dst+4 , LD32(src+4 ));
- dst+=dstStride;
- src+=srcStride;
- }
-}
-
-static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-{
- int i;
- for(i=0; i<h; i++)
- {
- ST32(dst , LD32(src ));
- ST32(dst+4 , LD32(src+4 ));
- ST32(dst+8 , LD32(src+8 ));
- ST32(dst+12, LD32(src+12));
- dst+=dstStride;
- src+=srcStride;
- }
-}
-
-static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-{
- int i;
- for(i=0; i<h; i++)
- {
- ST32(dst , LD32(src ));
- ST32(dst+4 , LD32(src+4 ));
- ST32(dst+8 , LD32(src+8 ));
- ST32(dst+12, LD32(src+12));
- dst[16]= src[16];
- dst+=dstStride;
- src+=srcStride;
- }
-}
-
-static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
-{
- int i;
- for(i=0; i<h; i++)
- {
- ST32(dst , LD32(src ));
- ST32(dst+4 , LD32(src+4 ));
- dst[8]= src[8];
- dst+=dstStride;
- src+=srcStride;
- }
-}
-/* end not optimized */
-
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\