[FFmpeg-devel] [RFC][PATCH] DSPUtilize some functions from APE decoder
Kostya
kostya.shishkov
Wed Jul 2 15:26:25 CEST 2008
I'm not satisfied with the decoding speed of APE decoder,
so I've decided to finally dsputilize functions marked as such.
Altivec version is in development.
-------------- next part --------------
Index: libavcodec/i386/dsputil_mmx.c
===================================================================
--- libavcodec/i386/dsputil_mmx.c (revision 14044)
+++ libavcodec/i386/dsputil_mmx.c (working copy)
@@ -2061,6 +2061,66 @@
extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
+
/**
 * Add the values of v2 to v1 (v1[i] += v2[i]), 8 int16 elements per step.
 * v1 must be 16-byte aligned (movdqa); v2 may be unaligned (movdqu).
 * order is assumed to be a multiple of 8.
 *
 * Fix vs. original: the asm statement reads and writes memory, so it must
 * declare a "memory" clobber (and the xmm registers it uses) — otherwise the
 * compiler is free to cache *v1 across the asm or reorder it away.
 */
static void vector_int16_add_sse(int16_t * v1, int16_t * v2, int order)
{
    int i;
    for (i = 0; i < order; i += 8) {
        __asm__ volatile(
            "movdqa (%0), %%xmm0 \n\t"
            "movdqu (%1), %%xmm1 \n\t"
            "paddw  %%xmm1, %%xmm0 \n\t"
            "movdqa %%xmm0, (%0) \n\t"
            : /* no outputs; v1 is written through memory */
            : "r"(v1 + i), "r"(v2 + i)
            : "memory", "xmm0", "xmm1"
        );
    }
}
+
/**
 * Subtract the values of v2 from v1 (v1[i] -= v2[i]), 8 int16 elements per step.
 * v1 must be 16-byte aligned (movdqa); v2 may be unaligned (movdqu).
 * order is assumed to be a multiple of 8.
 *
 * Fix vs. original: declare "memory" and xmm clobbers — the asm reads and
 * writes memory, and the compiler must be told so.
 */
static void vector_int16_sub_sse(int16_t * v1, int16_t * v2, int order)
{
    int i;
    for (i = 0; i < order; i += 8) {
        __asm__ volatile(
            "movdqa (%0), %%xmm0 \n\t"
            "movdqu (%1), %%xmm1 \n\t"
            "psubw  %%xmm1, %%xmm0 \n\t"
            "movdqa %%xmm0, (%0) \n\t"
            : /* no outputs; v1 is written through memory */
            : "r"(v1 + i), "r"(v2 + i)
            : "memory", "xmm0", "xmm1"
        );
    }
}
+
/**
 * Scalar product of two int16 vectors: sum(v1[i] * v2[i]).
 * v1 may be unaligned (movdqu); v2 must be 16-byte aligned (movdqa).
 * order is assumed to be a multiple of 8.
 *
 * Fixes vs. original:
 *  - The old code accumulated into %xmm7 across SEPARATE asm statements.
 *    GCC gives no guarantee that a register survives between asm blocks, so
 *    the result was at the compiler's mercy. Each partial sum is now moved
 *    out through a proper output operand and accumulated in C.
 *  - The final store wrote to *resp without a memory clobber, so the compiler
 *    could legally assume `res` was never written.
 */
static int32_t vector_int16_scalarproduct_sse(int16_t * v1, int16_t * v2, int order)
{
    int i;
    int32_t res = 0;

    for (i = 0; i < order; i += 8) {
        int32_t part;
        __asm__ volatile(
            "movdqu  (%1), %%xmm0        \n\t"
            "movdqa  (%2), %%xmm1        \n\t"
            "pmaddwd %%xmm1, %%xmm0      \n\t" /* 8x 16-bit products -> 4x 32-bit sums */
            "movhlps %%xmm0, %%xmm1      \n\t" /* horizontal reduction: high half ... */
            "paddd   %%xmm1, %%xmm0      \n\t"
            "pshufd  $0x01, %%xmm0, %%xmm1 \n\t"
            "paddd   %%xmm1, %%xmm0      \n\t" /* ... total now in low dword */
            "movd    %%xmm0, %0          \n\t"
            : "=r"(part)
            : "r"(v1 + i), "r"(v2 + i)
            : "memory", "xmm0", "xmm1"
        );
        res += part;
    }
    return res;
}
+
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
mm_flags = mm_support();
@@ -2426,6 +2486,9 @@
c->float_to_int16 = float_to_int16_sse;
c->vector_fmul_reverse = vector_fmul_reverse_sse;
c->vector_fmul_add_add = vector_fmul_add_add_sse;
+ c->vector_int16_add = vector_int16_add_sse;
+ c->vector_int16_sub = vector_int16_sub_sse;
+ c->vector_int16_scalarproduct = vector_int16_scalarproduct_sse;
}
if(mm_flags & MM_3DNOW)
c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
Index: libavcodec/ppc/dsputil_altivec.c
===================================================================
--- libavcodec/ppc/dsputil_altivec.c (revision 14044)
+++ libavcodec/ppc/dsputil_altivec.c (working copy)
@@ -1484,6 +1484,60 @@
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
}
+static void vector_int16_add_ppc(int16_t * v1, int16_t * v2, int order)
+{
+ int i;
+ register vector short vec1;
+ DECLARE_ALIGNED_16(int16_t, buf[order]);
+ int16_t *b2 = buf;
+
+ memcpy(buf, v2, order *2);
+ for(i = 0; i < order; i += 8){
+ vec1 = vec_ld(0, v1);
+ vec1 = vec_add(vec1, vec_ld(0, b2));
+ vec_st(vec1, 0, v1);
+ v1 += 8;
+ b2 += 8;
+ }
+}
+
+static void vector_int16_sub_ppc(int16_t * v1, int16_t * v2, int order)
+{
+ int i;
+ register vector short vec1;
+ DECLARE_ALIGNED_16(int16_t, buf[order]);
+ int16_t *b2 = buf;
+
+ memcpy(buf, v2, order *2);
+ for(i = 0; i < order; i += 8){
+ vec1 = vec_ld(0, v1);
+ vec1 = vec_sub(vec1, vec_ld(0, b2));
+ vec_st(vec1, 0, v1);
+ v1 += 8;
+ b2 += 8;
+ }
+}
+
+static int32_t vector_int16_scalarproduct_ppc(int16_t * v1, int16_t * v2, int order)
+{
+ int i;
+ register vector short vec1;
+ register const vector short zero = vec_splat_s16(0);
+ register vector int res = vec_splat_s32(0), t;
+ DECLARE_ALIGNED_16(int, ires);
+
+ for(i = 0; i < order; i += 8){
+ vec1 = vec_ld(0, v1);
+ t = vec_msums(vec1, vec_ld(0, v2), zero);
+ res = vec_sums(t, res);
+ v1 += 8;
+ v2 += 8;
+ }
+ vec_st(res, 0, &ires);
+ return ires;
+}
+
+
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
c->pix_abs[0][1] = sad16_x2_altivec;
@@ -1515,4 +1569,8 @@
c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
if (ENABLE_VORBIS_DECODER)
c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
+
+ c->vector_int16_add = vector_int16_add_ppc;
+ c->vector_int16_sub = vector_int16_sub_ppc;
+ c->vector_int16_scalarproduct = vector_int16_scalarproduct_ppc;
}
Index: libavcodec/apedec.c
===================================================================
--- libavcodec/apedec.c (revision 14044)
+++ libavcodec/apedec.c (working copy)
@@ -161,30 +161,7 @@
} APEContext;
// TODO: dsputilize
-static inline void vector_add(int16_t * v1, int16_t * v2, int order)
-{
- while (order--)
- *v1++ += *v2++;
-}
-// TODO: dsputilize
-static inline void vector_sub(int16_t * v1, int16_t * v2, int order)
-{
- while (order--)
- *v1++ -= *v2++;
-}
-
-// TODO: dsputilize
-static inline int32_t scalarproduct(int16_t * v1, int16_t * v2, int order)
-{
- int res = 0;
-
- while (order--)
- res += *v1++ * *v2++;
-
- return res;
-}
-
static av_cold int ape_decode_init(AVCodecContext * avctx)
{
APEContext *s = avctx->priv_data;
@@ -672,19 +649,19 @@
do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order);
}
-static inline void do_apply_filter(int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
+static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
{
int res;
int absres;
while (count--) {
/* round fixedpoint scalar product */
- res = (scalarproduct(f->delay - order, f->coeffs, order) + (1 << (fracbits - 1))) >> fracbits;
+ res = (ctx->dsp.vector_int16_scalarproduct(f->delay - order, f->coeffs, order) + (1 << (fracbits - 1))) >> fracbits;
if (*data < 0)
- vector_add(f->coeffs, f->adaptcoeffs - order, order);
+ ctx->dsp.vector_int16_add(f->coeffs, f->adaptcoeffs - order, order);
else if (*data > 0)
- vector_sub(f->coeffs, f->adaptcoeffs - order, order);
+ ctx->dsp.vector_int16_sub(f->coeffs, f->adaptcoeffs - order, order);
res += *data;
@@ -736,9 +713,9 @@
int32_t * data0, int32_t * data1,
int count, int order, int fracbits)
{
- do_apply_filter(ctx->fileversion, &f[0], data0, count, order, fracbits);
+ do_apply_filter(ctx, ctx->fileversion, &f[0], data0, count, order, fracbits);
if (data1)
- do_apply_filter(ctx->fileversion, &f[1], data1, count, order, fracbits);
+ do_apply_filter(ctx, ctx->fileversion, &f[1], data1, count, order, fracbits);
}
static void ape_apply_filters(APEContext * ctx, int32_t * decoded0,
Index: libavcodec/dsputil.c
===================================================================
--- libavcodec/dsputil.c (revision 14044)
+++ libavcodec/dsputil.c (working copy)
@@ -3944,6 +3944,26 @@
}
}
/* C reference implementation: v1[i] += v2[i] for the first `order` elements. */
static void vector_int16_add_c(int16_t * v1, int16_t * v2, int order){
    int i;
    for (i = 0; i < order; i++)
        v1[i] += v2[i];
}
+
/* C reference implementation: v1[i] -= v2[i] for the first `order` elements. */
static void vector_int16_sub_c(int16_t * v1, int16_t * v2, int order){
    int i;
    for (i = 0; i < order; i++)
        v1[i] -= v2[i];
}
+
/* C reference implementation: returns sum(v1[i] * v2[i]) over `order` elements. */
static int32_t vector_int16_scalarproduct_c(int16_t * v1, int16_t * v2, int order)
{
    int i;
    int total = 0;

    for (i = 0; i < order; i++)
        total += v1[i] * v2[i];

    return total;
}
+
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -4430,6 +4450,10 @@
c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
c->float_to_int16 = ff_float_to_int16_c;
+ c->vector_int16_add = vector_int16_add_c;
+ c->vector_int16_sub = vector_int16_sub_c;
+ c->vector_int16_scalarproduct = vector_int16_scalarproduct_c;
+
c->shrink[0]= ff_img_copy_plane;
c->shrink[1]= ff_shrink22;
c->shrink[2]= ff_shrink44;
Index: libavcodec/dsputil.h
===================================================================
--- libavcodec/dsputil.h (revision 14044)
+++ libavcodec/dsputil.h (working copy)
@@ -451,6 +451,13 @@
void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
int * range, int * sum, int edges);
+ /* ape functions */
+ /* Add second vector values to the first one. v1 is aligned, v2 is not. */
+ void (*vector_int16_add)(int16_t *v1, int16_t *v2, int len);
+ /* Subtract second vector values from the first one. v1 is aligned, v2 is not. */
+ void (*vector_int16_sub)(int16_t *v1, int16_t *v2, int len);
+ /* Calculate scalar product of two vectors. v1 is unaligned, v2 is aligned. */
+ int32_t (*vector_int16_scalarproduct)(int16_t *v1, int16_t *v2, int len);
} DSPContext;
void dsputil_static_init(void);
More information about the ffmpeg-devel
mailing list