[FFmpeg-cvslog] yuv2planeX10 SIMD
Kieran Kunhya
git at videolan.org
Sun Oct 23 05:53:10 CEST 2011
ffmpeg | branch: master | Kieran Kunhya <kieran at kunhya.com> | Sun Oct 9 16:20:48 2011 +0100| [7fbbf9529397756a31850fe37036f026f34f80fc] | committer: Ronald S. Bultje
yuv2planeX10 SIMD
Signed-off-by: Ronald S. Bultje <rsbultje at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=7fbbf9529397756a31850fe37036f026f34f80fc
---
libswscale/x86/scale.asm | 77 +++++++++++++++++++++++++++++++++++++++++-
libswscale/x86/swscale_mmx.c | 15 ++++++++
2 files changed, 91 insertions(+), 1 deletions(-)
diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index d355894..05e2d96 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -1,6 +1,7 @@
;******************************************************************************
-;* x86-optimized horizontal line scaling functions
+;* x86-optimized horizontal/vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje at gmail.com>
+;* Kieran Kunhya <kieran at kunhya.com>
;*
;* This file is part of Libav.
;*
@@ -28,6 +29,8 @@ max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort: times 8 dw 0x8000
unicoeff: times 4 dd 0x20000000
+yuv2yuvX_10_start: times 4 dd 0x10000
+yuv2yuvX_10_upper: times 8 dw 0x3ff
SECTION .text
@@ -427,3 +430,75 @@ INIT_XMM
SCALE_FUNCS2 sse2, 6, 7, 8
SCALE_FUNCS2 ssse3, 6, 6, 8
SCALE_FUNCS2 sse4, 6, 6, 8
+
+;-----------------------------------------------------------------------------
+; vertical line scaling
+;
+; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
+; const uint8_t *dither, int offset)
+; and
+; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
+; const int16_t **src, uint8_t *dst, int dstW,
+; const uint8_t *dither, int offset)
+;
+; Scale one or $filterSize lines of source data to generate one line of output
+; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
+; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
+; of 2. $offset is either 0 or 3. $dither holds 8 values.
+;-----------------------------------------------------------------------------
+
+%macro yuv2planeX10 1 ; %1 = suffix appended to the generated function name (instantiated as sse4 and avx after this macro)
+
+%ifdef ARCH_X86_32
+%define cntr_reg r1
+%else
+%define cntr_reg r11
+%endif
+
+cglobal yuv2planeX10_%1, 7, 7
+ xor r5, r5 ; r5 = byte offset into the source rows and the output line; comments below assume r0..r4 hold the C arguments in order (filter, filterSize, src, dest, dstW) - TODO confirm against cglobal
+.pixelloop ; one iteration produces mmsize/2 output pixels
+ mova m1, [yuv2yuvX_10_start] ; accumulators start at 0x10000 per dword (1 << 16), half of the final >> 17, so the shift rounds
+ mova m2, m1 ; m2 accumulates the low-half pixels, m1 the high-half pixels
+ movsxdifnidn cntr_reg, r1d ; tap counter = filterSize; NOTE(review): on x86-32 cntr_reg is r1 itself and is counted down to 0 below, so it looks like it is never reloaded for the second and later pixel iterations - confirm
+.filterloop ; consumes two filter taps per iteration, walking from the last tap towards the first
+ pxor m0, m0 ; clear m0 before the coefficient load
+
+ mov r6, [r2+gprsize*cntr_reg-2*gprsize] ; r6 = src[cntr_reg - 2] (row pointers are gprsize bytes each)
+ mova m3, [r6+r5] ; load mmsize bytes of that row at the current offset
+
+ mov r6, [r2+gprsize*cntr_reg-gprsize] ; r6 = src[cntr_reg - 1]
+ mova m4, [r6+r5] ; same pixel positions from the following row
+
+ punpcklwd m5, m3, m4 ; interleave the low words of the two rows into m5
+ punpckhwd m3, m4 ; interleave the high words of the two rows into m3
+
+ movd m0, [r0+2*cntr_reg-4] ; load two adjacent 16-bit coefficients: filter[cntr_reg - 2], filter[cntr_reg - 1]
+ SPLATD m0, m0 ; presumably broadcasts that coefficient pair to every dword of m0 (SPLATD is defined outside this hunk)
+
+ pmaddwd m5, m0 ; each dword = rowA * coeffA + rowB * coeffB (low-half pixels)
+ pmaddwd m3, m0 ; same for the high-half pixels
+
+ paddd m2, m5 ; accumulate low-half sums
+ paddd m1, m3 ; accumulate high-half sums
+
+ sub cntr_reg, 2 ; two taps consumed
+ jg .filterloop ; repeat while taps remain
+
+ psrad m2, 17 ; arithmetic shift drops 17 low bits; rounding bias was added at accumulator init
+ psrad m1, 17 ; same for the high-half sums
+
+ packusdw m2, m1 ; pack both halves to 16-bit words with unsigned saturation (negative sums become 0)
+ pminsw m2, [yuv2yuvX_10_upper] ; limit each word to 0x3ff, the largest 10-bit value; NOTE(review): signed compare, so words >= 0x8000 would pass through - confirm they cannot occur
+ mova [r3+r5], m2 ; store mmsize/2 output pixels to the destination line
+
+ add r5, mmsize ; advance the byte offset to the next group of pixels
+ sub r4d, mmsize/2 ; remaining width -= pixels written this iteration
+ jg .pixelloop ; loop while pixels remain; a width that is not a multiple of mmsize/2 still gets a full-register store
+ REP_RET ; function return (REP_RET is defined outside this hunk)
+%endmacro
+
+INIT_XMM
+yuv2planeX10 sse4
+INIT_AVX
+yuv2planeX10 avx
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index dd7aea1..3c0632d 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -211,6 +211,14 @@ SCALE_FUNCS_SSE(sse2);
SCALE_FUNCS_SSE(ssse3);
SCALE_FUNCS_SSE(sse4);
+extern void ff_yuv2planeX10_sse4(const int16_t *filter, int filterSize, /* SSE4 instantiation of the yuv2planeX10 macro in scale.asm (ff_ prefix presumably added by cglobal) */
+ const int16_t **src, uint8_t *dest, int dstW, /* src: filterSize row pointers; dest: output line written as 16-bit words by the asm */
+ const uint8_t *dither, int offset); /* NOTE(review): the visible asm overwrites the registers for dither/offset without reading them */
+
+extern void ff_yuv2planeX10_avx(const int16_t *filter, int filterSize, /* AVX instantiation of the same macro; same contract as the SSE4 variant */
+ const int16_t **src, uint8_t *dest, int dstW, /* installed as c->yuv2planeX when dstBpc == 10 and the destination format is not big-endian */
+ const uint8_t *dither, int offset); /* dither/offset: kept for the yuv2planeX function-pointer signature */
+
void ff_sws_init_swScale_mmx(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -270,6 +278,13 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
/* Xto15 don't need special sse4 functions */
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
+ if (c->dstBpc == 10 && !isBE(c->dstFormat))
+ c->yuv2planeX = ff_yuv2planeX10_sse4;
+ }
+
+ if (cpu_flags & AV_CPU_FLAG_AVX) {
+ if (c->dstBpc == 10 && !isBE(c->dstFormat))
+ c->yuv2planeX = ff_yuv2planeX10_avx;
}
#endif
}
More information about the ffmpeg-cvslog
mailing list