[FFmpeg-cvslog] yuv2planeX10 SIMD

Kieran Kunhya git at videolan.org
Sun Oct 23 05:53:10 CEST 2011


ffmpeg | branch: master | Kieran Kunhya <kieran at kunhya.com> | Sun Oct  9 16:20:48 2011 +0100| [7fbbf9529397756a31850fe37036f026f34f80fc] | committer: Ronald S. Bultje

yuv2planeX10 SIMD

Signed-off-by: Ronald S. Bultje <rsbultje at gmail.com>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=7fbbf9529397756a31850fe37036f026f34f80fc
---

 libswscale/x86/scale.asm     |   77 +++++++++++++++++++++++++++++++++++++++++-
 libswscale/x86/swscale_mmx.c |   15 ++++++++
 2 files changed, 91 insertions(+), 1 deletions(-)

diff --git a/libswscale/x86/scale.asm b/libswscale/x86/scale.asm
index d355894..05e2d96 100644
--- a/libswscale/x86/scale.asm
+++ b/libswscale/x86/scale.asm
@@ -1,6 +1,7 @@
 ;******************************************************************************
-;* x86-optimized horizontal line scaling functions
+;* x86-optimized horizontal/vertical line scaling functions
 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje at gmail.com>
+;*                    Kieran Kunhya <kieran at kunhya.com>
 ;*
 ;* This file is part of Libav.
 ;*
@@ -28,6 +29,8 @@ max_19bit_int: times 4 dd 0x7ffff
 max_19bit_flt: times 4 dd 524287.0
 minshort:      times 8 dw 0x8000
 unicoeff:      times 4 dd 0x20000000
+yuv2yuvX_10_start:  times 4 dd 0x10000 ; initial value of the yuv2planeX10 accumulators: 1 << 16, half of the final >> 17
+yuv2yuvX_10_upper:  times 8 dw 0x3ff ; per-word upper clamp applied to the yuv2planeX10 output
 
 SECTION .text
 
@@ -427,3 +430,75 @@ INIT_XMM
 SCALE_FUNCS2 sse2,  6, 7, 8
 SCALE_FUNCS2 ssse3, 6, 6, 8
 SCALE_FUNCS2 sse4,  6, 6, 8
+
+;-----------------------------------------------------------------------------
+; vertical line scaling
+;
+; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
+;                                     const uint8_t *dither, int offset)
+; and
+; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
+;                                     const int16_t **src, uint8_t *dst, int dstW,
+;                                     const uint8_t *dither, int offset)
+;
+; Scale one or $filterSize lines of source data to generate one line of output
+; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
+; int32_t if $output_size is 16. $filter is 12-bits. $filterSize is a multiple
+; of 2. $offset is either 0 or 3. $dither holds 8 values.
+;-----------------------------------------------------------------------------
+
+%macro yuv2planeX10 1
+; Emits yuv2planeX10_%1 (vertical filter, 10-bit output); load_cntr resets cntr_reg to filterSize.
+%ifdef ARCH_X86_32
+%define cntr_reg r1
+%define load_cntr mov cntr_reg, r1m
+%else
+%define cntr_reg r11
+%define load_cntr movsxd cntr_reg, r1d
+%endif
+; r0=filter, r1=filterSize, r2=src row pointers, r3=dst, r4=dstW; r5=byte offset, r6=row pointer.
+cglobal yuv2planeX10_%1, 7, 7
+    xor      r5, r5                         ; byte offset into every source row and into dst
+.pixelloop:
+    mova     m1, [yuv2yuvX_10_start]        ; both accumulators start at the rounding term
+    mova     m2, m1
+    load_cntr                               ; x86-32: cntr_reg is r1 itself and ends each filter loop <= 0, so re-read the stack arg
+.filterloop:
+    mov      r6, [r2+gprsize*cntr_reg-2*gprsize] ; row pointer src[cntr_reg-2]
+    mova     m3, [r6+r5]
+
+    mov      r6, [r2+gprsize*cntr_reg-gprsize]   ; row pointer src[cntr_reg-1]
+    mova     m4, [r6+r5]
+
+    punpcklwd m5, m3, m4                    ; interleave the two rows so pmaddwd sees (row a, row b) pairs
+    punpckhwd m3, m4
+
+    movd     m0, [r0+2*cntr_reg-4]          ; coefficient pair filter[cntr_reg-2], filter[cntr_reg-1]; movd clears the rest of m0
+    SPLATD   m0, m0
+
+    pmaddwd  m5, m0                         ; a*coef0 + b*coef1 per pixel, as 32-bit sums
+    pmaddwd  m3, m0
+
+    paddd    m2, m5                         ; m2 accumulates the low half, m1 the high half
+    paddd    m1, m3
+
+    sub      cntr_reg, 2                    ; two taps consumed per iteration
+    jg .filterloop
+
+    psrad    m2, 17
+    psrad    m1, 17
+
+    packusdw m2, m1                         ; dwords -> words with unsigned saturation (negatives become 0)
+    pminsw   m2, [yuv2yuvX_10_upper]        ; clamp each word to 0x3ff
+    mova     [r3+r5], m2
+
+    add      r5, mmsize
+    sub      r4d, mmsize/2                  ; mmsize/2 output words were written this pass
+    jg .pixelloop
+    REP_RET
+%endmacro
+
+INIT_XMM
+yuv2planeX10 sse4 ; instantiates yuv2planeX10_sse4 (declared on the C side as ff_yuv2planeX10_sse4)
+INIT_AVX
+yuv2planeX10 avx ; instantiates yuv2planeX10_avx (declared on the C side as ff_yuv2planeX10_avx)
diff --git a/libswscale/x86/swscale_mmx.c b/libswscale/x86/swscale_mmx.c
index dd7aea1..3c0632d 100644
--- a/libswscale/x86/swscale_mmx.c
+++ b/libswscale/x86/swscale_mmx.c
@@ -211,6 +211,14 @@ SCALE_FUNCS_SSE(sse2);
 SCALE_FUNCS_SSE(ssse3);
 SCALE_FUNCS_SSE(sse4);
 
+extern void ff_yuv2planeX10_sse4(const int16_t *filter, int filterSize,
+                                 const int16_t **src, uint8_t *dest, int dstW,
+                                 const uint8_t *dither, int offset); /* SSE4 build of the yuv2planeX10 macro in scale.asm */
+
+extern void ff_yuv2planeX10_avx(const int16_t *filter, int filterSize,
+                                const int16_t **src, uint8_t *dest, int dstW,
+                                const uint8_t *dither, int offset); /* AVX build of the yuv2planeX10 macro in scale.asm */
+
 void ff_sws_init_swScale_mmx(SwsContext *c)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -270,6 +278,13 @@ void ff_sws_init_swScale_mmx(SwsContext *c)
         /* Xto15 don't need special sse4 functions */
         ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
+        if (c->dstBpc == 10 && !isBE(c->dstFormat))
+            c->yuv2planeX = ff_yuv2planeX10_sse4;
+    }
+
+    if (cpu_flags & AV_CPU_FLAG_AVX) {
+        if (c->dstBpc == 10 && !isBE(c->dstFormat))
+            c->yuv2planeX = ff_yuv2planeX10_avx;
     }
 #endif
 }



More information about the ffmpeg-cvslog mailing list