[FFmpeg-cvslog] avcodec/x86/hevc: add avx2 dc idct

plepere git at videolan.org
Wed Jun 25 15:06:54 CEST 2014


ffmpeg | branch: master | plepere <pierre-edouard.lepere at insa-rennes.fr> | Mon Jun 16 14:47:21 2014 +0200| [942e22c651166e8aa67bfffa7a431970200d3203] | committer: Michael Niedermayer

avcodec/x86/hevc: add avx2 dc idct

Signed-off-by: Michael Niedermayer <michaelni at gmx.at>

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=942e22c651166e8aa67bfffa7a431970200d3203
---

 libavcodec/x86/hevc_idct.asm  |   51 ++++++++++++++++++++++++++++++++++++++---
 libavcodec/x86/hevcdsp.h      |    6 +++++
 libavcodec/x86/hevcdsp_init.c |   18 +++++++++++++++
 3 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/hevc_idct.asm b/libavcodec/x86/hevc_idct.asm
index 6963dc7..31532ae 100644
--- a/libavcodec/x86/hevc_idct.asm
+++ b/libavcodec/x86/hevc_idct.asm
@@ -20,12 +20,12 @@
 ; */
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-max_pixels_10:          times 8  dw ((1 << 10)-1)
+SECTION_RODATA 32
+max_pixels_10:          times 16  dw ((1 << 10)-1)
 dc_add_10:              times 4 dd ((1 << 14-10) + 1)
 
 
-SECTION .text
+SECTION_TEXT 32
 
 ;the idct_dc_add macros and functions were largely inspired by x264 project's code in the h264_idct.asm file
 
@@ -41,6 +41,18 @@ SECTION .text
     packuswb          m1, m1
 %endmacro
 
+%macro DC_ADD_INIT_AVX2 2 ; %1 = GPR holding the DC coefficient (overwritten with 3*%2), %2 = GPR that is multiplied by 3
+    add              %1w, ((1 << 14-8) + 1) ; add a constant bias to the 16-bit value before the shift below
+    sar              %1w, (15-8)            ; arithmetic (sign-preserving) shift right by 7
+    movd             xm0, %1d
+    vpbroadcastw      m0, xm0    ;SPLATW: replicate the low word of xm0 into every word lane of m0
+    lea               %1, [%2*3]            ; %1 = 3 * %2; the coefficient is no longer needed in the GPR
+    pxor              m1, m1
+    psubw             m1, m0                ; m1 = 0 - m0, i.e. the negated value in every word lane
+    packuswb          m0, m0                ; pack words to bytes with unsigned saturation (negative words become 0)
+    packuswb          m1, m1                ; same for the negated copy, so at most one of m0/m1 is non-zero
+%endmacro
+
 %macro DC_ADD_OP 4
     %1                m2, [%2     ]
     %1                m3, [%2+%3  ]
@@ -112,6 +124,19 @@ cglobal hevc_idct16_dc_add_8, 3, 4, 0
     DC_ADD_OP       mova, r0, r2, r3
     RET
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+; void ff_hevc_idct32_dc_add_8_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+cglobal hevc_idct32_dc_add_8, 3, 4, 6 ; NOTE(review): presumably r0 = dst, r1 = coeffs, r2 = stride per the prototype above -- confirm
+    movsx             r3, word [r1]         ; r3 = first 16-bit word at [r1], sign-extended
+    DC_ADD_INIT_AVX2  r3, r2                ; fills m0/m1 from the value in r3, then sets r3 = 3 * r2
+    DC_ADD_OP       mova, r0, r2, r3,       ; NOTE(review): trailing comma after r3 looks unintended for a 4-parameter macro -- confirm
+ %rep 7                                     ; seven more DC_ADD_OP invocations, eight in total
+    lea               r0, [r0+r2*4]         ; advance r0 by 4 * r2 before each further invocation
+    DC_ADD_OP       mova, r0, r2, r3
+%endrep
+    RET
+%endif ;HAVE_AVX2_EXTERNAL
 ;-----------------------------------------------------------------------------
 ; void ff_hevc_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
 ;-----------------------------------------------------------------------------
@@ -178,3 +203,23 @@ IDCT8_DC_ADD
 INIT_XMM avx
 IDCT8_DC_ADD
 %endif
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+cglobal hevc_idct16_dc_add_10,3,4,7 ; NOTE(review): presumably r0 = dst, r1 = coeffs, r2 = stride as in the C callers -- confirm
+    mov              r1w, [r1]              ; load the first 16-bit word at [r1] into the low word of r1
+    add              r1w, ((1 << 4) + 1)    ; add 17 before the shift below
+    sar              r1w, 5                 ; arithmetic (sign-preserving) shift right by 5
+    movd             xm0, r1d
+    lea               r1, [r2*3]            ; r1 = 3 * r2; the coefficient value now lives in xm0
+    vpbroadcastw      m0, xm0    ;SPLATW: replicate the low word of xm0 into every word lane of m0
+    mova              m6, [max_pixels_10]   ; m6 = words of (1 << 10) - 1 from the read-only data table
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    lea               r0, [r0+r2*4]         ; advance r0 by 4 * r2 between invocations
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    lea               r0, [r0+r2*4]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    lea               r0, [r0+r2*4]
+    IDCT_DC_ADD_OP_10 r0, r2, r1
+    RET
+%endif ;HAVE_AVX2_EXTERNAL
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 029492e..661a860 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -133,6 +133,8 @@ idct_dc_proto(8, 8,mmxext);
 idct_dc_proto(16,8,  sse2);
 idct_dc_proto(32,8,  sse2);
 
+idct_dc_proto(32,8,  avx2);
+
 
 idct_dc_proto(4, 10,mmxext);
 idct_dc_proto(8, 10,  sse2);
@@ -142,6 +144,10 @@ idct_dc_proto(8, 10,   avx);
 idct_dc_proto(16,10,   avx);
 idct_dc_proto(32,10,   avx);
 
+idct_dc_proto(16,10,  avx2);
+idct_dc_proto(32,10,  avx2);
+
+
 
 
 
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 58a0891..cad236d 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -92,6 +92,17 @@ void ff_hevc_idct32_dc_add_10_avx(uint8_t *dst, int16_t *coeffs, ptrdiff_t strid
 }
 #endif //HAVE_AVX_EXTERNAL
 
+#if HAVE_AVX2_EXTERNAL
+
+void ff_hevc_idct32_dc_add_10_avx2(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
+{   // Applies the 16-wide AVX2 routine four times, all with the same coeffs and stride.
+    ff_hevc_idct16_dc_add_10_avx2(dst, coeffs, stride);                 // at dst
+    ff_hevc_idct16_dc_add_10_avx2(dst+32, coeffs, stride);              // 32 bytes further (presumably 16 two-byte samples -- confirm)
+    ff_hevc_idct16_dc_add_10_avx2(dst+16*stride, coeffs, stride);       // 16 strides further
+    ff_hevc_idct16_dc_add_10_avx2(dst+16*stride+32, coeffs, stride);    // 16 strides and 32 bytes further
+}
+#endif //HAVE_AVX2_EXTERNAL
+
 #define mc_rep_func(name, bitd, step, W, opt) \
 void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, ptrdiff_t dststride,                            \
                                                 uint8_t *_src, ptrdiff_t _srcstride, int height,                \
@@ -438,6 +449,9 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v,     8, sse4);
             QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv,    8, sse4);
         }
+        if (EXTERNAL_AVX2(mm_flags)) {
+            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_8_avx2;
+        }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(mm_flags)) {
                 c->transform_dc_add[0]    =  ff_hevc_idct4_dc_add_10_mmxext;
@@ -473,6 +487,10 @@ void ff_hevcdsp_init_x86(HEVCDSPContext *c, const int bit_depth)
             c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx;
             c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx;
         }
+        if (EXTERNAL_AVX2(mm_flags)) {
+            c->transform_dc_add[2]    =  ff_hevc_idct16_dc_add_10_avx2;
+            c->transform_dc_add[3]    =  ff_hevc_idct32_dc_add_10_avx2;
 
+        }
     }
 }



More information about the ffmpeg-cvslog mailing list