[FFmpeg-devel] [PATCH 4/6] avcodec/x86: add x86-64 8-bit simple_idct put function

Mon Jun 12 16:36:07 EEST 2017

---
 libavcodec/x86/idctdsp_init.c    |  2 ++
 libavcodec/x86/simple_idct.h     |  3 +++
 libavcodec/x86/simple_idct10.asm | 23 +++++++++++++++++++++++
 3 files changed, 28 insertions(+)

diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 4b2145e478..1826d01e0e 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -102,6 +102,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
                 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
                 c->idct      = ff_simple_idct8_sse2;
+                c->idct_put  = ff_simple_idct8_put_sse2;
                 c->perm_type = FF_IDCT_PERM_TRANSPOSE;
         }
     }
@@ -113,6 +114,7 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
                 avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
                 avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
                 c->idct      = ff_simple_idct8_avx;
+                c->idct_put  = ff_simple_idct8_put_avx;
                 c->perm_type = FF_IDCT_PERM_TRANSPOSE;
         }
 
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index d17a855312..b559f8527c 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -32,6 +32,9 @@ void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
 void ff_simple_idct8_sse2(int16_t *block);
 void ff_simple_idct8_avx(int16_t *block);
 
+void ff_simple_idct8_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct8_put_avx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+
 void ff_simple_idct10_sse2(int16_t *block);
 void ff_simple_idct10_avx(int16_t *block);
 
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index 168b6a08e0..f31fb5cfa5 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -71,11 +71,34 @@ CONST_DEC  w7_min_w5,    W7sh2, -W5sh2
 
 SECTION .text
 
+%macro STORE_HI_LO 12 ; %1-%8: eight 64-bit memory destinations, %9-%12: four vector registers to store
+    movq   %1, %9     ; low 8 bytes of each source go to the odd-numbered destinations (%1, %3, %5, %7)
+    movq   %3, %10
+    movq   %5, %11
+    movq   %7, %12
+    movhps %2, %9     ; high 8 bytes of each source go to the even-numbered destinations (%2, %4, %6, %8)
+    movhps %4, %10
+    movhps %6, %11
+    movhps %8, %12
+%endmacro
+
 %macro idct_fn 0
 cglobal simple_idct8, 1, 1, 16, block
     IDCT_FN    "", 11, pw_round_20_div_w4, 20, "store"
 RET
 
+; TODO: optimise by not writing the final data to the block.
+cglobal simple_idct8_put, 3, 4, 16, pixels, lsize, block ; C prototype: (uint8_t *dest, ptrdiff_t line_size, int16_t *block); 3 args, 4 GPRs, 16 vector regs
+    IDCT_FN    "", 11, pw_round_20_div_w4, 20 ; same arguments as simple_idct8 but without "store"; NOTE(review): result presumably left in m8,m0,m1,m2,m4,m11,m9,m10 - confirm against IDCT_FN
+    lea       r3, [3*lsizeq]      ; r3 = 3 * lsize
+    lea       r2, [pixelsq + r3]  ; r2 = pixels + 3 * lsize; this overwrites the block argument register, which is not read again below
+    packuswb  m8, m0              ; signed words -> unsigned-saturated bytes: m8 in the low half, m0 in the high half
+    packuswb  m1, m2              ; the next three packs pair up m1/m2, m4/m11 and m9/m10 the same way
+    packuswb  m4, m11
+    packuswb  m9, m10
+    STORE_HI_LO PASS8ROWS(pixelsq, r2, lsizeq, r3), m8, m1, m4, m9 ; 8 bytes per destination; PASS8ROWS is defined outside this patch, presumably the eight row addresses - confirm
+RET
+
 cglobal simple_idct10, 1, 1, 16, block
     IDCT_FN    "", 12, "", 19, "store"
     RET
-- 
2.13.0