[FFmpeg-devel] [PATCH v2 1/5] avcodec/x86/hevc_mc: add qpel_h8_8_avx512icl and qpel_hv8_8_avx512icl

jianhua.wu at intel.com jianhua.wu at intel.com
Fri Mar 11 09:52:09 EET 2022


From: Wu Jianhua <jianhua.wu at intel.com>

This commit uses the instruction `vpdpbusd` introduced by AVX512 VNNI
to calculate the horizontal filter.

ff_hevc_put_hevc_qpel_h8_8_sse4       1039169
ff_hevc_put_hevc_qpel_h8_8_avx512icl   677153
ff_hevc_put_hevc_qpel_hv8_8_sse4      3603511
ff_hevc_put_hevc_qpel_hv8_8_avx512icl 2995354

Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
---
 libavcodec/x86/hevc_mc.asm    | 137 ++++++++++++++++++++++++++++++++++
 libavcodec/x86/hevcdsp.h      |   3 +
 libavcodec/x86/hevcdsp_init.c |   4 +
 3 files changed, 144 insertions(+)

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index ff6ed0711a..52fa3ec948 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -87,6 +87,26 @@ QPEL_TABLE 12, 4, w, sse4
 QPEL_TABLE  8,16, b, avx2
 QPEL_TABLE 10, 8, w, avx2
 
+QPEL_TABLE  8, 1, b, avx512icl_h ; byte coefficients; presumably emits hevc_qpel_filters_avx512icl_h_8 read by QPEL_FILTER_H (QPEL_TABLE is defined outside this hunk)
+QPEL_TABLE  8, 1, d, avx512icl_v ; dword coefficients; presumably emits hevc_qpel_filters_avx512icl_v_8 read by the hv kernel
+
+pb_qpel_shuffle_index: db  0,  1,  2,  3 ; bytes 0-31 (loaded from offset 0 by QPEL_LOAD_SHUF): dword lane x = 0..7 selects source bytes x..x+3
+                       db  1,  2,  3,  4
+                       db  2,  3,  4,  5
+                       db  3,  4,  5,  6
+                       db  4,  5,  6,  7
+                       db  5,  6,  7,  8
+                       db  6,  7,  8,  9
+                       db  7,  8,  9, 10
+                       db  4,  5,  6,  7 ; bytes 32-63 (loaded from offset 32 by QPEL_LOAD_SHUF): dword lane x = 0..7 selects source bytes x+4..x+7
+                       db  5,  6,  7,  8
+                       db  6,  7,  8,  9
+                       db  7,  8,  9, 10
+                       db  8,  9, 10, 11
+                       db  9, 10, 11, 12
+                       db 10, 11, 12, 13
+                       db 11, 12, 13, 14 ; highest index used is 14, i.e. 15 source bytes per row
+
 SECTION .text
 
 %define MAX_PB_SIZE  64
@@ -1670,3 +1690,120 @@ HEVC_PUT_HEVC_QPEL_HV 16, 10
 
 %endif ;AVX2
 %endif ; ARCH_X86_64
+
+%macro QPEL_FILTER_H 5 ; %1: table name suffix, %2: filter index register name (modified), %3/%4: dst vector register indices, %5: scratch register name (written only when PIC)
+%define %%table hevc_qpel_filters_avx512icl_h_%1
+%assign %%offset 4 ; byte distance between the two broadcast dwords
+    dec %2q ; %2 = index - 1 ...
+    shl %2q, 3 ; ... times 8 = byte offset into the table
+%ifdef PIC
+    lea %5q, [%%table] ; table address goes through a register in PIC builds
+    %define FILTER %5q
+%else
+    %define FILTER %%table
+%endif
+    vpbroadcastd m%3, [FILTER + %2q + 0*%%offset] ; m%3 = bytes 0-3 of the selected entry in every dword lane
+    vpbroadcastd m%4, [FILTER + %2q + 1*%%offset] ; m%4 = bytes 4-7 of the selected entry in every dword lane
+%endmacro
+
+%macro QPEL_FILTER_V 5 ; %2: byte-offset register name, %3: dst vector register index, %4: dword index, %5: table base (symbol or register); %1 is not referenced
+    vpbroadcastd m%3, [%5 + %2q + 4*%4] ; m%3 = dword number %4 at [%5 + %2q], replicated to every dword lane
+%endmacro
+
+%macro QPEL_LOAD_SHUF 2 ; %1/%2: indices of the vector registers that receive the two halves of pb_qpel_shuffle_index
+    movu m%1, [pb_qpel_shuffle_index +  0] ; table contents starting at byte 0
+    movu m%2, [pb_qpel_shuffle_index + 32] ; table contents starting at byte 32
+%endmacro
+
+; inputs: m0/m1 = coefficient dwords, m2/m3 = vpermb byte indices; clobbers: m4 and m5
+; %1: index of the dst register; it is zeroed and then receives the dword sums
+; %2: name of the src pointer register; 16 bytes are loaded from [%2q - 3]
+%macro QPEL_H_LOAD_COMPUTE 2
+    pxor            m%1, m%1 ; clear the accumulator
+    movu            xm4, [%2q - 3] ; unaligned 16-byte load starting 3 bytes before src
+    vpermb           m5, m2, m4 ; m5 = loaded bytes gathered with the indices in m2
+    vpermb           m4, m3, m4 ; m4 = loaded bytes gathered with the indices in m3
+    vpdpbusd        m%1, m5, m0 ; per dword lane: add the 4-byte dot product of m5 (unsigned) and m0 (signed)
+    vpdpbusd        m%1, m4, m1 ; per dword lane: add the 4-byte dot product of m4 (unsigned) and m1 (signed)
+%endmacro
+
+%macro HEVC_PUT_HEVC_QPEL_AVX512ICL 2 ; emits hevc_put_hevc_qpel_h%1_%2; %2 is used only in the function name
+cglobal hevc_put_hevc_qpel_h%1_%2, 5, 6, 8, dst, src, srcstride, height, mx, tmp
+    QPEL_FILTER_H   %1, mx, 0, 1, tmp ; m0/m1 = coefficient dwords selected by mx
+    QPEL_LOAD_SHUF   2, 3 ; m2/m3 = vpermb byte indices
+.loop: ; one pass per row; LOOP_END is defined outside this hunk and presumably advances dst/src and branches back here
+    QPEL_H_LOAD_COMPUTE   6, src ; m6 = horizontally filtered dword sums for the current row
+    vpmovdw          [dstq], m6 ; truncate each dword to a word and store to dst
+    LOOP_END            dst, src, srcstride
+    RET
+%endmacro
+
+%macro HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 2 ; emits hevc_put_hevc_qpel_hv%1_%2; register numbers in the loop are hard-coded for %1 == 8
+cglobal hevc_put_hevc_qpel_hv%1_%2, 6, 7, 27, dst, src, srcstride, height, mx, my, tmp
+%assign %%shift 6 ; right shift applied to the vertical sum before the store
+%assign %%extra 7 ; rows filtered horizontally before the main loop starts
+    QPEL_FILTER_H    %1, mx, 0, 1, tmp ; m0/m1 = horizontal coefficient dwords selected by mx
+    QPEL_LOAD_SHUF    2, 3 ; m2/m3 = vpermb byte indices
+    lea            tmpq, [srcstrideq*3]
+    sub            srcq, tmpq ; src -= 3 * srcstride
+    sub             myq, 1
+    shl             myq, 5 ; my = (my - 1) * 32 = byte offset into the vertical table
+%define %%table hevc_qpel_filters_avx512icl_v_%1
+%ifdef PIC
+    lea tmpq, [%%table] ; %%table is defined above the %ifdef so the %else branch can expand it too
+    %define FILTER tmpq
+%else
+    %define FILTER %%table
+%endif
+%assign %%i 6
+%assign %%j 0
+%rep %1
+    QPEL_FILTER_V %1, my, %%i, %%j, FILTER ; m(6+j) = vertical coefficient dword j, broadcast
+    %assign %%i %%i+1
+    %assign %%j %%j+1
+%endrep
+%rep %%extra
+    QPEL_H_LOAD_COMPUTE %%i, src ; with %1 == 8 these rows land in m14..m20
+    add srcq, srcstrideq
+%assign %%i %%i+1
+%endrep
+.loop: ; one pass per output row; LOOP_END is defined outside this hunk and presumably advances dst/src and branches back here
+    QPEL_H_LOAD_COMPUTE %%i, src ; newest horizontally filtered row (m21 when %1 == 8)
+    vpmulld           m22, m14, m6 ; row k * vertical coefficient k, k = 0..3
+    vpmulld           m23, m15, m7
+    vpmulld           m24, m16, m8
+    vpmulld           m25, m17, m9
+    vpaddd            m26, m22, m23
+    vpaddd            m24, m25
+    vpaddd            m26, m24 ; m26 = sum of products 0..3
+    vpmulld           m22, m18, m10 ; row k * vertical coefficient k, k = 4..7
+    vpmulld           m23, m19, m11
+    vpmulld           m24, m20, m12
+    vpmulld           m25, m21, m13
+    vpaddd            m22, m22, m23
+    vpaddd            m24, m25
+    vpaddd            m26, m24
+    vpaddd            m22, m26 ; m22 = sum of all eight products
+    mova              m14, m15 ; slide the row window by one: m14..m20 <- m15..m21
+    mova              m15, m16
+    mova              m16, m17
+    mova              m17, m18
+    mova              m18, m19
+    mova              m19, m20
+    mova              m20, m21
+    vpsrad            m22, %%shift ; arithmetic shift right by %%shift
+    vpmovdw        [dstq], m22 ; truncate each dword to a word and store to dst
+    LOOP_END          dst, src, srcstride
+
+    RET
+%endmacro
+
+%if ARCH_X86_64
+%if HAVE_AVX512ICL_EXTERNAL
+
+INIT_YMM avx512icl
+HEVC_PUT_HEVC_QPEL_AVX512ICL 8, 8 ; emits hevc_put_hevc_qpel_h8_8
+HEVC_PUT_HEVC_QPEL_HV_AVX512ICL 8, 8 ; emits hevc_put_hevc_qpel_hv8_8
+
+%endif ; HAVE_AVX512ICL_EXTERNAL
+%endif ; ARCH_X86_64
diff --git a/libavcodec/x86/hevcdsp.h b/libavcodec/x86/hevcdsp.h
index 67be0a9059..5a495d2563 100644
--- a/libavcodec/x86/hevcdsp.h
+++ b/libavcodec/x86/hevcdsp.h
@@ -233,6 +233,9 @@ WEIGHTING_PROTOTYPES(8, sse4);
 WEIGHTING_PROTOTYPES(10, sse4);
 WEIGHTING_PROTOTYPES(12, sse4);
 
+void ff_hevc_put_hevc_qpel_h8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); /* added to libavcodec/x86/hevc_mc.asm; the asm entry point declares only the first five arguments */
+void ff_hevc_put_hevc_qpel_hv8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width); /* added to libavcodec/x86/hevc_mc.asm; the asm entry point declares only the first six arguments */
+
 ///////////////////////////////////////////////////////////////////////////////
 // TRANSFORM_ADD
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/libavcodec/x86/hevcdsp_init.c b/libavcodec/x86/hevcdsp_init.c
index 8a3fa2744b..e51375ebc2 100644
--- a/libavcodec/x86/hevcdsp_init.c
+++ b/libavcodec/x86/hevcdsp_init.c
@@ -878,6 +878,10 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
 
             c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
         }
+        if (EXTERNAL_AVX512ICL(cpu_flags) && ARCH_X86_64) {
+            c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
+            c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
+        }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
-- 
2.17.1



More information about the ffmpeg-devel mailing list