[FFmpeg-devel] [PATCH 4/4] libswscale/x86/rgb2rgb: add uyvytoyuv422 avx512

Wu Jianhua jianhua.wu at intel.com
Tue Sep 28 08:34:04 EEST 2021


Using AVX512 acceleration makes uyvytoyuv422 faster.

Performance data (less is better):
    uyvytoyuv422_avx2      0.27915
    uyvytoyuv422_avx512    0.16442

Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
---
 libswscale/x86/rgb2rgb.c     |  6 ++++++
 libswscale/x86/rgb_2_rgb.asm | 17 +++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index a965a1755c..c59136a352 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -167,6 +167,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                           const uint8_t *src, int width, int height,
                           int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx512(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+                            const uint8_t *src, int width, int height,
+                            int lumStride, int chromStride, int srcStride);
 #endif
 
 av_cold void rgb2rgb_init_x86(void)
@@ -222,5 +225,8 @@ av_cold void rgb2rgb_init_x86(void)
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         uyvytoyuv422 = ff_uyvytoyuv422_avx2;
     }
+    if (EXTERNAL_AVX512(cpu_flags)) {
+        uyvytoyuv422 = ff_uyvytoyuv422_avx512;
+    }
 #endif
 }
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 683bd067a5..552967b6a2 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -31,7 +31,9 @@ pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
 pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
 pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
 pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+pd_permd512_uv: dd 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
 pd_permd256_uv: dd 0, 4, 1, 5, 2, 6, 3, 7
+pq_permq512_yy: dq 0, 2, 4, 6, 1, 3, 5, 7
 
 SECTION .text
 
@@ -193,7 +195,11 @@ SHUFFLE_BYTES 3, 2, 1, 0
 %macro UYVY_TO_YUV422 0
 cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
     pxor         m0, m0
+%if mmsize == 64
+    vpternlogd   m1, m1, m1, 0xff
+%else
     pcmpeqw      m1, m1
+%endif
     psrlw        m1, 8
 
     movsxdifnidn            wq, wd
@@ -212,6 +218,9 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
 
 %if mmsize == 32
     movu    m15, [pd_permd256_uv]
+%elif mmsize == 64
+    movu    m14, [pq_permq512_yy]
+    movu    m15, [pd_permd512_uv]
 %endif
 
 .loop_line:
@@ -265,6 +274,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
         packuswb       m6, m7 ; YYYY YYYY...
 
         VPERM   q, 32, m6, m6, 0xd8
+        VPERM   q, 64, m6, m14, m6
 
         movu [ydstq + wq], m6
 
@@ -277,6 +287,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
         packuswb       m6, m7 ; YYYY YYYY...
 
         VPERM   q, 32, m6, m6, 0xd8
+        VPERM   q, 64, m6, m14, m6
 
         movu [ydstq + wq + mmsize], m6
 
@@ -295,6 +306,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
         packuswb     m6, m7 ; UUUU
 
         VPERM d, 32, m6, m15, m6
+        VPERM d, 64, m6, m15, m6
 
         movu   [udstq + whalfq], m6
 
@@ -304,6 +316,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
         packuswb     m2, m4 ; VVVV
 
         VPERM d, 32, m2, m15, m2
+        VPERM d, 64, m2, m15, m2
 
         movu   [vdstq + whalfq], m2
 
@@ -340,4 +353,8 @@ UYVY_TO_YUV422
 INIT_YMM avx2
 UYVY_TO_YUV422
 %endif
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+UYVY_TO_YUV422
+%endif
 %endif
-- 
2.17.1



More information about the ffmpeg-devel mailing list