[FFmpeg-devel] [PATCH v4 4/4] libswscale/x86/rgb2rgb: add uyvytoyuv422 avx512
Wu Jianhua
jianhua.wu at intel.com
Thu Sep 30 11:50:23 EEST 2021
With AVX512 acceleration, uyvytoyuv422 becomes faster.
Performance data (lower is better):
uyvytoyuv422_avx2 0.27309
uyvytoyuv422_avx512 0.16229
Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
---
libswscale/x86/rgb2rgb.c | 6 ++++++
libswscale/x86/rgb_2_rgb.asm | 20 ++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index a965a1755c..c59136a352 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -167,6 +167,9 @@ void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
+void ff_uyvytoyuv422_avx512(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
+ const uint8_t *src, int width, int height,
+ int lumStride, int chromStride, int srcStride);
#endif
av_cold void rgb2rgb_init_x86(void)
@@ -222,5 +225,8 @@ av_cold void rgb2rgb_init_x86(void)
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
uyvytoyuv422 = ff_uyvytoyuv422_avx2;
}
+ if (EXTERNAL_AVX512(cpu_flags)) {
+ uyvytoyuv422 = ff_uyvytoyuv422_avx512;
+ }
#endif
}
diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
index 1777a99faf..d55d2ca07f 100644
--- a/libswscale/x86/rgb_2_rgb.asm
+++ b/libswscale/x86/rgb_2_rgb.asm
@@ -31,8 +31,10 @@ pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+pd_permd512_uv: dd 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
pb_shuffle_low: db 1, 3, 5, 7, 9, 11, 13, 15
pd_permd256_uv: dd 0, 4, 1, 5, 2, 6, 3, 7
+pq_permq512_yy: dq 0, 2, 4, 6, 1, 3, 5, 7
SECTION .text
@@ -194,7 +196,11 @@ SHUFFLE_BYTES 3, 2, 1, 0
%macro UYVY_TO_YUV422 0
cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_stride, src_stride, wtwo, whalf, tmp, x, back_w
pxor m0, m0
+%if mmsize == 64
+ vpternlogd m1, m1, m1, 0xff
+%else
pcmpeqw m1, m1
+%endif
psrlw m1, 8
movsxdifnidn wq, wd
@@ -213,7 +219,12 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
%if mmsize > 16
vpbroadcastq m13, [pb_shuffle_low]
+%if mmsize == 32
movu m15, [pd_permd256_uv]
+%else
+ movu m14, [pq_permq512_yy]
+ movu m15, [pd_permd512_uv]
+%endif
%endif
.loop_line:
@@ -271,6 +282,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
pshufb m7, m3, m13
punpcklqdq m6, m6, m7
VPERM q, 32, m6, m6, 0xd8
+ VPERM q, 64, m6, m14, m6
%endif
movu [ydstq + wq], m6
@@ -287,6 +299,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
pshufb m7, m5, m13
punpcklqdq m6, m6, m7
VPERM q, 32, m6, m6, 0xd8
+ VPERM q, 64, m6, m14, m6
%endif
movu [ydstq + wq + mmsize], m6
@@ -305,6 +318,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
packuswb m6, m7 ; UUUU
VPERM d, 32, m6, m15, m6
+ VPERM d, 64, m6, m15, m6
movu [udstq + whalfq], m6
@@ -314,6 +328,7 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
packuswb m2, m4 ; VVVV
VPERM d, 32, m2, m15, m2
+ VPERM d, 64, m2, m15, m2
movu [vdstq + whalfq], m2
@@ -350,4 +365,9 @@ UYVY_TO_YUV422
INIT_YMM avx2
UYVY_TO_YUV422
%endif
+
+%if HAVE_AVX512_EXTERNAL
+INIT_ZMM avx512
+UYVY_TO_YUV422
+%endif
%endif
--
2.17.1
More information about the ffmpeg-devel
mailing list