[FFmpeg-devel] [PATCH 2/3] swscale/x86/output: add AVX2 version of yuv2nv12cX
Nelson Gomez
negomez at linux.microsoft.com
Fri Apr 24 06:13:18 EEST 2020
From: Nelson Gomez <nelson.gomez at microsoft.com>
256 bits is just wide enough to fit all the operands needed to vectorize
the software implementation, but AVX2 is needed for some instructions
like 16-to-32 bit vector sign extension.
Output is bit-for-bit identical to C.
Signed-off-by: Nelson Gomez <nelson.gomez at microsoft.com>
---
libswscale/x86/output.asm | 140 +++++++++++++++++++++++++++++++++++++-
libswscale/x86/swscale.c | 24 +++++++
2 files changed, 163 insertions(+), 1 deletion(-)
diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
index db3e9934f8..7947163cac 100644
--- a/libswscale/x86/output.asm
+++ b/libswscale/x86/output.asm
@@ -2,6 +2,7 @@
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje at gmail.com>
;* Kieran Kunhya <kieran at kunhya.com>
+;* (c) 2020 Nelson Gomez <nelson.gomez at microsoft.com>
;*
;* This file is part of FFmpeg.
;*
@@ -22,7 +23,7 @@
%include "libavutil/x86/x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32
minshort: times 8 dw 0x8000
yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
@@ -37,6 +38,18 @@ pw_32: times 8 dw 32
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
+; Constants for the AVX2 yuv2nv12cX / yuv2nv21cX routines. Each one is 32
+; bytes wide and is fetched with an aligned 256-bit load, so the alignment is
+; pinned here instead of depending on the size of the constants above.
+align 32, db 0
+; Per-dword clamp bounds used to emulate av_clip_uint8 on 32-bit lanes.
+uint8_min_ymm:         times 8 dd 0
+uint8_max_ymm:         times 8 dd 255
+; vpshufb control, applied independently to each 128-bit lane: gather the low
+; byte of the four dwords of the lane into bytes 0-3 and zero bytes 4-15
+; (an index of -1 has its top bit set, which makes vpshufb write zero).
+yuv2nv12_shuffle_mask: times 2 db  0,  4,  8, 12, \
+                                  -1, -1, -1, -1, \
+                                  -1, -1, -1, -1, \
+                                  -1, -1, -1, -1
+; Same gather as above, but with the two bytes of every pair swapped.
+yuv2nv21_shuffle_mask: times 2 db  4,  0, 12,  8, \
+                                  -1, -1, -1, -1, \
+                                  -1, -1, -1, -1, \
+                                  -1, -1, -1, -1
+; vpermd control: place source dword 4 (first dword of the upper lane) right
+; after source dword 0, so the gathered bytes become 8 contiguous bytes.
+yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7
+
SECTION .text
;-----------------------------------------------------------------------------
@@ -423,3 +436,128 @@ yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif
+
+%undef movsx
+
+;-----------------------------------------------------------------------------
+; AVX2 yuv2nv12cX implementation
+;
+; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
+;                         const int16_t *filter, int filterSize,
+;                         const int16_t **u, const int16_t **v,
+;                         uint8_t *dst, int dstWidth)
+;
+; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
+;                         const int16_t *filter, int filterSize,
+;                         const int16_t **u, const int16_t **v,
+;                         uint8_t *dst, int dstWidth)
+;
+; For every chroma position i both entry points compute
+;     U = (dither[i & 7]       << 12) + sum_j u[j][i] * filter[j]
+;     V = (dither[(i + 3) & 7] << 12) + sum_j v[j][i] * filter[j]
+; with j in [0, filterSize), and store the byte pair
+;     clip_uint8(U >> 19), clip_uint8(V >> 19)      (yuv2nv12cX)
+;     clip_uint8(V >> 19), clip_uint8(U >> 19)      (yuv2nv21cX)
+; at dst[2 * i], dst[2 * i + 1].
+;
+; Notes:
+;   * 8 positions (16 output bytes) are produced per outer iteration and both
+;     loops test their condition at the bottom. filterSize and dstWidth are
+;     therefore expected to be >= 1, and a dstWidth that is not a multiple of
+;     8 is effectively rounded up: the u[j]/v[j] rows are read and dst is
+;     written up to that rounded bound.
+;   * The u[j]/v[j] rows are read with aligned 16-byte loads.
+;   * "format" is never inspected; the byte order is fixed per entry point.
+;   * The code needs 11 general purpose and 13 vector registers, more than
+;     32-bit x86 provides, so it is only assembled for x86-64. C code that
+;     references these symbols has to be restricted to x86-64 as well.
+;-----------------------------------------------------------------------------
+
+%if ARCH_X86_64
+
+; %1 = 0: emit yuv2nv12cX (U byte first)
+; %1 = 1: emit yuv2nv21cX (V byte first)
+%macro yuv2nv12cX_avx2_fn 1
+%if %1
+cglobal yuv2nv21cX, 8, 11, 13, 64, \
+    format, dither, filter, filterSize, u, v, dst, dstWidth
+%else
+cglobal yuv2nv12cX, 8, 11, 13, 64, \
+    format, dither, filter, filterSize, u, v, dst, dstWidth
+%endif
+
+    ; Build the 16 dword accumulator seeds in the 64-byte stack area,
+    ; interleaved exactly like the U/V samples further down: slot 2 * i holds
+    ; dither[i] << 12 (U), slot 2 * i + 1 holds dither[(i + 3) % 8] << 12 (V).
+    %assign i 0
+    %rep 8
+    movzx r8d, byte [ditherq + i]
+    shl r8d, 12
+    mov [rsp + i * 8], r8d
+
+    movzx r9d, byte [ditherq + ((i + 3) % 8)]
+    shl r9d, 12
+    mov [rsp + (i * 8) + 4], r9d
+
+    %assign i i+1
+    %endrep
+
+    ; Loop invariants. The aligned loads from [rsp] need the stack area to be
+    ; 32-byte aligned.
+    mova ym0, [rsp]                         ; seeds for positions 0-3
+    mova ym1, [rsp + 32]                    ; seeds for positions 4-7
+    mova ym9, [uint8_min_ymm]               ; clamp floor, 0 per dword
+    mova ym10, [uint8_max_ymm]              ; clamp ceiling, 255 per dword
+%if %1
+    mova ym11, [yuv2nv21_shuffle_mask]      ; in-lane byte gather, V first
+%else
+    mova ym11, [yuv2nv12_shuffle_mask]      ; in-lane byte gather, U first
+%endif
+    mova ym12, [yuv2nv12_permute_mask]      ; cross-lane dword order
+
+    ; format and dither are no longer needed: reuse their registers as row
+    ; pointers and give the three remaining scratch registers names.
+    ;   pos  = index of the first of the 8 positions being produced
+    ;   tap  = index of the current filter tap
+    ;   coef = sign-extended filter[tap]
+    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth, \
+                pos, tap, coef
+
+    xor posd, posd
+
+.outer:
+    mova ym2, ym0                           ; accLo: U0 V0 U1 V1 U2 V2 U3 V3
+    mova ym3, ym1                           ; accHi: U4 V4 U5 V5 U6 V6 U7 V7
+    xor tapd, tapd
+
+.inner:
+    movsx coefd, word [filterq + 2 * tapq]
+    movd xm4, coefd
+    vpbroadcastd ym4, xm4                   ; filter[tap] in all 8 dwords
+
+    mov tmp1q, [uq + gprsize * tapq]        ; tmp1 = u[tap]
+    mova xm7, oword [tmp1q + 2 * posq]      ; 8 U samples (int16)
+
+    mov tmp2q, [vq + gprsize * tapq]        ; tmp2 = v[tap]
+    mova xm8, oword [tmp2q + 2 * posq]      ; 8 V samples (int16)
+
+    ; Interleave U and V words, then widen to dwords so that the products
+    ; line up with the interleaved accumulators.
+    vpunpcklwd xm5, xm7, xm8
+    vpmovsxwd ym5, xm5                      ; U0 V0 U1 V1 U2 V2 U3 V3
+    vpunpckhwd xm6, xm7, xm8
+    vpmovsxwd ym6, xm6                      ; U4 V4 U5 V5 U6 V6 U7 V7
+
+    vpmulld ym7, ym5, ym4
+    vpmulld ym8, ym6, ym4
+    vpaddd ym2, ym2, ym7                    ; accLo += samplesLo * coef
+    vpaddd ym3, ym3, ym8                    ; accHi += samplesHi * coef
+
+    inc tapd
+    cmp tapd, filterSized
+    jl .inner
+
+    vpsrad ym2, ym2, 19
+    vpsrad ym3, ym3, 19
+
+    ; Vectorized av_clip_uint8: clamp every dword to [0, 255]
+    vpmaxsd ym2, ym2, ym9
+    vpmaxsd ym3, ym3, ym9
+    vpminsd ym2, ym2, ym10
+    vpminsd ym3, ym3, ym10
+
+    ; Every dword now holds one result in its lowest byte (NV12 order shown;
+    ; NV21 has each pair swapped by its shuffle mask):
+    ;     ym2: U0 0 0 0  V0 0 0 0  U1 0 0 0  V1 0 0 0 | U2 ... V3 0 0 0
+    ;     ym3: U4 0 0 0  V4 0 0 0  U5 0 0 0  V5 0 0 0 | U6 ... V7 0 0 0
+    ;
+    ; vpshufb cannot move bytes across 128-bit lanes, so the first step only
+    ; packs the four bytes of each lane into the lane's first dword:
+    ;     ym2: U0 V0 U1 V1 (12 zero bytes) | U2 V2 U3 V3 (12 zero bytes)
+    ;     ym3: U4 V4 U5 V5 (12 zero bytes) | U6 V6 U7 V7 (12 zero bytes)
+    vpshufb ym2, ym2, ym11
+    vpshufb ym3, ym3, ym11
+
+    ; vpermd then moves the first dword of the upper lane next to the first
+    ; dword of the lower lane, giving 8 contiguous result bytes.
+    vpermd ym2, ym12, ym2
+    vpermd ym3, ym12, ym3
+
+    ; The low 8 bytes of each register are the final output
+    movq [dstq], xm2
+    movq [dstq + 8], xm3
+
+    add posd, 8
+    add dstq, 16
+
+    cmp posd, dstWidthd
+    jl .outer
+    RET
+%endmacro
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+yuv2nv12cX_avx2_fn 0
+yuv2nv12cX_avx2_fn 1
+%endif
+
+%endif ; ARCH_X86_64
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 61110839ee..ad4a09df8d 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -380,6 +380,15 @@ INPUT_FUNCS(sse2);
INPUT_FUNCS(ssse3);
INPUT_FUNCS(avx);
+/* Declare the prototype of one assembly chroma writer for semi-planar output,
+ * named ff_yuv2<layout>cX_<cpuext>(). */
+#define YUV2NV_DECL(layout, cpuext)                                      \
+void ff_yuv2 ## layout ## cX_ ## cpuext(enum AVPixelFormat format,      \
+                                        const uint8_t *dither,          \
+                                        const int16_t *filter,          \
+                                        int filterSize,                 \
+                                        const int16_t **u,              \
+                                        const int16_t **v,              \
+                                        uint8_t *dst, int dstWidth)
+
+YUV2NV_DECL(nv12, avx2); /* ff_yuv2nv12cX_avx2 */
+YUV2NV_DECL(nv21, avx2); /* ff_yuv2nv21cX_avx2 */
+
av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
@@ -580,4 +589,19 @@ switch(c->dstBpc){ \
break;
}
}
+
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ switch (c->dstFormat) {
+ case AV_PIX_FMT_NV12:
+ case AV_PIX_FMT_NV24:
+ c->yuv2nv12cX = ff_yuv2nv12cX_avx2;
+ break;
+ case AV_PIX_FMT_NV21:
+ case AV_PIX_FMT_NV42:
+ c->yuv2nv12cX = ff_yuv2nv21cX_avx2;
+ break;
+ default:
+ break;
+ }
+ }
}
--
2.25.1
More information about the ffmpeg-devel
mailing list