[FFmpeg-devel] [PATCH 2/3] swscale/x86/output: add AVX2 version of yuv2nv12cX
James Almer
jamrial at gmail.com
Fri Apr 24 06:56:37 EEST 2020
On 4/24/2020 12:13 AM, Nelson Gomez wrote:
> From: Nelson Gomez <nelson.gomez at microsoft.com>
>
> 256 bits is just wide enough to fit all the operands needed to vectorize
> the software implementation, but AVX2 is needed for some instructions
> like 16-to-32 bit vector sign extension.
>
> Output is bit-for-bit identical to C.
>
> Signed-off-by: Nelson Gomez <nelson.gomez at microsoft.com>
> ---
> libswscale/x86/output.asm | 140 +++++++++++++++++++++++++++++++++++++-
> libswscale/x86/swscale.c | 24 +++++++
> 2 files changed, 163 insertions(+), 1 deletion(-)
>
> diff --git a/libswscale/x86/output.asm b/libswscale/x86/output.asm
> index db3e9934f8..7947163cac 100644
> --- a/libswscale/x86/output.asm
> +++ b/libswscale/x86/output.asm
> @@ -2,6 +2,7 @@
> ;* x86-optimized vertical line scaling functions
> ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje at gmail.com>
> ;* Kieran Kunhya <kieran at kunhya.com>
> +;* (c) 2020 Nelson Gomez <nelson.gomez at microsoft.com>
> ;*
> ;* This file is part of FFmpeg.
> ;*
> @@ -22,7 +23,7 @@
>
> %include "libavutil/x86/x86util.asm"
>
> -SECTION_RODATA
> +SECTION_RODATA 32
>
> minshort: times 8 dw 0x8000
> yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
> @@ -37,6 +38,18 @@ pw_32: times 8 dw 32
> pw_512: times 8 dw 512
> pw_1024: times 8 dw 1024
>
> +uint8_min_ymm: times 8 dd 0
Clear the register you need this for with pxor instead.
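Untested, but something like this in place of the mova load below should
do, and lets you drop the constant entirely:

    pxor xm9, xm9 ; all-zero dwords, lower bound for the clip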
> +uint8_max_ymm: times 8 dd 255
Call it pd_255, following the same naming scheme as the constants above.
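i.e.

    pd_255: times 8 dd 255

and load it with mova m10, [pd_255].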
> +yuv2nv12_shuffle_mask: times 2 db 0, 4, 8, 12, \
> + -1, -1, -1, -1, \
> + -1, -1, -1, -1, \
> + -1, -1, -1, -1
> +yuv2nv21_shuffle_mask: times 2 db 4, 0, 12, 8, \
> + -1, -1, -1, -1, \
> + -1, -1, -1, -1, \
> + -1, -1, -1, -1
> +yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7
> +
> SECTION .text
>
> ;-----------------------------------------------------------------------------
> @@ -423,3 +436,128 @@ yuv2plane1_fn 9, 5, 3
> yuv2plane1_fn 10, 5, 3
> yuv2plane1_fn 16, 5, 3
> %endif
> +
> +%undef movsx
> +
> +;-----------------------------------------------------------------------------
> +; AVX2 yuv2nv12cX implementation
> +;
> +; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
> +; const int16_t *filter, int filterSize,
> +; const int16_t **u, const int16_t **v,
> +; uint8_t *dst, int dstWidth)
> +;
> +; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
> +; const int16_t *filter, int filterSize,
> +; const int16_t **u, const int16_t **v,
> +; uint8_t *dst, int dstWidth)
> +;-----------------------------------------------------------------------------
> +
> +%macro yuv2nv12cX_avx2_fn 1
> +%if %1
> +cglobal yuv2nv21cX, 8, 11, 13, 64, \
> + format, dither, filter, filterSize, u, v, dst, dstWidth
> +%else
> +cglobal yuv2nv12cX, 8, 11, 13, 64, \
> + format, dither, filter, filterSize, u, v, dst, dstWidth
> +%endif
You can pass yuv2nv21 and yuv2nv12 as arguments to the
yuv2nv12cX_avx2_fn macro instead of 0 and 1, and simplify this as
cglobal %1cX, 8, 11, 13, 64, ...
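Untested, but the whole thing then collapses to something like:

    %macro yuv2nv12cX_avx2_fn 1
    cglobal %1cX, 8, 11, 13, 64, \
                  format, dither, filter, filterSize, u, v, dst, dstWidth
    ...
    %endmacro

    INIT_YMM avx2
    yuv2nv12cX_avx2_fn yuv2nv12
    yuv2nv12cX_avx2_fn yuv2nv21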
> +
> + %assign i 0
> + %rep 8
> + movzx r8d, byte [ditherq + i]
> + shl r8d, 12
> + mov [rsp + i * 8], r8d
> +
> + movzx r9d, byte [ditherq + ((i + 3) % 8)]
> + shl r9d, 12
> + mov [rsp + (i * 8) + 4], r9d
> +
> + %assign i i+1
> + %endrep
> +
> + mova ym0, [rsp] ; ditherLo
Use m# instead of ym#. By initializing these functions with INIT_YMM,
m0-m15 become aliases of ym0-ym15.
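i.e. just

    mova m0, [rsp]      ; ditherLo
    mova m1, [rsp + 32] ; ditherHi

and x86inc picks the ymm registers for you.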
> + mova ym1, [rsp + 32] ; ditherHi
> + mova ym9, [uint8_min_ymm] ; uint8_min dwords
As I said above, pxor xm9, xm9 (which also implicitly clears the upper
16 bytes).
> + mova ym10, [uint8_max_ymm] ; uint8_max dwords
> + mova ym12, [yuv2nv12_permute_mask] ; permute mask
> +%if %1
> + mova ym11, [yuv2nv21_shuffle_mask] ; shuffle_mask (NV21)
> +%else
> + mova ym11, [yuv2nv12_shuffle_mask] ; shuffle_mask (NV12)
> +%endif
Can also be simplified as
mova m11, [%1_shuffle_mask]
> +
> + DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth
> +
> + xor r8q, r8q
> +
> +nv12_outer_%1:
> + mova ym2, ym0 ; resultLo
> + mova ym3, ym1 ; resultHi
> + xor r9q, r9q
> +
> +nv12_inner_%1:
> + movsx r10d, word [filterq + (2 * r9q)]
> + movd xm4, r10d
> + vpbroadcastd ym4, xm4 ; filter
> +
> + mov tmp1q, [uq + (gprsize * r9q)]
> + mova xm7, oword [tmp1q + 2 * r8q]
> +
> + mov tmp2q, [vq + (gprsize * r9q)]
> + mova xm8, oword [tmp2q + 2 * r8q]
> +
> + vpunpcklwd xm5, xm7, xm8
Don't add the v prefix to pre-AVX instructions. The x86inc magic will
ensure they are expanded to the corresponding VEX-encoded version.
You only need it for AVX-and-newer instructions, like vpermd and vpbroadcastd.
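For example, under INIT_YMM avx2 writing

    punpcklwd m5, m7, m8

still assembles to the three-operand vpunpcklwd ym5, ym7, ym8.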
> + vpmovsxwd ym5, xm5 ; multiplicandsLo
> + vpunpckhwd xm6, xm7, xm8
> + vpmovsxwd ym6, xm6 ; multiplicandsHi
> +
> + vpmulld ym7, ym5, ym4 ; mulResultLo
> + vpmulld ym8, ym6, ym4 ; mulResultHi
> + vpaddd ym2, ym2, ym7 ; resultLo += mulResultLo
> + vpaddd ym3, ym3, ym8 ; resultHi += mulResultHi
> +
> + inc r9d
> + cmp r9d, filterSized
> + jl nv12_inner_%1
> + ; end of inner loop
> +
> + vpsrad ym2, ym2, 19
> + vpsrad ym3, ym3, 19
> +
> + ; Vectorized av_clip_uint8
> + vpmaxsd ym2, ym2, ym9
> + vpmaxsd ym3, ym3, ym9
> + vpminsd ym2, ym2, ym10
> + vpminsd ym3, ym3, ym10
> +
> + ; At this point we have clamped uint8s arranged in this order:
> + ; ym2: u1 0 0 0 v1 0 0 0 [...]
> + ; ym3: u5 0 0 0 v5 0 0 0 [...]
> + ;
> + ; First, we shuffle the bytes to make the bytes semi-contiguous.
> + ; AVX2's vpshufb can't shuffle across 128-bit lanes, so we'll end up with:
> + ; ym2: u1 v1 u2 v2 0 0 0 0 0 0 0 0 u3 v3 u4 v4
> + ; ym3: u5 v5 u6 v6 0 0 0 0 0 0 0 0 u7 v7 u8 v8
> + vpshufb ym2, ym2, ym11
> + vpshufb ym3, ym3, ym11
> +
> + ; To fix the cross-lane shuffling issue, we'll then use cross-lane
> + ; permutation to combine the two segments
> + vpermd ym2, ym12, ym2
> + vpermd ym3, ym12, ym3
> +
> + ; Now we have the final results in the lower 8 bytes of each register
> + movq [dstq], xm2
> + movq [dstq + 8], xm3
> +
> + add r8d, 8
> + add dstq, 16
> +
> + cmp r8d, dstWidthd
> + jl nv12_outer_%1
> + RET
> +%endmacro
> +
> +INIT_YMM avx2
> +yuv2nv12cX_avx2_fn 0
> +yuv2nv12cX_avx2_fn 1
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 61110839ee..ad4a09df8d 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -380,6 +380,15 @@ INPUT_FUNCS(sse2);
> INPUT_FUNCS(ssse3);
> INPUT_FUNCS(avx);
>
> +#define YUV2NV_DECL(fmt, opt) \
> +void ff_yuv2 ## fmt ## cX_ ## opt(enum AVPixelFormat format, const uint8_t *dither, \
> + const int16_t *filter, int filterSize, \
> + const int16_t **u, const int16_t **v, \
> + uint8_t *dst, int dstWidth)
> +
> +YUV2NV_DECL(nv12, avx2);
> +YUV2NV_DECL(nv21, avx2);
> +
> av_cold void ff_sws_init_swscale_x86(SwsContext *c)
> {
> int cpu_flags = av_get_cpu_flags();
> @@ -580,4 +589,19 @@ switch(c->dstBpc){ \
> break;
> }
> }
> +
> + if (EXTERNAL_AVX2(cpu_flags)) {
EXTERNAL_AVX2_FAST(cpu_flags). Otherwise these will be used on AMD
Excavator CPUs, which are very slow with ymm instructions.
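i.e.

    if (EXTERNAL_AVX2_FAST(cpu_flags)) {

so those CPUs keep using the existing code paths.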
> + switch (c->dstFormat) {
> + case AV_PIX_FMT_NV12:
> + case AV_PIX_FMT_NV24:
> + c->yuv2nv12cX = ff_yuv2nv12cX_avx2;
> + break;
> + case AV_PIX_FMT_NV21:
> + case AV_PIX_FMT_NV42:
> + c->yuv2nv12cX = ff_yuv2nv21cX_avx2;
> + break;
> + default:
> + break;
> + }
> + }
> }
>