[FFmpeg-devel] [PATCH v3 2/3] avcodec/x86/diracdsp: migrate last remaining MMX function to SSE2
James Almer
jamrial at gmail.com
Thu Nov 14 17:21:45 EET 2024
On 11/14/2024 11:30 AM, Kyosuke Kawakami wrote:
> The add_dirac_obmc8_mmx function was the only MMX function left. This
> patch migrates it to SSE2.
>
> Here are the checkasm benchmark results:
>
> diracdsp.add_dirac_obmc_8_c: 2299.1 ( 1.00x)
> diracdsp.add_dirac_obmc_8_mmx: 237.6 ( 9.68x)
> diracdsp.add_dirac_obmc_8_sse2: 109.1 (21.07x)
>
> Signed-off-by: Kyosuke Kawakami <kawakami150708 at gmail.com>
> ---
> libavcodec/x86/diracdsp.asm | 23 +++++++++++++++++++----
> libavcodec/x86/diracdsp_init.c | 10 +++-------
> 2 files changed, 22 insertions(+), 11 deletions(-)
>
> diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> index e5e2b11846..e708400b66 100644
> --- a/libavcodec/x86/diracdsp.asm
> +++ b/libavcodec/x86/diracdsp.asm
> @@ -227,7 +227,7 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
> punpckhbw m1, m4
> mova m2, [obmcq+i]
> mova m3, m2
> - punpcklbw m2, m4
> + punpcklbw m2, m4
> punpckhbw m3, m4
> pmullw m0, m2
> pmullw m1, m3
> @@ -247,9 +247,6 @@ cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
> RET
> %endm
>
> -INIT_MMX
> -ADD_OBMC 8, mmx
> -
> INIT_XMM
> PUT_RECT sse2
> ADD_RECT sse2
> @@ -258,6 +255,24 @@ HPEL_FILTER sse2
> ADD_OBMC 32, sse2
> ADD_OBMC 16, sse2
>
> +cglobal add_dirac_obmc8_sse2, 6,6,5, dst, src, stride, obmc, yblen
You're loading 5 GPRs and using only 5 too, not 6, so the argument and register counts here should be 5,5 rather than 6,6.
> + pxor m4, m4
Add...
movsxdifnidn strideq, strided
...here, otherwise the tests will fail on Windows x86_64 (stride is passed
as a 32-bit int, so the upper 32 bits of the register are garbage).
And while you're at it, also make both of these changes (the register count
and the stride sign extension) to the other two ADD_OBMC functions generated
by the macro above.
> +.loop:
> + movh m0, [srcq]
> + punpcklbw m0, m4
> + movh m1, [obmcq]
> + punpcklbw m1, m4
> + pmullw m0, m1
> + movu m1, [dstq]
> + paddw m0, m1
> + movu [dstq], m0
> + lea srcq, [srcq+strideq]
> + lea dstq, [dstq+2*strideq]
> + add obmcq, 32
> + sub yblend, 1
> + jg .loop
> + RET
> +
> INIT_XMM sse4
>
> ; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
> diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
> index f678759dc0..08247133e1 100644
> --- a/libavcodec/x86/diracdsp_init.c
> +++ b/libavcodec/x86/diracdsp_init.c
> @@ -24,8 +24,7 @@
>
> void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
>
> -void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
> -
> +void ff_add_dirac_obmc8_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
> void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
> void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
>
> @@ -94,15 +93,12 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
> #if HAVE_X86ASM
> int mm_flags = av_get_cpu_flags();
>
> - if (EXTERNAL_MMX(mm_flags)) {
> - c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
> - }
> -
> if (EXTERNAL_SSE2(mm_flags)) {
> c->dirac_hpel_filter = dirac_hpel_filter_sse2;
> c->add_rect_clamped = ff_add_rect_clamped_sse2;
> c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
>
> + c->add_dirac_obmc[0] = ff_add_dirac_obmc8_sse2;
> c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
> c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
>
> @@ -116,5 +112,5 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
> c->dequant_subband[1] = ff_dequant_subband_32_sse4;
> c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
> }
> -#endif
> +#endif // HAVE_X86ASM
> }
-------------- next part --------------
A non-text attachment was scrubbed...
Name: OpenPGP_signature.asc
Type: application/pgp-signature
Size: 495 bytes
Desc: OpenPGP digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20241114/cae20c0d/attachment.sig>
More information about the ffmpeg-devel
mailing list