[FFmpeg-devel] [PATCH v4 4/5] avcodec/x86/diracdsp: migrate last remaining MMX function to SSE2
Kyosuke Kawakami
kawakami150708 at gmail.com
Thu Nov 14 20:25:34 EET 2024
add_dirac_obmc8_mmx was the only MMX function left in the x86 diracdsp
code. This patch replaces it with an SSE2 implementation.
Here are the checkasm benchmark results:
diracdsp.add_dirac_obmc_8_c: 2299.1 ( 1.00x)
diracdsp.add_dirac_obmc_8_mmx: 237.6 ( 9.68x)
diracdsp.add_dirac_obmc_8_sse2: 109.1 (21.07x)
Signed-off-by: Kyosuke Kawakami <kawakami150708 at gmail.com>
---
libavcodec/x86/diracdsp.asm | 24 ++++++++++++++++++++----
libavcodec/x86/diracdsp_init.c | 10 +++-------
2 files changed, 23 insertions(+), 11 deletions(-)
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index a653fa04de..6ae7f888b3 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -228,7 +228,7 @@ cglobal add_dirac_obmc%1_%2, 5,5,5, dst, src, stride, obmc, yblen
punpckhbw m1, m4
mova m2, [obmcq+i]
mova m3, m2
- punpcklbw m2, m4
+ punpcklbw m2, m4
punpckhbw m3, m4
pmullw m0, m2
pmullw m1, m3
@@ -248,9 +248,6 @@ cglobal add_dirac_obmc%1_%2, 5,5,5, dst, src, stride, obmc, yblen
RET
%endm
-INIT_MMX
-ADD_OBMC 8, mmx
-
INIT_XMM
PUT_RECT sse2
ADD_RECT sse2
@@ -259,6 +256,25 @@ HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2
+cglobal add_dirac_obmc8_sse2, 5,5,4, dst, src, stride, obmc, yblen ; per row: dst[0..7] += src[0..7] * obmc[0..7] in wrapping 16-bit arithmetic, for yblen rows
+ pxor m3, m3 ; m3 = 0, used below to zero-extend bytes to words
+ movsxdifnidn strideq, strided ; stride is an int in the C prototype; sign-extend it to pointer width where needed
+.loop: ; one iteration processes one row
+ movh m0, [srcq] ; load src bytes into the low half of m0 (NOTE(review): 8 bytes under INIT_XMM - confirm)
+ punpcklbw m0, m3 ; interleave with zero: low 8 bytes -> 8 unsigned words
+ movh m1, [obmcq] ; load the matching weight bytes
+ punpcklbw m1, m3 ; zero-extend the weights to words
+ pmullw m0, m1 ; m0 = src * weight, keeping the low 16 bits of each product
+ movu m1, [dstq] ; unaligned load of the current dst row
+ paddw m0, m1 ; add the products to the dst values (wraps modulo 2^16)
+ movu [dstq], m0 ; store the updated row back to dst
+ lea srcq, [srcq+strideq] ; next src row: advance by stride bytes
+ lea dstq, [dstq+2*strideq] ; next dst row: advance by 2*stride bytes (dst is uint16_t in the C prototype)
+ add obmcq, 32 ; next weight row: fixed step of 32 bytes
+ sub yblend, 1 ; one row done
+ jg .loop ; repeat while yblen is still > 0
+ RET
+
INIT_XMM sse4
; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index f678759dc0..08247133e1 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -24,8 +24,7 @@
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
-void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
-
+void ff_add_dirac_obmc8_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
@@ -94,15 +93,12 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
#if HAVE_X86ASM
int mm_flags = av_get_cpu_flags();
- if (EXTERNAL_MMX(mm_flags)) {
- c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
- }
-
if (EXTERNAL_SSE2(mm_flags)) {
c->dirac_hpel_filter = dirac_hpel_filter_sse2;
c->add_rect_clamped = ff_add_rect_clamped_sse2;
c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
+ c->add_dirac_obmc[0] = ff_add_dirac_obmc8_sse2;
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
@@ -116,5 +112,5 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
c->dequant_subband[1] = ff_dequant_subband_32_sse4;
c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
}
-#endif
+#endif // HAVE_X86ASM
}
--
2.47.0
More information about the ffmpeg-devel
mailing list