[FFmpeg-devel] [PATCH v4 4/5] avcodec/x86/diracdsp: migrate last remaining MMX function to SSE2

Kyosuke Kawakami kawakami150708 at gmail.com
Thu Nov 14 20:25:34 EET 2024


The add_dirac_obmc8_mmx function was the only MMX function left. This
patch migrates it to SSE2.

Here are the checkasm benchmark results:

diracdsp.add_dirac_obmc_8_c:    2299.1 ( 1.00x)
diracdsp.add_dirac_obmc_8_mmx:   237.6 ( 9.68x)
diracdsp.add_dirac_obmc_8_sse2:  109.1 (21.07x)

Signed-off-by: Kyosuke Kawakami <kawakami150708 at gmail.com>
---
 libavcodec/x86/diracdsp.asm    | 24 ++++++++++++++++++++----
 libavcodec/x86/diracdsp_init.c | 10 +++-------
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index a653fa04de..6ae7f888b3 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -228,7 +228,7 @@ cglobal add_dirac_obmc%1_%2, 5,5,5, dst, src, stride, obmc, yblen
     punpckhbw   m1, m4
     mova        m2, [obmcq+i]
     mova        m3, m2
-   punpcklbw   m2, m4
+    punpcklbw   m2, m4
     punpckhbw   m3, m4
     pmullw      m0, m2
     pmullw      m1, m3
@@ -248,9 +248,6 @@ cglobal add_dirac_obmc%1_%2, 5,5,5, dst, src, stride, obmc, yblen
     RET
 %endm
 
-INIT_MMX
-ADD_OBMC 8, mmx
-
 INIT_XMM
 PUT_RECT sse2
 ADD_RECT sse2
@@ -259,6 +256,25 @@ HPEL_FILTER sse2
 ADD_OBMC 32, sse2
 ADD_OBMC 16, sse2
 
+cglobal add_dirac_obmc8_sse2, 5,5,4, dst, src, stride, obmc, yblen ; per row of 8: dst[x] += src[x] * obmc[x], yblen rows
+    pxor        m3, m3                  ; m3 = 0, interleaved below to zero-extend bytes to words
+    movsxdifnidn strideq, strided       ; stride arrives as a 32-bit int; widen it for addressing
+.loop:                                  ; one 8-element row per iteration
+    movh        m0, [srcq]              ; low half of m0 = source bytes for this row
+    punpcklbw   m0, m3                  ; bytes -> words (high byte of each word is 0)
+    movh        m1, [obmcq]             ; low half of m1 = weight bytes for this row
+    punpcklbw   m1, m3                  ; bytes -> words
+    pmullw      m0, m1                  ; keep low 16 bits of each src * weight product
+    movu        m1, [dstq]              ; unaligned load of the current dst words
+    paddw       m0, m1                  ; accumulate into dst (16-bit wrapping add)
+    movu        [dstq], m0              ; unaligned store back to dst
+    lea         srcq, [srcq+strideq]    ; next src row: stride counted in bytes
+    lea         dstq, [dstq+2*strideq]  ; next dst row: 2 bytes per element (dst is uint16_t* in the C prototype)
+    add         obmcq, 32               ; weight rows are 32 bytes apart, same step the ADD_OBMC macro uses
+    sub         yblend, 1               ; one row done
+    jg          .loop                   ; signed test: continue while yblen > 0
+    RET
+
 INIT_XMM sse4
 
 ; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
index f678759dc0..08247133e1 100644
--- a/libavcodec/x86/diracdsp_init.c
+++ b/libavcodec/x86/diracdsp_init.c
@@ -24,8 +24,7 @@
 
 void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
 
-void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
-
+void ff_add_dirac_obmc8_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
 void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
 void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
 
@@ -94,15 +93,12 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
 #if HAVE_X86ASM
     int mm_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(mm_flags)) {
-        c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
-    }
-
     if (EXTERNAL_SSE2(mm_flags)) {
         c->dirac_hpel_filter = dirac_hpel_filter_sse2;
         c->add_rect_clamped = ff_add_rect_clamped_sse2;
         c->put_signed_rect_clamped[0] = (void *)ff_put_signed_rect_clamped_sse2;
 
+        c->add_dirac_obmc[0] = ff_add_dirac_obmc8_sse2;
         c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
         c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
 
@@ -116,5 +112,5 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
         c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
         c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
     }
-#endif
+#endif // HAVE_X86ASM
 }
-- 
2.47.0



More information about the ffmpeg-devel mailing list