[FFmpeg-devel] [PATCH] x86: vc1dsp: Convert vc1_inv_trans_*_dc to NASM format
Timothy Gu
timothygu99 at gmail.com
Mon Feb 1 00:28:40 CET 2016
---
libavcodec/x86/vc1dsp.asm | 98 ++++++++++++++++++++
libavcodec/x86/vc1dsp_init.c | 13 +++
libavcodec/x86/vc1dsp_mmx.c | 207 -------------------------------------------
3 files changed, 111 insertions(+), 207 deletions(-)
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index 6415a83..6d00650 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -395,3 +395,101 @@ cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
jnz .loop
REP_RET
%endif ; HAVE_MMX_INLINE
+
+%macro INV_TRANS_INIT 0 ; in: scaled dc in the block register; out: m0/m1 byte-splatted +dc/-dc, linesize3 = 3*linesize
+ movsxdifnidn linesizeq, linesized ; x86inc helper: widen the int linesize to pointer size where the two differ
+ movd m0, blockd ; dc fits in 32 bits; movd from a 32-bit GPR assembles on x86-32 and x86-64 (a 64-bit movq from a GPR is x86-64 only)
+ SPLATW m0, m0 ; replicate the low word (dc) into every word lane
+ pxor m1, m1
+ psubw m1, m0 ; m1 = 0 - dc in every word lane
+ packuswb m0, m0 ; unsigned-saturating pack to bytes: m0 = dc clamped to 0..255 (0 when dc is negative)
+ packuswb m1, m1 ; m1 = -dc clamped to 0..255 (0 when dc is positive)
+
+ DEFINE_ARGS dest, linesize, linesize3 ; block is no longer needed: its register is renamed linesize3
+ lea linesize3q, [linesizeq*3] ; offset of the fourth row
+%endmacro
+
+%macro INV_TRANS_PROCESS 1 ; add the dc held in m0/m1 to four rows at destq; %1 = mov suffix for row loads/stores (callers pass h or a)
+ mov%1 m2, [destq+linesizeq*0] ; load rows 0..3 of dest into m2..m5
+ mov%1 m3, [destq+linesizeq*1]
+ mov%1 m4, [destq+linesizeq*2]
+ mov%1 m5, [destq+linesize3q]
+ paddusb m2, m0 ; add m0 to each byte with unsigned saturation
+ paddusb m3, m0
+ paddusb m4, m0
+ paddusb m5, m0
+ psubusb m2, m1 ; subtract m1 from each byte with unsigned saturation
+ psubusb m3, m1
+ psubusb m4, m1
+ psubusb m5, m1
+ mov%1 [linesizeq*0+destq], m2 ; write the four updated rows back in place
+ mov%1 [linesizeq*1+destq], m3
+ mov%1 [linesizeq*2+destq], m4
+ mov%1 [linesize3q +destq], m5
+%endmacro
+
+; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, int linesize, int16_t *block)
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block ; 3 args, 4 GPRs (r3 is scratch), no vector regs declared
+ movsx r3, WORD [blockq] ; r3 = block[0], sign-extended from 16 bits
+ mov blockq, r3 ; dc (the block pointer is dead from here on)
+ shl blockq, 4 ; 16 * dc
+ lea blockq, [blockq+r3+4] ; 17 * dc + 4
+ sar blockq, 3 ; >> 3
+ mov r3, blockq ; dc (result of the first scaling stage)
+ shl blockq, 4 ; 16 * dc
+ lea blockq, [blockq+r3+64] ; 17 * dc + 64
+ sar blockq, 7 ; >> 7
+
+ INV_TRANS_INIT ; expects the final dc in the block register
+
+ INV_TRANS_PROCESS h ; one pass covers the four rows of the block, using movh accesses
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block ; 3 args, 4 GPRs (r3 is scratch), no vector regs declared
+ movsx r3, WORD [blockq] ; r3 = block[0], sign-extended from 16 bits
+ mov blockq, r3 ; dc (the block pointer is dead from here on)
+ shl blockq, 4 ; 16 * dc
+ lea blockq, [blockq+r3+4] ; 17 * dc + 4
+ sar blockq, 3 ; >> 3
+ shl blockq, 2 ; 4 * dc
+ lea blockq, [blockq*3+64] ; 12 * dc + 64
+ sar blockq, 7 ; >> 7
+
+ INV_TRANS_INIT ; expects the final dc in the block register
+
+ INV_TRANS_PROCESS h ; rows 0..3, using movh accesses
+ lea destq, [destq+linesizeq*4] ; advance dest by four rows
+ INV_TRANS_PROCESS h ; rows 4..7
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block ; 3 args, 4 GPRs (r3 is scratch), no vector regs declared
+ movsx blockq, WORD [blockq] ; dc = block[0], sign-extended; overwrites the block pointer
+ lea blockq, [blockq*3+1] ; 3 * dc + 1
+ sar blockq, 1 ; >> 1
+ mov r3, blockq ; dc (result of the first scaling stage)
+ shl blockq, 4 ; 16 * dc
+ lea blockq, [blockq+r3+64] ; 17 * dc + 64
+ sar blockq, 7 ; >> 7
+
+ INV_TRANS_INIT ; expects the final dc in the block register
+
+ INV_TRANS_PROCESS a ; one pass covers the four rows of the block, using mova accesses
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block ; 3 args, 3 GPRs (no scratch register needed), no vector regs declared
+ movsx blockq, WORD [blockq] ; dc = block[0], sign-extended; overwrites the block pointer
+ lea blockq, [blockq*3+1] ; 3 * dc + 1
+ sar blockq, 1 ; >> 1
+ lea blockq, [blockq*3+16] ; 3 * dc + 16
+ sar blockq, 5 ; >> 5
+
+ INV_TRANS_INIT ; expects the final dc in the block register
+
+ INV_TRANS_PROCESS a ; rows 0..3, using mova accesses
+ lea destq, [destq+linesizeq*4] ; advance dest by four rows
+ INV_TRANS_PROCESS a ; rows 4..7
+ RET
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 1747305..c8943fa 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -92,6 +92,14 @@ void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
+void ff_vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
+ int16_t *block); /* asm: cglobal vc1_inv_trans_4x4_dc in vc1dsp.asm; reads block[0], updates dest in place */
+void ff_vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
+ int16_t *block); /* asm: cglobal vc1_inv_trans_4x8_dc in vc1dsp.asm; reads block[0], updates dest in place */
+void ff_vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
+ int16_t *block); /* asm: cglobal vc1_inv_trans_8x4_dc in vc1dsp.asm; reads block[0], updates dest in place */
+void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
+ int16_t *block); /* asm: cglobal vc1_inv_trans_8x8_dc in vc1dsp.asm; reads block[0], updates dest in place */
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
@@ -130,6 +138,11 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
dsp->avg_vc1_mspel_pixels_tab[1][0] = avg_vc1_mspel_mc00_8_mmxext;
dsp->avg_vc1_mspel_pixels_tab[0][0] = avg_vc1_mspel_mc00_16_mmxext;
+
+ dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_mmxext;
+ dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_mmxext;
+ dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_mmxext;
+ dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index c268cc6..ff13d9b 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -481,208 +481,6 @@ DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
-static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
- int16_t *block)
-{
- int dc = block[0];
- dc = (17 * dc + 4) >> 3;
- dc = (17 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
-}
-
-static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
- int16_t *block)
-{
- int dc = block[0];
- dc = (17 * dc + 4) >> 3;
- dc = (12 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
- dest += 4*linesize;
- __asm__ volatile(
- "movd %0, %%mm2 \n\t"
- "movd %1, %%mm3 \n\t"
- "movd %2, %%mm4 \n\t"
- "movd %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movd %%mm2, %0 \n\t"
- "movd %%mm3, %1 \n\t"
- "movd %%mm4, %2 \n\t"
- "movd %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
-}
-
-static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
- int16_t *block)
-{
- int dc = block[0];
- dc = ( 3 * dc + 1) >> 1;
- dc = (17 * dc + 64) >> 7;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
-}
-
-static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
- int16_t *block)
-{
- int dc = block[0];
- dc = (3 * dc + 1) >> 1;
- dc = (3 * dc + 16) >> 5;
- __asm__ volatile(
- "movd %0, %%mm0 \n\t"
- "pshufw $0, %%mm0, %%mm0 \n\t"
- "pxor %%mm1, %%mm1 \n\t"
- "psubw %%mm0, %%mm1 \n\t"
- "packuswb %%mm0, %%mm0 \n\t"
- "packuswb %%mm1, %%mm1 \n\t"
- ::"r"(dc)
- );
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
- dest += 4*linesize;
- __asm__ volatile(
- "movq %0, %%mm2 \n\t"
- "movq %1, %%mm3 \n\t"
- "movq %2, %%mm4 \n\t"
- "movq %3, %%mm5 \n\t"
- "paddusb %%mm0, %%mm2 \n\t"
- "paddusb %%mm0, %%mm3 \n\t"
- "paddusb %%mm0, %%mm4 \n\t"
- "paddusb %%mm0, %%mm5 \n\t"
- "psubusb %%mm1, %%mm2 \n\t"
- "psubusb %%mm1, %%mm3 \n\t"
- "psubusb %%mm1, %%mm4 \n\t"
- "psubusb %%mm1, %%mm5 \n\t"
- "movq %%mm2, %0 \n\t"
- "movq %%mm3, %1 \n\t"
- "movq %%mm4, %2 \n\t"
- "movq %%mm5, %3 \n\t"
- :"+m"(*(uint32_t*)(dest+0*linesize)),
- "+m"(*(uint32_t*)(dest+1*linesize)),
- "+m"(*(uint32_t*)(dest+2*linesize)),
- "+m"(*(uint32_t*)(dest+3*linesize))
- );
-}
-
#define FN_ASSIGN(OP, X, Y, INSN) \
dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
@@ -729,10 +527,5 @@ av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
FN_ASSIGN(avg_, 3, 1, _mmxext);
FN_ASSIGN(avg_, 3, 2, _mmxext);
FN_ASSIGN(avg_, 3, 3, _mmxext);
-
- dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
- dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
- dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
- dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */
--
2.1.4
More information about the ffmpeg-devel
mailing list