[FFmpeg-devel] [PATCH] x86/vf_w3fdif: 32-bit compatibility for w3fdif_simple_high
Hendrik Leppkes
h.leppkes at gmail.com
Thu Jan 7 03:54:30 CET 2016
---
Based on an idea from Ronald, mentioned in an earlier thread about this function.
It works and passes FATE; however, I'm sure some aspects can be done more simply or more cleanly, so please let me know.
libavfilter/x86/vf_w3fdif.asm | 37 ++++++++++++++++++++++++++++++++++---
libavfilter/x86/vf_w3fdif_init.c | 2 +-
2 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm
index c3c73ea..35768c3 100644
--- a/libavfilter/x86/vf_w3fdif.asm
+++ b/libavfilter/x86/vf_w3fdif.asm
@@ -102,14 +102,22 @@ cglobal w3fdif_complex_low, 4, 7, 8, 0, work_line, in_lines_cur0, coef, linesize
REP_RET
%if ARCH_X86_64
-
cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
+%else
+cglobal w3fdif_simple_high, 4, 7, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
+%endif
movq m2, [coefq]
- DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
+%if ARCH_X86_64
+ DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, linesize, offset, in_lines_cur1, in_lines_cur2, in_lines_adj1, in_lines_adj2
+ mov offsetq, 0
+%else
+ DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, in_lines_cur2, in_lines_adj1, in_lines_adj2
+ %define linesized dword r4m
+%endif
+
pshufd m0, m2, q0000
SPLATW m2, m2, 2
pxor m7, m7
- mov offsetq, 0
mov in_lines_cur2q, [in_lines_cur0q+gprsize*2]
mov in_lines_cur1q, [in_lines_cur0q+gprsize]
mov in_lines_cur0q, [in_lines_cur0q]
@@ -117,8 +125,21 @@ cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0,
mov in_lines_adj1q, [in_lines_adj0q+gprsize]
mov in_lines_adj0q, [in_lines_adj0q]
+%if ARCH_X86_32
+ sub in_lines_cur1q, in_lines_cur0q
+ sub in_lines_cur2q, in_lines_cur0q
+ sub in_lines_adj0q, in_lines_cur0q
+ sub in_lines_adj1q, in_lines_cur0q
+ sub in_lines_adj2q, in_lines_cur0q
+ %define offsetq in_lines_cur0q
+%endif
+
.loop:
+%if ARCH_X86_64
movh m3, [in_lines_cur0q+offsetq]
+%else
+ movh m3, [in_lines_cur0q]
+%endif
movh m4, [in_lines_cur1q+offsetq]
punpcklbw m3, m7
punpcklbw m4, m7
@@ -143,15 +164,25 @@ cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0,
pmaddwd m6, m2
paddd m3, m5
paddd m4, m6
+%if ARCH_X86_64
paddd m3, [work_lineq+offsetq*4]
paddd m4, [work_lineq+offsetq*4+mmsize]
mova [work_lineq+offsetq*4], m3
mova [work_lineq+offsetq*4+mmsize], m4
+%else
+ paddd m3, [work_lineq]
+ paddd m4, [work_lineq+mmsize]
+ mova [work_lineq], m3
+ mova [work_lineq+mmsize], m4
+ add work_lineq, mmsize*2
+%endif
add offsetq, mmsize/2
sub linesized, mmsize/2
jg .loop
REP_RET
+%if ARCH_X86_64
+
cglobal w3fdif_complex_high, 5, 13, 10, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
movq m0, [coefq+0]
movd m4, [coefq+8]
diff --git a/libavfilter/x86/vf_w3fdif_init.c b/libavfilter/x86/vf_w3fdif_init.c
index 72ea657..9bf06e8 100644
--- a/libavfilter/x86/vf_w3fdif_init.c
+++ b/libavfilter/x86/vf_w3fdif_init.c
@@ -51,12 +51,12 @@ av_cold void ff_w3fdif_init_x86(W3FDIFDSPContext *dsp)
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->filter_simple_low = ff_w3fdif_simple_low_sse2;
+ dsp->filter_simple_high = ff_w3fdif_simple_high_sse2;
dsp->filter_complex_low = ff_w3fdif_complex_low_sse2;
dsp->filter_scale = ff_w3fdif_scale_sse2;
}
if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) {
- dsp->filter_simple_high = ff_w3fdif_simple_high_sse2;
dsp->filter_complex_high = ff_w3fdif_complex_high_sse2;
}
}
--
2.6.2.windows.1
More information about the ffmpeg-devel
mailing list