[FFmpeg-devel] [PATCH] x86/vf_w3fdif: 32-bit compatibility for w3fdif_simple_high
James Almer
jamrial at gmail.com
Thu Jan 7 04:18:40 CET 2016
On 1/6/2016 11:54 PM, Hendrik Leppkes wrote:
> ---
> Based on an idea from Ronald mentioend in an earlier thread about this function.
>
> It works and passes FATE, however I'm sure some aspects can be done easier or cleaner, so please let me know.
>
>
> libavfilter/x86/vf_w3fdif.asm | 37 ++++++++++++++++++++++++++++++++++---
> libavfilter/x86/vf_w3fdif_init.c | 2 +-
> 2 files changed, 35 insertions(+), 4 deletions(-)
>
> diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm
> index c3c73ea..35768c3 100644
> --- a/libavfilter/x86/vf_w3fdif.asm
> +++ b/libavfilter/x86/vf_w3fdif.asm
> @@ -102,14 +102,22 @@ cglobal w3fdif_complex_low, 4, 7, 8, 0, work_line, in_lines_cur0, coef, linesize
> REP_RET
>
> %if ARCH_X86_64
> -
> cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
> +%else
> +cglobal w3fdif_simple_high, 4, 7, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
> +%endif
> movq m2, [coefq]
> - DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2
> +%if ARCH_X86_64
> + DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, linesize, offset, in_lines_cur1, in_lines_cur2, in_lines_adj1, in_lines_adj2
This broke x86_64. Leave it as it was above.
> + mov offsetq, 0
Since you're moving this take the chance to replace it with a xor.
> +%else
> + DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, in_lines_cur2, in_lines_adj1, in_lines_adj2
> + %define linesized dword r4m
Nit: r4mp instead of dword r4m
> +%endif
> +
> pshufd m0, m2, q0000
> SPLATW m2, m2, 2
> pxor m7, m7
> - mov offsetq, 0
> mov in_lines_cur2q, [in_lines_cur0q+gprsize*2]
> mov in_lines_cur1q, [in_lines_cur0q+gprsize]
> mov in_lines_cur0q, [in_lines_cur0q]
> @@ -117,8 +125,21 @@ cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0,
> mov in_lines_adj1q, [in_lines_adj0q+gprsize]
> mov in_lines_adj0q, [in_lines_adj0q]
>
> +%if ARCH_X86_32
> + sub in_lines_cur1q, in_lines_cur0q
> + sub in_lines_cur2q, in_lines_cur0q
> + sub in_lines_adj0q, in_lines_cur0q
> + sub in_lines_adj1q, in_lines_cur0q
> + sub in_lines_adj2q, in_lines_cur0q
> + %define offsetq in_lines_cur0q
> +%endif
> +
> .loop:
> +%if ARCH_X86_64
> movh m3, [in_lines_cur0q+offsetq]
> +%else
> + movh m3, [in_lines_cur0q]
> +%endif
> movh m4, [in_lines_cur1q+offsetq]
> punpcklbw m3, m7
> punpcklbw m4, m7
> @@ -143,15 +164,25 @@ cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0,
> pmaddwd m6, m2
> paddd m3, m5
> paddd m4, m6
> +%if ARCH_X86_64
> paddd m3, [work_lineq+offsetq*4]
> paddd m4, [work_lineq+offsetq*4+mmsize]
> mova [work_lineq+offsetq*4], m3
> mova [work_lineq+offsetq*4+mmsize], m4
> +%else
> + paddd m3, [work_lineq]
> + paddd m4, [work_lineq+mmsize]
> + mova [work_lineq], m3
> + mova [work_lineq+mmsize], m4
> + add work_lineq, mmsize*2
> +%endif
> add offsetq, mmsize/2
> sub linesized, mmsize/2
> jg .loop
> REP_RET
>
> +%if ARCH_X86_64
> +
> cglobal w3fdif_complex_high, 5, 13, 10, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize
> movq m0, [coefq+0]
> movd m4, [coefq+8]
> diff --git a/libavfilter/x86/vf_w3fdif_init.c b/libavfilter/x86/vf_w3fdif_init.c
> index 72ea657..9bf06e8 100644
> --- a/libavfilter/x86/vf_w3fdif_init.c
> +++ b/libavfilter/x86/vf_w3fdif_init.c
> @@ -51,12 +51,12 @@ av_cold void ff_w3fdif_init_x86(W3FDIFDSPContext *dsp)
>
> if (EXTERNAL_SSE2(cpu_flags)) {
> dsp->filter_simple_low = ff_w3fdif_simple_low_sse2;
> + dsp->filter_simple_high = ff_w3fdif_simple_high_sse2;
> dsp->filter_complex_low = ff_w3fdif_complex_low_sse2;
> dsp->filter_scale = ff_w3fdif_scale_sse2;
> }
>
> if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) {
> - dsp->filter_simple_high = ff_w3fdif_simple_high_sse2;
> dsp->filter_complex_high = ff_w3fdif_complex_high_sse2;
> }
> }
>
Seems to work. Maybe it can be improved but it should be good as is.
And to answer your question, no, i wasn't working on this. I got
distracted with other filters :P
More information about the ffmpeg-devel
mailing list