[FFmpeg-devel] [PATCH 3/3] avfilter/x86/vf_eq: add SSE2 version
James Almer
jamrial at gmail.com
Tue Sep 17 16:56:12 EEST 2019
On 9/17/2019 10:39 AM, Ting Fu wrote:
> Signed-off-by: Ting Fu <ting.fu at intel.com>
> ---
> libavfilter/x86/vf_eq.asm | 19 +++++++++++++++++--
> libavfilter/x86/vf_eq_init.c | 20 ++++++++++++++++++++
> 2 files changed, 37 insertions(+), 2 deletions(-)
>
> diff --git a/libavfilter/x86/vf_eq.asm b/libavfilter/x86/vf_eq.asm
> index bf28691297..d6b51cf6df 100644
> --- a/libavfilter/x86/vf_eq.asm
> +++ b/libavfilter/x86/vf_eq.asm
> @@ -24,14 +24,21 @@
>
> SECTION .text
>
> -INIT_MMX mmx
> +%macro PROCESS_ONE_LINE 1
> cglobal process_one_line, 5, 7, 5, src, dst, contrast, brightness, w
> movd m3, contrastd
> movd m4, brightnessd
> movsx r5d, contrastw
> movsx r6d, brightnessw
> +%if mmsize == 8
> pshufw m3, m3, 0
> pshufw m4, m4, 0
pshufw isn't mmx, but mmxext (AKA, integer SSE). Hardly a problem since
i don't think anyone using a Pentium 2 will use this filter, but still
it's technically wrong.
The function should be changed into mmxext to reflect the above.
> +%elif mmsize == 16
> + pshuflw m3, m3, 0
> + movlhps m3, m3
> + pshuflw m4, m4, 0
> + movlhps m4, m4
> +%endif
You can use the SPLATW macro instead. It will expand to the correct
instructions based on what set is enabled.
>
> DEFINE_ARGS src, dst, tmp, scalar, w
> xor tmpd, tmpd
> @@ -39,7 +46,7 @@ cglobal process_one_line, 5, 7, 5, src, dst, contrast, brightness, w
> pxor m1, m1
> mov scalard, wd
> and scalard, mmsize-1
> - sar wd, 3
> + sar wd, %1
> cmp wd, 1
> jl .loop1
>
> @@ -80,3 +87,11 @@ cglobal process_one_line, 5, 7, 5, src, dst, contrast, brightness, w
>
> .end:
> RET
> +
> +%endmacro
> +
> +INIT_MMX mmx
> +PROCESS_ONE_LINE 3
> +
> +INIT_XMM sse2
> +PROCESS_ONE_LINE 4
> diff --git a/libavfilter/x86/vf_eq_init.c b/libavfilter/x86/vf_eq_init.c
> index 63c69078fb..cdd5272220 100644
> --- a/libavfilter/x86/vf_eq_init.c
> +++ b/libavfilter/x86/vf_eq_init.c
> @@ -28,6 +28,8 @@
>
> extern void ff_process_one_line_mmx(const uint8_t *src, uint8_t *dst, int contvec,
> int brvec, int w);
> +extern void ff_process_one_line_sse2(const uint8_t *src, uint8_t *dst, int contvec,
> + int brvec, int w);
>
> static void process_mmx(EQParameters *param, uint8_t *dst, int dst_stride,
> const uint8_t *src, int src_stride, int w, int h)
> @@ -44,6 +46,21 @@ static void process_mmx(EQParameters *param, uint8_t *dst, int dst_stride,
> emms_c();
> }
>
> +static void process_sse2(EQParameters *param, uint8_t *dst, int dst_stride,
> + const uint8_t *src, int src_stride, int w, int h)
> +{
> + short contrast = (short) (param->contrast * 256 * 16);
> + short brightness = ((short) (100.0 * param->brightness + 100.0) * 511)
> + / 200 - 128 - contrast / 32;
> +
> + while (h--) {
> + ff_process_one_line_sse2(src, dst, contrast, brightness, w);
> + src += src_stride;
> + dst += dst_stride;
> + }
> + emms_c();
No need for this since the SSE2 version uses xmm regs.
> +}
> +
> av_cold void ff_eq_init_x86(EQContext *eq)
> {
> int cpu_flags = av_get_cpu_flags();
> @@ -51,5 +68,8 @@ av_cold void ff_eq_init_x86(EQContext *eq)
> if (cpu_flags & AV_CPU_FLAG_MMX) {
> eq->process = process_mmx;
> }
> + if (cpu_flags & AV_CPU_FLAG_SSE2) {
> + eq->process = process_sse2;
> + }
> }
>
>
More information about the ffmpeg-devel
mailing list