[FFmpeg-devel] [PATCH 01/10] diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped
James Almer
jamrial at gmail.com
Fri Jun 24 17:21:11 CEST 2016
On 6/24/2016 8:44 AM, Rostislav Pehlivanov wrote:
> From 86ecebfe70509329d6f5b8a587ae79d19f9c8154 Mon Sep 17 00:00:00 2001
> From: Rostislav Pehlivanov <rpehlivanov at ob-encoder.com>
> Date: Thu, 23 Jun 2016 18:06:55 +0100
> Subject: [PATCH 1/2] diracdsp: add SIMD for the 10 bit version of
> put_signed_rect_clamped
>
> Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
> ---
> libavcodec/x86/diracdsp.asm | 45 ++++++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/diracdsp_init.c | 10 ++++++++++
> 2 files changed, 55 insertions(+)
>
> diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> index a042413..a0d6788 100644
> --- a/libavcodec/x86/diracdsp.asm
> +++ b/libavcodec/x86/diracdsp.asm
> @@ -22,6 +22,8 @@
>
> SECTION_RODATA
> pw_7: times 8 dw 7
> +convert_to_unsigned_10bit: times 4 dd 0x200
> +clip_10bit: times 8 dw 0x3ff
>
> cextern pw_3
> cextern pw_16
> @@ -263,3 +265,46 @@ ADD_RECT sse2
> HPEL_FILTER sse2
> ADD_OBMC 32, sse2
> ADD_OBMC 16, sse2
> +
> +%if ARCH_X86_64 == 1
> +INIT_XMM sse4
> +
> +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
> +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h
> +
> + mov r6, srcq
> + mov r7, dstq
> + mov r8, wq
> + pxor m2, m2
> + mova m3, [clip_10bit]
> + mova m4, [convert_to_unsigned_10bit]
> +
> + .loop_h:
> + mov srcq, r6
> + mov dstq, r7
> + mov wq, r8
> +
> + .loop_w:
> + movu m0, [srcq+0*mmsize]
> + movu m1, [srcq+1*mmsize]
> +
> + paddd m0, m4
> + paddd m1, m4
> + packusdw m0, m0, m1
> + CLIPW m0, m2, m3 ; packusdw saturates so it's fine
> +
> + movu [dstq], m0
> +
> + add srcq, 2*mmsize
> + add dstq, 1*mmsize
> + sub wq, 8
> + jl .loop_w
Since you're subtracting from w now, this should be a jump-if-greater (jg).
Also, use wd, not wq, since it comes from the stack on Win64. With MSVC
x86_64, afaik, there's no guarantee that the upper half of the register
is zeroed.
> +
> + add r6, src_strideq
> + add r7, dst_strideq
> + sub hq, 1
> + jl .loop_h
Ditto.
The alternative, as I said before, is to just change the prototypes to
use ptrdiff_t instead of int.
> +
> + RET
> +
> +%endif
> diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
> index 5fae798..7fa554e 100644
> --- a/libavcodec/x86/diracdsp_init.c
> +++ b/libavcodec/x86/diracdsp_init.c
> @@ -46,6 +46,10 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src,
> void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
> void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
>
> +#if ARCH_X86_64
> +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
> +#endif
> +
> #if HAVE_YASM
>
> #define HPEL_FILTER(MMSIZE, EXT) \
> @@ -184,4 +188,10 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
> c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
> c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
> }
> +
> +#if ARCH_X86_64
> + if (EXTERNAL_SSE4(mm_flags)) {
> + c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
> + }
> +#endif
> }
> --
More information about the ffmpeg-devel
mailing list