[FFmpeg-devel] [PATCH 01/10] diracdsp: add SIMD for the 10 bit version of put_signed_rect_clamped
James Almer
jamrial at gmail.com
Thu Jun 23 21:57:18 CEST 2016
On 6/23/2016 2:06 PM, Rostislav Pehlivanov wrote:
> Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
> ---
> libavcodec/x86/diracdsp.asm | 47 ++++++++++++++++++++++++++++++++++++++++++
> libavcodec/x86/diracdsp_init.c | 6 ++++++
> 2 files changed, 53 insertions(+)
>
> diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> index a042413..9db7b67 100644
> --- a/libavcodec/x86/diracdsp.asm
> +++ b/libavcodec/x86/diracdsp.asm
> @@ -22,6 +22,8 @@
>
> SECTION_RODATA
> pw_7: times 8 dw 7
> +convert_to_unsigned_10bit: times 4 dd 0x200
> +clip_10bit: times 8 dw 0x3ff
>
> cextern pw_3
> cextern pw_16
> @@ -172,6 +174,48 @@ cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w,
> RET
> %endm
>
> +%macro PUT_RECT_10 0
> +; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
> +cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src, src_stride, w, h
This is x86_64 only. Either add the relevant pre-processor checks here
and to the init file, or make the necessary changes to make it work
on x86_32.
Look at the 8-bit version of put_signed_rect_clamped for an example of
how to deal with this using the stack.
> +
> + neg wq
> + neg hq
Why? You're not using these as part of effective addresses, just as
counters. Keep them as is and just do sub instead of add in the loops
below.
For that matter, you'd need to sign extend these with movsxd before
negating them, or change the prototype and make them ptrdiff_t instead
of int.
> + mov r6, srcq
> + mov r7, dstq
> + mov r8, wq
> + pxor m2, m2
> + mova m3, [clip_10bit]
> + mova m4, [convert_to_unsigned_10bit]
> +
> + .loop_h:
> + mov srcq, r6
> + mov dstq, r7
> + mov wq, r8
> +
> + .loop_w:
> + movu m0, [srcq+0*mmsize]
> + movu m1, [srcq+1*mmsize]
> +
> + paddd m0, m4
> + paddd m1, m4
> + packusdw m0, m0, m1
> + CLIPW m0, m2, m3 ; packusdw saturates so it's fine
Would be nice if you could make this work with SSE2 as well.
There are some examples of packusdw SSE2 emulation in the codebase.
> +
> + movu [dstq], m0
> +
> + add srcq, 2*mmsize
> + add dstq, 1*mmsize
> + add wq, 8
> + jl .loop_w
> +
> + add r6, src_strideq
> + add r7, dst_strideq
> + add hq, 1
Make sure to do "sub wd, 8" and "sub hd, 1" after removing the above
negs if you don't change the prototype.
> + jl .loop_h
> +
> + RET
> +%endm
> +
> %macro ADD_RECT 1
> ; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
> cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
> @@ -263,3 +307,6 @@ ADD_RECT sse2
> HPEL_FILTER sse2
> ADD_OBMC 32, sse2
> ADD_OBMC 16, sse2
> +
> +INIT_XMM sse4
> +PUT_RECT_10
No need to make it a macro if it's going to be a single version.
If you add an SSE2 one then this would make sense.
> diff --git a/libavcodec/x86/diracdsp_init.c b/libavcodec/x86/diracdsp_init.c
> index 5fae798..4786eea 100644
> --- a/libavcodec/x86/diracdsp_init.c
> +++ b/libavcodec/x86/diracdsp_init.c
> @@ -46,6 +46,8 @@ void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src,
> void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
> void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
>
> +void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height);
> +
> #if HAVE_YASM
>
> #define HPEL_FILTER(MMSIZE, EXT) \
> @@ -184,4 +186,8 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
> c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
> c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
> }
> +
> + if (EXTERNAL_SSE4(mm_flags)) {
> + c->put_signed_rect_clamped[1] = ff_put_signed_rect_clamped_10_sse4;
> + }
> }
>
More information about the ffmpeg-devel
mailing list