[FFmpeg-devel] [PATCH] x86/diracdsp: make ff_put_signed_rect_clamped_10_sse4 work on x86_32
Rostislav Pehlivanov
atomnuker at gmail.com
Wed Jul 20 19:28:42 EEST 2016
On 20 July 2016 at 02:40, James Almer <jamrial at gmail.com> wrote:
> Signed-off-by: James Almer <jamrial at gmail.com>
> ---
> libavcodec/x86/diracdsp.asm | 37 ++++++++++++++++++++-----------------
> libavcodec/x86/diracdsp_init.c | 4 ----
> 2 files changed, 20 insertions(+), 21 deletions(-)
>
> diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> index d86b543..6b3f780 100644
> --- a/libavcodec/x86/diracdsp.asm
> +++ b/libavcodec/x86/diracdsp.asm
> @@ -303,24 +303,30 @@ cglobal dequant_subband_32, 7, 7, 4, src, dst,
> stride, qf, qs, tot_v, tot_h
>
> RET
>
> -%if ARCH_X86_64 == 1
> +INIT_XMM sse4
> ; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const
> uint8_t *src, int src_stride, int width, int height)
> -cglobal put_signed_rect_clamped_10, 6, 9, 6, dst, dst_stride, src,
> src_stride, w, h
> - mov r6, srcq
> - mov r7, dstq
> - mov r8, wq
> +%if ARCH_X86_64
> +cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src,
> src_stride, w, h, t1, t2
> +%else
> +cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src,
> src_stride, w, t1, t2
> + %define hd r5mp
> +%endif
> + shl wd, 2
> + add srcq, wq
> + neg wq
> + mov t2q, dstq
> + mov t1q, wq
> pxor m2, m2
> mova m3, [clip_10bit]
> mova m4, [convert_to_unsigned_10bit]
>
> .loop_h:
> - mov srcq, r6
> - mov dstq, r7
> - mov wq, r8
> + mov dstq, t2q
> + mov wq, t1q
>
> .loop_w:
> - movu m0, [srcq+0*mmsize]
> - movu m1, [srcq+1*mmsize]
> + movu m0, [srcq+wq+0*mmsize]
> + movu m1, [srcq+wq+1*mmsize]
>
> paddd m0, m4
> paddd m1, m4
> @@ -329,16 +335,13 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst,
> dst_stride, src, src_stride, w
>
> movu [dstq], m0
>
> - add srcq, 2*mmsize
> add dstq, 1*mmsize
> - sub wd, 8
> - jg .loop_w
> + add wq, 2*mmsize
> + jl .loop_w
>
> - add r6, src_strideq
> - add r7, dst_strideq
> + add srcq, src_strideq
> + add t2q, dst_strideq
> sub hd, 1
> jg .loop_h
>
> RET
> -
> -%endif
> diff --git a/libavcodec/x86/diracdsp_init.c
> b/libavcodec/x86/diracdsp_init.c
> index d7c7cd1..b195113 100644
> --- a/libavcodec/x86/diracdsp_init.c
> +++ b/libavcodec/x86/diracdsp_init.c
> @@ -45,9 +45,7 @@ void ff_put_rect_clamped_mmx(uint8_t *dst, int
> dst_stride, const int16_t *src, i
> void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t
> *src, int src_stride, int width, int height);
> void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const
> int16_t *src, int src_stride, int width, int height);
> void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const
> int16_t *src, int src_stride, int width, int height);
> -#if ARCH_X86_64
> void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride,
> const uint8_t *src, int src_stride, int width, int height);
> -#endif
>
> void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t
> stride, const int qf, const int qs, int tot_v, int tot_h);
>
> @@ -192,8 +190,6 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
>
> if (EXTERNAL_SSE4(mm_flags)) {
> c->dequant_subband[1] = ff_dequant_subband_32_sse4;
> -#if ARCH_X86_64
> c->put_signed_rect_clamped[1] =
> ff_put_signed_rect_clamped_10_sse4;
> -#endif
> }
> }
> --
> 2.9.1
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
Very nice, thanks.
Push whenever you have the time.
More information about the ffmpeg-devel mailing list