[FFmpeg-devel] [PATCH 6/8] lavc/x86/flac_dsp_gpl: partially unroll 32-bit LPC encoder
Rostislav Pehlivanov
atomnuker at gmail.com
Mon Nov 27 01:17:31 EET 2017
On 26 November 2017 at 22:51, James Darnley <james.darnley at gmail.com> wrote:
> Around 1.1 times faster and reduces runtime by up to 6%.
> ---
> libavcodec/x86/flac_dsp_gpl.asm | 91 ++++++++++++++++++++++++++++++++---------
> 1 file changed, 72 insertions(+), 19 deletions(-)
>
> diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
> index 952fc8b86b..91989ce560 100644
> --- a/libavcodec/x86/flac_dsp_gpl.asm
> +++ b/libavcodec/x86/flac_dsp_gpl.asm
> @@ -152,13 +152,13 @@ RET
> %macro FUNCTION_BODY_32 0
>
> %if ARCH_X86_64
> - cglobal flac_enc_lpc_32, 5, 7, 8, mmsize, res, smp, len, order, coefs
> + cglobal flac_enc_lpc_32, 5, 7, 8, mmsize*4, res, smp, len, order, coefs
>
Why x4? Shouldn't this be x2?
> DECLARE_REG_TMP 5, 6
> %define length r2d
>
> movsxd orderq, orderd
> %else
> - cglobal flac_enc_lpc_32, 5, 6, 8, mmsize, res, smp, len, order, coefs
> + cglobal flac_enc_lpc_32, 5, 6, 8, mmsize*4, res, smp, len, order,
> coefs
> DECLARE_REG_TMP 2, 5
> %define length r2mp
> %endif
> @@ -189,18 +189,23 @@ mova [rsp], m4 ; save sign extend mask
> %define negj t1q
>
> .looplen:
> + ; process "odd" samples
> pxor m0, m0
> pxor m4, m4
> pxor m6, m6
> mov posj, orderq
> xor negj, negj
>
> - .looporder:
> + .looporder1:
> movd m2, [coefsq+posj*4] ; c = coefs[j]
> SPLATD m2
> - pmovzxdq m1, [smpq+negj*4-4] ; s = smp[i-j-1]
> - pmovzxdq m5, [smpq+negj*4-4+mmsize/2]
> - pmovzxdq m7, [smpq+negj*4-4+mmsize]
> + movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
> + movu m5, [smpq+negj*4-4+mmsize]
> + movu m7, [smpq+negj*4-4+mmsize*2]
> + ; Rather than explicitly unpack adjacent samples into qwords we can let
> + ; the pmuldq instruction unpack the 0th and 2nd samples for us when it
> + ; does its multiply. This saves an unpack for every sample in the inner
> + ; loop meaning it should be (much) quicker.
> pmuldq m1, m2
> pmuldq m5, m2
> pmuldq m7, m2
> @@ -210,7 +215,7 @@ mova [rsp], m4 ; save sign extend mask
>
> dec negj
> inc posj
> - jnz .looporder
> + jnz .looporder1
>
> HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
> HACK_PSRAQ m4, m3, [rsp], m2
> @@ -218,22 +223,70 @@ mova [rsp], m4 ; save sign extend mask
> CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
> CLIPQ m4, [pq_int_min], [pq_int_max], m2
> CLIPQ m6, [pq_int_min], [pq_int_max], m2
> - pshufd m0, m0, q0020 ; pack into first 2 dwords
> - pshufd m4, m4, q0020
> - pshufd m6, m6, q0020
> - movh m1, [smpq]
> - movh m5, [smpq+mmsize/2]
> - movh m7, [smpq+mmsize]
> + movu m1, [smpq]
> + movu m5, [smpq+mmsize]
> + movu m7, [smpq+mmsize*2]
> psubd m1, m0 ; smp[i] - p
> psubd m5, m4
> psubd m7, m6
> - movh [resq], m1 ; res[i] = smp[i] - (p >> shift)
> - movh [resq+mmsize/2], m5
> - movh [resq+mmsize], m7
> + mova [rsp+mmsize], m1 ; res[i] = smp[i] - (p >> shift)
> + mova [rsp+mmsize*2], m5
> + mova [rsp+mmsize*3], m7
> +
> + ; process "even" samples
> + pxor m0, m0
> + pxor m4, m4
> + pxor m6, m6
> + mov posj, orderq
> + xor negj, negj
> +
> + .looporder2:
> + movd m2, [coefsq+posj*4] ; c = coefs[j]
> + SPLATD m2
> + movu m1, [smpq+negj*4] ; s = smp[i-j]
> + movu m5, [smpq+negj*4+mmsize]
> + movu m7, [smpq+negj*4+mmsize*2]
> + pmuldq m1, m2
> + pmuldq m5, m2
> + pmuldq m7, m2
> + paddq m0, m1 ; p += c * s
> + paddq m4, m5
> + paddq m6, m7
> +
> + dec negj
> + inc posj
> + jnz .looporder2
> +
> + HACK_PSRAQ m0, m3, [rsp], m2 ; p >>= shift
> + HACK_PSRAQ m4, m3, [rsp], m2
> + HACK_PSRAQ m6, m3, [rsp], m2
> + CLIPQ m0, [pq_int_min], [pq_int_max], m2 ; clip(p >> shift)
> + CLIPQ m4, [pq_int_min], [pq_int_max], m2
> + CLIPQ m6, [pq_int_min], [pq_int_max], m2
> + movu m1, [smpq+4]
> + movu m5, [smpq+4+mmsize]
> + movu m7, [smpq+4+mmsize*2]
> + psubd m1, m0 ; smp[i] - p
> + psubd m5, m4
> + psubd m7, m6
> +
> + ; interleave odd and even samples
> + pslldq m1, 4
> + pslldq m5, 4
> + pslldq m7, 4
> +
> + pblendw m1, [rsp+mmsize], q0303
> + pblendw m5, [rsp+mmsize*2], q0303
> + pblendw m7, [rsp+mmsize*3], q0303
> +
> + movu [resq], m1
> + movu [resq+mmsize], m5
> + movu [resq+mmsize*2], m7
> +
> + add resq, 3*mmsize
> + add smpq, 3*mmsize
> + sub length, (3*mmsize)/4
>
> - add resq, (3*mmsize)/2
> - add smpq, (3*mmsize)/2
> - sub length, (3*mmsize)/8
> jg .looplen
> RET
>
> --
> 2.15.0
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
Apart from that, LGTM.
More information about the ffmpeg-devel
mailing list