[FFmpeg-devel] [PATCH v2] lpc: rewrite lpc_compute_autocorr in external asm
Lynne
dev at lynne.ee
Sun May 26 04:42:01 EEST 2024
The inline asm function had issues running under checkasm.
So I came to finish what I started, and wrote the last part
of LPC computation in assembly.
---
libavcodec/x86/lpc.asm | 91 +++++++++++++++++++++++++++++++++++++++
libavcodec/x86/lpc_init.c | 87 ++++---------------------------------
2 files changed, 100 insertions(+), 78 deletions(-)
diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
index a585c17ef5..9c359ae480 100644
--- a/libavcodec/x86/lpc.asm
+++ b/libavcodec/x86/lpc.asm
@@ -261,3 +261,94 @@ APPLY_WELCH_FN
INIT_YMM avx2
APPLY_WELCH_FN
%endif
+
+%macro COMPUTE_AUTOCORR_FN 0 ; emits lpc_compute_autocorr for the vector width selected by the preceding INIT_XMM/INIT_YMM
+cglobal lpc_compute_autocorr, 4, 7, 3, data, len, lag, autoc, lag_p, data_l, len_p
+ shl lagd, 3 ; lag *= 8: turn the element count into a byte offset (elements are doubles)
+ shl lenq, 3 ; len *= 8: likewise a byte offset
+ xor lag_pq, lag_pq ; lag_p = 0: byte offset of the current lag j
+
+.lag_l:
+ movaps m2, [one_tab] ; initial accumulator; one_tab is defined outside this hunk (presumably 1.0 per lane - confirm)
+
+ mov len_pq, lag_pq ; inner byte offset i starts at j
+
+ lea data_lq, [lag_pq + mmsize - 8]
+ neg data_lq ; -(j + mmsize - 8)
+ add data_lq, dataq ; data - j - (mmsize - 8): the highest lane loaded from here is data[i - j]
+.len_l:
+
+%if mmsize == 32
+ vbroadcastsd m0, [dataq + len_pq] ; data[i] in every lane
+ vpermpd m1, [data_lq + len_pq], q0123 ; NOTE(review): vpermpd is an AVX2 instruction, but this macro is instantiated under INIT_YMM avx - confirm
+%else
+ movupd m1, [data_lq + len_pq] ; data[i - j]
+ movsd xm0, [dataq + len_pq] ; data[i]
+ shufpd m1, m1, m1, 01b ; swap the two doubles so lane k holds data[i - j - k]
+%endif
+
+ shufpd m0, m0, m0, 1100b ; SSE2 path: low double (data[i]) copied into both lanes; the AVX value is already uniform
+
+ ; fmadd actually hurts performance in this case due to
+ ; the earlier loads + shuffles
+ mulpd m0, m1
+ addpd m2, m0 ; sum += data[i]*data[i-j]
+
+ add len_pq, 8 ; i advances by one double
+ cmp len_pq, lenq
+ jl .len_l ; loop while i < len
+
+ movupd [autocq + lag_pq], m2 ; autoc[j] = sum
+ add lag_pq, mmsize ; j advances by one full vector of lags
+ cmp lag_pq, lagq
+ jl .lag_l ; loop while j < lag
+
+ ; The tail computation is guaranteed never to happen
+ ; as long as we're doing multiples of 4, rather than 2.
+%if mmsize != 32
+ jg .end ; flags are still those of cmp lag_pq, lagq: j > lag means nothing is left
+ ; If lag_p == lag fallthrough
+
+.tail:
+ movaps m2, [one_tab] ; NOTE(review): both lanes are summed together below, so this initial value is counted twice - confirm intended
+
+ mov len_pq, lag_pq
+ sub len_pq, mmsize ; i starts one vector (mmsize bytes) below j
+
+ lea data_lq, [lag_pq]
+ neg data_lq ; -j
+ add data_lq, dataq ; data[-j]
+
+.tail_l:
+ movupd m0, [dataq + len_pq] ; one vector starting at data[i]
+ movupd m1, [data_lq + len_pq] ; one vector starting at data[i - j]
+
+ mulpd m0, m1
+ addpd m2, m0 ; sum += data[i]*data[i-j]
+
+ add len_pq, mmsize ; i advances by a whole vector per iteration
+ cmp len_pq, lenq
+ jl .tail_l
+
+ shufpd m1, m2, m2, 01b ; lane-swapped copy of the two partial sums
+ addpd m2, m1 ; both lanes now hold low + high
+
+ ; Leave this here just in case it's ever needed
+%if mmsize == 32
+ vperm2f128 m1, m2, m2, 0x01
+ addpd xm2, xm1
+ movupd [autocq + lag_pq], xm2
+%else
+ movhpd [autocq + lag_pq], xm2 ; store the high lane to autoc at offset lag_p (== lag on this path)
+%endif
+
+.end:
+%endif
+
+ RET
+%endmacro
+
+INIT_XMM sse2
+COMPUTE_AUTOCORR_FN
+INIT_YMM avx
+COMPUTE_AUTOCORR_FN
diff --git a/libavcodec/x86/lpc_init.c b/libavcodec/x86/lpc_init.c
index f2fca53799..bb174be53e 100644
--- a/libavcodec/x86/lpc_init.c
+++ b/libavcodec/x86/lpc_init.c
@@ -28,89 +28,20 @@ void ff_lpc_apply_welch_window_sse2(const int32_t *data, ptrdiff_t len,
double *w_data);
void ff_lpc_apply_welch_window_avx2(const int32_t *data, ptrdiff_t len,
double *w_data);
-
-DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
-
-#if HAVE_SSE2_INLINE
-
-static void lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
- double *autoc)
-{
- int j;
-
- if((x86_reg)data & 15)
- data++;
-
- for(j=0; j<lag; j+=2){
- x86_reg i = -len*sizeof(double);
- if(j == lag-2) {
- __asm__ volatile(
- "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm2 \n\t"
- "1: \n\t"
- "movapd (%2,%0), %%xmm3 \n\t"
- "movupd -8(%3,%0), %%xmm4 \n\t"
- "movapd (%3,%0), %%xmm5 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm5 \n\t"
- "mulpd -16(%3,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm5, %%xmm0 \n\t"
- "addpd %%xmm3, %%xmm2 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "movhlps %%xmm2, %%xmm5 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "addsd %%xmm5, %%xmm2 \n\t"
- "movsd %%xmm0, (%1) \n\t"
- "movsd %%xmm1, 8(%1) \n\t"
- "movsd %%xmm2, 16(%1) \n\t"
- :"+&r"(i)
- :"r"(autoc+j), "r"(data+len), "r"(data+len-j)
- NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
- :"memory"
- );
- } else {
- __asm__ volatile(
- "movsd "MANGLE(pd_1)", %%xmm0 \n\t"
- "movsd "MANGLE(pd_1)", %%xmm1 \n\t"
- "1: \n\t"
- "movapd (%3,%0), %%xmm3 \n\t"
- "movupd -8(%4,%0), %%xmm4 \n\t"
- "mulpd %%xmm3, %%xmm4 \n\t"
- "mulpd (%4,%0), %%xmm3 \n\t"
- "addpd %%xmm4, %%xmm1 \n\t"
- "addpd %%xmm3, %%xmm0 \n\t"
- "add $16, %0 \n\t"
- "jl 1b \n\t"
- "movhlps %%xmm0, %%xmm3 \n\t"
- "movhlps %%xmm1, %%xmm4 \n\t"
- "addsd %%xmm3, %%xmm0 \n\t"
- "addsd %%xmm4, %%xmm1 \n\t"
- "movsd %%xmm0, %1 \n\t"
- "movsd %%xmm1, %2 \n\t"
- :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
- :"r"(data+len), "r"(data+len-j)
- NAMED_CONSTRAINTS_ARRAY_ADD(pd_1)
- );
- }
- }
-}
-
-#endif /* HAVE_SSE2_INLINE */
+void ff_lpc_compute_autocorr_sse2(const double *data, ptrdiff_t len, int lag,
+ double *autoc);
+void ff_lpc_compute_autocorr_avx(const double *data, ptrdiff_t len, int lag,
+ double *autoc);
av_cold void ff_lpc_init_x86(LPCContext *c)
{
int cpu_flags = av_get_cpu_flags();
-#if HAVE_SSE2_INLINE
- if (INLINE_SSE2_SLOW(cpu_flags))
- c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
-#endif
+ if (EXTERNAL_SSE2(cpu_flags))
+ c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
+
+ if (EXTERNAL_AVX_FAST(cpu_flags))
+ c->lpc_compute_autocorr = ff_lpc_compute_autocorr_avx;
if (EXTERNAL_SSE2(cpu_flags))
c->lpc_apply_welch_window = ff_lpc_apply_welch_window_sse2;
--
2.43.0.381.gb435a96ce8
More information about the ffmpeg-devel
mailing list