[FFmpeg-devel] [PATCH 2/8] avcodec/flac: add AVX2 version of the 16-bit LPC encoder
James Darnley
james.darnley at gmail.com
Mon Nov 27 00:51:05 EET 2017
When compared to the SSE4 version, runtime is reduced by 0.5 to 20%.
After a bug fix made long ago in e609cfd697, the 16-bit LPC encoder is
used so little that this overall runtime reduction is no longer accurate.
The function itself is around 2 times faster (as one might expect from
processing twice as many samples per iteration).
---
libavcodec/flacenc.c | 2 +-
libavcodec/x86/flac_dsp_gpl.asm | 32 +++++++++++++++++++++++++++-----
libavcodec/x86/flacdsp_init.c | 5 +++++
3 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index 170c3caf48..cf25982c91 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -88,7 +88,7 @@ typedef struct FlacSubframe {
uint64_t rc_sums[32][MAX_PARTITIONS];
int32_t samples[FLAC_MAX_BLOCKSIZE];
- int32_t residual[FLAC_MAX_BLOCKSIZE+11];
+ int32_t residual[FLAC_MAX_BLOCKSIZE+23];
} FlacSubframe;
typedef struct FlacFrame {
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index e285158185..c461c666be 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -24,7 +24,8 @@
SECTION .text
-INIT_XMM sse4
+%macro FUNCTION_BODY_16 0
+
%if ARCH_X86_64
cglobal flac_enc_lpc_16, 5, 7, 8, 0, res, smp, len, order, coefs
DECLARE_REG_TMP 5, 6
@@ -51,7 +52,7 @@ lea resq, [resq+orderq*4]
lea smpq, [smpq+orderq*4]
lea coefsq, [coefsq+orderq*4]
sub length, orderd
-movd m3, r5m
+movd xm3, r5m
neg orderq
%define posj t0q
@@ -65,8 +66,20 @@ neg orderq
xor negj, negj
.looporder:
+%if cpuflag(avx)
+ vbroadcastss m2, [coefsq+posj*4]
+%else
movd m2, [coefsq+posj*4] ; c = coefs[j]
SPLATD m2
+%endif
+%if cpuflag(avx)
+ vpmulld m1, m2, [smpq+negj*4-4]
+ vpmulld m5, m2, [smpq+negj*4-4+mmsize]
+ vpmulld m7, m2, [smpq+negj*4-4+mmsize*2]
+ vpaddd m0, m1
+ vpaddd m4, m5
+ vpaddd m6, m7
+%else
movu m1, [smpq+negj*4-4] ; s = smp[i-j-1]
movu m5, [smpq+negj*4-4+mmsize]
movu m7, [smpq+negj*4-4+mmsize*2]
@@ -76,14 +89,15 @@ neg orderq
paddd m0, m1 ; p += c * s
paddd m4, m5
paddd m6, m7
+%endif
dec negj
inc posj
jnz .looporder
- psrad m0, m3 ; p >>= shift
- psrad m4, m3
- psrad m6, m3
+ psrad m0, xm3 ; p >>= shift
+ psrad m4, xm3
+ psrad m6, xm3
movu m1, [smpq]
movu m5, [smpq+mmsize]
movu m7, [smpq+mmsize*2]
@@ -99,3 +113,11 @@ neg orderq
sub length, (3*mmsize)/4
jg .looplen
RET
+
+%endmacro
+
+INIT_XMM sse4
+FUNCTION_BODY_16
+
+INIT_YMM avx2
+FUNCTION_BODY_16
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index 1971f81b8d..0a5c01859f 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -28,6 +28,7 @@ void ff_flac_lpc_32_xop(int32_t *samples, const int coeffs[32], int order,
int qlevel, int len);
void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int, const int32_t *,int);
+void ff_flac_enc_lpc_16_avx2(int32_t *, const int32_t *, int, int, const int32_t *,int);
#define DECORRELATE_FUNCS(fmt, opt) \
void ff_flac_decorrelate_ls_##fmt##_##opt(uint8_t **out, int32_t **in, int channels, \
@@ -110,6 +111,10 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt, int
if (CONFIG_GPL)
c->lpc16_encode = ff_flac_enc_lpc_16_sse4;
}
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ if (CONFIG_GPL)
+ c->lpc16_encode = ff_flac_enc_lpc_16_avx2;
+ }
#endif
#endif /* HAVE_X86ASM */
}
--
2.15.0
More information about the ffmpeg-devel
mailing list