[FFmpeg-devel] [PATCH] x86/dcadsp: add ff_decode_hf_avx2()
James Almer
jamrial at gmail.com
Wed Feb 19 05:39:12 CET 2014
Signed-off-by: James Almer <jamrial at gmail.com>
---
This patch depends on "[PATCH 10/10] dcadsp: x86: SSE implementation of decode_hf" by Christophe Gisquet.
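Tested only under the Intel SDE emulator, so no benchmarks were run; it should still be faster than the SSE4 version, since each loop iteration converts, scales and stores all eight values in a single ymm register instead of two xmm registers.
Benchmarks and testing on actual AVX2 hardware are welcome.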
---
libavcodec/x86/dcadsp.asm | 21 ++++++++++++++++++---
libavcodec/x86/dcadsp_init.c | 7 +++++++
2 files changed, 25 insertions(+), 3 deletions(-)
diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 5aed8bc..8eecbc3 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -41,18 +41,24 @@ cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
.loop:
%if ARCH_X86_64
mov offsetd, [scaleq + 2*startq]
- cvtsi2ss m0, offsetd
+ cvtsi2ss xmm0, offsetd
%else
- cvtsi2ss m0, [scaleq + 2*startq]
+ cvtsi2ss xmm0, [scaleq + 2*startq]
%endif
mov offsetd, [numq + startq]
- mulss m0, [pf_inv16]
+ mulss xmm0, [pf_inv16]
shl DICT, 5
+%if cpuflag(avx2)
+ vbroadcastss m0, xmm0
+%else
shufps m0, m0, 0
+%endif
%if cpuflag(sse2)
%if cpuflag(sse4)
pmovsxbd m1, [srcq + DICT + 0]
+%if notcpuflag(avx2)
pmovsxbd m2, [srcq + DICT + 4]
+%endif
%else
movq m1, [srcq + DICT]
punpcklbw m1, m1
@@ -63,7 +69,9 @@ cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
psrad m2, 24
%endif
cvtdq2ps m1, m1
+%if notcpuflag(avx2)
cvtdq2ps m2, m2
+%endif
%else
movd mm0, [srcq + DICT + 0]
movd mm1, [srcq + DICT + 4]
@@ -88,9 +96,13 @@ cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
shufps m2, m4, q1010
%endif
mulps m1, m0
+%if notcpuflag(avx2)
mulps m2, m0
+%endif
mova [dstq + 8*startq + 0], m1
+%if notcpuflag(avx2)
mova [dstq + 8*startq + 16], m2
+%endif
add startq, 4
cmp startq, endm
jl .loop
@@ -111,3 +123,6 @@ DECODE_HF
INIT_XMM sse4
DECODE_HF
+
+INIT_YMM avx2
+DECODE_HF
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index fde1297..f578df7 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -32,6 +32,9 @@ void ff_decode_hf_sse2(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS
void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
const int8_t hf_vq[1024][32], intptr_t vq_offset,
int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
+void ff_decode_hf_avx2(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS],
+ const int8_t hf_vq[1024][32], intptr_t vq_offset,
+ int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end);
av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
{
@@ -50,4 +53,8 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
if (EXTERNAL_SSE4(cpu_flags)) {
s->decode_hf = ff_decode_hf_sse4;
}
+
+ if (EXTERNAL_AVX2(cpu_flags)) {
+ s->decode_hf = ff_decode_hf_avx2;
+ }
}
--
1.8.3.2
More information about the ffmpeg-devel mailing list.