[FFmpeg-devel] [PATCH 3/6] avcodec/x86: add x86-64 8-bit simple_idct function
James Darnley
jdarnley at obe.tv
Mon Jun 12 16:36:06 EEST 2017
Rounding contributed by Ronald S. Bultje
---
libavcodec/tests/x86/dct.c | 2 ++
libavcodec/x86/idctdsp_init.c | 19 +++++++++++++++++++
libavcodec/x86/simple_idct.h | 3 +++
libavcodec/x86/simple_idct10.asm | 8 ++++++++
4 files changed, 32 insertions(+)
diff --git a/libavcodec/tests/x86/dct.c b/libavcodec/tests/x86/dct.c
index 34f5b8767b..317d973f9f 100644
--- a/libavcodec/tests/x86/dct.c
+++ b/libavcodec/tests/x86/dct.c
@@ -88,10 +88,12 @@ static const struct algo idct_tab_arch[] = {
#if HAVE_YASM
#if ARCH_X86_64
#if HAVE_SSE2_EXTERNAL
+ { "SIMPLE8-SSE2", ff_simple_idct8_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
{ "SIMPLE10-SSE2", ff_simple_idct10_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2},
{ "SIMPLE12-SSE2", ff_simple_idct12_sse2, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
#endif
#if HAVE_AVX_EXTERNAL
+ { "SIMPLE8-AVX", ff_simple_idct8_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
{ "SIMPLE10-AVX", ff_simple_idct10_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX},
{ "SIMPLE12-AVX", ff_simple_idct12_avx, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_AVX, 1 },
#endif
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index f1c915aa00..4b2145e478 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -94,9 +94,28 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
c->idct_add = ff_simple_idct_add_sse2;
c->perm_type = FF_IDCT_PERM_SIMPLE;
}
+
+ if (ARCH_X86_64 &&
+ !high_bit_depth &&
+ avctx->lowres == 0 &&
+ (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+ c->idct = ff_simple_idct8_sse2;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+ }
}
if (ARCH_X86_64 && avctx->lowres == 0) {
+ if (EXTERNAL_AVX(cpu_flags) &&
+ !high_bit_depth &&
+ (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+ c->idct = ff_simple_idct8_avx;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+ }
+
if (avctx->bits_per_raw_sample == 10 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index d17ef6a462..d17a855312 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -29,6 +29,9 @@ void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_add_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
void ff_simple_idct_put_sse2(uint8_t *dest, ptrdiff_t line_size, int16_t *block);
+void ff_simple_idct8_sse2(int16_t *block);
+void ff_simple_idct8_avx(int16_t *block);
+
void ff_simple_idct10_sse2(int16_t *block);
void ff_simple_idct10_avx(int16_t *block);
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index 1a5a2eae9b..168b6a08e0 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -33,9 +33,11 @@ cextern pw_2
cextern pw_16
cextern pw_1023
cextern pw_4095
+pd_round_11: times 4 dd 1<<(11-1)
pd_round_12: times 4 dd 1<<(12-1)
pd_round_15: times 4 dd 1<<(15-1)
pd_round_19: times 4 dd 1<<(19-1)
+pd_round_20: times 4 dd 1<<(20-1)
%macro CONST_DEC 3
const %1
@@ -50,6 +52,8 @@ times 4 dw %2, %3
%define W6sh2 8867 ; W6 = 35468 = 8867<<2
%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1
+pw_round_20_div_w4: times 8 dw ((1 << (20 - 1)) / W4sh2)
+
CONST_DEC w4_plus_w2, W4sh2, +W2sh2
CONST_DEC w4_min_w2, W4sh2, -W2sh2
CONST_DEC w4_plus_w6, W4sh2, +W6sh2
@@ -68,6 +72,10 @@ CONST_DEC w7_min_w5, W7sh2, -W5sh2
SECTION .text
%macro idct_fn 0
+cglobal simple_idct8, 1, 1, 16, block
+ IDCT_FN "", 11, pw_round_20_div_w4, 20, "store"
+RET
+
cglobal simple_idct10, 1, 1, 16, block
IDCT_FN "", 12, "", 19, "store"
RET
--
2.13.0
More information about the ffmpeg-devel mailing list