[FFmpeg-devel] [PATCH 6/9] x86: simple_idct: 12bits versions
Christophe Gisquet
christophe.gisquet at gmail.com
Fri Oct 9 23:53:43 CEST 2015
On 12 frames of a 444p 12-bit DNxHR sequence, timing the _put function:
C: 78902 decicycles in idct, 262071 runs, 73 skips
avx: 32478 decicycles in idct, 262045 runs, 99 skips
Difference between the C and AVX outputs:
stddev: 0.39 PSNR:104.47 MAXDIFF: 2
This is unavoidable and due to the scale factors used in the x86
version, which cannot match the C ones, as this would cause
overflows (there is one bit less of precision). In particular,
the trick of merging an addition into a multiplication in the first
butterfly of the pass can cause an overflow (a 15-bit coefficient
would now need 16 bits).
---
libavcodec/x86/idctdsp_init.c | 21 +++++++++++++++++++--
libavcodec/x86/simple_idct.h | 6 ++++++
libavcodec/x86/simple_idct10.asm | 17 +++++++++++++++++
3 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/libavcodec/x86/idctdsp_init.c b/libavcodec/x86/idctdsp_init.c
index 4fc9b0d..be563c2 100644
--- a/libavcodec/x86/idctdsp_init.c
+++ b/libavcodec/x86/idctdsp_init.c
@@ -86,8 +86,8 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
c->add_pixels_clamped = ff_add_pixels_clamped_sse2;
}
- if (ARCH_X86_64 &&
- avctx->bits_per_raw_sample == 10 && avctx->lowres == 0 &&
+ if (ARCH_X86_64 && avctx->lowres == 0) {
+ if (avctx->bits_per_raw_sample == 10 &&
(avctx->idct_algo == FF_IDCT_AUTO ||
avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
avctx->idct_algo == FF_IDCT_SIMPLE)) {
@@ -102,5 +102,22 @@ av_cold void ff_idctdsp_init_x86(IDCTDSPContext *c, AVCodecContext *avctx,
c->idct_put = ff_simple_idct10_put_avx;
c->perm_type = FF_IDCT_PERM_TRANSPOSE;
}
+ }
+
+ if (avctx->bits_per_raw_sample == 12 &&
+ (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEMMX)) {
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->idct = ff_simple_idct12_sse2;
+ c->idct_put = ff_simple_idct12_put_sse2;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+
+ }
+ if (EXTERNAL_AVX(cpu_flags)) {
+ c->idct = ff_simple_idct12_avx;
+ c->idct_put = ff_simple_idct12_put_avx;
+ c->perm_type = FF_IDCT_PERM_TRANSPOSE;
+ }
+ }
}
}
diff --git a/libavcodec/x86/simple_idct.h b/libavcodec/x86/simple_idct.h
index e8f59c1..8eeb31e 100644
--- a/libavcodec/x86/simple_idct.h
+++ b/libavcodec/x86/simple_idct.h
@@ -31,4 +31,10 @@ void ff_simple_idct10_avx(int16_t *block);
void ff_simple_idct10_put_sse2(uint8_t *dest, int line_size, int16_t *block);
void ff_simple_idct10_put_avx(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct12_sse2(int16_t *block);
+void ff_simple_idct12_avx(int16_t *block);
+
+void ff_simple_idct12_put_sse2(uint8_t *dest, int line_size, int16_t *block);
+void ff_simple_idct12_put_avx(uint8_t *dest, int line_size, int16_t *block);
+
#endif /* AVCODEC_X86_SIMPLE_IDCT_H */
diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm
index b1f45ea..a410191 100644
--- a/libavcodec/x86/simple_idct10.asm
+++ b/libavcodec/x86/simple_idct10.asm
@@ -29,9 +29,13 @@
SECTION_RODATA
+cextern pw_1
+cextern pw_2
cextern pw_8
cextern pw_1023
+cextern pw_4095
pd_round: times 4 dd 1<<(13-1)
+pd_round2: times 4 dd 1<<(15-1)
%include "libavcodec/x86/simple_idct10_template.asm"
@@ -47,6 +51,19 @@ cglobal simple_idct10_put, 3, 3, 16
mova m15, [pd_round]
IDCT_FN "", 13, pw_8, 18, 0, pw_1023
RET
+
+cglobal simple_idct12, 1, 1, 16
+ mova m15, [pd_round2]
+ IDCT_FN "", 15, pw_2, 16
+ RET
+
+cglobal simple_idct12_put, 3, 3, 16
+ ; range isn't known, so the C simple_idct range is used
+ ; Also, using a bias on input overflows, so use the bias
+ ; on output of the first butterfly instead
+ mova m15, [pd_round2]
+ IDCT_FN "", 15, pw_2, 16, 0, pw_4095
+ RET
%endmacro
INIT_XMM sse2
--
2.6.0
More information about the ffmpeg-devel
mailing list