[FFmpeg-cvslog] x86/hevc_sao: simplify sao_edge_filter 10/12bit
James Almer
git at videolan.org
Sun Dec 20 20:47:49 CET 2015
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Sun Dec 6 02:46:51 2015 -0300| [3ff2beff65af87fc9ce75d55f2c06e01d606cebc] | committer: James Almer
x86/hevc_sao: simplify sao_edge_filter 10/12bit
Reviewed-by: Michael Niedermayer <michaelni at gmx.at>
Reviewed-by: Christophe Gisquet <christophe.gisquet at gmail.com>
Signed-off-by: James Almer <jamrial at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3ff2beff65af87fc9ce75d55f2c06e01d606cebc
---
libavcodec/x86/hevc_sao_10bit.asm | 150 +++++++++++++------------------------
1 file changed, 54 insertions(+), 96 deletions(-)
diff --git a/libavcodec/x86/hevc_sao_10bit.asm b/libavcodec/x86/hevc_sao_10bit.asm
index 3a7048a..79776ac 100644
--- a/libavcodec/x86/hevc_sao_10bit.asm
+++ b/libavcodec/x86/hevc_sao_10bit.asm
@@ -221,46 +221,6 @@ HEVC_SAO_BAND_FILTER 12, 64, 4
add b_strideq, tmpq
%endmacro
-%macro HEVC_SAO_EDGE_FILTER_COMPUTE 0
- PMINUW m4, m1, m2, m6
- PMINUW m5, m1, m3, m7
- pcmpeqw m2, m4
- pcmpeqw m3, m5
- pcmpeqw m4, m1
- pcmpeqw m5, m1
- psubw m4, m2
- psubw m5, m3
-
- paddw m4, m5
- pcmpeqw m2, m4, [pw_m2]
-%if ARCH_X86_64
- pcmpeqw m3, m4, m13
- pcmpeqw m5, m4, m0
- pcmpeqw m6, m4, m14
- pcmpeqw m7, m4, m15
- pand m2, m8
- pand m3, m9
- pand m5, m10
- pand m6, m11
- pand m7, m12
-%else
- pcmpeqw m3, m4, [pw_m1]
- pcmpeqw m5, m4, m0
- pcmpeqw m6, m4, [pw_1]
- pcmpeqw m7, m4, [pw_2]
- pand m2, [rsp+MMSIZE*0]
- pand m3, [rsp+MMSIZE*1]
- pand m5, [rsp+MMSIZE*2]
- pand m6, [rsp+MMSIZE*3]
- pand m7, [rsp+MMSIZE*4]
-%endif
- paddw m2, m3
- paddw m5, m6
- paddw m2, m7
- paddw m2, m1
- paddw m2, m5
-%endmacro
-
;void ff_hevc_sao_edge_filter_<width>_<depth>_<opt>(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
; int eo, int width, int height);
%macro HEVC_SAO_EDGE_FILTER 3
@@ -274,7 +234,6 @@ cglobal hevc_sao_edge_filter_%2_%1, 4, 9, 16, dst, src, dststride, offset, eo, a
%else ; ARCH_X86_32
cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_stride, b_stride, height
-%assign MMSIZE mmsize
%define eoq srcq
%define tmpq heightq
%define tmp2q dststrideq
@@ -325,54 +284,53 @@ cglobal hevc_sao_edge_filter_%2_%1, 1, 6, 8, 5*mmsize, dst, src, dststride, a_st
align 16
.loop:
-%if %2 == 8
- mova m1, [srcq]
- movu m2, [srcq+a_strideq]
- movu m3, [srcq+b_strideq]
-
- HEVC_SAO_EDGE_FILTER_COMPUTE
- CLIPW m2, m0, [pw_mask %+ %1]
- movu [dstq], m2
-%endif
-
%assign i 0
%rep %3
mova m1, [srcq + i]
movu m2, [srcq+a_strideq + i]
movu m3, [srcq+b_strideq + i]
- HEVC_SAO_EDGE_FILTER_COMPUTE
- CLIPW m2, m0, [pw_mask %+ %1]
- mova [dstq + i], m2
+ PMINUW m4, m1, m2, m6
+ PMINUW m5, m1, m3, m7
+ pcmpeqw m2, m4
+ pcmpeqw m3, m5
+ pcmpeqw m4, m1
+ pcmpeqw m5, m1
+ psubw m4, m2
+ psubw m5, m3
- mova m1, [srcq + i + mmsize]
- movu m2, [srcq+a_strideq + i + mmsize]
- movu m3, [srcq+b_strideq + i + mmsize]
- HEVC_SAO_EDGE_FILTER_COMPUTE
+ paddw m4, m5
+ pcmpeqw m2, m4, [pw_m2]
+%if ARCH_X86_64
+ pcmpeqw m3, m4, m13
+ pcmpeqw m5, m4, m0
+ pcmpeqw m6, m4, m14
+ pcmpeqw m7, m4, m15
+ pand m2, m8
+ pand m3, m9
+ pand m5, m10
+ pand m6, m11
+ pand m7, m12
+%else
+ pcmpeqw m3, m4, [pw_m1]
+ pcmpeqw m5, m4, m0
+ pcmpeqw m6, m4, [pw_1]
+ pcmpeqw m7, m4, [pw_2]
+ pand m2, [rsp+mmsize*0]
+ pand m3, [rsp+mmsize*1]
+ pand m5, [rsp+mmsize*2]
+ pand m6, [rsp+mmsize*3]
+ pand m7, [rsp+mmsize*4]
+%endif
+ paddw m2, m3
+ paddw m5, m6
+ paddw m2, m7
+ paddw m2, m1
+ paddw m2, m5
CLIPW m2, m0, [pw_mask %+ %1]
- mova [dstq + i + mmsize], m2
-%assign i i+mmsize*2
+ mova [dstq + i], m2
+%assign i i+mmsize
%endrep
-%if %2 == 48
-INIT_XMM cpuname
- mova m1, [srcq + i]
- movu m2, [srcq+a_strideq + i]
- movu m3, [srcq+b_strideq + i]
- HEVC_SAO_EDGE_FILTER_COMPUTE
- CLIPW m2, m0, [pw_mask %+ %1]
- mova [dstq + i], m2
-
- mova m1, [srcq + i + mmsize]
- movu m2, [srcq+a_strideq + i + mmsize]
- movu m3, [srcq+b_strideq + i + mmsize]
- HEVC_SAO_EDGE_FILTER_COMPUTE
- CLIPW m2, m0, [pw_mask %+ %1]
- mova [dstq + i + mmsize], m2
-%if cpuflag(avx2)
-INIT_YMM cpuname
-%endif
-%endif
-
add dstq, dststrideq
add srcq, EDGE_SRCSTRIDE
dec heightd
@@ -381,25 +339,25 @@ INIT_YMM cpuname
%endmacro
INIT_XMM sse2
-HEVC_SAO_EDGE_FILTER 10, 8, 0
-HEVC_SAO_EDGE_FILTER 10, 16, 1
+HEVC_SAO_EDGE_FILTER 10, 8, 1
+HEVC_SAO_EDGE_FILTER 10, 16, 2
+HEVC_SAO_EDGE_FILTER 10, 32, 4
+HEVC_SAO_EDGE_FILTER 10, 48, 6
+HEVC_SAO_EDGE_FILTER 10, 64, 8
+
+HEVC_SAO_EDGE_FILTER 12, 8, 1
+HEVC_SAO_EDGE_FILTER 12, 16, 2
+HEVC_SAO_EDGE_FILTER 12, 32, 4
+HEVC_SAO_EDGE_FILTER 12, 48, 6
+HEVC_SAO_EDGE_FILTER 12, 64, 8
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
HEVC_SAO_EDGE_FILTER 10, 32, 2
-HEVC_SAO_EDGE_FILTER 10, 48, 2
+HEVC_SAO_EDGE_FILTER 10, 48, 3
HEVC_SAO_EDGE_FILTER 10, 64, 4
-HEVC_SAO_EDGE_FILTER 12, 8, 0
-HEVC_SAO_EDGE_FILTER 12, 16, 1
HEVC_SAO_EDGE_FILTER 12, 32, 2
-HEVC_SAO_EDGE_FILTER 12, 48, 2
+HEVC_SAO_EDGE_FILTER 12, 48, 3
HEVC_SAO_EDGE_FILTER 12, 64, 4
-
-%if HAVE_AVX2_EXTERNAL
-INIT_YMM avx2
-HEVC_SAO_EDGE_FILTER 10, 32, 1
-HEVC_SAO_EDGE_FILTER 10, 48, 1
-HEVC_SAO_EDGE_FILTER 10, 64, 2
-
-HEVC_SAO_EDGE_FILTER 12, 32, 1
-HEVC_SAO_EDGE_FILTER 12, 48, 1
-HEVC_SAO_EDGE_FILTER 12, 64, 2
%endif
More information about the ffmpeg-cvslog mailing list