[FFmpeg-cvslog] Merge commit '7abdd026df6a9a52d07d8174505b33cc89db7bf6'
James Almer
git at videolan.org
Wed Sep 27 00:49:11 EEST 2017
ffmpeg | branch: master | James Almer <jamrial at gmail.com> | Tue Sep 26 18:48:06 2017 -0300| [0c005fa86f01df75be8c9cacad7530978af80900] | committer: James Almer
Merge commit '7abdd026df6a9a52d07d8174505b33cc89db7bf6'
* commit '7abdd026df6a9a52d07d8174505b33cc89db7bf6':
asm: Consistently uppercase SECTION markers
Merged-by: James Almer <jamrial at gmail.com>
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=0c005fa86f01df75be8c9cacad7530978af80900
---
libavcodec/x86/dirac_dwt.asm | 2 +-
libavcodec/x86/diracdsp.asm | 2 +-
libavcodec/x86/dnxhdenc.asm | 2 +-
libavcodec/x86/huffyuvencdsp.asm | 2 +-
libavcodec/x86/lossless_videoencdsp.asm | 2 +-
libavcodec/x86/vc1dsp_loopfilter.asm | 2 +-
libavcodec/x86/vc1dsp_mc.asm | 2 +-
libavutil/x86/x86inc.asm | 4 ++--
8 files changed, 9 insertions(+), 9 deletions(-)
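The change itself is cosmetic: NASM treats directive keywords such as section/SECTION case-insensitively, so the assembled output should be identical; the uppercase spelling simply matches the style x86inc.asm already uses for SECTION_RODATA. A rough stand-alone sketch of the preferred style after this merge (the file name, function name and constant below are made up for illustration and are not part of the commit):

    ; style_sketch.asm -- hypothetical example, not in the FFmpeg tree
    %include "libavutil/x86/x86util.asm"

    SECTION_RODATA                  ; x86inc macro, defaults to 16-byte alignment
    pw_42: times 8 dw 42            ; illustrative constant

    SECTION .text                   ; uppercase, as this merge standardizes

    INIT_XMM sse2
    ; void example_add42(int16_t *dst, const int16_t *src) -- hypothetical
    cglobal example_add42, 2,2,1, dst, src
        mova    m0, [srcq]
        paddw   m0, [pw_42]
        mova    [dstq], m0
        RET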
diff --git a/libavcodec/x86/dirac_dwt.asm b/libavcodec/x86/dirac_dwt.asm
index 89806899a2..22a5c2bbbb 100644
--- a/libavcodec/x86/dirac_dwt.asm
+++ b/libavcodec/x86/dirac_dwt.asm
@@ -29,7 +29,7 @@ cextern pw_2
cextern pw_8
cextern pw_16
-section .text
+SECTION .text
; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2
%macro COMPOSE_53iL0 4
diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
index 6b3f780e41..cc8a26fca5 100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@ -30,7 +30,7 @@ cextern pw_16
cextern pw_32
cextern pb_80
-section .text
+SECTION .text
%macro UNPACK_ADD 6
mov%5 %1, %3
diff --git a/libavcodec/x86/dnxhdenc.asm b/libavcodec/x86/dnxhdenc.asm
index 9dd6d51ee6..b4f759552e 100644
--- a/libavcodec/x86/dnxhdenc.asm
+++ b/libavcodec/x86/dnxhdenc.asm
@@ -22,7 +22,7 @@
%include "libavutil/x86/x86util.asm"
-section .text
+SECTION .text
; void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels,
; ptrdiff_t line_size)
diff --git a/libavcodec/x86/huffyuvencdsp.asm b/libavcodec/x86/huffyuvencdsp.asm
index 1228aa8355..eeef81ab8e 100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@ -25,7 +25,7 @@
%include "libavutil/x86/x86util.asm"
-section .text
+SECTION .text
; void ff_diff_int16(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
; unsigned mask, int w);
diff --git a/libavcodec/x86/lossless_videoencdsp.asm b/libavcodec/x86/lossless_videoencdsp.asm
index 63fd72174a..3cb7dce07f 100644
--- a/libavcodec/x86/lossless_videoencdsp.asm
+++ b/libavcodec/x86/lossless_videoencdsp.asm
@@ -25,7 +25,7 @@
%include "libavutil/x86/x86util.asm"
-section .text
+SECTION .text
; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
; intptr_t w);
diff --git a/libavcodec/x86/vc1dsp_loopfilter.asm b/libavcodec/x86/vc1dsp_loopfilter.asm
index 1838f6f235..fd33bd13dc 100644
--- a/libavcodec/x86/vc1dsp_loopfilter.asm
+++ b/libavcodec/x86/vc1dsp_loopfilter.asm
@@ -24,7 +24,7 @@
cextern pw_4
cextern pw_5
-section .text
+SECTION .text
; dst_low, dst_high (src), zero
; zero-extends one vector from 8 to 16 bits
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
index 2850ca861d..0e6d87dd8b 100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -24,7 +24,7 @@
cextern pw_9
cextern pw_128
-section .text
+SECTION .text
%if HAVE_MMX_INLINE
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index c4ec29bd9d..6a054a3e09 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -87,9 +87,9 @@
; keep supporting OS/2.
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,aout
- section .text
+ SECTION .text
%elifidn __OUTPUT_FORMAT__,coff
- section .text
+ SECTION .text
%else
SECTION .rodata align=%1
%endif
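For the x86inc.asm hunk, behavior is likewise unchanged: SECTION_RODATA still falls back to a plain .text directive on the two legacy output formats handled above (a.out and COFF) and emits a real .rodata section everywhere else; only the case of the emitted keyword differs. A rough expansion sketch, assuming the conditions shown in the hunk and the macro's default alignment of 16 (output-format names are ordinary NASM values, shown here only for illustration):

    ; SECTION_RODATA        __OUTPUT_FORMAT__=elf64   ->  SECTION .rodata align=16
    ; SECTION_RODATA 32     __OUTPUT_FORMAT__=elf64   ->  SECTION .rodata align=32
    ; SECTION_RODATA        __OUTPUT_FORMAT__=aout    ->  SECTION .text
    ; SECTION_RODATA        __OUTPUT_FORMAT__=coff    ->  SECTION .text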
======================================================================
diff --cc libavcodec/x86/dirac_dwt.asm
index 89806899a2,0000000000..22a5c2bbbb
mode 100644,000000..100644
--- a/libavcodec/x86/dirac_dwt.asm
+++ b/libavcodec/x86/dirac_dwt.asm
@@@ -1,307 -1,0 +1,307 @@@
+;******************************************************************************
+;* x86 optimized discrete wavelet transform
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_1991: times 4 dw 9,-1
+
+cextern pw_1
+cextern pw_2
+cextern pw_8
+cextern pw_16
+
- section .text
++SECTION .text
+
+; %1 -= (%2 + %3 + 2)>>2 %4 is pw_2
+%macro COMPOSE_53iL0 4
+ paddw %2, %3
+ paddw %2, %4
+ psraw %2, 2
+ psubw %1, %2
+%endm
+
+; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
+; if %4 is supplied, %1 is loaded unaligned from there
+; m2: clobbered m3: pw_8 m4: pw_1991
+%macro COMPOSE_DD97iH0 3-4
+ paddw m0, %3
+ paddw m1, %2
+ psubw m0, m3
+ mova m2, m1
+ punpcklwd m1, m0
+ punpckhwd m2, m0
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+%if %0 > 3
+ movu %1, %4
+%endif
+ psrad m1, 4
+ psrad m2, 4
+ packssdw m1, m2
+ paddw m1, %1
+%endm
+
+%macro COMPOSE_VERTICAL 1
+; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; int width)
+cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
+ mova m2, [pw_2]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m1, [b0q+2*widthq]
+ mova m0, [b1q+2*widthq]
+ COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
+ mova [b1q+2*widthq], m0
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; int width)
+cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
+ mova m1, [pw_1]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ paddw m0, [b2q+2*widthq]
+ paddw m0, m1
+ psraw m0, 1
+ paddw m0, [b1q+2*widthq]
+ mova [b1q+2*widthq], m0
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
+ mova m3, [pw_8]
+ mova m4, [pw_1991]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ mova m1, [b1q+2*widthq]
+ COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
+ mova [b2q+2*widthq], m1
+ jg .loop
+ REP_RET
+
+; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
+; IDWTELEM *b3, IDWTELEM *b4, int width)
+cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
+ mova m3, [pw_16]
+ mova m4, [pw_1991]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m0, [b0q+2*widthq]
+ mova m1, [b1q+2*widthq]
+ mova m5, [b2q+2*widthq]
+ paddw m0, [b4q+2*widthq]
+ paddw m1, [b3q+2*widthq]
+ psubw m0, m3
+ mova m2, m1
+ punpcklwd m1, m0
+ punpckhwd m2, m0
+ pmaddwd m1, m4
+ pmaddwd m2, m4
+ psrad m1, 5
+ psrad m2, 5
+ packssdw m1, m2
+ psubw m5, m1
+ mova [b2q+2*widthq], m5
+ jg .loop
+ REP_RET
+
+; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
+cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
+ mova m3, [pw_1]
+%if ARCH_X86_64
+ mov widthd, widthd
+%endif
+.loop:
+ sub widthq, mmsize/2
+ mova m1, [b1q+2*widthq]
+ mova m0, [b0q+2*widthq]
+ mova m2, m1
+ paddw m1, m3
+ psraw m1, 1
+ psubw m0, m1
+ mova [b0q+2*widthq], m0
+ paddw m2, m0
+ mova [b1q+2*widthq], m2
+ jg .loop
+ REP_RET
+%endmacro
+
+; extend the left and right edges of the tmp array by %1 and %2 respectively
+%macro EDGE_EXTENSION 3
+ mov %3, [tmpq]
+%assign %%i 1
+%rep %1
+ mov [tmpq-2*%%i], %3
+ %assign %%i %%i+1
+%endrep
+ mov %3, [tmpq+2*w2q-2]
+%assign %%i 0
+%rep %2
+ mov [tmpq+2*w2q+2*%%i], %3
+ %assign %%i %%i+1
+%endrep
+%endmacro
+
+
+%macro HAAR_HORIZONTAL 2
+; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
+ mov w2d, wd
+ xor xq, xq
+ shr w2d, 1
+ lea b_w2q, [bq+wq]
+ mova m3, [pw_1]
+.lowpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [bq + 2*xq]
+ paddw m1, m3
+ psraw m1, 1
+ psubw m0, m1
+ mova [tmpq + 2*xq], m0
+ add xq, mmsize/2
+ cmp xq, w2q
+ jl .lowpass_loop
+
+ xor xq, xq
+ and w2q, ~(mmsize/2 - 1)
+ cmp w2q, mmsize/2
+ jl .end
+
+.highpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [tmpq + 2*xq]
+ paddw m1, m0
+
+ ; shift and interleave
+%if %2 == 1
+ paddw m0, m3
+ paddw m1, m3
+ psraw m0, 1
+ psraw m1, 1
+%endif
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ mova [bq+4*xq], m0
+ mova [bq+4*xq+mmsize], m2
+
+ add xq, mmsize/2
+ cmp xq, w2q
+ jl .highpass_loop
+.end:
+ REP_RET
+%endmacro
+
+
+INIT_XMM
+; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
+cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
+ mov w2d, wd
+ xor xd, xd
+ shr w2d, 1
+ lea b_w2q, [bq+wq]
+ movu m4, [bq+wq]
+ mova m7, [pw_2]
+ pslldq m4, 14
+.lowpass_loop:
+ movu m1, [b_w2q + 2*xq]
+ mova m0, [bq + 2*xq]
+ mova m2, m1
+ palignr m1, m4, 14
+ mova m4, m2
+ COMPOSE_53iL0 m0, m1, m2, m7
+ mova [tmpq + 2*xq], m0
+ add xd, mmsize/2
+ cmp xd, w2d
+ jl .lowpass_loop
+
+ EDGE_EXTENSION 1, 2, xw
+ ; leave the last up to 7 (sse) or 3 (mmx) values for C
+ xor xd, xd
+ and w2d, ~(mmsize/2 - 1)
+ cmp w2d, mmsize/2
+ jl .end
+
+ mova m7, [tmpq-mmsize]
+ mova m0, [tmpq]
+ mova m5, [pw_1]
+ mova m3, [pw_8]
+ mova m4, [pw_1991]
+.highpass_loop:
+ mova m6, m0
+ palignr m0, m7, 14
+ mova m7, [tmpq + 2*xq + 16]
+ mova m1, m7
+ mova m2, m7
+ palignr m1, m6, 2
+ palignr m2, m6, 4
+ COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
+ mova m0, m7
+ mova m7, m6
+
+ ; shift and interleave
+ paddw m6, m5
+ paddw m1, m5
+ psraw m6, 1
+ psraw m1, 1
+ mova m2, m6
+ punpcklwd m6, m1
+ punpckhwd m2, m1
+ mova [bq+4*xq], m6
+ mova [bq+4*xq+mmsize], m2
+
+ add xd, mmsize/2
+ cmp xd, w2d
+ jl .highpass_loop
+.end:
+ REP_RET
+
+
+%if ARCH_X86_64 == 0
+INIT_MMX
+COMPOSE_VERTICAL mmx
+HAAR_HORIZONTAL mmx, 0
+HAAR_HORIZONTAL mmx, 1
+%endif
+
+;;INIT_XMM
+INIT_XMM
+COMPOSE_VERTICAL sse2
+HAAR_HORIZONTAL sse2, 0
+HAAR_HORIZONTAL sse2, 1
diff --cc libavcodec/x86/diracdsp.asm
index 6b3f780e41,0000000000..cc8a26fca5
mode 100644,000000..100644
--- a/libavcodec/x86/diracdsp.asm
+++ b/libavcodec/x86/diracdsp.asm
@@@ -1,347 -1,0 +1,347 @@@
+;******************************************************************************
+;* Copyright (c) 2010 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+pw_7: times 8 dw 7
+convert_to_unsigned_10bit: times 4 dd 0x200
+clip_10bit: times 8 dw 0x3ff
+
+cextern pw_3
+cextern pw_16
+cextern pw_32
+cextern pb_80
+
- section .text
++SECTION .text
+
+%macro UNPACK_ADD 6
+ mov%5 %1, %3
+ mov%6 m5, %4
+ mova m4, %1
+ mova %2, m5
+ punpcklbw %1, m7
+ punpcklbw m5, m7
+ punpckhbw m4, m7
+ punpckhbw %2, m7
+ paddw %1, m5
+ paddw %2, m4
+%endmacro
+
+%macro HPEL_FILTER 1
+; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
+cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
+ mov src0q, srcq
+ lea stridex3q, [3*strideq]
+ sub src0q, stridex3q
+ pxor m7, m7
+.loop:
+ ; 7*(src[0] + src[1])
+ UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
+ pmullw m0, [pw_7]
+ pmullw m1, [pw_7]
+
+ ; 3*( ... + src[-2] + src[3])
+ UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
+ paddw m0, m2
+ paddw m1, m3
+ pmullw m0, [pw_3]
+ pmullw m1, [pw_3]
+
+ ; ... - 7*(src[-1] + src[2])
+ UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
+ pmullw m2, [pw_7]
+ pmullw m3, [pw_7]
+ psubw m0, m2
+ psubw m1, m3
+
+ ; ... - (src[-3] + src[4])
+ UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
+ psubw m0, m2
+ psubw m1, m3
+
+ paddw m0, [pw_16]
+ paddw m1, [pw_16]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, mmsize
+ add srcq, mmsize
+ add src0q, mmsize
+ sub widthd, mmsize
+ jg .loop
+ RET
+
+; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
+cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
+ dec widthd
+ pxor m7, m7
+ and widthd, ~(mmsize-1)
+.loop:
+ ; 7*(src[0] + src[1])
+ UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
+ pmullw m0, [pw_7]
+ pmullw m1, [pw_7]
+
+ ; 3*( ... + src[-2] + src[3])
+ UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
+ paddw m0, m2
+ paddw m1, m3
+ pmullw m0, [pw_3]
+ pmullw m1, [pw_3]
+
+ ; ... - 7*(src[-1] + src[2])
+ UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
+ pmullw m2, [pw_7]
+ pmullw m3, [pw_7]
+ psubw m0, m2
+ psubw m1, m3
+
+ ; ... - (src[-3] + src[4])
+ UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
+ psubw m0, m2
+ psubw m1, m3
+
+ paddw m0, [pw_16]
+ paddw m1, [pw_16]
+ psraw m0, 5
+ psraw m1, 5
+ packuswb m0, m1
+ mova [dstq + widthq], m0
+ sub widthd, mmsize
+ jge .loop
+ RET
+%endmacro
+
+%macro PUT_RECT 1
+; void put_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
+cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
+ mova m0, [pb_80]
+ add wd, (mmsize-1)
+ and wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+ movsxd dst_strideq, dst_strided
+ movsxd src_strideq, src_strided
+ mov r7d, r5m
+ mov r8d, wd
+ %define wspill r8d
+ %define hd r7d
+%else
+ mov r4m, wd
+ %define wspill r4m
+ %define hd r5mp
+%endif
+
+.loopy:
+ lea src2q, [srcq+src_strideq]
+ lea dst2q, [dstq+dst_strideq]
+.loopx:
+ sub wd, mmsize
+ mova m1, [srcq +2*wq]
+ mova m2, [src2q+2*wq]
+ packsswb m1, [srcq +2*wq+mmsize]
+ packsswb m2, [src2q+2*wq+mmsize]
+ paddb m1, m0
+ paddb m2, m0
+ mova [dstq +wq], m1
+ mova [dst2q+wq], m2
+ jg .loopx
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+ sub hd, 2
+ mov wd, wspill
+ jg .loopy
+ RET
+%endm
+
+%macro ADD_RECT 1
+; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
+cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
+ mova m0, [pw_32]
+ add wd, (mmsize-1)
+ and wd, ~(mmsize-1)
+
+%if ARCH_X86_64
+ movsxd strideq, strided
+ movsxd idwt_strideq, idwt_strided
+ mov r8d, wd
+ %define wspill r8d
+%else
+ mov r5m, wd
+ %define wspill r5m
+%endif
+
+.loop:
+ sub wd, mmsize
+ movu m1, [srcq +2*wq] ; FIXME: ensure alignment
+ paddw m1, m0
+ psraw m1, 6
+ movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
+ paddw m2, m0
+ psraw m2, 6
+ paddw m1, [idwtq+2*wq]
+ paddw m2, [idwtq+2*wq+mmsize]
+ packuswb m1, m2
+ mova [dstq +wq], m1
+ jg .loop
+
+ lea srcq, [srcq + 2*strideq]
+ add dstq, strideq
+ lea idwtq, [idwtq+ 2*idwt_strideq]
+ sub hd, 1
+ mov wd, wspill
+ jg .loop
+ RET
+%endm
+
+%macro ADD_OBMC 2
+; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
+cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
+ pxor m4, m4
+.loop:
+%assign i 0
+%rep %1 / mmsize
+ mova m0, [srcq+i]
+ mova m1, m0
+ punpcklbw m0, m4
+ punpckhbw m1, m4
+ mova m2, [obmcq+i]
+ mova m3, m2
+ punpcklbw m2, m4
+ punpckhbw m3, m4
+ pmullw m0, m2
+ pmullw m1, m3
+ movu m2, [dstq+2*i]
+ movu m3, [dstq+2*i+mmsize]
+ paddw m0, m2
+ paddw m1, m3
+ movu [dstq+2*i], m0
+ movu [dstq+2*i+mmsize], m1
+%assign i i+mmsize
+%endrep
+ lea srcq, [srcq+strideq]
+ lea dstq, [dstq+2*strideq]
+ add obmcq, 32
+ sub yblend, 1
+ jg .loop
+ RET
+%endm
+
+INIT_MMX
+%if ARCH_X86_64 == 0
+PUT_RECT mmx
+ADD_RECT mmx
+
+HPEL_FILTER mmx
+ADD_OBMC 32, mmx
+ADD_OBMC 16, mmx
+%endif
+ADD_OBMC 8, mmx
+
+INIT_XMM
+PUT_RECT sse2
+ADD_RECT sse2
+
+HPEL_FILTER sse2
+ADD_OBMC 32, sse2
+ADD_OBMC 16, sse2
+
+INIT_XMM sse4
+
+; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
+cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
+ movd m2, qfd
+ movd m3, qsd
+ SPLATD m2
+ SPLATD m3
+ mov r4, tot_hq
+ mov r3, dstq
+
+ .loop_v:
+ mov tot_hq, r4
+ mov dstq, r3
+
+ .loop_h:
+ movu m0, [srcq]
+
+ pabsd m1, m0
+ pmulld m1, m2
+ paddd m1, m3
+ psrld m1, 2
+ psignd m1, m0
+
+ movu [dstq], m1
+
+ add srcq, mmsize
+ add dstq, mmsize
+ sub tot_hd, 4
+ jg .loop_h
+
+ add r3, strideq
+ dec tot_vd
+ jg .loop_v
+
+ RET
+
+INIT_XMM sse4
+; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
+%if ARCH_X86_64
+cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
+%else
+cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
+ %define hd r5mp
+%endif
+ shl wd, 2
+ add srcq, wq
+ neg wq
+ mov t2q, dstq
+ mov t1q, wq
+ pxor m2, m2
+ mova m3, [clip_10bit]
+ mova m4, [convert_to_unsigned_10bit]
+
+ .loop_h:
+ mov dstq, t2q
+ mov wq, t1q
+
+ .loop_w:
+ movu m0, [srcq+wq+0*mmsize]
+ movu m1, [srcq+wq+1*mmsize]
+
+ paddd m0, m4
+ paddd m1, m4
+ packusdw m0, m0, m1
+ CLIPW m0, m2, m3 ; packusdw saturates so it's fine
+
+ movu [dstq], m0
+
+ add dstq, 1*mmsize
+ add wq, 2*mmsize
+ jl .loop_w
+
+ add srcq, src_strideq
+ add t2q, dst_strideq
+ sub hd, 1
+ jg .loop_h
+
+ RET
diff --cc libavcodec/x86/huffyuvencdsp.asm
index 1228aa8355,0000000000..eeef81ab8e
mode 100644,000000..100644
--- a/libavcodec/x86/huffyuvencdsp.asm
+++ b/libavcodec/x86/huffyuvencdsp.asm
@@@ -1,143 -1,0 +1,143 @@@
+;************************************************************************
+;* SIMD-optimized HuffYUV encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
+;*
+;* MMX optimization by Nick Kurshev <nickols_k at mail.ru>
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99 at gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
- section .text
++SECTION .text
+
+; void ff_diff_int16(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; unsigned mask, int w);
+%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
+ movd m4, maskd
+ SPLATW m4, m4
+ add wd, wd
+ test wq, 2*mmsize - 1
+ jz %%.tomainloop
+ push tmpq
+%%.wordloop:
+ sub wq, 2
+%ifidn %2, add
+ mov tmpw, [srcq+wq]
+ add tmpw, [dstq+wq]
+%else
+ mov tmpw, [src1q+wq]
+ sub tmpw, [src2q+wq]
+%endif
+ and tmpw, maskw
+ mov [dstq+wq], tmpw
+ test wq, 2*mmsize - 1
+ jnz %%.wordloop
+ pop tmpq
+%%.tomainloop:
+%ifidn %2, add
+ add srcq, wq
+%else
+ add src1q, wq
+ add src2q, wq
+%endif
+ add dstq, wq
+ neg wq
+ jz %%.end
+%%.loop:
+%ifidn %2, add
+ mov%1 m0, [srcq+wq]
+ mov%1 m1, [dstq+wq]
+ mov%1 m2, [srcq+wq+mmsize]
+ mov%1 m3, [dstq+wq+mmsize]
+%else
+ mov%1 m0, [src1q+wq]
+ mov%1 m1, [src2q+wq]
+ mov%1 m2, [src1q+wq+mmsize]
+ mov%1 m3, [src2q+wq+mmsize]
+%endif
+ p%2w m0, m1
+ p%2w m2, m3
+ pand m0, m4
+ pand m2, m4
+ mov%1 [dstq+wq] , m0
+ mov%1 [dstq+wq+mmsize], m2
+ add wq, 2*mmsize
+ jl %%.loop
+%%.end:
+ RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+ INT16_LOOP a, sub
+%endif
+
+INIT_XMM sse2
+cglobal diff_int16, 5,5,5, dst, src1, src2, mask, w, tmp
+ test src1q, mmsize-1
+ jnz .unaligned
+ test src2q, mmsize-1
+ jnz .unaligned
+ test dstq, mmsize-1
+ jnz .unaligned
+ INT16_LOOP a, sub
+.unaligned:
+ INT16_LOOP u, sub
+
+INIT_MMX mmxext
+cglobal sub_hfyu_median_pred_int16, 7,7,0, dst, src1, src2, mask, w, left, left_top
+ add wd, wd
+ movd mm7, maskd
+ SPLATW mm7, mm7
+ movq mm0, [src1q]
+ movq mm2, [src2q]
+ psllq mm0, 16
+ psllq mm2, 16
+ movd mm6, [left_topq]
+ por mm0, mm6
+ movd mm6, [leftq]
+ por mm2, mm6
+ xor maskq, maskq
+.loop:
+ movq mm1, [src1q + maskq]
+ movq mm3, [src2q + maskq]
+ movq mm4, mm2
+ psubw mm2, mm0
+ paddw mm2, mm1
+ pand mm2, mm7
+ movq mm5, mm4
+ pmaxsw mm4, mm1
+ pminsw mm1, mm5
+ pminsw mm4, mm2
+ pmaxsw mm4, mm1
+ psubw mm3, mm4
+ pand mm3, mm7
+ movq [dstq + maskq], mm3
+ add maskq, 8
+ movq mm0, [src1q + maskq - 2]
+ movq mm2, [src2q + maskq - 2]
+ cmp maskq, wq
+ jb .loop
+ movzx maskd, word [src1q + wq - 2]
+ mov [left_topq], maskd
+ movzx maskd, word [src2q + wq - 2]
+ mov [leftq], maskd
+ RET
diff --cc libavcodec/x86/lossless_videoencdsp.asm
index 63fd72174a,0000000000..3cb7dce07f
mode 100644,000000..100644
--- a/libavcodec/x86/lossless_videoencdsp.asm
+++ b/libavcodec/x86/lossless_videoencdsp.asm
@@@ -1,150 -1,0 +1,150 @@@
+;************************************************************************
+;* SIMD-optimized lossless video encoding functions
+;* Copyright (c) 2000, 2001 Fabrice Bellard
+;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni at gmx.at>
+;*
+;* MMX optimization by Nick Kurshev <nickols_k at mail.ru>
+;* Conversion to NASM format by Tiancheng "Timothy" Gu <timothygu99 at gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
- section .text
++SECTION .text
+
+; void ff_diff_bytes(uint8_t *dst, const uint8_t *src1, const uint8_t *src2,
+; intptr_t w);
+%macro DIFF_BYTES_PROLOGUE 0
+%if ARCH_X86_32
+cglobal diff_bytes, 3,5,2, dst, src1, src2
+%define wq r4q
+ DECLARE_REG_TMP 3
+ mov wq, r3mp
+%else
+cglobal diff_bytes, 4,5,2, dst, src1, src2, w
+ DECLARE_REG_TMP 4
+%endif ; ARCH_X86_32
+%define i t0q
+%endmacro
+
+; label to jump to if w < regsize
+%macro DIFF_BYTES_LOOP_PREP 1
+ mov i, wq
+ and i, -2 * regsize
+ jz %1
+ add dstq, i
+ add src1q, i
+ add src2q, i
+ neg i
+%endmacro
+
+; mov type used for src1q, dstq, first reg, second reg
+%macro DIFF_BYTES_LOOP_CORE 4
+%if mmsize != 16
+ mov%1 %3, [src1q + i]
+ mov%1 %4, [src1q + i + regsize]
+ psubb %3, [src2q + i]
+ psubb %4, [src2q + i + regsize]
+ mov%2 [dstq + i], %3
+ mov%2 [regsize + dstq + i], %4
+%else
+ ; SSE enforces alignment of psubb operand
+ mov%1 %3, [src1q + i]
+ movu %4, [src2q + i]
+ psubb %3, %4
+ mov%2 [dstq + i], %3
+ mov%1 %3, [src1q + i + regsize]
+ movu %4, [src2q + i + regsize]
+ psubb %3, %4
+ mov%2 [regsize + dstq + i], %3
+%endif
+%endmacro
+
+%macro DIFF_BYTES_BODY 2 ; mov type used for src1q, for dstq
+ %define regsize mmsize
+.loop_%1%2:
+ DIFF_BYTES_LOOP_CORE %1, %2, m0, m1
+ add i, 2 * regsize
+ jl .loop_%1%2
+.skip_main_%1%2:
+ and wq, 2 * regsize - 1
+ jz .end_%1%2
+%if mmsize > 16
+ ; fall back to narrower xmm
+ %define regsize mmsize / 2
+ DIFF_BYTES_LOOP_PREP .setup_loop_gpr_aa
+.loop2_%1%2:
+ DIFF_BYTES_LOOP_CORE %1, %2, xm0, xm1
+ add i, 2 * regsize
+ jl .loop2_%1%2
+.setup_loop_gpr_%1%2:
+ and wq, 2 * regsize - 1
+ jz .end_%1%2
+%endif
+ add dstq, wq
+ add src1q, wq
+ add src2q, wq
+ neg wq
+.loop_gpr_%1%2:
+ mov t0b, [src1q + wq]
+ sub t0b, [src2q + wq]
+ mov [dstq + wq], t0b
+ inc wq
+ jl .loop_gpr_%1%2
+.end_%1%2:
+ REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+DIFF_BYTES_PROLOGUE
+ %define regsize mmsize
+ DIFF_BYTES_LOOP_PREP .skip_main_aa
+ DIFF_BYTES_BODY a, a
+%undef i
+%endif
+
+INIT_XMM sse2
+DIFF_BYTES_PROLOGUE
+ %define regsize mmsize
+ DIFF_BYTES_LOOP_PREP .skip_main_aa
+ test dstq, regsize - 1
+ jnz .loop_uu
+ test src1q, regsize - 1
+ jnz .loop_ua
+ DIFF_BYTES_BODY a, a
+ DIFF_BYTES_BODY u, a
+ DIFF_BYTES_BODY u, u
+%undef i
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+DIFF_BYTES_PROLOGUE
+ %define regsize mmsize
+ ; Directly using unaligned SSE2 version is marginally faster than
+ ; branching based on arguments.
+ DIFF_BYTES_LOOP_PREP .skip_main_uu
+ test dstq, regsize - 1
+ jnz .loop_uu
+ test src1q, regsize - 1
+ jnz .loop_ua
+ DIFF_BYTES_BODY a, a
+ DIFF_BYTES_BODY u, a
+ DIFF_BYTES_BODY u, u
+%undef i
+%endif
diff --cc libavcodec/x86/vc1dsp_loopfilter.asm
index 1838f6f235,0000000000..fd33bd13dc
mode 100644,000000..100644
--- a/libavcodec/x86/vc1dsp_loopfilter.asm
+++ b/libavcodec/x86/vc1dsp_loopfilter.asm
@@@ -1,317 -1,0 +1,317 @@@
+;******************************************************************************
+;* VC1 loopfilter optimizations
+;* Copyright (c) 2009 David Conrad
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pw_4
+cextern pw_5
+
- section .text
++SECTION .text
+
+; dst_low, dst_high (src), zero
+; zero-extends one vector from 8 to 16 bits
+%macro UNPACK_8TO16 4
+ mova m%2, m%3
+ punpckh%1 m%3, m%4
+ punpckl%1 m%2, m%4
+%endmacro
+
+%macro STORE_4_WORDS 6
+%if cpuflag(sse4)
+ pextrw %1, %5, %6+0
+ pextrw %2, %5, %6+1
+ pextrw %3, %5, %6+2
+ pextrw %4, %5, %6+3
+%else
+ movd %6d, %5
+%if mmsize==16
+ psrldq %5, 4
+%else
+ psrlq %5, 32
+%endif
+ mov %1, %6w
+ shr %6, 16
+ mov %2, %6w
+ movd %6d, %5
+ mov %3, %6w
+ shr %6, 16
+ mov %4, %6w
+%endif
+%endmacro
+
+; in: p1 p0 q0 q1, clobbers p0
+; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
+%macro VC1_LOOP_FILTER_A0 4
+ psubw %1, %4
+ psubw %2, %3
+ paddw %1, %1
+ pmullw %2, [pw_5]
+ psubw %1, %2
+ paddw %1, [pw_4]
+ psraw %1, 3
+%endmacro
+
+; in: p0 q0 a0 a1 a2
+; m0 m1 m7 m6 m5
+; %1: size
+; out: m0=p0' m1=q0'
+%macro VC1_FILTER 1
+ PABSW m4, m7
+ PABSW m3, m6
+ PABSW m2, m5
+ mova m6, m4
+ pminsw m3, m2
+ pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
+ psubw m3, m4
+ pmullw m3, [pw_5] ; 5*(a3 - a0)
+ PABSW m2, m3
+ psraw m2, 3 ; abs(d/8)
+ pxor m7, m3 ; d_sign ^= a0_sign
+
+ pxor m5, m5
+ movd m3, r2d
+%if %1 > 4
+ punpcklbw m3, m3
+%endif
+ punpcklbw m3, m5
+ pcmpgtw m3, m4 ; if (a0 < pq)
+ pand m6, m3
+
+ mova m3, m0
+ psubw m3, m1
+ PABSW m4, m3
+ psraw m4, 1
+ pxor m3, m7 ; d_sign ^ clip_sign
+ psraw m3, 15
+ pminsw m2, m4 ; min(d, clip)
+ pcmpgtw m4, m5
+ pand m6, m4 ; filt3 (C return value)
+
+; each set of 4 pixels is not filtered if the 3rd is not
+%if mmsize==16
+ pshuflw m4, m6, 0xaa
+%if %1 > 4
+ pshufhw m4, m4, 0xaa
+%endif
+%else
+ pshufw m4, m6, 0xaa
+%endif
+ pandn m3, m4
+ pand m2, m6
+ pand m3, m2 ; d final
+
+ psraw m7, 15
+ pxor m3, m7
+ psubw m3, m7
+ psubw m0, m3
+ paddw m1, m3
+ packuswb m0, m0
+ packuswb m1, m1
+%endmacro
+
+; 1st param: size of filter
+; 2nd param: mov suffix equivalent to the filter size
+%macro VC1_V_LOOP_FILTER 2
+ pxor m5, m5
+ mov%2 m6, [r4]
+ mov%2 m4, [r4+r1]
+ mov%2 m7, [r4+2*r1]
+ mov%2 m0, [r4+r3]
+ punpcklbw m6, m5
+ punpcklbw m4, m5
+ punpcklbw m7, m5
+ punpcklbw m0, m5
+
+ VC1_LOOP_FILTER_A0 m6, m4, m7, m0
+ mov%2 m1, [r0]
+ mov%2 m2, [r0+r1]
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ mova m4, m0
+ VC1_LOOP_FILTER_A0 m7, m4, m1, m2
+ mov%2 m3, [r0+2*r1]
+ mov%2 m4, [r0+r3]
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ mova m5, m1
+ VC1_LOOP_FILTER_A0 m5, m2, m3, m4
+
+ VC1_FILTER %1
+ mov%2 [r4+r3], m0
+ mov%2 [r0], m1
+%endmacro
+
+; 1st param: size of filter
+; NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
+; 2nd (optional) param: temp register to use for storing words
+%macro VC1_H_LOOP_FILTER 1-2
+%if %1 == 4
+ movq m0, [r0 -4]
+ movq m1, [r0+ r1-4]
+ movq m2, [r0+2*r1-4]
+ movq m3, [r0+ r3-4]
+ TRANSPOSE4x4B 0, 1, 2, 3, 4
+%else
+ movq m0, [r0 -4]
+ movq m4, [r0+ r1-4]
+ movq m1, [r0+2*r1-4]
+ movq m5, [r0+ r3-4]
+ movq m2, [r4 -4]
+ movq m6, [r4+ r1-4]
+ movq m3, [r4+2*r1-4]
+ movq m7, [r4+ r3-4]
+ punpcklbw m0, m4
+ punpcklbw m1, m5
+ punpcklbw m2, m6
+ punpcklbw m3, m7
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+%endif
+ pxor m5, m5
+
+ UNPACK_8TO16 bw, 6, 0, 5
+ UNPACK_8TO16 bw, 7, 1, 5
+ VC1_LOOP_FILTER_A0 m6, m0, m7, m1
+ UNPACK_8TO16 bw, 4, 2, 5
+ mova m0, m1 ; m0 = p0
+ VC1_LOOP_FILTER_A0 m7, m1, m4, m2
+ UNPACK_8TO16 bw, 1, 3, 5
+ mova m5, m4
+ VC1_LOOP_FILTER_A0 m5, m2, m1, m3
+ SWAP 1, 4 ; m1 = q0
+
+ VC1_FILTER %1
+ punpcklbw m0, m1
+%if %0 > 1
+ STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
+%if %1 > 4
+ psrldq m0, 4
+ STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
+%endif
+%else
+ STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
+ STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
+%endif
+%endmacro
+
+
+%macro START_V_FILTER 0
+ mov r4, r0
+ lea r3, [4*r1]
+ sub r4, r3
+ lea r3, [r1+2*r1]
+ imul r2, 0x01010101
+%endmacro
+
+%macro START_H_FILTER 1
+ lea r3, [r1+2*r1]
+%if %1 > 4
+ lea r4, [r0+4*r1]
+%endif
+ imul r2, 0x01010101
+%endmacro
+
+%macro VC1_LF 0
+cglobal vc1_v_loop_filter_internal
+ VC1_V_LOOP_FILTER 4, d
+ ret
+
+cglobal vc1_h_loop_filter_internal
+ VC1_H_LOOP_FILTER 4, r4
+ ret
+
+; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter4, 3,5,0
+ START_V_FILTER
+ call vc1_v_loop_filter_internal
+ RET
+
+; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter4, 3,5,0
+ START_H_FILTER 4
+ call vc1_h_loop_filter_internal
+ RET
+
+; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8, 3,5,0
+ START_V_FILTER
+ call vc1_v_loop_filter_internal
+ add r4, 4
+ add r0, 4
+ call vc1_v_loop_filter_internal
+ RET
+
+; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8, 3,5,0
+ START_H_FILTER 4
+ call vc1_h_loop_filter_internal
+ lea r0, [r0+4*r1]
+ call vc1_h_loop_filter_internal
+ RET
+%endmacro
+
+INIT_MMX mmxext
+VC1_LF
+
+INIT_XMM sse2
+; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8, 3,5,8
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 8, q
+ RET
+
+; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8, 3,6,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8, r5
+ RET
+
+INIT_MMX ssse3
+; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter4, 3,5,0
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 4, d
+ RET
+
+; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter4, 3,5,0
+ START_H_FILTER 4
+ VC1_H_LOOP_FILTER 4, r4
+ RET
+
+INIT_XMM ssse3
+; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_v_loop_filter8, 3,5,8
+ START_V_FILTER
+ VC1_V_LOOP_FILTER 8, q
+ RET
+
+; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8, 3,6,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8, r5
+ RET
+
+INIT_XMM sse4
+; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
+cglobal vc1_h_loop_filter8, 3,5,8
+ START_H_FILTER 8
+ VC1_H_LOOP_FILTER 8
+ RET
diff --cc libavcodec/x86/vc1dsp_mc.asm
index 2850ca861d,0000000000..0e6d87dd8b
mode 100644,000000..100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@@ -1,292 -1,0 +1,292 @@@
+;******************************************************************************
+;* VC1 motion compensation optimizations
+;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet at free.fr>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+cextern pw_9
+cextern pw_128
+
- section .text
++SECTION .text
+
+%if HAVE_MMX_INLINE
+
+; XXX some of these macros are not used right now, but they will in the future
+; when more functions are ported.
+
+%macro OP_PUT 2 ; dst, src
+%endmacro
+
+%macro OP_AVG 2 ; dst, src
+ pavgb %1, %2
+%endmacro
+
+%macro NORMALIZE_MMX 1 ; shift
+ paddw m3, m7 ; +bias-r
+ paddw m4, m7 ; +bias-r
+ psraw m3, %1
+ psraw m4, %1
+%endmacro
+
+%macro TRANSFER_DO_PACK 2 ; op, dst
+ packuswb m3, m4
+ %1 m3, [%2]
+ mova [%2], m3
+%endmacro
+
+%macro TRANSFER_DONT_PACK 2 ; op, dst
+ %1 m3, [%2]
+ %1 m3, [%2 + mmsize]
+ mova [%2], m3
+ mova [mmsize + %2], m4
+%endmacro
+
+; see MSPEL_FILTER13_CORE for use as UNPACK macro
+%macro DO_UNPACK 1 ; reg
+ punpcklbw %1, m0
+%endmacro
+%macro DONT_UNPACK 1 ; reg
+%endmacro
+
+; Compute the rounder 32-r or 8-r and unpacks it to m7
+%macro LOAD_ROUNDER_MMX 1 ; round
+ movd m7, %1
+ punpcklwd m7, m7
+ punpckldq m7, m7
+%endmacro
+
+%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
+ paddw m%3, m%4
+ movh m%2, [srcq + stride_neg2]
+ pmullw m%3, m6
+ punpcklbw m%2, m0
+ movh m%5, [srcq + strideq]
+ psubw m%3, m%2
+ punpcklbw m%5, m0
+ paddw m%3, m7
+ psubw m%3, m%5
+ psraw m%3, shift
+ movu [dstq + %1], m%3
+ add srcq, strideq
+%endmacro
+
+INIT_MMX mmx
+; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
+; x86_reg stride, int rnd, int64_t shift)
+; Sacrificing m6 makes it possible to pipeline loads from src
+%if ARCH_X86_32
+cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
+ DECLARE_REG_TMP 3, 4, 5
+ %define rnd r3mp
+ %define shift qword r4m
+%else ; X86_64
+cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
+ DECLARE_REG_TMP 4, 5, 6
+ %define rnd r3d
+ ; We need shift either in memory or in a mm reg as it's used in psraw
+ ; On WIN64, the arg is already on the stack
+ ; On UNIX64, m5 doesn't seem to be used
+%if WIN64
+ %define shift r4mp
+%else ; UNIX64
+ %define shift m5
+ mova shift, r4q
+%endif ; WIN64
+%endif ; X86_32
+%define stride_neg2 t0q
+%define stride_9minus4 t1q
+%define i t2q
+ mov stride_neg2, strideq
+ neg stride_neg2
+ add stride_neg2, stride_neg2
+ lea stride_9minus4, [strideq * 9 - 4]
+ mov i, 3
+ LOAD_ROUNDER_MMX rnd
+ mova m6, [pw_9]
+ pxor m0, m0
+.loop:
+ movh m2, [srcq]
+ add srcq, strideq
+ movh m3, [srcq]
+ punpcklbw m2, m0
+ punpcklbw m3, m0
+ SHIFT2_LINE 0, 1, 2, 3, 4
+ SHIFT2_LINE 24, 2, 3, 4, 1
+ SHIFT2_LINE 48, 3, 4, 1, 2
+ SHIFT2_LINE 72, 4, 1, 2, 3
+ SHIFT2_LINE 96, 1, 2, 3, 4
+ SHIFT2_LINE 120, 2, 3, 4, 1
+ SHIFT2_LINE 144, 3, 4, 1, 2
+ SHIFT2_LINE 168, 4, 1, 2, 3
+ sub srcq, stride_9minus4
+ add dstq, 8
+ dec i
+ jnz .loop
+ REP_RET
+%undef rnd
+%undef shift
+%undef stride_neg2
+%undef stride_9minus4
+%undef i
+
+; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
+; const int16_t *src, int rnd);
+; Data is already unpacked, so some operations can directly be made from
+; memory.
+%macro HOR_16B_SHIFT2 2 ; op, opname
+cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
+ mov hq, 8
+ sub srcq, 2
+ sub rndd, (-1+9+9-1) * 1024 ; add -1024 bias
+ LOAD_ROUNDER_MMX rndd
+ mova m5, [pw_9]
+ mova m6, [pw_128]
+ pxor m0, m0
+
+.loop:
+ mova m1, [srcq + 2 * 0]
+ mova m2, [srcq + 2 * 0 + mmsize]
+ mova m3, [srcq + 2 * 1]
+ mova m4, [srcq + 2 * 1 + mmsize]
+ paddw m3, [srcq + 2 * 2]
+ paddw m4, [srcq + 2 * 2 + mmsize]
+ paddw m1, [srcq + 2 * 3]
+ paddw m2, [srcq + 2 * 3 + mmsize]
+ pmullw m3, m5
+ pmullw m4, m5
+ psubw m3, m1
+ psubw m4, m2
+ NORMALIZE_MMX 7
+ ; remove bias
+ paddw m3, m6
+ paddw m4, m6
+ TRANSFER_DO_PACK %1, dstq
+ add srcq, 24
+ add dstq, strideq
+ dec hq
+ jnz .loop
+
+ RET
+%endmacro
+
+INIT_MMX mmx
+HOR_16B_SHIFT2 OP_PUT, put
+
+INIT_MMX mmxext
+HOR_16B_SHIFT2 OP_AVG, avg
+%endif ; HAVE_MMX_INLINE
+
+%macro INV_TRANS_INIT 0
+ movsxdifnidn linesizeq, linesized
+ movd m0, blockd
+ SPLATW m0, m0
+ pxor m1, m1
+ psubw m1, m0
+ packuswb m0, m0
+ packuswb m1, m1
+
+ DEFINE_ARGS dest, linesize, linesize3
+ lea linesize3q, [linesizeq*3]
+%endmacro
+
+%macro INV_TRANS_PROCESS 1
+ mov%1 m2, [destq+linesizeq*0]
+ mov%1 m3, [destq+linesizeq*1]
+ mov%1 m4, [destq+linesizeq*2]
+ mov%1 m5, [destq+linesize3q]
+ paddusb m2, m0
+ paddusb m3, m0
+ paddusb m4, m0
+ paddusb m5, m0
+ psubusb m2, m1
+ psubusb m3, m1
+ psubusb m4, m1
+ psubusb m5, m1
+ mov%1 [linesizeq*0+destq], m2
+ mov%1 [linesizeq*1+destq], m3
+ mov%1 [linesizeq*2+destq], m4
+ mov%1 [linesize3q +destq], m5
+%endmacro
+
+; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
+ movsx r3d, WORD [blockq]
+ mov blockd, r3d ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+4] ; 17 * dc + 4
+ sar blockd, 3 ; >> 3
+ mov r3d, blockd ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+64] ; 17 * dc + 64
+ sar blockd, 7 ; >> 7
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS h
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
+ movsx r3d, WORD [blockq]
+ mov blockd, r3d ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+4] ; 17 * dc + 4
+ sar blockd, 3 ; >> 3
+ shl blockd, 2 ; 4 * dc
+ lea blockd, [blockq*3+64] ; 12 * dc + 64
+ sar blockd, 7 ; >> 7
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS h
+ lea destq, [destq+linesizeq*4]
+ INV_TRANS_PROCESS h
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
+ movsx blockd, WORD [blockq] ; dc
+ lea blockd, [blockq*3+1] ; 3 * dc + 1
+ sar blockd, 1 ; >> 1
+ mov r3d, blockd ; dc
+ shl blockd, 4 ; 16 * dc
+ lea blockd, [blockq+r3+64] ; 17 * dc + 64
+ sar blockd, 7 ; >> 7
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS a
+ RET
+
+INIT_MMX mmxext
+cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
+ movsx blockd, WORD [blockq] ; dc
+ lea blockd, [blockq*3+1] ; 3 * dc + 1
+ sar blockd, 1 ; >> 1
+ lea blockd, [blockq*3+16] ; 3 * dc + 16
+ sar blockd, 5 ; >> 5
+
+ INV_TRANS_INIT
+
+ INV_TRANS_PROCESS a
+ lea destq, [destq+linesizeq*4]
+ INV_TRANS_PROCESS a
+ RET
diff --cc libavutil/x86/x86inc.asm
index c4ec29bd9d,e04dbfedf3..6a054a3e09
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@@ -87,9 -87,7 +87,9 @@@
; keep supporting OS/2.
%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,aout
- section .text
+ SECTION .text
+ %elifidn __OUTPUT_FORMAT__,coff
- section .text
++ SECTION .text
%else
SECTION .rodata align=%1
%endif