[FFmpeg-devel] [PATCH 2/2] tta/x86: add ff_ttafilter_process_dec_{ssse3, sse4}
Christophe Gisquet
christophe.gisquet at gmail.com
Tue Feb 11 02:12:20 CET 2014
Hi,
2014-02-11 0:51 GMT+01:00 James Almer <jamrial at gmail.com>:
> +INIT_XMM ssse3
> +cglobal ttafilter_process_dec, 2,2,8, filter, in
> + mova m2, [filterq + 0xc ]
> + mova m3, [filterq + 0x1c]
> + mova m4, [filterq + 0x4c]
> + mova m5, [filterq + 0x5c]
> +
> + movd m6, [filterq + 0x8 ]
err... I'm missing something or is this code prone to perform aligned
reads on unaligned addresses?
Seems a bit lucky because of this:
typedef struct TTAFilter {
int32_t shift, round, error;
int32_t qm[MAX_ORDER];
int32_t dx[MAX_ORDER];
int32_t dl[MAX_ORDER];
} TTAFilter;
[...]
typedef struct TTAChannel {
int32_t predictor;
TTAFilter filter;
TTARice rice;
} TTAChannel;
sizeof(TTAChannel) should be 4*(3*MAX_ORDER+8)=224 (multiple of 16)
and the offset of the TTAFilter 4, so all mova actually end up on
aligned addresses.
This seems to highly rely on the TTAChannel being allocated at an
aligned address, the structures not being modified, a compiler not
deciding to add padding between struct members, and the offsets
remaining constant. It is probably ok, but I'm not terribly confident
here.
> + pshufd m6, m6, 0
> + psignd m0, m4, m6
> + psignd m1, m5, m6
> + paddd m2, m0
> + paddd m3, m1
> + mova [filterq + 0xc ], m2
> + mova [filterq + 0x1c], m3
> +
[...]
> +
> + psignd m1, [pd_m1]
> + psrldq m1, 4
> + pslldq m6, 12
> + pshufd m6, m6, 0xf0
> + paddd m1, m6
> + psrldq m6, m1, 4
> + pshufd m6, m6, 0xf4
> + paddd m1, m6
> + psrldq m6, 4
> + paddd m1, m6
> + mova [filterq + 0x9c], m1
> + RET
> +
> +INIT_XMM sse4
> +cglobal ttafilter_process_dec, 2,2,7, filter, in
> + mova m2, [filterq + 0xc ]
> + mova m3, [filterq + 0x1c]
> + mova m4, [filterq + 0x4c]
> + mova m5, [filterq + 0x5c]
> +
> + movd m6, [filterq + 0x8 ] ; if (filter->error < 0) {
> + pshufd m6, m6, 0 ; for (int i = 0; i < 8; i++)
> + psignd m0, m4, m6 ; filter->qm[i] -= filter->dx[i];
> + psignd m1, m5, m6 ; } else if (filter->error > 0) {
> + paddd m2, m0 ; for (int i = 0; i < 8; i++)
> + paddd m3, m1 ; filter->qm[i] += filter->dx[i];
> + mova [filterq + 0xc ], m2 ; }
> + mova [filterq + 0x1c], m3 ;
> +
[...]
> + psignd m1, [pd_m1] ;
> + psrldq m1, 4 ;
> + pslldq m0, 12 ; filter->dl[4] = -filter->dl[5];
> + pshufd m0, m0, 0xf0 ; filter->dl[5] = -filter->dl[6];
> + paddd m1, m0 ; filter->dl[6] = *in - filter->dl[7];
> + psrldq m0, m1, 4 ; filter->dl[7] = *in;
> + pshufd m0, m0, 0xf4 ; filter->dl[5] += filter->dl[6];
> + paddd m1, m0 ; filter->dl[4] += filter->dl[5];
> + psrldq m0, 4 ;
> + paddd m1, m0 ;
> + mova [filterq + 0x9c], m1 ;
> + RET
Could you please macroize this code? Of interest are the macros
ARCH_X86_64/mmsize (8/16/32, size of main SIMD reg)/cpuflag()
For your above case, this gives for instance:
%macro TTA_FILTER 0
cglobal ttafilter_process_dec, 2,2,7, filter, in
mova m2, [filterq + 0xc ]
mova m3, [filterq + 0x1c]
mova m4, [filterq + 0x4c]
mova m5, [filterq + 0x5c]
movd m6, [filterq + 0x8 ] ; if (filter->error < 0) {
pshufd m6, m6, 0 ; for (int i = 0; i < 8; i++)
psignd m0, m4, m6 ; filter->qm[i] -= filter->dx[i];
psignd m1, m5, m6 ; } else if (filter->error > 0) {
paddd m2, m0 ; for (int i = 0; i < 8; i++)
paddd m3, m1 ; filter->qm[i] += filter->dx[i];
mova [filterq + 0xc ], m2 ; }
mova [filterq + 0x1c], m3 ;
%if cpuflags(sse4)
; some code
%else
; SSSE3 code
%endif
; tail
%endmacro
INIT_XMM ssse3
TTA_FILTER
INIT_XMM sse4
TTA_FILTER
There are plenty of such examples, just grep for them in libavcodec/x86
I haven't quite checked if the code is optimal, but I haven't seen any
other issue. Maybe using more registers to break dependencies, but
that's a short function, and there's no loop to amortize their use.
Thanks,
--
Christophe
More information about the ffmpeg-devel
mailing list