[FFmpeg-devel] [PATCH 2/2] tta/x86: add ff_ttafilter_process_dec_{ssse3, sse4}

Christophe Gisquet christophe.gisquet at gmail.com
Tue Feb 11 02:12:20 CET 2014


Hi,

2014-02-11 0:51 GMT+01:00 James Almer <jamrial at gmail.com>:

> +INIT_XMM ssse3
> +cglobal ttafilter_process_dec, 2,2,8, filter, in
> +    mova       m2, [filterq + 0xc ]
> +    mova       m3, [filterq + 0x1c]
> +    mova       m4, [filterq + 0x4c]
> +    mova       m5, [filterq + 0x5c]
> +
> +    movd       m6, [filterq + 0x8 ]

err... I'm missing something or is this code prone to perform aligned
reads on unaligned addresses?

Seems a bit lucky because of this:
typedef struct TTAFilter {
    int32_t shift, round, error;
    int32_t qm[MAX_ORDER];
    int32_t dx[MAX_ORDER];
    int32_t dl[MAX_ORDER];
} TTAFilter;

[...]

typedef struct TTAChannel {
    int32_t predictor;
    TTAFilter filter;
    TTARice rice;
} TTAChannel;

sizeof(TTAChannel) should be 4*(3*MAX_ORDER+8)=224 (multiple of 16)
and the offset of the TTAFilter 4, so all mova actually end up on
aligned addresses.

This seems to highly rely on the TTAChannel being allocated at an
aligned address, the structures not being modified, a compiler not
deciding to add padding between struct members, and the offsets
remaining constant. It is probably ok, but I'm not terribly confident
here.

> +    pshufd     m6, m6, 0
> +    psignd     m0, m4, m6
> +    psignd     m1, m5, m6
> +    paddd      m2, m0
> +    paddd      m3, m1
> +    mova       [filterq + 0xc ], m2
> +    mova       [filterq + 0x1c], m3
> +
[...]
> +
> +    psignd     m1, [pd_m1]
> +    psrldq     m1, 4
> +    pslldq     m6, 12
> +    pshufd     m6, m6, 0xf0
> +    paddd      m1, m6
> +    psrldq     m6, m1, 4
> +    pshufd     m6, m6, 0xf4
> +    paddd      m1, m6
> +    psrldq     m6, 4
> +    paddd      m1, m6
> +    mova       [filterq + 0x9c], m1
> +    RET
> +
> +INIT_XMM sse4
> +cglobal ttafilter_process_dec, 2,2,7, filter, in
> +    mova       m2, [filterq + 0xc ]
> +    mova       m3, [filterq + 0x1c]
> +    mova       m4, [filterq + 0x4c]
> +    mova       m5, [filterq + 0x5c]
> +
> +    movd       m6, [filterq + 0x8 ] ; if (filter->error < 0) {
> +    pshufd     m6, m6, 0            ;     for (int i = 0; i < 8; i++)
> +    psignd     m0, m4, m6           ;         filter->qm[i] -= filter->dx[i];
> +    psignd     m1, m5, m6           ; } else if (filter->error > 0) {
> +    paddd      m2, m0               ;     for (int i = 0; i < 8; i++)
> +    paddd      m3, m1               ;         filter->qm[i] += filter->dx[i];
> +    mova       [filterq + 0xc ], m2 ; }
> +    mova       [filterq + 0x1c], m3 ;
> +
[...]
> +    psignd     m1, [pd_m1]          ;
> +    psrldq     m1, 4                ;
> +    pslldq     m0, 12               ; filter->dl[4] = -filter->dl[5];
> +    pshufd     m0, m0, 0xf0         ; filter->dl[5] = -filter->dl[6];
> +    paddd      m1, m0               ; filter->dl[6] = *in - filter->dl[7];
> +    psrldq     m0, m1, 4            ; filter->dl[7] = *in;
> +    pshufd     m0, m0, 0xf4         ; filter->dl[5] += filter->dl[6];
> +    paddd      m1, m0               ; filter->dl[4] += filter->dl[5];
> +    psrldq     m0, 4                ;
> +    paddd      m1, m0               ;
> +    mova       [filterq + 0x9c], m1 ;
> +    RET

Could you please macroize this code? Of interest are the macros
ARCH_X86_64/mmsize (8/16/32, size of main SIMD reg)/cpuflag()

For your above case, this gives for instance:
%macro TTA_FILTER 0
cglobal ttafilter_process_dec, 2,2,7, filter, in
   mova       m2, [filterq + 0xc ]
   mova       m3, [filterq + 0x1c]
   mova       m4, [filterq + 0x4c]
   mova       m5, [filterq + 0x5c]

   movd       m6, [filterq + 0x8 ] ; if (filter->error < 0) {
   pshufd     m6, m6, 0            ;     for (int i = 0; i < 8; i++)
   psignd     m0, m4, m6           ;         filter->qm[i] -= filter->dx[i];
   psignd     m1, m5, m6           ; } else if (filter->error > 0) {
   paddd      m2, m0               ;     for (int i = 0; i < 8; i++)
   paddd      m3, m1               ;         filter->qm[i] += filter->dx[i];
   mova       [filterq + 0xc ], m2 ; }
   mova       [filterq + 0x1c], m3 ;
%if cpuflags(sse4)
   ; some code
%else
   ; SSSE3 code
%endif
   ; tail
%endmacro

INIT_XMM ssse3
TTA_FILTER

INIT_XMM sse4
TTA_FILTER

There are plenty of such examples, just grep for them in libavcodec/x86

I haven't quite checked if the code is optimal, but I haven't seen any
other issue. Maybe using more registers to break dependencies, but
that's a short function, and there's no loop to amortize their use.

Thanks,
-- 
Christophe


More information about the ffmpeg-devel mailing list