[FFmpeg-devel] avcodec/utvideoenc : add SIMD (SSSE3) for sub_left_pred
Henrik Gramner
henrik at gramner.com
Fri Jan 12 20:06:39 EET 2018
On Thu, Jan 11, 2018 at 9:45 PM, Martin Vignali
<martin.vignali at gmail.com> wrote:
> + if (check_func(c.sub_left_predict, "sub_left_predict")) {
> + call_ref(dst0, src0, stride, width, height);
> + call_new(dst1, src0, stride, width, height);
> + if (memcmp(dst0, dst1, width))
> + fail();
> + bench_new(dst1, src0, stride, width, height);
> + }
You're only verifying the results of the first row here. Changing it
to test all rows results in test failures.
> + int width = av_clip(rnd(), 16, 128);
> + int height = av_clip(rnd(), 16, 128);
This kind of clipping will result in the values being 128 almost every
run. You should also use constant sizes instead of random ones because
random ones will make benchmarking inconsistent since you'll measure
different things for the C and asm versions.
You could do something along the lines of
/* Fixed plane sizes to exercise on every run, instead of random ones.
 * NOTE(review): w/h/s presumably mean width, height and stride - confirm
 * against the test that consumes this table. */
static const struct { uint8_t w, h, s; } planes[] = {
{16,16,16}, {21,23,25}, {32,17,48}, {15,128,16}, {128,127,128}
};
and just test all of those every run.
> +%if ARCH_X86_64
> +INIT_XMM ssse3
> +cglobal sub_left_predict, 4,5,5, dst, src, stride, width, height, x
> + mova m0, [pb_15] ; shuffle for last byte
> + mova m1, [pb_80] ; prev initial
> +.nextrow:
> + xor xq, xq
> +
> + .loop:
> + movu m2, [srcq + xq]
> + psubb m1, m2 ; - prev
> + pslldq m3, m1, 1
> + psubb m3, m1
> + movu [dstq+xq], m3
> + pshufb m1, m2, m0
> + add xq, mmsize
> + cmp xd, widthd
> + jl .loop
> +
> + add srcq, strideq
> + add dstq, widthq
> + sub heightq, 1
> + jg .nextrow
> + REP_RET
> +%endif
There's no need to restrict this to x86-64 only.
The register specification is wrong and will fail on Windows (and 32-bit).
Using a constant 15 for pshufb will be wrong for the first byte of
every row except the first when the width is not a multiple of 16.
Try something like this:
INIT_XMM avx
;------------------------------------------------------------------------------
; sub_left_predict(dst, src, stride, width, height)
;
; Left prediction: every output byte is the source byte minus the byte to its
; left. The "previous" byte starts as the value loaded from pb_80 and is
; carried over from the last pixel of one row to the first pixel of the next.
;
; Source rows are 'stride' bytes apart; output rows are written back to back
; ('width' bytes per row).
;
; NOTE(review): the loop always loads and stores whole 32-byte groups, so for
; widths that are not a multiple of 32 it reads and writes past the end of the
; row. Callers presumably provide padded buffers - confirm.
;
; Register roles:
;   widthq  negative byte offset from the end of the current row (counts up to 0)
;   xq      scratch for (width-1)&15, then holds -width for the rest of the function
;   m0, m1  first / second 16 source bytes of the current 32-byte group
;   m1      also holds the "previous" vector when entering a row
;   m4      pshufb mask that extracts the last pixel of a row into byte 15
;------------------------------------------------------------------------------
cglobal sub_left_predict, 5,6,5, dst, src, stride, width, height, x
    movsxdifnidn  widthq, widthd            ; sign-extend the int width; can be dropped if the C
                                            ; prototype changes width from int to ptrdiff_t
    mova          m1, [pb_80]               ; prev
    add           dstq, widthq              ; point dst and src at the end of the first row so
    add           srcq, widthq              ; that a negative index can count up towards zero
    lea           xd, [widthq-1]
    neg           widthq                    ; widthq = -width
    and           xd, 15                    ; xd = (width-1) & 15: lane of a row's last pixel
    pinsrb        m4, m1, xd, 15            ; m4 = pb_80 contents with byte 15 replaced by that lane
                                            ; index (assuming pb_80 is all 0x80, pshufb with m4
                                            ; zeroes lanes 0-14 and moves the last pixel to lane 15)
    mov           xq, widthq                ; xq = -width, kept to reset widthq and advance dst
.loop:
    movu          m0, [srcq+widthq]
    palignr       m2, m0, m1, 15            ; m2 = left neighbours of m0 (lane 0 from m1 lane 15)
    movu          m1, [srcq+widthq+16]
    palignr       m3, m1, m0, 15            ; m3 = left neighbours of m1 (lane 0 from m0 lane 15)
    psubb         m2, m0, m2                ; src - left
    psubb         m3, m1, m3
    movu          [dstq+widthq], m2
    movu          [dstq+widthq+16], m3
    add           widthq, 2*16
    jl .loop                                ; more 32-byte groups left in this row
    add           srcq, strideq             ; end of the next source row
    sub           dstq, xq                  ; dst += width (xq = -width): output rows are contiguous
    test          xd, 16                    ; bit 4 of -width is set when the row's last pixel was
    jz .mod32                               ; in m0; otherwise it is already in m1
    mova          m1, m0
.mod32:
    pshufb        m1, m4                    ; m1 = "previous" vector for the next row
    mov           widthq, xq                ; restart the negative index for the next row
    dec           heightd
    jg .loop
    RET
More information about the ffmpeg-devel
mailing list