[FFmpeg-devel] [PATCH] Make swscale x86 ASM depend less on registers been preserved across blocks
Vitor Sessak
vitor1001 at gmail.com
Wed Sep 17 22:55:52 CEST 2014
On Wed, Sep 17, 2014 at 10:31 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
> On Wed, Sep 17, 2014 at 09:21:08PM +0200, Vitor Sessak wrote:
>> Hi!
>>
>> I'm not sure we can count on registers been preserved across ASM
>> blocks. Moreover, this causes problems with utils that instrument the
>> code by inserting function calls between lines (like ThreadSanitizer).
>>
>> The attached patch fix one instance of this problem in swscale.
>>
>> -Vitor
>
>> swscale.c | 82 +++++++++++++++++++++++++++++++-------------------------------
>> 1 file changed, 42 insertions(+), 40 deletions(-)
>> 7fe9f7a0c1a7c1ed843b891f5f810fbd94a2e03f 0001-swscale-x86-do-not-expect-registers-to-be-preserved-.patch
>> From f1c59628b2baeb9994bed8947c0a2c228610bf4f Mon Sep 17 00:00:00 2001
>> From: Vitor Sessak <vsessak at google.com>
>> Date: Wed, 17 Sep 2014 21:10:16 +0200
>> Subject: [PATCH] swscale/x86: do not expect registers to be preserved across
>> inline ASM blocks
>>
>> ---
>> libswscale/x86/swscale.c | 82 +++++++++++++++++++++++++-----------------------
>> 1 file changed, 42 insertions(+), 40 deletions(-)
>>
>> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
>> index c4c0e28..38157c8 100644
>> --- a/libswscale/x86/swscale.c
>> +++ b/libswscale/x86/swscale.c
>> @@ -205,36 +205,19 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
>> yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
>> return;
>> }
>> - if (offset) {
>> - __asm__ volatile("movq (%0), %%xmm3\n\t"
>> - "movdqa %%xmm3, %%xmm4\n\t"
>> - "psrlq $24, %%xmm3\n\t"
>> - "psllq $40, %%xmm4\n\t"
>> - "por %%xmm4, %%xmm3\n\t"
>> - :: "r"(dither)
>> - );
>> - } else {
>> - __asm__ volatile("movq (%0), %%xmm3\n\t"
>> - :: "r"(dither)
>> - );
>> - }
>
>> - filterSize--;
>
> not sure i missed it but it seems this is lost in teh new code
Good catch. Fixed!
>> - __asm__ volatile(
>> - "pxor %%xmm0, %%xmm0\n\t"
>> - "punpcklbw %%xmm0, %%xmm3\n\t"
>> - "movd %0, %%xmm1\n\t"
>> - "punpcklwd %%xmm1, %%xmm1\n\t"
>> - "punpckldq %%xmm1, %%xmm1\n\t"
>> - "punpcklqdq %%xmm1, %%xmm1\n\t"
>> - "psllw $3, %%xmm1\n\t"
>> - "paddw %%xmm1, %%xmm3\n\t"
>> - "psraw $4, %%xmm3\n\t"
>> - ::"m"(filterSize)
>> - );
>> - __asm__ volatile(
>> - "movdqa %%xmm3, %%xmm4\n\t"
>> - "movdqa %%xmm3, %%xmm7\n\t"
>> - "movl %3, %%ecx\n\t"
>> +#define MAIN_FUNCTION \
>> + "pxor %%xmm0, %%xmm0 \n\t" \
>> + "punpcklbw %%xmm0, %%xmm3 \n\t" \
>> + "movd %4, %%xmm1 \n\t" \
>> + "punpcklwd %%xmm1, %%xmm1 \n\t" \
>> + "punpckldq %%xmm1, %%xmm1 \n\t" \
>> + "punpcklqdq %%xmm1, %%xmm1 \n\t" \
>> + "psllw $3, %%xmm1 \n\t" \
>> + "paddw %%xmm1, %%xmm3 \n\t" \
>> + "psraw $4, %%xmm3 \n\t" \
>> + "movdqa %%xmm3, %%xmm4 \n\t" \
>> + "movdqa %%xmm3, %%xmm7 \n\t" \
>> + "movl %3, %%ecx \n\t" \
>> "mov %0, %%"REG_d" \n\t"\
>> "mov (%%"REG_d"), %%"REG_S" \n\t"\
>> ".p2align 4 \n\t" /* FIXME Unroll? */\
>> @@ -252,20 +235,39 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
>> " jnz 1b \n\t"\
>> "psraw $3, %%xmm3 \n\t"\
>> "psraw $3, %%xmm4 \n\t"\
>> - "packuswb %%xmm4, %%xmm3 \n\t"
>> - "movntdq %%xmm3, (%1, %%"REG_c")\n\t"
>> + "packuswb %%xmm4, %%xmm3 \n\t"\
>> + "movntdq %%xmm3, (%1, %%"REG_c")\n\t"\
>> "add $16, %%"REG_c" \n\t"\
>> "cmp %2, %%"REG_c" \n\t"\
>> - "movdqa %%xmm7, %%xmm3\n\t"
>> - "movdqa %%xmm7, %%xmm4\n\t"
>> + "movdqa %%xmm7, %%xmm3 \n\t" \
>> + "movdqa %%xmm7, %%xmm4 \n\t" \
>> "mov %0, %%"REG_d" \n\t"\
>> "mov (%%"REG_d"), %%"REG_S" \n\t"\
>> - "jb 1b \n\t"\
>> - :: "g" (filter),
>> - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
>> - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
>> - "%"REG_d, "%"REG_S, "%"REG_c
>> - );
>> + "jb 1b \n\t"
>> +
>> + if (offset) {
>> + __asm__ volatile(
>> + "movq %5, %%xmm3 \n\t"
>> + "movdqa %%xmm3, %%xmm4 \n\t"
>> + "psrlq $24, %%xmm3 \n\t"
>> + "psllq $40, %%xmm4 \n\t"
>> + "por %%xmm4, %%xmm3 \n\t"
>> + MAIN_FUNCTION
>> + :: "g" (filter),
>> + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
>> + "m"(filterSize), "m"(((uint64_t *) dither)[0])
>> + : "%"REG_d, "%"REG_S, "%"REG_c
>> + );
>
> you lost the XMM_CLOBBERS()
True. Restored it.
>> + } else {
>> + __asm__ volatile(
>> + "movq %5, %%xmm3 \n\t"
>> + MAIN_FUNCTION
>> + :: "g" (filter),
>> + "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
>> + "m"(filterSize), "m"(((uint64_t *) dither)[0])
>> + : "%"REG_d, "%"REG_S, "%"REG_c
>> + );
>
> tabs
Why the hell the default emacs profile is so bad :-(
New patch attached.
-Vitor
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-swscale-x86-do-not-expect-registers-to-be-preserved-.patch
Type: application/octet-stream
Size: 5181 bytes
Desc: not available
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140917/89130fde/attachment.obj>
More information about the ffmpeg-devel
mailing list