[FFmpeg-devel] [PATCH] VP8 arithcoder asm

Sun Jul 4 12:39:44 CEST 2010

On Sun, 4 Jul 2010, Jason Garrett-Glaser wrote:

>+cglobal vp56_rac_get_prob, 2,6
>+    mov    r2d, [r0+cb.high]
>+    mov    r3d, r2d
>+    dec    r2d                       ; high-1
>+    imul   r2d, r1d                  ; (high-1) * prob
>+    mov    r4d, [r0+cb.code_word]
>+    shr    r2d, 8                    ; ((high-1) * prob) >> 8
>+    mov    r1d, r4d
>+    inc    r2d                       ; 1 + (((high-1) * prob) >> 8)

(high*prob - (prob-256)) >> 8
computes the same value, since 1 + (x >> 8) == (x + 256) >> 8, and looks
like it has a dependency chain that is one instruction shorter.

Does any rounding get simpler if you negate the codeword and reverse the 
roles of high and low?

>+    mov    r5d, r2d                  ; low
>+    sub    r3d, r2d                  ; high - low
>+    shl    r2d, 8                    ; low_shift
>+    sub    r1d, r2d                  ; code_word - low_shift

If code_word fits in 16 bits, you can subtract the low byte of r2 from the 
second byte of r1 without any intervening shift.

>+    cmp    r4d, r2d                  ; if(code_word >= low_shift)

Redundant: the sub just above compares the same two values and already set the same flags.

>+    setge  r2b                       ; bit = code_word >= low_shift
>+    cmovge r5d, r3d                  ; high = bit ? high - low : low
>+    mov    r3d, [r0+cb.bits]
>+    cmovge r4d, r1d                  ; code_word = bit ? code_word - low_shift : code_word
>+    movzx  r1d, byte [h264_norm_shift+r5] ; ff_h264_norm_shift[high]
>+    movzx  r2d, r2b

Merge this movzx with the later mov to eax.

>+    dec    r1d                       ; ff_h264_norm_shift[high] - 1
>+    shl    r5d, r1b                  ; high <<= shift
>+    shl    r4d, r1b                  ; code_word <<= shift
>+    add    r3d, r1d                  ; bits += shift

Add from memory, unless the separate load improves scheduling.

>+    mov [r0+cb.high], r5d
>+    jge .putbyte                     ; if(bits >= 0)
>+.ret:
>+    mov [r0+cb.bits], r3d
>+    mov [r0+cb.code_word], r4d
>+    mov    eax, r2d
>+    RET
>+.putbyte:

This label should be named .getbyte: the code below reads a byte from the buffer, it does not write one.

>+    mov     r1, [r0+cb.buffer]
>+    cmp     r1, [r0+cb.end]          ; if(buffer >= end)
>+    jge .ret
>+    movzx  r5d, byte [r1]            ; *buffer
>+    inc     r1                       ; buffer++
>+    mov [r0+cb.buffer], r1
>+    mov    r1d, r3d
>+    sub    r3d, 8                    ; bits -= 8
>+    shl    r5d, r1b                  ; *buffer << bits
>+    or     r4d, r5d                  ; code_word |= *buffer << bits
>+    mov [r0+cb.bits], r3d
>+    mov [r0+cb.code_word], r4d
>+    mov    eax, r2d
>+    RET

--Loren Merritt