[Ffmpeg-devel] clever 8-bit MMX loop filter ABS test
Skal
skal
Tue May 3 11:03:05 CEST 2005
Richard and all,
On Tue, 2005-05-03 at 09:23, Richard Goedeken wrote:
> Hi guys,
>
> I saw this piece of code in the list last thursday. While it is a rather
> clever way to do the alpha/beta test, it won't produce results that are
> identical to the 16-bit code.
[...]
That code was only meant as a rough sketch, just
to outline the idea. Getting fully working code
takes quite a bit more work.
OK. Attached is fully working code for the
Horizontal-Chroma filtering with bS=4 (the simplest
of all the h264 filtering funcs). The other funcs are
similar, only more involved. There are also still
other tricks that could be applied, but let's keep it
simple for clarity's sake.
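Note: the trick is to pass Alpha-1 / Beta-1 instead
of Alpha / Beta, and to test, for example,
|Po-Qo| <= Alpha-1 instead of |Po-Qo| < Alpha.
The two tests are equivalent for integers, and the
"<=" form can be done with unsigned saturating
subtraction on 8-bit values.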
Note 2: this code makes a different implementation
choice from ffmpeg's code: it filters 2 rows of
8 chroma samples at once.
All in all, staying 8bits pays off: ~3ticks/pel (amortized).
hope it helps,
Skal
; ABS_LESS_SSE out, thr
; Per-byte unsigned test |mm0 - mm1| <= thr on eight packed bytes.
;   %1 (out): MMX register; each byte becomes 0xFF where the test holds, 0x00 elsewhere.
;             May be mm0 (mm0 is last read by the por, before %1 is cleared).
;             Must not be mm1: it would be zeroed before the final compare.
;   %2 (thr): packed byte thresholds (alpha-1 / beta-1). Must not be mm0, mm1 or mm2,
;             all of which are overwritten before or while %2 is read.
;   Inputs : mm0 = Px, mm1 = Qx
;   Trashes: mm0, mm1, mm2
%macro ABS_LESS_SSE 2 ; %1:out reg %2: alpha-1/beta-1 mm0:Px mm1:Qx Trashes mm0,mm1,mm2
movq mm2, mm0 ; mm2 = Px, saved for the reverse difference
psubusb mm0, mm1 ; mm0 = Px-Qx, saturated at 0
psubusb mm1, mm2 ; mm1 = Qx-Px, saturated at 0 (one of the two is |Px-Qx|, the other 0)
psubusb mm0, %2 ; byte stays non-zero only where Px-Qx > thr
psubusb mm1, %2 ; byte stays non-zero only where Qx-Px > thr
por mm1, mm0 ; mm1 byte == 0  <=>  |Px-Qx| <= thr
pxor %1, %1 ; %1 = 0 (must follow the por, since %1 may be mm0)
pcmpeqb %1, mm1 ; %1 = 0xFF where mm1 byte is 0, else 0x00
%endmacro
; FILTER_CHROMA4_SSE src, reg
; Combines the eight bytes of %1 with those of %2 and with the pre-computed
; P1/Q1 average, leaving the masked result in %2.
;   %1: first sample row (Po or Qo); only read, via movq, so it can be a memory operand.
;   %2: MMX register holding the second sample row (P1 or Q1); overwritten with the result.
;   Inputs : mm0 = (P1+Q1+1)>>1, mm1 = P1^Q1,
;            mm6 = [One] (presumably 0x01 in every byte; defined outside this view, confirm),
;            mm7 = byte mask ANDed onto the result.
;   Trashes: mm2, mm3. mm0, mm1, mm6, mm7 are left unchanged.
; Method: two chained pavgb (each rounds up) give avg(avg(%1,%2), mm0); the
; psubusb then removes the one-unit excess, which occurs exactly when one of the
; inner sums was odd and the two inner averages differ in their low bit.
%macro FILTER_CHROMA4_SSE 2 ; filters %1-%2. Result in %2. Trashes mm2,mm3.
; Input: mm0 = (P1+Q1+1)>>1, mm1 = (P1^Q1), mm6 = [One]
movq mm2, %1 ; mm2 = Po
movq mm3, %2 ; mm3 = P1
pavgb %2, mm2 ; %2 = (Po+P1+1)>>1
pxor mm2, mm3 ; mm2 = Po^P1
por mm2, mm1 ; mm2 = (Po^P1) | (P1^Q1)
movq mm3, %2 ; mm3 = (Po+P1+1)>>1
pavgb %2, mm0 ; %2 = average of the two averages, rounded up
pxor mm3, mm0 ; mm3 = ((Po+P1+1)>>1) ^ ((P1+Q1+1)>>1)
pand mm2, mm6 ; keep only the bits selected by One
pand mm3, mm2 ; mm3 = per-byte rounding correction
psubusb %2, mm3 ; %2 = new Po (correction removed)
pand %2, mm7 ; keep the result only in the bytes selected by the mask
%endmacro
align 16
;-----------------------------------------------------------------------
; Skl_Deblock_Chroma4_H_SSE
; Signature: (uint8_t *Dst, const int Stride, const int Alpha_minus1, const int Beta_minus1)
;
; Filters the edge between row Po (Dst - Stride) and row Qo (Dst), eight
; bytes wide, using rows P1 (Dst - 2*Stride) and Q1 (Dst + Stride) as the
; outer taps. Only Po and Qo are written.
;
; Args : read from the stack at [esp+4] .. [esp+16] (32-bit code).
;        Alpha_minus1 / Beta_minus1 are the thresholds minus one; the
;        comparisons performed are |a-b| <= threshold-1.
;        If either value is negative (Alpha == 0 or Beta == 0) no sample
;        can satisfy |a-b| < 0, so the routine returns without touching memory.
; Uses : the constant [One] and the macros ABS_LESS_SSE / FILTER_CHROMA4_SSE.
; Clobb: eax, ecx, edx, mm0-mm7, flags. No emms is executed here.
;-----------------------------------------------------------------------
Skl_Deblock_Chroma4_H_SSE: ; 47c Signature: (uint8_t *Dst, const int Stride, const int Alpha_minus1, const int Beta_minus1)
        ; A negative threshold would be saturated to 0 by packuswb below and
        ; the test would degrade to |a-b| <= 0, wrongly filtering equal samples.
        mov     eax, [esp+12]           ; Alpha-1
        or      eax, [esp+16]           ; sign bit set if Alpha-1 < 0 or Beta-1 < 0
        js      .done                   ; nothing may be filtered

        mov     ecx, [esp+4]            ; Dst
        mov     edx, [esp+8]            ; BpS (Stride)
        mov     eax, ecx                ; eax -> Qo[]
        sub     ecx, edx                ; ecx -> Po[]
        pshufw  mm4, [esp+12], 0        ; Alpha-1, low word replicated x4
        pshufw  mm5, [esp+16], 0        ; Beta -1, low word replicated x4
        movq    mm6, [One]
        packuswb mm4, mm4               ; Alpha-1 in all 8 bytes
        packuswb mm5, mm5               ; Beta -1 in all 8 bytes

        ; Building mask in mm7: 0xFF in every byte where all three tests pass
        movq    mm0, [eax]              ; Qo
        movq    mm1, [ecx]              ; Po
        sub     ecx, edx                ; ecx -> P1[]
        ABS_LESS_SSE mm7, mm4           ; |Po-Qo| < Alpha
        movq    mm0, [ecx]              ; P1
        movq    mm1, [ecx+edx]          ; Po
        sub     ecx, edx                ; ecx -> row above P1 (address only, never
                                        ; dereferenced): P1 = [ecx+edx], Po = [ecx+2*edx]
        ABS_LESS_SSE mm0, mm5           ; |Po-P1| < Beta
        pand    mm7, mm0
        movq    mm0, [eax]              ; Qo
        movq    mm1, [eax+edx]          ; Q1
        ABS_LESS_SSE mm0, mm5           ; |Qo-Q1| < Beta
        pand    mm7, mm0

        ; Filtering (mm4/mm5 are free again: the thresholds are no longer needed)
        movq    mm0, [ecx+edx]          ; P1
        movq    mm1, [eax+edx]          ; Q1
        movq    mm4, mm1                ; mm4 = Q1, saved
        movq    mm5, mm0                ; mm5 = P1, saved
        pavgb   mm0, mm1                ; mm0 = (P1+Q1+1)>>1
        pxor    mm1, mm5                ; mm1 = P1^Q1
        FILTER_CHROMA4_SSE [ecx+2*edx], mm5 ; mm5 = mask & (Po+P1+(P1+Q1)+2)>>2
        FILTER_CHROMA4_SSE [eax], mm4       ; mm4 = mask & (Qo+Q1+(P1+Q1)+2)>>2

        ; Blend: filtered value where the mask is set, original value elsewhere
        movq    mm0, mm7
        movq    mm1, mm7
        pandn   mm0, [ecx+2*edx]        ; old Po, anti-masked
        pandn   mm1, [eax]              ; old Qo, anti-masked
        por     mm0, mm5
        movq    [ecx+2*edx], mm0        ; => Po
        por     mm1, mm4
        movq    [eax], mm1              ; => Qo
.done:
        ret
More information about the ffmpeg-devel
mailing list