[Ffmpeg-devel] clever 8-bit MMX loop filter ABS test

Tue May 3 11:03:05 CEST 2005

	Richard and all,

On Tue, 2005-05-03 at 09:23, Richard Goedeken wrote:
> Hi guys,
> 
> I saw this piece of code in the list last thursday. While it is a rather
> clever way to do the alpha/beta test, it won't produce results that are
> identical to the 16-bit code.

	[...]

	This code was only meant as a hint of an example, just
	to outline the idea. Getting fully working code
	takes quite a bit more work.

	Ok. Attached is fully working code for doing the
	Horizontal-Chroma filtering with bS=4 (the simplest
	of all the h264 filtering funcs). The other funcs are
	similar, only more involved. Plus there are still
	other tricks applicable, but let's keep it simple for
	clarity's sake.

	Note: the trick is to pass Alpha-1 / Beta-1 instead
	of Alpha/Beta, and to test e.g. |Po-Qo|<=Alpha-1 instead
	of |Po-Qo|<Alpha (the two tests are equivalent for
	integers, and '<=' is what saturated 8-bit subtraction gives).

	Note2: this code uses a different implementation
	choice than ffmpeg's code: it filters 2 rows of
	8 chroma samples all together.

	All in all, staying in 8 bits pays off: ~3 ticks/pel (amortized).

	hope it helps,
Skal


; ABS_LESS_SSE out, thresh
;   Per-byte unsigned test  |mm0 - mm1| <= thresh  on 8 bytes at once.
;   1st arg : output MMX register; each byte becomes 0xFF where the test holds,
;             0x00 elsewhere.
;   2nd arg : threshold bytes. The call sites in this file pass Alpha-1 or Beta-1,
;             so "<= thresh" is the same as "< Alpha" / "< Beta".
;   Inputs  : mm0 = Px, mm1 = Qx (the test is symmetric, so their order is irrelevant).
;   Trashes : mm0, mm1, mm2.
;   The output register may be mm0: it is only written after mm0's last read.
%macro ABS_LESS_SSE 2     ;   %1:out reg  %2: alpha-1/beta-1  mm0:Px mm1:Qx  Trashes mm0,mm1,mm2
  movq    mm2, mm0  ; keep a copy of Px, since mm0 is overwritten next
  psubusb mm0, mm1  ; Px-Qx, saturated to 0 where Px < Qx
  psubusb mm1, mm2  ; Qx-Px, saturated to 0 where Qx < Px
  psubusb mm0,  %2  ; 0 wherever (Px-Qx) <= threshold
  psubusb mm1,  %2  ; 0 wherever (Qx-Px) <= threshold
  por     mm1, mm0  ; 0 only in the bytes where |Px-Qx| <= threshold
  pxor     %1,  %1  ; out = 0
  pcmpeqb  %1, mm1  ; out = 0xFF in the bytes where mm1 == 0
%endmacro

; FILTER_CHROMA4_SSE x0, x1
;   Computes, per byte, (X0 + X1 + P1 + Q1 + 2)>>2 with exact rounding while
;   staying in 8 bits, then masks the result with mm7.
;   1st arg : X0 = Po or Qo. Only read; the call sites pass a memory operand.
;   2nd arg : X1 = P1 or Q1 on entry, the masked filtered value on exit.
;             Must be an MMX register because it is used as a destination.
;   Inputs  : mm0 = (P1+Q1+1)>>1, mm1 = P1^Q1, mm6 = [One], mm7 = filter mask
;             (0xFF in the bytes to filter). mm0, mm1, mm6 and mm7 are not modified.
;             NOTE(review): One is defined outside this excerpt; presumably 0x01
;             in every byte, since it is used to isolate bit 0 - confirm.
;   Trashes : mm2, mm3.
;   Averaging two pavgb results rounds twice and can come out 1 too high;
;   the pxor/pand/psubusb sequence below removes that excess.
%macro FILTER_CHROMA4_SSE 2       ; filters %1-%2. Result in %2.        Trashes mm2,mm3.
                                  ; Input: mm0 = (P1+Q1+1)>>1, mm1 = (P1^Q1), mm6 = [One], mm7 = mask
  movq    mm2,  %1    ; mm2 = X0 (Po or Qo)
  movq    mm3,  %2    ; mm3 = X1 (P1 or Q1)
  pavgb    %2, mm2    ; arg2 = (X0+X1+1)>>1
  pxor    mm2, mm3    ; mm2 = X0^X1 (bit 0 set where the average above rounded up)
  por     mm2, mm1    ; mm2 = (X0^X1) | (P1^Q1)

  movq    mm3,  %2    ; mm3 = copy of the first-level average
  pavgb    %2, mm0    ; arg2 = average of the two averages (may be 1 too high)
  pxor    mm3, mm0    ; mm3 = avg(X0,X1) ^ avg(P1,Q1) (bit 0 set where this pavgb rounded up)
  pand    mm2, mm6    ; keep bit 0 only (mm6 = One)
  pand    mm3, mm2    ; mm3 = 1 where this pavgb rounded up and a first-level one did too
  psubusb  %2, mm3    ; remove the double rounding: arg2 = (X0 + X1 + P1 + Q1 + 2)>>2
  pand     %2, mm7    ; zero the bytes that must keep their old value
%endmacro

;-----------------------------------------------------------------------
; Skl_Deblock_Chroma4_H_SSE(uint8_t *Dst, const int Stride,
;                           const int Alpha_minus1, const int Beta_minus1)
;
; Filters 8 adjacent chroma samples across the edge that lies between the
; row at Dst-Stride and the row at Dst:
;     P1 = Dst - 2*Stride    (read only)
;     Po = Dst -   Stride    (read, conditionally rewritten)
;     Qo = Dst               (read, conditionally rewritten)
;     Q1 = Dst +   Stride    (read only)
; A byte column is filtered only when
;     |Po-Qo| <= Alpha_minus1  and  |P1-Po| <= Beta_minus1  and  |Q1-Qo| <= Beta_minus1
; and then receives
;     Po' = (Po + 2*P1 + Q1 + 2)>>2,   Qo' = (Qo + 2*Q1 + P1 + 2)>>2.
; Other columns are written back unchanged.
;
; In:    arguments are read from the stack at [esp+4] .. [esp+16].
;        Only the low 16 bits of Alpha_minus1 / Beta_minus1 are used, and
;        packuswb saturates them to 0..255 (a negative value becomes 0).
;        NOTE(review): with a threshold of 0 the test passes where the two
;        samples are equal, so a caller holding Alpha == 0 or Beta == 0
;        presumably must not call this function - confirm against callers.
; Out:   no return value is set up (eax still holds Dst on exit).
; Clobb: eax, ecx, edx, mm0-mm7. No emms is executed here.
; Needs: the 8-byte constant One, defined outside this excerpt.
;-----------------------------------------------------------------------
align 16
Skl_Deblock_Chroma4_H_SSE:   ; 47c  Signature:  (uint8_t *Dst, const int Stride, const int Alpha_minus1, const int Beta_minus1)

  mov ecx, [esp+4] ; Dst
  mov edx, [esp+8] ; Stride

  mov eax, ecx      ; eax = Dst          -> Qo row (stays fixed)
  sub ecx, edx      ; ecx = Dst - Stride -> Po row

  pshufw   mm4, [esp+12], 0 ; low word of Alpha-1 copied into all 4 words
  pshufw   mm5, [esp+16], 0 ; low word of Beta -1 copied into all 4 words

  movq     mm6, [One]       ; constant used by FILTER_CHROMA4_SSE to isolate bit 0

  packuswb mm4, mm4         ; Alpha-1 in all 8 bytes (saturated to 0..255)
  packuswb mm5, mm5         ; Beta -1 in all 8 bytes (saturated to 0..255)

    ; Building mask in mm7: 0xFF in every byte column that passes all three tests

  movq     mm0, [eax      ]   ; Qo
  movq     mm1, [ecx      ]   ; Po
  sub      ecx, edx           ; ecx = Dst - 2*Stride -> P1 row
  ABS_LESS_SSE mm7, mm4   ; |Po-Qo|<Alpha

  movq     mm0, [ecx      ]   ; P1
  movq     mm1, [ecx+  edx]   ; Po
  sub      ecx, edx           ; ecx = Dst - 3*Stride; from here P1 = [ecx+edx], Po = [ecx+2*edx]
  ABS_LESS_SSE mm0, mm5   ; |Po-P1|<Beta
  pand     mm7, mm0

  movq     mm0, [eax    ]     ; Qo
  movq     mm1, [eax+edx]     ; Q1
  ABS_LESS_SSE mm0, mm5   ; |Qo-Q1|<Beta
  pand     mm7, mm0

    ; Filtering (mm4/mm5 no longer hold the thresholds from here on)

  movq    mm0, [ecx+  edx] ; P1
  movq    mm1, [eax+  edx] ; Q1

  movq    mm4, mm1    ; mm4 = Q1, saved
  movq    mm5, mm0    ; mm5 = P1, saved
  pavgb   mm0, mm1    ; mm0 = (P1+Q1+1)>>1
  pxor    mm1, mm5    ; mm1 = P1^Q1

  FILTER_CHROMA4_SSE  [ecx+2*edx], mm5    ; Computes (Po+P1+(P1+Q1)+2)>>2, masked by mm7, into mm5
  FILTER_CHROMA4_SSE  [eax      ], mm4    ; Computes (Qo+Q1+(P1+Q1)+2)>>2, masked by mm7, into mm4

    ; Blend: filtered value where the mask is set, original sample elsewhere

  movq    mm0, mm7
  movq    mm1, mm7
  pandn   mm0, [ecx+2*edx] ; old Po, anti-masked
  pandn   mm1, [eax      ] ; old Qo, anti-masked
  por     mm0, mm5         ; merge with the masked new Po
  movq    [ecx+2*edx], mm0   ; => Po
  por     mm1, mm4         ; merge with the masked new Qo
  movq    [eax      ], mm1   ; => Qo

  ret