[FFmpeg-devel] [PATCH] lavfi/hqdn3d/x86: use full register for lowpass.

Fri Sep 21 05:45:00 CEST 2012

On Fri, 21 Sep 2012, Michael Niedermayer wrote:

> On Wed, Sep 19, 2012 at 05:08:10PM +0200, Nicolas George wrote:
> > LOWPASS starts using the full register for prevsample and cursample,
> > but currently only returns a valid 32-bits register; later calls will
> > reuse the value as full register, leading to possibly incorrect
> > high-order bits.
> >
> > Fix trac ticket #1752.
> >
> > Signed-off-by: Nicolas George <nicolas.george at normalesup.org>
> > ---
> >  libavfilter/x86/hqdn3d.asm |    4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> >
> > I am rather out of my league here. Using gdb, I observed this for the second
> > line of the offending sample:
> >
> > "LOWPASS   t1, pixelant, spatial" becomes:
> >
> > sub    %r10,%rbx              rbx = 0xfffffffffffffff9 = -7
> > sar    $0x4,%rbx              rbx = 0xffffffffffffffff = -1
> > movswl (%r9,%rbx,2),%ebx      rbx = 0xfffffff8 = (uint32_t)-8
> > add    %r10d,%ebx             rbx = 0xffffffff
> >
> > t1 is r10, pixelant is rbx.
> >
> > Then, a few lines below "LOWPASS   t0, t1, temporal" becomes:
> >
> > sub    %rbx,%r11              r11 = 0xffffffff00000001
> > sar    $0x4,%r11              r11 = 0xfffffffff0000000
> > movswl (%rax,%r11,2),%r11d    SEGV
> > add    %ebx,%r11d
> >
> > The first subtraction on rbx seems to result in an incorrect sign extension,
> > leading to a segfault when it is used as an index in the "temporal" array.
> >
> >
> > diff --git a/libavfilter/x86/hqdn3d.asm b/libavfilter/x86/hqdn3d.asm
> > index 88b9b0d..324642d 100644
> > --- a/libavfilter/x86/hqdn3d.asm
> > +++ b/libavfilter/x86/hqdn3d.asm
> > @@ -27,8 +27,8 @@ SECTION .text
> >  %if lut_bits != 8
> >      sar    %1q, 8-lut_bits
> >  %endif
> > -    movsx  %1d, word [%3q+%1q*2]
> > -    add    %1d, %2d
> > +    movsx  %1q, word [%3q+%1q*2]
> > +    add    %1q, %2q
> >  %endmacro

This value should never be negative.
Different fix that more closely tracks the fact that pixels are unsigned:

---
 libavfilter/vf_hqdn3d.c    |    7 +++----
 libavfilter/x86/hqdn3d.asm |    2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/libavfilter/vf_hqdn3d.c b/libavfilter/vf_hqdn3d.c
index 71b670d..e3e45f2 100644
--- a/libavfilter/vf_hqdn3d.c
+++ b/libavfilter/vf_hqdn3d.c
@@ -50,10 +50,9 @@ void ff_hqdn3d_row_10_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16
 void ff_hqdn3d_row_16_x86(uint8_t *src, uint8_t *dst, uint16_t *line_ant, uint16_t *frame_ant, ptrdiff_t w, int16_t *spatial, int16_t *temporal);

 #define LUT_BITS (depth==16 ? 8 : 4)
-#define RIGHTSHIFT(a,b) (((a)+(((1<<(b))-1)>>1))>>(b))
-#define LOAD(x) ((depth==8 ? src[x] : AV_RN16A(src+(x)*2)) << (16-depth))
-#define STORE(x,val) (depth==8 ? dst[x] = RIGHTSHIFT(val, 16-depth)\
-                    : AV_WN16A(dst+(x)*2, RIGHTSHIFT(val, 16-depth)))
+#define LOAD(x) (((depth==8 ? src[x] : AV_RN16A(src+(x)*2)) << (16-depth)) + (((1<<(16-depth))-1)>>1))
+#define STORE(x,val) (depth==8 ? dst[x] = (val) >> (16-depth)\
+                    : AV_WN16A(dst+(x)*2, (val) >> (16-depth)))

 av_always_inline
 static uint32_t lowpass(int prev, int cur, int16_t *coef, int depth)
diff --git a/libavfilter/x86/hqdn3d.asm b/libavfilter/x86/hqdn3d.asm
index 7254194..8554093 100644
--- a/libavfilter/x86/hqdn3d.asm
+++ b/libavfilter/x86/hqdn3d.asm
@@ -39,6 +39,7 @@ SECTION .text
 %endif
 %if %3 != 16
     shl    %1, 16-%3
+    add    %1, (1<<(15-%3))-1
 %endif
 %endmacro

@@ -86,7 +87,6 @@ ALIGN 16
     mov       [frameantq+xq*2], t0w
     movifnidn dstq, dstmp
 %if %1 != 16
-    add    t0d, (1<<(15-%1))-1
     shr    t0d, 16-%1 ; could eliminate this by storing from t0h, but only with some contraints on register allocation
 %endif
 %if %1 == 8
-- 
1.7.4.1