[FFmpeg-devel] [WIP] vc2 encoder simd assembly

Tue Mar 1 17:08:01 CET 2016

Hello

I've been working on assembly for the vc2 encoder and have reached an
impasse.  My code results in very visible errors, very obvious vertical
streaks in the bottom-right half of the image and some low-frequency
effect (I think).

I cannot see the problem in my code so I need some fresh eyes to look at it.

You can test the code with these two commands:
> ./ffmpeg -f lavfi -i testsrc=s=1024x576,format=yuv420p -vcodec vc2 -strict -1 -vframes 1 -cpuflags 0 -y temp.vc2
> ./ffmpeg -f lavfi -i testsrc=s=1024x576,format=yuv420p -vcodec vc2 -strict -1 -vframes 1 -y temp.vc2

Decode each to a png image to see the problem.

Attached is a diff from master (my branch is a two dozen small commits).

Thanks.

-------------- next part --------------

diff --git a/libavcodec/vc2enc_dwt.c b/libavcodec/vc2enc_dwt.c
index c60b003..bac262d 100644
--- a/libavcodec/vc2enc_dwt.c
+++ b/libavcodec/vc2enc_dwt.c
@@ -23,6 +23,7 @@
 #include "libavutil/attributes.h"
 #include "libavutil/mem.h"
 #include "vc2enc_dwt.h"
+#include "libavcodec/x86/vc2dsp.h"
 
 /* Since the transforms spit out interleaved coefficients, this function
  * rearranges the coefficients into the more traditional subdivision,
@@ -76,16 +77,21 @@ static void vc2_subband_dwt_97(VC2TransformContext *t, dwtcoef *data,
     for (y = 0; y < synth_height; y++) {
         /* Lifting stage 2. */
         synthl[1] -= (8*synthl[0] + 9*synthl[2] - synthl[4] + 8) >> 4;
+
         for (x = 1; x < width - 2; x++)
-            synthl[2*x + 1] -= (9*synthl[2*x] + 9*synthl[2*x + 2] - synthl[2*x + 4] -
-                                synthl[2 * x - 2] + 8) >> 4;
+            synthl[2*x + 1] -= (9*synthl[2*x] +
+                                9*synthl[2*x + 2] -
+                                  synthl[2*x + 4] -
+                                  synthl[2*x - 2] + 8) >> 4;
+
         synthl[synth_width - 1] -= (17*synthl[synth_width - 2] -
-                                    synthl[synth_width - 4] + 8) >> 4;
+                                       synthl[synth_width - 4] + 8) >> 4;
         synthl[synth_width - 3] -= (8*synthl[synth_width - 2] +
                                     9*synthl[synth_width - 4] -
-                                    synthl[synth_width - 6] + 8) >> 4;
+                                      synthl[synth_width - 6] + 8) >> 4;
         /* Lifting stage 1. */
         synthl[0] += (synthl[1] + synthl[1] + 2) >> 2;
+
         for (x = 1; x < width - 1; x++)
             synthl[2*x] += (synthl[2*x - 1] + synthl[2*x + 1] + 2) >> 2;
 
@@ -97,25 +103,28 @@ static void vc2_subband_dwt_97(VC2TransformContext *t, dwtcoef *data,
     /* Vertical synthesis: Lifting stage 2. */
     synthl = synth + synth_width;
     for (x = 0; x < synth_width; x++)
-        synthl[x] -= (8*synthl[x - synth_width] + 9*synthl[x + synth_width] -
-                      synthl[x + 3 * synth_width] + 8) >> 4;
+        synthl[x] -= (8*synthl[x - synth_width] +
+                      9*synthl[x + synth_width] -
+                        synthl[x + 3*synth_width] + 8) >> 4;
 
     synthl = synth + (synth_width << 1);
     for (y = 1; y < height - 2; y++) {
         for (x = 0; x < synth_width; x++)
             synthl[x + synth_width] -= (9*synthl[x] +
-                                        9*synthl[x + 2 * synth_width] -
-                                        synthl[x - 2 * synth_width] -
-                                        synthl[x + 4 * synth_width] + 8) >> 4;
+                                        9*synthl[x + 2*synth_width] -
+                                          synthl[x - 2*synth_width] -
+                                          synthl[x + 4*synth_width] + 8) >> 4;
         synthl += synth_width << 1;
     }
 
     synthl = synth + (synth_height - 1) * synth_width;
     for (x = 0; x < synth_width; x++) {
         synthl[x] -= (17*synthl[x - synth_width] -
-                      synthl[x - 3*synth_width] + 8) >> 4;
-                      synthl[x - 2*synth_width] -= (9*synthl[x - 3*synth_width] +
-                      8*synthl[x - 1*synth_width] - synthl[x - 5*synth_width] + 8) >> 4;
+                         synthl[x - 3*synth_width] + 8) >> 4;
+
+        synthl[x - 2*synth_width] -= (9*synthl[x - 3*synth_width] +
+                                      8*synthl[x - 1*synth_width] -
+                                        synthl[x - 5*synth_width] + 8) >> 4;
     }
 
     /* Vertical synthesis: Lifting stage 1. */
@@ -262,6 +271,10 @@ av_cold int ff_vc2enc_init_transforms(VC2TransformContext *s, int p_width, int p
     s->vc2_subband_dwt[VC2_TRANSFORM_HAAR]   = vc2_subband_dwt_haar;
     s->vc2_subband_dwt[VC2_TRANSFORM_HAAR_S] = vc2_subband_dwt_haar_shift;
 
+#if ARCH_X86
+    ff_vc2enc_init_x86(s, p_width, p_height);
+#endif
+
     s->buffer = av_malloc(2*p_width*p_height*sizeof(dwtcoef));
     if (!s->buffer)
         return 1;
diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 839b5bc..24aedda 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -34,6 +34,7 @@ OBJS-$(CONFIG_PIXBLOCKDSP)             += x86/pixblockdsp_init.o
 OBJS-$(CONFIG_QPELDSP)                 += x86/qpeldsp_init.o
 OBJS-$(CONFIG_RV34DSP)                 += x86/rv34dsp_init.o
 OBJS-$(CONFIG_VC1DSP)                  += x86/vc1dsp_init.o
+OBJS-$(CONFIG_VC2_ENCODER)             += x86/vc2dsp_init.o
 OBJS-$(CONFIG_VIDEODSP)                += x86/videodsp_init.o
 OBJS-$(CONFIG_VP3DSP)                  += x86/vp3dsp_init.o
 OBJS-$(CONFIG_VP8DSP)                  += x86/vp8dsp_init.o
@@ -122,6 +123,7 @@ YASM-OBJS-$(CONFIG_QPELDSP)            += x86/qpeldsp.o                 \
 YASM-OBJS-$(CONFIG_RV34DSP)            += x86/rv34dsp.o
 YASM-OBJS-$(CONFIG_VC1DSP)             += x86/vc1dsp_loopfilter.o       \
                                           x86/vc1dsp_mc.o
+YASM-OBJS-$(CONFIG_VC2_ENCODER)        += x86/vc2enc_dwt.o
 YASM-OBJS-$(CONFIG_IDCTDSP)            += x86/simple_idct10.o
 YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
diff --git a/libavcodec/x86/vc2dsp.h b/libavcodec/x86/vc2dsp.h
new file mode 100644
index 0000000..825862e
--- /dev/null
+++ b/libavcodec/x86/vc2dsp.h
@@ -0,0 +1,28 @@
+/*
+ * VC2 encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_X86_VC2DSP_H
+#define AVCODEC_X86_VC2DSP_H
+
+#include "libavcodec/vc2enc_dwt.h"
+
+void ff_vc2enc_init_x86(VC2TransformContext *s, int p_width, int p_height);
+
+#endif /* AVCODEC_X86_VC2DSP_H */
diff --git a/libavcodec/x86/vc2dsp_init.c b/libavcodec/x86/vc2dsp_init.c
new file mode 100644
index 0000000..b3bdb53
--- /dev/null
+++ b/libavcodec/x86/vc2dsp_init.c
@@ -0,0 +1,67 @@
+/*
+ * VC2 encoder
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/cpu.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/x86/vc2dsp.h"
+
+#if HAVE_SSE2_EXTERNAL
+void ff_vc2_subband_dwt_97_sse2(VC2TransformContext*, dwtcoef*, ptrdiff_t, int , int);
+#endif
+
+static av_always_inline void deinterleave(dwtcoef *linell, ptrdiff_t stride,
+                                          int width, int height, dwtcoef *synthl)
+{
+    int x, y;
+    ptrdiff_t synthw = width << 1;
+    dwtcoef *linehl = linell + width;
+    dwtcoef *linelh = linell + height*stride;
+    dwtcoef *linehh = linelh + width;
+
+    /* Deinterleave the coefficients. */
+    for (y = 0; y < height; y++) {
+        for (x = 0; x < width; x++) {
+            linell[x] = synthl[(x << 1)];
+            linehl[x] = synthl[(x << 1) + 1];
+            linelh[x] = synthl[(x << 1) + synthw];
+            linehh[x] = synthl[(x << 1) + synthw + 1];
+        }
+        synthl += synthw << 1;
+        linell += stride;
+        linelh += stride;
+        linehl += stride;
+        linehh += stride;
+    }
+}
+
+void static wrapper_97(VC2TransformContext *t, dwtcoef *data, ptrdiff_t stride, int width, int height)
+{
+    ff_vc2_subband_dwt_97_sse2(t, data, stride, width, height);
+    deinterleave(data, stride, width, height, t->buffer);
+}
+
+av_cold void ff_vc2enc_init_x86(VC2TransformContext *s, int p_width, int p_height)
+{
+    int cpuflags = av_get_cpu_flags();
+
+    if (EXTERNAL_SSE2(cpuflags)) {
+        s->vc2_subband_dwt[VC2_TRANSFORM_9_7] = wrapper_97;
+    }
+}
diff --git a/libavcodec/x86/vc2enc_dwt.asm b/libavcodec/x86/vc2enc_dwt.asm
new file mode 100644
index 0000000..c6fdf36
--- /dev/null
+++ b/libavcodec/x86/vc2enc_dwt.asm
@@ -0,0 +1,312 @@
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+cextern pw_2
+cextern pw_8
+cextern pw_9
+cextern pw_17
+
+section .text
+
+INIT_XMM sse2
+cglobal vc2_subband_dwt_97, 5, 12, 5, synth, data, stride, width, height
+    sub rsp, 5 * gprsize
+    mov synthq, [synthq]
+    movsxdifnidn widthq, widthd
+    movsxdifnidn heightq, heightd
+    mov [rsp + gprsize*0], synthq
+    mov [rsp + gprsize*1], dataq
+    mov [rsp + gprsize*2], strideq
+    mov [rsp + gprsize*3], widthq
+    mov [rsp + gprsize*4], heightq
+
+    DECLARE_REG_TMP 9, 10, 11
+    %define synth_w r5
+    %define synth_h r6
+    mov synth_w, widthq
+    mov synth_h, heightq
+    add synth_w, synth_w
+    add synth_h, synth_h
+
+    %define x r7
+    %define y r8
+    ; shifting loop (for precision and copy)
+    lea synthq, [synthq + synth_w*2] ; add 1 synth_w to save comparing x against synth_w
+    lea dataq,  [dataq + synth_w*2]
+    mov y, synth_h
+    .loop1_y:
+        mov x, synth_w
+        neg x
+        .loop1_x:
+            movu   m0, [dataq + x*2]
+            paddw  m0, m0
+            movu  [synthq + x*2],  m0
+            add    x, mmsize/2
+        jl .loop1_x
+
+        lea dataq, [dataq + strideq*2]
+        lea synthq, [synthq + synth_w*2]
+        sub y, 1
+    jg .loop1_y
+
+    ; Horizontal synthesis
+    mov synthq, [rsp]
+    mov y, synth_h
+    .loop2_y:
+        ; Lifting stage 2
+        mov t0w, word [synthq+0]
+        mov t1w, word [synthq+4]
+        imul t0w, 8
+        imul t1w, 9
+        add t0w, 8
+        sub t1w, word [synthq+8]
+        add t0w, t1w
+        sar t0w, 4
+        sub word [synthq+2], t0w
+
+        mov x, 1
+        sub widthq, 2
+        .loop2_x_1:
+%if 0 ; needs horizontal masking
+            movu m0, [synthq + x*4 + 0]
+            movu m1, [synthq + x*4 + 4]
+            movu m2, [synthq + x*4 + 8]
+            movu m3, [synthq + x*4 - 4]
+            pmullw m0, [pw_9]
+            pmullw m1, [pw_9]
+            psubw m0, m2
+            psubw m1, m3
+            paddw m0, m1
+            paddw m0, [pw_8]
+            psraw m0, 4
+            movu m4, [synthq + x*4 + 2]
+            psubw m4, m0
+            movu [synthq + x*4 + 2], m4
+            add x, mmsize / 2
+            cmp x, widthq
+%else
+            mov t0w, word [synthq + x*4 + 0]
+            mov t1w, word [synthq + x*4 + 4]
+            imul t0w, 9
+            imul t1w, 9
+            sub t0w, word [synthq + x*4 + 8]
+            sub t1w, word [synthq + x*4 - 4]
+            add t0w, 8
+            add t0w, t1w
+            sar t0w, 4
+            sub word [synthq + x*4 + 2], t0w
+            add x, 1
+            cmp x, widthq
+%endif
+        jl .loop2_x_1
+
+        mov t0w, word [synthq + synth_w*2 - 4]
+        mov t1w, word [synthq + synth_w*2 - 8]
+        imul t0w, 17
+        sub t1w, 8 ; -8 so that the sign is corrected below
+        sub t0w, t1w
+        sar t0w, 4
+        sub word [synthq + synth_w*2 - 2], t0w
+
+        mov t0w, word [synthq + synth_w*2 - 4]
+        mov t1w, word [synthq + synth_w*2 - 8]
+        imul t0w, 8
+        imul t1w, 9
+        sub t0w, word [synthq + synth_w*2 - 12]
+        add t1w, 8
+        add t0w, t1w
+        sar t0w, 4
+        sub word [synthq + synth_w*2 - 6], t0w
+
+        ; Lifting stage 1
+        mov t0w, word [synthq + 2]
+        add t0w, t0w
+        add t0w, 2
+        sar t0w, 2
+        add word [synthq], t0w
+
+        mov x, 1
+        add widthq, 1
+        .loop2_x_2:
+%if 0 ; needs horizontal masking
+            movu m0, [synthq + x*4 - 2]
+            movu m1, [synthq + x*4 + 2]
+            paddw m0, m1
+            paddw m0, [pw_2]
+            psraw m0, 2
+            movu m2, [synthq + x*4 + 0]
+            paddw m2, m0
+            movu [synthq + x*4 + 0], m2
+            add x, mmsize / 2
+            cmp x, widthq
+%else
+            mov t0w, word [synthq + x*4 - 2]
+            mov t1w, word [synthq + x*4 + 2]
+            add t0w, 2
+            add t0w, t1w
+            sar t0w, 2
+            add word [synthq + x*4 + 0], t0w
+            add x, 1
+            cmp x, widthq
+%endif
+        jl .loop2_x_2
+
+        mov t0w, word [synthq + synth_w*2 - 6]
+        mov t1w, word [synthq + synth_w*2 - 2]
+        add t0w, t1w
+        add t0w, 2
+        sar t0w, 2
+        add word [synthq + synth_w*2 - 4], t0w
+
+        lea synthq, [synthq + synth_w*2]
+        sub y, 1
+    jg .loop2_y
+
+    ; Vertical synthesis: Lifting stage 2
+    mov synthq, [rsp] ; No addition of synth_w so indicies in the loop have an extra synth_w added.
+    mov x, synth_w
+    .loop3_x:
+        movu m0, [synthq]
+        movu m1, [synthq + synth_w*4]
+        movu m2, [synthq + synth_w*8]
+        pmullw m0, [pw_8]
+        pmullw m1, [pw_9]
+        psubw m0, m2
+        paddw m1, [pw_8]
+        paddw m0, m1
+        psraw m0, 4
+        movu m3, [synthq + synth_w*2]
+        psubw m3, m0
+        movu [synthq + synth_w*2], m3
+        add synthq, mmsize
+        sub x, mmsize/2
+    jg .loop3_x
+
+    mov synthq, [rsp]
+    mov t0, synth_w
+    neg t0
+    mov y, heightq
+    sub y, 3
+    .loop4_y:
+        lea synthq, [synthq + synth_w*4]
+        mov t1, synthq
+        mov x, synth_w
+        .loop4_x:
+            movu m0, [t1]
+            movu m1, [t1 + synth_w*4]
+            movu m2, [t1 + t0*4]
+            movu m3, [t1 + synth_w*8]
+            pmullw m0, [pw_9]
+            pmullw m1, [pw_9]
+            paddw m2, m3
+            paddw m0, [pw_8]
+            psubw m1, m2
+            paddw m0, m1
+            psraw m0, 4
+            movu m4, [t1 + synth_w*2]
+            psubw m4, m0
+            movu [t1 + synth_w*2], m4
+            add t1, mmsize
+            sub x, mmsize/2
+        jg .loop4_x
+        sub y, 1
+    jg .loop4_y
+
+    mov synthq, [rsp]
+    mov t0, synth_h
+    sub t0, 1
+    imul t0, synth_w
+    lea synthq, [synthq + t0*2]
+    neg synth_w
+    lea t1, [synth_w*3]
+    lea t0, [synth_w*5]
+    mov x, synth_w
+    .loop5_x:
+        movu m0, [synthq + synth_w*2]
+        movu m1, [synthq + t1*2]
+        pmullw m0, [pw_17]
+        psubw m1, [pw_8] ; -8 so that the sign is corrected below
+        psubw m0, m1
+        psraw m0, 4
+        movu m2, [synthq]
+        psubw m2, m0
+        movu [synthq], m2
+
+        movu m0, [synthq + t1*2]
+        movu m1, [synthq + synth_w*2]
+        movu m2, [synthq + t0*2]
+        pmullw m0, [pw_9]
+        pmullw m1, [pw_8]
+        psubw m2, [pw_8] ; -8 so that the sign is corrected below
+        paddw m0, m1
+        psubw m0, m2
+        psraw m0, 4
+        movu m3, [synthq + synth_w*4]
+        psubw m3, m0
+        movu [synthq + synth_w*4], m3
+
+        add synthq, mmsize
+        add x, mmsize/2
+    jl .loop5_x
+
+    ; Vertical synthesis: Lifting stage 1
+    mov synthq, [rsp]
+    mov x, synth_w
+    neg synth_w
+    .loop6_x:
+        movu m0, [synthq + synth_w]
+        paddw m0, m0
+        paddw m0, [pw_2]
+        psraw m0, 2
+        movu m1, [synthq]
+        paddw m1, m0
+        movu [synthq], m1
+        add synthq, mmsize
+        add x, mmsize/2
+    jl .loop6_x
+
+    mov synthq, [rsp]
+    lea synthq, [synthq + synth_w*2] ; 1 synth_w left out so indicies in the loop have an extra synth_w added.
+    mov y, heightq
+    sub y, 2
+    .loop7_y:
+        mov t0, synthq
+        mov x, synth_w
+        .loop7_x:
+            movu m0, [t0]
+            movu m1, [t0 + synth_w*4]
+            paddw m0, m1
+            paddw m0, [pw_2]
+            psraw m0, 2
+            movu m2, [t0 + synth_w*2]
+            paddw m2, m0
+            movu [t0 + synth_w*2], m2
+            add t0, mmsize
+            sub x, mmsize/2
+        jg .loop7_x
+        lea synthq, [synthq + synth_w*4]
+        sub y, 1
+    jg .loop7_y
+
+    mov synthq, [rsp]
+    mov t0, synth_h
+    sub t0, 3
+    imul t0, synth_w ; 1 synth_w left out so indicies in the loop have an extra synth_w added.
+    lea synthq, [synthq + t0*2]
+    mov x, synth_w
+    .loop8_x:
+        movu m0, [synthq]
+        movu m1, [synthq + synth_w*4]
+        paddw m0, m1
+        paddw m0, [pw_2]
+        psraw m0, 2
+        movu m2, [synthq + synth_w*2]
+        paddw m2, m0
+        movu [synthq + synth_w*2], m2
+        add synthq, mmsize
+        sub x, mmsize/2
+    jg .loop8_x
+
+    add rsp, 5 * gprsize
+RET
+
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 603 bytes
Desc: OpenPGP digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160301/9cbf2092/attachment.sig>