[FFmpeg-devel] [PATCH] ARMv6 optimised VP3 loop filter

Sat Feb 13 11:01:05 CET 2010

10% faster overall on a first-generation (1G) iPod touch.
---
 libavcodec/Makefile                 |    3 +
 libavcodec/arm/dsputil_init_armv6.c |    8 ++
 libavcodec/arm/vp3dsp_armv6.S       |  126 +++++++++++++++++++++++++++++++++++
 3 files changed, 137 insertions(+), 0 deletions(-)
 create mode 100644 libavcodec/arm/vp3dsp_armv6.S

diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 0b58459..6108d30 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -610,9 +610,12 @@ OBJS-$(HAVE_ARMV5TE)                   += arm/dsputil_init_armv5te.o    \
                                           arm/mpegvideo_armv5te_s.o     \
                                           arm/simple_idct_armv5te.o     \
 
+ARMV6-OBJS-$(CONFIG_VP3_DECODER)       += arm/vp3dsp_armv6.o
+
 OBJS-$(HAVE_ARMV6)                     += arm/dsputil_init_armv6.o      \
                                           arm/dsputil_armv6.o           \
                                           arm/simple_idct_armv6.o       \
+                                          $(ARMV6-OBJS-yes)
 
 OBJS-$(HAVE_ARMVFP)                    += arm/dsputil_vfp.o             \
                                           arm/dsputil_init_vfp.o        \
diff --git a/libavcodec/arm/dsputil_init_armv6.c b/libavcodec/arm/dsputil_init_armv6.c
index 3209062..2d7a354 100644
--- a/libavcodec/arm/dsputil_init_armv6.c
+++ b/libavcodec/arm/dsputil_init_armv6.c
@@ -70,6 +70,9 @@ int ff_sse16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
 int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
 int ff_pix_sum_armv6(uint8_t *pix, int line_size);
 
+void ff_vp3_v_loop_filter_armv6(uint8_t *, int, int *);
+void ff_vp3_h_loop_filter_armv6(uint8_t *, int, int *);
+
 void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx)
 {
     if (!avctx->lowres && (avctx->idct_algo == FF_IDCT_AUTO ||
@@ -118,4 +121,9 @@ void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx)
 
     c->pix_norm1 = ff_pix_norm1_armv6;
     c->pix_sum   = ff_pix_sum_armv6;
+
+    if (CONFIG_VP3_DECODER) {
+        c->vp3_v_loop_filter = ff_vp3_v_loop_filter_armv6;
+        c->vp3_h_loop_filter = ff_vp3_h_loop_filter_armv6;
+    }
 }
diff --git a/libavcodec/arm/vp3dsp_armv6.S b/libavcodec/arm/vp3dsp_armv6.S
new file mode 100644
index 0000000..3d751e6
--- /dev/null
+++ b/libavcodec/arm/vp3dsp_armv6.S
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2010 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+.text
+
+// ((p0 - p3) + 3*(p2 - p1) + 4) >> 3 has range -127, 128
+// so we instead calculate the negative, so it fits in signed 8 bits
+
+function ff_vp3_v_loop_filter_armv6, export=1
+    push    {r4-r10,lr}             // v loop filter: 8 pixels wide, 4 per pass
+    sub     r0,  r0,  r1            // r0 = pointer - r1 (r1 presumably the stride): row p1
+    ldr     r2,  [r2, #129*4]       // word 129 of the int table; used per byte as the limit below
+    ldr     r3,  =0x030003          // +3 rounding term for each 16-bit lane
+    ldr     ip,  =0xff00ff          // mask keeping the low byte of each 16-bit lane
+    mov     lr,  #8                 // pixels left to filter
+1:
+    ldr     r4,  [r0,-r1]           // p0: 4 pixels from the row at r0 - r1
+    ldr     r5,  [r0]               // p1: row at r0
+    ldr     r6,  [r0, r1]           // p2: row at r0 + r1
+    ldr     r7,  [r0, r1, lsl #1]   // p3: row at r0 + 2*r1
+    subs    lr,  lr,  #4            // flags are consumed by the bgt at the bottom
+
+    uxtb16  r10, r4                 // bytes 0 and 2 of p0 as 16-bit lanes
+    uxtb16  r8,  r5                 // bytes 0 and 2 of p1
+    uxtb16  r9,  r6                 // bytes 0 and 2 of p2
+    ssub16  r8,  r8,  r9            // p1 - p2
+    sadd16  r9,  r8,  r8            // 2*(p1 - p2)
+    sadd16  r8,  r8,  r9            // 3*(p1-p2)
+    uxtb16  r9,  r7                 // bytes 0 and 2 of p3
+    ssub16  r9,  r9,  r10           // p3 - p0
+    sadd16  r8,  r8,  r9
+    sadd16  r8,  r8,  r3            // (p3 - p0) + 3*(p1 - p2) + 3
+    uxtb16  r9,  r5,  ror #8        // bytes 1 and 3 of p1 (second half starts here)
+    and     r8,  ip,  r8, lsr #3    // >> 3, keep 8 bits per lane: values for bytes 0 and 2
+
+    uxtb16  r10, r6,  ror #8        // bytes 1 and 3 of p2
+    ssub16  r9,  r9,  r10           // p1 - p2
+    sadd16  r10, r9,  r9            // 2*(p1 - p2)
+    sadd16  r9,  r9,  r10           // 3*(p1 - p2)
+    uxtb16  r4,  r4,  ror #8        // bytes 1 and 3 of p0
+    uxtb16  r7,  r7,  ror #8        // bytes 1 and 3 of p3
+    ssub16  r4,  r7,  r4            // p3 - p0
+    sadd16  r4,  r4,  r9
+    sadd16  r4,  r4,  r3            // (p3 - p0) + 3*(p1 - p2) + 3
+    mov     r10, #0                 // zero operand for the two ssub8 below
+    and     r4,  ip,  r4, lsr #3    // >> 3, keep 8 bits per lane: values for bytes 1 and 3
+    orr     r8,  r8,  r4, lsl #8    // 4 filter values
+
+    ssub8   r9,  r10, r8            // 0 - value per byte; GE flags mark bytes where that is >= 0
+    sel     r9,  r9,  r8            // abs(filter_value)
+    uqsub8  r7,  r2,  r9            // t = max(limit - abs, 0)
+    uqsub8  r9,  r7,  r9            // max(t - abs, 0)
+    usub8   r9,  r7,  r9            // delta = min(t, abs)
+// add and sub, then select the right one based on the original sign
+    uqsub8  r7,  r6,  r9            // p2 - delta, saturated
+    uqadd8  r6,  r6,  r9            // p2 + delta, saturated
+    uqadd8  r4,  r5,  r9            // p1 + delta, saturated
+    uqsub8  r5,  r5,  r9            // p1 - delta, saturated
+    ssub8   r10, r8,  r10           // value - 0: GE flags mark bytes where value >= 0
+    sel     r6,  r6,  r7            // p2 + delta where value >= 0, else p2 - delta
+    sel     r5,  r5,  r4            // p1 - delta where value >= 0, else p1 + delta
+    str     r6,  [r0, r1]           // write back row p2
+    str     r5,  [r0], #4           // write back row p1, then advance 4 pixels
+    bgt     1b                      // loop while pixels remain (flags from subs above)
+    pop     {r4-r10,pc}             // restore registers and return
+.endfunc
+
+function ff_vp3_h_loop_filter_armv6, export=1
+    push    {r4-r10,lr}             // h loop filter: 8 rows, 2 per pass
+    sub     ip,  r0,  #1            // store pointer: p1 is the byte before the pointer passed in
+    sub     r0,  r0,  #2            // load pointer: p0; NOTE(review): word loads here may be unaligned - confirm
+    ldr     r3,  =0x010003          // smlad weights: 3 (low halfword), 1 (high halfword)
+    mov     r10, #4                 // rounding term, used as the smlad accumulator
+    mov     lr,  #8                 // rows left to filter
+1:
+    ldr     r4,  [r0], r1           // first row: p0..p3 in one word; r0 += r1 (presumably the stride)
+    ldr     r7,  [r0], r1           // second row
+    subs    lr,  lr,  #2            // flags are consumed by the bgt at the bottom
+
+    uxtb16  r5,  r4,  ror #8        // register bytes 3 and 1: p3 (high), p1 (low)
+    uxtb16  r6,  r4,  ror #16       // register bytes 0 and 2: p0 (high), p2 (low)
+    ssub16  r6,  r6,  r5        // p0-p3  p2-p1
+    smlad   r6,  r6,  r3,  r10  // (p0-p3) + 3*(p2-p1) + 4
+
+    uxtb16  r8,  r7,  ror #8        // same computation for the second row
+    uxtb16  r9,  r7,  ror #16
+    ssub16  r9,  r9,  r8            // p0-p3  p2-p1
+    smlad   r9,  r9,  r3,  r10      // (p0-p3) + 3*(p2-p1) + 4
+
+.macro filter p1 p2 r
+    asr     \r,  \r,  #3          // sum >> 3 (arithmetic, keeps the sign)
+    ldr     \r,  [r2, \r, lsl #2] // filter value: int table at r2 indexed by the signed result
+    uxtb    \p2, \p1, ror #16     // p2
+    uxtb    \p1, \p1, ror #8      // p1
+    sub     \p2, \p2, \r          // p2 - filter value
+    add     \p1, \p1, \r          // p1 + filter value
+    usat    \p2, #8,  \p2         // clamp to 0..255
+    usat    \p1, #8,  \p1         // clamp to 0..255
+    strb    \p2, [ip, #1]         // p2 is stored one byte after p1
+    strb    \p1, [ip], r1         // store p1, then step ip by r1 to the next row
+.endm
+
+    filter  r4,  r5,  r6            // first row: pixel word, scratch, sum
+    filter  r7,  r8,  r9            // second row
+    bgt     1b                      // loop while rows remain (flags from subs above)
+    pop     {r4-r10,pc}             // restore registers and return
+.endfunc
-- 
1.6.6