[FFmpeg-devel] [PATCH 3/3] avfilter/avf_showcqt: draw_bar x86 optimization

Muhammad Faiz mfcc64 at gmail.com
Thu Mar 10 10:53:15 CET 2016


Use SSE/SSE2 intrinsics.
Output is bit-exact on x86_64.
bar_time:
rgb24:   12.601s   6.492s
yuv444p: 14.495s   5.661s
yuv422p: 10.514s   3.953s
yuv420p:  8.795s   3.256s

Signed-off-by: Muhammad Faiz <mfcc64 at gmail.com>
---
 libavfilter/avf_showcqt.c     |  20 ++-
 libavfilter/avf_showcqt.h     |   2 +
 libavfilter/x86/avf_showcqt.c | 300 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 318 insertions(+), 4 deletions(-)

diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c
index 2d2644c..2528c0f 100644
--- a/libavfilter/avf_showcqt.c
+++ b/libavfilter/avf_showcqt.c
@@ -137,6 +137,7 @@ static void common_uninit(ShowCQTContext *s)
     av_freep(&s->fft_result);
     av_freep(&s->cqt_result);
     av_freep(&s->c_buf);
+    av_freep(&s->c_bar_buf);
     av_freep(&s->h_buf);
     av_freep(&s->rcp_h_buf);
     av_freep(&s->freq);
@@ -1024,7 +1025,12 @@ static int plot_cqt(AVFilterContext *ctx, AVFrame **frameout)
         UPDATE_TIME(s->alloc_time);
 
         if (s->bar_h) {
-            s->draw_bar(out, s->h_buf, s->rcp_h_buf, s->c_buf, s->bar_h);
+            if (s->permute_color_bar) {
+                s->permute_color_bar(s->c_bar_buf, s->c_buf, s->width);
+                s->draw_bar(out, s->h_buf, s->rcp_h_buf, s->c_bar_buf, s->bar_h);
+            } else {
+                s->draw_bar(out, s->h_buf, s->rcp_h_buf, s->c_buf, s->bar_h);
+            }
             UPDATE_TIME(s->bar_time);
         }
 
@@ -1228,12 +1234,18 @@ static int config_output(AVFilterLink *outlink)
             return AVERROR(ENOMEM);
     }
 
-    s->h_buf = av_malloc_array(s->cqt_len, sizeof (*s->h_buf));
-    s->rcp_h_buf = av_malloc_array(s->width, sizeof(*s->rcp_h_buf));
-    s->c_buf = av_malloc_array(s->width, sizeof(*s->c_buf));
+    s->h_buf = av_calloc(FFALIGN(s->cqt_len, 32), sizeof (*s->h_buf));
+    s->rcp_h_buf = av_calloc(FFALIGN(s->width, 32), sizeof(*s->rcp_h_buf));
+    s->c_buf = av_calloc(FFALIGN(s->width, 32), sizeof(*s->c_buf));
     if (!s->h_buf || !s->rcp_h_buf || !s->c_buf)
         return AVERROR(ENOMEM);
 
+    if (s->permute_color_bar) {
+        s->c_bar_buf = av_calloc(FFALIGN(s->width, 32), sizeof(*s->c_bar_buf));
+        if (!s->c_bar_buf)
+            return AVERROR(ENOMEM);
+    }
+
     s->sono_count = 0;
     s->next_pts = 0;
     s->sono_idx = 0;
diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h
index d01d90a..9de60f3 100644
--- a/libavfilter/avf_showcqt.h
+++ b/libavfilter/avf_showcqt.h
@@ -67,6 +67,7 @@ typedef struct {
     int                 cqt_len;
     int                 cqt_align;
     ColorFloat          *c_buf;
+    ColorFloat          *c_bar_buf;
     float               *h_buf;
     float               *rcp_h_buf;
     float               *sono_v_buf;
@@ -81,6 +82,7 @@ typedef struct {
     void                (*update_sono)(AVFrame *sono, const ColorFloat *c, int idx);
     /* permute callback, for easier SIMD code */
     void                (*permute_coeffs)(float *val, int len);
+    void                (*permute_color_bar)(ColorFloat *out, const ColorFloat *in, int len);
     /* performance debugging */
     int64_t             fft_time;
     int64_t             cqt_time;
diff --git a/libavfilter/x86/avf_showcqt.c b/libavfilter/x86/avf_showcqt.c
index b8e9d32..d0c90a5 100644
--- a/libavfilter/x86/avf_showcqt.c
+++ b/libavfilter/x86/avf_showcqt.c
@@ -27,6 +27,10 @@
 #include <xmmintrin.h>
 #endif
 
+#if HAVE_SSE2_INTRINSIC
+#include <emmintrin.h>
+#endif
+
 #if HAVE_SSE3_INTRINSIC
 #include <pmmintrin.h>
 #endif
@@ -259,6 +263,282 @@ static void permute_coeffs_avx(float *v, int len)
 }
 #endif
 
+#if HAVE_SSE2_INTRINSIC
+static av_intrinsic_sse2 /* SSE2 bar renderer for packed 3-byte-per-pixel output; 4 pixels per inner iteration */
+void draw_bar_rgb24_sse2(AVFrame *out, const float *h, const float *rcp_h,
+                         const ColorFloat *color, int bar_h)
+{
+    const float *c; /* walks `color` as raw floats: 4 reds, 4 greens, 4 blues per 4-pixel group */
+    int x, y, w = out->width;
+    float rcp_bar_h = 1.0f / bar_h; /* lets the row threshold be computed with a multiply */
+    uint8_t *v = out->data[0];
+    uint8_t *lp; /* write cursor inside the current row */
+    int ls = out->linesize[0];
+    __m128i is_le, rri, rgi, rbi;
+    __m128 hx, ht, mul, rr, rg, rb;
+    uint32_t red, green, blue;
+
+    for (y = 0; y < bar_h; y++) {
+        lp = v + ls * y;
+        ht = _mm_set1_ps((bar_h - y) * rcp_bar_h); /* row threshold: 1.0 at y == 0, shrinking as y grows */
+        x = 0;
+        c = (const float *) color;
+        do { /* runs at least once, x advances by 4; NOTE(review): writes past column w-1 when w % 4 != 0 -- confirm row padding */
+            hx = _mm_load_ps(h+x); /* aligned load: h + x must be 16-byte aligned */
+            is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); /* per-lane mask: h <= threshold */
+            is_le = _mm_packs_epi32(is_le, is_le);
+            is_le = _mm_packs_epi16(is_le, is_le); /* low 32 bits now hold one 0x00/0xFF byte per pixel */
+            if (-1 == _mm_cvtsi128_si32(is_le)) { /* all four heights are <= threshold */
+                memset(lp, 0, 12); /* 4 pixels * 3 bytes set to zero */
+            } else {
+                mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); /* max(h - ht, 0): lanes not above the threshold become 0 */
+                mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); /* scale by the per-column rcp_h value */
+                rr = _mm_mul_ps(mul, _mm_load_ps(c)); /* c[0..3]: first component of the 4 pixels */
+                rg = _mm_mul_ps(mul, _mm_load_ps(c+4)); /* c[4..7]: second component */
+                rb = _mm_mul_ps(mul, _mm_load_ps(c+8)); /* c[8..11]: third component */
+                rri = _mm_cvtps_epi32(rr); /* float -> int32 */
+                rgi = _mm_cvtps_epi32(rg);
+                rbi = _mm_cvtps_epi32(rb);
+                rri = _mm_packs_epi32(rri, rri); /* int32 -> int16 with signed saturation */
+                rgi = _mm_packs_epi32(rgi, rgi);
+                rbi = _mm_packs_epi32(rbi, rbi);
+                rri = _mm_packus_epi16(rri, rri); /* int16 -> uint8 with unsigned saturation (clamps to 0..255) */
+                rgi = _mm_packus_epi16(rgi, rgi);
+                rbi = _mm_packus_epi16(rbi, rbi);
+                red = _mm_cvtsi128_si32(rri); /* four 8-bit values, pixel 0 in the low byte */
+                green = _mm_cvtsi128_si32(rgi);
+                blue = _mm_cvtsi128_si32(rbi);
+                lp[0] = red; lp[1] = green; lp[2] = blue; /* storing into uint8_t keeps only the low byte */
+                red >>= 8; green >>= 8; blue >>= 8; /* move the next pixel into the low byte */
+                lp[3] = red; lp[4] = green; lp[5] = blue;
+                red >>= 8; green >>= 8; blue >>= 8;
+                lp[6] = red; lp[7] = green; lp[8] = blue;
+                red >>= 8; green >>= 8; blue >>= 8;
+                lp[9] = red; lp[10] = green; lp[11] = blue;
+            }
+            lp += 12; /* 4 pixels * 3 bytes */
+            x += 4; c += 12; /* 4 columns, 12 color floats */
+        } while (x < w);
+    }
+}
+
+static av_intrinsic_sse2 /* SSE2 bar renderer for the planar YUV 4:4:4 / 4:2:2 / 4:2:0 cases selected on out->format */
+void draw_bar_yuv_sse2(AVFrame *out, const float *h, const float *rcp_h,
+                       const ColorFloat *color, int bar_h)
+{
+    const float *c; /* walks `color` as raw floats; the expected layout differs per macro below */
+    int x, y, yh, w = out->width;
+    float rcp_bar_h = 1.0f / bar_h; /* lets the row threshold be computed with a multiply */
+    uint8_t *vy = out->data[0], *vu = out->data[1], *vv = out->data[2];
+    uint8_t *lpy, *lpu, *lpv; /* per-plane write cursors for the row being drawn */
+    int lsy = out->linesize[0], lsu = out->linesize[1], lsv = out->linesize[2];
+    int fmt = out->format;
+    __m128i is_le, ryi, ryi2, rui, rvi;
+    __m128 ht, hx, hx2, mul, mul2, mul3, ry, ry2, ru, rv;
+
+#define DRAW_BAR_LINE_FULL_CHROMA() /* one row, 4 pixels per step, one U and one V sample per pixel */ \
+do { \
+    x = 0; \
+    c = (const float *) color; /* layout per 4 pixels: c[0..3] Y, c[4..7] U, c[8..11] V */ \
+    do { \
+        hx = _mm_load_ps(h+x); /* aligned load: h + x must be 16-byte aligned */ \
+        is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); \
+        is_le = _mm_packs_epi32(is_le, is_le); \
+        is_le = _mm_packs_epi16(is_le, is_le); /* low 32 bits: one 0x00/0xFF byte per pixel */ \
+        if (-1 == _mm_cvtsi128_si32(is_le)) { /* all four heights are <= threshold */ \
+            *((uint32_t *) lpy) = 0x10101010; /* Y = 16 for 4 pixels */ \
+            *((uint32_t *) lpu) = 0x80808080; /* U = 128 for 4 pixels */ \
+            *((uint32_t *) lpv) = 0x80808080; /* V = 128 for 4 pixels */ \
+        } else { \
+            mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); /* max(h - ht, 0) */ \
+            mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); \
+            ry = _mm_mul_ps(mul, _mm_load_ps(c)); \
+            ru = _mm_mul_ps(mul, _mm_load_ps(c+4)); \
+            rv = _mm_mul_ps(mul, _mm_load_ps(c+8)); \
+            ry = _mm_add_ps(ry, _mm_set1_ps(16.0f)); /* same offsets as the constant fill above: 16 / 128 / 128 */ \
+            ru = _mm_add_ps(ru, _mm_set1_ps(128.0f)); \
+            rv = _mm_add_ps(rv, _mm_set1_ps(128.0f)); \
+            ryi = _mm_cvtps_epi32(ry); /* float -> int32 */ \
+            rui = _mm_cvtps_epi32(ru); \
+            rvi = _mm_cvtps_epi32(rv); \
+            ryi = _mm_packs_epi32(ryi, ryi); /* int32 -> int16 with signed saturation */ \
+            rui = _mm_packs_epi32(rui, rui); \
+            rvi = _mm_packs_epi32(rvi, rvi); \
+            ryi = _mm_packus_epi16(ryi, ryi); /* int16 -> uint8 with unsigned saturation */ \
+            rui = _mm_packus_epi16(rui, rui); \
+            rvi = _mm_packus_epi16(rvi, rvi); \
+            *((int32_t *) lpy) = _mm_cvtsi128_si32(ryi); /* store 4 bytes at once */ \
+            *((int32_t *) lpu) = _mm_cvtsi128_si32(rui); \
+            *((int32_t *) lpv) = _mm_cvtsi128_si32(rvi); \
+        } \
+        lpy += 4; lpu += 4; lpv += 4; \
+        x += 4; c += 12; \
+    } while (x < w); /* NOTE(review): may write past column w-1 when w % 4 != 0 -- confirm row padding */ \
+} while (0)
+
+#define DRAW_BAR_LINE_HALF_CHROMA() /* one row, 8 pixels per step, one U and one V sample per 2 pixels */ \
+do { \
+    x = 0; \
+    c = (const float *) color; /* layout per 8 pixels: c[0..3] Y0-3, c[4..7] U, c[8..11] Y4-7, c[12..15] V */ \
+    do { \
+        hx = _mm_load_ps(h+x); \
+        hx2 = _mm_load_ps(h+x+4); \
+        is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); \
+        is_le = _mm_packs_epi32(is_le, _mm_castps_si128(_mm_cmple_ps(hx2, ht))); /* eight 16-bit masks, one per pixel */ \
+        is_le = _mm_packs_epi16(is_le, is_le); \
+        is_le = _mm_packs_epi16(is_le, is_le); /* low 32 bits are all ones only if all eight compares were true */ \
+        if (-1 == _mm_cvtsi128_si32(is_le)) { \
+            *((uint32_t *) lpy) = 0x10101010; /* Y = 16 for 8 pixels */ \
+            *((uint32_t *) (lpy+4)) = 0x10101010; \
+            *((uint32_t *) lpu) = 0x80808080; /* U = V = 128 for 4 chroma samples */ \
+            *((uint32_t *) lpv) = 0x80808080; \
+        } else { \
+            mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); /* pixels 0-3 */ \
+            mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); \
+            mul2 = _mm_max_ps(_mm_sub_ps(hx2, ht), _mm_setzero_ps()); /* pixels 4-7 */ \
+            mul2 = _mm_mul_ps(mul2, _mm_load_ps(rcp_h + x + 4)); \
+            mul3 = _mm_shuffle_ps(mul, mul2, _MM_SHUFFLE(2,0,2,0)); /* multipliers of pixels 0, 2, 4, 6 for the chroma samples */ \
+            ry = _mm_mul_ps(mul, _mm_load_ps(c)); \
+            ry2 = _mm_mul_ps(mul2, _mm_load_ps(c+8)); \
+            ru = _mm_mul_ps(mul3, _mm_load_ps(c+4)); \
+            rv = _mm_mul_ps(mul3, _mm_load_ps(c+12)); \
+            ry = _mm_add_ps(ry, _mm_set1_ps(16.0f)); \
+            ry2 = _mm_add_ps(ry2, _mm_set1_ps(16.0f)); \
+            ru = _mm_add_ps(ru, _mm_set1_ps(128.0f)); \
+            rv = _mm_add_ps(rv, _mm_set1_ps(128.0f)); \
+            ryi = _mm_cvtps_epi32(ry); \
+            ryi2 = _mm_cvtps_epi32(ry2); \
+            rui = _mm_cvtps_epi32(ru); \
+            rvi = _mm_cvtps_epi32(rv); \
+            ryi = _mm_packs_epi32(ryi, ryi); \
+            ryi2 = _mm_packs_epi32(ryi2, ryi2); \
+            rui = _mm_packs_epi32(rui, rui); \
+            rvi = _mm_packs_epi32(rvi, rvi); \
+            ryi = _mm_packus_epi16(ryi, ryi); \
+            ryi2 = _mm_packus_epi16(ryi2, ryi2); \
+            rui = _mm_packus_epi16(rui, rui); \
+            rvi = _mm_packus_epi16(rvi, rvi); \
+            *((int32_t *) lpy) = _mm_cvtsi128_si32(ryi); \
+            *((int32_t *) (lpy+4)) = _mm_cvtsi128_si32(ryi2); \
+            *((int32_t *) lpu) = _mm_cvtsi128_si32(rui); \
+            *((int32_t *) lpv) = _mm_cvtsi128_si32(rvi); \
+        } \
+        lpy += 8; lpu += 4; lpv += 4; /* 8 luma bytes, 4 bytes in each chroma plane */ \
+        x += 8; c += 16; \
+    } while (x < w); /* NOTE(review): may write past column w-1 when w % 8 != 0 -- confirm row padding */ \
+} while (0)
+
+#define DRAW_BAR_LINE_NO_CHROMA() /* one row, 8 pixels per step, luma plane only */ \
+do { \
+    x = 0; \
+    c = (const float *) color; /* same 16-float stride as the half-chroma layout; only the Y groups at c and c+8 are read */ \
+    do { \
+        hx = _mm_load_ps(h+x); \
+        hx2 = _mm_load_ps(h+x+4); \
+        is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); \
+        is_le = _mm_packs_epi32(is_le, _mm_castps_si128(_mm_cmple_ps(hx2, ht))); \
+        is_le = _mm_packs_epi16(is_le, is_le); \
+        is_le = _mm_packs_epi16(is_le, is_le); /* low 32 bits are all ones only if all eight compares were true */ \
+        if (-1 == _mm_cvtsi128_si32(is_le)) { \
+            *((uint32_t *) lpy) = 0x10101010; /* Y = 16 for 8 pixels */ \
+            *((uint32_t *) (lpy+4)) = 0x10101010; \
+        } else { \
+            mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); \
+            mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); \
+            mul2 = _mm_max_ps(_mm_sub_ps(hx2, ht), _mm_setzero_ps()); \
+            mul2 = _mm_mul_ps(mul2, _mm_load_ps(rcp_h + x + 4)); \
+            ry = _mm_mul_ps(mul, _mm_load_ps(c)); \
+            ry2 = _mm_mul_ps(mul2, _mm_load_ps(c+8)); \
+            ry = _mm_add_ps(ry, _mm_set1_ps(16.0f)); \
+            ry2 = _mm_add_ps(ry2, _mm_set1_ps(16.0f)); \
+            ryi = _mm_cvtps_epi32(ry); \
+            ryi2 = _mm_cvtps_epi32(ry2); \
+            ryi = _mm_packs_epi32(ryi, ryi); \
+            ryi2 = _mm_packs_epi32(ryi2, ryi2); \
+            ryi = _mm_packus_epi16(ryi, ryi); \
+            ryi2 = _mm_packus_epi16(ryi2, ryi2); \
+            *((int32_t *) lpy) = _mm_cvtsi128_si32(ryi); \
+            *((int32_t *) (lpy+4)) = _mm_cvtsi128_si32(ryi2); \
+        } \
+        lpy += 8; \
+        x += 8; c += 16; \
+    } while (x < w); \
+} while (0)
+
+    for (y = 0; y < bar_h; y += 2) { /* two rows per iteration; NOTE(review): row y+1 is drawn even if y+1 == bar_h -- presumably bar_h is even, confirm */
+        yh = (fmt == AV_PIX_FMT_YUV420P) ? y / 2 : y; /* chroma row index: halved only for 4:2:0 */
+        ht = _mm_set1_ps((bar_h - y) * rcp_bar_h); /* threshold for row y: 1.0 at y == 0 */
+        lpy = vy + y * lsy;
+        lpu = vu + yh * lsu;
+        lpv = vv + yh * lsv;
+        if (fmt == AV_PIX_FMT_YUV444P)
+            DRAW_BAR_LINE_FULL_CHROMA();
+        else
+            DRAW_BAR_LINE_HALF_CHROMA(); /* first row of the pair writes chroma for every non-4:4:4 format */
+
+        ht = _mm_set1_ps((bar_h - (y+1)) * rcp_bar_h); /* threshold for row y+1 */
+        lpy = vy + (y+1) * lsy;
+        lpu = vu + (y+1) * lsu; /* lpu/lpv are not touched by DRAW_BAR_LINE_NO_CHROMA in the final else branch */
+        lpv = vv + (y+1) * lsv;
+        if (fmt == AV_PIX_FMT_YUV444P)
+            DRAW_BAR_LINE_FULL_CHROMA();
+        else if (fmt == AV_PIX_FMT_YUV422P)
+            DRAW_BAR_LINE_HALF_CHROMA();
+        else
+            DRAW_BAR_LINE_NO_CHROMA(); /* remaining formats: second row of the pair writes luma only */
+    }
+#undef DRAW_BAR_LINE_FULL_CHROMA
+#undef DRAW_BAR_LINE_HALF_CHROMA
+#undef DRAW_BAR_LINE_NO_CHROMA
+}
+
+static void permute_color_bar_full_chroma_sse2(ColorFloat *out, const ColorFloat *in, int len) /* regroup every 4 entries of `in` into 4 y, 4 u, 4 v floats in `out` */
+{
+    float *c = (float *) out; /* `out` is written as a flat float array, 12 floats per group */
+    int k;
+
+    for (k = 0; k < len; k += 4, c += 12) { /* reads in[k+1..k+3] even when those indices are >= len; NOTE(review): relies on padded buffers -- confirm */
+        c[0] = in[k].yuv.y; /* c[0..3]: y of entries k..k+3 */
+        c[1] = in[k+1].yuv.y;
+        c[2] = in[k+2].yuv.y;
+        c[3] = in[k+3].yuv.y;
+        c[4] = in[k].yuv.u; /* c[4..7]: u of entries k..k+3 */
+        c[5] = in[k+1].yuv.u;
+        c[6] = in[k+2].yuv.u;
+        c[7] = in[k+3].yuv.u;
+        c[8] = in[k].yuv.v; /* c[8..11]: v of entries k..k+3 */
+        c[9] = in[k+1].yuv.v;
+        c[10] = in[k+2].yuv.v;
+        c[11] = in[k+3].yuv.v;
+    }
+}
+
+static void permute_color_bar_half_chroma_sse2(ColorFloat *out, const ColorFloat *in, int len) /* regroup every 8 entries of `in` into 16 floats: 8 y plus u and v of the even entries */
+{
+    float *c = (float *) out; /* `out` is written as a flat float array, 16 floats per group */
+    int k;
+
+    for (k = 0; k < len; k += 8, c += 16) { /* reads in[k+1..k+7] even when those indices are >= len; NOTE(review): relies on padded buffers -- confirm */
+        c[0] = in[k].yuv.y; /* c[0..3]: y of entries k..k+3 */
+        c[1] = in[k+1].yuv.y;
+        c[2] = in[k+2].yuv.y;
+        c[3] = in[k+3].yuv.y;
+        c[8] = in[k+4].yuv.y; /* c[8..11]: y of entries k+4..k+7 */
+        c[9] = in[k+5].yuv.y;
+        c[10] = in[k+6].yuv.y;
+        c[11] = in[k+7].yuv.y;
+        c[4] = in[k].yuv.u; /* c[4..7]: u of entries k, k+2, k+4, k+6; odd entries' u is not copied */
+        c[5] = in[k+2].yuv.u;
+        c[6] = in[k+4].yuv.u;
+        c[7] = in[k+6].yuv.u;
+        c[12] = in[k].yuv.v; /* c[12..15]: v of entries k, k+2, k+4, k+6; odd entries' v is not copied */
+        c[13] = in[k+2].yuv.v;
+        c[14] = in[k+4].yuv.v;
+        c[15] = in[k+6].yuv.v;
+    }
+}
+#endif
+
 av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
@@ -286,4 +566,24 @@ av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
         s->cqt_align = 8;
     }
 #endif
+
+#if HAVE_SSE2_INTRINSIC
+    if (cpu_flags & AV_CPU_FLAG_SSE2) {
+        switch (s->format) {
+        case AV_PIX_FMT_RGB24:
+            s->permute_color_bar = permute_color_bar_full_chroma_sse2;
+            s->draw_bar = draw_bar_rgb24_sse2;
+            break;
+        case AV_PIX_FMT_YUV444P:
+            s->permute_color_bar = permute_color_bar_full_chroma_sse2;
+            s->draw_bar = draw_bar_yuv_sse2;
+            break;
+        case AV_PIX_FMT_YUV422P:
+        case AV_PIX_FMT_YUV420P:
+            s->permute_color_bar = permute_color_bar_half_chroma_sse2;
+            s->draw_bar = draw_bar_yuv_sse2;
+            break;
+        }
+    }
+#endif
 }
-- 
2.5.0



More information about the ffmpeg-devel mailing list