[FFmpeg-devel] [PATCH 3/3] avfilter/avf_showcqt: draw_bar x86 optimization
Muhammad Faiz
mfcc64 at gmail.com
Thu Mar 10 10:53:15 CET 2016
Use SSE/SSE2 intrinsics.
The output is bit-exact on x86_64.
bar_time:
rgb24: 12.601s 6.492s
yuv444p: 14.495s 5.661s
yuv422p: 10.514s 3.953s
yuv420p: 8.795s 3.256s
Signed-off-by: Muhammad Faiz <mfcc64 at gmail.com>
---
libavfilter/avf_showcqt.c | 20 ++-
libavfilter/avf_showcqt.h | 2 +
libavfilter/x86/avf_showcqt.c | 300 ++++++++++++++++++++++++++++++++++++++++++
3 files changed, 318 insertions(+), 4 deletions(-)
diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c
index 2d2644c..2528c0f 100644
--- a/libavfilter/avf_showcqt.c
+++ b/libavfilter/avf_showcqt.c
@@ -137,6 +137,7 @@ static void common_uninit(ShowCQTContext *s)
av_freep(&s->fft_result);
av_freep(&s->cqt_result);
av_freep(&s->c_buf);
+ av_freep(&s->c_bar_buf);
av_freep(&s->h_buf);
av_freep(&s->rcp_h_buf);
av_freep(&s->freq);
@@ -1024,7 +1025,12 @@ static int plot_cqt(AVFilterContext *ctx, AVFrame **frameout)
UPDATE_TIME(s->alloc_time);
if (s->bar_h) {
- s->draw_bar(out, s->h_buf, s->rcp_h_buf, s->c_buf, s->bar_h);
+ if (s->permute_color_bar) {
+ s->permute_color_bar(s->c_bar_buf, s->c_buf, s->width);
+ s->draw_bar(out, s->h_buf, s->rcp_h_buf, s->c_bar_buf, s->bar_h);
+ } else {
+ s->draw_bar(out, s->h_buf, s->rcp_h_buf, s->c_buf, s->bar_h);
+ }
UPDATE_TIME(s->bar_time);
}
@@ -1228,12 +1234,18 @@ static int config_output(AVFilterLink *outlink)
return AVERROR(ENOMEM);
}
- s->h_buf = av_malloc_array(s->cqt_len, sizeof (*s->h_buf));
- s->rcp_h_buf = av_malloc_array(s->width, sizeof(*s->rcp_h_buf));
- s->c_buf = av_malloc_array(s->width, sizeof(*s->c_buf));
+ s->h_buf = av_calloc(FFALIGN(s->cqt_len, 32), sizeof (*s->h_buf));
+ s->rcp_h_buf = av_calloc(FFALIGN(s->width, 32), sizeof(*s->rcp_h_buf));
+ s->c_buf = av_calloc(FFALIGN(s->width, 32), sizeof(*s->c_buf));
if (!s->h_buf || !s->rcp_h_buf || !s->c_buf)
return AVERROR(ENOMEM);
+ if (s->permute_color_bar) {
+ s->c_bar_buf = av_calloc(FFALIGN(s->width, 32), sizeof(*s->c_bar_buf));
+ if (!s->c_bar_buf)
+ return AVERROR(ENOMEM);
+ }
+
s->sono_count = 0;
s->next_pts = 0;
s->sono_idx = 0;
diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h
index d01d90a..9de60f3 100644
--- a/libavfilter/avf_showcqt.h
+++ b/libavfilter/avf_showcqt.h
@@ -67,6 +67,7 @@ typedef struct {
int cqt_len;
int cqt_align;
ColorFloat *c_buf;
+ ColorFloat *c_bar_buf;
float *h_buf;
float *rcp_h_buf;
float *sono_v_buf;
@@ -81,6 +82,7 @@ typedef struct {
void (*update_sono)(AVFrame *sono, const ColorFloat *c, int idx);
/* permute callback, for easier SIMD code */
void (*permute_coeffs)(float *val, int len);
+ void (*permute_color_bar)(ColorFloat *out, const ColorFloat *in, int len);
/* performance debugging */
int64_t fft_time;
int64_t cqt_time;
diff --git a/libavfilter/x86/avf_showcqt.c b/libavfilter/x86/avf_showcqt.c
index b8e9d32..d0c90a5 100644
--- a/libavfilter/x86/avf_showcqt.c
+++ b/libavfilter/x86/avf_showcqt.c
@@ -27,6 +27,10 @@
#include <xmmintrin.h>
#endif
+#if HAVE_SSE2_INTRINSIC
+#include <emmintrin.h>
+#endif
+
#if HAVE_SSE3_INTRINSIC
#include <pmmintrin.h>
#endif
@@ -259,6 +263,282 @@ static void permute_coeffs_avx(float *v, int len)
}
#endif
+#if HAVE_SSE2_INTRINSIC
+static av_intrinsic_sse2 /* SSE2 bar renderer for packed 3-bytes-per-pixel output; installed for AV_PIX_FMT_RGB24 by ff_showcqt_init_x86() */
+void draw_bar_rgb24_sse2(AVFrame *out, const float *h, const float *rcp_h,
+ const ColorFloat *color, int bar_h)
+{
+ const float *c; /* walks color[] as raw floats, 12 per group of 4 pixels */
+ int x, y, w = out->width;
+ float rcp_bar_h = 1.0f / bar_h; /* reciprocal, so the per-row threshold is a multiply */
+ uint8_t *v = out->data[0];
+ uint8_t *lp; /* write cursor inside the current row */
+ int ls = out->linesize[0];
+ __m128i is_le, rri, rgi, rbi;
+ __m128 hx, ht, mul, rr, rg, rb;
+ uint32_t red, green, blue;
+
+ for (y = 0; y < bar_h; y++) { /* one output row per iteration */
+ lp = v + ls * y;
+ ht = _mm_set1_ps((bar_h - y) * rcp_bar_h); /* row threshold: 1.0 at y == 0, falling to 1/bar_h on the last row */
+ x = 0;
+ c = (const float *) color; /* expects the planar 4+4+4 layout written by permute_color_bar_full_chroma_sse2() */
+ do {
+ hx = _mm_load_ps(h+x); /* aligned load of four h[] values (h + x must be 16-byte aligned) */
+ is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); /* per-lane mask: h <= threshold */
+ is_le = _mm_packs_epi32(is_le, is_le); /* narrow the 32-bit masks ... */
+ is_le = _mm_packs_epi16(is_le, is_le); /* ... to one byte per lane in the low dword */
+ if (-1 == _mm_cvtsi128_si32(is_le)) { /* all four lanes at or below the threshold */
+ memset(lp, 0, 12); /* 4 pixels x 3 bytes set to zero */
+ } else {
+ mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); /* max(h - threshold, 0): lanes below the threshold contribute 0 */
+ mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); /* scale by the caller-supplied rcp_h[] */
+ rr = _mm_mul_ps(mul, _mm_load_ps(c)); /* first channel of 4 pixels */
+ rg = _mm_mul_ps(mul, _mm_load_ps(c+4)); /* second channel of 4 pixels */
+ rb = _mm_mul_ps(mul, _mm_load_ps(c+8)); /* third channel of 4 pixels */
+ rri = _mm_cvtps_epi32(rr); /* float -> int32 using the current MXCSR rounding mode */
+ rgi = _mm_cvtps_epi32(rg);
+ rbi = _mm_cvtps_epi32(rb);
+ rri = _mm_packs_epi32(rri, rri); /* int32 -> int16 with signed saturation */
+ rgi = _mm_packs_epi32(rgi, rgi);
+ rbi = _mm_packs_epi32(rbi, rbi);
+ rri = _mm_packus_epi16(rri, rri); /* int16 -> uint8 with unsigned saturation (clamps to 0..255) */
+ rgi = _mm_packus_epi16(rgi, rgi);
+ rbi = _mm_packus_epi16(rbi, rbi);
+ red = _mm_cvtsi128_si32(rri); /* low dword now holds four 8-bit samples, pixel 0 in the low byte */
+ green = _mm_cvtsi128_si32(rgi);
+ blue = _mm_cvtsi128_si32(rbi);
+ lp[0] = red; lp[1] = green; lp[2] = blue; /* pixel 0: stores truncate to the low byte */
+ red >>= 8; green >>= 8; blue >>= 8;
+ lp[3] = red; lp[4] = green; lp[5] = blue; /* pixel 1 */
+ red >>= 8; green >>= 8; blue >>= 8;
+ lp[6] = red; lp[7] = green; lp[8] = blue; /* pixel 2 */
+ red >>= 8; green >>= 8; blue >>= 8;
+ lp[9] = red; lp[10] = green; lp[11] = blue; /* pixel 3 */
+ }
+ lp += 12;
+ x += 4; c += 12; /* next group: 4 pixels, 12 colour floats */
+ } while (x < w); /* do/while in steps of 4: the row is written up to the next multiple of 4 at or past w */
+ }
+}
+
+static av_intrinsic_sse2 /* SSE2 bar renderer for three-plane output; installed for AV_PIX_FMT_YUV444P, YUV422P and YUV420P by ff_showcqt_init_x86() */
+void draw_bar_yuv_sse2(AVFrame *out, const float *h, const float *rcp_h,
+ const ColorFloat *color, int bar_h)
+{
+ const float *c; /* walks color[] as raw floats; group size depends on the macro used */
+ int x, y, yh, w = out->width;
+ float rcp_bar_h = 1.0f / bar_h; /* reciprocal, so the per-row threshold is a multiply */
+ uint8_t *vy = out->data[0], *vu = out->data[1], *vv = out->data[2];
+ uint8_t *lpy, *lpu, *lpv; /* per-plane write cursors for the current row */
+ int lsy = out->linesize[0], lsu = out->linesize[1], lsv = out->linesize[2];
+ int fmt = out->format; /* selects which row macro is expanded below */
+ __m128i is_le, ryi, ryi2, rui, rvi;
+ __m128 ht, hx, hx2, mul, mul2, mul3, ry, ry2, ru, rv;
+
+#define DRAW_BAR_LINE_FULL_CHROMA() /* one row, 4 pixels per step: stores 4 bytes to each of lpy/lpu/lpv; color read as 12-float groups (c, c+4, c+8); empty groups get 0x10 / 0x80 / 0x80 */ \
+do { \
+ x = 0; \
+ c = (const float *) color; \
+ do { \
+ hx = _mm_load_ps(h+x); \
+ is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); \
+ is_le = _mm_packs_epi32(is_le, is_le); \
+ is_le = _mm_packs_epi16(is_le, is_le); \
+ if (-1 == _mm_cvtsi128_si32(is_le)) { \
+ *((uint32_t *) lpy) = 0x10101010; \
+ *((uint32_t *) lpu) = 0x80808080; \
+ *((uint32_t *) lpv) = 0x80808080; \
+ } else { \
+ mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); \
+ mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); \
+ ry = _mm_mul_ps(mul, _mm_load_ps(c)); \
+ ru = _mm_mul_ps(mul, _mm_load_ps(c+4)); \
+ rv = _mm_mul_ps(mul, _mm_load_ps(c+8)); \
+ ry = _mm_add_ps(ry, _mm_set1_ps(16.0f)); \
+ ru = _mm_add_ps(ru, _mm_set1_ps(128.0f)); \
+ rv = _mm_add_ps(rv, _mm_set1_ps(128.0f)); \
+ ryi = _mm_cvtps_epi32(ry); \
+ rui = _mm_cvtps_epi32(ru); \
+ rvi = _mm_cvtps_epi32(rv); \
+ ryi = _mm_packs_epi32(ryi, ryi); \
+ rui = _mm_packs_epi32(rui, rui); \
+ rvi = _mm_packs_epi32(rvi, rvi); \
+ ryi = _mm_packus_epi16(ryi, ryi); \
+ rui = _mm_packus_epi16(rui, rui); \
+ rvi = _mm_packus_epi16(rvi, rvi); \
+ *((int32_t *) lpy) = _mm_cvtsi128_si32(ryi); \
+ *((int32_t *) lpu) = _mm_cvtsi128_si32(rui); \
+ *((int32_t *) lpv) = _mm_cvtsi128_si32(rvi); \
+ } \
+ lpy += 4; lpu += 4; lpv += 4; \
+ x += 4; c += 12; \
+ } while (x < w); \
+} while (0)
+
+#define DRAW_BAR_LINE_HALF_CHROMA() /* one row, 8 pixels per step: stores 8 bytes to lpy and 4 bytes to each of lpu/lpv; color read as 16-float groups (y at c and c+8, u at c+4, v at c+12); mul3 picks lanes 0 and 2 of each half for the chroma samples */ \
+do { \
+ x = 0; \
+ c = (const float *) color; \
+ do { \
+ hx = _mm_load_ps(h+x); \
+ hx2 = _mm_load_ps(h+x+4); \
+ is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); \
+ is_le = _mm_packs_epi32(is_le, _mm_castps_si128(_mm_cmple_ps(hx2, ht))); \
+ is_le = _mm_packs_epi16(is_le, is_le); \
+ is_le = _mm_packs_epi16(is_le, is_le); \
+ if (-1 == _mm_cvtsi128_si32(is_le)) { \
+ *((uint32_t *) lpy) = 0x10101010; \
+ *((uint32_t *) (lpy+4)) = 0x10101010; \
+ *((uint32_t *) lpu) = 0x80808080; \
+ *((uint32_t *) lpv) = 0x80808080; \
+ } else { \
+ mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); \
+ mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); \
+ mul2 = _mm_max_ps(_mm_sub_ps(hx2, ht), _mm_setzero_ps()); \
+ mul2 = _mm_mul_ps(mul2, _mm_load_ps(rcp_h + x + 4)); \
+ mul3 = _mm_shuffle_ps(mul, mul2, _MM_SHUFFLE(2,0,2,0)); \
+ ry = _mm_mul_ps(mul, _mm_load_ps(c)); \
+ ry2 = _mm_mul_ps(mul2, _mm_load_ps(c+8)); \
+ ru = _mm_mul_ps(mul3, _mm_load_ps(c+4)); \
+ rv = _mm_mul_ps(mul3, _mm_load_ps(c+12)); \
+ ry = _mm_add_ps(ry, _mm_set1_ps(16.0f)); \
+ ry2 = _mm_add_ps(ry2, _mm_set1_ps(16.0f)); \
+ ru = _mm_add_ps(ru, _mm_set1_ps(128.0f)); \
+ rv = _mm_add_ps(rv, _mm_set1_ps(128.0f)); \
+ ryi = _mm_cvtps_epi32(ry); \
+ ryi2 = _mm_cvtps_epi32(ry2); \
+ rui = _mm_cvtps_epi32(ru); \
+ rvi = _mm_cvtps_epi32(rv); \
+ ryi = _mm_packs_epi32(ryi, ryi); \
+ ryi2 = _mm_packs_epi32(ryi2, ryi2); \
+ rui = _mm_packs_epi32(rui, rui); \
+ rvi = _mm_packs_epi32(rvi, rvi); \
+ ryi = _mm_packus_epi16(ryi, ryi); \
+ ryi2 = _mm_packus_epi16(ryi2, ryi2); \
+ rui = _mm_packus_epi16(rui, rui); \
+ rvi = _mm_packus_epi16(rvi, rvi); \
+ *((int32_t *) lpy) = _mm_cvtsi128_si32(ryi); \
+ *((int32_t *) (lpy+4)) = _mm_cvtsi128_si32(ryi2); \
+ *((int32_t *) lpu) = _mm_cvtsi128_si32(rui); \
+ *((int32_t *) lpv) = _mm_cvtsi128_si32(rvi); \
+ } \
+ lpy += 8; lpu += 4; lpv += 4; \
+ x += 8; c += 16; \
+ } while (x < w); \
+} while (0)
+
+#define DRAW_BAR_LINE_NO_CHROMA() /* one row, 8 pixels per step: stores 8 bytes to lpy only (lpu/lpv are not touched); reads only the y floats at c and c+8 of each 16-float group */ \
+do { \
+ x = 0; \
+ c = (const float *) color; \
+ do { \
+ hx = _mm_load_ps(h+x); \
+ hx2 = _mm_load_ps(h+x+4); \
+ is_le = _mm_castps_si128(_mm_cmple_ps(hx, ht)); \
+ is_le = _mm_packs_epi32(is_le, _mm_castps_si128(_mm_cmple_ps(hx2, ht))); \
+ is_le = _mm_packs_epi16(is_le, is_le); \
+ is_le = _mm_packs_epi16(is_le, is_le); \
+ if (-1 == _mm_cvtsi128_si32(is_le)) { \
+ *((uint32_t *) lpy) = 0x10101010; \
+ *((uint32_t *) (lpy+4)) = 0x10101010; \
+ } else { \
+ mul = _mm_max_ps(_mm_sub_ps(hx, ht), _mm_setzero_ps()); \
+ mul = _mm_mul_ps(mul, _mm_load_ps(rcp_h + x)); \
+ mul2 = _mm_max_ps(_mm_sub_ps(hx2, ht), _mm_setzero_ps()); \
+ mul2 = _mm_mul_ps(mul2, _mm_load_ps(rcp_h + x + 4)); \
+ ry = _mm_mul_ps(mul, _mm_load_ps(c)); \
+ ry2 = _mm_mul_ps(mul2, _mm_load_ps(c+8)); \
+ ry = _mm_add_ps(ry, _mm_set1_ps(16.0f)); \
+ ry2 = _mm_add_ps(ry2, _mm_set1_ps(16.0f)); \
+ ryi = _mm_cvtps_epi32(ry); \
+ ryi2 = _mm_cvtps_epi32(ry2); \
+ ryi = _mm_packs_epi32(ryi, ryi); \
+ ryi2 = _mm_packs_epi32(ryi2, ryi2); \
+ ryi = _mm_packus_epi16(ryi, ryi); \
+ ryi2 = _mm_packus_epi16(ryi2, ryi2); \
+ *((int32_t *) lpy) = _mm_cvtsi128_si32(ryi); \
+ *((int32_t *) (lpy+4)) = _mm_cvtsi128_si32(ryi2); \
+ } \
+ lpy += 8; \
+ x += 8; c += 16; \
+ } while (x < w); \
+} while (0)
+
+ for (y = 0; y < bar_h; y += 2) { /* rows are drawn in pairs (y, y+1); NOTE(review): row y+1 is not checked against bar_h - presumably bar_h is even, confirm against the caller */
+ yh = (fmt == AV_PIX_FMT_YUV420P) ? y / 2 : y; /* chroma row index: halved for YUV420P, same as luma otherwise */
+ ht = _mm_set1_ps((bar_h - y) * rcp_bar_h); /* threshold for row y: 1.0 at y == 0 */
+ lpy = vy + y * lsy;
+ lpu = vu + yh * lsu;
+ lpv = vv + yh * lsv;
+ if (fmt == AV_PIX_FMT_YUV444P) /* first row of the pair always writes chroma */
+ DRAW_BAR_LINE_FULL_CHROMA();
+ else
+ DRAW_BAR_LINE_HALF_CHROMA();
+
+ ht = _mm_set1_ps((bar_h - (y+1)) * rcp_bar_h); /* threshold for row y+1 */
+ lpy = vy + (y+1) * lsy;
+ lpu = vu + (y+1) * lsu; /* chroma cursors use the luma row index; only the YUV444P/YUV422P branches below read them */
+ lpv = vv + (y+1) * lsv;
+ if (fmt == AV_PIX_FMT_YUV444P)
+ DRAW_BAR_LINE_FULL_CHROMA();
+ else if (fmt == AV_PIX_FMT_YUV422P)
+ DRAW_BAR_LINE_HALF_CHROMA();
+ else /* remaining format: luma only, the chroma row was already written together with row y */
+ DRAW_BAR_LINE_NO_CHROMA();
+ }
+#undef DRAW_BAR_LINE_FULL_CHROMA
+#undef DRAW_BAR_LINE_HALF_CHROMA
+#undef DRAW_BAR_LINE_NO_CHROMA
+}
+
+static void permute_color_bar_full_chroma_sse2(ColorFloat *out, const ColorFloat *in, int len)
+{
+    /*
+     * Repack four input colours at a time into a 12-float group:
+     *   [0..3] .yuv.y, [4..7] .yuv.u, [8..11] .yuv.v of entries 0..3.
+     * in[] is consumed in whole groups of four, i.e. it is read up to
+     * the next multiple of four at or past len.
+     * Installed for both RGB24 and YUV444P by ff_showcqt_init_x86().
+     */
+    float *dst = (float *) out;
+    int base, lane;
+
+    for (base = 0; base < len; base += 4, dst += 12) {
+        for (lane = 0; lane < 4; lane++) {
+            dst[lane]     = in[base + lane].yuv.y;
+            dst[lane + 4] = in[base + lane].yuv.u;
+            dst[lane + 8] = in[base + lane].yuv.v;
+        }
+    }
+}
+
+static void permute_color_bar_half_chroma_sse2(ColorFloat *out, const ColorFloat *in, int len)
+{
+    /*
+     * Repack eight input colours at a time into a 16-float group:
+     *   [0..3]   .yuv.y of entries 0..3
+     *   [4..7]   .yuv.u of entries 0, 2, 4, 6
+     *   [8..11]  .yuv.y of entries 4..7
+     *   [12..15] .yuv.v of entries 0, 2, 4, 6
+     * Only the even entries contribute u/v. in[] is consumed in whole
+     * groups of eight, i.e. it is read up to the next multiple of
+     * eight at or past len.
+     */
+    float *dst = (float *) out;
+    int base, lane;
+
+    for (base = 0; base < len; base += 8, dst += 16) {
+        for (lane = 0; lane < 4; lane++) {
+            dst[lane]      = in[base + lane].yuv.y;
+            dst[lane + 8]  = in[base + lane + 4].yuv.y;
+            dst[lane + 4]  = in[base + 2 * lane].yuv.u;
+            dst[lane + 12] = in[base + 2 * lane].yuv.v;
+        }
+    }
+}
+#endif
+
av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
{
int cpu_flags = av_get_cpu_flags();
@@ -286,4 +566,24 @@ av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
s->cqt_align = 8;
}
#endif
+
+#if HAVE_SSE2_INTRINSIC
+ if (cpu_flags & AV_CPU_FLAG_SSE2) {
+ switch (s->format) {
+ case AV_PIX_FMT_RGB24:
+ s->permute_color_bar = permute_color_bar_full_chroma_sse2;
+ s->draw_bar = draw_bar_rgb24_sse2;
+ break;
+ case AV_PIX_FMT_YUV444P:
+ s->permute_color_bar = permute_color_bar_full_chroma_sse2;
+ s->draw_bar = draw_bar_yuv_sse2;
+ break;
+ case AV_PIX_FMT_YUV422P:
+ case AV_PIX_FMT_YUV420P:
+ s->permute_color_bar = permute_color_bar_half_chroma_sse2;
+ s->draw_bar = draw_bar_yuv_sse2;
+ break;
+ }
+ }
+#endif
}
--
2.5.0
More information about the ffmpeg-devel
mailing list