[FFmpeg-devel] [PATCH] libavfilter/ebur128: SIMD optimization
gkhayat at spectre-music.com
gkhayat at spectre-music.com
Thu Apr 17 17:33:16 EEST 2025
From: Guillaume Khayat <gkhayat at spectre-music.com>
Improve the performance (+17%) of the ebur128 filter by using AVX2 and FMA instructions in the body of the filter_frame function.
## Benchmark
Tested with hyperfine
hyperfine --warmup 2 "./ffmpeg_reference -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null -" "./ffmpeg_avx -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null -"
Benchmark 1: ./ffmpeg_reference -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null -
Time (mean ± σ): 7.118 s ± 0.037 s [User: 9.114 s, System: 1.038 s]
Range (min … max): 7.073 s … 7.177 s 10 runs
Benchmark 2: ./ffmpeg_avx -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null -
Time (mean ± σ): 6.073 s ± 0.108 s [User: 7.903 s, System: 1.058 s]
Range (min … max): 5.955 s … 6.327 s 10 runs
Summary
./ffmpeg_avx -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null - ran
1.17 ± 0.02 times faster than ./ffmpeg_reference -i ~/test.wav -vn -af ebur128=peak=none:framelog=quiet -f null -
## Tests
- all FATE tests pass, tested on Darwin/arm64 and Linux/x86_64 w/ AVX2/FMA support
- On an AVX2/FMA-capable system, all test files from the EBU yield exactly the same output values (I/LRA) before and after the optimization. See https://tech.ebu.ch/publications/ebu_loudness_test_set
Disclaimer: this is my first ever patch submission to FFmpeg, and the first time I have used git send-email to submit a patch anywhere.
Signed-off-by: Cesar Matheus <cesar.matheus at telecom-paris.fr>
Signed-off-by: Guillaume Khayat <gkhayat at spectre-music.com>
---
libavfilter/f_ebur128.c | 246 ++++++++++++++++++++++++++++++++++------
1 file changed, 214 insertions(+), 32 deletions(-)
diff --git a/libavfilter/f_ebur128.c b/libavfilter/f_ebur128.c
index 768f062bac..e305b0a3ce 100644
--- a/libavfilter/f_ebur128.c
+++ b/libavfilter/f_ebur128.c
@@ -28,7 +28,7 @@
#include <float.h>
#include <math.h>
-
+#include "libavutil/intmath.h"
#include "libavutil/avassert.h"
#include "libavutil/channel_layout.h"
#include "libavutil/dict.h"
@@ -199,7 +199,7 @@ static const AVOption ebur128_options[] = {
};
AVFILTER_DEFINE_CLASS(ebur128);
-
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
static const uint8_t graph_colors[] = {
0xdd, 0x66, 0x66, // value above 1LU non reached below -1LU (impossible)
0x66, 0x66, 0xdd, // value below 1LU non reached below -1LU
@@ -628,13 +628,61 @@ static int gate_update(struct integrator *integ, double power,
static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
{
- int i, ch, idx_insample, ret;
+
+ int i, ch, idx_insample, ret,bin_id_400,bin_id_3000;
AVFilterContext *ctx = inlink->dst;
EBUR128Context *ebur128 = ctx->priv;
const int nb_channels = ebur128->nb_channels;
const int nb_samples = insamples->nb_samples;
const double *samples = (double *)insamples->data[0];
AVFrame *pic;
+
+#if HAVE_AVX2_EXTERNAL && HAVE_AVX2
+ double bin[4];
+ __m256d pre_b_0,pre_b_1,pre_b_2,pre_a_1,pre_a_2,rlb_b_0,rlb_b_1,rlb_b_2,rlb_a_1,rlb_a_2,x1,x2,x0,y1,y2,y0,z1,z2,z0; //
+
+ /**
+ * Broadcast each coefficient of pre_b, pre_a, rlb_b and rlb_a into a vector of 4 doubles; each element
+ * of a vector corresponds to one channel. Here we coded the case where there are 2 channels to deal with; the cases where nb_channels = 3 or
+ * 4 can easily be implemented following the exact same methodology.
+ */
+
+ // Case where nb_channel = 3 : pre_b_0 = _mm256_setr_ps(ebur128->pre_b[0],ebur128->pre_b[0], ebur128->pre_b[0], 0.0);
+ bin[0] = 0.0;
+ bin[1] = 0.0;
+ bin[2] = 0.0;
+ bin[3] = 0.0;
+
+ // Load the pre_b coefficients into three 4x64-bit vectors
+ pre_b_0 = _mm256_set1_pd(ebur128->pre_b[0]);
+ pre_b_1 = _mm256_set1_pd(ebur128->pre_b[1]);
+ pre_b_2 = _mm256_set1_pd(ebur128->pre_b[2]);
+
+ // Load the pre_a coefficients into two 4x64-bit vectors; pre_a[0] is not used here, so there is no need to load it
+ pre_a_1 = _mm256_set1_pd(ebur128->pre_a[1]);
+ pre_a_2 = _mm256_set1_pd(ebur128->pre_a[2]);
+
+ // Load rlb_b
+ rlb_b_0 = _mm256_set1_pd(ebur128->rlb_b[0]);
+ rlb_b_1 = _mm256_set1_pd(ebur128->rlb_b[1]);
+ rlb_b_2 = _mm256_set1_pd(ebur128->rlb_b[2]);
+
+ // Load rlb_a
+ rlb_a_1 = _mm256_set1_pd(ebur128->rlb_a[1]);
+ rlb_a_2 = _mm256_set1_pd(ebur128->rlb_a[2]);
+
+ // At the start, all the filter buffers are set to 0
+ x1 = _mm256_set1_pd(0.0);
+ x2 = _mm256_set1_pd(0.0);
+
+ y0 = _mm256_set1_pd(0.0);
+ y1 = _mm256_set1_pd(0.0);
+ y2 = _mm256_set1_pd(0.0);
+
+ z0 =_mm256_set1_pd(0.0);
+ z1 = _mm256_set1_pd(0.0);
+ z2 = _mm256_set1_pd(0.0);
+#endif
#if CONFIG_SWRESAMPLE
if (ebur128->peak_mode & PEAK_MODE_TRUE_PEAKS && ebur128->idx_insample == 0) {
@@ -657,8 +705,8 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
#endif
for (idx_insample = ebur128->idx_insample; idx_insample < nb_samples; idx_insample++) {
- const int bin_id_400 = ebur128->i400.cache_pos;
- const int bin_id_3000 = ebur128->i3000.cache_pos;
+ bin_id_400 = ebur128->i400.cache_pos;
+ bin_id_3000 = ebur128->i3000.cache_pos;
#define MOVE_TO_NEXT_CACHED_ENTRY(time) do { \
ebur128->i##time.cache_pos++; \
@@ -671,46 +719,180 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *insamples)
MOVE_TO_NEXT_CACHED_ENTRY(400);
MOVE_TO_NEXT_CACHED_ENTRY(3000);
+
+#if HAVE_AVX2_EXTERNAL && HAVE_AVX2
+ // Performs filter computation in parallel for the first 4 channels of the audio file
+ for (ch = 0; ch < MIN(4,nb_channels); ch++) {
+ if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS){
+ ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[0], fabs(samples[idx_insample * nb_channels ]));
+ }
+ bin[ch] = samples[idx_insample * nb_channels +ch ];
+ }
+
+ // Initialise x0
+ x0 = _mm256_setr_pd(bin[0],bin[1], bin[2],bin[3]);
+ y2 = y1;
+ y1 = y0;
+ y0 = _mm256_fmadd_pd(x0,pre_b_0,_mm256_fmadd_pd(x1,pre_b_1,_mm256_fmadd_pd(x2,pre_b_2,_mm256_fnmsub_pd(y1,pre_a_1,_mm256_mul_pd(y2,pre_a_2)))));
+
+
+ x2 = x1;
+ x1 = x0;
+
+ z2 = z1;
+ z1 = z0;
+
+ z0 = _mm256_fmadd_pd(y0,rlb_b_0,_mm256_fmadd_pd(y1,rlb_b_1,_mm256_fmadd_pd(y2,rlb_b_2,_mm256_fnmsub_pd(z1,rlb_a_1,_mm256_mul_pd(z2,rlb_a_2)))));
+
+
+ // Retrieve the squared filtered values (z0 * z0); bin[i] gets the value corresponding to channel i
+ _mm256_store_pd(bin, _mm256_mul_pd(z0, z0));
+
+
+
+ /**
+ * Add the new value, and limit the sum to the cache size (400ms or 3s)
+ * by removing the oldest one
+ * Update the sum and cache, depending on the number of channels
+ */
+ switch(nb_channels){
+ case 1:
+
+ ebur128->i400.sum [0] = ebur128->i400.sum [0] + bin[0] - ebur128->i400.cache [0][bin_id_400];
+ ebur128->i3000.sum[0] = ebur128->i3000.sum[0] + bin[0] - ebur128->i3000.cache[0][bin_id_3000];
+ ebur128->i400.cache [0][bin_id_400 ] = bin[0];
+ ebur128->i3000.cache[0][bin_id_3000] = bin[0];
+ break;
+ case 2:
+
+ ebur128->i400.sum [0] = ebur128->i400.sum [0] + bin[0] - ebur128->i400.cache [0][bin_id_400];
+ ebur128->i3000.sum[0] = ebur128->i3000.sum[0] + bin[0] - ebur128->i3000.cache[0][bin_id_3000];
+
+ ebur128->i400.cache [0][bin_id_400 ] = bin[0];
+ ebur128->i3000.cache[0][bin_id_3000] = bin[0];
+
+ ebur128->i400.sum [1] = ebur128->i400.sum [1] + bin[1] - ebur128->i400.cache [1][bin_id_400];
+ ebur128->i3000.sum[1] = ebur128->i3000.sum[1] + bin[1] - ebur128->i3000.cache[1][bin_id_3000];
- for (ch = 0; ch < nb_channels; ch++) {
- double bin;
+ ebur128->i400.cache [1][bin_id_400 ] = bin[1];
+ ebur128->i3000.cache[1][bin_id_3000] = bin[1];
+ break;
+
+ case 3:
+
+ ebur128->i400.sum [0] = ebur128->i400.sum [0] + bin[0] - ebur128->i400.cache [0][bin_id_400];
+ ebur128->i3000.sum[0] = ebur128->i3000.sum[0] + bin[0] - ebur128->i3000.cache[0][bin_id_3000];
+ ebur128->i400.cache [0][bin_id_400 ] = bin[0];
+ ebur128->i3000.cache[0][bin_id_3000] = bin[0];
+
+ ebur128->i400.sum [1] = ebur128->i400.sum [1] + bin[1] - ebur128->i400.cache [1][bin_id_400];
+ ebur128->i3000.sum[1] = ebur128->i3000.sum[1] + bin[1] - ebur128->i3000.cache[1][bin_id_3000];
+ ebur128->i400.cache [1][bin_id_400 ] = bin[1];
+ ebur128->i3000.cache[1][bin_id_3000] = bin[1];
+
+ ebur128->i400.sum [2] = ebur128->i400.sum [2] + bin[2] - ebur128->i400.cache [2][bin_id_400];
+ ebur128->i3000.sum[2] = ebur128->i3000.sum[2] + bin[2] - ebur128->i3000.cache[2][bin_id_3000];
+ ebur128->i400.cache [2][bin_id_400 ] = bin[2];
+ ebur128->i3000.cache[2][bin_id_3000] = bin[2];
+ break;
+
+ default :
+ ebur128->i400.sum[0] = ebur128->i400.sum [0] + bin[0] - ebur128->i400.cache [0][bin_id_400];
+ ebur128->i3000.sum[0] = ebur128->i3000.sum[0] + bin[0] - ebur128->i3000.cache[0][bin_id_3000];
+ ebur128->i400.cache[0][bin_id_400 ] = bin[0];
+ ebur128->i3000.cache[0][bin_id_3000] = bin[0];
+
+ ebur128->i400.sum[1] = ebur128->i400.sum [1] + bin[1] - ebur128->i400.cache [1][bin_id_400];
+ ebur128->i3000.sum[1] = ebur128->i3000.sum[1] + bin[1] - ebur128->i3000.cache[1][bin_id_3000];
+ ebur128->i400.cache[1][bin_id_400 ] = bin[1];
+ ebur128->i3000.cache[1][bin_id_3000] = bin[1];
+
+ ebur128->i400.sum[2] = ebur128->i400.sum [2] + bin[2] - ebur128->i400.cache [2][bin_id_400];
+ ebur128->i3000.sum[2] = ebur128->i3000.sum[2] + bin[2] - ebur128->i3000.cache[2][bin_id_3000];
+ ebur128->i400.cache[2][bin_id_400 ] = bin[2];
+ ebur128->i3000.cache[2][bin_id_3000] = bin[2];
+
+ ebur128->i400.sum[3] = ebur128->i400.sum [3] + bin[3] - ebur128->i400.cache [3][bin_id_400];
+ ebur128->i3000.sum[3] = ebur128->i3000.sum[3] + bin[3] - ebur128->i3000.cache[3][bin_id_3000];
+ ebur128->i400.cache [3][bin_id_400 ] = bin[3];
+ ebur128->i3000.cache[3][bin_id_3000] = bin[3];
+ break;
+ }
+ // Use the classic version to compute data from the remaining channels
+ for (ch = 4; ch < nb_channels; ch++) {
+ double bin2;
if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS)
ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(samples[idx_insample * nb_channels + ch]));
ebur128->x[ch * 3] = samples[idx_insample * nb_channels + ch]; // set X[i]
-
if (!ebur128->ch_weighting[ch])
continue;
+
+ #define FILTER(Y, X, NUM, DEN) do { \
+ double *dst = ebur128->Y + ch*3; \
+ double *src = ebur128->X + ch*3; \
+ dst[2] = dst[1]; \
+ dst[1] = dst[0]; \
+ dst[0] = src[0]*NUM[0] + src[1]*NUM[1] + src[2]*NUM[2] \
+ - dst[1]*DEN[1] - dst[2]*DEN[2]; \
+ } while (0)
+
+ // TODO: merge both filters in one?
+ FILTER(y, x, ebur128->pre_b, ebur128->pre_a); // apply pre-filter
+ ebur128->x[ch * 3 + 2] = ebur128->x[ch * 3 + 1];
+ ebur128->x[ch * 3 + 1] = ebur128->x[ch * 3 ];
+ FILTER(z, y, ebur128->rlb_b, ebur128->rlb_a); // apply RLB-filter
+
+ bin2 = ebur128->z[ch * 3] * ebur128->z[ch * 3];
+
+ /* Add the new value, and limit the sum to the cache size (400ms or 3s)
+ * by removing the oldest one */
+ ebur128->i400.sum [ch] = ebur128->i400.sum [ch] + bin2 - ebur128->i400.cache [ch][bin_id_400];
+ ebur128->i3000.sum[ch] = ebur128->i3000.sum[ch] + bin2 - ebur128->i3000.cache[ch][bin_id_3000];
+
+ // Override old cache entry with the new value
+ ebur128->i400.cache [ch][bin_id_400 ] = bin2;
+ ebur128->i3000.cache[ch][bin_id_3000] = bin2;
+ }
- /* Y[i] = X[i]*b0 + X[i-1]*b1 + X[i-2]*b2 - Y[i-1]*a1 - Y[i-2]*a2 */
-#define FILTER(Y, X, NUM, DEN) do { \
- double *dst = ebur128->Y + ch*3; \
- double *src = ebur128->X + ch*3; \
- dst[2] = dst[1]; \
- dst[1] = dst[0]; \
- dst[0] = src[0]*NUM[0] + src[1]*NUM[1] + src[2]*NUM[2] \
- - dst[1]*DEN[1] - dst[2]*DEN[2]; \
-} while (0)
-
- // TODO: merge both filters in one?
- FILTER(y, x, ebur128->pre_b, ebur128->pre_a); // apply pre-filter
- ebur128->x[ch * 3 + 2] = ebur128->x[ch * 3 + 1];
- ebur128->x[ch * 3 + 1] = ebur128->x[ch * 3 ];
- FILTER(z, y, ebur128->rlb_b, ebur128->rlb_a); // apply RLB-filter
+#else
- bin = ebur128->z[ch * 3] * ebur128->z[ch * 3];
+ for (ch = 0; ch < nb_channels; ch++) {
+ double bin;
+ if (ebur128->peak_mode & PEAK_MODE_SAMPLES_PEAKS)
+ ebur128->sample_peaks[ch] = FFMAX(ebur128->sample_peaks[ch], fabs(samples[idx_insample * nb_channels + ch]));
- /* add the new value, and limit the sum to the cache size (400ms or 3s)
- * by removing the oldest one */
- ebur128->i400.sum [ch] = ebur128->i400.sum [ch] + bin - ebur128->i400.cache [ch][bin_id_400];
- ebur128->i3000.sum[ch] = ebur128->i3000.sum[ch] + bin - ebur128->i3000.cache[ch][bin_id_3000];
+ ebur128->x[ch * 3] = samples[idx_insample * nb_channels + ch]; // Set X[i]
- /* override old cache entry with the new value */
- ebur128->i400.cache [ch][bin_id_400 ] = bin;
- ebur128->i3000.cache[ch][bin_id_3000] = bin;
+ if (!ebur128->ch_weighting[ch])
+ continue;
+
+ #define FILTER(Y, X, NUM, DEN) do { \
+ double *dst = ebur128->Y + ch*3; \
+ double *src = ebur128->X + ch*3; \
+ dst[2] = dst[1]; \
+ dst[1] = dst[0]; \
+ dst[0] = src[0]*NUM[0] + src[1]*NUM[1] + src[2]*NUM[2] \
+ - dst[1]*DEN[1] - dst[2]*DEN[2]; \
+ } while (0)
+
+ // TODO: merge both filters in one?
+ FILTER(y, x, ebur128->pre_b, ebur128->pre_a); // Apply pre-filter
+ ebur128->x[ch * 3 + 2] = ebur128->x[ch * 3 + 1];
+ ebur128->x[ch * 3 + 1] = ebur128->x[ch * 3 ];
+ FILTER(z, y, ebur128->rlb_b, ebur128->rlb_a); // Apply RLB-filter
+ bin = ebur128->z[ch * 3] * ebur128->z[ch * 3];
+ /* Add the new value, and limit the sum to the cache size (400ms or 3s)
+ * by removing the oldest one */
+ ebur128->i400.sum [ch] = ebur128->i400.sum [ch] + bin - ebur128->i400.cache [ch][bin_id_400];
+ ebur128->i3000.sum[ch] = ebur128->i3000.sum[ch] + bin - ebur128->i3000.cache[ch][bin_id_3000];
+ // Override old cache entry with the new value
+ ebur128->i400.cache [ch][bin_id_400 ] = bin;
+ ebur128->i3000.cache[ch][bin_id_3000] = bin;
}
-
+#endif
+
#define FIND_PEAK(global, sp, ptype) do { \
int ch; \
double maxpeak; \
--
2.49.0
More information about the ffmpeg-devel
mailing list