[FFmpeg-devel] [PATCH 2/5] fate: avoid framemd5, use framecrc its faster
Giorgio Vazzana
mywing81 at gmail.com
Fri May 17 19:58:54 CEST 2013
2013/5/17 Michael Niedermayer <michaelni at gmx.at>:
> On Fri, May 17, 2013 at 03:15:59PM +0200, Giorgio Vazzana wrote:
>> Hi,
>>
>> about a year ago I studied ffmpeg's implementation of md5, and I
>> noticed it was slow compared to GNU md5sum. So I managed to make it
>> almost as fast as the GNU implementation, but I never sent a patch
>> because I wanted to work on it some more... of course I never did that
>> because of lack of free time. Anyway, I found my old patch, and since it's
>> relatively simple I thought I'd send it in, hoping you will find it
>> useful, maybe as a temporary solution until someone else comes up with
>> something faster.
>>
>> Quick benchmark:
>> # create test file and compile test program
>> $ dd if=/dev/zero of=data bs=1M count=500
>> $ gcc -Wall avutil_md5_test.c -O2 -o avutil_md5_test -L ./libavutil/ -lavutil
>>
>>
>> # without the patch:
>> $ for i in $(seq 1 3); do time ./avutil_md5_test < data; done
>> d8b61b2c0025919d5321461045c8226f
>>
>> real 0m2.906s
>> user 0m2.680s
>> sys 0m0.180s
>> d8b61b2c0025919d5321461045c8226f
>>
>> real 0m2.936s
>> user 0m2.560s
>> sys 0m0.330s
>> d8b61b2c0025919d5321461045c8226f
>>
>> real 0m2.903s
>> user 0m2.690s
>> sys 0m0.170s
>>
>>
>> # with the patch:
>> $ for i in $(seq 1 3); do time ./avutil_md5_test < data; done
>> d8b61b2c0025919d5321461045c8226f
>>
>> real 0m1.896s
>> user 0m1.580s
>> sys 0m0.280s
>> d8b61b2c0025919d5321461045c8226f
>>
>> real 0m1.895s
>> user 0m1.650s
>> sys 0m0.210s
>> d8b61b2c0025919d5321461045c8226f
>>
>> real 0m1.891s
>> user 0m1.650s
>> sys 0m0.200s
>>
>>
>> # for comparison:
>> $ for i in $(seq 1 3); do time md5sum < data; done
>> d8b61b2c0025919d5321461045c8226f -
>>
>> real 0m1.842s
>> user 0m1.510s
>> sys 0m0.280s
>> d8b61b2c0025919d5321461045c8226f -
>>
>> real 0m1.803s
>> user 0m1.530s
>> sys 0m0.250s
>> d8b61b2c0025919d5321461045c8226f -
>>
>> real 0m1.821s
>> user 0m1.620s
>> sys 0m0.160s
>>
>> Giorgio Vazzana
>
>> /* compile: gcc -Wall avutil_md5_test.c -O2 -o avutil_md5_test -L ./libavutil/ -lavutil */
>> #include <stdio.h>
>> #include <stdlib.h>
>> #include <string.h>
>> #include "libavutil/md5.h"
>>
>> #define BUFSIZE (64*1024)
>>
>> static void print_md5(uint8_t *md5)
>> {
>> int i;
>> for (i = 0; i < 16; i++)
>> printf("%02x", md5[i]);
>> printf("\n");
>> }
>>
>> int main()
>> {
>> size_t nread;
>> struct AVMD5 *ctx;
>> uint8_t buf[BUFSIZE];
>> uint8_t md5val[16];
>>
>> ctx = av_md5_alloc();
>>
>> av_md5_init(ctx);
>> while ((nread = fread(buf, 1, BUFSIZE, stdin)) > 0)
>> av_md5_update(ctx, buf, nread);
>> av_md5_final(ctx, md5val);
>> free(ctx);
>>
>> print_md5(md5val);
>>
>> return 0;
>> }
>
>> md5.c | 41 ++++++++++++++++++++++++++++++++++++++---
>> 1 file changed, 38 insertions(+), 3 deletions(-)
>> 69e18ab824e95e1279af1fa6004d68d24358c2f9 0001-md5-speed-optimizations.patch
>> From f7fbb6cfd6d4e70f66c21ec732902d5431e45bbc Mon Sep 17 00:00:00 2001
>> From: Giorgio Vazzana <mywing81 at gmail.com>
>> Date: Fri, 17 May 2013 14:51:45 +0200
>> Subject: [PATCH] md5: speed optimizations
>>
>> ---
>> libavutil/md5.c | 41 ++++++++++++++++++++++++++++++++++++++---
>> 1 files changed, 38 insertions(+), 3 deletions(-)
>>
>> diff --git a/libavutil/md5.c b/libavutil/md5.c
>> index f8f08f1..84c01f9 100644
>> --- a/libavutil/md5.c
>> +++ b/libavutil/md5.c
>> @@ -49,12 +49,28 @@ struct AVMD5 *av_md5_alloc(void)
>> return av_mallocz(sizeof(struct AVMD5));
>> }
>>
>> +#if CONFIG_SMALL
>> static const uint8_t S[4][4] = {
>> { 7, 12, 17, 22 }, /* round 1 */
>> { 5, 9, 14, 20 }, /* round 2 */
>> { 4, 11, 16, 23 }, /* round 3 */
>> { 6, 10, 15, 21 } /* round 4 */
>> };
>> +#else
>> +static const uint32_t G[64] = { /* i = 0..63 */
>> + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, /* i */
>> + 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, /* (5*i + 1) % 16 */
>> + 5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2, /* (3*i + 5) % 16 */
>> + 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9 /* (7*i) % 16 */
>> +};
>> +
>> +static const uint32_t S[64] = {
>> + 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, /* round 1 */
>> + 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, /* round 2 */
>> + 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, /* round 3 */
>> + 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21 /* round 4 */
>> +};
>> +#endif
>>
>> static const uint32_t T[64] = { // T[i]= fabs(sin(i+1)<<32)
>> 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, /* round 1 */
>> @@ -78,6 +94,7 @@ static const uint32_t T[64] = { // T[i]= fabs(sin(i+1)<<32)
>> 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
>> };
>>
>> +#if CONFIG_SMALL
>> #define CORE(i, a, b, c, d) do { \
>> t = S[i >> 4][i & 3]; \
>> a += T[i]; \
>> @@ -91,6 +108,21 @@ static const uint32_t T[64] = { // T[i]= fabs(sin(i+1)<<32)
>> } \
>> a = b + (a << t | a >> (32 - t)); \
>> } while (0)
>> +#else
>> +#define CORE(i, a, b, c, d) do { \
>> + t = S[i]; \
>> + a += T[i]; \
>> + \
>> + if (i < 32) { \
>> + if (i < 16) a += (d ^ (b & (c ^ d))) + X[G[i]]; \
>> + else a += (c ^ (d & (c ^ b))) + X[G[i]]; \
>> + } else { \
>> + if (i < 48) a += (b ^ c ^ d) + X[G[i]]; \
>> + else a += (c ^ (b | ~d)) + X[G[i]]; \
>> + } \
>> + a = b + (a << t | a >> (32 - t)); \
>> + } while (0)
>> +#endif
>
> I wonder why these changes make any difference at all;
> gcc should be optimizing both to the same resulting code.
Actually, the memcpy() trick seems to be the only thing responsible for the
speed boost. If I apply this:
diff --git a/libavutil/md5.c b/libavutil/md5.c
index f8f08f1..a212150 100644
--- a/libavutil/md5.c
+++ b/libavutil/md5.c
@@ -146,8 +146,12 @@ void av_md5_update(AVMD5 *ctx, const uint8_t *src, const int len)
j = ctx->len & 63;
ctx->len += len;
- for (i = 0; i < len; i++) {
- ctx->block[j++] = src[i];
+ i = 0;
+ while (i < len) {
+ int l = (len-i <= 64-j) ? len-i : 64-j;
+ memcpy(ctx->block + j, src + i, l);
+ i += l;
+ j += l;
if (j == 64) {
body(ctx->ABCD, (uint32_t *) ctx->block);
j = 0;
I get 1.9s instead of 2.9s with the same test file as above:
$ for i in $(seq 1 3); do time ./avutil_md5_test < data; done
d8b61b2c0025919d5321461045c8226f
real 0m1.899s
user 0m1.600s
sys 0m0.260s
d8b61b2c0025919d5321461045c8226f
real 0m1.901s
user 0m1.600s
sys 0m0.260s
d8b61b2c0025919d5321461045c8226f
real 0m1.893s
user 0m1.580s
sys 0m0.280s
More information about the ffmpeg-devel
mailing list