[FFmpeg-devel] [PATCH 2/5] fate: avoid framemd5, use framecrc its faster

Fri May 17 20:40:12 CEST 2013

On Fri, May 17, 2013 at 07:58:54PM +0200, Giorgio Vazzana wrote:
> 2013/5/17 Michael Niedermayer <michaelni at gmx.at>:
> > On Fri, May 17, 2013 at 03:15:59PM +0200, Giorgio Vazzana wrote:
> >> Hi,
> >>
> >> about a year ago I studied ffmpeg's implementation of md5, and I
> >> noticed it was slow compared to GNU md5sum. So I managed to make it
> >> almost as fast as the GNU implementation, but I never sent a patch
> >> because I wanted to work on it some more... of course I never did that
> >> because of lack of free time. Anyway, I found my old patch; since it's
> >> relatively simple, I thought I'd send it in, hoping you will find it
> >> useful, maybe as a temporary solution until someone else comes up with
> >> something faster.
> >>
> >> Quick benchmark:
> >> # create test file and compile test program
> >> $ dd if=/dev/zero of=data bs=1M count=500
> >> $ gcc -Wall avutil_md5_test.c -O2 -o avutil_md5_test -L ./libavutil/ -lavutil
> >>
> >>
> >> # without the patch:
> >> $ for i in $(seq 1 3); do time ./avutil_md5_test < data; done
> >> d8b61b2c0025919d5321461045c8226f
> >>
> >> real    0m2.906s
> >> user    0m2.680s
> >> sys    0m0.180s
> >> d8b61b2c0025919d5321461045c8226f
> >>
> >> real    0m2.936s
> >> user    0m2.560s
> >> sys    0m0.330s
> >> d8b61b2c0025919d5321461045c8226f
> >>
> >> real    0m2.903s
> >> user    0m2.690s
> >> sys    0m0.170s
> >>
> >>
> >> # with the patch:
> >> $ for i in $(seq 1 3); do time ./avutil_md5_test < data; done
> >> d8b61b2c0025919d5321461045c8226f
> >>
> >> real    0m1.896s
> >> user    0m1.580s
> >> sys    0m0.280s
> >> d8b61b2c0025919d5321461045c8226f
> >>
> >> real    0m1.895s
> >> user    0m1.650s
> >> sys    0m0.210s
> >> d8b61b2c0025919d5321461045c8226f
> >>
> >> real    0m1.891s
> >> user    0m1.650s
> >> sys    0m0.200s
> >>
> >>
> >> # for comparison:
> >> $ for i in $(seq 1 3); do time md5sum < data; done
> >> d8b61b2c0025919d5321461045c8226f  -
> >>
> >> real    0m1.842s
> >> user    0m1.510s
> >> sys    0m0.280s
> >> d8b61b2c0025919d5321461045c8226f  -
> >>
> >> real    0m1.803s
> >> user    0m1.530s
> >> sys    0m0.250s
> >> d8b61b2c0025919d5321461045c8226f  -
> >>
> >> real    0m1.821s
> >> user    0m1.620s
> >> sys    0m0.160s
> >>
> >> Giorgio Vazzana
> >
> >> /* compile: gcc -Wall avutil_md5_test.c -O2 -o avutil_md5_test -L ./libavutil/ -lavutil */
> >> #include <stdio.h>
> >> #include <stdlib.h>
> >> #include <string.h>
> >> #include "libavutil/md5.h"
> >>
> >> #define BUFSIZE (64*1024)
> >>
> >> static void print_md5(uint8_t *md5)
> >> {
> >>     int i;
> >>     for (i = 0; i < 16; i++)
> >>         printf("%02x", md5[i]);
> >>     printf("\n");
> >> }
> >>
> >> int main()
> >> {
> >>     size_t nread;
> >>     struct AVMD5 *ctx;
> >>     uint8_t buf[BUFSIZE];
> >>     uint8_t md5val[16];
> >>
> >>     ctx = av_md5_alloc();
> >>
> >>     av_md5_init(ctx);
> >>     while ((nread = fread(buf, 1, BUFSIZE, stdin)) > 0)
> >>         av_md5_update(ctx, buf, nread);
> >>     av_md5_final(ctx, md5val);
> >>     free(ctx);
> >>
> >>     print_md5(md5val);
> >>
> >>     return 0;
> >> }
> >
> >>  md5.c |   41 ++++++++++++++++++++++++++++++++++++++---
> >>  1 file changed, 38 insertions(+), 3 deletions(-)
> >> 69e18ab824e95e1279af1fa6004d68d24358c2f9  0001-md5-speed-optimizations.patch
> >> From f7fbb6cfd6d4e70f66c21ec732902d5431e45bbc Mon Sep 17 00:00:00 2001
> >> From: Giorgio Vazzana <mywing81 at gmail.com>
> >> Date: Fri, 17 May 2013 14:51:45 +0200
> >> Subject: [PATCH] md5: speed optimizations
> >>
> >> ---
> >>  libavutil/md5.c |   41 ++++++++++++++++++++++++++++++++++++++---
> >>  1 files changed, 38 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/libavutil/md5.c b/libavutil/md5.c
> >> index f8f08f1..84c01f9 100644
> >> --- a/libavutil/md5.c
> >> +++ b/libavutil/md5.c
> >> @@ -49,12 +49,28 @@ struct AVMD5 *av_md5_alloc(void)
> >>      return av_mallocz(sizeof(struct AVMD5));
> >>  }
> >>
> >> +#if CONFIG_SMALL
> >>  static const uint8_t S[4][4] = {
> >>      { 7, 12, 17, 22 },  /* round 1 */
> >>      { 5,  9, 14, 20 },  /* round 2 */
> >>      { 4, 11, 16, 23 },  /* round 3 */
> >>      { 6, 10, 15, 21 }   /* round 4 */
> >>  };
> >> +#else
> >> +static const uint32_t G[64] = {                                    /* i = 0..63      */
> >> +    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, /* i              */
> >> +    1,  6, 11,  0,  5, 10, 15,  4,  9, 14,  3,  8, 13,  2,  7, 12, /* (5*i + 1) % 16 */
> >> +    5,  8, 11, 14,  1,  4,  7, 10, 13,  0,  3,  6,  9, 12, 15,  2, /* (3*i + 5) % 16 */
> >> +    0,  7, 14,  5, 12,  3, 10,  1,  8, 15,  6, 13,  4, 11,  2,  9  /* (7*i) % 16     */
> >> +};
> >> +
> >> +static const uint32_t S[64] = {
> >> +    7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22,  7, 12, 17, 22, /* round 1 */
> >> +    5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20,  5,  9, 14, 20, /* round 2 */
> >> +    4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23,  4, 11, 16, 23, /* round 3 */
> >> +    6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21,  6, 10, 15, 21  /* round 4 */
> >> +};
> >> +#endif
> >>
> >>  static const uint32_t T[64] = { // T[i]= fabs(sin(i+1)<<32)
> >>      0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,   /* round 1 */
> >> @@ -78,6 +94,7 @@ static const uint32_t T[64] = { // T[i]= fabs(sin(i+1)<<32)
> >>      0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
> >>  };
> >>
> >> +#if CONFIG_SMALL
> >>  #define CORE(i, a, b, c, d) do {                                        \
> >>          t = S[i >> 4][i & 3];                                           \
> >>          a += T[i];                                                      \
> >> @@ -91,6 +108,21 @@ static const uint32_t T[64] = { // T[i]= fabs(sin(i+1)<<32)
> >>          }                                                               \
> >>          a = b + (a << t | a >> (32 - t));                               \
> >>      } while (0)
> >> +#else
> >> +#define CORE(i, a, b, c, d) do {                                        \
> >> +        t = S[i];                                                       \
> >> +        a += T[i];                                                      \
> >> +                                                                        \
> >> +        if (i < 32) {                                                   \
> >> +            if (i < 16) a += (d ^ (b & (c ^ d))) + X[G[i]];             \
> >> +            else        a += (c ^ (d & (c ^ b))) + X[G[i]];             \
> >> +        } else {                                                        \
> >> +            if (i < 48) a += (b ^ c ^ d)         + X[G[i]];             \
> >> +            else        a += (c ^ (b | ~d))      + X[G[i]];             \
> >> +        }                                                               \
> >> +        a = b + (a << t | a >> (32 - t));                               \
> >> +    } while (0)
> >> +#endif
> >
> > i wonder why these changes make any difference at all
> > gcc should be optimizing both to the same resulting code
> 
> Actually, the memcpy() trick seems to be the only thing responsible for the
> speed boost. If I apply this:
> 
> diff --git a/libavutil/md5.c b/libavutil/md5.c
> index f8f08f1..a212150 100644
> --- a/libavutil/md5.c
> +++ b/libavutil/md5.c
> @@ -146,8 +146,12 @@ void av_md5_update(AVMD5 *ctx, const uint8_t
> *src, const int len)
>      j = ctx->len & 63;
>      ctx->len += len;
> 
> -    for (i = 0; i < len; i++) {
> -        ctx->block[j++] = src[i];
> +    i = 0;
> +    while (i < len) {
> +        int l = (len-i <= 64-j) ? len-i : 64-j;
> +        memcpy(ctx->block + j, src + i, l);
> +        i += l;
> +        j += l;
>          if (j == 64) {
>              body(ctx->ABCD, (uint32_t *) ctx->block);
>              j = 0;
> 
> I get 1.9s instead of 2.9s with the same test file as above:

very nice

great work!

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Avoid a single point of failure, be that a person or equipment.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130517/181cc189/attachment.asc>